From: kernel.org
Subject: 2.6.25
Patch-mainline: 2.6.25

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches.py

---
 arch/x86/Kconfig | 18
 arch/x86/Kconfig.debug | 1
 arch/x86/ia32/ia32entry-xen.S | 12
 arch/x86/kernel/Makefile | 3
 arch/x86/kernel/acpi/boot.c | 3
 arch/x86/kernel/acpi/sleep-xen.c | 95 +
 arch/x86/kernel/acpi/sleep_32-xen.c | 117 --
 arch/x86/kernel/acpi/sleep_64-xen.c | 125 --
 arch/x86/kernel/apic_32-xen.c | 2
 arch/x86/kernel/apic_64-xen.c | 73 -
 arch/x86/kernel/asm-offsets_32.c | 2
 arch/x86/kernel/cpu/common-xen.c | 214 +--
 arch/x86/kernel/cpu/mtrr/main-xen.c | 19
 arch/x86/kernel/e820_32-xen.c | 275 -----
 arch/x86/kernel/e820_64-xen.c | 485 +++++---
 arch/x86/kernel/early_printk-xen.c | 2
 arch/x86/kernel/entry_32-xen.S | 195 +++
 arch/x86/kernel/entry_64-xen.S | 91 -
 arch/x86/kernel/fixup.c | 2
 arch/x86/kernel/genapic_64-xen.c | 15
 arch/x86/kernel/head64-xen.c | 63 +
 arch/x86/kernel/head_32-xen.S | 3
 arch/x86/kernel/init_task-xen.c | 2
 arch/x86/kernel/io_apic_32-xen.c | 15
 arch/x86/kernel/io_apic_64-xen.c | 110 +-
 arch/x86/kernel/ioport-xen.c | 112 ++
 arch/x86/kernel/ioport_32-xen.c | 121 --
 arch/x86/kernel/ioport_64-xen.c | 99 -
 arch/x86/kernel/irq_32-xen.c | 22
 arch/x86/kernel/irq_64-xen.c | 43
 arch/x86/kernel/ldt-xen.c | 272 +++++
 arch/x86/kernel/ldt_32-xen.c | 265 ----
 arch/x86/kernel/ldt_64-xen.c | 271 ----
 arch/x86/kernel/machine_kexec_64.c | 2
 arch/x86/kernel/microcode-xen.c | 2
 arch/x86/kernel/mpparse_32-xen.c | 49
 arch/x86/kernel/mpparse_64-xen.c | 30
 arch/x86/kernel/pci-dma-xen.c | 20
 arch/x86/kernel/process_32-xen.c | 438 ++------
 arch/x86/kernel/process_64-xen.c | 303 ++---
 arch/x86/kernel/quirks-xen.c | 82 -
 arch/x86/kernel/rtc.c | 8
 arch/x86/kernel/setup64-xen.c | 70 +
 arch/x86/kernel/setup_32-xen.c | 311 ++++-
 arch/x86/kernel/setup_64-xen.c | 686 ++++++------
 arch/x86/kernel/smp_32-xen.c | 5
 arch/x86/kernel/smp_64-xen.c | 91 -
 arch/x86/kernel/time_32-xen.c | 136 --
 arch/x86/kernel/traps_32-xen.c | 320 +++--
 arch/x86/kernel/traps_64-xen.c | 371 +++---
 arch/x86/kernel/vsyscall_64-xen.c | 60 -
 arch/x86/kernel/xen_entry_64.S | 36
 arch/x86/mach-xen/setup.c | 11
 arch/x86/mm/fault-xen.c | 1026 ++++++++++++++++++
 arch/x86/mm/fault_32-xen.c | 757 -------------
 arch/x86/mm/fault_64-xen.c | 686 ------------
 arch/x86/mm/highmem_32-xen.c | 45
 arch/x86/mm/hypervisor.c | 10
 arch/x86/mm/init_32-xen.c | 464 +++-----
 arch/x86/mm/init_64-xen.c | 517 ++++-----
 arch/x86/mm/ioremap-xen.c | 685 ++++++++++++
 arch/x86/mm/ioremap_32-xen.c | 445 --------
 arch/x86/mm/pageattr-xen.c | 1412 ++++++++++++++++++++++++++
 arch/x86/mm/pageattr_64-xen.c | 542 ---------
 arch/x86/mm/pgtable_32-xen.c | 672 ++----------
 arch/x86/pci/irq-xen.c | 24
 arch/x86/vdso/Makefile | 1
 arch/x86/vdso/vdso32-setup-xen.c | 506 +++++++++
 arch/x86/vdso/vdso32-setup.c | 34
 arch/x86/vdso/vdso32.S | 12
 arch/x86/vdso/vdso32/syscall.S | 2
 drivers/pci/msi-xen.c | 98 -
 drivers/pci/pci.c | 5
 drivers/xen/balloon/sysfs.c | 2
 drivers/xen/blkback/blkback.c | 5
 drivers/xen/blkfront/blkfront.c | 9
 drivers/xen/blktap/blktap.c | 8
 drivers/xen/core/Makefile | 1
 drivers/xen/core/evtchn.c | 46
 drivers/xen/core/hypervisor_sysfs.c | 2
 drivers/xen/core/smpboot.c | 29
 drivers/xen/core/spinlock.c | 161 ++
 drivers/xen/core/xen_sysfs.c | 30
 drivers/xen/gntdev/gntdev.c | 4
 drivers/xen/scsifront/scsifront.c | 49
 drivers/xen/xenoprof/xenoprofile.c | 2
 include/asm-x86/mach-xen/asm/agp.h | 9
 include/asm-x86/mach-xen/asm/desc.h | 403 +++++++
 include/asm-x86/mach-xen/asm/desc_32.h | 262 ----
 include/asm-x86/mach-xen/asm/desc_64.h | 228 ----
 include/asm-x86/mach-xen/asm/dma-mapping_32.h | 18
 include/asm-x86/mach-xen/asm/fixmap_32.h | 24
 include/asm-x86/mach-xen/asm/fixmap_64.h | 25
 include/asm-x86/mach-xen/asm/highmem.h | 10
 include/asm-x86/mach-xen/asm/hypervisor.h | 19
 include/asm-x86/mach-xen/asm/io_32.h | 69 -
 include/asm-x86/mach-xen/asm/io_64.h | 62 -
 include/asm-x86/mach-xen/asm/irqflags.h | 248 ++++
 include/asm-x86/mach-xen/asm/irqflags_32.h | 212 ---
 include/asm-x86/mach-xen/asm/irqflags_64.h | 178 ---
 include/asm-x86/mach-xen/asm/maddr_32.h | 21
 include/asm-x86/mach-xen/asm/maddr_64.h | 19
 include/asm-x86/mach-xen/asm/mmu_context_32.h | 2
 include/asm-x86/mach-xen/asm/mmu_context_64.h | 12
 include/asm-x86/mach-xen/asm/page.h | 238 ++++
 include/asm-x86/mach-xen/asm/page_64.h | 196 ---
 include/asm-x86/mach-xen/asm/pci.h | 17
 include/asm-x86/mach-xen/asm/pci_64.h | 1
 include/asm-x86/mach-xen/asm/pgalloc_32.h | 116 +-
 include/asm-x86/mach-xen/asm/pgalloc_64.h | 87 -
 include/asm-x86/mach-xen/asm/pgtable-3level.h | 107 -
 include/asm-x86/mach-xen/asm/pgtable.h | 449 ++++++++
 include/asm-x86/mach-xen/asm/pgtable_32.h | 361 ------
 include/asm-x86/mach-xen/asm/pgtable_64.h | 400 +------
 include/asm-x86/mach-xen/asm/processor.h | 792 ++++++++++++++
 include/asm-x86/mach-xen/asm/processor_32.h | 751 -------------
 include/asm-x86/mach-xen/asm/processor_64.h | 461 --------
 include/asm-x86/mach-xen/asm/segment.h | 203 +++
 include/asm-x86/mach-xen/asm/segment_32.h | 150 --
 include/asm-x86/mach-xen/asm/smp_32.h | 125 +-
 include/asm-x86/mach-xen/asm/smp_64.h | 138 --
 include/asm-x86/mach-xen/asm/spinlock.h | 333 ++++++
 include/asm-x86/mach-xen/asm/system.h | 392 +++++++
 include/asm-x86/mach-xen/asm/system_32.h | 312 -----
 include/asm-x86/mach-xen/asm/system_64.h | 159 --
 include/asm-x86/mach-xen/asm/tlbflush.h | 105 +
 include/asm-x86/mach-xen/asm/tlbflush_32.h | 99 -
 include/asm-x86/mach-xen/asm/tlbflush_64.h | 97 -
 include/asm-x86/mach-xen/irq_vectors.h | 3
 include/asm-x86/mmu.h | 2
 include/asm-x86/ptrace.h | 4
 include/asm-x86/thread_info.h | 12
 include/asm-x86/time.h | 6
 include/linux/page-flags.h | 4
 include/linux/pci.h | 3
 include/xen/evtchn.h | 25
 kernel/sysctl_check.c | 2
 lib/swiotlb-xen.c | 35
 138 files changed, 11322 insertions(+), 11153 deletions(-)

--- a/arch/x86/ia32/ia32entry-xen.S
+++ b/arch/x86/ia32/ia32entry-xen.S
@@ -12,7 +12,6 @@
 #include <asm/ia32_unistd.h>
 #include <asm/thread_info.h>
 #include <asm/segment.h>
-#include <asm/vsyscall32.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
 
@@ -99,10 +98,11 @@ ENTRY(ia32_sysenter_target)
 	CFI_RESTORE rcx
 	movl	%ebp,%ebp		/* zero extension */
 	movl	%eax,%eax
+	movl	48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
 	movl	$__USER32_DS,40(%rsp)
 	movq	%rbp,32(%rsp)
 	movl	$__USER32_CS,16(%rsp)
-	movl	$VSYSCALL32_SYSEXIT,8(%rsp)
+	movq	%r10,8(%rsp)
 	movq	%rax,(%rsp)
 	cld
 	SAVE_ARGS 0,0,1
@@ -582,8 +582,8 @@ ia32_sys_call_table:
 	.quad compat_sys_futex		/* 240 */
 	.quad compat_sys_sched_setaffinity
 	.quad compat_sys_sched_getaffinity
-	.quad sys32_set_thread_area
-	.quad sys32_get_thread_area
+	.quad sys_set_thread_area
+	.quad sys_get_thread_area
 	.quad compat_sys_io_setup	/* 245 */
 	.quad sys_io_destroy
 	.quad compat_sys_io_getevents
@@ -661,7 +661,9 @@ ia32_sys_call_table:
 	.quad sys_epoll_pwait
 	.quad compat_sys_utimensat	/* 320 */
 	.quad compat_sys_signalfd
-	.quad compat_sys_timerfd
+	.quad sys_timerfd_create
 	.quad sys_eventfd
 	.quad sys32_fallocate
+	.quad compat_sys_timerfd_settime	/* 325 */
+	.quad compat_sys_timerfd_gettime
 ia32_syscall_end:
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,7 +27,7 @@ config X86
 	select HAVE_KRETPROBES
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE
-	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
+	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
@@ -208,14 +208,12 @@ config X86_TRAMPOLINE
 	default y
 
 config X86_NO_TSS
-	bool
+	def_bool y
 	depends on XEN
-	default y
 
 config X86_NO_IDT
-	bool
+	def_bool y
 	depends on XEN
-	default y
 
 config KTIME_SCALAR
 	def_bool X86_32
@@ -724,9 +722,8 @@ config X86_VISWS_APIC
 	depends on X86_32 && X86_VISWS
 
 config X86_XEN_GENAPIC
-	bool
+	def_bool y
 	depends on X86_64_XEN
-	default y
 
 config X86_MCE
 	bool "Machine Check Exception"
@@ -1113,7 +1110,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
 
 config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
-	depends on X86_64
+	depends on X86_64 && !X86_64_XEN
 
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
@@ -1743,10 +1740,10 @@ config PCI_MMCONFIG
 	depends on X86_64 && PCI && ACPI
 
 config XEN_PCIDEV_FRONTEND
-	bool "Xen PCI Frontend" if X86_64
+	def_bool y
+	prompt "Xen PCI Frontend" if X86_64
 	depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
 	select HOTPLUG
-	default y
 	help
 	  The PCI device frontend driver allows the kernel to import arbitrary
 	  PCI devices from a PCI backend to support PCI driver domains.
@@ -1754,7 +1751,6 @@ config XEN_PCIDEV_FRONTEND
 config XEN_PCIDEV_FE_DEBUG
 	bool "Xen PCI Frontend Debugging"
 	depends on XEN_PCIDEV_FRONTEND
-	default n
 	help
 	  Enables some debug statements within the PCI Frontend.
 
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -266,6 +266,7 @@ config DEBUG_BOOT_PARAMS
 	bool "Debug boot parameters"
 	depends on DEBUG_KERNEL
 	depends on DEBUG_FS
+	depends on !XEN
 	help
 	  This option will cause struct boot_params to be exported via debugfs.
 
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -133,6 +133,9 @@ char *__init __acpi_map_table(unsigned l
 #ifndef CONFIG_XEN
 	if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
 		return __va(phys);
+#else
+	if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
+		return isa_bus_to_virt(phys);
 #endif
 
 	offset = phys & (PAGE_SIZE - 1);
--- a/arch/x86/kernel/acpi/sleep_32-xen.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * sleep.c - x86-specific ACPI sleep support.
- *
- * Copyright (C) 2001-2003 Patrick Mochel
- * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
- */
-
-#include <linux/acpi.h>
-#include <linux/bootmem.h>
-#include <linux/dmi.h>
-#include <linux/cpumask.h>
-
-#include <asm/smp.h>
-
-#ifndef CONFIG_ACPI_PV_SLEEP
-/* address in low memory of the wakeup routine. */
-unsigned long acpi_wakeup_address = 0;
-unsigned long acpi_realmode_flags;
-extern char wakeup_start, wakeup_end;
-
-extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
-#endif
-
-/**
- * acpi_save_state_mem - save kernel state
- *
- * Create an identity mapped page table and copy the wakeup routine to
- * low memory.
- */
-int acpi_save_state_mem(void)
-{
-#ifndef CONFIG_ACPI_PV_SLEEP
-	if (!acpi_wakeup_address)
-		return 1;
-	memcpy((void *)acpi_wakeup_address, &wakeup_start,
-	       &wakeup_end - &wakeup_start);
-	acpi_copy_wakeup_routine(acpi_wakeup_address);
-#endif
-	return 0;
-}
-
-/*
- * acpi_restore_state - undo effects of acpi_save_state_mem
- */
-void acpi_restore_state_mem(void)
-{
-}
-
-/**
- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
- *
- * We allocate a page from the first 1MB of memory for the wakeup
- * routine for when we come back from a sleep state. The
- * runtime allocator allows specification of <16MB pages, but not
- * <1MB pages.
- */
-void __init acpi_reserve_bootmem(void)
-{
-#ifndef CONFIG_ACPI_PV_SLEEP
-	if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
-		printk(KERN_ERR
-		       "ACPI: Wakeup code way too big, S3 disabled.\n");
-		return;
-	}
-
-	acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
-	if (!acpi_wakeup_address)
-		printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
-#endif
-}
-
-#ifndef CONFIG_ACPI_PV_SLEEP
-static int __init acpi_sleep_setup(char *str)
-{
-	while ((str != NULL) && (*str != '\0')) {
-		if (strncmp(str, "s3_bios", 7) == 0)
-			acpi_realmode_flags |= 1;
-		if (strncmp(str, "s3_mode", 7) == 0)
-			acpi_realmode_flags |= 2;
-		if (strncmp(str, "s3_beep", 7) == 0)
-			acpi_realmode_flags |= 4;
-		str = strchr(str, ',');
-		if (str != NULL)
-			str += strspn(str, ", \t");
-	}
-	return 1;
-}
-
-__setup("acpi_sleep=", acpi_sleep_setup);
-
-/* Ouch, we want to delete this. We already have better version in userspace, in
-   s2ram from suspend.sf.net project */
-static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
-{
-	acpi_realmode_flags |= 2;
-	return 0;
-}
-
-static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
-	{	/* Reset video mode after returning from ACPI S3 sleep */
-		.callback = reset_videomode_after_s3,
-		.ident = "Toshiba Satellite 4030cdt",
-		.matches = {
-			DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
-		},
-	},
-	{}
-};
-
-static int __init acpisleep_dmi_init(void)
-{
-	dmi_check_system(acpisleep_dmi_table);
-	return 0;
-}
-
-core_initcall(acpisleep_dmi_init);
-#endif /* CONFIG_ACPI_PV_SLEEP */
--- a/arch/x86/kernel/acpi/sleep_64-xen.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * acpi.c - Architecture-Specific Low-Level ACPI Support
- *
- * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
- * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
- * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
- * Copyright (C) 2003 Pavel Machek, SuSE Labs
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/bootmem.h>
-#include <linux/acpi.h>
-#include <linux/cpumask.h>
-
-#include <asm/mpspec.h>
-#include <asm/io.h>
-#include <asm/apic.h>
-#include <asm/apicdef.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/tlbflush.h>
-
-/* --------------------------------------------------------------------------
-                              Low-Level Sleep Support
-   -------------------------------------------------------------------------- */
-
-#ifndef CONFIG_ACPI_PV_SLEEP
-/* address in low memory of the wakeup routine. */
-unsigned long acpi_wakeup_address = 0;
-unsigned long acpi_realmode_flags;
-extern char wakeup_start, wakeup_end;
-
-extern unsigned long acpi_copy_wakeup_routine(unsigned long);
-#endif
-
-/**
- * acpi_save_state_mem - save kernel state
- *
- * Create an identity mapped page table and copy the wakeup routine to
- * low memory.
- */
-int acpi_save_state_mem(void)
-{
-#ifndef CONFIG_ACPI_PV_SLEEP
-	memcpy((void *)acpi_wakeup_address, &wakeup_start,
-	       &wakeup_end - &wakeup_start);
-	acpi_copy_wakeup_routine(acpi_wakeup_address);
-#endif
-	return 0;
-}
-
-/*
- * acpi_restore_state
- */
-void acpi_restore_state_mem(void)
-{
-}
-
-/**
- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
- *
- * We allocate a page in low memory for the wakeup
- * routine for when we come back from a sleep state. The
- * runtime allocator allows specification of <16M pages, but not
- * <1M pages.
- */
-void __init acpi_reserve_bootmem(void)
-{
-#ifndef CONFIG_ACPI_PV_SLEEP
-	acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
-	if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
-		printk(KERN_CRIT
-		       "ACPI: Wakeup code way too big, will crash on attempt"
-		       " to suspend\n");
-#endif
-}
-
-#ifndef CONFIG_ACPI_PV_SLEEP
-static int __init acpi_sleep_setup(char *str)
-{
-	while ((str != NULL) && (*str != '\0')) {
-		if (strncmp(str, "s3_bios", 7) == 0)
-			acpi_realmode_flags |= 1;
-		if (strncmp(str, "s3_mode", 7) == 0)
-			acpi_realmode_flags |= 2;
-		if (strncmp(str, "s3_beep", 7) == 0)
-			acpi_realmode_flags |= 4;
-		str = strchr(str, ',');
-		if (str != NULL)
-			str += strspn(str, ", \t");
-	}
-
-	return 1;
-}
-
-__setup("acpi_sleep=", acpi_sleep_setup);
-#endif /* CONFIG_ACPI_PV_SLEEP */
-
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep-xen.c
@@ -0,0 +1,95 @@
+/*
+ * sleep.c - x86-specific ACPI sleep support.
+ *
+ * Copyright (C) 2001-2003 Patrick Mochel
+ * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ */
+
+#include <linux/acpi.h>
+#include <linux/bootmem.h>
+#include <linux/dmi.h>
+#include <linux/cpumask.h>
+
+#include <asm/smp.h>
+
+#ifndef CONFIG_ACPI_PV_SLEEP
+/* address in low memory of the wakeup routine. */
+unsigned long acpi_wakeup_address = 0;
+unsigned long acpi_realmode_flags;
+extern char wakeup_start, wakeup_end;
+
+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+#endif
+
+/**
+ * acpi_save_state_mem - save kernel state
+ *
+ * Create an identity mapped page table and copy the wakeup routine to
+ * low memory.
+ */
+int acpi_save_state_mem(void)
+{
+#ifndef CONFIG_ACPI_PV_SLEEP
+	if (!acpi_wakeup_address) {
+		printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
+		return -ENOMEM;
+	}
+	memcpy((void *)acpi_wakeup_address, &wakeup_start,
+	       &wakeup_end - &wakeup_start);
+	acpi_copy_wakeup_routine(acpi_wakeup_address);
+#endif
+
+	return 0;
+}
+
+/*
+ * acpi_restore_state - undo effects of acpi_save_state_mem
+ */
+void acpi_restore_state_mem(void)
+{
+}
+
+
+/**
+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+ *
+ * We allocate a page from the first 1MB of memory for the wakeup
+ * routine for when we come back from a sleep state. The
+ * runtime allocator allows specification of <16MB pages, but not
+ * <1MB pages.
+ */
+void __init acpi_reserve_bootmem(void)
+{
+#ifndef CONFIG_ACPI_PV_SLEEP
+	if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
+		printk(KERN_ERR
+		       "ACPI: Wakeup code way too big, S3 disabled.\n");
+		return;
+	}
+
+	acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
+	if (!acpi_wakeup_address)
+		printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
+#endif
+}
+
+
+#ifndef CONFIG_ACPI_PV_SLEEP
+static int __init acpi_sleep_setup(char *str)
+{
+	while ((str != NULL) && (*str != '\0')) {
+		if (strncmp(str, "s3_bios", 7) == 0)
+			acpi_realmode_flags |= 1;
+		if (strncmp(str, "s3_mode", 7) == 0)
+			acpi_realmode_flags |= 2;
+		if (strncmp(str, "s3_beep", 7) == 0)
+			acpi_realmode_flags |= 4;
+		str = strchr(str, ',');
+		if (str != NULL)
+			str += strspn(str, ", \t");
+	}
+	return 1;
+}
+
+__setup("acpi_sleep=", acpi_sleep_setup);
+#endif /* CONFIG_ACPI_PV_SLEEP */
--- a/arch/x86/kernel/apic_32-xen.c
+++ b/arch/x86/kernel/apic_32-xen.c
@@ -86,7 +86,7 @@ int setup_profiling_timer(unsigned int m
  * This initializes the IO-APIC and APIC hardware if this is
  * a UP kernel.
  */
-int __init APIC_init_uniprocessor (void)
+int __init APIC_init_uniprocessor(void)
 {
 #ifdef CONFIG_X86_IO_APIC
 	if (smp_found_config)
--- a/arch/x86/kernel/apic_64-xen.c
+++ b/arch/x86/kernel/apic_64-xen.c
@@ -34,34 +34,17 @@
 #include <asm/hpet.h>
 #include <asm/idle.h>
 
-int apic_verbosity;
+int disable_apic;
 
 /*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
+ * Debug level, exported for io_apic.c
  */
-void ack_bad_irq(unsigned int irq)
-{
-	printk("unexpected IRQ trap at irq %02x\n", irq);
-	/*
-	 * Currently unexpected vectors happen only on SMP and APIC.
-	 * We _must_ ack these because every local APIC has only N
-	 * irq slots per priority level, and a 'hanging, unacked' IRQ
-	 * holds up an irq slot - in excessive cases (when multiple
-	 * unexpected vectors occur) that might lock up the APIC
-	 * completely.
-	 * But don't ack when the APIC is disabled. -AK
-	 */
-	if (!disable_apic)
-		ack_APIC_irq();
-}
-
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
+int apic_verbosity;
 
-void smp_local_timer_interrupt(void)
+/*
+ * The guts of the apic timer interrupt
+ */
+static void local_apic_timer_interrupt(void)
 {
 #ifndef CONFIG_XEN
 	int cpu = smp_processor_id();
@@ -121,11 +104,34 @@ void smp_apic_timer_interrupt(struct pt_
 	 */
 	exit_idle();
 	irq_enter();
-	smp_local_timer_interrupt();
+	local_apic_timer_interrupt();
 	irq_exit();
 	set_irq_regs(old_regs);
 }
 
+int setup_profiling_timer(unsigned int multiplier)
+{
+	return -EINVAL;
+}
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+	if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+		setup_IO_APIC();
+#endif
+
+	return 1;
+}
+
+/*
+ * Local APIC interrupts
+ */
+
 /*
  * This interrupt should _never_ happen with our APIC/SMP architecture
  */
@@ -150,7 +156,6 @@ asmlinkage void smp_spurious_interrupt(v
 /*
  * This interrupt should never happen with our APIC/SMP architecture
  */
-
 asmlinkage void smp_error_interrupt(void)
 {
 	unsigned int v, v1;
@@ -178,19 +183,3 @@ asmlinkage void smp_error_interrupt(void
 		smp_processor_id(), v , v1);
 	irq_exit();
 }
-
-int disable_apic;
-
-/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
- */
-int __init APIC_init_uniprocessor (void)
-{
-#ifdef CONFIG_X86_IO_APIC
-	if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
-		setup_IO_APIC();
-#endif
-
-	return 1;
-}
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -23,8 +23,10 @@
 #include <xen/interface/xen.h>
 #endif
 
+#ifdef CONFIG_LGUEST_GUEST
 #include <linux/lguest.h>
 #include "../../../drivers/lguest/lg.h"
+#endif
 
 /* workaround for a warning with -Wmissing-prototypes */
 void foo(void);
--- a/arch/x86/kernel/cpu/common-xen.c
+++ b/arch/x86/kernel/cpu/common-xen.c
@@ -27,45 +27,50 @@
 #include "cpu.h"
 
 DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
-	[GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
-	[GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
-	[GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
-	[GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
+	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
+	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
+	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
+	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
 #ifndef CONFIG_XEN
 	/*
 	 * Segments used for calling PnP BIOS have byte granularity.
 	 * They code segments and data segments have fixed 64k limits,
 	 * the transfer segment sizes are set at run time.
 	 */
-	[GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
-	[GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
-	[GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
+	/* 32-bit code */
+	[GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+	/* 16-bit code */
+	[GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+	/* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+	/* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+	/* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
 	/*
 	 * The APM segments have byte granularity and their bases
 	 * are set at run time. All have 64k limits.
 	 */
-	[GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+	/* 32-bit code */
+	[GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
 	/* 16-bit code */
-	[GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
-	[GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
+	[GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+	/* data */
+	[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
 
-	[GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
+	[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
 #endif
-	[GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
+	[GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
 } };
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
 static int cachesize_override __cpuinitdata = -1;
-static int disable_x86_fxsr __cpuinitdata;
 static int disable_x86_serial_nr __cpuinitdata = 1;
-static int disable_x86_sep __cpuinitdata;
 
 struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
 
-extern int disable_pse;
-
 static void __cpuinit default_init(struct cpuinfo_x86 * c)
 {
 	/* Not much we can do here... */
@@ -214,16 +219,8 @@ static void __cpuinit get_cpu_vendor(str
 
 static int __init x86_fxsr_setup(char * s)
 {
-	/* Tell all the other CPUs to not use it... */
-	disable_x86_fxsr = 1;
-
-	/*
-	 * ... and clear the bits early in the boot_cpu_data
-	 * so that the bootup process doesn't try to do this
-	 * either.
-	 */
-	clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
-	clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
+	setup_clear_cpu_cap(X86_FEATURE_FXSR);
+	setup_clear_cpu_cap(X86_FEATURE_XMM);
 	return 1;
 }
 __setup("nofxsr", x86_fxsr_setup);
@@ -231,7 +228,7 @@ __setup("nofxsr", x86_fxsr_setup);
 
 static int __init x86_sep_setup(char * s)
 {
-	disable_x86_sep = 1;
+	setup_clear_cpu_cap(X86_FEATURE_SEP);
 	return 1;
 }
 __setup("nosep", x86_sep_setup);
@@ -268,10 +265,10 @@ static int __cpuinit have_cpuid_p(void)
 void __init cpu_detect(struct cpuinfo_x86 *c)
 {
 	/* Get vendor name */
-	cpuid(0x00000000, &c->cpuid_level,
-	      (int *)&c->x86_vendor_id[0],
-	      (int *)&c->x86_vendor_id[8],
-	      (int *)&c->x86_vendor_id[4]);
+	cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+	      (unsigned int *)&c->x86_vendor_id[0],
+	      (unsigned int *)&c->x86_vendor_id[8],
+	      (unsigned int *)&c->x86_vendor_id[4]);
 
 	c->x86 = 4;
 	if (c->cpuid_level >= 0x00000001) {
@@ -284,9 +281,38 @@ void __init cpu_detect(struct cpuinfo_x8
 		if (c->x86 >= 0x6)
 			c->x86_model += ((tfms >> 16) & 0xF) << 4;
 		c->x86_mask = tfms & 15;
-		if (cap0 & (1<<19))
+		if (cap0 & (1<<19)) {
 			c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
+			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+		}
+	}
+}
+static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
+{
+	u32 tfms, xlvl;
+	unsigned int ebx;
+
+	memset(&c->x86_capability, 0, sizeof c->x86_capability);
+	if (have_cpuid_p()) {
+		/* Intel-defined flags: level 0x00000001 */
+		if (c->cpuid_level >= 0x00000001) {
+			u32 capability, excap;
+			cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
+			c->x86_capability[0] = capability;
+			c->x86_capability[4] = excap;
+		}
+
+		/* AMD-defined flags: level 0x80000001 */
+		xlvl = cpuid_eax(0x80000000);
+		if ((xlvl & 0xffff0000) == 0x80000000) {
+			if (xlvl >= 0x80000001) {
+				c->x86_capability[1] = cpuid_edx(0x80000001);
+				c->x86_capability[6] = cpuid_ecx(0x80000001);
+			}
+		}
+
 	}
+
 }
 
 /* Do minimum CPU detection early.
@@ -300,6 +326,7 @@ static void __init early_cpu_detect(void
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 
 	c->x86_cache_alignment = 32;
+	c->x86_clflush_size = 32;
 
 	if (!have_cpuid_p())
 		return;
@@ -307,19 +334,30 @@ static void __init early_cpu_detect(void
 	cpu_detect(c);
 
 	get_cpu_vendor(c, 1);
+
+	switch (c->x86_vendor) {
+	case X86_VENDOR_AMD:
+		early_init_amd(c);
+		break;
+	case X86_VENDOR_INTEL:
+		early_init_intel(c);
+		break;
+	}
+
+	early_get_cap(c);
 }
 
 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
 	u32 tfms, xlvl;
-	int ebx;
+	unsigned int ebx;
 
 	if (have_cpuid_p()) {
 		/* Get vendor name */
-		cpuid(0x00000000, &c->cpuid_level,
-		      (int *)&c->x86_vendor_id[0],
-		      (int *)&c->x86_vendor_id[8],
-		      (int *)&c->x86_vendor_id[4]);
+		cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+		      (unsigned int *)&c->x86_vendor_id[0],
+		      (unsigned int *)&c->x86_vendor_id[8],
+		      (unsigned int *)&c->x86_vendor_id[4]);
 
 		get_cpu_vendor(c, 0);
 		/* Initialize the standard set of capabilities */
@@ -364,8 +402,6 @@ static void __cpuinit generic_identify(s
 			init_scattered_cpuid_features(c);
 	}
 
-	early_intel_workaround(c);
-
 #ifdef CONFIG_X86_HT
 	c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
 #endif
@@ -399,7 +435,7 @@ __setup("serialnumber", x86_serial_nr_se
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 {
 	int i;
 
@@ -425,20 +461,9 @@ static void __cpuinit identify_cpu(struc
 
 	generic_identify(c);
 
-	printk(KERN_DEBUG "CPU: After generic identify, caps:");
-	for (i = 0; i < NCAPINTS; i++)
-		printk(" %08lx", c->x86_capability[i]);
-	printk("\n");
-
-	if (this_cpu->c_identify) {
+	if (this_cpu->c_identify)
 		this_cpu->c_identify(c);
 
-		printk(KERN_DEBUG "CPU: After vendor identify, caps:");
-		for (i = 0; i < NCAPINTS; i++)
-			printk(" %08lx", c->x86_capability[i]);
-		printk("\n");
-	}
-
 	/*
 	 * Vendor-specific initialization. In this section we
 	 * canonicalize the feature flags, meaning if there are
@@ -460,23 +485,6 @@ static void __cpuinit identify_cpu(struc
 	 * we do "generic changes."
 	 */
 
-	/* TSC disabled? */
-	if ( tsc_disable )
-		clear_bit(X86_FEATURE_TSC, c->x86_capability);
-
-	/* FXSR disabled? */
-	if (disable_x86_fxsr) {
-		clear_bit(X86_FEATURE_FXSR, c->x86_capability);
-		clear_bit(X86_FEATURE_XMM, c->x86_capability);
-	}
-
-	/* SEP disabled? */
-	if (disable_x86_sep)
-		clear_bit(X86_FEATURE_SEP, c->x86_capability);
-
-	if (disable_pse)
-		clear_bit(X86_FEATURE_PSE, c->x86_capability);
-
 	/* If the model name is still unset, do table lookup. */
 	if ( !c->x86_model_id[0] ) {
 		char *p;
@@ -489,13 +497,6 @@ static void __cpuinit identify_cpu(struc
 				c->x86, c->x86_model);
 	}
 
-	/* Now the feature flags better reflect actual CPU features! */
-
-	printk(KERN_DEBUG "CPU: After all inits, caps:");
-	for (i = 0; i < NCAPINTS; i++)
-		printk(" %08lx", c->x86_capability[i]);
-	printk("\n");
-
 	/*
 	 * On SMP, boot_cpu_data holds the common feature set between
 	 * all CPUs; so make sure that we indicate which features are
@@ -508,8 +509,14 @@ static void __cpuinit identify_cpu(struc
 			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
 	}
 
+	/* Clear all flags overriden by options */
+	for (i = 0; i < NCAPINTS; i++)
+		c->x86_capability[i] &= ~cleared_cpu_caps[i];
+
 	/* Init Machine Check Exception if available. */
 	mcheck_init(c);
+
+	select_idle_routine(c);
 }
 
 void __init identify_boot_cpu(void)
@@ -517,7 +524,6 @@ void __init identify_boot_cpu(void)
 	identify_cpu(&boot_cpu_data);
 	sysenter_setup();
 	enable_sep_cpu();
-	mtrr_bp_init();
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -574,6 +580,13 @@ void __cpuinit detect_ht(struct cpuinfo_
 }
 #endif
 
+static __init int setup_noclflush(char *arg)
+{
+	setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+	return 1;
+}
+__setup("noclflush", setup_noclflush);
+
 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 {
 	char *vendor = NULL;
@@ -597,6 +610,17 @@ void __cpuinit print_cpu_info(struct cpu
 	printk("\n");
 }
 
+static __init int setup_disablecpuid(char *arg)
+{
+	int bit;
+	if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+		setup_clear_cpu_cap(bit);
+	else
+		return 0;
+	return 1;
+}
+__setup("clearcpuid=", setup_disablecpuid);
+
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
 /* This is hacky. :)
@@ -606,16 +630,6 @@ cpumask_t cpu_initialized __cpuinitdata
  * They will insert themselves into the cpu_devs structure.
  * Then, when cpu_init() is called, we can just iterate over that array.
  */
-
-extern int intel_cpu_init(void);
-extern int cyrix_init_cpu(void);
-extern int nsc_init_cpu(void);
-extern int amd_init_cpu(void);
-extern int centaur_init_cpu(void);
-extern int transmeta_init_cpu(void);
-extern int nexgen_init_cpu(void);
-extern int umc_init_cpu(void);
-
 void __init early_cpu_init(void)
 {
 	intel_cpu_init();
@@ -627,21 +641,13 @@ void __init early_cpu_init(void)
 	nexgen_init_cpu();
 	umc_init_cpu();
 	early_cpu_detect();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	/* pse is not compatible with on-the-fly unmapping,
-	 * disable it even if the cpus claim to support it.
-	 */
-	clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
-	disable_pse = 1;
-#endif
 }
 
 /* Make sure %fs is initialized properly in idle threads */
-struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
+struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
 {
 	memset(regs, 0, sizeof(struct pt_regs));
-	regs->xfs = __KERNEL_PERCPU;
+	regs->fs = __KERNEL_PERCPU;
 	return regs;
 }
 
@@ -649,7 +655,7 @@ struct pt_regs * __devinit idle_regs(str
  * it's on the real one. */
 void switch_to_new_gdt(void)
 {
-	struct Xgt_desc_struct gdt_descr;
+	struct desc_ptr gdt_descr;
 	unsigned long va, frames[16];
 	int f;
 
@@ -692,12 +698,6 @@ void __cpuinit cpu_init(void)
 
 	if (cpu_has_vme || cpu_has_de)
 		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-	if (tsc_disable && cpu_has_tsc) {
-		printk(KERN_NOTICE "Disabling TSC...\n");
-		/**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
-		clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
-		set_in_cr4(X86_CR4_TSD);
-	}
 
 	switch_to_new_gdt();
 
@@ -710,7 +710,7 @@ void __cpuinit cpu_init(void)
 		BUG();
 	enter_lazy_tlb(&init_mm, curr);
 
-	load_esp0(t, thread);
+	load_sp0(t, thread);
 
 	load_LDT(&init_mm.context);
 
--- a/arch/x86/kernel/cpu/mtrr/main-xen.c
+++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
@@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = {
 
 struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
 unsigned int num_var_ranges;
-unsigned int *usage_table;
+unsigned int mtrr_usage_table[MAX_VAR_RANGES];
 
 static void __init set_num_var_ranges(void)
 {
@@ -52,17 +52,12 @@ static void __init init_table(void)
 	int i, max;
 
 	max = num_var_ranges;
-	if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
-	    == NULL) {
-		printk(KERN_ERR "mtrr: could not allocate\n");
-		return;
-	}
 	for (i = 0; i < max; i++)
-		usage_table[i] = 0;
+		mtrr_usage_table[i] = 0;
 }
 
 int mtrr_add_page(unsigned long base, unsigned long size,
-		  unsigned int type, char increment)
+		  unsigned int type, bool increment)
 {
 	int error;
 	struct xen_platform_op op;
@@ -81,7 +76,7 @@ int mtrr_add_page(unsigned long base, un
 	}
 
 	if (increment)
-		++usage_table[op.u.add_memtype.reg];
+		++mtrr_usage_table[op.u.add_memtype.reg];
 
 	mutex_unlock(&mtrr_mutex);
 
@@ -103,7 +98,7 @@ static int mtrr_check(unsigned long base
 
 int
 mtrr_add(unsigned long base, unsigned long size, unsigned int type,
-	 char increment)
+	 bool increment)
 {
 	if (mtrr_check(base, size))
 		return -EINVAL;
@@ -136,11 +131,11 @@ int mtrr_del_page(int reg, unsigned long
 			goto out;
 		}
 	}
-	if (usage_table[reg] < 1) {
+	if (mtrr_usage_table[reg] < 1) {
 		printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
 		goto out;
 	}
-	if (--usage_table[reg] < 1) {
+	if (--mtrr_usage_table[reg] < 1) {
 		op.cmd = XENPF_del_memtype;
 		op.u.del_memtype.handle = 0;
 		op.u.del_memtype.reg = reg;
1230 | --- a/arch/x86/kernel/e820_32-xen.c | |
1231 | +++ b/arch/x86/kernel/e820_32-xen.c | |
1232 | @@ -7,7 +7,6 @@ | |
1233 | #include <linux/kexec.h> | |
1234 | #include <linux/module.h> | |
1235 | #include <linux/mm.h> | |
1236 | -#include <linux/efi.h> | |
1237 | #include <linux/pfn.h> | |
1238 | #include <linux/uaccess.h> | |
1239 | #include <linux/suspend.h> | |
1240 | @@ -18,11 +17,6 @@ | |
1241 | #include <asm/setup.h> | |
1242 | #include <xen/interface/memory.h> | |
1243 | ||
1244 | -#ifdef CONFIG_EFI | |
1245 | -int efi_enabled = 0; | |
1246 | -EXPORT_SYMBOL(efi_enabled); | |
1247 | -#endif | |
1248 | - | |
1249 | struct e820map e820; | |
1250 | struct change_member { | |
1251 | struct e820entry *pbios; /* pointer to original bios entry */ | |
1252 | @@ -38,26 +32,6 @@ unsigned long pci_mem_start = 0x10000000 | |
1253 | EXPORT_SYMBOL(pci_mem_start); | |
1254 | #endif | |
1255 | extern int user_defined_memmap; | |
1256 | -struct resource data_resource = { | |
1257 | - .name = "Kernel data", | |
1258 | - .start = 0, | |
1259 | - .end = 0, | |
1260 | - .flags = IORESOURCE_BUSY | IORESOURCE_MEM | |
1261 | -}; | |
1262 | - | |
1263 | -struct resource code_resource = { | |
1264 | - .name = "Kernel code", | |
1265 | - .start = 0, | |
1266 | - .end = 0, | |
1267 | - .flags = IORESOURCE_BUSY | IORESOURCE_MEM | |
1268 | -}; | |
1269 | - | |
1270 | -struct resource bss_resource = { | |
1271 | - .name = "Kernel bss", | |
1272 | - .start = 0, | |
1273 | - .end = 0, | |
1274 | - .flags = IORESOURCE_BUSY | IORESOURCE_MEM | |
1275 | -}; | |
1276 | ||
1277 | static struct resource system_rom_resource = { | |
1278 | .name = "System ROM", | |
1279 | @@ -112,60 +86,6 @@ static struct resource video_rom_resourc | |
1280 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | |
1281 | }; | |
1282 | ||
1283 | -static struct resource video_ram_resource = { | |
1284 | - .name = "Video RAM area", | |
1285 | - .start = 0xa0000, | |
1286 | - .end = 0xbffff, | |
1287 | - .flags = IORESOURCE_BUSY | IORESOURCE_MEM | |
1288 | -}; | |
1289 | - | |
1290 | -static struct resource standard_io_resources[] = { { | |
1291 | - .name = "dma1", | |
1292 | - .start = 0x0000, | |
1293 | - .end = 0x001f, | |
1294 | - .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
1295 | -}, { | |
1296 | - .name = "pic1", | |
1297 | - .start = 0x0020, | |
1298 | - .end = 0x0021, | |
1299 | - .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
1300 | -}, { | |
1301 | - .name = "timer0", | |
1302 | - .start = 0x0040, | |
1303 | - .end = 0x0043, | |
1304 | - .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
1305 | -}, { | |
1306 | - .name = "timer1", | |
1307 | - .start = 0x0050, | |
1308 | - .end = 0x0053, | |
1309 | - .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
1310 | -}, { | |
1311 | - .name = "keyboard", | |
1312 | - .start = 0x0060, | |
1313 | - .end = 0x006f, | |
1314 | - .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
1315 | -}, { | |
1316 | - .name = "dma page reg", | |
1317 | - .start = 0x0080, | |
1318 | - .end = 0x008f, | |
1319 | - .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
1320 | -}, { | |
1321 | - .name = "pic2", | |
1322 | - .start = 0x00a0, | |
1323 | - .end = 0x00a1, | |
1324 | - .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
1325 | -}, { | |
1326 | - .name = "dma2", | |
1327 | - .start = 0x00c0, | |
1328 | - .end = 0x00df, | |
1329 | - .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
1330 | -}, { | |
1331 | - .name = "fpu", | |
1332 | - .start = 0x00f0, | |
1333 | - .end = 0x00ff, | |
1334 | - .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
1335 | -} }; | |
1336 | - | |
1337 | #define ROMSIGNATURE 0xaa55 | |
1338 | ||
1339 | static int __init romsignature(const unsigned char *rom) | |
1340 | @@ -272,10 +192,9 @@ static struct e820map machine_e820; | |
1341 | * Request address space for all standard RAM and ROM resources | |
1342 | * and also for regions reported as reserved by the e820. | |
1343 | */ | |
1344 | -static void __init | |
1345 | -legacy_init_iomem_resources(struct resource *code_resource, | |
1346 | - struct resource *data_resource, | |
1347 | - struct resource *bss_resource) | |
1348 | +void __init init_iomem_resources(struct resource *code_resource, | |
1349 | + struct resource *data_resource, | |
1350 | + struct resource *bss_resource) | |
1351 | { | |
1352 | int i; | |
1353 | ||
1354 | @@ -324,39 +243,6 @@ legacy_init_iomem_resources(struct resou | |
1355 | ||
1356 | #undef e820 | |
1357 | ||
1358 | -/* | |
1359 | - * Request address space for all standard resources | |
1360 | - * | |
1361 | - * This is called just before pcibios_init(), which is also a | |
1362 | - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). | |
1363 | - */ | |
1364 | -static int __init request_standard_resources(void) | |
1365 | -{ | |
1366 | - int i; | |
1367 | - | |
1368 | - /* Nothing to do if not running in dom0. */ | |
1369 | - if (!is_initial_xendomain()) | |
1370 | - return 0; | |
1371 | - | |
1372 | - printk("Setting up standard PCI resources\n"); | |
1373 | - if (efi_enabled) | |
1374 | - efi_initialize_iomem_resources(&code_resource, | |
1375 | - &data_resource, &bss_resource); | |
1376 | - else | |
1377 | - legacy_init_iomem_resources(&code_resource, | |
1378 | - &data_resource, &bss_resource); | |
1379 | - | |
1380 | - /* EFI systems may still have VGA */ | |
1381 | - request_resource(&iomem_resource, &video_ram_resource); | |
1382 | - | |
1383 | - /* request I/O space for devices used on all i[345]86 PCs */ | |
1384 | - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) | |
1385 | - request_resource(&ioport_resource, &standard_io_resources[i]); | |
1386 | - return 0; | |
1387 | -} | |
1388 | - | |
1389 | -subsys_initcall(request_standard_resources); | |
1390 | - | |
1391 | #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) | |
1392 | /** | |
1393 | * e820_mark_nosave_regions - Find the ranges of physical addresses that do not | |
1394 | @@ -393,19 +279,17 @@ void __init add_memory_region(unsigned l | |
1395 | { | |
1396 | int x; | |
1397 | ||
1398 | - if (!efi_enabled) { | |
1399 | - x = e820.nr_map; | |
1400 | - | |
1401 | - if (x == E820MAX) { | |
1402 | - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | |
1403 | - return; | |
1404 | - } | |
1405 | + x = e820.nr_map; | |
1406 | ||
1407 | - e820.map[x].addr = start; | |
1408 | - e820.map[x].size = size; | |
1409 | - e820.map[x].type = type; | |
1410 | - e820.nr_map++; | |
1411 | + if (x == E820MAX) { | |
1412 | + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | |
1413 | + return; | |
1414 | } | |
1415 | + | |
1416 | + e820.map[x].addr = start; | |
1417 | + e820.map[x].size = size; | |
1418 | + e820.map[x].type = type; | |
1419 | + e820.nr_map++; | |
1420 | } /* add_memory_region */ | |
1421 | ||
1422 | /* | |
1423 | @@ -642,29 +526,6 @@ int __init copy_e820_map(struct e820entr | |
1424 | } | |
1425 | ||
1426 | /* | |
1427 | - * Callback for efi_memory_walk. | |
1428 | - */ | |
1429 | -static int __init | |
1430 | -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) | |
1431 | -{ | |
1432 | - unsigned long *max_pfn = arg, pfn; | |
1433 | - | |
1434 | - if (start < end) { | |
1435 | - pfn = PFN_UP(end -1); | |
1436 | - if (pfn > *max_pfn) | |
1437 | - *max_pfn = pfn; | |
1438 | - } | |
1439 | - return 0; | |
1440 | -} | |
1441 | - | |
1442 | -static int __init | |
1443 | -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) | |
1444 | -{ | |
1445 | - memory_present(0, PFN_UP(start), PFN_DOWN(end)); | |
1446 | - return 0; | |
1447 | -} | |
1448 | - | |
1449 | -/* | |
1450 | * Find the highest page frame number we have available | |
1451 | */ | |
1452 | void __init find_max_pfn(void) | |
1453 | @@ -672,11 +533,6 @@ void __init find_max_pfn(void) | |
1454 | int i; | |
1455 | ||
1456 | max_pfn = 0; | |
1457 | - if (efi_enabled) { | |
1458 | - efi_memmap_walk(efi_find_max_pfn, &max_pfn); | |
1459 | - efi_memmap_walk(efi_memory_present_wrapper, NULL); | |
1460 | - return; | |
1461 | - } | |
1462 | ||
1463 | for (i = 0; i < e820.nr_map; i++) { | |
1464 | unsigned long start, end; | |
1465 | @@ -694,34 +550,12 @@ void __init find_max_pfn(void) | |
1466 | } | |
1467 | ||
1468 | /* | |
1469 | - * Free all available memory for boot time allocation. Used | |
1470 | - * as a callback function by efi_memory_walk() | |
1471 | - */ | |
1472 | - | |
1473 | -static int __init | |
1474 | -free_available_memory(unsigned long start, unsigned long end, void *arg) | |
1475 | -{ | |
1476 | - /* check max_low_pfn */ | |
1477 | - if (start >= (max_low_pfn << PAGE_SHIFT)) | |
1478 | - return 0; | |
1479 | - if (end >= (max_low_pfn << PAGE_SHIFT)) | |
1480 | - end = max_low_pfn << PAGE_SHIFT; | |
1481 | - if (start < end) | |
1482 | - free_bootmem(start, end - start); | |
1483 | - | |
1484 | - return 0; | |
1485 | -} | |
1486 | -/* | |
1487 | * Register fully available low RAM pages with the bootmem allocator. | |
1488 | */ | |
1489 | void __init register_bootmem_low_pages(unsigned long max_low_pfn) | |
1490 | { | |
1491 | int i; | |
1492 | ||
1493 | - if (efi_enabled) { | |
1494 | - efi_memmap_walk(free_available_memory, NULL); | |
1495 | - return; | |
1496 | - } | |
1497 | for (i = 0; i < e820.nr_map; i++) { | |
1498 | unsigned long curr_pfn, last_pfn, size; | |
1499 | /* | |
1500 | @@ -855,56 +689,12 @@ void __init print_memory_map(char *who) | |
1501 | } | |
1502 | } | |
1503 | ||
1504 | -static __init __always_inline void efi_limit_regions(unsigned long long size) | |
1505 | -{ | |
1506 | - unsigned long long current_addr = 0; | |
1507 | - efi_memory_desc_t *md, *next_md; | |
1508 | - void *p, *p1; | |
1509 | - int i, j; | |
1510 | - | |
1511 | - j = 0; | |
1512 | - p1 = memmap.map; | |
1513 | - for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { | |
1514 | - md = p; | |
1515 | - next_md = p1; | |
1516 | - current_addr = md->phys_addr + | |
1517 | - PFN_PHYS(md->num_pages); | |
1518 | - if (is_available_memory(md)) { | |
1519 | - if (md->phys_addr >= size) continue; | |
1520 | - memcpy(next_md, md, memmap.desc_size); | |
1521 | - if (current_addr >= size) { | |
1522 | - next_md->num_pages -= | |
1523 | - PFN_UP(current_addr-size); | |
1524 | - } | |
1525 | - p1 += memmap.desc_size; | |
1526 | - next_md = p1; | |
1527 | - j++; | |
1528 | - } else if ((md->attribute & EFI_MEMORY_RUNTIME) == | |
1529 | - EFI_MEMORY_RUNTIME) { | |
1530 | - /* In order to make runtime services | |
1531 | - * available we have to include runtime | |
1532 | - * memory regions in memory map */ | |
1533 | - memcpy(next_md, md, memmap.desc_size); | |
1534 | - p1 += memmap.desc_size; | |
1535 | - next_md = p1; | |
1536 | - j++; | |
1537 | - } | |
1538 | - } | |
1539 | - memmap.nr_map = j; | |
1540 | - memmap.map_end = memmap.map + | |
1541 | - (memmap.nr_map * memmap.desc_size); | |
1542 | -} | |
1543 | - | |
1544 | void __init limit_regions(unsigned long long size) | |
1545 | { | |
1546 | unsigned long long current_addr = 0; | |
1547 | int i; | |
1548 | ||
1549 | print_memory_map("limit_regions start"); | |
1550 | - if (efi_enabled) { | |
1551 | - efi_limit_regions(size); | |
1552 | - return; | |
1553 | - } | |
1554 | for (i = 0; i < e820.nr_map; i++) { | |
1555 | current_addr = e820.map[i].addr + e820.map[i].size; | |
1556 | if (current_addr < size) | |
1557 | @@ -1056,3 +846,44 @@ static int __init parse_memmap(char *arg | |
1558 | return 0; | |
1559 | } | |
1560 | early_param("memmap", parse_memmap); | |
1561 | + | |
1562 | +#ifndef CONFIG_XEN | |
1563 | +void __init update_memory_range(u64 start, u64 size, unsigned old_type, | |
1564 | + unsigned new_type) | |
1565 | +{ | |
1566 | + int i; | |
1567 | + | |
1568 | + BUG_ON(old_type == new_type); | |
1569 | + | |
1570 | + for (i = 0; i < e820.nr_map; i++) { | |
1571 | + struct e820entry *ei = &e820.map[i]; | |
1572 | + u64 final_start, final_end; | |
1573 | + if (ei->type != old_type) | |
1574 | + continue; | |
1575 | + /* totally covered? */ | |
1576 | + if (ei->addr >= start && ei->size <= size) { | |
1577 | + ei->type = new_type; | |
1578 | + continue; | |
1579 | + } | |
1580 | + /* partially covered */ | |
1581 | + final_start = max(start, ei->addr); | |
1582 | + final_end = min(start + size, ei->addr + ei->size); | |
1583 | + if (final_start >= final_end) | |
1584 | + continue; | |
1585 | + add_memory_region(final_start, final_end - final_start, | |
1586 | + new_type); | |
1587 | + } | |
1588 | +} | |
1589 | + | |
1590 | +void __init update_e820(void) | |
1591 | +{ | |
1592 | + u8 nr_map; | |
1593 | + | |
1594 | + nr_map = e820.nr_map; | |
1595 | + if (sanitize_e820_map(e820.map, &nr_map)) | |
1596 | + return; | |
1597 | + e820.nr_map = nr_map; | |
1598 | + printk(KERN_INFO "modified physical RAM map:\n"); | |
1599 | + print_memory_map("modified"); | |
1600 | +} | |
1601 | +#endif | |
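[Reviewer note] The update_memory_range() helper added above retypes every old_type entry intersecting [start, start+size): an entry the test considers fully covered is flipped in place, while a partial overlap gets a clipped new_type entry appended via add_memory_region(), leaving the resulting overlap for sanitize_e820_map() (via update_e820()) to resolve. A minimal user-space sketch of the same clipping arithmetic — the struct and names below are stand-ins, not kernel API:

	#include <stdio.h>
	#include <stdint.h>

	/* toy stand-in for struct e820entry */
	struct entry { uint64_t addr, size; unsigned type; };

	static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }
	static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

	/*
	 * Same logic as the patch's update_memory_range() for one entry:
	 * flip a fully covered entry in place; for a partial overlap report
	 * the clipped range the caller would append via add_memory_region().
	 */
	static void retype_one(struct entry *ei, uint64_t start, uint64_t size,
			       unsigned old_type, unsigned new_type,
			       uint64_t *new_start, uint64_t *new_len)
	{
		*new_len = 0;
		if (ei->type != old_type)
			return;
		if (ei->addr >= start && ei->size <= size) {	/* totally covered? */
			ei->type = new_type;
			return;
		}
		*new_start = max64(start, ei->addr);		/* partially covered */
		uint64_t end = min64(start + size, ei->addr + ei->size);
		if (*new_start >= end)
			return;					/* no overlap */
		*new_len = end - *new_start;
	}

	int main(void)
	{
		struct entry e = { 0x100000, 0x400000, 1 };	/* 4 MB of type 1 at 1 MB */
		uint64_t s, len;

		retype_one(&e, 0x200000, 0x100000, 1, 2, &s, &len);
		if (len)
			printf("append type-2 entry: %#llx + %#llx\n",
			       (unsigned long long)s, (unsigned long long)len);
		return 0;
	}

Note that the patch's "totally covered?" test compares sizes (ei->size <= size) rather than end addresses; the sketch mirrors the patch as-is.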
1602 | --- a/arch/x86/kernel/e820_64-xen.c | |
1603 | +++ b/arch/x86/kernel/e820_64-xen.c | |
1604 | @@ -1,4 +1,4 @@ | |
1605 | -/* | |
1606 | +/* | |
1607 | * Handle the memory map. | |
1608 | * The functions here do the job until bootmem takes over. | |
1609 | * | |
1610 | @@ -26,6 +26,7 @@ | |
1611 | #include <asm/proto.h> | |
1612 | #include <asm/setup.h> | |
1613 | #include <asm/sections.h> | |
1614 | +#include <asm/kdebug.h> | |
1615 | #include <xen/interface/memory.h> | |
1616 | ||
1617 | struct e820map e820 __initdata; | |
1618 | @@ -33,96 +34,103 @@ struct e820map e820 __initdata; | |
1619 | struct e820map machine_e820; | |
1620 | #endif | |
1621 | ||
1622 | -/* | |
1623 | +/* | |
1624 | * PFN of last memory page. | |
1625 | */ | |
1626 | -unsigned long end_pfn; | |
1627 | -EXPORT_SYMBOL(end_pfn); | |
1628 | +unsigned long end_pfn; | |
1629 | ||
1630 | -/* | |
1631 | +/* | |
1632 | * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. | |
1633 | * The direct mapping extends to end_pfn_map, so that we can directly access | |
1634 | * apertures, ACPI and other tables without having to play with fixmaps. | |
1635 | - */ | |
1636 | -unsigned long end_pfn_map; | |
1637 | + */ | |
1638 | +unsigned long end_pfn_map; | |
1639 | ||
1640 | -/* | |
1641 | +/* | |
1642 | * Last pfn which the user wants to use. | |
1643 | */ | |
1644 | static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; | |
1645 | ||
1646 | -extern struct resource code_resource, data_resource, bss_resource; | |
1647 | - | |
1648 | -/* Check for some hardcoded bad areas that early boot is not allowed to touch */ | |
1649 | -static inline int bad_addr(unsigned long *addrp, unsigned long size) | |
1650 | -{ | |
1651 | - unsigned long addr = *addrp, last = addr + size; | |
1652 | +/* | |
1653 | + * Early reserved memory areas. | |
1654 | + */ | |
1655 | +#define MAX_EARLY_RES 20 | |
1656 | ||
1657 | +struct early_res { | |
1658 | + unsigned long start, end; | |
1659 | + char name[16]; | |
1660 | +}; | |
1661 | +static struct early_res early_res[MAX_EARLY_RES] __initdata = { | |
1662 | #ifndef CONFIG_XEN | |
1663 | - /* various gunk below that needed for SMP startup */ | |
1664 | - if (addr < 0x8000) { | |
1665 | - *addrp = PAGE_ALIGN(0x8000); | |
1666 | - return 1; | |
1667 | - } | |
1668 | - | |
1669 | - /* direct mapping tables of the kernel */ | |
1670 | - if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { | |
1671 | - *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT); | |
1672 | - return 1; | |
1673 | - } | |
1674 | - | |
1675 | - /* initrd */ | |
1676 | -#ifdef CONFIG_BLK_DEV_INITRD | |
1677 | - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { | |
1678 | - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; | |
1679 | - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; | |
1680 | - unsigned long ramdisk_end = ramdisk_image+ramdisk_size; | |
1681 | - | |
1682 | - if (last >= ramdisk_image && addr < ramdisk_end) { | |
1683 | - *addrp = PAGE_ALIGN(ramdisk_end); | |
1684 | - return 1; | |
1685 | - } | |
1686 | - } | |
1687 | + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ | |
1688 | +#ifdef CONFIG_SMP | |
1689 | + { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" }, | |
1690 | #endif | |
1691 | - /* kernel code */ | |
1692 | - if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { | |
1693 | - *addrp = PAGE_ALIGN(__pa_symbol(&_end)); | |
1694 | - return 1; | |
1695 | - } | |
1696 | +#endif | |
1697 | + {} | |
1698 | +}; | |
1699 | ||
1700 | - if (last >= ebda_addr && addr < ebda_addr + ebda_size) { | |
1701 | - *addrp = PAGE_ALIGN(ebda_addr + ebda_size); | |
1702 | - return 1; | |
1703 | +void __init reserve_early(unsigned long start, unsigned long end, char *name) | |
1704 | +{ | |
1705 | + int i; | |
1706 | + struct early_res *r; | |
1707 | + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { | |
1708 | + r = &early_res[i]; | |
1709 | + if (end > r->start && start < r->end) | |
1710 | + panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n", | |
1711 | + start, end - 1, name?name:"", r->start, r->end - 1, r->name); | |
1712 | } | |
1713 | + if (i >= MAX_EARLY_RES) | |
1714 | + panic("Too many early reservations"); | |
1715 | + r = &early_res[i]; | |
1716 | + r->start = start; | |
1717 | + r->end = end; | |
1718 | + if (name) | |
1719 | + strncpy(r->name, name, sizeof(r->name) - 1); | |
1720 | +} | |
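[Reviewer note] The conflict test in reserve_early() is the standard half-open interval intersection: [start, end) and [r->start, r->end) overlap iff end > r->start && start < r->end. A quick stand-alone check of the predicate:

	#include <assert.h>

	/* half-open intervals [a0,a1) and [b0,b1) intersect iff a1 > b0 && a0 < b1 */
	static int overlaps(unsigned long a0, unsigned long a1,
			    unsigned long b0, unsigned long b1)
	{
		return a1 > b0 && a0 < b1;
	}

	int main(void)
	{
		assert(overlaps(0x1000, 0x2000, 0x1800, 0x3000));	/* partial overlap */
		assert(!overlaps(0x1000, 0x2000, 0x2000, 0x3000));	/* adjacent only */
		assert(overlaps(0x1000, 0x4000, 0x2000, 0x3000));	/* containment */
		return 0;
	}

Everything the table records is later handed wholesale to the bootmem allocator by early_res_to_bootmem() below.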
1721 | ||
1722 | -#ifdef CONFIG_NUMA | |
1723 | - /* NUMA memory to node map */ | |
1724 | - if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { | |
1725 | - *addrp = nodemap_addr + nodemap_size; | |
1726 | - return 1; | |
1727 | +void __init early_res_to_bootmem(void) | |
1728 | +{ | |
1729 | + int i; | |
1730 | + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { | |
1731 | + struct early_res *r = &early_res[i]; | |
1732 | + printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i, | |
1733 | + r->start, r->end - 1, r->name); | |
1734 | + reserve_bootmem_generic(r->start, r->end - r->start); | |
1735 | } | |
1736 | -#endif | |
1737 | - /* XXX ramdisk image here? */ | |
1738 | -#else | |
1739 | - if (last < (table_end<<PAGE_SHIFT)) { | |
1740 | - *addrp = table_end << PAGE_SHIFT; | |
1741 | - return 1; | |
1742 | +} | |
1743 | + | |
1744 | +/* Check for already reserved areas */ | |
1745 | +static inline int bad_addr(unsigned long *addrp, unsigned long size) | |
1746 | +{ | |
1747 | + int i; | |
1748 | + unsigned long addr = *addrp, last; | |
1749 | + int changed = 0; | |
1750 | +again: | |
1751 | + last = addr + size; | |
1752 | + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { | |
1753 | + struct early_res *r = &early_res[i]; | |
1754 | + if (last >= r->start && addr < r->end) { | |
1755 | + *addrp = addr = r->end; | |
1756 | + changed = 1; | |
1757 | + goto again; | |
1758 | + } | |
1759 | } | |
1760 | -#endif | |
1761 | - return 0; | |
1762 | -} | |
1763 | + return changed; | |
1764 | +} | |
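[Reviewer note] The reworked bad_addr() bumps a candidate range upward until it clears every early reservation, restarting the scan after each bump because the new position may collide with a different reservation. A user-space sketch of that retry loop (struct and names illustrative; the >= on last matches the patch):

	#include <stdio.h>

	struct res { unsigned long start, end; };

	/* push [*addr, *addr+size) past all reservations, as bad_addr() does */
	static int bump_past(unsigned long *addr, unsigned long size,
			     const struct res *res, int nr)
	{
		unsigned long last;
		int i, changed = 0;
	again:
		last = *addr + size;
		for (i = 0; i < nr; i++) {
			if (last >= res[i].start && *addr < res[i].end) {
				*addr = res[i].end;	/* hop over the blocker */
				changed = 1;
				goto again;		/* re-check from scratch */
			}
		}
		return changed;
	}

	int main(void)
	{
		struct res r[] = { { 0x8000, 0x9000 }, { 0x9000, 0xb000 } };
		unsigned long addr = 0x7000;

		bump_past(&addr, 0x2000, r, 2);
		printf("%#lx\n", addr);	/* hits both reservations in turn -> 0xb000 */
		return 0;
	}

The loop terminates because *addr strictly increases on every bump and the reservation table is small.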
1765 | ||
1766 | /* | |
1767 | * This function checks if any part of the range <start,end> is mapped | |
1768 | * with type. | |
1769 | */ | |
1770 | -int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) | |
1771 | -{ | |
1772 | +int | |
1773 | +e820_any_mapped(unsigned long start, unsigned long end, unsigned type) | |
1774 | +{ | |
1775 | int i; | |
1776 | ||
1777 | #ifndef CONFIG_XEN | |
1778 | - for (i = 0; i < e820.nr_map; i++) { | |
1779 | - struct e820entry *ei = &e820.map[i]; | |
1780 | + for (i = 0; i < e820.nr_map; i++) { | |
1781 | + struct e820entry *ei = &e820.map[i]; | |
1782 | #else | |
1783 | if (!is_initial_xendomain()) | |
1784 | return 0; | |
1785 | @@ -130,12 +138,12 @@ int e820_any_mapped(unsigned long start, | |
1786 | const struct e820entry *ei = &machine_e820.map[i]; | |
1787 | #endif | |
1788 | ||
1789 | - if (type && ei->type != type) | |
1790 | + if (type && ei->type != type) | |
1791 | continue; | |
1792 | if (ei->addr >= end || ei->addr + ei->size <= start) | |
1793 | - continue; | |
1794 | - return 1; | |
1795 | - } | |
1796 | + continue; | |
1797 | + return 1; | |
1798 | + } | |
1799 | return 0; | |
1800 | } | |
1801 | EXPORT_SYMBOL_GPL(e820_any_mapped); | |
1802 | @@ -146,7 +154,8 @@ EXPORT_SYMBOL_GPL(e820_any_mapped); | |
1803 | * Note: this function only works correct if the e820 table is sorted and | |
1804 | * not-overlapping, which is the case | |
1805 | */ | |
1806 | -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) | |
1807 | +int __init e820_all_mapped(unsigned long start, unsigned long end, | |
1808 | + unsigned type) | |
1809 | { | |
1810 | int i; | |
1811 | ||
1812 | @@ -171,65 +180,77 @@ int __init e820_all_mapped(unsigned long | |
1813 | */ | |
1814 | if (ei->addr <= start) | |
1815 | start = ei->addr + ei->size; | |
1816 | - /* if start is now at or beyond end, we're done, full coverage */ | |
1817 | + /* | |
1818 | + * if start is now at or beyond end, we're done, full | |
1819 | + * coverage | |
1820 | + */ | |
1821 | if (start >= end) | |
1822 | - return 1; /* we're done */ | |
1823 | + return 1; | |
1824 | } | |
1825 | return 0; | |
1826 | } | |
1827 | ||
1828 | -/* | |
1829 | - * Find a free area in a specific range. | |
1830 | - */ | |
1831 | -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) | |
1832 | -{ | |
1833 | - int i; | |
1834 | - for (i = 0; i < e820.nr_map; i++) { | |
1835 | - struct e820entry *ei = &e820.map[i]; | |
1836 | - unsigned long addr = ei->addr, last; | |
1837 | - if (ei->type != E820_RAM) | |
1838 | - continue; | |
1839 | - if (addr < start) | |
1840 | +/* | |
1841 | + * Find a free area with specified alignment in a specific range. | |
1842 | + */ | |
1843 | +unsigned long __init find_e820_area(unsigned long start, unsigned long end, | |
1844 | + unsigned size, unsigned long align) | |
1845 | +{ | |
1846 | + int i; | |
1847 | + unsigned long mask = ~(align - 1); | |
1848 | + | |
1849 | + for (i = 0; i < e820.nr_map; i++) { | |
1850 | + struct e820entry *ei = &e820.map[i]; | |
1851 | + unsigned long addr = ei->addr, last; | |
1852 | + | |
1853 | + if (ei->type != E820_RAM) | |
1854 | + continue; | |
1855 | + if (addr < start) | |
1856 | addr = start; | |
1857 | - if (addr > ei->addr + ei->size) | |
1858 | - continue; | |
1859 | + if (addr > ei->addr + ei->size) | |
1860 | + continue; | |
1861 | while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) | |
1862 | ; | |
1863 | - last = PAGE_ALIGN(addr) + size; | |
1864 | + addr = (addr + align - 1) & mask; | |
1865 | + last = addr + size; | |
1866 | if (last > ei->addr + ei->size) | |
1867 | continue; | |
1868 | - if (last > end) | |
1869 | + if (last > end) | |
1870 | continue; | |
1871 | - return addr; | |
1872 | - } | |
1873 | - return -1UL; | |
1874 | -} | |
1875 | + return addr; | |
1876 | + } | |
1877 | + return -1UL; | |
1878 | +} | |
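[Reviewer note] The new align argument is applied with the usual power-of-two align-up idiom, addr = (addr + align - 1) & ~(align - 1). This only works when align is a power of two (so the mask is a run of high 1-bits), which the early-boot callers guarantee. Stand-alone:

	#include <stdio.h>

	/* round x up to the next multiple of align; align must be a power of two */
	static unsigned long align_up(unsigned long x, unsigned long align)
	{
		return (x + align - 1) & ~(align - 1);
	}

	int main(void)
	{
		printf("%#lx\n", align_up(0x12345, 0x1000));	/* -> 0x13000 */
		printf("%#lx\n", align_up(0x13000, 0x1000));	/* aligned -> 0x13000 */
		return 0;
	}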
1879 | ||
1880 | /* | |
1881 | * Find the highest page frame number we have available | |
1882 | */ | |
1883 | unsigned long __init e820_end_of_ram(void) | |
1884 | { | |
1885 | - unsigned long end_pfn = 0; | |
1886 | + unsigned long end_pfn; | |
1887 | + | |
1888 | end_pfn = find_max_pfn_with_active_regions(); | |
1889 | - | |
1890 | - if (end_pfn > end_pfn_map) | |
1891 | + | |
1892 | + if (end_pfn > end_pfn_map) | |
1893 | end_pfn_map = end_pfn; | |
1894 | if (end_pfn_map > MAXMEM>>PAGE_SHIFT) | |
1895 | end_pfn_map = MAXMEM>>PAGE_SHIFT; | |
1896 | if (end_pfn > end_user_pfn) | |
1897 | end_pfn = end_user_pfn; | |
1898 | - if (end_pfn > end_pfn_map) | |
1899 | - end_pfn = end_pfn_map; | |
1900 | + if (end_pfn > end_pfn_map) | |
1901 | + end_pfn = end_pfn_map; | |
1902 | ||
1903 | - printk("end_pfn_map = %lu\n", end_pfn_map); | |
1904 | - return end_pfn; | |
1905 | + printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map); | |
1906 | + return end_pfn; | |
1907 | } | |
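[Reviewer note] e820_end_of_ram() now derives end_pfn from the active-region bookkeeping and applies three clamps in a fixed order: raise end_pfn_map to cover all of RAM, cap it at MAXMEM, then cap end_pfn by both the user's mem= limit (end_user_pfn) and the possibly reduced end_pfn_map. A condensed restatement of the chain, with made-up sample values:

	#include <stdio.h>

	static unsigned long min2(unsigned long a, unsigned long b)
	{ return a < b ? a : b; }
	static unsigned long max2(unsigned long a, unsigned long b)
	{ return a > b ? a : b; }

	int main(void)
	{
		unsigned long maxmem_pfn   = 0x3fffffff;  /* MAXMEM >> PAGE_SHIFT, illustrative */
		unsigned long end_pfn      = 0x100000;    /* 4 GB of RAM */
		unsigned long end_pfn_map  = 0xc0000;     /* stale: below actual RAM */
		unsigned long end_user_pfn = 0x80000;     /* mem=2G on the command line */

		end_pfn_map = max2(end_pfn_map, end_pfn); /* direct map covers all e820 */
		end_pfn_map = min2(end_pfn_map, maxmem_pfn);
		end_pfn = min2(end_pfn, end_user_pfn);    /* honor mem= */
		end_pfn = min2(end_pfn, end_pfn_map);
		printf("end_pfn=%#lx end_pfn_map=%#lx\n", end_pfn, end_pfn_map);
		return 0;
	}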
1908 | ||
1909 | /* | |
1910 | * Mark e820 reserved areas as busy for the resource manager. | |
1911 | */ | |
1912 | -void __init e820_reserve_resources(struct e820entry *e820, int nr_map) | |
1913 | +void __init e820_reserve_resources(struct e820entry *e820, int nr_map, | |
1914 | + struct resource *code_resource, | |
1915 | + struct resource *data_resource, | |
1916 | + struct resource *bss_resource) | |
1917 | { | |
1918 | int i; | |
1919 | for (i = 0; i < nr_map; i++) { | |
1920 | @@ -247,14 +268,14 @@ void __init e820_reserve_resources(struc | |
1921 | request_resource(&iomem_resource, res); | |
1922 | if (e820[i].type == E820_RAM) { | |
1923 | /* | |
1924 | - * We don't know which RAM region contains kernel data, | |
1925 | - * so we try it repeatedly and let the resource manager | |
1926 | - * test it. | |
1927 | + * We don't know which RAM region contains kernel data, | |
1928 | + * so we try it repeatedly and let the resource manager | |
1929 | + * test it. | |
1930 | */ | |
1931 | #ifndef CONFIG_XEN | |
1932 | - request_resource(res, &code_resource); | |
1933 | - request_resource(res, &data_resource); | |
1934 | - request_resource(res, &bss_resource); | |
1935 | + request_resource(res, code_resource); | |
1936 | + request_resource(res, data_resource); | |
1937 | + request_resource(res, bss_resource); | |
1938 | #endif | |
1939 | #ifdef CONFIG_KEXEC | |
1940 | if (crashk_res.start != crashk_res.end) | |
1941 | @@ -357,9 +378,9 @@ e820_register_active_regions(int nid, un | |
1942 | add_active_range(nid, ei_startpfn, ei_endpfn); | |
1943 | } | |
1944 | ||
1945 | -/* | |
1946 | +/* | |
1947 | * Add a memory region to the kernel e820 map. | |
1948 | - */ | |
1949 | + */ | |
1950 | void __init add_memory_region(unsigned long start, unsigned long size, int type) | |
1951 | { | |
1952 | int x = e820.nr_map; | |
1953 | @@ -384,9 +405,7 @@ unsigned long __init e820_hole_size(unsi | |
1954 | { | |
1955 | unsigned long start_pfn = start >> PAGE_SHIFT; | |
1956 | unsigned long end_pfn = end >> PAGE_SHIFT; | |
1957 | - unsigned long ei_startpfn; | |
1958 | - unsigned long ei_endpfn; | |
1959 | - unsigned long ram = 0; | |
1960 | + unsigned long ei_startpfn, ei_endpfn, ram = 0; | |
1961 | int i; | |
1962 | ||
1963 | for (i = 0; i < e820.nr_map; i++) { | |
1964 | @@ -398,28 +417,31 @@ unsigned long __init e820_hole_size(unsi | |
1965 | return end - start - (ram << PAGE_SHIFT); | |
1966 | } | |
1967 | ||
1968 | -void __init e820_print_map(char *who) | |
1969 | +static void __init e820_print_map(char *who) | |
1970 | { | |
1971 | int i; | |
1972 | ||
1973 | for (i = 0; i < e820.nr_map; i++) { | |
1974 | printk(KERN_INFO " %s: %016Lx - %016Lx ", who, | |
1975 | - (unsigned long long) e820.map[i].addr, | |
1976 | - (unsigned long long) (e820.map[i].addr + e820.map[i].size)); | |
1977 | + (unsigned long long) e820.map[i].addr, | |
1978 | + (unsigned long long) | |
1979 | + (e820.map[i].addr + e820.map[i].size)); | |
1980 | switch (e820.map[i].type) { | |
1981 | - case E820_RAM: printk("(usable)\n"); | |
1982 | - break; | |
1983 | + case E820_RAM: | |
1984 | + printk(KERN_CONT "(usable)\n"); | |
1985 | + break; | |
1986 | case E820_RESERVED: | |
1987 | - printk("(reserved)\n"); | |
1988 | - break; | |
1989 | + printk(KERN_CONT "(reserved)\n"); | |
1990 | + break; | |
1991 | case E820_ACPI: | |
1992 | - printk("(ACPI data)\n"); | |
1993 | - break; | |
1994 | + printk(KERN_CONT "(ACPI data)\n"); | |
1995 | + break; | |
1996 | case E820_NVS: | |
1997 | - printk("(ACPI NVS)\n"); | |
1998 | - break; | |
1999 | - default: printk("type %u\n", e820.map[i].type); | |
2000 | - break; | |
2001 | + printk(KERN_CONT "(ACPI NVS)\n"); | |
2002 | + break; | |
2003 | + default: | |
2004 | + printk(KERN_CONT "type %u\n", e820.map[i].type); | |
2005 | + break; | |
2006 | } | |
2007 | } | |
2008 | } | |
2009 | @@ -427,11 +449,11 @@ void __init e820_print_map(char *who) | |
2010 | /* | |
2011 | * Sanitize the BIOS e820 map. | |
2012 | * | |
2013 | - * Some e820 responses include overlapping entries. The following | |
2014 | + * Some e820 responses include overlapping entries. The following | |
2015 | * replaces the original e820 map with a new one, removing overlaps. | |
2016 | * | |
2017 | */ | |
2018 | -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) | |
2019 | +static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) | |
2020 | { | |
2021 | struct change_member { | |
2022 | struct e820entry *pbios; /* pointer to original bios entry */ | |
2023 | @@ -451,7 +473,8 @@ static int __init sanitize_e820_map(stru | |
2024 | int i; | |
2025 | ||
2026 | /* | |
2027 | - Visually we're performing the following (1,2,3,4 = memory types)... | |
2028 | + Visually we're performing the following | |
2029 | + (1,2,3,4 = memory types)... | |
2030 | ||
2031 | Sample memory map (w/overlaps): | |
2032 | ____22__________________ | |
2033 | @@ -493,22 +516,23 @@ static int __init sanitize_e820_map(stru | |
2034 | old_nr = *pnr_map; | |
2035 | ||
2036 | /* bail out if we find any unreasonable addresses in bios map */ | |
2037 | - for (i=0; i<old_nr; i++) | |
2038 | + for (i = 0; i < old_nr; i++) | |
2039 | if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) | |
2040 | return -1; | |
2041 | ||
2042 | /* create pointers for initial change-point information (for sorting) */ | |
2043 | - for (i=0; i < 2*old_nr; i++) | |
2044 | + for (i = 0; i < 2 * old_nr; i++) | |
2045 | change_point[i] = &change_point_list[i]; | |
2046 | ||
2047 | /* record all known change-points (starting and ending addresses), | |
2048 | omitting those that are for empty memory regions */ | |
2049 | chgidx = 0; | |
2050 | - for (i=0; i < old_nr; i++) { | |
2051 | + for (i = 0; i < old_nr; i++) { | |
2052 | if (biosmap[i].size != 0) { | |
2053 | change_point[chgidx]->addr = biosmap[i].addr; | |
2054 | change_point[chgidx++]->pbios = &biosmap[i]; | |
2055 | - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; | |
2056 | + change_point[chgidx]->addr = biosmap[i].addr + | |
2057 | + biosmap[i].size; | |
2058 | change_point[chgidx++]->pbios = &biosmap[i]; | |
2059 | } | |
2060 | } | |
2061 | @@ -518,75 +542,106 @@ static int __init sanitize_e820_map(stru | |
2062 | still_changing = 1; | |
2063 | while (still_changing) { | |
2064 | still_changing = 0; | |
2065 | - for (i=1; i < chg_nr; i++) { | |
2066 | - /* if <current_addr> > <last_addr>, swap */ | |
2067 | - /* or, if current=<start_addr> & last=<end_addr>, swap */ | |
2068 | - if ((change_point[i]->addr < change_point[i-1]->addr) || | |
2069 | - ((change_point[i]->addr == change_point[i-1]->addr) && | |
2070 | - (change_point[i]->addr == change_point[i]->pbios->addr) && | |
2071 | - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) | |
2072 | - ) | |
2073 | - { | |
2074 | + for (i = 1; i < chg_nr; i++) { | |
2075 | + unsigned long long curaddr, lastaddr; | |
2076 | + unsigned long long curpbaddr, lastpbaddr; | |
2077 | + | |
2078 | + curaddr = change_point[i]->addr; | |
2079 | + lastaddr = change_point[i - 1]->addr; | |
2080 | + curpbaddr = change_point[i]->pbios->addr; | |
2081 | + lastpbaddr = change_point[i - 1]->pbios->addr; | |
2082 | + | |
2083 | + /* | |
2084 | + * swap entries, when: | |
2085 | + * | |
2086 | + * curaddr > lastaddr or | |
2087 | + * curaddr == lastaddr and curaddr == curpbaddr and | |
2088 | + * lastaddr != lastpbaddr | |
2089 | + */ | |
2090 | + if (curaddr < lastaddr || | |
2091 | + (curaddr == lastaddr && curaddr == curpbaddr && | |
2092 | + lastaddr != lastpbaddr)) { | |
2093 | change_tmp = change_point[i]; | |
2094 | change_point[i] = change_point[i-1]; | |
2095 | change_point[i-1] = change_tmp; | |
2096 | - still_changing=1; | |
2097 | + still_changing = 1; | |
2098 | } | |
2099 | } | |
2100 | } | |
2101 | ||
2102 | /* create a new bios memory map, removing overlaps */ | |
2103 | - overlap_entries=0; /* number of entries in the overlap table */ | |
2104 | - new_bios_entry=0; /* index for creating new bios map entries */ | |
2105 | + overlap_entries = 0; /* number of entries in the overlap table */ | |
2106 | + new_bios_entry = 0; /* index for creating new bios map entries */ | |
2107 | last_type = 0; /* start with undefined memory type */ | |
2108 | last_addr = 0; /* start with 0 as last starting address */ | |
2109 | + | |
2110 | /* loop through change-points, determining affect on the new bios map */ | |
2111 | - for (chgidx=0; chgidx < chg_nr; chgidx++) | |
2112 | - { | |
2113 | + for (chgidx = 0; chgidx < chg_nr; chgidx++) { | |
2114 | /* keep track of all overlapping bios entries */ | |
2115 | - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) | |
2116 | - { | |
2117 | - /* add map entry to overlap list (> 1 entry implies an overlap) */ | |
2118 | - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; | |
2119 | - } | |
2120 | - else | |
2121 | - { | |
2122 | - /* remove entry from list (order independent, so swap with last) */ | |
2123 | - for (i=0; i<overlap_entries; i++) | |
2124 | - { | |
2125 | - if (overlap_list[i] == change_point[chgidx]->pbios) | |
2126 | - overlap_list[i] = overlap_list[overlap_entries-1]; | |
2127 | + if (change_point[chgidx]->addr == | |
2128 | + change_point[chgidx]->pbios->addr) { | |
2129 | + /* | |
2130 | + * add map entry to overlap list (> 1 entry | |
2131 | + * implies an overlap) | |
2132 | + */ | |
2133 | + overlap_list[overlap_entries++] = | |
2134 | + change_point[chgidx]->pbios; | |
2135 | + } else { | |
2136 | + /* | |
2137 | + * remove entry from list (order independent, | |
2138 | + * so swap with last) | |
2139 | + */ | |
2140 | + for (i = 0; i < overlap_entries; i++) { | |
2141 | + if (overlap_list[i] == | |
2142 | + change_point[chgidx]->pbios) | |
2143 | + overlap_list[i] = | |
2144 | + overlap_list[overlap_entries-1]; | |
2145 | } | |
2146 | overlap_entries--; | |
2147 | } | |
2148 | - /* if there are overlapping entries, decide which "type" to use */ | |
2149 | - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ | |
2150 | + /* | |
2151 | + * if there are overlapping entries, decide which | |
2152 | + * "type" to use (larger value takes precedence -- | |
2153 | + * 1=usable, 2,3,4,4+=unusable) | |
2154 | + */ | |
2155 | current_type = 0; | |
2156 | - for (i=0; i<overlap_entries; i++) | |
2157 | + for (i = 0; i < overlap_entries; i++) | |
2158 | if (overlap_list[i]->type > current_type) | |
2159 | current_type = overlap_list[i]->type; | |
2160 | - /* continue building up new bios map based on this information */ | |
2161 | + /* | |
2162 | + * continue building up new bios map based on this | |
2163 | + * information | |
2164 | + */ | |
2165 | if (current_type != last_type) { | |
2166 | if (last_type != 0) { | |
2167 | new_bios[new_bios_entry].size = | |
2168 | change_point[chgidx]->addr - last_addr; | |
2169 | - /* move forward only if the new size was non-zero */ | |
2170 | + /* | |
2171 | + * move forward only if the new size | |
2172 | + * was non-zero | |
2173 | + */ | |
2174 | if (new_bios[new_bios_entry].size != 0) | |
2175 | + /* | |
2176 | + * no more space left for new | |
2177 | + * bios entries ? | |
2178 | + */ | |
2179 | if (++new_bios_entry >= E820MAX) | |
2180 | - break; /* no more space left for new bios entries */ | |
2181 | + break; | |
2182 | } | |
2183 | if (current_type != 0) { | |
2184 | - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; | |
2185 | + new_bios[new_bios_entry].addr = | |
2186 | + change_point[chgidx]->addr; | |
2187 | new_bios[new_bios_entry].type = current_type; | |
2188 | - last_addr=change_point[chgidx]->addr; | |
2189 | + last_addr = change_point[chgidx]->addr; | |
2190 | } | |
2191 | last_type = current_type; | |
2192 | } | |
2193 | } | |
2194 | - new_nr = new_bios_entry; /* retain count for new bios entries */ | |
2195 | + /* retain count for new bios entries */ | |
2196 | + new_nr = new_bios_entry; | |
2197 | ||
2198 | /* copy new bios mapping into original location */ | |
2199 | - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); | |
2200 | + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); | |
2201 | *pnr_map = new_nr; | |
2202 | ||
2203 | return 0; | |
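[Reviewer note] The change-point machinery above is easier to see on a toy map. The sketch below re-implements the same idea — collect every entry boundary, sweep left to right, and let the highest type win between boundaries — in a deliberately simple O(n^2) form rather than the kernel's sorted change-point/overlap-list bookkeeping:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>

	struct entry { uint64_t addr, size; unsigned type; };

	static int cmp_u64(const void *a, const void *b)
	{
		uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;
		return (x > y) - (x < y);
	}

	int main(void)
	{
		/* overlapping toy map: a reserved window punched into usable RAM */
		struct entry map[] = {
			{ 0x00000, 0xa0000, 1 },	/* usable */
			{ 0x90000, 0x10000, 2 },	/* reserved, overlaps the tail */
		};
		const int n = sizeof(map) / sizeof(map[0]);
		uint64_t pts[4];	/* two boundaries per entry */
		uint64_t seg_start = 0;
		unsigned last_type = 0;
		int i, j, np = 0;

		for (i = 0; i < n; i++) {
			pts[np++] = map[i].addr;
			pts[np++] = map[i].addr + map[i].size;
		}
		qsort(pts, np, sizeof(pts[0]), cmp_u64);

		/* between consecutive boundaries the covering set is constant;
		 * the highest type wins (1=usable, 2,3,4+=unusable) */
		for (i = 0; i + 1 < np; i++) {
			unsigned t = 0;

			if (pts[i] == pts[i + 1])
				continue;
			for (j = 0; j < n; j++)
				if (map[j].addr <= pts[i] &&
				    pts[i] < map[j].addr + map[j].size &&
				    map[j].type > t)
					t = map[j].type;
			if (t != last_type) {
				if (last_type)
					printf("%#llx-%#llx type %u\n",
					       (unsigned long long)seg_start,
					       (unsigned long long)pts[i], last_type);
				seg_start = pts[i];
				last_type = t;
			}
		}
		if (last_type)
			printf("%#llx-%#llx type %u\n",
			       (unsigned long long)seg_start,
			       (unsigned long long)pts[np - 1], last_type);
		return 0;
	}

With the two-entry map above this prints 0-0x90000 type 1 followed by 0x90000-0xa0000 type 2: the reserved window wins over the RAM it overlaps, which is exactly the precedence rule the comments describe (larger type value takes precedence).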
2204 | @@ -601,7 +656,7 @@ static int __init sanitize_e820_map(stru | |
2205 | * will have given us a memory map that we can use to properly | |
2206 | * set up memory. If we aren't, we'll fake a memory map. | |
2207 | */ | |
2208 | -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) | |
2209 | +static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) | |
2210 | { | |
2211 | #ifndef CONFIG_XEN | |
2212 | /* Only one memory region (or negative)? Ignore it */ | |
2213 | @@ -622,7 +677,7 @@ static int __init copy_e820_map(struct e | |
2214 | return -1; | |
2215 | ||
2216 | add_memory_region(start, size, type); | |
2217 | - } while (biosmap++,--nr_map); | |
2218 | + } while (biosmap++, --nr_map); | |
2219 | ||
2220 | #ifdef CONFIG_XEN | |
2221 | if (is_initial_xendomain()) { | |
2222 | @@ -641,15 +696,17 @@ static int __init copy_e820_map(struct e | |
2223 | return 0; | |
2224 | } | |
2225 | ||
2226 | -void early_panic(char *msg) | |
2227 | +static void early_panic(char *msg) | |
2228 | { | |
2229 | early_printk(msg); | |
2230 | panic(msg); | |
2231 | } | |
2232 | ||
2233 | -#ifndef CONFIG_XEN | |
2234 | -void __init setup_memory_region(void) | |
2235 | +/* We're not void only for x86 32-bit compat */ | |
2236 | +char * __init machine_specific_memory_setup(void) | |
2237 | { | |
2238 | +#ifndef CONFIG_XEN | |
2239 | + char *who = "BIOS-e820"; | |
2240 | /* | |
2241 | * Try to copy the BIOS-supplied E820-map. | |
2242 | * | |
2243 | @@ -659,14 +716,8 @@ void __init setup_memory_region(void) | |
2244 | sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); | |
2245 | if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) | |
2246 | early_panic("Cannot find a valid memory map"); | |
2247 | - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | |
2248 | - e820_print_map("BIOS-e820"); | |
2249 | -} | |
2250 | - | |
2251 | #else /* CONFIG_XEN */ | |
2252 | - | |
2253 | -void __init setup_memory_region(void) | |
2254 | -{ | |
2255 | + char *who = "Xen"; | |
2256 | int rc; | |
2257 | struct xen_memory_map memmap; | |
2258 | /* | |
2259 | @@ -694,11 +745,13 @@ void __init setup_memory_region(void) | |
2260 | ||
2261 | if (copy_e820_map(map, (char)memmap.nr_entries) < 0) | |
2262 | early_panic("Cannot find a valid memory map"); | |
2263 | - | |
2264 | +#endif | |
2265 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | |
2266 | - e820_print_map("Xen"); | |
2267 | + e820_print_map(who); | |
2268 | + | |
2269 | + /* In case someone cares... */ | |
2270 | + return who; | |
2271 | } | |
2272 | -#endif | |
2273 | ||
2274 | static int __init parse_memopt(char *p) | |
2275 | { | |
2276 | @@ -709,7 +762,7 @@ static int __init parse_memopt(char *p) | |
2277 | if (!p) | |
2278 | return -EINVAL; | |
2279 | end_user_pfn = memparse(p, &p); | |
2280 | - end_user_pfn >>= PAGE_SHIFT; | |
2281 | + end_user_pfn >>= PAGE_SHIFT; | |
2282 | ||
2283 | end = end_user_pfn<<PAGE_SHIFT; | |
2284 | i = e820.nr_map-1; | |
2285 | @@ -727,7 +780,7 @@ static int __init parse_memopt(char *p) | |
2286 | } | |
2287 | ||
2288 | return 0; | |
2289 | -} | |
2290 | +} | |
2291 | early_param("mem", parse_memopt); | |
2292 | ||
2293 | static int userdef __initdata; | |
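[Reviewer note] parse_memopt() above delegates the actual number parsing to memparse(), which reads an integer with an optional K/M/G suffix and advances the caller's cursor, so mem=512M ends up as end_user_pfn = 512 MB >> PAGE_SHIFT (assuming the usual 4 KB pages, PAGE_SHIFT == 12). A stripped-down illustration of that suffix handling — not the kernel's lib/cmdline.c, just the idea:

	#include <stdio.h>
	#include <stdlib.h>

	/* minimal memparse()-alike: integer with optional K/M/G suffix */
	static unsigned long long memparse_lite(const char *p, char **retp)
	{
		unsigned long long v = strtoull(p, retp, 0);	/* base 0: 0x.., 0.., decimal */

		switch (**retp) {
		case 'G': case 'g':
			v <<= 10;	/* fall through */
		case 'M': case 'm':
			v <<= 10;	/* fall through */
		case 'K': case 'k':
			v <<= 10;
			(*retp)++;
		}
		return v;
	}

	int main(void)
	{
		char *end;
		unsigned long long bytes = memparse_lite("512M", &end);

		printf("%llu bytes -> end_user_pfn %llu\n", bytes, bytes >> 12);
		return 0;
	}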
2294 | @@ -739,9 +792,9 @@ static int __init parse_memmap_opt(char | |
2295 | ||
2296 | if (!strcmp(p, "exactmap")) { | |
2297 | #ifdef CONFIG_CRASH_DUMP | |
2298 | - /* If we are doing a crash dump, we | |
2299 | - * still need to know the real mem | |
2300 | - * size before original memory map is | |
2301 | + /* | |
2302 | + * If we are doing a crash dump, we still need to know | |
2303 | + * the real mem size before original memory map is | |
2304 | * reset. | |
2305 | */ | |
2306 | e820_register_active_regions(0, 0, -1UL); | |
2307 | @@ -758,6 +811,8 @@ static int __init parse_memmap_opt(char | |
2308 | mem_size = memparse(p, &p); | |
2309 | if (p == oldp) | |
2310 | return -EINVAL; | |
2311 | + | |
2312 | + userdef = 1; | |
2313 | if (*p == '@') { | |
2314 | start_at = memparse(p+1, &p); | |
2315 | add_memory_region(start_at, mem_size, E820_RAM); | |
2316 | @@ -777,11 +832,58 @@ early_param("memmap", parse_memmap_opt); | |
2317 | void __init finish_e820_parsing(void) | |
2318 | { | |
2319 | if (userdef) { | |
2320 | + char nr = e820.nr_map; | |
2321 | + | |
2322 | + if (sanitize_e820_map(e820.map, &nr) < 0) | |
2323 | + early_panic("Invalid user supplied memory map"); | |
2324 | + e820.nr_map = nr; | |
2325 | + | |
2326 | printk(KERN_INFO "user-defined physical RAM map:\n"); | |
2327 | e820_print_map("user"); | |
2328 | } | |
2329 | } | |
2330 | ||
2331 | +#ifndef CONFIG_XEN | |
2332 | +void __init update_memory_range(u64 start, u64 size, unsigned old_type, | |
2333 | + unsigned new_type) | |
2334 | +{ | |
2335 | + int i; | |
2336 | + | |
2337 | + BUG_ON(old_type == new_type); | |
2338 | + | |
2339 | + for (i = 0; i < e820.nr_map; i++) { | |
2340 | + struct e820entry *ei = &e820.map[i]; | |
2341 | + u64 final_start, final_end; | |
2342 | + if (ei->type != old_type) | |
2343 | + continue; | |
2344 | + /* totally covered? */ | |
2345 | + if (ei->addr >= start && ei->size <= size) { | |
2346 | + ei->type = new_type; | |
2347 | + continue; | |
2348 | + } | |
2349 | + /* partially covered */ | |
2350 | + final_start = max(start, ei->addr); | |
2351 | + final_end = min(start + size, ei->addr + ei->size); | |
2352 | + if (final_start >= final_end) | |
2353 | + continue; | |
2354 | + add_memory_region(final_start, final_end - final_start, | |
2355 | + new_type); | |
2356 | + } | |
2357 | +} | |
2358 | + | |
2359 | +void __init update_e820(void) | |
2360 | +{ | |
2361 | + u8 nr_map; | |
2362 | + | |
2363 | + nr_map = e820.nr_map; | |
2364 | + if (sanitize_e820_map(e820.map, &nr_map)) | |
2365 | + return; | |
2366 | + e820.nr_map = nr_map; | |
2367 | + printk(KERN_INFO "modified physical RAM map:\n"); | |
2368 | + e820_print_map("modified"); | |
2369 | +} | |
2370 | +#endif | |
2371 | + | |
2372 | unsigned long pci_mem_start = 0xaeedbabe; | |
2373 | EXPORT_SYMBOL(pci_mem_start); | |
2374 | ||
2375 | @@ -825,8 +927,10 @@ __init void e820_setup_gap(struct e820en | |
2376 | ||
2377 | if (!found) { | |
2378 | gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; | |
2379 | - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" | |
2380 | - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); | |
2381 | + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " | |
2382 | + "address range\n" | |
2383 | + KERN_ERR "PCI: Unassigned devices with 32bit resource " | |
2384 | + "registers may break!\n"); | |
2385 | } | |
2386 | ||
2387 | /* | |
2388 | @@ -839,8 +943,9 @@ __init void e820_setup_gap(struct e820en | |
2389 | /* Fun with two's complement */ | |
2390 | pci_mem_start = (gapstart + round) & -round; | |
2391 | ||
2392 | - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", | |
2393 | - pci_mem_start, gapstart, gapsize); | |
2394 | + printk(KERN_INFO | |
2395 | + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", | |
2396 | + pci_mem_start, gapstart, gapsize); | |
2397 | } | |
2398 | ||
2399 | int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) | |
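[Reviewer note] The gap placement above hinges on the "fun with two's complement" line: for a power-of-two round, -round equals ~(round - 1), so (gapstart + round) & -round rounds gapstart up to the next round boundary strictly above it — unlike the plain align-up idiom, it moves even an already-aligned value to the following boundary. For example:

	#include <stdio.h>

	int main(void)
	{
		unsigned long gapstart = 0xdfe00001, round = 0x100000;	/* sample values */

		/* -round == ~(round - 1) when round is a power of two */
		unsigned long start = (gapstart + round) & -round;

		printf("%#lx\n", start);	/* -> 0xdff00000 */
		return 0;
	}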
2400 | --- a/arch/x86/kernel/early_printk-xen.c | |
2401 | +++ b/arch/x86/kernel/early_printk-xen.c | |
2402 | @@ -222,7 +222,7 @@ static struct console simnow_console = { | |
2403 | }; | |
2404 | ||
2405 | /* Direct interface for emergencies */ | |
2406 | -struct console *early_console = &early_vga_console; | |
2407 | +static struct console *early_console = &early_vga_console; | |
2408 | static int early_console_initialized = 0; | |
2409 | ||
2410 | void early_printk(const char *fmt, ...) | |
2411 | --- a/arch/x86/kernel/entry_32-xen.S | |
2412 | +++ b/arch/x86/kernel/entry_32-xen.S | |
2413 | @@ -59,7 +59,7 @@ | |
2414 | * for paravirtualization. The following will never clobber any registers: | |
2415 | * INTERRUPT_RETURN (aka. "iret") | |
2416 | * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") | |
2417 | - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). | |
2418 | + * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit"). | |
2419 | * | |
2420 | * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must | |
2421 | * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). | |
2422 | @@ -282,16 +282,21 @@ END(resume_kernel) | |
2423 | #endif | |
2424 | CFI_ENDPROC | |
2425 | ||
2426 | + .macro test_tif ti_reg # system call tracing in operation / emulation | |
2427 | + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ | |
2428 | + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg) | |
2429 | + .endm | |
2430 | + | |
2431 | /* SYSENTER_RETURN points to after the "sysenter" instruction in | |
2432 | the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ | |
2433 | ||
2434 | # sysenter call handler stub | |
2435 | -ENTRY(sysenter_entry) | |
2436 | +ENTRY(ia32_sysenter_target) | |
2437 | CFI_STARTPROC simple | |
2438 | CFI_SIGNAL_FRAME | |
2439 | CFI_DEF_CFA esp, 0 | |
2440 | CFI_REGISTER esp, ebp | |
2441 | - movl SYSENTER_stack_esp0(%esp),%esp | |
2442 | + movl SYSENTER_stack_sp0(%esp),%esp | |
2443 | sysenter_past_esp: | |
2444 | /* | |
2445 | * No need to follow this irqs on/off section: the syscall | |
2446 | @@ -334,9 +339,7 @@ sysenter_past_esp: | |
2447 | CFI_ADJUST_CFA_OFFSET 4 | |
2448 | SAVE_ALL | |
2449 | GET_THREAD_INFO(%ebp) | |
2450 | - | |
2451 | - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ | |
2452 | - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) | |
2453 | + test_tif %ebp | |
2454 | jnz syscall_trace_entry | |
2455 | cmpl $(nr_syscalls), %eax | |
2456 | jae syscall_badsys | |
2457 | @@ -354,7 +357,7 @@ sysenter_past_esp: | |
2458 | xorl %ebp,%ebp | |
2459 | TRACE_IRQS_ON | |
2460 | 1: mov PT_FS(%esp), %fs | |
2461 | - ENABLE_INTERRUPTS_SYSEXIT | |
2462 | + ENABLE_INTERRUPTS_SYSCALL_RET | |
2463 | CFI_ENDPROC | |
2464 | .pushsection .fixup,"ax" | |
2465 | 2: movl $0,PT_FS(%esp) | |
2466 | @@ -363,10 +366,10 @@ sysenter_past_esp: | |
2467 | .align 4 | |
2468 | .long 1b,2b | |
2469 | .popsection | |
2470 | -ENDPROC(sysenter_entry) | |
2471 | +ENDPROC(ia32_sysenter_target) | |
2472 | ||
2473 | # pv sysenter call handler stub | |
2474 | -ENTRY(sysenter_entry_pv) | |
2475 | +ENTRY(ia32pv_sysenter_target) | |
2476 | RING0_INT_FRAME | |
2477 | movl $__USER_DS,16(%esp) | |
2478 | movl %ebp,12(%esp) | |
2479 | @@ -389,7 +392,7 @@ ENTRY(sysenter_entry_pv) | |
2480 | .previous | |
2481 | /* fall through */ | |
2482 | CFI_ENDPROC | |
2483 | -ENDPROC(sysenter_entry_pv) | |
2484 | +ENDPROC(ia32pv_sysenter_target) | |
2485 | ||
2486 | # system call handler stub | |
2487 | ENTRY(system_call) | |
2488 | @@ -398,9 +401,7 @@ ENTRY(system_call) | |
2489 | CFI_ADJUST_CFA_OFFSET 4 | |
2490 | SAVE_ALL | |
2491 | GET_THREAD_INFO(%ebp) | |
2492 | - # system call tracing in operation / emulation | |
2493 | - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ | |
2494 | - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) | |
2495 | + test_tif %ebp | |
2496 | jnz syscall_trace_entry | |
2497 | cmpl $(nr_syscalls), %eax | |
2498 | jae syscall_badsys | |
2499 | @@ -452,7 +453,8 @@ restore_nocheck_notrace: | |
2500 | RESTORE_REGS | |
2501 | addl $4, %esp # skip orig_eax/error_code | |
2502 | CFI_ADJUST_CFA_OFFSET -4 | |
2503 | -1: INTERRUPT_RETURN | |
2504 | +irq_return: | |
2505 | + INTERRUPT_RETURN | |
2506 | .section .fixup,"ax" | |
2507 | iret_exc: | |
2508 | pushl $0 # no error code | |
2509 | @@ -461,7 +463,7 @@ iret_exc: | |
2510 | .previous | |
2511 | .section __ex_table,"a" | |
2512 | .align 4 | |
2513 | - .long 1b,iret_exc | |
2514 | + .long irq_return,iret_exc | |
2515 | .previous | |
2516 | ||
2517 | CFI_RESTORE_STATE | |
2518 | @@ -657,7 +659,7 @@ END(syscall_badsys) | |
2519 | * Build the entry stubs and pointer table with | |
2520 | * some assembler magic. | |
2521 | */ | |
2522 | -.data | |
2523 | +.section .rodata,"a" | |
2524 | ENTRY(interrupt) | |
2525 | .text | |
2526 | ||
2527 | @@ -959,7 +961,7 @@ END(device_not_available) | |
2528 | * that sets up the real kernel stack. Check here, since we can't | |
2529 | * allow the wrong stack to be used. | |
2530 | * | |
2531 | - * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have | |
2532 | + * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have | |
2533 | * already pushed 3 words if it hits on the sysenter instruction: | |
2534 | * eflags, cs and eip. | |
2535 | * | |
2536 | @@ -971,7 +973,7 @@ END(device_not_available) | |
2537 | cmpw $__KERNEL_CS,4(%esp); \ | |
2538 | jne ok; \ | |
2539 | label: \ | |
2540 | - movl SYSENTER_stack_esp0+offset(%esp),%esp; \ | |
2541 | + movl SYSENTER_stack_sp0+offset(%esp),%esp; \ | |
2542 | CFI_DEF_CFA esp, 0; \ | |
2543 | CFI_UNDEFINED eip; \ | |
2544 | pushfl; \ | |
2545 | @@ -986,7 +988,7 @@ label: \ | |
2546 | KPROBE_ENTRY(debug) | |
2547 | RING0_INT_FRAME | |
2548 | #ifndef CONFIG_XEN | |
2549 | - cmpl $sysenter_entry,(%esp) | |
2550 | + cmpl $ia32_sysenter_target,(%esp) | |
2551 | jne debug_stack_correct | |
2552 | FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) | |
2553 | debug_stack_correct: | |
2554 | @@ -1019,7 +1021,7 @@ KPROBE_ENTRY(nmi) | |
2555 | popl %eax | |
2556 | CFI_ADJUST_CFA_OFFSET -4 | |
2557 | je nmi_espfix_stack | |
2558 | - cmpl $sysenter_entry,(%esp) | |
2559 | + cmpl $ia32_sysenter_target,(%esp) | |
2560 | je nmi_stack_fixup | |
2561 | pushl %eax | |
2562 | CFI_ADJUST_CFA_OFFSET 4 | |
2563 | @@ -1032,7 +1034,7 @@ KPROBE_ENTRY(nmi) | |
2564 | popl %eax | |
2565 | CFI_ADJUST_CFA_OFFSET -4 | |
2566 | jae nmi_stack_correct | |
2567 | - cmpl $sysenter_entry,12(%esp) | |
2568 | + cmpl $ia32_sysenter_target,12(%esp) | |
2569 | je nmi_debug_stack_check | |
2570 | nmi_stack_correct: | |
2571 | /* We have a RING0_INT_FRAME here */ | |
2572 | @@ -1085,12 +1087,8 @@ nmi_espfix_stack: | |
2573 | RESTORE_REGS | |
2574 | lss 12+4(%esp), %esp # back to espfix stack | |
2575 | CFI_ADJUST_CFA_OFFSET -24 | |
2576 | -1: INTERRUPT_RETURN | |
2577 | + jmp irq_return | |
2578 | CFI_ENDPROC | |
2579 | -.section __ex_table,"a" | |
2580 | - .align 4 | |
2581 | - .long 1b,iret_exc | |
2582 | -.previous | |
2583 | #else | |
2584 | KPROBE_ENTRY(nmi) | |
2585 | RING0_INT_FRAME | |
2586 | @@ -1108,17 +1106,17 @@ KPROBE_END(nmi) | |
2587 | ||
2588 | #ifdef CONFIG_PARAVIRT | |
2589 | ENTRY(native_iret) | |
2590 | -1: iret | |
2591 | + iret | |
2592 | .section __ex_table,"a" | |
2593 | .align 4 | |
2594 | - .long 1b,iret_exc | |
2595 | + .long native_iret, iret_exc | |
2596 | .previous | |
2597 | END(native_iret) | |
2598 | ||
2599 | -ENTRY(native_irq_enable_sysexit) | |
2600 | +ENTRY(native_irq_enable_syscall_ret) | |
2601 | sti | |
2602 | sysexit | |
2603 | -END(native_irq_enable_sysexit) | |
2604 | +END(native_irq_enable_syscall_ret) | |
2605 | #endif | |
2606 | ||
2607 | KPROBE_ENTRY(int3) | |
2608 | @@ -1267,7 +1265,144 @@ ENTRY(kernel_thread_helper) | |
2609 | CFI_ENDPROC | |
2610 | ENDPROC(kernel_thread_helper) | |
2611 | ||
2612 | +#include <asm/alternative-asm.h> | |
2613 | + | |
2614 | + # pv syscall call handler stub | |
2615 | +ENTRY(ia32pv_cstar_target) | |
2616 | + RING0_INT_FRAME | |
2617 | + movl $__USER_DS,16(%esp) | |
2618 | + movl %ebp,%ecx | |
2619 | + movl $__USER_CS,4(%esp) | |
2620 | + movl 12(%esp),%ebp | |
2621 | + pushl %eax # save orig_eax | |
2622 | + CFI_ADJUST_CFA_OFFSET 4 | |
2623 | +/* | |
2624 | + * Load the potential sixth argument from user stack. | |
2625 | + * Careful about security. | |
2626 | + */ | |
2627 | + cmpl $__PAGE_OFFSET-4,%ebp | |
2628 | + CFI_REMEMBER_STATE | |
2629 | + ja cstar_fault | |
2630 | +1: movl (%ebp),%ebp | |
2631 | +.section __ex_table,"a" | |
2632 | + .align 4 | |
2633 | + .long 1b,cstar_fault | |
2634 | +.previous | |
2635 | + SAVE_ALL | |
2636 | + GET_THREAD_INFO(%ebp) | |
2637 | + test_tif %ebp | |
2638 | + jnz cstar_trace_entry | |
2639 | + cmpl $nr_syscalls,%eax | |
2640 | + jae cstar_badsys | |
2641 | +.Lcstar_call: | |
2642 | + btl %eax,cstar_special | |
2643 | + jc .Lcstar_special | |
2644 | + call *cstar_call_table(,%eax,4) | |
2645 | + movl %eax,PT_EAX(%esp) # store the return value | |
2646 | +.Lcstar_exit: | |
2647 | + movl PT_ECX(%esp),%ecx | |
2648 | + movl %ecx,PT_EBP(%esp) # put user EBP back in place | |
2649 | + jmp syscall_exit | |
2650 | +.Lcstar_special: | |
2651 | + movl PT_ECX(%esp),%ecx | |
2652 | + movl %ecx,PT_EBP(%esp) # put user EBP back in place | |
2653 | + jmp syscall_call | |
2654 | +cstar_set_tif: | |
2655 | + movl $cstar_clear_tif,(%esp) # replace return address | |
2656 | + LOCK_PREFIX | |
2657 | + orl $_TIF_CSTAR,TI_flags(%ebp) | |
2658 | + jmp *sys_call_table(,%eax,4) | |
2659 | +cstar_clear_tif: | |
2660 | + movl %eax,PT_EAX(%esp) # store the return value | |
2661 | + LOCK_PREFIX | |
2662 | + andl $~_TIF_CSTAR,TI_flags(%ebp) | |
2663 | + jmp .Lcstar_exit | |
2664 | +cstar_trace_entry: | |
2665 | + movl $-ENOSYS,PT_EAX(%esp) | |
2666 | + cmpl $nr_syscalls,%eax | |
2667 | + jae 1f | |
2668 | + btl %eax,cstar_special | |
2669 | + jc .Lcstar_trace_special | |
2670 | +1: movl %esp,%eax | |
2671 | + xorl %edx,%edx | |
2672 | + LOCK_PREFIX | |
2673 | + orl $_TIF_CSTAR,TI_flags(%ebp) | |
2674 | + call do_syscall_trace | |
2675 | + LOCK_PREFIX | |
2676 | + andl $~_TIF_CSTAR,TI_flags(%ebp) | |
2677 | + testl %eax,%eax | |
2678 | + jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU, | |
2679 | + # so must skip actual syscall | |
2680 | + movl PT_ORIG_EAX(%esp),%eax | |
2681 | + cmpl $nr_syscalls,%eax | |
2682 | + jb .Lcstar_call | |
2683 | + jmp .Lcstar_exit | |
2684 | +.Lcstar_trace_special: | |
2685 | + movl PT_ECX(%esp),%ecx | |
2686 | + movl %esp,%eax | |
2687 | + xorl %edx,%edx | |
2688 | + movl %ecx,PT_EBP(%esp) # put user EBP back in place | |
2689 | + call do_syscall_trace | |
2690 | + testl %eax,%eax | |
2691 | + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, | |
2692 | + # so must skip actual syscall | |
2693 | + movl PT_ORIG_EAX(%esp),%eax | |
2694 | + cmpl $nr_syscalls,%eax | |
2695 | + jb syscall_call | |
2696 | + jmp syscall_exit | |
2697 | +cstar_badsys: | |
2698 | + movl $-ENOSYS,PT_EAX(%esp) | |
2699 | +.Lcstar_resume: | |
2700 | + movl PT_ECX(%esp),%ecx | |
2701 | + movl %ecx,PT_EBP(%esp) # put user EBP back in place | |
2702 | + jmp resume_userspace | |
2703 | + CFI_RESTORE_STATE | |
2704 | +cstar_fault: | |
2705 | + movl $-EFAULT,%eax | |
2706 | + SAVE_ALL | |
2707 | + GET_THREAD_INFO(%ebp) | |
2708 | + jmp .Lcstar_resume | |
2709 | + CFI_ENDPROC | |
2710 | +ENDPROC(ia32pv_cstar_target) | |
2711 | + | |
2712 | +ENTRY(cstar_ret_from_fork) | |
2713 | + CFI_STARTPROC | |
2714 | + movl PT_ECX(%esp),%ecx | |
2715 | + GET_THREAD_INFO(%ebp) | |
2716 | + movl %ecx,PT_EBP(%esp) # put user EBP back in place | |
2717 | + LOCK_PREFIX | |
2718 | + andl $~_TIF_CSTAR,TI_flags(%ebp) | |
2719 | + jmp ret_from_fork | |
2720 | + CFI_ENDPROC | |
2721 | +END(ret_from_fork) | |
2722 | + | |
2723 | .section .rodata,"a" | |
2724 | #include "syscall_table_32.S" | |
2725 | ||
2726 | syscall_table_size=(.-sys_call_table) | |
2727 | + | |
2728 | +#include <asm/unistd.h> | |
2729 | +cstar_special: | |
2730 | +nr=0 | |
2731 | +mask=0 | |
2732 | +.rept nr_syscalls+31 | |
2733 | + .irp n, __NR_sigreturn, __NR_rt_sigreturn | |
2734 | + .if nr == \n | |
2735 | + mask = mask | (1 << (\n & 31)) | |
2736 | + .endif | |
2737 | + .endr | |
2738 | + nr = nr + 1 | |
2739 | + .if (nr & 31) == 0 | |
2740 | + .long mask | |
2741 | + mask = 0 | |
2742 | + .endif | |
2743 | +.endr | |
2744 | +#define sys_call_table cstar_call_table | |
2745 | +#define sys_fork cstar_set_tif | |
2746 | +#define sys_clone cstar_set_tif | |
2747 | +#define sys_vfork cstar_set_tif | |
2748 | +#include "syscall_table_32.S" | |
2749 | +#undef sys_call_table | |
2750 | +#undef sys_fork | |
2751 | +#undef sys_clone | |
2752 | +#undef sys_vfork | |
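[Reviewer note] The .rept/.irp block above assembles cstar_special as a bitmap — one bit per syscall number, packed 32 to a word — with only __NR_sigreturn and __NR_rt_sigreturn set, so the btl in the fast path can test "is this syscall special?" in a single instruction; the #define block then reuses syscall_table_32.S to emit a second table in which fork/clone/vfork are detoured through cstar_set_tif. The same bitmap in C, for reference (the syscall count is illustrative):

	#include <stdio.h>

	#define NR_SYSCALLS		325	/* illustrative; really from unistd.h */
	#define __NR_sigreturn		119	/* i386 numbering */
	#define __NR_rt_sigreturn	173

	int main(void)
	{
		unsigned int mask[(NR_SYSCALLS + 31) / 32] = { 0 };
		const unsigned special[] = { __NR_sigreturn, __NR_rt_sigreturn };
		unsigned i;

		for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
			mask[special[i] / 32] |= 1u << (special[i] % 32);

		for (i = 0; i < sizeof(mask) / sizeof(mask[0]); i++)
			if (mask[i])
				printf("mask[%u] = %#010x\n", i, mask[i]);
		return 0;
	}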
2753 | --- a/arch/x86/kernel/entry_64-xen.S | |
2754 | +++ b/arch/x86/kernel/entry_64-xen.S | |
2755 | @@ -54,17 +54,22 @@ | |
2756 | #include <asm/page.h> | |
2757 | #include <asm/irqflags.h> | |
2758 | #include <asm/errno.h> | |
2759 | -#include <xen/interface/arch-x86_64.h> | |
2760 | +#include <xen/interface/xen.h> | |
2761 | #include <xen/interface/features.h> | |
2762 | ||
2763 | -#include "xen_entry_64.S" | |
2764 | - | |
2765 | .code64 | |
2766 | ||
2767 | #ifndef CONFIG_PREEMPT | |
2768 | #define retint_kernel retint_restore_args | |
2769 | #endif | |
2770 | ||
2771 | +#ifdef CONFIG_PARAVIRT | |
2772 | +ENTRY(native_irq_enable_syscall_ret) | |
2773 | + movq %gs:pda_oldrsp,%rsp | |
2774 | + swapgs | |
2775 | + sysretq | |
2776 | +#endif /* CONFIG_PARAVIRT */ | |
2777 | + | |
2778 | ||
2779 | .macro TRACE_IRQS_IRETQ offset=ARGOFFSET | |
2780 | #ifdef CONFIG_TRACE_IRQFLAGS | |
2781 | @@ -277,7 +282,7 @@ ret_from_sys_call: | |
2782 | sysret_check: | |
2783 | LOCKDEP_SYS_EXIT | |
2784 | GET_THREAD_INFO(%rcx) | |
2785 | - XEN_BLOCK_EVENTS(%rsi) | |
2786 | + DISABLE_INTERRUPTS(CLBR_NONE) | |
2787 | TRACE_IRQS_OFF | |
2788 | movl threadinfo_flags(%rcx),%edx | |
2789 | andl %edi,%edx | |
2790 | @@ -287,7 +292,7 @@ sysret_check: | |
2791 | * sysretq will re-enable interrupts: | |
2792 | */ | |
2793 | TRACE_IRQS_ON | |
2794 | - XEN_UNBLOCK_EVENTS(%rsi) | |
2795 | + ENABLE_INTERRUPTS(CLBR_NONE) | |
2796 | RESTORE_ARGS 0,8,0 | |
2797 | HYPERVISOR_IRET VGCF_IN_SYSCALL | |
2798 | ||
2799 | @@ -298,7 +303,7 @@ sysret_careful: | |
2800 | bt $TIF_NEED_RESCHED,%edx | |
2801 | jnc sysret_signal | |
2802 | TRACE_IRQS_ON | |
2803 | - XEN_UNBLOCK_EVENTS(%rsi) | |
2804 | + ENABLE_INTERRUPTS(CLBR_NONE) | |
2805 | pushq %rdi | |
2806 | CFI_ADJUST_CFA_OFFSET 8 | |
2807 | call schedule | |
2808 | @@ -309,9 +314,8 @@ sysret_careful: | |
2809 | /* Handle a signal */ | |
2810 | sysret_signal: | |
2811 | TRACE_IRQS_ON | |
2812 | -/* sti */ | |
2813 | - XEN_UNBLOCK_EVENTS(%rsi) | |
2814 | - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | |
2815 | + ENABLE_INTERRUPTS(CLBR_NONE) | |
2816 | + testl $_TIF_DO_NOTIFY_MASK,%edx | |
2817 | jz 1f | |
2818 | ||
2819 | /* Really a signal */ | |
2820 | @@ -323,7 +327,7 @@ sysret_signal: | |
2821 | 1: movl $_TIF_NEED_RESCHED,%edi | |
2822 | /* Use IRET because user could have changed frame. This | |
2823 | works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ | |
2824 | - XEN_BLOCK_EVENTS(%rsi) | |
2825 | + DISABLE_INTERRUPTS(CLBR_NONE) | |
2826 | TRACE_IRQS_OFF | |
2827 | jmp int_with_check | |
2828 | ||
2829 | @@ -355,7 +359,7 @@ tracesys: | |
2830 | */ | |
2831 | .globl int_ret_from_sys_call | |
2832 | int_ret_from_sys_call: | |
2833 | - XEN_BLOCK_EVENTS(%rsi) | |
2834 | + DISABLE_INTERRUPTS(CLBR_NONE) | |
2835 | TRACE_IRQS_OFF | |
2836 | testb $3,CS-ARGOFFSET(%rsp) | |
2837 | jnz 1f | |
2838 | @@ -381,22 +385,20 @@ int_careful: | |
2839 | bt $TIF_NEED_RESCHED,%edx | |
2840 | jnc int_very_careful | |
2841 | TRACE_IRQS_ON | |
2842 | -/* sti */ | |
2843 | - XEN_UNBLOCK_EVENTS(%rsi) | |
2844 | + ENABLE_INTERRUPTS(CLBR_NONE) | |
2845 | pushq %rdi | |
2846 | CFI_ADJUST_CFA_OFFSET 8 | |
2847 | call schedule | |
2848 | popq %rdi | |
2849 | CFI_ADJUST_CFA_OFFSET -8 | |
2850 | - XEN_BLOCK_EVENTS(%rsi) | |
2851 | + DISABLE_INTERRUPTS(CLBR_NONE) | |
2852 | TRACE_IRQS_OFF | |
2853 | jmp int_with_check | |
2854 | ||
2855 | /* handle signals and tracing -- both require a full stack frame */ | |
2856 | int_very_careful: | |
2857 | TRACE_IRQS_ON | |
2858 | -/* sti */ | |
2859 | - XEN_UNBLOCK_EVENTS(%rsi) | |
2860 | + ENABLE_INTERRUPTS(CLBR_NONE) | |
2861 | SAVE_REST | |
2862 | /* Check for syscall exit trace */ | |
2863 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx | |
2864 | @@ -411,7 +413,7 @@ int_very_careful: | |
2865 | jmp int_restore_rest | |
2866 | ||
2867 | int_signal: | |
2868 | - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | |
2869 | + testl $_TIF_DO_NOTIFY_MASK,%edx | |
2870 | jz 1f | |
2871 | movq %rsp,%rdi # &ptregs -> arg1 | |
2872 | xorl %esi,%esi # oldset -> arg2 | |
2873 | @@ -419,7 +421,7 @@ int_signal: | |
2874 | 1: movl $_TIF_NEED_RESCHED,%edi | |
2875 | int_restore_rest: | |
2876 | RESTORE_REST | |
2877 | - XEN_BLOCK_EVENTS(%rsi) | |
2878 | + DISABLE_INTERRUPTS(CLBR_NONE) | |
2879 | TRACE_IRQS_OFF | |
2880 | jmp int_with_check | |
2881 | CFI_ENDPROC | |
2882 | @@ -474,6 +476,7 @@ ENTRY(stub_execve) | |
2883 | CFI_REGISTER rip, r11 | |
2884 | SAVE_REST | |
2885 | FIXUP_TOP_OF_STACK %r11 | |
2886 | + movq %rsp, %rcx | |
2887 | call sys_execve | |
2888 | RESTORE_TOP_OF_STACK %r11 | |
2889 | movq %rax,RAX(%rsp) | |
2890 | @@ -526,11 +529,10 @@ retint_check: | |
2891 | retint_restore_args: /* return to kernel space */ | |
2892 | movl EFLAGS-REST_SKIP(%rsp), %eax | |
2893 | shr $9, %eax # EAX[0] == IRET_EFLAGS.IF | |
2894 | - XEN_GET_VCPU_INFO(%rsi) | |
2895 | + GET_VCPU_INFO | |
2896 | andb evtchn_upcall_mask(%rsi),%al | |
2897 | andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask | |
2898 | jnz restore_all_enable_events # != 0 => enable event delivery | |
2899 | - XEN_PUT_VCPU_INFO(%rsi) | |
2900 | ||
2901 | RESTORE_ARGS 0,8,0 | |
2902 | HYPERVISOR_IRET 0 | |
2903 | @@ -541,31 +543,29 @@ retint_careful: | |
2904 | bt $TIF_NEED_RESCHED,%edx | |
2905 | jnc retint_signal | |
2906 | TRACE_IRQS_ON | |
2907 | - XEN_UNBLOCK_EVENTS(%rsi) | |
2908 | -/* sti */ | |
2909 | + ENABLE_INTERRUPTS(CLBR_NONE) | |
2910 | pushq %rdi | |
2911 | CFI_ADJUST_CFA_OFFSET 8 | |
2912 | call schedule | |
2913 | popq %rdi | |
2914 | CFI_ADJUST_CFA_OFFSET -8 | |
2915 | GET_THREAD_INFO(%rcx) | |
2916 | - XEN_BLOCK_EVENTS(%rsi) | |
2917 | -/* cli */ | |
2918 | + DISABLE_INTERRUPTS(CLBR_NONE) | |
2919 | TRACE_IRQS_OFF | |
2920 | jmp retint_check | |
2921 | ||
2922 | retint_signal: | |
2923 | - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | |
2924 | + testl $_TIF_DO_NOTIFY_MASK,%edx | |
2925 | jz retint_restore_args | |
2926 | TRACE_IRQS_ON | |
2927 | - XEN_UNBLOCK_EVENTS(%rsi) | |
2928 | + ENABLE_INTERRUPTS(CLBR_NONE) | |
2929 | SAVE_REST | |
2930 | movq $-1,ORIG_RAX(%rsp) | |
2931 | xorl %esi,%esi # oldset | |
2932 | movq %rsp,%rdi # &pt_regs | |
2933 | call do_notify_resume | |
2934 | RESTORE_REST | |
2935 | - XEN_BLOCK_EVENTS(%rsi) | |
2936 | + DISABLE_INTERRUPTS(CLBR_NONE) | |
2937 | TRACE_IRQS_OFF | |
2938 | movl $_TIF_NEED_RESCHED,%edi | |
2939 | GET_THREAD_INFO(%rcx) | |
2940 | @@ -702,7 +702,7 @@ END(spurious_interrupt) | |
2941 | rdmsr | |
2942 | testl %edx,%edx | |
2943 | js 1f | |
2944 | - swapgs | |
2945 | + SWAPGS | |
2946 | xorl %ebx,%ebx | |
2947 | 1: | |
2948 | #endif | |
2949 | @@ -719,8 +719,7 @@ END(spurious_interrupt) | |
2950 | .if \ist | |
2951 | addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) | |
2952 | .endif | |
2953 | -/* cli */ | |
2954 | - XEN_BLOCK_EVENTS(%rsi) | |
2955 | + DISABLE_INTERRUPTS(CLBR_NONE) | |
2956 | .if \irqtrace | |
2957 | TRACE_IRQS_OFF | |
2958 | .endif | |
2959 | @@ -749,10 +748,10 @@ paranoid_swapgs\trace: | |
2960 | .if \trace | |
2961 | TRACE_IRQS_IRETQ 0 | |
2962 | .endif | |
2963 | - swapgs | |
2964 | + SWAPGS_UNSAFE_STACK | |
2965 | paranoid_restore\trace: | |
2966 | RESTORE_ALL 8 | |
2967 | - iretq | |
2968 | + jmp irq_return | |
2969 | paranoid_userspace\trace: | |
2970 | GET_THREAD_INFO(%rcx) | |
2971 | movl threadinfo_flags(%rcx),%ebx | |
2972 | @@ -767,11 +766,11 @@ paranoid_userspace\trace: | |
2973 | .if \trace | |
2974 | TRACE_IRQS_ON | |
2975 | .endif | |
2976 | - sti | |
2977 | + ENABLE_INTERRUPTS(CLBR_NONE) | |
2978 | xorl %esi,%esi /* arg2: oldset */ | |
2979 | movq %rsp,%rdi /* arg1: &pt_regs */ | |
2980 | call do_notify_resume | |
2981 | - cli | |
2982 | + DISABLE_INTERRUPTS(CLBR_NONE) | |
2983 | .if \trace | |
2984 | TRACE_IRQS_OFF | |
2985 | .endif | |
2986 | @@ -780,9 +779,9 @@ paranoid_schedule\trace: | |
2987 | .if \trace | |
2988 | TRACE_IRQS_ON | |
2989 | .endif | |
2990 | - sti | |
2991 | + ENABLE_INTERRUPTS(CLBR_ANY) | |
2992 | call schedule | |
2993 | - cli | |
2994 | + DISABLE_INTERRUPTS(CLBR_ANY) | |
2995 | .if \trace | |
2996 | TRACE_IRQS_OFF | |
2997 | .endif | |
2998 | @@ -846,8 +845,7 @@ error_call_handler: | |
2999 | call *%rax | |
3000 | error_exit: | |
3001 | RESTORE_REST | |
3002 | -/* cli */ | |
3003 | - XEN_BLOCK_EVENTS(%rsi) | |
3004 | + DISABLE_INTERRUPTS(CLBR_NONE) | |
3005 | TRACE_IRQS_OFF | |
3006 | GET_THREAD_INFO(%rcx) | |
3007 | testb $3,CS-ARGOFFSET(%rsp) | |
3008 | @@ -875,7 +873,7 @@ error_kernelspace: | |
3009 | iret run with kernel gs again, so don't set the user space flag. | |
3010 | B stepping K8s sometimes report an truncated RIP for IRET | |
3011 | exceptions returning to compat mode. Check for these here too. */ | |
3012 | - leaq iret_label(%rip),%rbp | |
3013 | + leaq irq_return(%rip),%rbp | |
3014 | cmpq %rbp,RIP(%rsp) | |
3015 | je error_swapgs | |
3016 | movl %ebp,%ebp /* zero extend */ | |
3017 | @@ -930,19 +928,17 @@ END(do_hypervisor_callback) | |
3018 | restore_all_enable_events: | |
3019 | CFI_DEFAULT_STACK adj=1 | |
3020 | TRACE_IRQS_ON | |
3021 | - XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up... | |
3022 | + __ENABLE_INTERRUPTS | |
3023 | ||
3024 | scrit: /**** START OF CRITICAL REGION ****/ | |
3025 | - XEN_TEST_PENDING(%rsi) | |
3026 | + __TEST_PENDING | |
3027 | CFI_REMEMBER_STATE | |
3028 | jnz 14f # process more events if necessary... | |
3029 | - XEN_PUT_VCPU_INFO(%rsi) | |
3030 | RESTORE_ARGS 0,8,0 | |
3031 | HYPERVISOR_IRET 0 | |
3032 | ||
3033 | CFI_RESTORE_STATE | |
3034 | -14: XEN_LOCKED_BLOCK_EVENTS(%rsi) | |
3035 | - XEN_PUT_VCPU_INFO(%rsi) | |
3036 | +14: __DISABLE_INTERRUPTS | |
3037 | SAVE_REST | |
3038 | movq %rsp,%rdi # set the argument again | |
3039 | jmp 11b | |
3040 | @@ -1086,15 +1082,16 @@ ENDPROC(child_rip) | |
3041 | * rdi: name, rsi: argv, rdx: envp | |
3042 | * | |
3043 | * We want to fallback into: | |
3044 | - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) | |
3045 | + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) | |
3046 | * | |
3047 | * do_sys_execve asm fallback arguments: | |
3048 | - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack | |
3049 | + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack | |
3050 | */ | |
3051 | ENTRY(kernel_execve) | |
3052 | CFI_STARTPROC | |
3053 | FAKE_STACK_FRAME $0 | |
3054 | SAVE_ALL | |
3055 | + movq %rsp,%rcx | |
3056 | call sys_execve | |
3057 | movq %rax, RAX(%rsp) | |
3058 | RESTORE_REST | |
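For reference, a minimal sketch (not part of the patch) of the x86-64 SysV argument order that motivates the added `movq %rsp,%rcx`: the first four integer arguments travel in rdi, rsi, rdx and rcx, so the fake pt_regs frame at %rsp becomes sys_execve()'s fourth argument.

	#include <stdio.h>

	int main(void)
	{
		/* SysV AMD64 integer argument registers, in order */
		static const char *regs[] = { "rdi", "rsi", "rdx", "rcx" };
		static const char *args[] = { "name", "argv", "envp", "regs" };
		int i;

		for (i = 0; i < 4; i++)
			printf("arg%d (%s) -> %%%s\n", i + 1, args[i], regs[i]);
		return 0;
	}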
3059 | @@ -1144,7 +1141,7 @@ do_nmi_callback: | |
3060 | call do_nmi | |
3061 | orl $NMI_MASK,EFLAGS(%rsp) | |
3062 | RESTORE_REST | |
3063 | - XEN_BLOCK_EVENTS(%rsi) | |
3064 | + DISABLE_INTERRUPTS(CLBR_NONE) | |
3065 | TRACE_IRQS_OFF | |
3066 | GET_THREAD_INFO(%rcx) | |
3067 | jmp retint_restore_args | |
3068 | --- a/arch/x86/kernel/fixup.c | |
3069 | +++ b/arch/x86/kernel/fixup.c | |
3070 | @@ -36,7 +36,7 @@ | |
3071 | ||
3072 | #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) | |
3073 | ||
3074 | -fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) | |
3075 | +void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) | |
3076 | { | |
3077 | static unsigned long printed = 0; | |
3078 | char info[100]; | |
3079 | --- a/arch/x86/kernel/genapic_64-xen.c | |
3080 | +++ b/arch/x86/kernel/genapic_64-xen.c | |
3081 | @@ -24,20 +24,13 @@ | |
3082 | #include <acpi/acpi_bus.h> | |
3083 | #endif | |
3084 | ||
3085 | -/* | |
3086 | - * which logical CPU number maps to which CPU (physical APIC ID) | |
3087 | - * | |
3088 | - * The following static array is used during kernel startup | |
3089 | - * and the x86_cpu_to_apicid_ptr contains the address of the | |
3090 | - * array during this time. Is it zeroed when the per_cpu | |
3091 | - * data area is removed. | |
3092 | - */ | |
3093 | +/* which logical CPU number maps to which CPU (physical APIC ID) */ | |
3094 | #ifndef CONFIG_XEN | |
3095 | -u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata | |
3096 | +u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata | |
3097 | = { [0 ... NR_CPUS-1] = BAD_APICID }; | |
3098 | -void *x86_cpu_to_apicid_ptr; | |
3099 | +void *x86_cpu_to_apicid_early_ptr; | |
3100 | #endif | |
3101 | -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; | |
3102 | +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; | |
3103 | EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); | |
3104 | ||
3105 | #ifndef CONFIG_XEN | |
3106 | --- a/arch/x86/kernel/head_32-xen.S | |
3107 | +++ b/arch/x86/kernel/head_32-xen.S | |
3108 | @@ -3,6 +3,7 @@ | |
3109 | .text | |
3110 | #include <linux/elfnote.h> | |
3111 | #include <linux/threads.h> | |
3112 | +#include <linux/init.h> | |
3113 | #include <linux/linkage.h> | |
3114 | #include <asm/segment.h> | |
3115 | #include <asm/page.h> | |
3116 | @@ -88,7 +89,7 @@ ENTRY(_stext) | |
3117 | */ | |
3118 | .section ".bss.page_aligned","wa" | |
3119 | .align PAGE_SIZE_asm | |
3120 | -ENTRY(swapper_pg_pmd) | |
3121 | +ENTRY(swapper_pg_fixmap) | |
3122 | .fill 1024,4,0 | |
3123 | ENTRY(empty_zero_page) | |
3124 | .fill 4096,1,0 | |
3125 | --- a/arch/x86/kernel/head64-xen.c | |
3126 | +++ b/arch/x86/kernel/head64-xen.c | |
3127 | @@ -16,6 +16,7 @@ | |
3128 | #include <linux/kernel.h> | |
3129 | #include <linux/string.h> | |
3130 | #include <linux/percpu.h> | |
3131 | +#include <linux/start_kernel.h> | |
3132 | #include <linux/module.h> | |
3133 | ||
3134 | #include <asm/processor.h> | |
3135 | @@ -26,6 +27,8 @@ | |
3136 | #include <asm/pgtable.h> | |
3137 | #include <asm/tlbflush.h> | |
3138 | #include <asm/sections.h> | |
3139 | +#include <asm/kdebug.h> | |
3140 | +#include <asm/e820.h> | |
3141 | ||
3142 | unsigned long start_pfn; | |
3143 | ||
3144 | @@ -34,7 +37,7 @@ static void __init zap_identity_mappings | |
3145 | { | |
3146 | pgd_t *pgd = pgd_offset_k(0UL); | |
3147 | pgd_clear(pgd); | |
3148 | - __flush_tlb(); | |
3149 | + __flush_tlb_all(); | |
3150 | } | |
3151 | ||
3152 | /* Don't add a printk in there. printk relies on the PDA which is not initialized | |
3153 | @@ -72,6 +75,37 @@ EXPORT_SYMBOL(machine_to_phys_mapping); | |
3154 | unsigned int machine_to_phys_order; | |
3155 | EXPORT_SYMBOL(machine_to_phys_order); | |
3156 | ||
3157 | +#define EBDA_ADDR_POINTER 0x40E | |
3158 | + | |
3159 | +static __init void reserve_ebda(void) | |
3160 | +{ | |
3161 | +#ifndef CONFIG_XEN | |
3162 | + unsigned ebda_addr, ebda_size; | |
3163 | + | |
3164 | + /* | |
3165 | + * there is a real-mode segmented pointer pointing to the | |
3166 | + * 4K EBDA area at 0x40E | |
3167 | + */ | |
3168 | + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); | |
3169 | + ebda_addr <<= 4; | |
3170 | + | |
3171 | + if (!ebda_addr) | |
3172 | + return; | |
3173 | + | |
3174 | + ebda_size = *(unsigned short *)__va(ebda_addr); | |
3175 | + | |
3176 | + /* Round EBDA up to pages */ | |
3177 | + if (ebda_size == 0) | |
3178 | + ebda_size = 1; | |
3179 | + ebda_size <<= 10; | |
3180 | + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); | |
3181 | + if (ebda_size > 64*1024) | |
3182 | + ebda_size = 64*1024; | |
3183 | + | |
3184 | + reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA"); | |
3185 | +#endif | |
3186 | +} | |
3187 | + | |
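A worked example (illustrative value only) of the real-mode pointer arithmetic in reserve_ebda() above: the BIOS word at 0x40E is a segment, so shifting it left by 4 yields the EBDA's linear base address.

	#include <stdio.h>

	int main(void)
	{
		unsigned short seg = 0x9FC0;	/* hypothetical word read at 0x40E */
		unsigned long ebda = (unsigned long)seg << 4;

		printf("EBDA base = 0x%lx\n", ebda);	/* 0x9fc00, just below 640 KiB */
		return 0;
	}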
3188 | void __init x86_64_start_kernel(char * real_mode_data) | |
3189 | { | |
3190 | struct xen_machphys_mapping mapping; | |
3191 | @@ -103,8 +137,16 @@ void __init x86_64_start_kernel(char * r | |
3192 | /* Make NULL pointers segfault */ | |
3193 | zap_identity_mappings(); | |
3194 | ||
3195 | - for (i = 0; i < IDT_ENTRIES; i++) | |
3196 | + /* Cleanup the over mapped high alias */ | |
3197 | + cleanup_highmap(); | |
3198 | + | |
3199 | + for (i = 0; i < IDT_ENTRIES; i++) { | |
3200 | +#ifdef CONFIG_EARLY_PRINTK | |
3201 | + set_intr_gate(i, &early_idt_handlers[i]); | |
3202 | +#else | |
3203 | set_intr_gate(i, early_idt_handler); | |
3204 | +#endif | |
3205 | + } | |
3206 | load_idt((const struct desc_ptr *)&idt_descr); | |
3207 | #endif | |
3208 | ||
3209 | @@ -115,8 +157,19 @@ void __init x86_64_start_kernel(char * r | |
3210 | ||
3211 | pda_init(0); | |
3212 | copy_bootdata(__va(real_mode_data)); | |
3213 | -#ifdef CONFIG_SMP | |
3214 | - cpu_set(0, cpu_online_map); | |
3215 | -#endif | |
3216 | + | |
3217 | + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); | |
3218 | + | |
3219 | + reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE), | |
3220 | + start_pfn << PAGE_SHIFT, "Xen provided"); | |
3221 | + | |
3222 | + reserve_ebda(); | |
3223 | + | |
3224 | + /* | |
3225 | + * At this point everything still needed from the boot loader | |
3226 | + * or BIOS or kernel text should be early reserved or marked not | |
3227 | + * RAM in e820. All other memory is free game. | |
3228 | + */ | |
3229 | + | |
3230 | start_kernel(); | |
3231 | } | |
3232 | --- a/arch/x86/kernel/init_task-xen.c | |
3233 | +++ b/arch/x86/kernel/init_task-xen.c | |
3234 | @@ -19,7 +19,7 @@ static struct sighand_struct init_sighan | |
3235 | #endif | |
3236 | struct mm_struct init_mm = INIT_MM(init_mm); | |
3237 | #undef swapper_pg_dir | |
3238 | -EXPORT_SYMBOL(init_mm); | |
3239 | +EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ | |
3240 | ||
3241 | /* | |
3242 | * Initial thread structure. | |
3243 | --- a/arch/x86/kernel/io_apic_32-xen.c | |
3244 | +++ b/arch/x86/kernel/io_apic_32-xen.c | |
3245 | @@ -35,6 +35,7 @@ | |
3246 | #include <linux/htirq.h> | |
3247 | #include <linux/freezer.h> | |
3248 | #include <linux/kthread.h> | |
3249 | +#include <linux/jiffies.h> /* time_after() */ | |
3250 | ||
3251 | #include <asm/io.h> | |
3252 | #include <asm/smp.h> | |
3253 | @@ -48,8 +49,6 @@ | |
3254 | #include <mach_apic.h> | |
3255 | #include <mach_apicdef.h> | |
3256 | ||
3257 | -#include "io_ports.h" | |
3258 | - | |
3259 | #ifdef CONFIG_XEN | |
3260 | #include <xen/interface/xen.h> | |
3261 | #include <xen/interface/physdev.h> | |
3262 | @@ -400,7 +399,7 @@ static void set_ioapic_affinity_irq(unsi | |
3263 | # include <asm/processor.h> /* kernel_thread() */ | |
3264 | # include <linux/kernel_stat.h> /* kstat */ | |
3265 | # include <linux/slab.h> /* kmalloc() */ | |
3266 | -# include <linux/timer.h> /* time_after() */ | |
3267 | +# include <linux/timer.h> | |
3268 | ||
3269 | #define IRQBALANCE_CHECK_ARCH -999 | |
3270 | #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) | |
3271 | @@ -777,7 +776,7 @@ late_initcall(balanced_irq_init); | |
3272 | #endif | |
3273 | ||
3274 | #ifndef CONFIG_SMP | |
3275 | -void fastcall send_IPI_self(int vector) | |
3276 | +void send_IPI_self(int vector) | |
3277 | { | |
3278 | #ifndef CONFIG_XEN | |
3279 | unsigned int cfg; | |
3280 | @@ -1959,7 +1958,7 @@ static int __init timer_irq_works(void) | |
3281 | * might have cached one ExtINT interrupt. Finally, at | |
3282 | * least one tick may be lost due to delays. | |
3283 | */ | |
3284 | - if (jiffies - t1 > 4) | |
3285 | + if (time_after(jiffies, t1 + 4)) | |
3286 | return 1; | |
3287 | ||
3288 | return 0; | |
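A standalone sketch of why the patch prefers time_after() over the open-coded jiffies comparison; the macro below is a simplified stand-in for the one in <linux/jiffies.h>, and the values are illustrative. The signed-difference form stays correct even when the reference time lies in the future.

	#include <stdio.h>

	#define time_after(a, b)	((long)((b) - (a)) < 0)

	int main(void)
	{
		unsigned long t1 = 20, now = 10;	/* "now" is actually before t1 */

		/* unsigned underflow makes the naive test claim time has passed */
		printf("naive:      %d\n", (int)(now - t1 > 4));		/* 1, wrong */
		/* the signed-difference form answers correctly */
		printf("time_after: %d\n", (int)time_after(now, t1 + 4));	/* 0, right */
		return 0;
	}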
3289 | @@ -2142,7 +2141,7 @@ static struct irq_chip lapic_chip __read | |
3290 | .eoi = ack_apic, | |
3291 | }; | |
3292 | ||
3293 | -static void setup_nmi (void) | |
3294 | +static void __init setup_nmi(void) | |
3295 | { | |
3296 | /* | |
3297 | * Dirty trick to enable the NMI watchdog ... | |
3298 | @@ -2155,7 +2154,7 @@ static void setup_nmi (void) | |
3299 | */ | |
3300 | apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); | |
3301 | ||
3302 | - on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); | |
3303 | + enable_NMI_through_LVT0(); | |
3304 | ||
3305 | apic_printk(APIC_VERBOSE, " done.\n"); | |
3306 | } | |
3307 | @@ -2479,7 +2478,7 @@ static int ioapic_resume(struct sys_devi | |
3308 | } | |
3309 | ||
3310 | static struct sysdev_class ioapic_sysdev_class = { | |
3311 | - set_kset_name("ioapic"), | |
3312 | + .name = "ioapic", | |
3313 | .suspend = ioapic_suspend, | |
3314 | .resume = ioapic_resume, | |
3315 | }; | |
3316 | --- a/arch/x86/kernel/io_apic_64-xen.c | |
3317 | +++ b/arch/x86/kernel/io_apic_64-xen.c | |
3318 | @@ -32,9 +32,11 @@ | |
3319 | #include <linux/msi.h> | |
3320 | #include <linux/htirq.h> | |
3321 | #include <linux/dmar.h> | |
3322 | +#include <linux/jiffies.h> | |
3323 | #ifdef CONFIG_ACPI | |
3324 | #include <acpi/acpi_bus.h> | |
3325 | #endif | |
3326 | +#include <linux/bootmem.h> | |
3327 | ||
3328 | #include <asm/idle.h> | |
3329 | #include <asm/io.h> | |
3330 | @@ -1064,7 +1066,7 @@ void __apicdebuginit print_local_APIC(vo | |
3331 | v = apic_read(APIC_LVR); | |
3332 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); | |
3333 | ver = GET_APIC_VERSION(v); | |
3334 | - maxlvt = get_maxlvt(); | |
3335 | + maxlvt = lapic_get_maxlvt(); | |
3336 | ||
3337 | v = apic_read(APIC_TASKPRI); | |
3338 | printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); | |
3339 | @@ -1165,7 +1167,7 @@ void __apicdebuginit print_PIC(void) | |
3340 | } | |
3341 | #endif /* !CONFIG_XEN */ | |
3342 | ||
3343 | -static void __init enable_IO_APIC(void) | |
3344 | +void __init enable_IO_APIC(void) | |
3345 | { | |
3346 | union IO_APIC_reg_01 reg_01; | |
3347 | #ifndef CONFIG_XEN | |
3348 | @@ -1299,7 +1301,7 @@ static int __init timer_irq_works(void) | |
3349 | */ | |
3350 | ||
3351 | /* jiffies wrap? */ | |
3352 | - if (jiffies - t1 > 4) | |
3353 | + if (time_after(jiffies, t1 + 4)) | |
3354 | return 1; | |
3355 | return 0; | |
3356 | } | |
3357 | @@ -1412,7 +1414,7 @@ static void irq_complete_move(unsigned i | |
3358 | if (likely(!cfg->move_in_progress)) | |
3359 | return; | |
3360 | ||
3361 | - vector = ~get_irq_regs()->orig_rax; | |
3362 | + vector = ~get_irq_regs()->orig_ax; | |
3363 | me = smp_processor_id(); | |
3364 | if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { | |
3365 | cpumask_t cleanup_mask; | |
3366 | @@ -1439,7 +1441,7 @@ static void ack_apic_level(unsigned int | |
3367 | int do_unmask_irq = 0; | |
3368 | ||
3369 | irq_complete_move(irq); | |
3370 | -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) | |
3371 | +#ifdef CONFIG_GENERIC_PENDING_IRQ | |
3372 | /* If we are moving the irq we need to mask it */ | |
3373 | if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { | |
3374 | do_unmask_irq = 1; | |
3375 | @@ -1570,7 +1572,7 @@ static struct hw_interrupt_type lapic_ir | |
3376 | .end = end_lapic_irq, | |
3377 | }; | |
3378 | ||
3379 | -static void setup_nmi (void) | |
3380 | +static void __init setup_nmi(void) | |
3381 | { | |
3382 | /* | |
3383 | * Dirty trick to enable the NMI watchdog ... | |
3384 | @@ -1583,7 +1585,7 @@ static void setup_nmi (void) | |
3385 | */ | |
3386 | printk(KERN_INFO "activating NMI Watchdog ..."); | |
3387 | ||
3388 | - enable_NMI_through_LVT0(NULL); | |
3389 | + enable_NMI_through_LVT0(); | |
3390 | ||
3391 | printk(" done.\n"); | |
3392 | } | |
3393 | @@ -1659,7 +1661,7 @@ static inline void unlock_ExtINT_logic(v | |
3394 | * | |
3395 | * FIXME: really need to revamp this for modern platforms only. | |
3396 | */ | |
3397 | -static inline void check_timer(void) | |
3398 | +static inline void __init check_timer(void) | |
3399 | { | |
3400 | struct irq_cfg *cfg = irq_cfg + 0; | |
3401 | int apic1, pin1, apic2, pin2; | |
3402 | @@ -1863,7 +1865,7 @@ static int ioapic_resume(struct sys_devi | |
3403 | } | |
3404 | ||
3405 | static struct sysdev_class ioapic_sysdev_class = { | |
3406 | - set_kset_name("ioapic"), | |
3407 | + .name = "ioapic", | |
3408 | .suspend = ioapic_suspend, | |
3409 | .resume = ioapic_resume, | |
3410 | }; | |
3411 | @@ -2303,5 +2305,93 @@ void __init setup_ioapic_dest(void) | |
3412 | } | |
3413 | } | |
3414 | #endif | |
3415 | -#endif /* !CONFIG_XEN */ | |
3416 | ||
3417 | +#define IOAPIC_RESOURCE_NAME_SIZE 11 | |
3418 | + | |
3419 | +static struct resource *ioapic_resources; | |
3420 | + | |
3421 | +static struct resource * __init ioapic_setup_resources(void) | |
3422 | +{ | |
3423 | + unsigned long n; | |
3424 | + struct resource *res; | |
3425 | + char *mem; | |
3426 | + int i; | |
3427 | + | |
3428 | + if (nr_ioapics <= 0) | |
3429 | + return NULL; | |
3430 | + | |
3431 | + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); | |
3432 | + n *= nr_ioapics; | |
3433 | + | |
3434 | + mem = alloc_bootmem(n); | |
3435 | + res = (void *)mem; | |
3436 | + | |
3437 | + if (mem != NULL) { | |
3438 | + memset(mem, 0, n); | |
3439 | + mem += sizeof(struct resource) * nr_ioapics; | |
3440 | + | |
3441 | + for (i = 0; i < nr_ioapics; i++) { | |
3442 | + res[i].name = mem; | |
3443 | + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; | |
3444 | + sprintf(mem, "IOAPIC %u", i); | |
3445 | + mem += IOAPIC_RESOURCE_NAME_SIZE; | |
3446 | + } | |
3447 | + } | |
3448 | + | |
3449 | + ioapic_resources = res; | |
3450 | + | |
3451 | + return res; | |
3452 | +} | |
3453 | + | |
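The function above carves both the resource array and the name strings out of one bootmem block. A user-space sketch of that layout, with calloc() standing in for alloc_bootmem() and the count purely illustrative:

	#include <stdio.h>
	#include <stdlib.h>

	struct resource { const char *name; unsigned long start, end, flags; };
	#define IOAPIC_RESOURCE_NAME_SIZE 11

	int main(void)
	{
		int i, nr = 3;			/* pretend nr_ioapics == 3 */
		size_t n = (sizeof(struct resource) + IOAPIC_RESOURCE_NAME_SIZE) * nr;
		char *mem = calloc(1, n);
		struct resource *res = (void *)mem;

		if (!mem)
			return 1;
		mem += sizeof(struct resource) * nr;	/* names follow the array */
		for (i = 0; i < nr; i++) {
			res[i].name = mem;
			sprintf(mem, "IOAPIC %u", (unsigned)i);
			mem += IOAPIC_RESOURCE_NAME_SIZE;
		}
		for (i = 0; i < nr; i++)
			printf("%s\n", res[i].name);
		free(res);
		return 0;
	}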
3454 | +void __init ioapic_init_mappings(void) | |
3455 | +{ | |
3456 | + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; | |
3457 | + struct resource *ioapic_res; | |
3458 | + int i; | |
3459 | + | |
3460 | + ioapic_res = ioapic_setup_resources(); | |
3461 | + for (i = 0; i < nr_ioapics; i++) { | |
3462 | + if (smp_found_config) { | |
3463 | + ioapic_phys = mp_ioapics[i].mpc_apicaddr; | |
3464 | + } else { | |
3465 | + ioapic_phys = (unsigned long) | |
3466 | + alloc_bootmem_pages(PAGE_SIZE); | |
3467 | + ioapic_phys = __pa(ioapic_phys); | |
3468 | + } | |
3469 | + set_fixmap_nocache(idx, ioapic_phys); | |
3470 | + apic_printk(APIC_VERBOSE, | |
3471 | + "mapped IOAPIC to %016lx (%016lx)\n", | |
3472 | + __fix_to_virt(idx), ioapic_phys); | |
3473 | + idx++; | |
3474 | + | |
3475 | + if (ioapic_res != NULL) { | |
3476 | + ioapic_res->start = ioapic_phys; | |
3477 | + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; | |
3478 | + ioapic_res++; | |
3479 | + } | |
3480 | + } | |
3481 | +} | |
3482 | + | |
3483 | +static int __init ioapic_insert_resources(void) | |
3484 | +{ | |
3485 | + int i; | |
3486 | + struct resource *r = ioapic_resources; | |
3487 | + | |
3488 | + if (!r) { | |
3489 | + printk(KERN_ERR | |
3490 | + "IO APIC resources could not be allocated.\n" | |
3491 | + return -1; | |
3492 | + } | |
3493 | + | |
3494 | + for (i = 0; i < nr_ioapics; i++) { | |
3495 | + insert_resource(&iomem_resource, r); | |
3496 | + r++; | |
3497 | + } | |
3498 | + | |
3499 | + return 0; | |
3500 | +} | |
3501 | + | |
3502 | +/* Insert the IO APIC resources after PCI initialization has occurred to handle | |
3503 | + * IO APICs that are mapped in on a BAR in PCI space. */ | |
3504 | +late_initcall(ioapic_insert_resources); | |
3505 | +#endif /* !CONFIG_XEN */ | |
3506 | --- a/arch/x86/kernel/ioport_32-xen.c | |
3507 | +++ /dev/null | |
3508 | @@ -1,121 +0,0 @@ | |
3509 | -/* | |
3510 | - * This contains the io-permission bitmap code - written by obz, with changes | |
3511 | - * by Linus. | |
3512 | - */ | |
3513 | - | |
3514 | -#include <linux/sched.h> | |
3515 | -#include <linux/kernel.h> | |
3516 | -#include <linux/capability.h> | |
3517 | -#include <linux/errno.h> | |
3518 | -#include <linux/types.h> | |
3519 | -#include <linux/ioport.h> | |
3520 | -#include <linux/smp.h> | |
3521 | -#include <linux/stddef.h> | |
3522 | -#include <linux/slab.h> | |
3523 | -#include <linux/thread_info.h> | |
3524 | -#include <linux/syscalls.h> | |
3525 | -#include <xen/interface/physdev.h> | |
3526 | - | |
3527 | -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ | |
3528 | -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) | |
3529 | -{ | |
3530 | - unsigned long mask; | |
3531 | - unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); | |
3532 | - unsigned int low_index = base & (BITS_PER_LONG-1); | |
3533 | - int length = low_index + extent; | |
3534 | - | |
3535 | - if (low_index != 0) { | |
3536 | - mask = (~0UL << low_index); | |
3537 | - if (length < BITS_PER_LONG) | |
3538 | - mask &= ~(~0UL << length); | |
3539 | - if (new_value) | |
3540 | - *bitmap_base++ |= mask; | |
3541 | - else | |
3542 | - *bitmap_base++ &= ~mask; | |
3543 | - length -= BITS_PER_LONG; | |
3544 | - } | |
3545 | - | |
3546 | - mask = (new_value ? ~0UL : 0UL); | |
3547 | - while (length >= BITS_PER_LONG) { | |
3548 | - *bitmap_base++ = mask; | |
3549 | - length -= BITS_PER_LONG; | |
3550 | - } | |
3551 | - | |
3552 | - if (length > 0) { | |
3553 | - mask = ~(~0UL << length); | |
3554 | - if (new_value) | |
3555 | - *bitmap_base++ |= mask; | |
3556 | - else | |
3557 | - *bitmap_base++ &= ~mask; | |
3558 | - } | |
3559 | -} | |
3560 | - | |
3561 | - | |
3562 | -/* | |
3563 | - * this changes the io permissions bitmap in the current task. | |
3564 | - */ | |
3565 | -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | |
3566 | -{ | |
3567 | - struct thread_struct * t = ¤t->thread; | |
3568 | - unsigned long *bitmap; | |
3569 | - struct physdev_set_iobitmap set_iobitmap; | |
3570 | - | |
3571 | - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) | |
3572 | - return -EINVAL; | |
3573 | - if (turn_on && !capable(CAP_SYS_RAWIO)) | |
3574 | - return -EPERM; | |
3575 | - | |
3576 | - /* | |
3577 | - * If it's the first ioperm() call in this thread's lifetime, set the | |
3578 | - * IO bitmap up. ioperm() is much less timing critical than clone(), | |
3579 | - * this is why we delay this operation until now: | |
3580 | - */ | |
3581 | - if (!t->io_bitmap_ptr) { | |
3582 | - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | |
3583 | - if (!bitmap) | |
3584 | - return -ENOMEM; | |
3585 | - | |
3586 | - memset(bitmap, 0xff, IO_BITMAP_BYTES); | |
3587 | - t->io_bitmap_ptr = bitmap; | |
3588 | - set_thread_flag(TIF_IO_BITMAP); | |
3589 | - | |
3590 | - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); | |
3591 | - set_iobitmap.nr_ports = IO_BITMAP_BITS; | |
3592 | - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, | |
3593 | - &set_iobitmap)); | |
3594 | - } | |
3595 | - | |
3596 | - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); | |
3597 | - | |
3598 | - return 0; | |
3599 | -} | |
3600 | - | |
3601 | -/* | |
3602 | - * sys_iopl has to be used when you want to access the IO ports | |
3603 | - * beyond the 0x3ff range: to get the full 65536 ports bitmapped | |
3604 | - * you'd need 8kB of bitmaps/process, which is a bit excessive. | |
3605 | - * | |
3606 | - * Here we just change the eflags value on the stack: we allow | |
3607 | - * only the super-user to do it. This depends on the stack-layout | |
3608 | - * on system-call entry - see also fork() and the signal handling | |
3609 | - * code. | |
3610 | - */ | |
3611 | - | |
3612 | -asmlinkage long sys_iopl(unsigned long unused) | |
3613 | -{ | |
3614 | - volatile struct pt_regs * regs = (struct pt_regs *) &unused; | |
3615 | - unsigned int level = regs->ebx; | |
3616 | - struct thread_struct *t = ¤t->thread; | |
3617 | - unsigned int old = (t->iopl >> 12) & 3; | |
3618 | - | |
3619 | - if (level > 3) | |
3620 | - return -EINVAL; | |
3621 | - /* Trying to gain more privileges? */ | |
3622 | - if (level > old) { | |
3623 | - if (!capable(CAP_SYS_RAWIO)) | |
3624 | - return -EPERM; | |
3625 | - } | |
3626 | - t->iopl = level << 12; | |
3627 | - set_iopl_mask(t->iopl); | |
3628 | - return 0; | |
3629 | -} | |
3630 | --- a/arch/x86/kernel/ioport_64-xen.c | |
3631 | +++ /dev/null | |
3632 | @@ -1,99 +0,0 @@ | |
3633 | -/* | |
3634 | - * This contains the io-permission bitmap code - written by obz, with changes | |
3635 | - * by Linus. | |
3636 | - */ | |
3637 | - | |
3638 | -#include <linux/sched.h> | |
3639 | -#include <linux/kernel.h> | |
3640 | -#include <linux/capability.h> | |
3641 | -#include <linux/errno.h> | |
3642 | -#include <linux/types.h> | |
3643 | -#include <linux/ioport.h> | |
3644 | -#include <linux/mm.h> | |
3645 | -#include <linux/smp.h> | |
3646 | -#include <linux/stddef.h> | |
3647 | -#include <linux/slab.h> | |
3648 | -#include <linux/thread_info.h> | |
3649 | -#include <linux/syscalls.h> | |
3650 | -#include <xen/interface/physdev.h> | |
3651 | - | |
3652 | -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ | |
3653 | -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) | |
3654 | -{ | |
3655 | - int i; | |
3656 | - | |
3657 | - if (new_value) | |
3658 | - for (i = base; i < base + extent; i++) | |
3659 | - __set_bit(i, bitmap); | |
3660 | - else | |
3661 | - for (i = base; i < base + extent; i++) | |
3662 | - clear_bit(i, bitmap); | |
3663 | -} | |
3664 | - | |
3665 | -/* | |
3666 | - * this changes the io permissions bitmap in the current task. | |
3667 | - */ | |
3668 | -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | |
3669 | -{ | |
3670 | - struct thread_struct * t = ¤t->thread; | |
3671 | - unsigned long *bitmap; | |
3672 | - struct physdev_set_iobitmap set_iobitmap; | |
3673 | - | |
3674 | - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) | |
3675 | - return -EINVAL; | |
3676 | - if (turn_on && !capable(CAP_SYS_RAWIO)) | |
3677 | - return -EPERM; | |
3678 | - | |
3679 | - /* | |
3680 | - * If it's the first ioperm() call in this thread's lifetime, set the | |
3681 | - * IO bitmap up. ioperm() is much less timing critical than clone(), | |
3682 | - * this is why we delay this operation until now: | |
3683 | - */ | |
3684 | - if (!t->io_bitmap_ptr) { | |
3685 | - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | |
3686 | - if (!bitmap) | |
3687 | - return -ENOMEM; | |
3688 | - | |
3689 | - memset(bitmap, 0xff, IO_BITMAP_BYTES); | |
3690 | - t->io_bitmap_ptr = bitmap; | |
3691 | - set_thread_flag(TIF_IO_BITMAP); | |
3692 | - | |
3693 | - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); | |
3694 | - set_iobitmap.nr_ports = IO_BITMAP_BITS; | |
3695 | - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, | |
3696 | - &set_iobitmap)); | |
3697 | - } | |
3698 | - | |
3699 | - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); | |
3700 | - | |
3701 | - return 0; | |
3702 | -} | |
3703 | - | |
3704 | -/* | |
3705 | - * sys_iopl has to be used when you want to access the IO ports | |
3706 | - * beyond the 0x3ff range: to get the full 65536 ports bitmapped | |
3707 | - * you'd need 8kB of bitmaps/process, which is a bit excessive. | |
3708 | - * | |
3709 | - */ | |
3710 | - | |
3711 | -asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs) | |
3712 | -{ | |
3713 | - unsigned int old_iopl = current->thread.iopl; | |
3714 | - struct physdev_set_iopl set_iopl; | |
3715 | - | |
3716 | - if (new_iopl > 3) | |
3717 | - return -EINVAL; | |
3718 | - | |
3719 | - /* Need "raw I/O" privileges for direct port access. */ | |
3720 | - if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO)) | |
3721 | - return -EPERM; | |
3722 | - | |
3723 | - /* Change our version of the privilege levels. */ | |
3724 | - current->thread.iopl = new_iopl; | |
3725 | - | |
3726 | - /* Force the change at ring 0. */ | |
3727 | - set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl; | |
3728 | - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); | |
3729 | - | |
3730 | - return 0; | |
3731 | -} | |
3732 | --- /dev/null | |
3733 | +++ b/arch/x86/kernel/ioport-xen.c | |
3734 | @@ -0,0 +1,112 @@ | |
3735 | +/* | |
3736 | + * This contains the io-permission bitmap code - written by obz, with changes | |
3737 | + * by Linus. 32/64-bit code unification by Miguel Botón. | |
3738 | + */ | |
3739 | + | |
3740 | +#include <linux/sched.h> | |
3741 | +#include <linux/kernel.h> | |
3742 | +#include <linux/capability.h> | |
3743 | +#include <linux/errno.h> | |
3744 | +#include <linux/types.h> | |
3745 | +#include <linux/ioport.h> | |
3746 | +#include <linux/smp.h> | |
3747 | +#include <linux/stddef.h> | |
3748 | +#include <linux/slab.h> | |
3749 | +#include <linux/thread_info.h> | |
3750 | +#include <linux/syscalls.h> | |
3751 | +#include <xen/interface/physdev.h> | |
3752 | + | |
3753 | +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ | |
3754 | +static void set_bitmap(unsigned long *bitmap, unsigned int base, | |
3755 | + unsigned int extent, int new_value) | |
3756 | +{ | |
3757 | + unsigned int i; | |
3758 | + | |
3759 | + for (i = base; i < base + extent; i++) { | |
3760 | + if (new_value) | |
3761 | + __set_bit(i, bitmap); | |
3762 | + else | |
3763 | + __clear_bit(i, bitmap); | |
3764 | + } | |
3765 | +} | |
3766 | + | |
3767 | +/* | |
3768 | + * this changes the io permissions bitmap in the current task. | |
3769 | + */ | |
3770 | +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | |
3771 | +{ | |
3772 | + struct thread_struct * t = ¤t->thread; | |
3773 | + struct physdev_set_iobitmap set_iobitmap; | |
3774 | + | |
3775 | + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) | |
3776 | + return -EINVAL; | |
3777 | + if (turn_on && !capable(CAP_SYS_RAWIO)) | |
3778 | + return -EPERM; | |
3779 | + | |
3780 | + /* | |
3781 | + * If it's the first ioperm() call in this thread's lifetime, set the | |
3782 | + * IO bitmap up. ioperm() is much less timing critical than clone(), | |
3783 | + * this is why we delay this operation until now: | |
3784 | + */ | |
3785 | + if (!t->io_bitmap_ptr) { | |
3786 | + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | |
3787 | + | |
3788 | + if (!bitmap) | |
3789 | + return -ENOMEM; | |
3790 | + | |
3791 | + memset(bitmap, 0xff, IO_BITMAP_BYTES); | |
3792 | + t->io_bitmap_ptr = bitmap; | |
3793 | + set_thread_flag(TIF_IO_BITMAP); | |
3794 | + | |
3795 | + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); | |
3796 | + set_iobitmap.nr_ports = IO_BITMAP_BITS; | |
3797 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, | |
3798 | + &set_iobitmap)); | |
3799 | + } | |
3800 | + | |
3801 | + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); | |
3802 | + | |
3803 | + return 0; | |
3804 | +} | |
3805 | + | |
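A hypothetical user-space counterpart to the sys_ioperm() above. Note that the kernel bitmap starts as all ones (every port traps), which is why the syscall passes !turn_on into set_bitmap(). Requires CAP_SYS_RAWIO.

	#include <stdio.h>
	#include <sys/io.h>

	int main(void)
	{
		if (ioperm(0x3f8, 8, 1) < 0) {	/* unlock the 8 COM1 ports */
			perror("ioperm");
			return 1;
		}
		printf("0x3f8-0x3ff accessible\n");
		return 0;
	}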
3806 | +/* | |
3807 | + * sys_iopl has to be used when you want to access the IO ports | |
3808 | + * beyond the 0x3ff range: to get the full 65536 ports bitmapped | |
3809 | + * you'd need 8kB of bitmaps/process, which is a bit excessive. | |
3810 | + */ | |
3811 | +static int do_iopl(unsigned int level, struct thread_struct *t) | |
3812 | +{ | |
3813 | + unsigned int old = t->iopl >> 12; | |
3814 | + | |
3815 | + if (level > 3) | |
3816 | + return -EINVAL; | |
3817 | + /* Trying to gain more privileges? */ | |
3818 | + if (level > old) { | |
3819 | + if (!capable(CAP_SYS_RAWIO)) | |
3820 | + return -EPERM; | |
3821 | + } | |
3822 | + | |
3823 | + return 0; | |
3824 | +} | |
3825 | + | |
3826 | +#ifdef CONFIG_X86_32 | |
3827 | +asmlinkage long sys_iopl(unsigned long regsp) | |
3828 | +{ | |
3829 | + struct pt_regs *regs = (struct pt_regs *)®sp; | |
3830 | + unsigned int level = regs->bx; | |
3831 | +#else | |
3832 | +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) | |
3833 | +{ | |
3834 | +#endif | |
3835 | + struct thread_struct *t = ¤t->thread; | |
3836 | + int rc; | |
3837 | + | |
3838 | + rc = do_iopl(level, t); | |
3839 | + if (rc < 0) | |
3840 | + goto out; | |
3841 | + | |
3842 | + t->iopl = level << 12; | |
3843 | + set_iopl_mask(t->iopl); | |
3844 | +out: | |
3845 | + return rc; | |
3846 | +} | |
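And a matching user-space sketch for the iopl() path (glibc wrapper; needs CAP_SYS_RAWIO, and under Xen the hypervisor still arbitrates the effective I/O privilege via PHYSDEVOP_set_iopl):

	#include <stdio.h>
	#include <sys/io.h>

	int main(void)
	{
		if (iopl(3) < 0) {	/* raise I/O privilege for all ports */
			perror("iopl");
			return 1;
		}
		outb(0x00, 0x80);	/* e.g. poke the POST debug port */
		return 0;
	}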
3847 | --- a/arch/x86/kernel/irq_32-xen.c | |
3848 | +++ b/arch/x86/kernel/irq_32-xen.c | |
3849 | @@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPU | |
3850 | * SMP cross-CPU interrupts have their own specific | |
3851 | * handlers). | |
3852 | */ | |
3853 | -fastcall unsigned int do_IRQ(struct pt_regs *regs) | |
3854 | +unsigned int do_IRQ(struct pt_regs *regs) | |
3855 | { | |
3856 | struct pt_regs *old_regs; | |
3857 | /* high bit used in ret_from_ code */ | |
3858 | - int irq = ~regs->orig_eax; | |
3859 | + int irq = ~regs->orig_ax; | |
3860 | struct irq_desc *desc = irq_desc + irq; | |
3861 | #ifdef CONFIG_4KSTACKS | |
3862 | union irq_ctx *curctx, *irqctx; | |
3863 | @@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_r | |
3864 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | |
3865 | /* Debugging check for stack overflow: is there less than 1KB free? */ | |
3866 | { | |
3867 | - long esp; | |
3868 | + long sp; | |
3869 | ||
3870 | __asm__ __volatile__("andl %%esp,%0" : | |
3871 | - "=r" (esp) : "0" (THREAD_SIZE - 1)); | |
3872 | - if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { | |
3873 | + "=r" (sp) : "0" (THREAD_SIZE - 1)); | |
3874 | + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { | |
3875 | printk("do_IRQ: stack overflow: %ld\n", | |
3876 | - esp - sizeof(struct thread_info)); | |
3877 | + sp - sizeof(struct thread_info)); | |
3878 | dump_stack(); | |
3879 | } | |
3880 | } | |
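The masking trick in the check above works because thread stacks are THREAD_SIZE-aligned: sp & (THREAD_SIZE - 1) is the offset from the stack base, i.e. the room left before the stack runs into thread_info. A standalone illustration with hypothetical sizes and pointer value:

	#include <stdio.h>

	#define THREAD_SIZE	8192			/* illustrative stack size */
	#define STACK_WARN	(THREAD_SIZE / 8)

	int main(void)
	{
		unsigned long sp = 0xc12345f0UL;	/* hypothetical %esp */
		long room = sp & (THREAD_SIZE - 1);	/* offset from stack base */

		if (room < 64 + STACK_WARN)		/* 64 ~ sizeof(struct thread_info) */
			printf("near overflow: %ld bytes left\n", room);
		else
			printf("ok: %ld bytes above the stack base\n", room);
		return 0;
	}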
3881 | @@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_r | |
3882 | * current stack (which is the irq stack already after all) | |
3883 | */ | |
3884 | if (curctx != irqctx) { | |
3885 | - int arg1, arg2, ebx; | |
3886 | + int arg1, arg2, bx; | |
3887 | ||
3888 | /* build the stack frame on the IRQ stack */ | |
3889 | isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); | |
3890 | @@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_r | |
3891 | (curctx->tinfo.preempt_count & SOFTIRQ_MASK); | |
3892 | ||
3893 | asm volatile( | |
3894 | - " xchgl %%ebx,%%esp \n" | |
3895 | - " call *%%edi \n" | |
3896 | - " movl %%ebx,%%esp \n" | |
3897 | - : "=a" (arg1), "=d" (arg2), "=b" (ebx) | |
3898 | + " xchgl %%ebx,%%esp \n" | |
3899 | + " call *%%edi \n" | |
3900 | + " movl %%ebx,%%esp \n" | |
3901 | + : "=a" (arg1), "=d" (arg2), "=b" (bx) | |
3902 | : "0" (irq), "1" (desc), "2" (isp), | |
3903 | "D" (desc->handle_irq) | |
3904 | : "memory", "cc" | |
3905 | --- a/arch/x86/kernel/irq_64-xen.c | |
3906 | +++ b/arch/x86/kernel/irq_64-xen.c | |
3907 | @@ -20,6 +20,28 @@ | |
3908 | ||
3909 | atomic_t irq_err_count; | |
3910 | ||
3911 | +/* | |
3912 | + * 'What should we do if we get a hw irq event on an illegal vector?' | |
3913 | + * Each architecture has to answer this itself. | |
3914 | + */ | |
3915 | +void ack_bad_irq(unsigned int irq) | |
3916 | +{ | |
3917 | + printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq); | |
3918 | +#ifdef CONFIG_X86_LOCAL_APIC | |
3919 | + /* | |
3920 | + * Currently unexpected vectors happen only on SMP and APIC. | |
3921 | + * We _must_ ack these because every local APIC has only N | |
3922 | + * irq slots per priority level, and a 'hanging, unacked' IRQ | |
3923 | + * holds up an irq slot - in excessive cases (when multiple | |
3924 | + * unexpected vectors occur) that might lock up the APIC | |
3925 | + * completely. | |
3926 | + * But don't ack when the APIC is disabled. -AK | |
3927 | + */ | |
3928 | + if (!disable_apic) | |
3929 | + ack_APIC_irq(); | |
3930 | +#endif | |
3931 | +} | |
3932 | + | |
3933 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | |
3934 | /* | |
3935 | * Probabilistic stack overflow check: | |
3936 | @@ -33,11 +55,11 @@ static inline void stack_overflow_check( | |
3937 | u64 curbase = (u64)task_stack_page(current); | |
3938 | static unsigned long warned = -60*HZ; | |
3939 | ||
3940 | - if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && | |
3941 | - regs->rsp < curbase + sizeof(struct thread_info) + 128 && | |
3942 | + if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && | |
3943 | + regs->sp < curbase + sizeof(struct thread_info) + 128 && | |
3944 | time_after(jiffies, warned + 60*HZ)) { | |
3945 | - printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", | |
3946 | - current->comm, curbase, regs->rsp); | |
3947 | + printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", | |
3948 | + current->comm, curbase, regs->sp); | |
3949 | show_stack(NULL,NULL); | |
3950 | warned = jiffies; | |
3951 | } | |
3952 | @@ -150,7 +172,7 @@ asmlinkage unsigned int do_IRQ(struct pt | |
3953 | struct pt_regs *old_regs = set_irq_regs(regs); | |
3954 | ||
3955 | /* high bit used in ret_from_ code */ | |
3956 | - unsigned irq = ~regs->orig_rax; | |
3957 | + unsigned irq = ~regs->orig_ax; | |
3958 | ||
3959 | /*exit_idle();*/ | |
3960 | /*irq_enter();*/ | |
3961 | @@ -251,14 +273,3 @@ asmlinkage void do_softirq(void) | |
3962 | } | |
3963 | local_irq_restore(flags); | |
3964 | } | |
3965 | - | |
3966 | -#ifndef CONFIG_X86_LOCAL_APIC | |
3967 | -/* | |
3968 | - * 'what should we do if we get a hw irq event on an illegal vector'. | |
3969 | - * each architecture has to answer this themselves. | |
3970 | - */ | |
3971 | -void ack_bad_irq(unsigned int irq) | |
3972 | -{ | |
3973 | - printk("unexpected IRQ trap at irq %02x\n", irq); | |
3974 | -} | |
3975 | -#endif | |
3976 | --- a/arch/x86/kernel/ldt_32-xen.c | |
3977 | +++ /dev/null | |
3978 | @@ -1,265 +0,0 @@ | |
3979 | -/* | |
3980 | - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds | |
3981 | - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | |
3982 | - */ | |
3983 | - | |
3984 | -#include <linux/errno.h> | |
3985 | -#include <linux/sched.h> | |
3986 | -#include <linux/string.h> | |
3987 | -#include <linux/mm.h> | |
3988 | -#include <linux/smp.h> | |
3989 | -#include <linux/vmalloc.h> | |
3990 | -#include <linux/slab.h> | |
3991 | - | |
3992 | -#include <asm/uaccess.h> | |
3993 | -#include <asm/system.h> | |
3994 | -#include <asm/ldt.h> | |
3995 | -#include <asm/desc.h> | |
3996 | -#include <asm/mmu_context.h> | |
3997 | - | |
3998 | -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ | |
3999 | -static void flush_ldt(void *null) | |
4000 | -{ | |
4001 | - if (current->active_mm) | |
4002 | - load_LDT(¤t->active_mm->context); | |
4003 | -} | |
4004 | -#endif | |
4005 | - | |
4006 | -static int alloc_ldt(mm_context_t *pc, int mincount, int reload) | |
4007 | -{ | |
4008 | - void *oldldt; | |
4009 | - void *newldt; | |
4010 | - int oldsize; | |
4011 | - | |
4012 | - if (mincount <= pc->size) | |
4013 | - return 0; | |
4014 | - oldsize = pc->size; | |
4015 | - mincount = (mincount+511)&(~511); | |
4016 | - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) | |
4017 | - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); | |
4018 | - else | |
4019 | - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); | |
4020 | - | |
4021 | - if (!newldt) | |
4022 | - return -ENOMEM; | |
4023 | - | |
4024 | - if (oldsize) | |
4025 | - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); | |
4026 | - oldldt = pc->ldt; | |
4027 | - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); | |
4028 | - pc->ldt = newldt; | |
4029 | - wmb(); | |
4030 | - pc->size = mincount; | |
4031 | - wmb(); | |
4032 | - | |
4033 | - if (reload) { | |
4034 | -#ifdef CONFIG_SMP | |
4035 | - cpumask_t mask; | |
4036 | - preempt_disable(); | |
4037 | -#endif | |
4038 | - make_pages_readonly( | |
4039 | - pc->ldt, | |
4040 | - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4041 | - XENFEAT_writable_descriptor_tables); | |
4042 | - load_LDT(pc); | |
4043 | -#ifdef CONFIG_SMP | |
4044 | - mask = cpumask_of_cpu(smp_processor_id()); | |
4045 | - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) | |
4046 | - smp_call_function(flush_ldt, NULL, 1, 1); | |
4047 | - preempt_enable(); | |
4048 | -#endif | |
4049 | - } | |
4050 | - if (oldsize) { | |
4051 | - make_pages_writable( | |
4052 | - oldldt, | |
4053 | - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4054 | - XENFEAT_writable_descriptor_tables); | |
4055 | - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) | |
4056 | - vfree(oldldt); | |
4057 | - else | |
4058 | - kfree(oldldt); | |
4059 | - } | |
4060 | - return 0; | |
4061 | -} | |
4062 | - | |
4063 | -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) | |
4064 | -{ | |
4065 | - int err = alloc_ldt(new, old->size, 0); | |
4066 | - if (err < 0) | |
4067 | - return err; | |
4068 | - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); | |
4069 | - make_pages_readonly( | |
4070 | - new->ldt, | |
4071 | - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4072 | - XENFEAT_writable_descriptor_tables); | |
4073 | - return 0; | |
4074 | -} | |
4075 | - | |
4076 | -/* | |
4077 | - * we do not have to muck with descriptors here, that is | |
4078 | - * done in switch_mm() as needed. | |
4079 | - */ | |
4080 | -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | |
4081 | -{ | |
4082 | - struct mm_struct * old_mm; | |
4083 | - int retval = 0; | |
4084 | - | |
4085 | - mutex_init(&mm->context.lock); | |
4086 | - mm->context.size = 0; | |
4087 | - mm->context.has_foreign_mappings = 0; | |
4088 | - old_mm = current->mm; | |
4089 | - if (old_mm && old_mm->context.size > 0) { | |
4090 | - mutex_lock(&old_mm->context.lock); | |
4091 | - retval = copy_ldt(&mm->context, &old_mm->context); | |
4092 | - mutex_unlock(&old_mm->context.lock); | |
4093 | - } | |
4094 | - return retval; | |
4095 | -} | |
4096 | - | |
4097 | -/* | |
4098 | - * No need to lock the MM as we are the last user | |
4099 | - */ | |
4100 | -void destroy_context(struct mm_struct *mm) | |
4101 | -{ | |
4102 | - if (mm->context.size) { | |
4103 | - if (mm == current->active_mm) | |
4104 | - clear_LDT(); | |
4105 | - make_pages_writable( | |
4106 | - mm->context.ldt, | |
4107 | - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4108 | - XENFEAT_writable_descriptor_tables); | |
4109 | - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) | |
4110 | - vfree(mm->context.ldt); | |
4111 | - else | |
4112 | - kfree(mm->context.ldt); | |
4113 | - mm->context.size = 0; | |
4114 | - } | |
4115 | -} | |
4116 | - | |
4117 | -static int read_ldt(void __user * ptr, unsigned long bytecount) | |
4118 | -{ | |
4119 | - int err; | |
4120 | - unsigned long size; | |
4121 | - struct mm_struct * mm = current->mm; | |
4122 | - | |
4123 | - if (!mm->context.size) | |
4124 | - return 0; | |
4125 | - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) | |
4126 | - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; | |
4127 | - | |
4128 | - mutex_lock(&mm->context.lock); | |
4129 | - size = mm->context.size*LDT_ENTRY_SIZE; | |
4130 | - if (size > bytecount) | |
4131 | - size = bytecount; | |
4132 | - | |
4133 | - err = 0; | |
4134 | - if (copy_to_user(ptr, mm->context.ldt, size)) | |
4135 | - err = -EFAULT; | |
4136 | - mutex_unlock(&mm->context.lock); | |
4137 | - if (err < 0) | |
4138 | - goto error_return; | |
4139 | - if (size != bytecount) { | |
4140 | - /* zero-fill the rest */ | |
4141 | - if (clear_user(ptr+size, bytecount-size) != 0) { | |
4142 | - err = -EFAULT; | |
4143 | - goto error_return; | |
4144 | - } | |
4145 | - } | |
4146 | - return bytecount; | |
4147 | -error_return: | |
4148 | - return err; | |
4149 | -} | |
4150 | - | |
4151 | -static int read_default_ldt(void __user * ptr, unsigned long bytecount) | |
4152 | -{ | |
4153 | - int err; | |
4154 | - unsigned long size; | |
4155 | - | |
4156 | - err = 0; | |
4157 | - size = 5*sizeof(struct desc_struct); | |
4158 | - if (size > bytecount) | |
4159 | - size = bytecount; | |
4160 | - | |
4161 | - err = size; | |
4162 | - if (clear_user(ptr, size)) | |
4163 | - err = -EFAULT; | |
4164 | - | |
4165 | - return err; | |
4166 | -} | |
4167 | - | |
4168 | -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) | |
4169 | -{ | |
4170 | - struct mm_struct * mm = current->mm; | |
4171 | - __u32 entry_1, entry_2; | |
4172 | - int error; | |
4173 | - struct user_desc ldt_info; | |
4174 | - | |
4175 | - error = -EINVAL; | |
4176 | - if (bytecount != sizeof(ldt_info)) | |
4177 | - goto out; | |
4178 | - error = -EFAULT; | |
4179 | - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) | |
4180 | - goto out; | |
4181 | - | |
4182 | - error = -EINVAL; | |
4183 | - if (ldt_info.entry_number >= LDT_ENTRIES) | |
4184 | - goto out; | |
4185 | - if (ldt_info.contents == 3) { | |
4186 | - if (oldmode) | |
4187 | - goto out; | |
4188 | - if (ldt_info.seg_not_present == 0) | |
4189 | - goto out; | |
4190 | - } | |
4191 | - | |
4192 | - mutex_lock(&mm->context.lock); | |
4193 | - if (ldt_info.entry_number >= mm->context.size) { | |
4194 | - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); | |
4195 | - if (error < 0) | |
4196 | - goto out_unlock; | |
4197 | - } | |
4198 | - | |
4199 | - /* Allow LDTs to be cleared by the user. */ | |
4200 | - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { | |
4201 | - if (oldmode || LDT_empty(&ldt_info)) { | |
4202 | - entry_1 = 0; | |
4203 | - entry_2 = 0; | |
4204 | - goto install; | |
4205 | - } | |
4206 | - } | |
4207 | - | |
4208 | - entry_1 = LDT_entry_a(&ldt_info); | |
4209 | - entry_2 = LDT_entry_b(&ldt_info); | |
4210 | - if (oldmode) | |
4211 | - entry_2 &= ~(1 << 20); | |
4212 | - | |
4213 | - /* Install the new entry ... */ | |
4214 | -install: | |
4215 | - error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, | |
4216 | - entry_1, entry_2); | |
4217 | - | |
4218 | -out_unlock: | |
4219 | - mutex_unlock(&mm->context.lock); | |
4220 | -out: | |
4221 | - return error; | |
4222 | -} | |
4223 | - | |
4224 | -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) | |
4225 | -{ | |
4226 | - int ret = -ENOSYS; | |
4227 | - | |
4228 | - switch (func) { | |
4229 | - case 0: | |
4230 | - ret = read_ldt(ptr, bytecount); | |
4231 | - break; | |
4232 | - case 1: | |
4233 | - ret = write_ldt(ptr, bytecount, 1); | |
4234 | - break; | |
4235 | - case 2: | |
4236 | - ret = read_default_ldt(ptr, bytecount); | |
4237 | - break; | |
4238 | - case 0x11: | |
4239 | - ret = write_ldt(ptr, bytecount, 0); | |
4240 | - break; | |
4241 | - } | |
4242 | - return ret; | |
4243 | -} | |
4244 | --- a/arch/x86/kernel/ldt_64-xen.c | |
4245 | +++ /dev/null | |
4246 | @@ -1,271 +0,0 @@ | |
4247 | -/* | |
4248 | - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds | |
4249 | - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | |
4250 | - * Copyright (C) 2002 Andi Kleen | |
4251 | - * | |
4252 | - * This handles calls from both 32bit and 64bit mode. | |
4253 | - */ | |
4254 | - | |
4255 | -#include <linux/errno.h> | |
4256 | -#include <linux/sched.h> | |
4257 | -#include <linux/string.h> | |
4258 | -#include <linux/mm.h> | |
4259 | -#include <linux/smp.h> | |
4260 | -#include <linux/vmalloc.h> | |
4261 | -#include <linux/slab.h> | |
4262 | - | |
4263 | -#include <asm/uaccess.h> | |
4264 | -#include <asm/system.h> | |
4265 | -#include <asm/ldt.h> | |
4266 | -#include <asm/desc.h> | |
4267 | -#include <asm/proto.h> | |
4268 | -#include <asm/pgalloc.h> | |
4269 | - | |
4270 | -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ | |
4271 | -static void flush_ldt(void *null) | |
4272 | -{ | |
4273 | - if (current->active_mm) | |
4274 | - load_LDT(¤t->active_mm->context); | |
4275 | -} | |
4276 | -#endif | |
4277 | - | |
4278 | -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) | |
4279 | -{ | |
4280 | - void *oldldt; | |
4281 | - void *newldt; | |
4282 | - unsigned oldsize; | |
4283 | - | |
4284 | - if (mincount <= (unsigned)pc->size) | |
4285 | - return 0; | |
4286 | - oldsize = pc->size; | |
4287 | - mincount = (mincount+511)&(~511); | |
4288 | - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) | |
4289 | - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); | |
4290 | - else | |
4291 | - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); | |
4292 | - | |
4293 | - if (!newldt) | |
4294 | - return -ENOMEM; | |
4295 | - | |
4296 | - if (oldsize) | |
4297 | - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); | |
4298 | - oldldt = pc->ldt; | |
4299 | - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); | |
4300 | - wmb(); | |
4301 | - pc->ldt = newldt; | |
4302 | - wmb(); | |
4303 | - pc->size = mincount; | |
4304 | - wmb(); | |
4305 | - if (reload) { | |
4306 | -#ifdef CONFIG_SMP | |
4307 | - cpumask_t mask; | |
4308 | - | |
4309 | - preempt_disable(); | |
4310 | -#endif | |
4311 | - make_pages_readonly( | |
4312 | - pc->ldt, | |
4313 | - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4314 | - XENFEAT_writable_descriptor_tables); | |
4315 | - load_LDT(pc); | |
4316 | -#ifdef CONFIG_SMP | |
4317 | - mask = cpumask_of_cpu(smp_processor_id()); | |
4318 | - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) | |
4319 | - smp_call_function(flush_ldt, NULL, 1, 1); | |
4320 | - preempt_enable(); | |
4321 | -#endif | |
4322 | - } | |
4323 | - if (oldsize) { | |
4324 | - make_pages_writable( | |
4325 | - oldldt, | |
4326 | - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4327 | - XENFEAT_writable_descriptor_tables); | |
4328 | - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) | |
4329 | - vfree(oldldt); | |
4330 | - else | |
4331 | - kfree(oldldt); | |
4332 | - } | |
4333 | - return 0; | |
4334 | -} | |
4335 | - | |
4336 | -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) | |
4337 | -{ | |
4338 | - int err = alloc_ldt(new, old->size, 0); | |
4339 | - if (err < 0) | |
4340 | - return err; | |
4341 | - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); | |
4342 | - make_pages_readonly( | |
4343 | - new->ldt, | |
4344 | - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4345 | - XENFEAT_writable_descriptor_tables); | |
4346 | - return 0; | |
4347 | -} | |
4348 | - | |
4349 | -/* | |
4350 | - * we do not have to muck with descriptors here, that is | |
4351 | - * done in switch_mm() as needed. | |
4352 | - */ | |
4353 | -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | |
4354 | -{ | |
4355 | - struct mm_struct * old_mm; | |
4356 | - int retval = 0; | |
4357 | - | |
4358 | - memset(&mm->context, 0, sizeof(mm->context)); | |
4359 | - mutex_init(&mm->context.lock); | |
4360 | - old_mm = current->mm; | |
4361 | - if (old_mm) | |
4362 | - mm->context.vdso = old_mm->context.vdso; | |
4363 | - if (old_mm && old_mm->context.size > 0) { | |
4364 | - mutex_lock(&old_mm->context.lock); | |
4365 | - retval = copy_ldt(&mm->context, &old_mm->context); | |
4366 | - mutex_unlock(&old_mm->context.lock); | |
4367 | - } | |
4368 | - return retval; | |
4369 | -} | |
4370 | - | |
4371 | -/* | |
4372 | - * | |
4373 | - * Don't touch the LDT register - we're already in the next thread. | |
4374 | - */ | |
4375 | -void destroy_context(struct mm_struct *mm) | |
4376 | -{ | |
4377 | - if (mm->context.size) { | |
4378 | - if (mm == current->active_mm) | |
4379 | - clear_LDT(); | |
4380 | - make_pages_writable( | |
4381 | - mm->context.ldt, | |
4382 | - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4383 | - XENFEAT_writable_descriptor_tables); | |
4384 | - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) | |
4385 | - vfree(mm->context.ldt); | |
4386 | - else | |
4387 | - kfree(mm->context.ldt); | |
4388 | - mm->context.size = 0; | |
4389 | - } | |
4390 | -} | |
4391 | - | |
4392 | -static int read_ldt(void __user * ptr, unsigned long bytecount) | |
4393 | -{ | |
4394 | - int err; | |
4395 | - unsigned long size; | |
4396 | - struct mm_struct * mm = current->mm; | |
4397 | - | |
4398 | - if (!mm->context.size) | |
4399 | - return 0; | |
4400 | - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) | |
4401 | - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; | |
4402 | - | |
4403 | - mutex_lock(&mm->context.lock); | |
4404 | - size = mm->context.size*LDT_ENTRY_SIZE; | |
4405 | - if (size > bytecount) | |
4406 | - size = bytecount; | |
4407 | - | |
4408 | - err = 0; | |
4409 | - if (copy_to_user(ptr, mm->context.ldt, size)) | |
4410 | - err = -EFAULT; | |
4411 | - mutex_unlock(&mm->context.lock); | |
4412 | - if (err < 0) | |
4413 | - goto error_return; | |
4414 | - if (size != bytecount) { | |
4415 | - /* zero-fill the rest */ | |
4416 | - if (clear_user(ptr+size, bytecount-size) != 0) { | |
4417 | - err = -EFAULT; | |
4418 | - goto error_return; | |
4419 | - } | |
4420 | - } | |
4421 | - return bytecount; | |
4422 | -error_return: | |
4423 | - return err; | |
4424 | -} | |
4425 | - | |
4426 | -static int read_default_ldt(void __user * ptr, unsigned long bytecount) | |
4427 | -{ | |
4428 | - /* Arbitrary number */ | |
4429 | - /* x86-64 default LDT is all zeros */ | |
4430 | - if (bytecount > 128) | |
4431 | - bytecount = 128; | |
4432 | - if (clear_user(ptr, bytecount)) | |
4433 | - return -EFAULT; | |
4434 | - return bytecount; | |
4435 | -} | |
4436 | - | |
4437 | -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) | |
4438 | -{ | |
4439 | - struct task_struct *me = current; | |
4440 | - struct mm_struct * mm = me->mm; | |
4441 | - __u32 entry_1, entry_2, *lp; | |
4442 | - unsigned long mach_lp; | |
4443 | - int error; | |
4444 | - struct user_desc ldt_info; | |
4445 | - | |
4446 | - error = -EINVAL; | |
4447 | - | |
4448 | - if (bytecount != sizeof(ldt_info)) | |
4449 | - goto out; | |
4450 | - error = -EFAULT; | |
4451 | - if (copy_from_user(&ldt_info, ptr, bytecount)) | |
4452 | - goto out; | |
4453 | - | |
4454 | - error = -EINVAL; | |
4455 | - if (ldt_info.entry_number >= LDT_ENTRIES) | |
4456 | - goto out; | |
4457 | - if (ldt_info.contents == 3) { | |
4458 | - if (oldmode) | |
4459 | - goto out; | |
4460 | - if (ldt_info.seg_not_present == 0) | |
4461 | - goto out; | |
4462 | - } | |
4463 | - | |
4464 | - mutex_lock(&mm->context.lock); | |
4465 | - if (ldt_info.entry_number >= (unsigned)mm->context.size) { | |
4466 | - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); | |
4467 | - if (error < 0) | |
4468 | - goto out_unlock; | |
4469 | - } | |
4470 | - | |
4471 | - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); | |
4472 | - mach_lp = arbitrary_virt_to_machine(lp); | |
4473 | - | |
4474 | - /* Allow LDTs to be cleared by the user. */ | |
4475 | - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { | |
4476 | - if (oldmode || LDT_empty(&ldt_info)) { | |
4477 | - entry_1 = 0; | |
4478 | - entry_2 = 0; | |
4479 | - goto install; | |
4480 | - } | |
4481 | - } | |
4482 | - | |
4483 | - entry_1 = LDT_entry_a(&ldt_info); | |
4484 | - entry_2 = LDT_entry_b(&ldt_info); | |
4485 | - if (oldmode) | |
4486 | - entry_2 &= ~(1 << 20); | |
4487 | - | |
4488 | - /* Install the new entry ... */ | |
4489 | -install: | |
4490 | - error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32))); | |
4491 | - | |
4492 | -out_unlock: | |
4493 | - mutex_unlock(&mm->context.lock); | |
4494 | -out: | |
4495 | - return error; | |
4496 | -} | |
4497 | - | |
4498 | -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) | |
4499 | -{ | |
4500 | - int ret = -ENOSYS; | |
4501 | - | |
4502 | - switch (func) { | |
4503 | - case 0: | |
4504 | - ret = read_ldt(ptr, bytecount); | |
4505 | - break; | |
4506 | - case 1: | |
4507 | - ret = write_ldt(ptr, bytecount, 1); | |
4508 | - break; | |
4509 | - case 2: | |
4510 | - ret = read_default_ldt(ptr, bytecount); | |
4511 | - break; | |
4512 | - case 0x11: | |
4513 | - ret = write_ldt(ptr, bytecount, 0); | |
4514 | - break; | |
4515 | - } | |
4516 | - return ret; | |
4517 | -} | |
4518 | --- /dev/null | |
4519 | +++ b/arch/x86/kernel/ldt-xen.c | |
4520 | @@ -0,0 +1,272 @@ | |
4521 | +/* | |
4522 | + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds | |
4523 | + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | |
4524 | + * Copyright (C) 2002 Andi Kleen | |
4525 | + * | |
4526 | + * This handles calls from both 32bit and 64bit mode. | |
4527 | + */ | |
4528 | + | |
4529 | +#include <linux/errno.h> | |
4530 | +#include <linux/sched.h> | |
4531 | +#include <linux/string.h> | |
4532 | +#include <linux/mm.h> | |
4533 | +#include <linux/smp.h> | |
4534 | +#include <linux/vmalloc.h> | |
4535 | + | |
4536 | +#include <asm/uaccess.h> | |
4537 | +#include <asm/system.h> | |
4538 | +#include <asm/ldt.h> | |
4539 | +#include <asm/desc.h> | |
4540 | +#include <asm/mmu_context.h> | |
4541 | + | |
4542 | +#ifdef CONFIG_SMP | |
4543 | +static void flush_ldt(void *null) | |
4544 | +{ | |
4545 | + if (current->active_mm) | |
4546 | + load_LDT(¤t->active_mm->context); | |
4547 | +} | |
4548 | +#endif | |
4549 | + | |
4550 | +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) | |
4551 | +{ | |
4552 | + void *oldldt, *newldt; | |
4553 | + int oldsize; | |
4554 | + | |
4555 | + if (mincount <= pc->size) | |
4556 | + return 0; | |
4557 | + oldsize = pc->size; | |
4558 | + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & | |
4559 | + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); | |
4560 | + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) | |
4561 | + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); | |
4562 | + else | |
4563 | + newldt = (void *)__get_free_page(GFP_KERNEL); | |
4564 | + | |
4565 | + if (!newldt) | |
4566 | + return -ENOMEM; | |
4567 | + | |
4568 | + if (oldsize) | |
4569 | + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); | |
4570 | + oldldt = pc->ldt; | |
4571 | + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, | |
4572 | + (mincount - oldsize) * LDT_ENTRY_SIZE); | |
4573 | + | |
4574 | +#ifdef CONFIG_X86_64 | |
4575 | + /* CHECKME: Do we really need this ? */ | |
4576 | + wmb(); | |
4577 | +#endif | |
4578 | + pc->ldt = newldt; | |
4579 | + wmb(); | |
4580 | + pc->size = mincount; | |
4581 | + wmb(); | |
4582 | + | |
4583 | + if (reload) { | |
4584 | +#ifdef CONFIG_SMP | |
4585 | + cpumask_t mask; | |
4586 | + | |
4587 | + preempt_disable(); | |
4588 | +#endif | |
4589 | + make_pages_readonly(newldt, | |
4590 | + (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4591 | + XENFEAT_writable_descriptor_tables); | |
4592 | + load_LDT(pc); | |
4593 | +#ifdef CONFIG_SMP | |
4594 | + mask = cpumask_of_cpu(smp_processor_id()); | |
4595 | + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) | |
4596 | + smp_call_function(flush_ldt, NULL, 1, 1); | |
4597 | + preempt_enable(); | |
4598 | +#endif | |
4599 | + } | |
4600 | + if (oldsize) { | |
4601 | + make_pages_writable(oldldt, | |
4602 | + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4603 | + XENFEAT_writable_descriptor_tables); | |
4604 | + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) | |
4605 | + vfree(oldldt); | |
4606 | + else | |
4607 | + put_page(virt_to_page(oldldt)); | |
4608 | + } | |
4609 | + return 0; | |
4610 | +} | |
4611 | + | |
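The new rounding expression in alloc_ldt() sizes the table in whole pages; with PAGE_SIZE 4096 and LDT_ENTRY_SIZE 8 it reduces to the old (mincount+511)&(~511). A quick check of the arithmetic:

	#include <stdio.h>

	#define PAGE_SIZE	4096
	#define LDT_ENTRY_SIZE	8

	int main(void)
	{
		int chunk = PAGE_SIZE / LDT_ENTRY_SIZE;	/* 512 entries per page */
		int mincount = 13;
		int rounded = (mincount + chunk - 1) & ~(chunk - 1);

		printf("%d -> %d entries (%d bytes)\n",
		       mincount, rounded, rounded * LDT_ENTRY_SIZE);	/* 13 -> 512 (4096) */
		return 0;
	}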
4612 | +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) | |
4613 | +{ | |
4614 | + int err = alloc_ldt(new, old->size, 0); | |
4615 | + | |
4616 | + if (err < 0) | |
4617 | + return err; | |
4618 | + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); | |
4619 | + make_pages_readonly(new->ldt, | |
4620 | + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4621 | + XENFEAT_writable_descriptor_tables); | |
4622 | + return 0; | |
4623 | +} | |
4624 | + | |
4625 | +/* | |
4626 | + * we do not have to muck with descriptors here, that is | |
4627 | + * done in switch_mm() as needed. | |
4628 | + */ | |
4629 | +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | |
4630 | +{ | |
4631 | + struct mm_struct *old_mm; | |
4632 | + int retval = 0; | |
4633 | + | |
4634 | + memset(&mm->context, 0, sizeof(mm->context)); | |
4635 | + mutex_init(&mm->context.lock); | |
4636 | + old_mm = current->mm; | |
4637 | + if (old_mm) | |
4638 | + mm->context.vdso = old_mm->context.vdso; | |
4639 | + if (old_mm && old_mm->context.size > 0) { | |
4640 | + mutex_lock(&old_mm->context.lock); | |
4641 | + retval = copy_ldt(&mm->context, &old_mm->context); | |
4642 | + mutex_unlock(&old_mm->context.lock); | |
4643 | + } | |
4644 | + return retval; | |
4645 | +} | |
4646 | + | |
4647 | +/* | |
4648 | + * No need to lock the MM as we are the last user | |
4649 | + * | |
4650 | + * 64bit: Don't touch the LDT register - we're already in the next thread. | |
4651 | + */ | |
4652 | +void destroy_context(struct mm_struct *mm) | |
4653 | +{ | |
4654 | + if (mm->context.size) { | |
4655 | + /* CHECKME: Can this ever happen ? */ | |
4656 | + if (mm == current->active_mm) | |
4657 | + clear_LDT(); | |
4658 | + make_pages_writable(mm->context.ldt, | |
4659 | + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
4660 | + XENFEAT_writable_descriptor_tables); | |
4661 | + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) | |
4662 | + vfree(mm->context.ldt); | |
4663 | + else | |
4664 | + put_page(virt_to_page(mm->context.ldt)); | |
4665 | + mm->context.size = 0; | |
4666 | + } | |
4667 | +} | |
4668 | + | |
4669 | +static int read_ldt(void __user *ptr, unsigned long bytecount) | |
4670 | +{ | |
4671 | + int err; | |
4672 | + unsigned long size; | |
4673 | + struct mm_struct *mm = current->mm; | |
4674 | + | |
4675 | + if (!mm->context.size) | |
4676 | + return 0; | |
4677 | + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) | |
4678 | + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; | |
4679 | + | |
4680 | + mutex_lock(&mm->context.lock); | |
4681 | + size = mm->context.size * LDT_ENTRY_SIZE; | |
4682 | + if (size > bytecount) | |
4683 | + size = bytecount; | |
4684 | + | |
4685 | + err = 0; | |
4686 | + if (copy_to_user(ptr, mm->context.ldt, size)) | |
4687 | + err = -EFAULT; | |
4688 | + mutex_unlock(&mm->context.lock); | |
4689 | + if (err < 0) | |
4690 | + goto error_return; | |
4691 | + if (size != bytecount) { | |
4692 | + /* zero-fill the rest */ | |
4693 | + if (clear_user(ptr + size, bytecount - size) != 0) { | |
4694 | + err = -EFAULT; | |
4695 | + goto error_return; | |
4696 | + } | |
4697 | + } | |
4698 | + return bytecount; | |
4699 | +error_return: | |
4700 | + return err; | |
4701 | +} | |
4702 | + | |
4703 | +static int read_default_ldt(void __user *ptr, unsigned long bytecount) | |
4704 | +{ | |
4705 | + /* CHECKME: Can we use _one_ random number ? */ | |
4706 | +#ifdef CONFIG_X86_32 | |
4707 | + unsigned long size = 5 * sizeof(struct desc_struct); | |
4708 | +#else | |
4709 | + unsigned long size = 128; | |
4710 | +#endif | |
4711 | + if (bytecount > size) | |
4712 | + bytecount = size; | |
4713 | + if (clear_user(ptr, bytecount)) | |
4714 | + return -EFAULT; | |
4715 | + return bytecount; | |
4716 | +} | |
4717 | + | |
4718 | +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) | |
4719 | +{ | |
4720 | + struct mm_struct *mm = current->mm; | |
4721 | + struct desc_struct ldt; | |
4722 | + int error; | |
4723 | + struct user_desc ldt_info; | |
4724 | + | |
4725 | + error = -EINVAL; | |
4726 | + if (bytecount != sizeof(ldt_info)) | |
4727 | + goto out; | |
4728 | + error = -EFAULT; | |
4729 | + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) | |
4730 | + goto out; | |
4731 | + | |
4732 | + error = -EINVAL; | |
4733 | + if (ldt_info.entry_number >= LDT_ENTRIES) | |
4734 | + goto out; | |
4735 | + if (ldt_info.contents == 3) { | |
4736 | + if (oldmode) | |
4737 | + goto out; | |
4738 | + if (ldt_info.seg_not_present == 0) | |
4739 | + goto out; | |
4740 | + } | |
4741 | + | |
4742 | + mutex_lock(&mm->context.lock); | |
4743 | + if (ldt_info.entry_number >= mm->context.size) { | |
4744 | + error = alloc_ldt(&current->mm->context, | |
4745 | + ldt_info.entry_number + 1, 1); | |
4746 | + if (error < 0) | |
4747 | + goto out_unlock; | |
4748 | + } | |
4749 | + | |
4750 | + /* Allow LDTs to be cleared by the user. */ | |
4751 | + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { | |
4752 | + if (oldmode || LDT_empty(&ldt_info)) { | |
4753 | + memset(&ldt, 0, sizeof(ldt)); | |
4754 | + goto install; | |
4755 | + } | |
4756 | + } | |
4757 | + | |
4758 | + fill_ldt(&ldt, &ldt_info); | |
4759 | + if (oldmode) | |
4760 | + ldt.avl = 0; | |
4761 | + | |
4762 | + /* Install the new entry ... */ | |
4763 | +install: | |
4764 | + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); | |
4765 | + | |
4766 | +out_unlock: | |
4767 | + mutex_unlock(&mm->context.lock); | |
4768 | +out: | |
4769 | + return error; | |
4770 | +} | |
4771 | + | |
4772 | +asmlinkage int sys_modify_ldt(int func, void __user *ptr, | |
4773 | + unsigned long bytecount) | |
4774 | +{ | |
4775 | + int ret = -ENOSYS; | |
4776 | + | |
4777 | + switch (func) { | |
4778 | + case 0: | |
4779 | + ret = read_ldt(ptr, bytecount); | |
4780 | + break; | |
4781 | + case 1: | |
4782 | + ret = write_ldt(ptr, bytecount, 1); | |
4783 | + break; | |
4784 | + case 2: | |
4785 | + ret = read_default_ldt(ptr, bytecount); | |
4786 | + break; | |
4787 | + case 0x11: | |
4788 | + ret = write_ldt(ptr, bytecount, 0); | |
4789 | + break; | |
4790 | + } | |
4791 | + return ret; | |
4792 | +} | |
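
The four function codes dispatched by sys_modify_ldt() above are reachable from userspace via modify_ldt(2). A minimal sketch of a caller, assuming glibc's generic syscall(2) wrapper and the exported <asm/ldt.h> definitions (the descriptor values here are purely illustrative):

	#include <asm/ldt.h>		/* struct user_desc, LDT_ENTRY_SIZE */
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		struct user_desc d;
		char buf[LDT_ENTRY_SIZE * 4];

		/* func 0x11: install one entry via write_ldt(..., oldmode=0) */
		memset(&d, 0, sizeof(d));
		d.entry_number = 0;
		d.limit = 0xfff;	/* small data segment at base 0 */
		d.seg_32bit = 1;
		if (syscall(SYS_modify_ldt, 0x11, &d, sizeof(d)) != 0)
			perror("modify_ldt(write)");

		/* func 0: read the first entries back via read_ldt() */
		if (syscall(SYS_modify_ldt, 0, buf, sizeof(buf)) < 0)
			perror("modify_ldt(read)");
		return 0;
	}

Note that alloc_ldt() rounds the requested entry count up to a whole page: with PAGE_SIZE 4096 and LDT_ENTRY_SIZE 8 that is 512 entries per page, so installing entry 0 already allocates, and under Xen write-protects, one full 512-entry page.
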
4793 | --- a/arch/x86/kernel/machine_kexec_64.c | |
4794 | +++ b/arch/x86/kernel/machine_kexec_64.c | |
4795 | @@ -300,7 +300,9 @@ void machine_kexec(struct kimage *image) | |
4796 | ||
4797 | void arch_crash_save_vmcoreinfo(void) | |
4798 | { | |
4799 | +#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */ | |
4800 | VMCOREINFO_SYMBOL(phys_base); | |
4801 | +#endif | |
4802 | VMCOREINFO_SYMBOL(init_level4_pgt); | |
4803 | ||
4804 | #ifdef CONFIG_NUMA | |
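
Per the in-line comment, the exclusion is really about relocation: under Xen (as with CONFIG_RELOCATABLE) phys_base has no single fixed value that crash tools could rely on. For orientation, VMCOREINFO_SYMBOL() in this kernel expands roughly to the following (a sketch of the include/linux/kexec.h definition), i.e. it records the symbol's address as text in the vmcore note:

	#define VMCOREINFO_SYMBOL(name) \
		vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
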
4805 | --- a/arch/x86/kernel/Makefile | |
4806 | +++ b/arch/x86/kernel/Makefile | |
4807 | @@ -120,11 +120,10 @@ ifeq ($(CONFIG_X86_64),y) | |
4808 | ||
4809 | obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o | |
4810 | ||
4811 | + obj-$(CONFIG_XEN) += nmi_64.o | |
4812 | time_64-$(CONFIG_XEN) += time_32.o | |
4813 | pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o | |
4814 | endif | |
4815 | ||
4816 | disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \ | |
4817 | smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o | |
4818 | -disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o | |
4819 | -%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) := | |
4820 | --- a/arch/x86/kernel/microcode-xen.c | |
4821 | +++ b/arch/x86/kernel/microcode-xen.c | |
4822 | @@ -167,7 +167,7 @@ static int request_microcode(void) | |
4823 | } | |
4824 | ||
4825 | op.cmd = XENPF_microcode_update; | |
4826 | - set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data); | |
4827 | + set_xen_guest_handle(op.u.microcode.data, firmware->data); | |
4828 | op.u.microcode.length = firmware->size; | |
4829 | error = HYPERVISOR_platform_op(&op); | |
4830 | ||
4831 | --- a/arch/x86/kernel/mpparse_32-xen.c | |
4832 | +++ b/arch/x86/kernel/mpparse_32-xen.c | |
4833 | @@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0; | |
4834 | /* Processor that is doing the boot up */ | |
4835 | unsigned int boot_cpu_physical_apicid = -1U; | |
4836 | /* Internal processor count */ | |
4837 | -unsigned int __cpuinitdata num_processors; | |
4838 | +unsigned int num_processors; | |
4839 | ||
4840 | /* Bitmask of physically existing CPUs */ | |
4841 | physid_mask_t phys_cpu_present_map; | |
4842 | @@ -265,7 +265,7 @@ static void __init MP_ioapic_info (struc | |
4843 | if (!(m->mpc_flags & MPC_APIC_USABLE)) | |
4844 | return; | |
4845 | ||
4846 | - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", | |
4847 | + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", | |
4848 | m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); | |
4849 | if (nr_ioapics >= MAX_IO_APICS) { | |
4850 | printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", | |
4851 | @@ -412,9 +412,9 @@ static int __init smp_read_mpc(struct mp | |
4852 | ||
4853 | mps_oem_check(mpc, oem, str); | |
4854 | ||
4855 | - printk("APIC at: 0x%lX\n",mpc->mpc_lapic); | |
4856 | + printk("APIC at: 0x%X\n", mpc->mpc_lapic); | |
4857 | ||
4858 | - /* | |
4859 | + /* | |
4860 | * Save the local APIC address (it might be non-default) -- but only | |
4861 | * if we're not using ACPI. | |
4862 | */ | |
4863 | @@ -728,7 +728,7 @@ static int __init smp_scan_config (unsig | |
4864 | unsigned long *bp = isa_bus_to_virt(base); | |
4865 | struct intel_mp_floating *mpf; | |
4866 | ||
4867 | - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); | |
4868 | + printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length); | |
4869 | if (sizeof(*mpf) != 16) | |
4870 | printk("Error: MPF size\n"); | |
4871 | ||
4872 | @@ -742,9 +742,10 @@ static int __init smp_scan_config (unsig | |
4873 | ||
4874 | smp_found_config = 1; | |
4875 | #ifndef CONFIG_XEN | |
4876 | - printk(KERN_INFO "found SMP MP-table at %08lx\n", | |
4877 | - virt_to_phys(mpf)); | |
4878 | - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); | |
4879 | + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", | |
4880 | + mpf, virt_to_phys(mpf)); | |
4881 | + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, | |
4882 | + BOOTMEM_DEFAULT); | |
4883 | if (mpf->mpf_physptr) { | |
4884 | /* | |
4885 | * We cannot access to MPC table to compute | |
4886 | @@ -759,11 +760,12 @@ static int __init smp_scan_config (unsig | |
4887 | unsigned long end = max_low_pfn * PAGE_SIZE; | |
4888 | if (mpf->mpf_physptr + size > end) | |
4889 | size = end - mpf->mpf_physptr; | |
4890 | - reserve_bootmem(mpf->mpf_physptr, size); | |
4891 | + reserve_bootmem(mpf->mpf_physptr, size, | |
4892 | + BOOTMEM_DEFAULT); | |
4893 | } | |
4894 | #else | |
4895 | - printk(KERN_INFO "found SMP MP-table at %08lx\n", | |
4896 | - ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base); | |
4897 | + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", | |
4898 | + mpf, ((void *)bp - isa_bus_to_virt(base)) + base); | |
4899 | #endif | |
4900 | ||
4901 | mpf_found = mpf; | |
4902 | @@ -940,14 +942,14 @@ void __init mp_register_ioapic(u8 id, u3 | |
4903 | */ | |
4904 | mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; | |
4905 | mp_ioapic_routing[idx].gsi_base = gsi_base; | |
4906 | - mp_ioapic_routing[idx].gsi_end = gsi_base + | |
4907 | + mp_ioapic_routing[idx].gsi_end = gsi_base + | |
4908 | io_apic_get_redir_entries(idx); | |
4909 | ||
4910 | - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " | |
4911 | - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, | |
4912 | - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, | |
4913 | - mp_ioapic_routing[idx].gsi_base, | |
4914 | - mp_ioapic_routing[idx].gsi_end); | |
4915 | + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " | |
4916 | + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, | |
4917 | + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, | |
4918 | + mp_ioapic_routing[idx].gsi_base, | |
4919 | + mp_ioapic_routing[idx].gsi_end); | |
4920 | } | |
4921 | ||
4922 | void __init | |
4923 | @@ -1063,15 +1065,16 @@ void __init mp_config_acpi_legacy_irqs ( | |
4924 | } | |
4925 | ||
4926 | #define MAX_GSI_NUM 4096 | |
4927 | +#define IRQ_COMPRESSION_START 64 | |
4928 | ||
4929 | int mp_register_gsi(u32 gsi, int triggering, int polarity) | |
4930 | { | |
4931 | int ioapic = -1; | |
4932 | int ioapic_pin = 0; | |
4933 | int idx, bit = 0; | |
4934 | - static int pci_irq = 16; | |
4935 | + static int pci_irq = IRQ_COMPRESSION_START; | |
4936 | /* | |
4937 | - * Mapping between Global System Interrups, which | |
4938 | + * Mapping between Global System Interrupts, which | |
4939 | * represent all possible interrupts, and IRQs | |
4940 | * assigned to actual devices. | |
4941 | */ | |
4942 | @@ -1108,12 +1111,16 @@ int mp_register_gsi(u32 gsi, int trigger | |
4943 | if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { | |
4944 | Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", | |
4945 | mp_ioapic_routing[ioapic].apic_id, ioapic_pin); | |
4946 | - return gsi_to_irq[gsi]; | |
4947 | + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); | |
4948 | } | |
4949 | ||
4950 | mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); | |
4951 | ||
4952 | - if (triggering == ACPI_LEVEL_SENSITIVE) { | |
4953 | + /* | |
4954 | + * For GSI >= 64, use IRQ compression | |
4955 | + */ | |
4956 | + if ((gsi >= IRQ_COMPRESSION_START) | |
4957 | + && (triggering == ACPI_LEVEL_SENSITIVE)) { | |
4958 | /* | |
4959 | * For PCI devices assign IRQs in order, avoiding gaps | |
4960 | * due to unused I/O APIC pins. | |
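
The compression policy added above is easier to see in isolation. A standalone toy model (illustrative only, not kernel code):

	#define MAX_GSI_NUM		4096
	#define IRQ_COMPRESSION_START	64

	static unsigned int gsi_to_irq[MAX_GSI_NUM];
	static unsigned int next_pci_irq = IRQ_COMPRESSION_START;

	/* GSIs below 64 keep an identity mapping; level-triggered
	 * (PCI-style) GSIs at or above 64 are packed onto the next
	 * free IRQ so sparse I/O APIC pins do not exhaust IRQ space. */
	static unsigned int map_gsi(unsigned int gsi, int level_triggered)
	{
		if (gsi < IRQ_COMPRESSION_START || !level_triggered)
			return gsi;
		if (!gsi_to_irq[gsi])
			gsi_to_irq[gsi] = next_pci_irq++;
		return gsi_to_irq[gsi];
	}
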
4961 | --- a/arch/x86/kernel/mpparse_64-xen.c | |
4962 | +++ b/arch/x86/kernel/mpparse_64-xen.c | |
4963 | @@ -60,14 +60,20 @@ unsigned int boot_cpu_id = -1U; | |
4964 | EXPORT_SYMBOL(boot_cpu_id); | |
4965 | ||
4966 | /* Internal processor count */ | |
4967 | -unsigned int num_processors __cpuinitdata = 0; | |
4968 | +unsigned int num_processors; | |
4969 | ||
4970 | unsigned disabled_cpus __cpuinitdata; | |
4971 | ||
4972 | /* Bitmask of physically existing CPUs */ | |
4973 | physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; | |
4974 | ||
4975 | -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | |
4976 | +#ifndef CONFIG_XEN | |
4977 | +u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata | |
4978 | + = { [0 ... NR_CPUS-1] = BAD_APICID }; | |
4979 | +void *x86_bios_cpu_apicid_early_ptr; | |
4980 | +#endif | |
4981 | +DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; | |
4982 | +EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); | |
4983 | ||
4984 | ||
4985 | /* | |
4986 | @@ -119,24 +125,22 @@ static void __cpuinit MP_processor_info( | |
4987 | physid_set(m->mpc_apicid, phys_cpu_present_map); | |
4988 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | |
4989 | /* | |
4990 | - * bios_cpu_apicid is required to have processors listed | |
4991 | + * x86_bios_cpu_apicid is required to have processors listed | |
4992 | * in same order as logical cpu numbers. Hence the first | |
4993 | * entry is BSP, and so on. | |
4994 | */ | |
4995 | cpu = 0; | |
4996 | } | |
4997 | - bios_cpu_apicid[cpu] = m->mpc_apicid; | |
4998 | - /* | |
4999 | - * We get called early in the the start_kernel initialization | |
5000 | - * process when the per_cpu data area is not yet setup, so we | |
5001 | - * use a static array that is removed after the per_cpu data | |
5002 | - * area is created. | |
5003 | - */ | |
5004 | - if (x86_cpu_to_apicid_ptr) { | |
5005 | - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr; | |
5006 | - x86_cpu_to_apicid[cpu] = m->mpc_apicid; | |
5007 | + /* are we being called early in kernel startup? */ | |
5008 | + if (x86_cpu_to_apicid_early_ptr) { | |
5009 | + u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; | |
5010 | + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; | |
5011 | + | |
5012 | + cpu_to_apicid[cpu] = m->mpc_apicid; | |
5013 | + bios_cpu_apicid[cpu] = m->mpc_apicid; | |
5014 | } else { | |
5015 | per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; | |
5016 | + per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid; | |
5017 | } | |
5018 | ||
5019 | cpu_set(cpu, cpu_possible_map); | |
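
The x86_*_early_ptr indirection above follows a common 2.6.25 boot pattern: the per-CPU areas do not exist yet while the MP table is parsed, so values are staged in an __initdata array, and the early pointer is cleared once the per-CPU data has been set up. In outline (an illustrative sketch, not the kernel's exact code):

	u16 staging[NR_CPUS] __initdata = { [0 ... NR_CPUS-1] = BAD_APICID };
	void *staging_early_ptr = staging;	/* NULLed after per-CPU setup */
	DEFINE_PER_CPU(u16, apicid) = BAD_APICID;

	static void record_apicid(int cpu, u16 id)
	{
		if (staging_early_ptr)		/* early boot: stage the value */
			((u16 *)staging_early_ptr)[cpu] = id;
		else				/* per-CPU areas are usable */
			per_cpu(apicid, cpu) = id;
	}
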
5020 | --- a/arch/x86/kernel/pci-dma-xen.c | |
5021 | +++ b/arch/x86/kernel/pci-dma-xen.c | |
5022 | @@ -434,3 +434,23 @@ dma_sync_single_for_device(struct device | |
5023 | swiotlb_sync_single_for_device(dev, dma_handle, size, direction); | |
5024 | } | |
5025 | EXPORT_SYMBOL(dma_sync_single_for_device); | |
5026 | + | |
5027 | +void | |
5028 | +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, | |
5029 | + enum dma_data_direction direction) | |
5030 | +{ | |
5031 | + if (swiotlb) | |
5032 | + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction); | |
5033 | + flush_write_buffers(); | |
5034 | +} | |
5035 | +EXPORT_SYMBOL(dma_sync_sg_for_cpu); | |
5036 | + | |
5037 | +void | |
5038 | +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, | |
5039 | + enum dma_data_direction direction) | |
5040 | +{ | |
5041 | + if (swiotlb) | |
5042 | + swiotlb_sync_sg_for_device(dev,sg,nelems,direction); | |
5043 | + flush_write_buffers(); | |
5044 | +} | |
5045 | +EXPORT_SYMBOL(dma_sync_sg_for_device); | |
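
These two exports complete the streaming-DMA sync API for scatterlists in the Xen build. The usual caller pattern (a sketch; dev, sg and nents stand for a driver's real mapping state) is to sync toward the CPU before inspecting received data and back toward the device before the next transfer:

	dma_sync_sg_for_cpu(dev, sg, nents, DMA_FROM_DEVICE);
	/* ... the CPU may now safely read the buffers ... */
	dma_sync_sg_for_device(dev, sg, nents, DMA_FROM_DEVICE);
	/* ... the device may now DMA into the buffers again ... */
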
5046 | --- a/arch/x86/kernel/process_32-xen.c | |
5047 | +++ b/arch/x86/kernel/process_32-xen.c | |
5048 | @@ -23,7 +23,6 @@ | |
5049 | #include <linux/slab.h> | |
5050 | #include <linux/vmalloc.h> | |
5051 | #include <linux/user.h> | |
5052 | -#include <linux/a.out.h> | |
5053 | #include <linux/interrupt.h> | |
5054 | #include <linux/utsname.h> | |
5055 | #include <linux/delay.h> | |
5056 | @@ -59,8 +58,10 @@ | |
5057 | ||
5058 | #include <asm/tlbflush.h> | |
5059 | #include <asm/cpu.h> | |
5060 | +#include <asm/kdebug.h> | |
5061 | ||
5062 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); | |
5063 | +asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); | |
5064 | ||
5065 | static int hlt_counter; | |
5066 | ||
5067 | @@ -78,7 +79,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); | |
5068 | */ | |
5069 | unsigned long thread_saved_pc(struct task_struct *tsk) | |
5070 | { | |
5071 | - return ((unsigned long *)tsk->thread.esp)[3]; | |
5072 | + return ((unsigned long *)tsk->thread.sp)[3]; | |
5073 | } | |
5074 | ||
5075 | /* | |
5076 | @@ -86,7 +87,6 @@ unsigned long thread_saved_pc(struct tas | |
5077 | */ | |
5078 | void (*pm_idle)(void); | |
5079 | EXPORT_SYMBOL(pm_idle); | |
5080 | -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | |
5081 | ||
5082 | void disable_hlt(void) | |
5083 | { | |
5084 | @@ -107,7 +107,7 @@ EXPORT_SYMBOL(enable_hlt); | |
5085 | * to poll the ->work.need_resched flag instead of waiting for the | |
5086 | * cross-CPU IPI to arrive. Use this option with caution. | |
5087 | */ | |
5088 | -static void poll_idle (void) | |
5089 | +static void poll_idle(void) | |
5090 | { | |
5091 | cpu_relax(); | |
5092 | } | |
5093 | @@ -122,10 +122,19 @@ static void xen_idle(void) | |
5094 | smp_mb(); | |
5095 | ||
5096 | local_irq_disable(); | |
5097 | - if (!need_resched()) | |
5098 | + if (!need_resched()) { | |
5099 | + ktime_t t0, t1; | |
5100 | + u64 t0n, t1n; | |
5101 | + | |
5102 | + t0 = ktime_get(); | |
5103 | + t0n = ktime_to_ns(t0); | |
5104 | safe_halt(); /* enables interrupts racelessly */ | |
5105 | - else | |
5106 | - local_irq_enable(); | |
5107 | + local_irq_disable(); | |
5108 | + t1 = ktime_get(); | |
5109 | + t1n = ktime_to_ns(t1); | |
5110 | + sched_clock_idle_wakeup_event(t1n - t0n); | |
5111 | + } | |
5112 | + local_irq_enable(); | |
5113 | current_thread_info()->status |= TS_POLLING; | |
5114 | } | |
5115 | #ifdef CONFIG_APM_MODULE | |
5116 | @@ -168,13 +177,13 @@ void cpu_idle(void) | |
5117 | while (!need_resched()) { | |
5118 | void (*idle)(void); | |
5119 | ||
5120 | - if (__get_cpu_var(cpu_idle_state)) | |
5121 | - __get_cpu_var(cpu_idle_state) = 0; | |
5122 | - | |
5123 | check_pgt_cache(); | |
5124 | rmb(); | |
5125 | idle = xen_idle; /* no alternatives */ | |
5126 | ||
5127 | + if (rcu_pending(cpu)) | |
5128 | + rcu_check_callbacks(cpu, 0); | |
5129 | + | |
5130 | if (cpu_is_offline(cpu)) | |
5131 | play_dead(); | |
5132 | ||
5133 | @@ -192,40 +201,19 @@ static void do_nothing(void *unused) | |
5134 | { | |
5135 | } | |
5136 | ||
5137 | +/* | |
5138 | + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of | |
5139 | + * pm_idle and update to new pm_idle value. Required while changing pm_idle | |
5140 | + * handler on SMP systems. | |
5141 | + * | |
5142 | + * Caller must have changed pm_idle to the new value before the call. Old | |
5143 | + * pm_idle value will not be used by any CPU after the return of this function. | |
5144 | + */ | |
5145 | void cpu_idle_wait(void) | |
5146 | { | |
5147 | - unsigned int cpu, this_cpu = get_cpu(); | |
5148 | - cpumask_t map, tmp = current->cpus_allowed; | |
5149 | - | |
5150 | - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | |
5151 | - put_cpu(); | |
5152 | - | |
5153 | - cpus_clear(map); | |
5154 | - for_each_online_cpu(cpu) { | |
5155 | - per_cpu(cpu_idle_state, cpu) = 1; | |
5156 | - cpu_set(cpu, map); | |
5157 | - } | |
5158 | - | |
5159 | - __get_cpu_var(cpu_idle_state) = 0; | |
5160 | - | |
5161 | - wmb(); | |
5162 | - do { | |
5163 | - ssleep(1); | |
5164 | - for_each_online_cpu(cpu) { | |
5165 | - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) | |
5166 | - cpu_clear(cpu, map); | |
5167 | - } | |
5168 | - cpus_and(map, map, cpu_online_map); | |
5169 | - /* | |
5170 | - * We waited 1 sec, if a CPU still did not call idle | |
5171 | - * it may be because it is in idle and not waking up | |
5172 | - * because it has nothing to do. | |
5173 | - * Give all the remaining CPUS a kick. | |
5174 | - */ | |
5175 | - smp_call_function_mask(map, do_nothing, 0, 0); | |
5176 | - } while (!cpus_empty(map)); | |
5177 | - | |
5178 | - set_cpus_allowed(current, tmp); | |
5179 | + smp_mb(); | |
5180 | + /* kick all the CPUs so that they exit out of pm_idle */ | |
5181 | + smp_call_function(do_nothing, NULL, 0, 1); | |
5182 | } | |
5183 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | |
5184 | ||
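
The rewritten cpu_idle_wait() pushes all ordering onto its caller; per the new comment block, the intended usage is simply (handler name illustrative):

	pm_idle = my_idle_handler;	/* publish the new handler first */
	cpu_idle_wait();		/* then kick every CPU out of the old one */
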
5185 | @@ -251,15 +239,15 @@ void __show_registers(struct pt_regs *re | |
5186 | { | |
5187 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; | |
5188 | unsigned long d0, d1, d2, d3, d6, d7; | |
5189 | - unsigned long esp; | |
5190 | + unsigned long sp; | |
5191 | unsigned short ss, gs; | |
5192 | ||
5193 | if (user_mode_vm(regs)) { | |
5194 | - esp = regs->esp; | |
5195 | - ss = regs->xss & 0xffff; | |
5196 | + sp = regs->sp; | |
5197 | + ss = regs->ss & 0xffff; | |
5198 | savesegment(gs, gs); | |
5199 | } else { | |
5200 | - esp = (unsigned long) (&regs->esp); | |
5201 | + sp = (unsigned long) (&regs->sp); | |
5202 | savesegment(ss, ss); | |
5203 | savesegment(gs, gs); | |
5204 | } | |
5205 | @@ -272,17 +260,17 @@ void __show_registers(struct pt_regs *re | |
5206 | init_utsname()->version); | |
5207 | ||
5208 | printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", | |
5209 | - 0xffff & regs->xcs, regs->eip, regs->eflags, | |
5210 | + 0xffff & regs->cs, regs->ip, regs->flags, | |
5211 | smp_processor_id()); | |
5212 | - print_symbol("EIP is at %s\n", regs->eip); | |
5213 | + print_symbol("EIP is at %s\n", regs->ip); | |
5214 | ||
5215 | printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", | |
5216 | - regs->eax, regs->ebx, regs->ecx, regs->edx); | |
5217 | + regs->ax, regs->bx, regs->cx, regs->dx); | |
5218 | printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", | |
5219 | - regs->esi, regs->edi, regs->ebp, esp); | |
5220 | + regs->si, regs->di, regs->bp, sp); | |
5221 | printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", | |
5222 | - regs->xds & 0xffff, regs->xes & 0xffff, | |
5223 | - regs->xfs & 0xffff, gs, ss); | |
5224 | + regs->ds & 0xffff, regs->es & 0xffff, | |
5225 | + regs->fs & 0xffff, gs, ss); | |
5226 | ||
5227 | if (!all) | |
5228 | return; | |
5229 | @@ -310,12 +298,12 @@ void __show_registers(struct pt_regs *re | |
5230 | void show_regs(struct pt_regs *regs) | |
5231 | { | |
5232 | __show_registers(regs, 1); | |
5233 | - show_trace(NULL, regs, &regs->esp); | |
5234 | + show_trace(NULL, regs, &regs->sp, regs->bp); | |
5235 | } | |
5236 | ||
5237 | /* | |
5238 | - * This gets run with %ebx containing the | |
5239 | - * function to call, and %edx containing | |
5240 | + * This gets run with %bx containing the | |
5241 | + * function to call, and %dx containing | |
5242 | * the "args". | |
5243 | */ | |
5244 | extern void kernel_thread_helper(void); | |
5245 | @@ -329,16 +317,16 @@ int kernel_thread(int (*fn)(void *), voi | |
5246 | ||
5247 | memset(&regs, 0, sizeof(regs)); | |
5248 | ||
5249 | - regs.ebx = (unsigned long) fn; | |
5250 | - regs.edx = (unsigned long) arg; | |
5251 | + regs.bx = (unsigned long) fn; | |
5252 | + regs.dx = (unsigned long) arg; | |
5253 | ||
5254 | - regs.xds = __USER_DS; | |
5255 | - regs.xes = __USER_DS; | |
5256 | - regs.xfs = __KERNEL_PERCPU; | |
5257 | - regs.orig_eax = -1; | |
5258 | - regs.eip = (unsigned long) kernel_thread_helper; | |
5259 | - regs.xcs = __KERNEL_CS | get_kernel_rpl(); | |
5260 | - regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; | |
5261 | + regs.ds = __USER_DS; | |
5262 | + regs.es = __USER_DS; | |
5263 | + regs.fs = __KERNEL_PERCPU; | |
5264 | + regs.orig_ax = -1; | |
5265 | + regs.ip = (unsigned long) kernel_thread_helper; | |
5266 | + regs.cs = __KERNEL_CS | get_kernel_rpl(); | |
5267 | + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; | |
5268 | ||
5269 | /* Ok, create the new process.. */ | |
5270 | return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); | |
5271 | @@ -368,7 +356,12 @@ void flush_thread(void) | |
5272 | { | |
5273 | struct task_struct *tsk = current; | |
5274 | ||
5275 | - memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); | |
5276 | + tsk->thread.debugreg0 = 0; | |
5277 | + tsk->thread.debugreg1 = 0; | |
5278 | + tsk->thread.debugreg2 = 0; | |
5279 | + tsk->thread.debugreg3 = 0; | |
5280 | + tsk->thread.debugreg6 = 0; | |
5281 | + tsk->thread.debugreg7 = 0; | |
5282 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | |
5283 | clear_tsk_thread_flag(tsk, TIF_DEBUG); | |
5284 | /* | |
5285 | @@ -393,7 +386,7 @@ void prepare_to_copy(struct task_struct | |
5286 | unlazy_fpu(tsk); | |
5287 | } | |
5288 | ||
5289 | -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, | |
5290 | +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, | |
5291 | unsigned long unused, | |
5292 | struct task_struct * p, struct pt_regs * regs) | |
5293 | { | |
5294 | @@ -403,17 +396,19 @@ int copy_thread(int nr, unsigned long cl | |
5295 | ||
5296 | childregs = task_pt_regs(p); | |
5297 | *childregs = *regs; | |
5298 | - childregs->eax = 0; | |
5299 | - childregs->esp = esp; | |
5300 | + childregs->ax = 0; | |
5301 | + childregs->sp = sp; | |
5302 | ||
5303 | - p->thread.esp = (unsigned long) childregs; | |
5304 | - p->thread.esp0 = (unsigned long) (childregs+1); | |
5305 | + p->thread.sp = (unsigned long) childregs; | |
5306 | + p->thread.sp0 = (unsigned long) (childregs+1); | |
5307 | ||
5308 | - p->thread.eip = (unsigned long) ret_from_fork; | |
5309 | + p->thread.ip = (unsigned long) ret_from_fork; | |
5310 | ||
5311 | - savesegment(gs,p->thread.gs); | |
5312 | + savesegment(gs, p->thread.gs); | |
5313 | ||
5314 | tsk = current; | |
5315 | + if (test_tsk_thread_flag(tsk, TIF_CSTAR)) | |
5316 | + p->thread.ip = (unsigned long) cstar_ret_from_fork; | |
5317 | if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { | |
5318 | p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, | |
5319 | IO_BITMAP_BYTES, GFP_KERNEL); | |
5320 | @@ -424,34 +419,17 @@ int copy_thread(int nr, unsigned long cl | |
5321 | set_tsk_thread_flag(p, TIF_IO_BITMAP); | |
5322 | } | |
5323 | ||
5324 | + err = 0; | |
5325 | + | |
5326 | /* | |
5327 | * Set a new TLS for the child thread? | |
5328 | */ | |
5329 | - if (clone_flags & CLONE_SETTLS) { | |
5330 | - struct desc_struct *desc; | |
5331 | - struct user_desc info; | |
5332 | - int idx; | |
5333 | - | |
5334 | - err = -EFAULT; | |
5335 | - if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) | |
5336 | - goto out; | |
5337 | - err = -EINVAL; | |
5338 | - if (LDT_empty(&info)) | |
5339 | - goto out; | |
5340 | - | |
5341 | - idx = info.entry_number; | |
5342 | - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | |
5343 | - goto out; | |
5344 | - | |
5345 | - desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; | |
5346 | - desc->a = LDT_entry_a(&info); | |
5347 | - desc->b = LDT_entry_b(&info); | |
5348 | - } | |
5349 | + if (clone_flags & CLONE_SETTLS) | |
5350 | + err = do_set_thread_area(p, -1, | |
5351 | + (struct user_desc __user *)childregs->si, 0); | |
5352 | ||
5353 | p->thread.iopl = current->thread.iopl; | |
5354 | ||
5355 | - err = 0; | |
5356 | - out: | |
5357 | if (err && p->thread.io_bitmap_ptr) { | |
5358 | kfree(p->thread.io_bitmap_ptr); | |
5359 | p->thread.io_bitmap_max = 0; | |
5360 | @@ -459,67 +437,8 @@ int copy_thread(int nr, unsigned long cl | |
5361 | return err; | |
5362 | } | |
5363 | ||
5364 | -/* | |
5365 | - * fill in the user structure for a core dump.. | |
5366 | - */ | |
5367 | -void dump_thread(struct pt_regs * regs, struct user * dump) | |
5368 | -{ | |
5369 | - int i; | |
5370 | - | |
5371 | -/* changed the size calculations - should hopefully work better. lbt */ | |
5372 | - dump->magic = CMAGIC; | |
5373 | - dump->start_code = 0; | |
5374 | - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); | |
5375 | - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; | |
5376 | - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; | |
5377 | - dump->u_dsize -= dump->u_tsize; | |
5378 | - dump->u_ssize = 0; | |
5379 | - for (i = 0; i < 8; i++) | |
5380 | - dump->u_debugreg[i] = current->thread.debugreg[i]; | |
5381 | - | |
5382 | - if (dump->start_stack < TASK_SIZE) | |
5383 | - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; | |
5384 | - | |
5385 | - dump->regs.ebx = regs->ebx; | |
5386 | - dump->regs.ecx = regs->ecx; | |
5387 | - dump->regs.edx = regs->edx; | |
5388 | - dump->regs.esi = regs->esi; | |
5389 | - dump->regs.edi = regs->edi; | |
5390 | - dump->regs.ebp = regs->ebp; | |
5391 | - dump->regs.eax = regs->eax; | |
5392 | - dump->regs.ds = regs->xds; | |
5393 | - dump->regs.es = regs->xes; | |
5394 | - dump->regs.fs = regs->xfs; | |
5395 | - savesegment(gs,dump->regs.gs); | |
5396 | - dump->regs.orig_eax = regs->orig_eax; | |
5397 | - dump->regs.eip = regs->eip; | |
5398 | - dump->regs.cs = regs->xcs; | |
5399 | - dump->regs.eflags = regs->eflags; | |
5400 | - dump->regs.esp = regs->esp; | |
5401 | - dump->regs.ss = regs->xss; | |
5402 | - | |
5403 | - dump->u_fpvalid = dump_fpu (regs, &dump->i387); | |
5404 | -} | |
5405 | -EXPORT_SYMBOL(dump_thread); | |
5406 | - | |
5407 | -/* | |
5408 | - * Capture the user space registers if the task is not running (in user space) | |
5409 | - */ | |
5410 | -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) | |
5411 | -{ | |
5412 | - struct pt_regs ptregs = *task_pt_regs(tsk); | |
5413 | - ptregs.xcs &= 0xffff; | |
5414 | - ptregs.xds &= 0xffff; | |
5415 | - ptregs.xes &= 0xffff; | |
5416 | - ptregs.xss &= 0xffff; | |
5417 | - | |
5418 | - elf_core_copy_regs(regs, &ptregs); | |
5419 | - | |
5420 | - return 1; | |
5421 | -} | |
5422 | - | |
5423 | #ifdef CONFIG_SECCOMP | |
5424 | -void hard_disable_TSC(void) | |
5425 | +static void hard_disable_TSC(void) | |
5426 | { | |
5427 | write_cr4(read_cr4() | X86_CR4_TSD); | |
5428 | } | |
5429 | @@ -534,7 +453,7 @@ void disable_TSC(void) | |
5430 | hard_disable_TSC(); | |
5431 | preempt_enable(); | |
5432 | } | |
5433 | -void hard_enable_TSC(void) | |
5434 | +static void hard_enable_TSC(void) | |
5435 | { | |
5436 | write_cr4(read_cr4() & ~X86_CR4_TSD); | |
5437 | } | |
5438 | @@ -543,18 +462,32 @@ void hard_enable_TSC(void) | |
5439 | static noinline void | |
5440 | __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) | |
5441 | { | |
5442 | - struct thread_struct *next; | |
5443 | + struct thread_struct *prev, *next; | |
5444 | + unsigned long debugctl; | |
5445 | ||
5446 | + prev = &prev_p->thread; | |
5447 | next = &next_p->thread; | |
5448 | ||
5449 | + debugctl = prev->debugctlmsr; | |
5450 | + if (next->ds_area_msr != prev->ds_area_msr) { | |
5451 | + /* we clear debugctl to make sure DS | |
5452 | + * is not in use when we change it */ | |
5453 | + debugctl = 0; | |
5454 | + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); | |
5455 | + wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); | |
5456 | + } | |
5457 | + | |
5458 | + if (next->debugctlmsr != debugctl) | |
5459 | + wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0); | |
5460 | + | |
5461 | if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { | |
5462 | - set_debugreg(next->debugreg[0], 0); | |
5463 | - set_debugreg(next->debugreg[1], 1); | |
5464 | - set_debugreg(next->debugreg[2], 2); | |
5465 | - set_debugreg(next->debugreg[3], 3); | |
5466 | + set_debugreg(next->debugreg0, 0); | |
5467 | + set_debugreg(next->debugreg1, 1); | |
5468 | + set_debugreg(next->debugreg2, 2); | |
5469 | + set_debugreg(next->debugreg3, 3); | |
5470 | /* no 4 and 5 */ | |
5471 | - set_debugreg(next->debugreg[6], 6); | |
5472 | - set_debugreg(next->debugreg[7], 7); | |
5473 | + set_debugreg(next->debugreg6, 6); | |
5474 | + set_debugreg(next->debugreg7, 7); | |
5475 | } | |
5476 | ||
5477 | #ifdef CONFIG_SECCOMP | |
5478 | @@ -567,6 +500,14 @@ __switch_to_xtra(struct task_struct *pre | |
5479 | hard_enable_TSC(); | |
5480 | } | |
5481 | #endif | |
5482 | + | |
5483 | +#ifdef X86_BTS | |
5484 | + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) | |
5485 | + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); | |
5486 | + | |
5487 | + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) | |
5488 | + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); | |
5489 | +#endif | |
5490 | } | |
5491 | ||
5492 | /* | |
5493 | @@ -592,11 +533,11 @@ __switch_to_xtra(struct task_struct *pre | |
5494 | * More important, however, is the fact that this allows us much | |
5495 | * more flexibility. | |
5496 | * | |
5497 | - * The return value (in %eax) will be the "prev" task after | |
5498 | + * The return value (in %ax) will be the "prev" task after | |
5499 | * the task-switch, and shows up in ret_from_fork in entry.S, | |
5500 | * for example. | |
5501 | */ | |
5502 | -struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |
5503 | +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |
5504 | { | |
5505 | struct thread_struct *prev = &prev_p->thread, | |
5506 | *next = &next_p->thread; | |
5507 | @@ -632,12 +573,12 @@ struct task_struct fastcall * __switch_t | |
5508 | #endif | |
5509 | ||
5510 | /* | |
5511 | - * Reload esp0. | |
5512 | - * This is load_esp0(tss, next) with a multicall. | |
5513 | + * Reload sp0. | |
5514 | + * This is load_sp0(tss, next) with a multicall. | |
5515 | */ | |
5516 | mcl->op = __HYPERVISOR_stack_switch; | |
5517 | mcl->args[0] = __KERNEL_DS; | |
5518 | - mcl->args[1] = next->esp0; | |
5519 | + mcl->args[1] = next->sp0; | |
5520 | mcl++; | |
5521 | ||
5522 | /* | |
5523 | @@ -734,7 +675,7 @@ struct task_struct fastcall * __switch_t | |
5524 | ||
5525 | asmlinkage int sys_fork(struct pt_regs regs) | |
5526 | { | |
5527 | - return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); | |
5528 | + return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL); | |
5529 | } | |
5530 | ||
5531 | asmlinkage int sys_clone(struct pt_regs regs) | |
5532 | @@ -743,12 +684,12 @@ asmlinkage int sys_clone(struct pt_regs | |
5533 | unsigned long newsp; | |
5534 | int __user *parent_tidptr, *child_tidptr; | |
5535 | ||
5536 | - clone_flags = regs.ebx; | |
5537 | - newsp = regs.ecx; | |
5538 | - parent_tidptr = (int __user *)regs.edx; | |
5539 | - child_tidptr = (int __user *)regs.edi; | |
5540 | + clone_flags = regs.bx; | |
5541 | + newsp = regs.cx; | |
5542 | + parent_tidptr = (int __user *)regs.dx; | |
5543 | + child_tidptr = (int __user *)regs.di; | |
5544 | if (!newsp) | |
5545 | - newsp = regs.esp; | |
5546 | + newsp = regs.sp; | |
5547 | return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); | |
5548 | } | |
5549 | ||
5550 | @@ -764,7 +705,7 @@ asmlinkage int sys_clone(struct pt_regs | |
5551 | */ | |
5552 | asmlinkage int sys_vfork(struct pt_regs regs) | |
5553 | { | |
5554 | - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); | |
5555 | + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL); | |
5556 | } | |
5557 | ||
5558 | /* | |
5559 | @@ -775,18 +716,15 @@ asmlinkage int sys_execve(struct pt_regs | |
5560 | int error; | |
5561 | char * filename; | |
5562 | ||
5563 | - filename = getname((char __user *) regs.ebx); | |
5564 | + filename = getname((char __user *) regs.bx); | |
5565 | error = PTR_ERR(filename); | |
5566 | if (IS_ERR(filename)) | |
5567 | goto out; | |
5568 | error = do_execve(filename, | |
5569 | - (char __user * __user *) regs.ecx, | |
5570 | - (char __user * __user *) regs.edx, | |
5571 | + (char __user * __user *) regs.cx, | |
5572 | + (char __user * __user *) regs.dx, | |
5573 | &regs); | |
5574 | if (error == 0) { | |
5575 | - task_lock(current); | |
5576 | - current->ptrace &= ~PT_DTRACE; | |
5577 | - task_unlock(current); | |
5578 | /* Make sure we don't return using sysenter.. */ | |
5579 | set_thread_flag(TIF_IRET); | |
5580 | } | |
5581 | @@ -800,145 +738,37 @@ out: | |
5582 | ||
5583 | unsigned long get_wchan(struct task_struct *p) | |
5584 | { | |
5585 | - unsigned long ebp, esp, eip; | |
5586 | + unsigned long bp, sp, ip; | |
5587 | unsigned long stack_page; | |
5588 | int count = 0; | |
5589 | if (!p || p == current || p->state == TASK_RUNNING) | |
5590 | return 0; | |
5591 | stack_page = (unsigned long)task_stack_page(p); | |
5592 | - esp = p->thread.esp; | |
5593 | - if (!stack_page || esp < stack_page || esp > top_esp+stack_page) | |
5594 | + sp = p->thread.sp; | |
5595 | + if (!stack_page || sp < stack_page || sp > top_esp+stack_page) | |
5596 | return 0; | |
5597 | - /* include/asm-i386/system.h:switch_to() pushes ebp last. */ | |
5598 | - ebp = *(unsigned long *) esp; | |
5599 | + /* include/asm-i386/system.h:switch_to() pushes bp last. */ | |
5600 | + bp = *(unsigned long *) sp; | |
5601 | do { | |
5602 | - if (ebp < stack_page || ebp > top_ebp+stack_page) | |
5603 | + if (bp < stack_page || bp > top_ebp+stack_page) | |
5604 | return 0; | |
5605 | - eip = *(unsigned long *) (ebp+4); | |
5606 | - if (!in_sched_functions(eip)) | |
5607 | - return eip; | |
5608 | - ebp = *(unsigned long *) ebp; | |
5609 | + ip = *(unsigned long *) (bp+4); | |
5610 | + if (!in_sched_functions(ip)) | |
5611 | + return ip; | |
5612 | + bp = *(unsigned long *) bp; | |
5613 | } while (count++ < 16); | |
5614 | return 0; | |
5615 | } | |
5616 | ||
5617 | -/* | |
5618 | - * sys_alloc_thread_area: get a yet unused TLS descriptor index. | |
5619 | - */ | |
5620 | -static int get_free_idx(void) | |
5621 | -{ | |
5622 | - struct thread_struct *t = &current->thread; | |
5623 | - int idx; | |
5624 | - | |
5625 | - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) | |
5626 | - if (desc_empty(t->tls_array + idx)) | |
5627 | - return idx + GDT_ENTRY_TLS_MIN; | |
5628 | - return -ESRCH; | |
5629 | -} | |
5630 | - | |
5631 | -/* | |
5632 | - * Set a given TLS descriptor: | |
5633 | - */ | |
5634 | -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) | |
5635 | -{ | |
5636 | - struct thread_struct *t = &current->thread; | |
5637 | - struct user_desc info; | |
5638 | - struct desc_struct *desc; | |
5639 | - int cpu, idx; | |
5640 | - | |
5641 | - if (copy_from_user(&info, u_info, sizeof(info))) | |
5642 | - return -EFAULT; | |
5643 | - idx = info.entry_number; | |
5644 | - | |
5645 | - /* | |
5646 | - * index -1 means the kernel should try to find and | |
5647 | - * allocate an empty descriptor: | |
5648 | - */ | |
5649 | - if (idx == -1) { | |
5650 | - idx = get_free_idx(); | |
5651 | - if (idx < 0) | |
5652 | - return idx; | |
5653 | - if (put_user(idx, &u_info->entry_number)) | |
5654 | - return -EFAULT; | |
5655 | - } | |
5656 | - | |
5657 | - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | |
5658 | - return -EINVAL; | |
5659 | - | |
5660 | - desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; | |
5661 | - | |
5662 | - /* | |
5663 | - * We must not get preempted while modifying the TLS. | |
5664 | - */ | |
5665 | - cpu = get_cpu(); | |
5666 | - | |
5667 | - if (LDT_empty(&info)) { | |
5668 | - desc->a = 0; | |
5669 | - desc->b = 0; | |
5670 | - } else { | |
5671 | - desc->a = LDT_entry_a(&info); | |
5672 | - desc->b = LDT_entry_b(&info); | |
5673 | - } | |
5674 | - load_TLS(t, cpu); | |
5675 | - | |
5676 | - put_cpu(); | |
5677 | - | |
5678 | - return 0; | |
5679 | -} | |
5680 | - | |
5681 | -/* | |
5682 | - * Get the current Thread-Local Storage area: | |
5683 | - */ | |
5684 | - | |
5685 | -#define GET_BASE(desc) ( \ | |
5686 | - (((desc)->a >> 16) & 0x0000ffff) | \ | |
5687 | - (((desc)->b << 16) & 0x00ff0000) | \ | |
5688 | - ( (desc)->b & 0xff000000) ) | |
5689 | - | |
5690 | -#define GET_LIMIT(desc) ( \ | |
5691 | - ((desc)->a & 0x0ffff) | \ | |
5692 | - ((desc)->b & 0xf0000) ) | |
5693 | - | |
5694 | -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) | |
5695 | -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) | |
5696 | -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) | |
5697 | -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) | |
5698 | -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) | |
5699 | -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) | |
5700 | - | |
5701 | -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) | |
5702 | -{ | |
5703 | - struct user_desc info; | |
5704 | - struct desc_struct *desc; | |
5705 | - int idx; | |
5706 | - | |
5707 | - if (get_user(idx, &u_info->entry_number)) | |
5708 | - return -EFAULT; | |
5709 | - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | |
5710 | - return -EINVAL; | |
5711 | - | |
5712 | - memset(&info, 0, sizeof(info)); | |
5713 | - | |
5714 | - desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; | |
5715 | - | |
5716 | - info.entry_number = idx; | |
5717 | - info.base_addr = GET_BASE(desc); | |
5718 | - info.limit = GET_LIMIT(desc); | |
5719 | - info.seg_32bit = GET_32BIT(desc); | |
5720 | - info.contents = GET_CONTENTS(desc); | |
5721 | - info.read_exec_only = !GET_WRITABLE(desc); | |
5722 | - info.limit_in_pages = GET_LIMIT_PAGES(desc); | |
5723 | - info.seg_not_present = !GET_PRESENT(desc); | |
5724 | - info.useable = GET_USEABLE(desc); | |
5725 | - | |
5726 | - if (copy_to_user(u_info, &info, sizeof(info))) | |
5727 | - return -EFAULT; | |
5728 | - return 0; | |
5729 | -} | |
5730 | - | |
5731 | unsigned long arch_align_stack(unsigned long sp) | |
5732 | { | |
5733 | if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) | |
5734 | sp -= get_random_int() % 8192; | |
5735 | return sp & ~0xf; | |
5736 | } | |
5737 | + | |
5738 | +unsigned long arch_randomize_brk(struct mm_struct *mm) | |
5739 | +{ | |
5740 | + unsigned long range_end = mm->brk + 0x02000000; | |
5741 | + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; | |
5742 | +} | |
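
The new arch_randomize_brk() relies on two details worth spelling out: randomize_range() returns 0 when it cannot produce an address inside the window, and the GNU "?:" form reuses the tested value. Written out longhand it is equivalent to:

	unsigned long r = randomize_range(mm->brk, mm->brk + 0x02000000, 0);
	return r ? r : mm->brk;		/* fall back to the unrandomized brk */
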
5743 | --- a/arch/x86/kernel/process_64-xen.c | |
5744 | +++ b/arch/x86/kernel/process_64-xen.c | |
5745 | @@ -3,7 +3,7 @@ | |
5746 | * | |
5747 | * Pentium III FXSR, SSE support | |
5748 | * Gareth Hughes <gareth@valinux.com>, May 2000 | |
5749 | - * | |
5750 | + * | |
5751 | * X86-64 port | |
5752 | * Andi Kleen. | |
5753 | * | |
5754 | @@ -22,19 +22,18 @@ | |
5755 | #include <linux/cpu.h> | |
5756 | #include <linux/errno.h> | |
5757 | #include <linux/sched.h> | |
5758 | +#include <linux/fs.h> | |
5759 | #include <linux/kernel.h> | |
5760 | #include <linux/mm.h> | |
5761 | -#include <linux/fs.h> | |
5762 | #include <linux/elfcore.h> | |
5763 | #include <linux/smp.h> | |
5764 | #include <linux/slab.h> | |
5765 | #include <linux/user.h> | |
5766 | -#include <linux/module.h> | |
5767 | -#include <linux/a.out.h> | |
5768 | #include <linux/interrupt.h> | |
5769 | +#include <linux/utsname.h> | |
5770 | #include <linux/delay.h> | |
5771 | +#include <linux/module.h> | |
5772 | #include <linux/ptrace.h> | |
5773 | -#include <linux/utsname.h> | |
5774 | #include <linux/random.h> | |
5775 | #include <linux/notifier.h> | |
5776 | #include <linux/kprobes.h> | |
5777 | @@ -73,7 +72,6 @@ EXPORT_SYMBOL(boot_option_idle_override) | |
5778 | */ | |
5779 | void (*pm_idle)(void); | |
5780 | EXPORT_SYMBOL(pm_idle); | |
5781 | -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | |
5782 | ||
5783 | static ATOMIC_NOTIFIER_HEAD(idle_notifier); | |
5784 | ||
5785 | @@ -81,13 +79,6 @@ void idle_notifier_register(struct notif | |
5786 | { | |
5787 | atomic_notifier_chain_register(&idle_notifier, n); | |
5788 | } | |
5789 | -EXPORT_SYMBOL_GPL(idle_notifier_register); | |
5790 | - | |
5791 | -void idle_notifier_unregister(struct notifier_block *n) | |
5792 | -{ | |
5793 | - atomic_notifier_chain_unregister(&idle_notifier, n); | |
5794 | -} | |
5795 | -EXPORT_SYMBOL(idle_notifier_unregister); | |
5796 | ||
5797 | void enter_idle(void) | |
5798 | { | |
5799 | @@ -116,7 +107,7 @@ void exit_idle(void) | |
5800 | * to poll the ->need_resched flag instead of waiting for the | |
5801 | * cross-CPU IPI to arrive. Use this option with caution. | |
5802 | */ | |
5803 | -static void poll_idle (void) | |
5804 | +static void poll_idle(void) | |
5805 | { | |
5806 | local_irq_enable(); | |
5807 | cpu_relax(); | |
5808 | @@ -131,10 +122,19 @@ static void xen_idle(void) | |
5809 | */ | |
5810 | smp_mb(); | |
5811 | local_irq_disable(); | |
5812 | - if (!need_resched()) | |
5813 | - safe_halt(); | |
5814 | - else | |
5815 | - local_irq_enable(); | |
5816 | + if (!need_resched()) { | |
5817 | + ktime_t t0, t1; | |
5818 | + u64 t0n, t1n; | |
5819 | + | |
5820 | + t0 = ktime_get(); | |
5821 | + t0n = ktime_to_ns(t0); | |
5822 | + safe_halt(); /* enables interrupts racelessly */ | |
5823 | + local_irq_disable(); | |
5824 | + t1 = ktime_get(); | |
5825 | + t1n = ktime_to_ns(t1); | |
5826 | + sched_clock_idle_wakeup_event(t1n - t0n); | |
5827 | + } | |
5828 | + local_irq_enable(); | |
5829 | current_thread_info()->status |= TS_POLLING; | |
5830 | } | |
5831 | ||
5832 | @@ -161,19 +161,15 @@ static inline void play_dead(void) | |
5833 | * low exit latency (ie sit in a loop waiting for | |
5834 | * somebody to say that they'd like to reschedule) | |
5835 | */ | |
5836 | -void cpu_idle (void) | |
5837 | +void cpu_idle(void) | |
5838 | { | |
5839 | current_thread_info()->status |= TS_POLLING; | |
5840 | /* endless idle loop with no priority at all */ | |
5841 | while (1) { | |
5842 | + tick_nohz_stop_sched_tick(); | |
5843 | while (!need_resched()) { | |
5844 | void (*idle)(void); | |
5845 | ||
5846 | - if (__get_cpu_var(cpu_idle_state)) | |
5847 | - __get_cpu_var(cpu_idle_state) = 0; | |
5848 | - | |
5849 | - tick_nohz_stop_sched_tick(); | |
5850 | - | |
5851 | rmb(); | |
5852 | idle = xen_idle; /* no alternatives */ | |
5853 | if (cpu_is_offline(smp_processor_id())) | |
5854 | @@ -203,49 +199,27 @@ static void do_nothing(void *unused) | |
5855 | { | |
5856 | } | |
5857 | ||
5858 | +/* | |
5859 | + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of | |
5860 | + * pm_idle and update to new pm_idle value. Required while changing pm_idle | |
5861 | + * handler on SMP systems. | |
5862 | + * | |
5863 | + * Caller must have changed pm_idle to the new value before the call. Old | |
5864 | + * pm_idle value will not be used by any CPU after the return of this function. | |
5865 | + */ | |
5866 | void cpu_idle_wait(void) | |
5867 | { | |
5868 | - unsigned int cpu, this_cpu = get_cpu(); | |
5869 | - cpumask_t map, tmp = current->cpus_allowed; | |
5870 | - | |
5871 | - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | |
5872 | - put_cpu(); | |
5873 | - | |
5874 | - cpus_clear(map); | |
5875 | - for_each_online_cpu(cpu) { | |
5876 | - per_cpu(cpu_idle_state, cpu) = 1; | |
5877 | - cpu_set(cpu, map); | |
5878 | - } | |
5879 | - | |
5880 | - __get_cpu_var(cpu_idle_state) = 0; | |
5881 | - | |
5882 | - wmb(); | |
5883 | - do { | |
5884 | - ssleep(1); | |
5885 | - for_each_online_cpu(cpu) { | |
5886 | - if (cpu_isset(cpu, map) && | |
5887 | - !per_cpu(cpu_idle_state, cpu)) | |
5888 | - cpu_clear(cpu, map); | |
5889 | - } | |
5890 | - cpus_and(map, map, cpu_online_map); | |
5891 | - /* | |
5892 | - * We waited 1 sec, if a CPU still did not call idle | |
5893 | - * it may be because it is in idle and not waking up | |
5894 | - * because it has nothing to do. | |
5895 | - * Give all the remaining CPUS a kick. | |
5896 | - */ | |
5897 | - smp_call_function_mask(map, do_nothing, 0, 0); | |
5898 | - } while (!cpus_empty(map)); | |
5899 | - | |
5900 | - set_cpus_allowed(current, tmp); | |
5901 | + smp_mb(); | |
5902 | + /* kick all the CPUs so that they exit out of pm_idle */ | |
5903 | + smp_call_function(do_nothing, NULL, 0, 1); | |
5904 | } | |
5905 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | |
5906 | ||
5907 | -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | |
5908 | +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | |
5909 | { | |
5910 | } | |
5911 | ||
5912 | -static int __init idle_setup (char *str) | |
5913 | +static int __init idle_setup(char *str) | |
5914 | { | |
5915 | if (!strcmp(str, "poll")) { | |
5916 | printk("using polling idle threads.\n"); | |
5917 | @@ -260,13 +234,13 @@ static int __init idle_setup (char *str) | |
5918 | } | |
5919 | early_param("idle", idle_setup); | |
5920 | ||
5921 | -/* Prints also some state that isn't saved in the pt_regs */ | |
5922 | +/* Prints also some state that isn't saved in the pt_regs */ | |
5923 | void __show_regs(struct pt_regs * regs) | |
5924 | { | |
5925 | unsigned long fs, gs, shadowgs; | |
5926 | unsigned long d0, d1, d2, d3, d6, d7; | |
5927 | - unsigned int fsindex,gsindex; | |
5928 | - unsigned int ds,cs,es; | |
5929 | + unsigned int fsindex, gsindex; | |
5930 | + unsigned int ds, cs, es; | |
5931 | ||
5932 | printk("\n"); | |
5933 | print_modules(); | |
5934 | @@ -275,16 +249,16 @@ void __show_regs(struct pt_regs * regs) | |
5935 | init_utsname()->release, | |
5936 | (int)strcspn(init_utsname()->version, " "), | |
5937 | init_utsname()->version); | |
5938 | - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); | |
5939 | - printk_address(regs->rip); | |
5940 | - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, | |
5941 | - regs->eflags); | |
5942 | + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); | |
5943 | + printk_address(regs->ip, 1); | |
5944 | + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, | |
5945 | + regs->flags); | |
5946 | printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", | |
5947 | - regs->rax, regs->rbx, regs->rcx); | |
5948 | + regs->ax, regs->bx, regs->cx); | |
5949 | printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", | |
5950 | - regs->rdx, regs->rsi, regs->rdi); | |
5951 | + regs->dx, regs->si, regs->di); | |
5952 | printk("RBP: %016lx R08: %016lx R09: %016lx\n", | |
5953 | - regs->rbp, regs->r8, regs->r9); | |
5954 | + regs->bp, regs->r8, regs->r9); | |
5955 | printk("R10: %016lx R11: %016lx R12: %016lx\n", | |
5956 | regs->r10, regs->r11, regs->r12); | |
5957 | printk("R13: %016lx R14: %016lx R15: %016lx\n", | |
5958 | @@ -318,7 +292,7 @@ void show_regs(struct pt_regs *regs) | |
5959 | { | |
5960 | printk("CPU %d:", smp_processor_id()); | |
5961 | __show_regs(regs); | |
5962 | - show_trace(NULL, regs, (void *)(regs + 1)); | |
5963 | + show_trace(NULL, regs, (void *)(regs + 1), regs->bp); | |
5964 | } | |
5965 | ||
5966 | /* | |
5967 | @@ -329,7 +303,7 @@ void exit_thread(void) | |
5968 | struct task_struct *me = current; | |
5969 | struct thread_struct *t = &me->thread; | |
5970 | ||
5971 | - if (me->thread.io_bitmap_ptr) { | |
5972 | + if (me->thread.io_bitmap_ptr) { | |
5973 | #ifndef CONFIG_X86_NO_TSS | |
5974 | struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); | |
5975 | #endif | |
5976 | @@ -382,7 +356,7 @@ void flush_thread(void) | |
5977 | tsk->thread.debugreg3 = 0; | |
5978 | tsk->thread.debugreg6 = 0; | |
5979 | tsk->thread.debugreg7 = 0; | |
5980 | - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | |
5981 | + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | |
5982 | /* | |
5983 | * Forget coprocessor state.. | |
5984 | */ | |
5985 | @@ -405,26 +379,21 @@ void release_thread(struct task_struct * | |
5986 | ||
5987 | static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) | |
5988 | { | |
5989 | - struct user_desc ud = { | |
5990 | + struct user_desc ud = { | |
5991 | .base_addr = addr, | |
5992 | .limit = 0xfffff, | |
5993 | .seg_32bit = 1, | |
5994 | .limit_in_pages = 1, | |
5995 | .useable = 1, | |
5996 | }; | |
5997 | - struct n_desc_struct *desc = (void *)t->thread.tls_array; | |
5998 | + struct desc_struct *desc = t->thread.tls_array; | |
5999 | desc += tls; | |
6000 | - desc->a = LDT_entry_a(&ud); | |
6001 | - desc->b = LDT_entry_b(&ud); | |
6002 | + fill_ldt(desc, &ud); | |
6003 | } | |
6004 | ||
6005 | static inline u32 read_32bit_tls(struct task_struct *t, int tls) | |
6006 | { | |
6007 | - struct desc_struct *desc = (void *)t->thread.tls_array; | |
6008 | - desc += tls; | |
6009 | - return desc->base0 | | |
6010 | - (((u32)desc->base1) << 16) | | |
6011 | - (((u32)desc->base2) << 24); | |
6012 | + return get_desc_base(&t->thread.tls_array[tls]); | |
6013 | } | |
6014 | ||
6015 | /* | |
6016 | @@ -436,7 +405,7 @@ void prepare_to_copy(struct task_struct | |
6017 | unlazy_fpu(tsk); | |
6018 | } | |
6019 | ||
6020 | -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, | |
6021 | +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, | |
6022 | unsigned long unused, | |
6023 | struct task_struct * p, struct pt_regs * regs) | |
6024 | { | |
6025 | @@ -448,14 +417,14 @@ int copy_thread(int nr, unsigned long cl | |
6026 | (THREAD_SIZE + task_stack_page(p))) - 1; | |
6027 | *childregs = *regs; | |
6028 | ||
6029 | - childregs->rax = 0; | |
6030 | - childregs->rsp = rsp; | |
6031 | - if (rsp == ~0UL) | |
6032 | - childregs->rsp = (unsigned long)childregs; | |
6033 | - | |
6034 | - p->thread.rsp = (unsigned long) childregs; | |
6035 | - p->thread.rsp0 = (unsigned long) (childregs+1); | |
6036 | - p->thread.userrsp = me->thread.userrsp; | |
6037 | + childregs->ax = 0; | |
6038 | + childregs->sp = sp; | |
6039 | + if (sp == ~0UL) | |
6040 | + childregs->sp = (unsigned long)childregs; | |
6041 | + | |
6042 | + p->thread.sp = (unsigned long) childregs; | |
6043 | + p->thread.sp0 = (unsigned long) (childregs+1); | |
6044 | + p->thread.usersp = me->thread.usersp; | |
6045 | ||
6046 | set_tsk_thread_flag(p, TIF_FORK); | |
6047 | ||
6048 | @@ -476,7 +445,7 @@ int copy_thread(int nr, unsigned long cl | |
6049 | memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, | |
6050 | IO_BITMAP_BYTES); | |
6051 | set_tsk_thread_flag(p, TIF_IO_BITMAP); | |
6052 | - } | |
6053 | + } | |
6054 | ||
6055 | /* | |
6056 | * Set a new TLS for the child thread? | |
6057 | @@ -484,7 +453,8 @@ int copy_thread(int nr, unsigned long cl | |
6058 | if (clone_flags & CLONE_SETTLS) { | |
6059 | #ifdef CONFIG_IA32_EMULATION | |
6060 | if (test_thread_flag(TIF_IA32)) | |
6061 | - err = ia32_child_tls(p, childregs); | |
6062 | + err = do_set_thread_area(p, -1, | |
6063 | + (struct user_desc __user *)childregs->si, 0); | |
6064 | else | |
6065 | #endif | |
6066 | err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); | |
6067 | @@ -502,26 +472,32 @@ out: | |
6068 | return err; | |
6069 | } | |
6070 | ||
6071 | -static inline void __save_init_fpu( struct task_struct *tsk ) | |
6072 | -{ | |
6073 | - asm volatile( "rex64 ; fxsave %0 ; fnclex" | |
6074 | - : "=m" (tsk->thread.i387.fxsave)); | |
6075 | - tsk->thread_info->status &= ~TS_USEDFPU; | |
6076 | -} | |
6077 | - | |
6078 | /* | |
6079 | * This special macro can be used to load a debugging register | |
6080 | */ | |
6081 | -#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) | |
6082 | +#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r) | |
6083 | ||
6084 | static inline void __switch_to_xtra(struct task_struct *prev_p, | |
6085 | - struct task_struct *next_p) | |
6086 | + struct task_struct *next_p) | |
6087 | { | |
6088 | struct thread_struct *prev, *next; | |
6089 | + unsigned long debugctl; | |
6090 | ||
6091 | prev = &prev_p->thread, | |
6092 | next = &next_p->thread; | |
6093 | ||
6094 | + debugctl = prev->debugctlmsr; | |
6095 | + if (next->ds_area_msr != prev->ds_area_msr) { | |
6096 | + /* we clear debugctl to make sure DS | |
6097 | + * is not in use when we change it */ | |
6098 | + debugctl = 0; | |
6099 | + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); | |
6100 | + wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); | |
6101 | + } | |
6102 | + | |
6103 | + if (next->debugctlmsr != debugctl) | |
6104 | + wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr); | |
6105 | + | |
6106 | if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { | |
6107 | loaddebug(next, 0); | |
6108 | loaddebug(next, 1); | |
6109 | @@ -531,12 +507,20 @@ static inline void __switch_to_xtra(stru | |
6110 | loaddebug(next, 6); | |
6111 | loaddebug(next, 7); | |
6112 | } | |
6113 | + | |
6114 | +#ifdef X86_BTS | |
6115 | + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) | |
6116 | + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); | |
6117 | + | |
6118 | + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) | |
6119 | + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); | |
6120 | +#endif | |
6121 | } | |
6122 | ||
6123 | /* | |
6124 | * switch_to(x,y) should switch tasks from x to y. | |
6125 | * | |
6126 | - * This could still be optimized: | |
6127 | + * This could still be optimized: | |
6128 | * - fold all the options into a flag word and test it with a single test. | |
6129 | * - could test fs/gs bitsliced | |
6130 | * | |
6131 | @@ -547,7 +531,7 @@ __switch_to(struct task_struct *prev_p, | |
6132 | { | |
6133 | struct thread_struct *prev = &prev_p->thread, | |
6134 | *next = &next_p->thread; | |
6135 | - int cpu = smp_processor_id(); | |
6136 | + int cpu = smp_processor_id(); | |
6137 | #ifndef CONFIG_X86_NO_TSS | |
6138 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | |
6139 | #endif | |
6140 | @@ -581,11 +565,12 @@ __switch_to(struct task_struct *prev_p, | |
6141 | prev_p->fpu_counter = 0; | |
6142 | ||
6143 | /* | |
6144 | - * Reload esp0, LDT and the page table pointer: | |
6145 | + * Reload sp0. | |
6146 | + * This is load_sp0(tss, next) with a multicall. | |
6147 | */ | |
6148 | mcl->op = __HYPERVISOR_stack_switch; | |
6149 | mcl->args[0] = __KERNEL_DS; | |
6150 | - mcl->args[1] = next->rsp0; | |
6151 | + mcl->args[1] = next->sp0; | |
6152 | mcl++; | |
6153 | ||
6154 | /* | |
6155 | @@ -593,11 +578,12 @@ __switch_to(struct task_struct *prev_p, | |
6156 | * This is load_TLS(next, cpu) with multicalls. | |
6157 | */ | |
6158 | #define C(i) do { \ | |
6159 | - if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \ | |
6160 | + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ | |
6161 | + next->tls_array[i].b != prev->tls_array[i].b)) { \ | |
6162 | mcl->op = __HYPERVISOR_update_descriptor; \ | |
6163 | mcl->args[0] = virt_to_machine( \ | |
6164 | - &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \ | |
6165 | - mcl->args[1] = next->tls_array[i]; \ | |
6166 | + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ | |
6167 | + mcl->args[1] = *(u64 *)&next->tls_array[i]; \ | |
6168 | mcl++; \ | |
6169 | } \ | |
6170 | } while (0) | |
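The C(i) macro above is the Xen analogue of load_TLS(): instead of writing the GDT directly, each changed TLS slot is queued as one __HYPERVISOR_update_descriptor entry so the whole context switch goes out as a single multicall batch. A sketch of the same batching written as a loop, under the assumption that desc_struct exposes the a/b halves as in this tree:

	/* Illustrative sketch, not part of the patch. */
	static multicall_entry_t *queue_tls(multicall_entry_t *mcl, int cpu,
					    const struct desc_struct *prev,
					    const struct desc_struct *next)
	{
		int i;

		for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
			if (prev[i].a == next[i].a && prev[i].b == next[i].b)
				continue;	/* unchanged slot: no hypercall */
			mcl->op = __HYPERVISOR_update_descriptor;
			mcl->args[0] = virt_to_machine(
				&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);
			mcl->args[1] = *(const u64 *)&next[i];
			mcl++;
		}
		return mcl;	/* caller flushes the batch with one
				 * HYPERVISOR_multicall() */
	}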
6171 | @@ -605,7 +591,7 @@ __switch_to(struct task_struct *prev_p, | |
6172 | #undef C | |
6173 | ||
6174 | if (unlikely(prev->iopl != next->iopl)) { | |
6175 | - iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl; | |
6176 | + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3; | |
6177 | #if CONFIG_XEN_COMPAT > 0x030002 | |
6178 | mcl->op = __HYPERVISOR_physdev_op; | |
6179 | mcl->args[0] = PHYSDEVOP_set_iopl; | |
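The replaced line passed thread.iopl through unshifted; sys_iopl in this tree now stores the level pre-shifted into the EFLAGS.IOPL position (bits 13:12), so the hypercall argument has to shift it back down and mask to two bits. The 0-maps-to-1 default is unchanged from the '-' line:

	/* Illustrative restatement of the '+' line above. */
	iopl_op.iopl = (next->iopl == 0)
		? 1				/* never set: keep default */
		: (next->iopl >> 12) & 3;	/* EFLAGS bits 13:12 -> 0..3 */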
6180 | @@ -669,8 +655,8 @@ __switch_to(struct task_struct *prev_p, | |
6181 | /* | |
6182 | * Switch the PDA context. | |
6183 | */ | |
6184 | - prev->userrsp = read_pda(oldrsp); | |
6185 | - write_pda(oldrsp, next->userrsp); | |
6186 | + prev->usersp = read_pda(oldrsp); | |
6187 | + write_pda(oldrsp, next->usersp); | |
6188 | write_pda(pcurrent, next_p); | |
6189 | write_pda(kernelstack, | |
6190 | (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); | |
6191 | @@ -687,7 +673,8 @@ __switch_to(struct task_struct *prev_p, | |
6192 | /* | |
6193 | * Now maybe reload the debug registers | |
6194 | */ | |
6195 | - if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) | |
6196 | + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || | |
6197 | + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) | |
6198 | __switch_to_xtra(prev_p, next_p); | |
6199 | ||
6200 | /* If the task has used fpu the last 5 timeslices, just do a full | |
6201 | @@ -702,23 +689,18 @@ __switch_to(struct task_struct *prev_p, | |
6202 | /* | |
6203 | * sys_execve() executes a new program. | |
6204 | */ | |
6205 | -asmlinkage | |
6206 | +asmlinkage | |
6207 | long sys_execve(char __user *name, char __user * __user *argv, | |
6208 | - char __user * __user *envp, struct pt_regs regs) | |
6209 | + char __user * __user *envp, struct pt_regs *regs) | |
6210 | { | |
6211 | long error; | |
6212 | char * filename; | |
6213 | ||
6214 | filename = getname(name); | |
6215 | error = PTR_ERR(filename); | |
6216 | - if (IS_ERR(filename)) | |
6217 | + if (IS_ERR(filename)) | |
6218 | return error; | |
6219 | - error = do_execve(filename, argv, envp, ®s); | |
6220 | - if (error == 0) { | |
6221 | - task_lock(current); | |
6222 | - current->ptrace &= ~PT_DTRACE; | |
6223 | - task_unlock(current); | |
6224 | - } | |
6225 | + error = do_execve(filename, argv, envp, regs); | |
6226 | putname(filename); | |
6227 | return error; | |
6228 | } | |
6229 | @@ -728,18 +710,18 @@ void set_personality_64bit(void) | |
6230 | /* inherit personality from parent */ | |
6231 | ||
6232 | /* Make sure to be in 64bit mode */ | |
6233 | - clear_thread_flag(TIF_IA32); | |
6234 | + clear_thread_flag(TIF_IA32); | |
6235 | ||
6236 | /* TBD: overwrites user setup. Should have two bits. | |
6237 | But 64bit processes have always behaved this way, | |
6238 | so it's not too bad. The main problem is just that | |
6239 | - 32bit childs are affected again. */ | |
6240 | + 32bit childs are affected again. */ | |
6241 | current->personality &= ~READ_IMPLIES_EXEC; | |
6242 | } | |
6243 | ||
6244 | asmlinkage long sys_fork(struct pt_regs *regs) | |
6245 | { | |
6246 | - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); | |
6247 | + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); | |
6248 | } | |
6249 | ||
6250 | asmlinkage long | |
6251 | @@ -747,7 +729,7 @@ sys_clone(unsigned long clone_flags, uns | |
6252 | void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) | |
6253 | { | |
6254 | if (!newsp) | |
6255 | - newsp = regs->rsp; | |
6256 | + newsp = regs->sp; | |
6257 | return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); | |
6258 | } | |
6259 | ||
6260 | @@ -763,29 +745,29 @@ sys_clone(unsigned long clone_flags, uns | |
6261 | */ | |
6262 | asmlinkage long sys_vfork(struct pt_regs *regs) | |
6263 | { | |
6264 | - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, | |
6265 | + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, | |
6266 | NULL, NULL); | |
6267 | } | |
6268 | ||
6269 | unsigned long get_wchan(struct task_struct *p) | |
6270 | { | |
6271 | unsigned long stack; | |
6272 | - u64 fp,rip; | |
6273 | + u64 fp,ip; | |
6274 | int count = 0; | |
6275 | ||
6276 | if (!p || p == current || p->state==TASK_RUNNING) | |
6277 | return 0; | |
6278 | stack = (unsigned long)task_stack_page(p); | |
6279 | - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) | |
6280 | + if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) | |
6281 | return 0; | |
6282 | - fp = *(u64 *)(p->thread.rsp); | |
6283 | + fp = *(u64 *)(p->thread.sp); | |
6284 | do { | |
6285 | if (fp < (unsigned long)stack || | |
6286 | fp > (unsigned long)stack+THREAD_SIZE) | |
6287 | return 0; | |
6288 | - rip = *(u64 *)(fp+8); | |
6289 | - if (!in_sched_functions(rip)) | |
6290 | - return rip; | |
6291 | + ip = *(u64 *)(fp+8); | |
6292 | + if (!in_sched_functions(ip)) | |
6293 | + return ip; | |
6294 | fp = *(u64 *)fp; | |
6295 | } while (count++ < 16); | |
6296 | return 0; | |
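get_wchan() above recovers the blocked task's wait channel by walking saved frame pointers: the first word at thread.sp is the frame pointer, the return address sits at fp+8 on x86-64, and *fp links to the caller's frame. A condensed sketch of the walk, assuming CONFIG_FRAME_POINTER so those offsets hold:

	/* Illustrative sketch, not part of the patch. */
	static unsigned long walk_for_wchan(unsigned long sp, unsigned long stack)
	{
		u64 fp = *(u64 *)sp;	/* saved frame pointer */
		u64 ip;
		int count = 0;

		do {
			if (fp < stack || fp > stack + THREAD_SIZE)
				return 0;	/* left the task's stack */
			ip = *(u64 *)(fp + 8);	/* saved return address */
			if (!in_sched_functions(ip))
				return ip;	/* first PC outside the scheduler */
			fp = *(u64 *)fp;	/* up one frame */
		} while (count++ < 16);		/* bound deep/cyclic chains */
		return 0;
	}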
6297 | @@ -827,19 +809,19 @@ long do_arch_prctl(struct task_struct *t | |
6298 | /* Not strictly needed for fs, but do it for symmetry | |
6299 | with gs */ | |
6300 | if (addr >= TASK_SIZE_OF(task)) | |
6301 | - return -EPERM; | |
6302 | + return -EPERM; | |
6303 | cpu = get_cpu(); | |
6304 | - /* handle small bases via the GDT because that's faster to | |
6305 | + /* handle small bases via the GDT because that's faster to | |
6306 | switch. */ | |
6307 | - if (addr <= 0xffffffff) { | |
6308 | + if (addr <= 0xffffffff) { | |
6309 | set_32bit_tls(task, FS_TLS, addr); | |
6310 | - if (doit) { | |
6311 | - load_TLS(&task->thread, cpu); | |
6312 | + if (doit) { | |
6313 | + load_TLS(&task->thread, cpu); | |
6314 | asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); | |
6315 | } | |
6316 | task->thread.fsindex = FS_TLS_SEL; | |
6317 | task->thread.fs = 0; | |
6318 | - } else { | |
6319 | + } else { | |
6320 | task->thread.fsindex = 0; | |
6321 | task->thread.fs = addr; | |
6322 | if (doit) { | |
6323 | @@ -852,24 +834,24 @@ long do_arch_prctl(struct task_struct *t | |
6324 | } | |
6325 | put_cpu(); | |
6326 | break; | |
6327 | - case ARCH_GET_FS: { | |
6328 | - unsigned long base; | |
6329 | + case ARCH_GET_FS: { | |
6330 | + unsigned long base; | |
6331 | if (task->thread.fsindex == FS_TLS_SEL) | |
6332 | base = read_32bit_tls(task, FS_TLS); | |
6333 | else if (doit) | |
6334 | rdmsrl(MSR_FS_BASE, base); | |
6335 | else | |
6336 | base = task->thread.fs; | |
6337 | - ret = put_user(base, (unsigned long __user *)addr); | |
6338 | - break; | |
6339 | + ret = put_user(base, (unsigned long __user *)addr); | |
6340 | + break; | |
6341 | } | |
6342 | - case ARCH_GET_GS: { | |
6343 | + case ARCH_GET_GS: { | |
6344 | unsigned long base; | |
6345 | unsigned gsindex; | |
6346 | if (task->thread.gsindex == GS_TLS_SEL) | |
6347 | base = read_32bit_tls(task, GS_TLS); | |
6348 | else if (doit) { | |
6349 | - asm("movl %%gs,%0" : "=r" (gsindex)); | |
6350 | + asm("movl %%gs,%0" : "=r" (gsindex)); | |
6351 | if (gsindex) | |
6352 | rdmsrl(MSR_KERNEL_GS_BASE, base); | |
6353 | else | |
6354 | @@ -877,40 +859,21 @@ long do_arch_prctl(struct task_struct *t | |
6355 | } | |
6356 | else | |
6357 | base = task->thread.gs; | |
6358 | - ret = put_user(base, (unsigned long __user *)addr); | |
6359 | + ret = put_user(base, (unsigned long __user *)addr); | |
6360 | break; | |
6361 | } | |
6362 | ||
6363 | default: | |
6364 | ret = -EINVAL; | |
6365 | break; | |
6366 | - } | |
6367 | + } | |
6368 | ||
6369 | - return ret; | |
6370 | -} | |
6371 | + return ret; | |
6372 | +} | |
6373 | ||
6374 | long sys_arch_prctl(int code, unsigned long addr) | |
6375 | { | |
6376 | return do_arch_prctl(current, code, addr); | |
6377 | -} | |
6378 | - | |
6379 | -/* | |
6380 | - * Capture the user space registers if the task is not running (in user space) | |
6381 | - */ | |
6382 | -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) | |
6383 | -{ | |
6384 | - struct pt_regs *pp, ptregs; | |
6385 | - | |
6386 | - pp = task_pt_regs(tsk); | |
6387 | - | |
6388 | - ptregs = *pp; | |
6389 | - ptregs.cs &= 0xffff; | |
6390 | - ptregs.ss &= 0xffff; | |
6391 | - | |
6392 | - elf_core_copy_regs(regs, &ptregs); | |
6393 | - | |
6394 | - boot_option_idle_override = 1; | |
6395 | - return 1; | |
6396 | } | |
6397 | ||
6398 | unsigned long arch_align_stack(unsigned long sp) | |
6399 | @@ -919,3 +882,9 @@ unsigned long arch_align_stack(unsigned | |
6400 | sp -= get_random_int() % 8192; | |
6401 | return sp & ~0xf; | |
6402 | } | |
6403 | + | |
6404 | +unsigned long arch_randomize_brk(struct mm_struct *mm) | |
6405 | +{ | |
6406 | + unsigned long range_end = mm->brk + 0x02000000; | |
6407 | + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; | |
6408 | +} | |
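arch_randomize_brk() is new in this file; an annotated restatement follows. The GNU 'x ? : y' shorthand keeps the unrandomized mm->brk when randomize_range() reports failure by returning 0:

	/* Illustrative restatement of the function added above. */
	unsigned long example_randomize_brk(struct mm_struct *mm)
	{
		unsigned long range_end = mm->brk + 0x02000000;	/* 32 MiB window */

		/* randomize_range() hands back a page-aligned address in
		 * [brk, range_end), or 0 on failure. */
		return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
	}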
6409 | --- a/arch/x86/kernel/quirks-xen.c | |
6410 | +++ b/arch/x86/kernel/quirks-xen.c | |
6411 | @@ -9,7 +9,7 @@ | |
6412 | static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) | |
6413 | { | |
6414 | u8 config, rev; | |
6415 | - u32 word; | |
6416 | + u16 word; | |
6417 | ||
6418 | /* BIOS may enable hardware IRQ balancing for | |
6419 | * E7520/E7320/E7525(revision ID 0x9 and below) | |
6420 | @@ -24,14 +24,17 @@ static void __devinit quirk_intel_irqbal | |
6421 | pci_read_config_byte(dev, 0xf4, &config); | |
6422 | pci_write_config_byte(dev, 0xf4, config|0x2); | |
6423 | ||
6424 | - /* read xTPR register */ | |
6425 | - raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); | |
6426 | + /* | |
6427 | + * read xTPR register. We may not have a pci_dev for device 8 | |
6428 | + * because it might be hidden until the above write. | |
6429 | + */ | |
6430 | + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word); | |
6431 | ||
6432 | if (!(word & (1 << 13))) { | |
6433 | struct xen_platform_op op; | |
6434 | ||
6435 | - printk(KERN_INFO "Intel E7520/7320/7525 detected. " | |
6436 | - "Disabling irq balancing and affinity\n"); | |
6437 | + dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " | |
6438 | + "disabling irq balancing and affinity\n"); | |
6439 | op.cmd = XENPF_platform_quirk; | |
6440 | op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; | |
6441 | WARN_ON(HYPERVISOR_platform_op(&op)); | |
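Two fixes ride in the hunk above: the destination becomes a u16 because only a word is read, and the raw bus-0 access is replaced with pci_bus_read_config_word() relative to dev->bus, since device 8 may have no struct pci_dev until the 0xf4 write unhides it. A sketch of the pattern; the wrapper name is illustrative:

	/* Illustrative sketch, not part of the patch. */
	static int read_xtpr(struct pci_dev *dev, u16 *word)
	{
		/* A u32 destination would leave its upper half
		 * uninitialized -- the bug the u16 change fixes. */
		return pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0),
						0x4c, word);
	}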
6442 | @@ -102,14 +105,16 @@ static void ich_force_enable_hpet(struct | |
6443 | pci_read_config_dword(dev, 0xF0, &rcba); | |
6444 | rcba &= 0xFFFFC000; | |
6445 | if (rcba == 0) { | |
6446 | - printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n"); | |
6447 | + dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; " | |
6448 | + "cannot force enable HPET\n"); | |
6449 | return; | |
6450 | } | |
6451 | ||
6452 | /* use bits 31:14, 16 kB aligned */ | |
6453 | rcba_base = ioremap_nocache(rcba, 0x4000); | |
6454 | if (rcba_base == NULL) { | |
6455 | - printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n"); | |
6456 | + dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; " | |
6457 | + "cannot force enable HPET\n"); | |
6458 | return; | |
6459 | } | |
6460 | ||
6461 | @@ -120,8 +125,8 @@ static void ich_force_enable_hpet(struct | |
6462 | /* HPET is enabled in HPTC. Just not reported by BIOS */ | |
6463 | val = val & 0x3; | |
6464 | force_hpet_address = 0xFED00000 | (val << 12); | |
6465 | - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", | |
6466 | - force_hpet_address); | |
6467 | + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " | |
6468 | + "0x%lx\n", force_hpet_address); | |
6469 | iounmap(rcba_base); | |
6470 | return; | |
6471 | } | |
6472 | @@ -140,11 +145,12 @@ static void ich_force_enable_hpet(struct | |
6473 | if (err) { | |
6474 | force_hpet_address = 0; | |
6475 | iounmap(rcba_base); | |
6476 | - printk(KERN_DEBUG "Failed to force enable HPET\n"); | |
6477 | + dev_printk(KERN_DEBUG, &dev->dev, | |
6478 | + "Failed to force enable HPET\n"); | |
6479 | } else { | |
6480 | force_hpet_resume_type = ICH_FORCE_HPET_RESUME; | |
6481 | - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", | |
6482 | - force_hpet_address); | |
6483 | + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " | |
6484 | + "0x%lx\n", force_hpet_address); | |
6485 | } | |
6486 | } | |
6487 | ||
6488 | @@ -160,6 +166,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I | |
6489 | ich_force_enable_hpet); | |
6490 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, | |
6491 | ich_force_enable_hpet); | |
6492 | +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, | |
6493 | + ich_force_enable_hpet); | |
6494 | ||
6495 | ||
6496 | static struct pci_dev *cached_dev; | |
6497 | @@ -204,8 +212,8 @@ static void old_ich_force_enable_hpet(st | |
6498 | if (val & 0x4) { | |
6499 | val &= 0x3; | |
6500 | force_hpet_address = 0xFED00000 | (val << 12); | |
6501 | - printk(KERN_DEBUG "HPET at base address 0x%lx\n", | |
6502 | - force_hpet_address); | |
6503 | + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", | |
6504 | + force_hpet_address); | |
6505 | return; | |
6506 | } | |
6507 | ||
6508 | @@ -225,14 +233,14 @@ static void old_ich_force_enable_hpet(st | |
6509 | /* HPET is enabled in HPTC. Just not reported by BIOS */ | |
6510 | val &= 0x3; | |
6511 | force_hpet_address = 0xFED00000 | (val << 12); | |
6512 | - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", | |
6513 | - force_hpet_address); | |
6514 | + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " | |
6515 | + "0x%lx\n", force_hpet_address); | |
6516 | cached_dev = dev; | |
6517 | force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; | |
6518 | return; | |
6519 | } | |
6520 | ||
6521 | - printk(KERN_DEBUG "Failed to force enable HPET\n"); | |
6522 | + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); | |
6523 | } | |
6524 | ||
6525 | /* | |
6526 | @@ -290,8 +298,8 @@ static void vt8237_force_enable_hpet(str | |
6527 | */ | |
6528 | if (val & 0x80) { | |
6529 | force_hpet_address = (val & ~0x3ff); | |
6530 | - printk(KERN_DEBUG "HPET at base address 0x%lx\n", | |
6531 | - force_hpet_address); | |
6532 | + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", | |
6533 | + force_hpet_address); | |
6534 | return; | |
6535 | } | |
6536 | ||
6537 | @@ -305,14 +313,14 @@ static void vt8237_force_enable_hpet(str | |
6538 | pci_read_config_dword(dev, 0x68, &val); | |
6539 | if (val & 0x80) { | |
6540 | force_hpet_address = (val & ~0x3ff); | |
6541 | - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", | |
6542 | - force_hpet_address); | |
6543 | + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " | |
6544 | + "0x%lx\n", force_hpet_address); | |
6545 | cached_dev = dev; | |
6546 | force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; | |
6547 | return; | |
6548 | } | |
6549 | ||
6550 | - printk(KERN_DEBUG "Failed to force enable HPET\n"); | |
6551 | + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); | |
6552 | } | |
6553 | ||
6554 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, | |
6555 | @@ -340,7 +348,7 @@ static void nvidia_force_enable_hpet(str | |
6556 | pci_read_config_dword(dev, 0x44, &val); | |
6557 | force_hpet_address = val & 0xfffffffe; | |
6558 | force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME; | |
6559 | - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", | |
6560 | + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", | |
6561 | force_hpet_address); | |
6562 | cached_dev = dev; | |
6563 | return; | |
6564 | @@ -353,6 +361,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N | |
6565 | nvidia_force_enable_hpet); | |
6566 | ||
6567 | /* LPC bridges */ | |
6568 | +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260, | |
6569 | + nvidia_force_enable_hpet); | |
6570 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360, | |
6571 | nvidia_force_enable_hpet); | |
6572 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361, | |
6573 | @@ -373,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N | |
6574 | void force_hpet_resume(void) | |
6575 | { | |
6576 | switch (force_hpet_resume_type) { | |
6577 | - case ICH_FORCE_HPET_RESUME: | |
6578 | - return ich_force_hpet_resume(); | |
6579 | - | |
6580 | - case OLD_ICH_FORCE_HPET_RESUME: | |
6581 | - return old_ich_force_hpet_resume(); | |
6582 | - | |
6583 | - case VT8237_FORCE_HPET_RESUME: | |
6584 | - return vt8237_force_hpet_resume(); | |
6585 | - | |
6586 | - case NVIDIA_FORCE_HPET_RESUME: | |
6587 | - return nvidia_force_hpet_resume(); | |
6588 | - | |
6589 | - default: | |
6590 | + case ICH_FORCE_HPET_RESUME: | |
6591 | + ich_force_hpet_resume(); | |
6592 | + return; | |
6593 | + case OLD_ICH_FORCE_HPET_RESUME: | |
6594 | + old_ich_force_hpet_resume(); | |
6595 | + return; | |
6596 | + case VT8237_FORCE_HPET_RESUME: | |
6597 | + vt8237_force_hpet_resume(); | |
6598 | + return; | |
6599 | + case NVIDIA_FORCE_HPET_RESUME: | |
6600 | + nvidia_force_hpet_resume(); | |
6601 | + return; | |
6602 | + default: | |
6603 | break; | |
6604 | } | |
6605 | } | |
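Besides the switch restructuring (plain call-then-return instead of returning a void expression), the file-wide theme above is moving from bare printk() to dev_printk()/dev_info(), which prefixes each message with the PCI device name. A minimal sketch of the difference:

	/* Illustrative sketch, not part of the patch. */
	static void report_hpet(struct pci_dev *dev, unsigned long addr)
	{
		/* Prints e.g. "pci 0000:00:1f.0: Force enabled HPET at 0x..."
		 * instead of an unattributed KERN_DEBUG line. */
		dev_printk(KERN_DEBUG, &dev->dev,
			   "Force enabled HPET at 0x%lx\n", addr);
	}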
6606 | --- a/arch/x86/kernel/rtc.c | |
6607 | +++ b/arch/x86/kernel/rtc.c | |
6608 | @@ -181,6 +181,10 @@ unsigned long read_persistent_clock(void | |
6609 | { | |
6610 | unsigned long retval, flags; | |
6611 | ||
6612 | +#ifdef CONFIG_XEN | |
6613 | + if (!is_initial_xendomain()) | |
6614 | + return xen_read_persistent_clock(); | |
6615 | +#endif | |
6616 | spin_lock_irqsave(&rtc_lock, flags); | |
6617 | retval = get_wallclock(); | |
6618 | spin_unlock_irqrestore(&rtc_lock, flags); | |
6619 | @@ -190,6 +194,10 @@ unsigned long read_persistent_clock(void | |
6620 | ||
6621 | int update_persistent_clock(struct timespec now) | |
6622 | { | |
6623 | +#ifdef CONFIG_XEN | |
6624 | + if (xen_update_persistent_clock() < 0 || xen_independent_wallclock()) | |
6625 | + return 0; | |
6626 | +#endif | |
6627 | return set_rtc_mmss(now.tv_sec); | |
6628 | } | |
6629 | ||
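The rtc.c hooks above gate the CMOS paths for Xen guests: a domU has no RTC, so reads come from the hypervisor wallclock, and writes are skipped when the guest keeps an independent wallclock or the hypervisor update fails. An annotated restatement of the read side, assuming the xen_*() helpers this patch series provides elsewhere:

	/* Illustrative restatement of the read path above. */
	unsigned long example_read_persistent_clock(void)
	{
		unsigned long retval, flags;

	#ifdef CONFIG_XEN
		if (!is_initial_xendomain())		/* domU: no CMOS RTC */
			return xen_read_persistent_clock();
	#endif
		spin_lock_irqsave(&rtc_lock, flags);	/* CMOS is not reentrant */
		retval = get_wallclock();
		spin_unlock_irqrestore(&rtc_lock, flags);
		return retval;
	}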
6630 | --- a/arch/x86/kernel/setup_32-xen.c | |
6631 | +++ b/arch/x86/kernel/setup_32-xen.c | |
6632 | @@ -47,9 +47,12 @@ | |
6633 | #include <linux/crash_dump.h> | |
6634 | #include <linux/dmi.h> | |
6635 | #include <linux/pfn.h> | |
6636 | +#include <linux/pci.h> | |
6637 | +#include <linux/init_ohci1394_dma.h> | |
6638 | ||
6639 | #include <video/edid.h> | |
6640 | ||
6641 | +#include <asm/mtrr.h> | |
6642 | #include <asm/apic.h> | |
6643 | #include <asm/e820.h> | |
6644 | #include <asm/mpspec.h> | |
6645 | @@ -79,14 +82,83 @@ static struct notifier_block xen_panic_b | |
6646 | xen_panic_event, NULL, 0 /* try to go last */ | |
6647 | }; | |
6648 | ||
6649 | -int disable_pse __cpuinitdata = 0; | |
6650 | - | |
6651 | /* | |
6652 | * Machine setup.. | |
6653 | */ | |
6654 | -extern struct resource code_resource; | |
6655 | -extern struct resource data_resource; | |
6656 | -extern struct resource bss_resource; | |
6657 | +static struct resource data_resource = { | |
6658 | + .name = "Kernel data", | |
6659 | + .start = 0, | |
6660 | + .end = 0, | |
6661 | + .flags = IORESOURCE_BUSY | IORESOURCE_MEM | |
6662 | +}; | |
6663 | + | |
6664 | +static struct resource code_resource = { | |
6665 | + .name = "Kernel code", | |
6666 | + .start = 0, | |
6667 | + .end = 0, | |
6668 | + .flags = IORESOURCE_BUSY | IORESOURCE_MEM | |
6669 | +}; | |
6670 | + | |
6671 | +static struct resource bss_resource = { | |
6672 | + .name = "Kernel bss", | |
6673 | + .start = 0, | |
6674 | + .end = 0, | |
6675 | + .flags = IORESOURCE_BUSY | IORESOURCE_MEM | |
6676 | +}; | |
6677 | + | |
6678 | +static struct resource video_ram_resource = { | |
6679 | + .name = "Video RAM area", | |
6680 | + .start = 0xa0000, | |
6681 | + .end = 0xbffff, | |
6682 | + .flags = IORESOURCE_BUSY | IORESOURCE_MEM | |
6683 | +}; | |
6684 | + | |
6685 | +static struct resource standard_io_resources[] = { { | |
6686 | + .name = "dma1", | |
6687 | + .start = 0x0000, | |
6688 | + .end = 0x001f, | |
6689 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
6690 | +}, { | |
6691 | + .name = "pic1", | |
6692 | + .start = 0x0020, | |
6693 | + .end = 0x0021, | |
6694 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
6695 | +}, { | |
6696 | + .name = "timer0", | |
6697 | + .start = 0x0040, | |
6698 | + .end = 0x0043, | |
6699 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
6700 | +}, { | |
6701 | + .name = "timer1", | |
6702 | + .start = 0x0050, | |
6703 | + .end = 0x0053, | |
6704 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
6705 | +}, { | |
6706 | + .name = "keyboard", | |
6707 | + .start = 0x0060, | |
6708 | + .end = 0x006f, | |
6709 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
6710 | +}, { | |
6711 | + .name = "dma page reg", | |
6712 | + .start = 0x0080, | |
6713 | + .end = 0x008f, | |
6714 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
6715 | +}, { | |
6716 | + .name = "pic2", | |
6717 | + .start = 0x00a0, | |
6718 | + .end = 0x00a1, | |
6719 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
6720 | +}, { | |
6721 | + .name = "dma2", | |
6722 | + .start = 0x00c0, | |
6723 | + .end = 0x00df, | |
6724 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
6725 | +}, { | |
6726 | + .name = "fpu", | |
6727 | + .start = 0x00f0, | |
6728 | + .end = 0x00ff, | |
6729 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
6730 | +} }; | |
6731 | ||
6732 | /* cpu data as detected by the assembly code in head.S */ | |
6733 | struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; | |
6734 | @@ -94,13 +166,16 @@ struct cpuinfo_x86 new_cpu_data __cpuini | |
6735 | struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; | |
6736 | EXPORT_SYMBOL(boot_cpu_data); | |
6737 | ||
6738 | +#ifndef CONFIG_X86_PAE | |
6739 | unsigned long mmu_cr4_features; | |
6740 | +#else | |
6741 | +unsigned long mmu_cr4_features = X86_CR4_PAE; | |
6742 | +#endif | |
6743 | ||
6744 | /* for MCA, but anyone else can use it if they want */ | |
6745 | unsigned int machine_id; | |
6746 | unsigned int machine_submodel_id; | |
6747 | unsigned int BIOS_revision; | |
6748 | -unsigned int mca_pentium_flag; | |
6749 | ||
6750 | /* Boot loader ID as an integer, for the benefit of proc_dointvec */ | |
6751 | int bootloader_type; | |
6752 | @@ -131,13 +206,17 @@ extern int root_mountflags; | |
6753 | ||
6754 | unsigned long saved_videomode; | |
6755 | ||
6756 | -#define RAMDISK_IMAGE_START_MASK 0x07FF | |
6757 | +#define RAMDISK_IMAGE_START_MASK 0x07FF | |
6758 | #define RAMDISK_PROMPT_FLAG 0x8000 | |
6759 | -#define RAMDISK_LOAD_FLAG 0x4000 | |
6760 | +#define RAMDISK_LOAD_FLAG 0x4000 | |
6761 | ||
6762 | static char __initdata command_line[COMMAND_LINE_SIZE]; | |
6763 | ||
6764 | +#ifndef CONFIG_DEBUG_BOOT_PARAMS | |
6765 | struct boot_params __initdata boot_params; | |
6766 | +#else | |
6767 | +struct boot_params boot_params; | |
6768 | +#endif | |
6769 | ||
6770 | /* | |
6771 | * Point at the empty zero page to start with. We map the real shared_info | |
6772 | @@ -198,8 +277,7 @@ static int __init parse_mem(char *arg) | |
6773 | return -EINVAL; | |
6774 | ||
6775 | if (strcmp(arg, "nopentium") == 0) { | |
6776 | - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); | |
6777 | - disable_pse = 1; | |
6778 | + setup_clear_cpu_cap(X86_FEATURE_PSE); | |
6779 | } else { | |
6780 | /* If the user specifies memory size, we | |
6781 | * limit the BIOS-provided memory map to | |
6782 | @@ -208,7 +286,7 @@ static int __init parse_mem(char *arg) | |
6783 | * trim the existing memory map. | |
6784 | */ | |
6785 | unsigned long long mem_size; | |
6786 | - | |
6787 | + | |
6788 | mem_size = memparse(arg, &arg); | |
6789 | limit_regions(mem_size); | |
6790 | user_defined_memmap = 1; | |
6791 | @@ -350,7 +428,7 @@ static void __init reserve_ebda_region(v | |
6792 | unsigned int addr; | |
6793 | addr = get_bios_ebda(); | |
6794 | if (addr) | |
6795 | - reserve_bootmem(addr, PAGE_SIZE); | |
6796 | + reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT); | |
6797 | } | |
6798 | #endif | |
6799 | ||
6800 | @@ -365,8 +443,6 @@ static unsigned long __init setup_memory | |
6801 | min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) + | |
6802 | xen_start_info->nr_pt_frames; | |
6803 | ||
6804 | - find_max_pfn(); | |
6805 | - | |
6806 | max_low_pfn = find_max_low_pfn(); | |
6807 | ||
6808 | #ifdef CONFIG_HIGHMEM | |
6809 | @@ -447,7 +523,8 @@ static void __init reserve_crashkernel(v | |
6810 | (unsigned long)(total_mem >> 20)); | |
6811 | crashk_res.start = crash_base; | |
6812 | crashk_res.end = crash_base + crash_size - 1; | |
6813 | - reserve_bootmem(crash_base, crash_size); | |
6814 | + reserve_bootmem(crash_base, crash_size, | |
6815 | + BOOTMEM_DEFAULT); | |
6816 | } else | |
6817 | printk(KERN_INFO "crashkernel reservation failed - " | |
6818 | "you have to specify a base address\n"); | |
6819 | @@ -461,6 +538,99 @@ static inline void __init reserve_crashk | |
6820 | {} | |
6821 | #endif | |
6822 | ||
6823 | +#ifdef CONFIG_BLK_DEV_INITRD | |
6824 | + | |
6825 | +static bool do_relocate_initrd = false; | |
6826 | + | |
6827 | +static void __init reserve_initrd(void) | |
6828 | +{ | |
6829 | + unsigned long ramdisk_image = __pa(xen_start_info->mod_start); | |
6830 | + unsigned long ramdisk_size = xen_start_info->mod_len; | |
6831 | + unsigned long ramdisk_end = ramdisk_image + ramdisk_size; | |
6832 | + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; | |
6833 | + unsigned long ramdisk_here; | |
6834 | + | |
6835 | + initrd_start = 0; | |
6836 | + | |
6837 | + if (!xen_start_info->mod_start || !ramdisk_size) | |
6838 | + return; /* No initrd provided by bootloader */ | |
6839 | + | |
6840 | + if (ramdisk_end < ramdisk_image) { | |
6841 | + printk(KERN_ERR "initrd wraps around end of memory, " | |
6842 | + "disabling initrd\n"); | |
6843 | + return; | |
6844 | + } | |
6845 | + if (ramdisk_size >= end_of_lowmem/2) { | |
6846 | + printk(KERN_ERR "initrd too large to handle, " | |
6847 | + "disabling initrd\n"); | |
6848 | + return; | |
6849 | + } | |
6850 | + if (ramdisk_end <= end_of_lowmem) { | |
6851 | + /* All in lowmem, easy case */ | |
6852 | + reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT); | |
6853 | + initrd_start = ramdisk_image + PAGE_OFFSET; | |
6854 | + initrd_end = initrd_start+ramdisk_size; | |
6855 | + return; | |
6856 | + } | |
6857 | + | |
6858 | + /* We need to move the initrd down into lowmem */ | |
6859 | + ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK; | |
6860 | + | |
6861 | + /* Note: this includes all the lowmem currently occupied by | |
6862 | + the initrd, we rely on that fact to keep the data intact. */ | |
6863 | + reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT); | |
6864 | + initrd_start = ramdisk_here + PAGE_OFFSET; | |
6865 | + initrd_end = initrd_start + ramdisk_size; | |
6866 | + | |
6867 | + do_relocate_initrd = true; | |
6868 | +} | |
6869 | + | |
6870 | +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) | |
6871 | + | |
6872 | +static void __init relocate_initrd(void) | |
6873 | +{ | |
6874 | + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; | |
6875 | + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; | |
6876 | + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; | |
6877 | + unsigned long ramdisk_here; | |
6878 | + unsigned long slop, clen, mapaddr; | |
6879 | + char *p, *q; | |
6880 | + | |
6881 | + if (!do_relocate_initrd) | |
6882 | + return; | |
6883 | + | |
6884 | + ramdisk_here = initrd_start - PAGE_OFFSET; | |
6885 | + | |
6886 | + q = (char *)initrd_start; | |
6887 | + | |
6888 | + /* Copy any lowmem portion of the initrd */ | |
6889 | + if (ramdisk_image < end_of_lowmem) { | |
6890 | + clen = end_of_lowmem - ramdisk_image; | |
6891 | + p = (char *)__va(ramdisk_image); | |
6892 | + memcpy(q, p, clen); | |
6893 | + q += clen; | |
6894 | + ramdisk_image += clen; | |
6895 | + ramdisk_size -= clen; | |
6896 | + } | |
6897 | + | |
6898 | + /* Copy the highmem portion of the initrd */ | |
6899 | + while (ramdisk_size) { | |
6900 | + slop = ramdisk_image & ~PAGE_MASK; | |
6901 | + clen = ramdisk_size; | |
6902 | + if (clen > MAX_MAP_CHUNK-slop) | |
6903 | + clen = MAX_MAP_CHUNK-slop; | |
6904 | + mapaddr = ramdisk_image & PAGE_MASK; | |
6905 | + p = early_ioremap(mapaddr, clen+slop); | |
6906 | + memcpy(q, p+slop, clen); | |
6907 | + early_iounmap(p, clen+slop); | |
6908 | + q += clen; | |
6909 | + ramdisk_image += clen; | |
6910 | + ramdisk_size -= clen; | |
6911 | + } | |
6912 | +} | |
6913 | + | |
6914 | +#endif /* CONFIG_BLK_DEV_INITRD */ | |
6915 | + | |
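relocate_initrd() above cannot simply memcpy the highmem half: this early only the NR_FIX_BTMAPS boot-time fixmap slots exist, so at most MAX_MAP_CHUNK bytes can be mapped per iteration, and early_ioremap() wants a page-aligned start ('slop' re-adds the in-page offset). The copy loop condensed, with the assumed locals dst/phys/size spelled out:

	/* Illustrative restatement of the loop above; dst is the lowmem
	 * destination, phys/size describe the remaining highmem source. */
	while (size) {
		unsigned long slop = phys & ~PAGE_MASK;	/* offset into page */
		unsigned long clen = size;
		char *p;

		if (clen > MAX_MAP_CHUNK - slop)
			clen = MAX_MAP_CHUNK - slop;	/* fixmap window limit */
		p = early_ioremap(phys & PAGE_MASK, clen + slop);
		memcpy(dst, p + slop, clen);
		early_iounmap(p, clen + slop);		/* free the slot */
		dst  += clen;
		phys += clen;
		size -= clen;
	}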
6916 | void __init setup_bootmem_allocator(void) | |
6917 | { | |
6918 | unsigned long bootmap_size; | |
6919 | @@ -478,14 +648,15 @@ void __init setup_bootmem_allocator(void | |
6920 | * bootmem allocator with an invalid RAM area. | |
6921 | */ | |
6922 | reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) + | |
6923 | - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text)); | |
6924 | + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text), | |
6925 | + BOOTMEM_DEFAULT); | |
6926 | ||
6927 | #ifndef CONFIG_XEN | |
6928 | /* | |
6929 | * reserve physical page 0 - it's a special BIOS page on many boxes, | |
6930 | * enabling clean reboots, SMP operation, laptop functions. | |
6931 | */ | |
6932 | - reserve_bootmem(0, PAGE_SIZE); | |
6933 | + reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT); | |
6934 | ||
6935 | /* reserve EBDA region, it's a 4K region */ | |
6936 | reserve_ebda_region(); | |
6937 | @@ -495,7 +666,7 @@ void __init setup_bootmem_allocator(void | |
6938 | unless you have no PS/2 mouse plugged in. */ | |
6939 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | |
6940 | boot_cpu_data.x86 == 6) | |
6941 | - reserve_bootmem(0xa0000 - 4096, 4096); | |
6942 | + reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT); | |
6943 | ||
6944 | #ifdef CONFIG_SMP | |
6945 | /* | |
6946 | @@ -503,7 +674,7 @@ void __init setup_bootmem_allocator(void | |
6947 | * FIXME: Don't need the extra page at 4K, but need to fix | |
6948 | * trampoline before removing it. (see the GDT stuff) | |
6949 | */ | |
6950 | - reserve_bootmem(PAGE_SIZE, PAGE_SIZE); | |
6951 | + reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT); | |
6952 | #endif | |
6953 | #ifdef CONFIG_ACPI_SLEEP | |
6954 | /* | |
6955 | @@ -511,29 +682,12 @@ void __init setup_bootmem_allocator(void | |
6956 | */ | |
6957 | acpi_reserve_bootmem(); | |
6958 | #endif | |
6959 | - numa_kva_reserve(); | |
6960 | #endif /* !CONFIG_XEN */ | |
6961 | ||
6962 | #ifdef CONFIG_BLK_DEV_INITRD | |
6963 | - if (xen_start_info->mod_start) { | |
6964 | - unsigned long ramdisk_image = __pa(xen_start_info->mod_start); | |
6965 | - unsigned long ramdisk_size = xen_start_info->mod_len; | |
6966 | - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; | |
6967 | - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; | |
6968 | - | |
6969 | - if (ramdisk_end <= end_of_lowmem) { | |
6970 | - /*reserve_bootmem(ramdisk_image, ramdisk_size);*/ | |
6971 | - initrd_start = ramdisk_image + PAGE_OFFSET; | |
6972 | - initrd_end = initrd_start+ramdisk_size; | |
6973 | - initrd_below_start_ok = 1; | |
6974 | - } else { | |
6975 | - printk(KERN_ERR "initrd extends beyond end of memory " | |
6976 | - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", | |
6977 | - ramdisk_end, end_of_lowmem); | |
6978 | - initrd_start = 0; | |
6979 | - } | |
6980 | - } | |
6981 | + reserve_initrd(); | |
6982 | #endif | |
6983 | + numa_kva_reserve(); | |
6984 | reserve_crashkernel(); | |
6985 | } | |
6986 | ||
6987 | @@ -600,20 +754,14 @@ void __init setup_arch(char **cmdline_p) | |
6988 | memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); | |
6989 | pre_setup_arch_hook(); | |
6990 | early_cpu_init(); | |
6991 | + early_ioremap_init(); | |
6992 | #ifdef CONFIG_SMP | |
6993 | prefill_possible_map(); | |
6994 | #endif | |
6995 | ||
6996 | - /* | |
6997 | - * FIXME: This isn't an official loader_type right | |
6998 | - * now but does currently work with elilo. | |
6999 | - * If we were configured as an EFI kernel, check to make | |
7000 | - * sure that we were loaded correctly from elilo and that | |
7001 | - * the system table is valid. If not, then initialize normally. | |
7002 | - */ | |
7003 | #ifdef CONFIG_EFI | |
7004 | - if ((boot_params.hdr.type_of_loader == 0x50) && | |
7005 | - boot_params.efi_info.efi_systab) | |
7006 | + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, | |
7007 | + "EL32", 4)) | |
7008 | efi_enabled = 1; | |
7009 | #endif | |
7010 | ||
7011 | @@ -653,12 +801,9 @@ void __init setup_arch(char **cmdline_p) | |
7012 | #endif | |
7013 | ||
7014 | ARCH_SETUP | |
7015 | - if (efi_enabled) | |
7016 | - efi_init(); | |
7017 | - else { | |
7018 | - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | |
7019 | - print_memory_map(memory_setup()); | |
7020 | - } | |
7021 | + | |
7022 | + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | |
7023 | + print_memory_map(memory_setup()); | |
7024 | ||
7025 | copy_edd(); | |
7026 | ||
7027 | @@ -691,6 +836,17 @@ void __init setup_arch(char **cmdline_p) | |
7028 | strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); | |
7029 | *cmdline_p = command_line; | |
7030 | ||
7031 | + if (efi_enabled) | |
7032 | + efi_init(); | |
7033 | + | |
7034 | + /* update e820 for memory not covered by WB MTRRs */ | |
7035 | + find_max_pfn(); | |
7036 | + mtrr_bp_init(); | |
7037 | +#ifndef CONFIG_XEN | |
7038 | + if (mtrr_trim_uncached_memory(max_pfn)) | |
7039 | + find_max_pfn(); | |
7040 | +#endif | |
7041 | + | |
7042 | max_low_pfn = setup_memory(); | |
7043 | ||
7044 | #ifdef CONFIG_VMI | |
7045 | @@ -715,6 +871,16 @@ void __init setup_arch(char **cmdline_p) | |
7046 | smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ | |
7047 | #endif | |
7048 | paging_init(); | |
7049 | + | |
7050 | + /* | |
7051 | + * NOTE: On x86-32, only from this point on, fixmaps are ready for use. | |
7052 | + */ | |
7053 | + | |
7054 | +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT | |
7055 | + if (init_ohci1394_dma_early) | |
7056 | + init_ohci1394_dma_on_all_controllers(); | |
7057 | +#endif | |
7058 | + | |
7059 | remapped_pgdat_init(); | |
7060 | sparse_init(); | |
7061 | zone_sizes_init(); | |
7062 | @@ -800,16 +966,20 @@ void __init setup_arch(char **cmdline_p) | |
7063 | * NOTE: at this point the bootmem allocator is fully available. | |
7064 | */ | |
7065 | ||
7066 | +#ifdef CONFIG_BLK_DEV_INITRD | |
7067 | + relocate_initrd(); | |
7068 | +#endif | |
7069 | + | |
7070 | paravirt_post_allocator_init(); | |
7071 | ||
7072 | if (is_initial_xendomain()) | |
7073 | dmi_scan_machine(); | |
7074 | ||
7075 | + io_delay_init(); | |
7076 | + | |
7077 | #ifdef CONFIG_X86_GENERICARCH | |
7078 | generic_apic_probe(); | |
7079 | -#endif | |
7080 | - if (efi_enabled) | |
7081 | - efi_map_memmap(); | |
7082 | +#endif | |
7083 | ||
7084 | set_iopl.iopl = 1; | |
7085 | WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); | |
7086 | @@ -827,7 +997,7 @@ void __init setup_arch(char **cmdline_p) | |
7087 | acpi_boot_table_init(); | |
7088 | #endif | |
7089 | ||
7090 | -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN) | |
7091 | +#ifndef CONFIG_XEN | |
7092 | early_quirks(); | |
7093 | #endif | |
7094 | ||
7095 | @@ -873,3 +1043,30 @@ xen_panic_event(struct notifier_block *t | |
7096 | /* we're never actually going to get here... */ | |
7097 | return NOTIFY_DONE; | |
7098 | } | |
7099 | + | |
7100 | +/* | |
7101 | + * Request address space for all standard resources | |
7102 | + * | |
7103 | + * This is called just before pcibios_init(), which is also a | |
7104 | + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). | |
7105 | + */ | |
7106 | +static int __init request_standard_resources(void) | |
7107 | +{ | |
7108 | + int i; | |
7109 | + | |
7110 | + /* Nothing to do if not running in dom0. */ | |
7111 | + if (!is_initial_xendomain()) | |
7112 | + return 0; | |
7113 | + | |
7114 | + printk(KERN_INFO "Setting up standard PCI resources\n"); | |
7115 | + init_iomem_resources(&code_resource, &data_resource, &bss_resource); | |
7116 | + | |
7117 | + request_resource(&iomem_resource, &video_ram_resource); | |
7118 | + | |
7119 | + /* request I/O space for devices used on all i[345]86 PCs */ | |
7120 | + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) | |
7121 | + request_resource(&ioport_resource, &standard_io_resources[i]); | |
7122 | + return 0; | |
7123 | +} | |
7124 | + | |
7125 | +subsys_initcall(request_standard_resources); | |
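request_standard_resources() is registered as a subsys_initcall so it runs just before pcibios_init() (also a subsys_initcall, linked later), and it bails out in domU where dom0 owns the legacy ranges. Claiming one legacy port range looks like the sketch below; the resource itself is illustrative, and request_resource() returns 0 on success or -EBUSY on conflict:

	/* Illustrative sketch, not part of the patch. */
	static struct resource example_ports = {
		.name  = "example",
		.start = 0x0070,	/* CMOS/RTC index+data, for instance */
		.end   = 0x0071,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO,
	};

	static int __init claim_example_ports(void)
	{
		return request_resource(&ioport_resource, &example_ports);
	}
	subsys_initcall(claim_example_ports);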
7126 | --- a/arch/x86/kernel/setup_64-xen.c | |
7127 | +++ b/arch/x86/kernel/setup_64-xen.c | |
7128 | @@ -15,7 +15,6 @@ | |
7129 | #include <linux/ptrace.h> | |
7130 | #include <linux/slab.h> | |
7131 | #include <linux/user.h> | |
7132 | -#include <linux/a.out.h> | |
7133 | #include <linux/screen_info.h> | |
7134 | #include <linux/ioport.h> | |
7135 | #include <linux/delay.h> | |
7136 | @@ -30,6 +29,7 @@ | |
7137 | #include <linux/crash_dump.h> | |
7138 | #include <linux/root_dev.h> | |
7139 | #include <linux/pci.h> | |
7140 | +#include <linux/efi.h> | |
7141 | #include <linux/acpi.h> | |
7142 | #include <linux/kallsyms.h> | |
7143 | #include <linux/edd.h> | |
7144 | @@ -39,10 +39,13 @@ | |
7145 | #include <linux/dmi.h> | |
7146 | #include <linux/dma-mapping.h> | |
7147 | #include <linux/ctype.h> | |
7148 | +#include <linux/uaccess.h> | |
7149 | +#include <linux/init_ohci1394_dma.h> | |
7150 | ||
7151 | #include <asm/mtrr.h> | |
7152 | #include <asm/uaccess.h> | |
7153 | #include <asm/system.h> | |
7154 | +#include <asm/vsyscall.h> | |
7155 | #include <asm/io.h> | |
7156 | #include <asm/smp.h> | |
7157 | #include <asm/msr.h> | |
7158 | @@ -50,6 +53,7 @@ | |
7159 | #include <video/edid.h> | |
7160 | #include <asm/e820.h> | |
7161 | #include <asm/dma.h> | |
7162 | +#include <asm/gart.h> | |
7163 | #include <asm/mpspec.h> | |
7164 | #include <asm/mmu_context.h> | |
7165 | #include <asm/proto.h> | |
7166 | @@ -59,6 +63,9 @@ | |
7167 | #include <asm/sections.h> | |
7168 | #include <asm/dmi.h> | |
7169 | #include <asm/cacheflush.h> | |
7170 | +#include <asm/mce.h> | |
7171 | +#include <asm/ds.h> | |
7172 | +#include <asm/topology.h> | |
7173 | #ifdef CONFIG_XEN | |
7174 | #include <linux/percpu.h> | |
7175 | #include <xen/interface/physdev.h> | |
7176 | @@ -108,6 +115,8 @@ EXPORT_SYMBOL(xen_start_info); | |
7177 | struct cpuinfo_x86 boot_cpu_data __read_mostly; | |
7178 | EXPORT_SYMBOL(boot_cpu_data); | |
7179 | ||
7180 | +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; | |
7181 | + | |
7182 | unsigned long mmu_cr4_features; | |
7183 | ||
7184 | /* Boot loader ID as an integer, for the benefit of proc_dointvec */ | |
7185 | @@ -117,7 +126,7 @@ unsigned long saved_video_mode; | |
7186 | ||
7187 | int force_mwait __cpuinitdata; | |
7188 | ||
7189 | -/* | |
7190 | +/* | |
7191 | * Early DMI memory | |
7192 | */ | |
7193 | int dmi_alloc_index; | |
7194 | @@ -163,25 +172,27 @@ struct resource standard_io_resources[] | |
7195 | ||
7196 | #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) | |
7197 | ||
7198 | -struct resource data_resource = { | |
7199 | +static struct resource data_resource = { | |
7200 | .name = "Kernel data", | |
7201 | .start = 0, | |
7202 | .end = 0, | |
7203 | .flags = IORESOURCE_RAM, | |
7204 | }; | |
7205 | -struct resource code_resource = { | |
7206 | +static struct resource code_resource = { | |
7207 | .name = "Kernel code", | |
7208 | .start = 0, | |
7209 | .end = 0, | |
7210 | .flags = IORESOURCE_RAM, | |
7211 | }; | |
7212 | -struct resource bss_resource = { | |
7213 | +static struct resource bss_resource = { | |
7214 | .name = "Kernel bss", | |
7215 | .start = 0, | |
7216 | .end = 0, | |
7217 | .flags = IORESOURCE_RAM, | |
7218 | }; | |
7219 | ||
7220 | +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); | |
7221 | + | |
7222 | #ifdef CONFIG_PROC_VMCORE | |
7223 | /* elfcorehdr= specifies the location of elf core header | |
7224 | * stored by the crashed kernel. This option will be passed | |
7225 | @@ -205,9 +216,10 @@ contig_initmem_init(unsigned long start_ | |
7226 | unsigned long bootmap_size, bootmap; | |
7227 | ||
7228 | bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; | |
7229 | - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); | |
7230 | + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size, | |
7231 | + PAGE_SIZE); | |
7232 | if (bootmap == -1L) | |
7233 | - panic("Cannot find bootmem map of size %ld\n",bootmap_size); | |
7234 | + panic("Cannot find bootmem map of size %ld\n", bootmap_size); | |
7235 | bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); | |
7236 | e820_register_active_regions(0, start_pfn, end_pfn); | |
7237 | #ifdef CONFIG_XEN | |
7238 | @@ -215,8 +227,8 @@ contig_initmem_init(unsigned long start_ | |
7239 | #else | |
7240 | free_bootmem_with_active_regions(0, end_pfn); | |
7241 | #endif | |
7242 | - reserve_bootmem(bootmap, bootmap_size); | |
7243 | -} | |
7244 | + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); | |
7245 | +} | |
7246 | #endif | |
7247 | ||
7248 | #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) | |
7249 | @@ -249,27 +261,35 @@ static inline void copy_edd(void) | |
7250 | #ifndef CONFIG_XEN | |
7251 | static void __init reserve_crashkernel(void) | |
7252 | { | |
7253 | - unsigned long long free_mem; | |
7254 | + unsigned long long total_mem; | |
7255 | unsigned long long crash_size, crash_base; | |
7256 | int ret; | |
7257 | ||
7258 | - free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; | |
7259 | + total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; | |
7260 | ||
7261 | - ret = parse_crashkernel(boot_command_line, free_mem, | |
7262 | + ret = parse_crashkernel(boot_command_line, total_mem, | |
7263 | &crash_size, &crash_base); | |
7264 | if (ret == 0 && crash_size) { | |
7265 | - if (crash_base > 0) { | |
7266 | - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " | |
7267 | - "for crashkernel (System RAM: %ldMB)\n", | |
7268 | - (unsigned long)(crash_size >> 20), | |
7269 | - (unsigned long)(crash_base >> 20), | |
7270 | - (unsigned long)(free_mem >> 20)); | |
7271 | - crashk_res.start = crash_base; | |
7272 | - crashk_res.end = crash_base + crash_size - 1; | |
7273 | - reserve_bootmem(crash_base, crash_size); | |
7274 | - } else | |
7275 | + if (crash_base <= 0) { | |
7276 | printk(KERN_INFO "crashkernel reservation failed - " | |
7277 | "you have to specify a base address\n"); | |
7278 | + return; | |
7279 | + } | |
7280 | + | |
7281 | + if (reserve_bootmem(crash_base, crash_size, | |
7282 | + BOOTMEM_EXCLUSIVE) < 0) { | |
7283 | + printk(KERN_INFO "crashkernel reservation failed - " | |
7284 | + "memory is in use\n"); | |
7285 | + return; | |
7286 | + } | |
7287 | + | |
7288 | + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " | |
7289 | + "for crashkernel (System RAM: %ldMB)\n", | |
7290 | + (unsigned long)(crash_size >> 20), | |
7291 | + (unsigned long)(crash_base >> 20), | |
7292 | + (unsigned long)(total_mem >> 20)); | |
7293 | + crashk_res.start = crash_base; | |
7294 | + crashk_res.end = crash_base + crash_size - 1; | |
7295 | } | |
7296 | } | |
7297 | #else | |
7298 | @@ -280,37 +300,21 @@ static inline void __init reserve_crashk | |
7299 | {} | |
7300 | #endif | |
7301 | ||
7302 | -#ifndef CONFIG_XEN | |
7303 | -#define EBDA_ADDR_POINTER 0x40E | |
7304 | - | |
7305 | -unsigned __initdata ebda_addr; | |
7306 | -unsigned __initdata ebda_size; | |
7307 | - | |
7308 | -static void discover_ebda(void) | |
7309 | +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ | |
7310 | +void __attribute__((weak)) __init memory_setup(void) | |
7311 | { | |
7312 | - /* | |
7313 | - * there is a real-mode segmented pointer pointing to the | |
7314 | - * 4K EBDA area at 0x40E | |
7315 | - */ | |
7316 | - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); | |
7317 | - ebda_addr <<= 4; | |
7318 | - | |
7319 | - ebda_size = *(unsigned short *)__va(ebda_addr); | |
7320 | - | |
7321 | - /* Round EBDA up to pages */ | |
7322 | - if (ebda_size == 0) | |
7323 | - ebda_size = 1; | |
7324 | - ebda_size <<= 10; | |
7325 | - ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); | |
7326 | - if (ebda_size > 64*1024) | |
7327 | - ebda_size = 64*1024; | |
7328 | + machine_specific_memory_setup(); | |
7329 | } | |
7330 | -#else | |
7331 | -#define discover_ebda() ((void)0) | |
7332 | -#endif | |
7333 | ||
7334 | +/* | |
7335 | + * setup_arch - architecture-specific boot-time initializations | |
7336 | + * | |
7337 | + * Note: On x86_64, fixmaps are ready for use even before this is called. | |
7338 | + */ | |
7339 | void __init setup_arch(char **cmdline_p) | |
7340 | { | |
7341 | + unsigned i; | |
7342 | + | |
7343 | #ifdef CONFIG_XEN | |
7344 | extern struct e820map machine_e820; | |
7345 | ||
7346 | @@ -319,6 +323,11 @@ void __init setup_arch(char **cmdline_p) | |
7347 | /* Register a call for panic conditions. */ | |
7348 | atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); | |
7349 | ||
7350 | + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, | |
7351 | + VMASST_TYPE_writable_pagetables)); | |
7352 | + | |
7353 | + early_ioremap_init(); | |
7354 | + | |
7355 | ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); | |
7356 | screen_info = boot_params.screen_info; | |
7357 | ||
7358 | @@ -335,11 +344,6 @@ void __init setup_arch(char **cmdline_p) | |
7359 | screen_info.orig_video_isVGA = 0; | |
7360 | ||
7361 | copy_edid(); | |
7362 | - | |
7363 | - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, | |
7364 | - VMASST_TYPE_writable_pagetables)); | |
7365 | - | |
7366 | - ARCH_SETUP | |
7367 | #else | |
7368 | printk(KERN_INFO "Command line: %s\n", boot_command_line); | |
7369 | ||
7370 | @@ -355,7 +359,15 @@ void __init setup_arch(char **cmdline_p) | |
7371 | rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); | |
7372 | rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); | |
7373 | #endif | |
7374 | - setup_memory_region(); | |
7375 | +#ifdef CONFIG_EFI | |
7376 | + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, | |
7377 | + "EL64", 4)) | |
7378 | + efi_enabled = 1; | |
7379 | +#endif | |
7380 | + | |
7381 | + ARCH_SETUP | |
7382 | + | |
7383 | + memory_setup(); | |
7384 | copy_edd(); | |
7385 | ||
7386 | if (!boot_params.hdr.root_flags) | |
7387 | @@ -379,28 +391,51 @@ void __init setup_arch(char **cmdline_p) | |
7388 | ||
7389 | parse_early_param(); | |
7390 | ||
7391 | +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT | |
7392 | + if (init_ohci1394_dma_early) | |
7393 | + init_ohci1394_dma_on_all_controllers(); | |
7394 | +#endif | |
7395 | + | |
7396 | finish_e820_parsing(); | |
7397 | ||
7398 | + early_gart_iommu_check(); | |
7399 | + | |
7400 | e820_register_active_regions(0, 0, -1UL); | |
7401 | /* | |
7402 | * partially used pages are not usable - thus | |
7403 | * we are rounding upwards: | |
7404 | */ | |
7405 | end_pfn = e820_end_of_ram(); | |
7406 | + /* update e820 for memory not covered by WB MTRRs */ | |
7407 | + mtrr_bp_init(); | |
7408 | +#ifndef CONFIG_XEN | |
7409 | + if (mtrr_trim_uncached_memory(end_pfn)) { | |
7410 | + e820_register_active_regions(0, 0, -1UL); | |
7411 | + end_pfn = e820_end_of_ram(); | |
7412 | + } | |
7413 | +#endif | |
7414 | + | |
7415 | num_physpages = end_pfn; | |
7416 | + max_mapnr = end_pfn; | |
7417 | ||
7418 | check_efer(); | |
7419 | ||
7420 | - discover_ebda(); | |
7421 | - | |
7422 | init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); | |
7423 | + if (efi_enabled) | |
7424 | + efi_init(); | |
7425 | ||
7426 | if (is_initial_xendomain()) | |
7427 | dmi_scan_machine(); | |
7428 | ||
7429 | + io_delay_init(); | |
7430 | + | |
7431 | #if defined(CONFIG_SMP) && !defined(CONFIG_XEN) | |
7432 | - /* setup to use the static apicid table during kernel startup */ | |
7433 | - x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init; | |
7434 | + /* setup to use the early static init tables during kernel startup */ | |
7435 | + x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; | |
7436 | + x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; | |
7437 | +#ifdef CONFIG_NUMA | |
7438 | + x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; | |
7439 | +#endif | |
7440 | #endif | |
7441 | ||
7442 | /* How many end-of-memory variables you have, grandma! */ | |
7443 | @@ -419,54 +454,25 @@ void __init setup_arch(char **cmdline_p) | |
7444 | #endif | |
7445 | ||
7446 | #ifdef CONFIG_NUMA | |
7447 | - numa_initmem_init(0, end_pfn); | |
7448 | + numa_initmem_init(0, end_pfn); | |
7449 | #else | |
7450 | contig_initmem_init(0, end_pfn); | |
7451 | #endif | |
7452 | ||
7453 | -#ifdef CONFIG_XEN | |
7454 | - /* | |
7455 | - * Reserve kernel, physmap, start info, initial page tables, and | |
7456 | - * direct mapping. | |
7457 | - */ | |
7458 | - reserve_bootmem_generic(__pa_symbol(&_text), | |
7459 | - (table_end << PAGE_SHIFT) - __pa_symbol(&_text)); | |
7460 | -#else | |
7461 | - /* Reserve direct mapping */ | |
7462 | - reserve_bootmem_generic(table_start << PAGE_SHIFT, | |
7463 | - (table_end - table_start) << PAGE_SHIFT); | |
7464 | - | |
7465 | - /* reserve kernel */ | |
7466 | - reserve_bootmem_generic(__pa_symbol(&_text), | |
7467 | - __pa_symbol(&_end) - __pa_symbol(&_text)); | |
7468 | + early_res_to_bootmem(); | |
7469 | ||
7470 | +#ifndef CONFIG_XEN | |
7471 | +#ifdef CONFIG_ACPI_SLEEP | |
7472 | /* | |
7473 | - * reserve physical page 0 - it's a special BIOS page on many boxes, | |
7474 | - * enabling clean reboots, SMP operation, laptop functions. | |
7475 | + * Reserve low memory region for sleep support. | |
7476 | */ | |
7477 | - reserve_bootmem_generic(0, PAGE_SIZE); | |
7478 | - | |
7479 | - /* reserve ebda region */ | |
7480 | - if (ebda_addr) | |
7481 | - reserve_bootmem_generic(ebda_addr, ebda_size); | |
7482 | -#ifdef CONFIG_NUMA | |
7483 | - /* reserve nodemap region */ | |
7484 | - if (nodemap_addr) | |
7485 | - reserve_bootmem_generic(nodemap_addr, nodemap_size); | |
7486 | + acpi_reserve_bootmem(); | |
7487 | #endif | |
7488 | ||
7489 | -#ifdef CONFIG_SMP | |
7490 | - /* Reserve SMP trampoline */ | |
7491 | - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); | |
7492 | -#endif | |
7493 | + if (efi_enabled) | |
7494 | + efi_reserve_bootmem(); | |
7495 | #endif | |
7496 | ||
7497 | -#ifdef CONFIG_ACPI_SLEEP | |
7498 | - /* | |
7499 | - * Reserve low memory region for sleep support. | |
7500 | - */ | |
7501 | - acpi_reserve_bootmem(); | |
7502 | -#endif | |
7503 | #ifdef CONFIG_BLK_DEV_INITRD | |
7504 | #ifdef CONFIG_XEN | |
7505 | if (xen_start_info->mod_start) { | |
7506 | @@ -490,6 +496,8 @@ void __init setup_arch(char **cmdline_p) | |
7507 | initrd_below_start_ok = 1; | |
7508 | #endif | |
7509 | } else { | |
7510 | + /* Assumes everything on node 0 */ | |
7511 | + free_bootmem(ramdisk_image, ramdisk_size); | |
7512 | printk(KERN_ERR "initrd extends beyond end of memory " | |
7513 | "(0x%08lx > 0x%08lx)\ndisabling initrd\n", | |
7514 | ramdisk_end, end_of_mem); | |
7515 | @@ -499,10 +507,11 @@ void __init setup_arch(char **cmdline_p) | |
7516 | #endif | |
7517 | reserve_crashkernel(); | |
7518 | paging_init(); | |
7519 | + map_vsyscall(); | |
7520 | #ifdef CONFIG_X86_LOCAL_APIC | |
7521 | /* | |
7522 | - * Find and reserve possible boot-time SMP configuration: | |
7523 | - */ | |
7524 | + * Find and reserve possible boot-time SMP configuration: | |
7525 | + */ | |
7526 | find_smp_config(); | |
7527 | #endif | |
7528 | #ifdef CONFIG_XEN | |
7529 | @@ -590,16 +599,10 @@ void __init setup_arch(char **cmdline_p) | |
7530 | #endif | |
7531 | #endif | |
7532 | ||
7533 | -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN) | |
7534 | +#ifndef CONFIG_XEN | |
7535 | early_quirks(); | |
7536 | #endif | |
7537 | ||
7538 | - /* | |
7539 | - * set this early, so we dont allocate cpu0 | |
7540 | - * if MADT list doesnt list BSP first | |
7541 | - * mpparse.c/MP_processor_info() allocates logical cpu numbers. | |
7542 | - */ | |
7543 | - cpu_set(0, cpu_present_map); | |
7544 | #ifdef CONFIG_ACPI | |
7545 | /* | |
7546 | * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). | |
7547 | @@ -623,6 +626,7 @@ void __init setup_arch(char **cmdline_p) | |
7548 | get_smp_config(); | |
7549 | #ifndef CONFIG_XEN | |
7550 | init_apic_mappings(); | |
7551 | + ioapic_init_mappings(); | |
7552 | #endif | |
7553 | #endif | |
7554 | #if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU) | |
7555 | @@ -634,18 +638,17 @@ void __init setup_arch(char **cmdline_p) | |
7556 | */ | |
7557 | #ifdef CONFIG_XEN | |
7558 | if (is_initial_xendomain()) | |
7559 | - e820_reserve_resources(machine_e820.map, machine_e820.nr_map); | |
7560 | + e820_reserve_resources(machine_e820.map, machine_e820.nr_map, | |
7561 | + &code_resource, &data_resource, &bss_resource); | |
7562 | #else | |
7563 | - e820_reserve_resources(e820.map, e820.nr_map); | |
7564 | + e820_reserve_resources(e820.map, e820.nr_map, | |
7565 | + &code_resource, &data_resource, &bss_resource); | |
7566 | e820_mark_nosave_regions(); | |
7567 | #endif | |
7568 | ||
7569 | - { | |
7570 | - unsigned i; | |
7571 | /* request I/O space for devices used on all i[345]86 PCs */ | |
7572 | for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) | |
7573 | request_resource(&ioport_resource, &standard_io_resources[i]); | |
7574 | - } | |
7575 | ||
7576 | #ifdef CONFIG_XEN | |
7577 | if (is_initial_xendomain()) | |
7578 | @@ -679,7 +682,8 @@ void __init setup_arch(char **cmdline_p) | |
7579 | ||
7580 | #ifdef CONFIG_VT | |
7581 | #if defined(CONFIG_VGA_CONSOLE) | |
7582 | - conswitchp = &vga_con; | |
7583 | + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) | |
7584 | + conswitchp = &vga_con; | |
7585 | #elif defined(CONFIG_DUMMY_CONSOLE) | |
7586 | conswitchp = &dummy_con; | |
7587 | #endif | |
7588 | @@ -723,9 +727,10 @@ static void __cpuinit display_cacheinfo( | |
7589 | ||
7590 | if (n >= 0x80000005) { | |
7591 | cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); | |
7592 | - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", | |
7593 | - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | |
7594 | - c->x86_cache_size=(ecx>>24)+(edx>>24); | |
7595 | + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " | |
7596 | + "D cache %dK (%d bytes/line)\n", | |
7597 | + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | |
7598 | + c->x86_cache_size = (ecx>>24) + (edx>>24); | |
7599 | /* On K8 L1 TLB is inclusive, so don't count it */ | |
7600 | c->x86_tlbsize = 0; | |
7601 | } | |
7602 | @@ -739,27 +744,25 @@ static void __cpuinit display_cacheinfo( | |
7603 | printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", | |
7604 | c->x86_cache_size, ecx & 0xFF); | |
7605 | } | |
7606 | - | |
7607 | - if (n >= 0x80000007) | |
7608 | - cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); | |
7609 | if (n >= 0x80000008) { | |
7610 | - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); | |
7611 | + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); | |
7612 | c->x86_virt_bits = (eax >> 8) & 0xff; | |
7613 | c->x86_phys_bits = eax & 0xff; | |
7614 | } | |
7615 | } | |
7616 | ||
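The display_cacheinfo() cleanup above leaves the CPUID decoding itself unchanged: extended leaf 0x80000005 reports the L1 caches (EDX for instruction, ECX for data; size in KB in bits 31..24, line size in bits 7..0), and leaf 0x80000008 EAX carries the physical/virtual address widths. A hedged, stand-alone sketch of the same field layout; cpuid() here is a local helper, not the kernel's:

#include <stdio.h>

static void cpuid(unsigned op, unsigned *a, unsigned *b,
		  unsigned *c, unsigned *d)
{
	asm volatile("cpuid"
		     : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
		     : "0" (op));
}

static void show_amd_cacheinfo(void)
{
	unsigned eax, ebx, ecx, edx, n;

	cpuid(0x80000000, &n, &ebx, &ecx, &edx);	/* max extended leaf */
	if (n >= 0x80000005) {
		cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
		/* L1 I in EDX, L1 D in ECX: KB in 31..24, line in 7..0 */
		printf("L1 I: %uK (%u bytes/line), D: %uK (%u bytes/line)\n",
		       edx >> 24, edx & 0xff, ecx >> 24, ecx & 0xff);
	}
	if (n >= 0x80000008) {
		cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
		printf("address sizes: %u bits physical, %u bits virtual\n",
		       eax & 0xff, (eax >> 8) & 0xff);
	}
}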
7617 | #ifdef CONFIG_NUMA | |
7618 | -static int nearby_node(int apicid) | |
7619 | +static int __cpuinit nearby_node(int apicid) | |
7620 | { | |
7621 | - int i; | |
7622 | + int i, node; | |
7623 | + | |
7624 | for (i = apicid - 1; i >= 0; i--) { | |
7625 | - int node = apicid_to_node[i]; | |
7626 | + node = apicid_to_node[i]; | |
7627 | if (node != NUMA_NO_NODE && node_online(node)) | |
7628 | return node; | |
7629 | } | |
7630 | for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { | |
7631 | - int node = apicid_to_node[i]; | |
7632 | + node = apicid_to_node[i]; | |
7633 | if (node != NUMA_NO_NODE && node_online(node)) | |
7634 | return node; | |
7635 | } | |
7636 | @@ -771,7 +774,7 @@ static int nearby_node(int apicid) | |
7637 | * On an AMD dual core setup the lower bits of the APIC id distinguish the cores. | |
7638 | * Assumes number of cores is a power of two. | |
7639 | */ | |
7640 | -static void __init amd_detect_cmp(struct cpuinfo_x86 *c) | |
7641 | +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) | |
7642 | { | |
7643 | #ifdef CONFIG_SMP | |
7644 | unsigned bits; | |
7645 | @@ -780,7 +783,54 @@ static void __init amd_detect_cmp(struct | |
7646 | int node = 0; | |
7647 | unsigned apicid = hard_smp_processor_id(); | |
7648 | #endif | |
7649 | - unsigned ecx = cpuid_ecx(0x80000008); | |
7650 | + bits = c->x86_coreid_bits; | |
7651 | + | |
7652 | + /* Low order bits define the core id (index of core in socket) */ | |
7653 | + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); | |
7654 | + /* Convert the APIC ID into the socket ID */ | |
7655 | + c->phys_proc_id = phys_pkg_id(bits); | |
7656 | + | |
7657 | +#ifdef CONFIG_NUMA | |
7658 | + node = c->phys_proc_id; | |
7659 | + if (apicid_to_node[apicid] != NUMA_NO_NODE) | |
7660 | + node = apicid_to_node[apicid]; | |
7661 | + if (!node_online(node)) { | |
7662 | + /* Two possibilities here: | |
7663 | + - The CPU is missing memory and no node was created. | |
7664 | + In that case try picking one from a nearby CPU | |
7665 | + - The APIC IDs differ from the HyperTransport node IDs | |
7666 | + which the K8 northbridge parsing fills in. | |
7667 | + Assume they are all increased by a constant offset, | |
7668 | + but in the same order as the HT nodeids. | |
7669 | + If that doesn't result in a usable node fall back to the | |
7670 | + path for the previous case. */ | |
7671 | + | |
7672 | + int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); | |
7673 | + | |
7674 | + if (ht_nodeid >= 0 && | |
7675 | + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) | |
7676 | + node = apicid_to_node[ht_nodeid]; | |
7677 | + /* Pick a nearby node */ | |
7678 | + if (!node_online(node)) | |
7679 | + node = nearby_node(apicid); | |
7680 | + } | |
7681 | + numa_set_node(cpu, node); | |
7682 | + | |
7683 | + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | |
7684 | +#endif | |
7685 | +#endif | |
7686 | +} | |
7687 | + | |
7688 | +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) | |
7689 | +{ | |
7690 | +#ifdef CONFIG_SMP | |
7691 | + unsigned bits, ecx; | |
7692 | + | |
7693 | + /* Multi core CPU? */ | |
7694 | + if (c->extended_cpuid_level < 0x80000008) | |
7695 | + return; | |
7696 | + | |
7697 | + ecx = cpuid_ecx(0x80000008); | |
7698 | ||
7699 | c->x86_max_cores = (ecx & 0xff) + 1; | |
7700 | ||
7701 | @@ -793,37 +843,8 @@ static void __init amd_detect_cmp(struct | |
7702 | bits++; | |
7703 | } | |
7704 | ||
7705 | - /* Low order bits define the core id (index of core in socket) */ | |
7706 | - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); | |
7707 | - /* Convert the APIC ID into the socket ID */ | |
7708 | - c->phys_proc_id = phys_pkg_id(bits); | |
7709 | - | |
7710 | -#ifdef CONFIG_NUMA | |
7711 | - node = c->phys_proc_id; | |
7712 | - if (apicid_to_node[apicid] != NUMA_NO_NODE) | |
7713 | - node = apicid_to_node[apicid]; | |
7714 | - if (!node_online(node)) { | |
7715 | - /* Two possibilities here: | |
7716 | - - The CPU is missing memory and no node was created. | |
7717 | - In that case try picking one from a nearby CPU | |
7718 | - - The APIC IDs differ from the HyperTransport node IDs | |
7719 | - which the K8 northbridge parsing fills in. | |
7720 | - Assume they are all increased by a constant offset, | |
7721 | - but in the same order as the HT nodeids. | |
7722 | - If that doesn't result in a usable node fall back to the | |
7723 | - path for the previous case. */ | |
7724 | - int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); | |
7725 | - if (ht_nodeid >= 0 && | |
7726 | - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) | |
7727 | - node = apicid_to_node[ht_nodeid]; | |
7728 | - /* Pick a nearby node */ | |
7729 | - if (!node_online(node)) | |
7730 | - node = nearby_node(apicid); | |
7731 | - } | |
7732 | - numa_set_node(cpu, node); | |
7733 | + c->x86_coreid_bits = bits; | |
7734 | ||
7735 | - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | |
7736 | -#endif | |
7737 | #endif | |
7738 | } | |
7739 | ||
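The split above moves the probe for c->x86_coreid_bits into the new early_init_amd_mc(), while amd_detect_cmp() only consumes it: ECX[7:0] of leaf 0x80000008 is the core count minus one, bits is that count rounded up to a power-of-two order, and the APIC id then splits into a core index (low bits) and a socket id (the rest). A hedged worked example with made-up sample values:

	unsigned ecx = 0x3;			/* leaf 0x80000008 ECX: 4 cores */
	unsigned apicid = 0x5;			/* example initial APIC id */
	unsigned max_cores = (ecx & 0xff) + 1;	/* 4 */
	unsigned bits = 0;

	while ((1u << bits) < max_cores)	/* round up to a power of two */
		bits++;				/* bits == 2 */

	unsigned core_id   = apicid & ((1u << bits) - 1);	/* 1: core in socket */
	unsigned socket_id = apicid >> bits;			/* 1: physical package */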
7740 | @@ -840,8 +861,8 @@ static void __init amd_detect_cmp(struct | |
7741 | /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ | |
7742 | static __cpuinit int amd_apic_timer_broken(void) | |
7743 | { | |
7744 | - u32 lo, hi; | |
7745 | - u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); | |
7746 | + u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); | |
7747 | + | |
7748 | switch (eax & CPUID_XFAM) { | |
7749 | case CPUID_XFAM_K8: | |
7750 | if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) | |
7751 | @@ -860,6 +881,15 @@ static __cpuinit int amd_apic_timer_brok | |
7752 | } | |
7753 | #endif | |
7754 | ||
7755 | +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | |
7756 | +{ | |
7757 | + early_init_amd_mc(c); | |
7758 | + | |
7759 | + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ | |
7760 | + if (c->x86_power & (1<<8)) | |
7761 | + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | |
7762 | +} | |
7763 | + | |
7764 | static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |
7765 | { | |
7766 | unsigned level; | |
7767 | @@ -870,7 +900,7 @@ static void __cpuinit init_amd(struct cp | |
7768 | /* | |
7769 | * Disable TLB flush filter by setting HWCR.FFDIS on K8 | |
7770 | * bit 6 of msr C001_0015 | |
7771 | - * | |
7772 | + * | |
7773 | * Errata 63 for SH-B3 steppings | |
7774 | * Errata 122 for all steppings (F+ have it disabled by default) | |
7775 | */ | |
7776 | @@ -883,35 +913,32 @@ static void __cpuinit init_amd(struct cp | |
7777 | ||
7778 | /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; | |
7779 | 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ | |
7780 | - clear_bit(0*32+31, &c->x86_capability); | |
7781 | - | |
7782 | + clear_bit(0*32+31, (unsigned long *)&c->x86_capability); | |
7783 | + | |
7784 | /* On C+ stepping K8 rep microcode works well for copy/memset */ | |
7785 | level = cpuid_eax(1); | |
7786 | - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) | |
7787 | - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); | |
7788 | + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || | |
7789 | + level >= 0x0f58)) | |
7790 | + set_cpu_cap(c, X86_FEATURE_REP_GOOD); | |
7791 | if (c->x86 == 0x10 || c->x86 == 0x11) | |
7792 | - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); | |
7793 | + set_cpu_cap(c, X86_FEATURE_REP_GOOD); | |
7794 | ||
7795 | /* Enable workaround for FXSAVE leak */ | |
7796 | if (c->x86 >= 6) | |
7797 | - set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); | |
7798 | + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); | |
7799 | ||
7800 | level = get_model_name(c); | |
7801 | if (!level) { | |
7802 | - switch (c->x86) { | |
7803 | + switch (c->x86) { | |
7804 | case 15: | |
7805 | /* Should distinguish Models here, but this is only | |
7806 | a fallback anyways. */ | |
7807 | strcpy(c->x86_model_id, "Hammer"); | |
7808 | - break; | |
7809 | - } | |
7810 | - } | |
7811 | + break; | |
7812 | + } | |
7813 | + } | |
7814 | display_cacheinfo(c); | |
7815 | ||
7816 | - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ | |
7817 | - if (c->x86_power & (1<<8)) | |
7818 | - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); | |
7819 | - | |
7820 | /* Multi core CPU? */ | |
7821 | if (c->extended_cpuid_level >= 0x80000008) | |
7822 | amd_detect_cmp(c); | |
7823 | @@ -923,14 +950,10 @@ static void __cpuinit init_amd(struct cp | |
7824 | num_cache_leaves = 3; | |
7825 | ||
7826 | if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) | |
7827 | - set_bit(X86_FEATURE_K8, &c->x86_capability); | |
7828 | - | |
7829 | - /* RDTSC can be speculated around */ | |
7830 | - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); | |
7831 | + set_cpu_cap(c, X86_FEATURE_K8); | |
7832 | ||
7833 | - /* Family 10 doesn't support C states in MWAIT so don't use it */ | |
7834 | - if (c->x86 == 0x10 && !force_mwait) | |
7835 | - clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); | |
7836 | + /* MFENCE stops RDTSC speculation */ | |
7837 | + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); | |
7838 | ||
7839 | #ifndef CONFIG_XEN | |
7840 | if (amd_apic_timer_broken()) | |
7841 | @@ -938,28 +961,29 @@ static void __cpuinit init_amd(struct cp | |
7842 | #endif | |
7843 | } | |
7844 | ||
7845 | -static void __cpuinit detect_ht(struct cpuinfo_x86 *c) | |
7846 | +void __cpuinit detect_ht(struct cpuinfo_x86 *c) | |
7847 | { | |
7848 | #ifdef CONFIG_SMP | |
7849 | - u32 eax, ebx, ecx, edx; | |
7850 | - int index_msb, core_bits; | |
7851 | + u32 eax, ebx, ecx, edx; | |
7852 | + int index_msb, core_bits; | |
7853 | ||
7854 | cpuid(1, &eax, &ebx, &ecx, &edx); | |
7855 | ||
7856 | ||
7857 | if (!cpu_has(c, X86_FEATURE_HT)) | |
7858 | return; | |
7859 | - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) | |
7860 | + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) | |
7861 | goto out; | |
7862 | ||
7863 | smp_num_siblings = (ebx & 0xff0000) >> 16; | |
7864 | ||
7865 | if (smp_num_siblings == 1) { | |
7866 | printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); | |
7867 | - } else if (smp_num_siblings > 1 ) { | |
7868 | + } else if (smp_num_siblings > 1) { | |
7869 | ||
7870 | if (smp_num_siblings > NR_CPUS) { | |
7871 | - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); | |
7872 | + printk(KERN_WARNING "CPU: Unsupported number of " | |
7873 | + "siblings %d", smp_num_siblings); | |
7874 | smp_num_siblings = 1; | |
7875 | return; | |
7876 | } | |
7877 | @@ -969,7 +993,7 @@ static void __cpuinit detect_ht(struct c | |
7878 | ||
7879 | smp_num_siblings = smp_num_siblings / c->x86_max_cores; | |
7880 | ||
7881 | - index_msb = get_count_order(smp_num_siblings) ; | |
7882 | + index_msb = get_count_order(smp_num_siblings); | |
7883 | ||
7884 | core_bits = get_count_order(c->x86_max_cores); | |
7885 | ||
7886 | @@ -978,8 +1002,10 @@ static void __cpuinit detect_ht(struct c | |
7887 | } | |
7888 | out: | |
7889 | if ((c->x86_max_cores * smp_num_siblings) > 1) { | |
7890 | - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); | |
7891 | - printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); | |
7892 | + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", | |
7893 | + c->phys_proc_id); | |
7894 | + printk(KERN_INFO "CPU: Processor Core ID: %d\n", | |
7895 | + c->cpu_core_id); | |
7896 | } | |
7897 | ||
7898 | #endif | |
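detect_ht() derives the topology from CPUID leaf 1: EBX[23:16] gives the logical processor count per package, which divided by x86_max_cores yields the threads per core, and get_count_order() of each count gives the bit widths used to slice the APIC id into package/core/thread fields. A hedged sketch under assumed counts (8 logical CPUs and 4 cores per package); count_order() stands in for get_count_order():

static int count_order(unsigned n)	/* smallest b with 2^b >= n */
{
	int b = 0;
	while ((1u << b) < n)
		b++;
	return b;
}

static void split_apicid(unsigned apicid)	/* e.g. apicid = 0x6 */
{
	unsigned siblings = 8;			/* CPUID 1 EBX[23:16], assumed */
	unsigned max_cores = 4;			/* assumed */
	int thread_bits = count_order(siblings / max_cores);	/* 1 */
	int core_bits = count_order(max_cores);			/* 2 */

	unsigned smt_id  = apicid & ((1u << thread_bits) - 1);		 /* 0 */
	unsigned core_id = (apicid >> thread_bits) & ((1u << core_bits) - 1); /* 3 */
	unsigned pkg_id  = apicid >> (thread_bits + core_bits);	 /* 0 */
}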
7899 | @@ -1003,7 +1029,7 @@ static int __cpuinit intel_num_cpu_cores | |
7900 | return 1; | |
7901 | } | |
7902 | ||
7903 | -static void srat_detect_node(void) | |
7904 | +static void __cpuinit srat_detect_node(void) | |
7905 | { | |
7906 | #ifdef CONFIG_NUMA | |
7907 | unsigned node; | |
7908 | @@ -1013,7 +1039,7 @@ static void srat_detect_node(void) | |
7909 | /* Don't do the funky fallback heuristics the AMD version employs | |
7910 | for now. */ | |
7911 | node = apicid_to_node[apicid]; | |
7912 | - if (node == NUMA_NO_NODE) | |
7913 | + if (node == NUMA_NO_NODE || !node_online(node)) | |
7914 | node = first_node(node_online_map); | |
7915 | numa_set_node(cpu, node); | |
7916 | ||
7917 | @@ -1021,28 +1047,39 @@ static void srat_detect_node(void) | |
7918 | #endif | |
7919 | } | |
7920 | ||
7921 | +static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) | |
7922 | +{ | |
7923 | + if ((c->x86 == 0xf && c->x86_model >= 0x03) || | |
7924 | + (c->x86 == 0x6 && c->x86_model >= 0x0e)) | |
7925 | + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); | |
7926 | +} | |
7927 | + | |
7928 | static void __cpuinit init_intel(struct cpuinfo_x86 *c) | |
7929 | { | |
7930 | /* Cache sizes */ | |
7931 | unsigned n; | |
7932 | ||
7933 | init_intel_cacheinfo(c); | |
7934 | - if (c->cpuid_level > 9 ) { | |
7935 | + if (c->cpuid_level > 9) { | |
7936 | unsigned eax = cpuid_eax(10); | |
7937 | /* Check for version and the number of counters */ | |
7938 | if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) | |
7939 | - set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); | |
7940 | + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); | |
7941 | } | |
7942 | ||
7943 | if (cpu_has_ds) { | |
7944 | unsigned int l1, l2; | |
7945 | rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); | |
7946 | if (!(l1 & (1<<11))) | |
7947 | - set_bit(X86_FEATURE_BTS, c->x86_capability); | |
7948 | + set_cpu_cap(c, X86_FEATURE_BTS); | |
7949 | if (!(l1 & (1<<12))) | |
7950 | - set_bit(X86_FEATURE_PEBS, c->x86_capability); | |
7951 | + set_cpu_cap(c, X86_FEATURE_PEBS); | |
7952 | } | |
7953 | ||
7954 | + | |
7955 | + if (cpu_has_bts) | |
7956 | + ds_init_intel(c); | |
7957 | + | |
7958 | n = c->extended_cpuid_level; | |
7959 | if (n >= 0x80000008) { | |
7960 | unsigned eax = cpuid_eax(0x80000008); | |
7961 | @@ -1059,14 +1096,11 @@ static void __cpuinit init_intel(struct | |
7962 | c->x86_cache_alignment = c->x86_clflush_size * 2; | |
7963 | if ((c->x86 == 0xf && c->x86_model >= 0x03) || | |
7964 | (c->x86 == 0x6 && c->x86_model >= 0x0e)) | |
7965 | - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); | |
7966 | + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | |
7967 | if (c->x86 == 6) | |
7968 | - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); | |
7969 | - if (c->x86 == 15) | |
7970 | - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); | |
7971 | - else | |
7972 | - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); | |
7973 | - c->x86_max_cores = intel_num_cpu_cores(c); | |
7974 | + set_cpu_cap(c, X86_FEATURE_REP_GOOD); | |
7975 | + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); | |
7976 | + c->x86_max_cores = intel_num_cpu_cores(c); | |
7977 | ||
7978 | srat_detect_node(); | |
7979 | } | |
7980 | @@ -1083,18 +1117,12 @@ static void __cpuinit get_cpu_vendor(str | |
7981 | c->x86_vendor = X86_VENDOR_UNKNOWN; | |
7982 | } | |
7983 | ||
7984 | -struct cpu_model_info { | |
7985 | - int vendor; | |
7986 | - int family; | |
7987 | - char *model_names[16]; | |
7988 | -}; | |
7989 | - | |
7990 | /* Do some early cpuid on the boot CPU to get some parameter that are | |
7991 | needed before check_bugs. Everything advanced is in identify_cpu | |
7992 | below. */ | |
7993 | -void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) | |
7994 | +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) | |
7995 | { | |
7996 | - u32 tfms; | |
7997 | + u32 tfms, xlvl; | |
7998 | ||
7999 | c->loops_per_jiffy = loops_per_jiffy; | |
8000 | c->x86_cache_size = -1; | |
8001 | @@ -1105,6 +1133,7 @@ void __cpuinit early_identify_cpu(struct | |
8002 | c->x86_clflush_size = 64; | |
8003 | c->x86_cache_alignment = c->x86_clflush_size; | |
8004 | c->x86_max_cores = 1; | |
8005 | + c->x86_coreid_bits = 0; | |
8006 | c->extended_cpuid_level = 0; | |
8007 | memset(&c->x86_capability, 0, sizeof c->x86_capability); | |
8008 | ||
8009 | @@ -1113,7 +1142,7 @@ void __cpuinit early_identify_cpu(struct | |
8010 | (unsigned int *)&c->x86_vendor_id[0], | |
8011 | (unsigned int *)&c->x86_vendor_id[8], | |
8012 | (unsigned int *)&c->x86_vendor_id[4]); | |
8013 | - | |
8014 | + | |
8015 | get_cpu_vendor(c); | |
8016 | ||
8017 | /* Initialize the standard set of capabilities */ | |
8018 | @@ -1131,7 +1160,7 @@ void __cpuinit early_identify_cpu(struct | |
8019 | c->x86 += (tfms >> 20) & 0xff; | |
8020 | if (c->x86 >= 0x6) | |
8021 | c->x86_model += ((tfms >> 16) & 0xF) << 4; | |
8022 | - if (c->x86_capability[0] & (1<<19)) | |
8023 | + if (c->x86_capability[0] & (1<<19)) | |
8024 | c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; | |
8025 | } else { | |
8026 | /* Have CPUID level 0 only - unheard of */ | |
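The family/model/stepping fields in the hunk above come out of CPUID(1) EAX (tfms): family in bits 11..8, model in 7..4, stepping in 3..0, with the extended family and model fields folded in exactly as shown. Worked on an assumed raw value:

	unsigned tfms = 0x00020f12;		/* made-up K8-style value */

	unsigned family   = (tfms >> 8) & 0xf;	/* 0xf */
	unsigned model    = (tfms >> 4) & 0xf;	/* 0x1 */
	unsigned stepping = tfms & 0xf;		/* 0x2 */

	if (family == 0xf)			/* extended family in bits 27..20 */
		family += (tfms >> 20) & 0xff;	/* +0 here, stays 15 */
	if (family >= 0x6)			/* extended model in bits 19..16 */
		model += ((tfms >> 16) & 0xf) << 4;	/* 0x1 + 0x20 = 0x21 */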
8027 | @@ -1141,18 +1170,6 @@ void __cpuinit early_identify_cpu(struct | |
8028 | #ifdef CONFIG_SMP | |
8029 | c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; | |
8030 | #endif | |
8031 | -} | |
8032 | - | |
8033 | -/* | |
8034 | - * This does the hard work of actually picking apart the CPU stuff... | |
8035 | - */ | |
8036 | -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |
8037 | -{ | |
8038 | - int i; | |
8039 | - u32 xlvl; | |
8040 | - | |
8041 | - early_identify_cpu(c); | |
8042 | - | |
8043 | /* AMD-defined flags: level 0x80000001 */ | |
8044 | xlvl = cpuid_eax(0x80000000); | |
8045 | c->extended_cpuid_level = xlvl; | |
8046 | @@ -1173,6 +1190,30 @@ void __cpuinit identify_cpu(struct cpuin | |
8047 | c->x86_capability[2] = cpuid_edx(0x80860001); | |
8048 | } | |
8049 | ||
8050 | + c->extended_cpuid_level = cpuid_eax(0x80000000); | |
8051 | + if (c->extended_cpuid_level >= 0x80000007) | |
8052 | + c->x86_power = cpuid_edx(0x80000007); | |
8053 | + | |
8054 | + switch (c->x86_vendor) { | |
8055 | + case X86_VENDOR_AMD: | |
8056 | + early_init_amd(c); | |
8057 | + break; | |
8058 | + case X86_VENDOR_INTEL: | |
8059 | + early_init_intel(c); | |
8060 | + break; | |
8061 | + } | |
8062 | + | |
8063 | +} | |
8064 | + | |
8065 | +/* | |
8066 | + * This does the hard work of actually picking apart the CPU stuff... | |
8067 | + */ | |
8068 | +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |
8069 | +{ | |
8070 | + int i; | |
8071 | + | |
8072 | + early_identify_cpu(c); | |
8073 | + | |
8074 | init_scattered_cpuid_features(c); | |
8075 | ||
8076 | c->apicid = phys_pkg_id(0); | |
8077 | @@ -1202,8 +1243,7 @@ void __cpuinit identify_cpu(struct cpuin | |
8078 | break; | |
8079 | } | |
8080 | ||
8081 | - select_idle_routine(c); | |
8082 | - detect_ht(c); | |
8083 | + detect_ht(c); | |
8084 | ||
8085 | /* | |
8086 | * On SMP, boot_cpu_data holds the common feature set between | |
8087 | @@ -1213,31 +1253,55 @@ void __cpuinit identify_cpu(struct cpuin | |
8088 | */ | |
8089 | if (c != &boot_cpu_data) { | |
8090 | /* AND the already accumulated flags with these */ | |
8091 | - for (i = 0 ; i < NCAPINTS ; i++) | |
8092 | + for (i = 0; i < NCAPINTS; i++) | |
8093 | boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; | |
8094 | } | |
8095 | ||
8096 | + /* Clear all flags overridden by options */ | |
8097 | + for (i = 0; i < NCAPINTS; i++) | |
8098 | + c->x86_capability[i] &= ~cleared_cpu_caps[i]; | |
8099 | + | |
8100 | #ifdef CONFIG_X86_MCE | |
8101 | mcheck_init(c); | |
8102 | #endif | |
8103 | + select_idle_routine(c); | |
8104 | + | |
8105 | if (c != &boot_cpu_data) | |
8106 | mtrr_ap_init(); | |
8107 | #ifdef CONFIG_NUMA | |
8108 | numa_add_cpu(smp_processor_id()); | |
8109 | #endif | |
8110 | + | |
8111 | } | |
8112 | - | |
8113 | + | |
8114 | +static __init int setup_noclflush(char *arg) | |
8115 | +{ | |
8116 | + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); | |
8117 | + return 1; | |
8118 | +} | |
8119 | +__setup("noclflush", setup_noclflush); | |
8120 | ||
8121 | void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) | |
8122 | { | |
8123 | if (c->x86_model_id[0]) | |
8124 | - printk("%s", c->x86_model_id); | |
8125 | + printk(KERN_CONT "%s", c->x86_model_id); | |
8126 | + | |
8127 | + if (c->x86_mask || c->cpuid_level >= 0) | |
8128 | + printk(KERN_CONT " stepping %02x\n", c->x86_mask); | |
8129 | + else | |
8130 | + printk(KERN_CONT "\n"); | |
8131 | +} | |
8132 | ||
8133 | - if (c->x86_mask || c->cpuid_level >= 0) | |
8134 | - printk(" stepping %02x\n", c->x86_mask); | |
8135 | +static __init int setup_disablecpuid(char *arg) | |
8136 | +{ | |
8137 | + int bit; | |
8138 | + if (get_option(&arg, &bit) && bit < NCAPINTS*32) | |
8139 | + setup_clear_cpu_cap(bit); | |
8140 | else | |
8141 | - printk("\n"); | |
8142 | + return 0; | |
8143 | + return 1; | |
8144 | } | |
8145 | +__setup("clearcpuid=", setup_disablecpuid); | |
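The argument to the new clearcpuid= option is a flat index into x86_capability: word * 32 + bit, using the same word layout as the flags table removed from show_cpuinfo() further down. For example (index assumed from that table), "cx16" sits at position 13 of the Intel-defined (#2) word, word 4:

#define FEATURE_INDEX(word, bit)	((word) * 32 + (bit))

	int cx16 = FEATURE_INDEX(4, 13);	/* 141 */
	/* booting with clearcpuid=141 would hide cmpxchg16b */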
8146 | ||
8147 | /* | |
8148 | * Get CPU information for use by the procfs. | |
8149 | @@ -1246,116 +1310,41 @@ void __cpuinit print_cpu_info(struct cpu | |
8150 | static int show_cpuinfo(struct seq_file *m, void *v) | |
8151 | { | |
8152 | struct cpuinfo_x86 *c = v; | |
8153 | - int cpu = 0; | |
8154 | - | |
8155 | - /* | |
8156 | - * These flag bits must match the definitions in <asm/cpufeature.h>. | |
8157 | - * NULL means this bit is undefined or reserved; either way it doesn't | |
8158 | - * have meaning as far as Linux is concerned. Note that it's important | |
8159 | - * to realize there is a difference between this table and CPUID -- if | |
8160 | - * applications want to get the raw CPUID data, they should access | |
8161 | - * /dev/cpu/<cpu_nr>/cpuid instead. | |
8162 | - */ | |
8163 | - static const char *const x86_cap_flags[] = { | |
8164 | - /* Intel-defined */ | |
8165 | - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", | |
8166 | - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", | |
8167 | - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", | |
8168 | - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", | |
8169 | - | |
8170 | - /* AMD-defined */ | |
8171 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8172 | - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, | |
8173 | - NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, | |
8174 | - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", | |
8175 | - "3dnowext", "3dnow", | |
8176 | - | |
8177 | - /* Transmeta-defined */ | |
8178 | - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, | |
8179 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8180 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8181 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8182 | - | |
8183 | - /* Other (Linux-defined) */ | |
8184 | - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", | |
8185 | - NULL, NULL, NULL, NULL, | |
8186 | - "constant_tsc", "up", NULL, "arch_perfmon", | |
8187 | - "pebs", "bts", NULL, "sync_rdtsc", | |
8188 | - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8189 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8190 | - | |
8191 | - /* Intel-defined (#2) */ | |
8192 | - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", | |
8193 | - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, | |
8194 | - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", | |
8195 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8196 | - | |
8197 | - /* VIA/Cyrix/Centaur-defined */ | |
8198 | - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", | |
8199 | - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, | |
8200 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8201 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8202 | - | |
8203 | - /* AMD-defined (#2) */ | |
8204 | - "lahf_lm", "cmp_legacy", "svm", "extapic", | |
8205 | - "cr8_legacy", "abm", "sse4a", "misalignsse", | |
8206 | - "3dnowprefetch", "osvw", "ibs", "sse5", | |
8207 | - "skinit", "wdt", NULL, NULL, | |
8208 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8209 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8210 | - | |
8211 | - /* Auxiliary (Linux-defined) */ | |
8212 | - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8213 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8214 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8215 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
8216 | - }; | |
8217 | - static const char *const x86_power_flags[] = { | |
8218 | - "ts", /* temperature sensor */ | |
8219 | - "fid", /* frequency id control */ | |
8220 | - "vid", /* voltage id control */ | |
8221 | - "ttp", /* thermal trip */ | |
8222 | - "tm", | |
8223 | - "stc", | |
8224 | - "100mhzsteps", | |
8225 | - "hwpstate", | |
8226 | - "", /* tsc invariant mapped to constant_tsc */ | |
8227 | - /* nothing */ | |
8228 | - }; | |
8229 | - | |
8230 | + int cpu = 0, i; | |
8231 | ||
8232 | #ifdef CONFIG_SMP | |
8233 | cpu = c->cpu_index; | |
8234 | #endif | |
8235 | ||
8236 | - seq_printf(m,"processor\t: %u\n" | |
8237 | - "vendor_id\t: %s\n" | |
8238 | - "cpu family\t: %d\n" | |
8239 | - "model\t\t: %d\n" | |
8240 | - "model name\t: %s\n", | |
8241 | - (unsigned)cpu, | |
8242 | - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", | |
8243 | - c->x86, | |
8244 | - (int)c->x86_model, | |
8245 | - c->x86_model_id[0] ? c->x86_model_id : "unknown"); | |
8246 | - | |
8247 | + seq_printf(m, "processor\t: %u\n" | |
8248 | + "vendor_id\t: %s\n" | |
8249 | + "cpu family\t: %d\n" | |
8250 | + "model\t\t: %d\n" | |
8251 | + "model name\t: %s\n", | |
8252 | + (unsigned)cpu, | |
8253 | + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", | |
8254 | + c->x86, | |
8255 | + (int)c->x86_model, | |
8256 | + c->x86_model_id[0] ? c->x86_model_id : "unknown"); | |
8257 | + | |
8258 | if (c->x86_mask || c->cpuid_level >= 0) | |
8259 | seq_printf(m, "stepping\t: %d\n", c->x86_mask); | |
8260 | else | |
8261 | seq_printf(m, "stepping\t: unknown\n"); | |
8262 | - | |
8263 | - if (cpu_has(c,X86_FEATURE_TSC)) { | |
8264 | + | |
8265 | + if (cpu_has(c, X86_FEATURE_TSC)) { | |
8266 | unsigned int freq = cpufreq_quick_get((unsigned)cpu); | |
8267 | + | |
8268 | if (!freq) | |
8269 | freq = cpu_khz; | |
8270 | seq_printf(m, "cpu MHz\t\t: %u.%03u\n", | |
8271 | - freq / 1000, (freq % 1000)); | |
8272 | + freq / 1000, (freq % 1000)); | |
8273 | } | |
8274 | ||
8275 | /* Cache size */ | |
8276 | - if (c->x86_cache_size >= 0) | |
8277 | + if (c->x86_cache_size >= 0) | |
8278 | seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); | |
8279 | - | |
8280 | + | |
8281 | #ifdef CONFIG_SMP | |
8282 | if (smp_num_siblings * c->x86_max_cores > 1) { | |
8283 | seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); | |
8284 | @@ -1364,48 +1353,43 @@ static int show_cpuinfo(struct seq_file | |
8285 | seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); | |
8286 | seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); | |
8287 | } | |
8288 | -#endif | |
8289 | +#endif | |
8290 | ||
8291 | seq_printf(m, | |
8292 | - "fpu\t\t: yes\n" | |
8293 | - "fpu_exception\t: yes\n" | |
8294 | - "cpuid level\t: %d\n" | |
8295 | - "wp\t\t: yes\n" | |
8296 | - "flags\t\t:", | |
8297 | + "fpu\t\t: yes\n" | |
8298 | + "fpu_exception\t: yes\n" | |
8299 | + "cpuid level\t: %d\n" | |
8300 | + "wp\t\t: yes\n" | |
8301 | + "flags\t\t:", | |
8302 | c->cpuid_level); | |
8303 | ||
8304 | - { | |
8305 | - int i; | |
8306 | - for ( i = 0 ; i < 32*NCAPINTS ; i++ ) | |
8307 | - if (cpu_has(c, i) && x86_cap_flags[i] != NULL) | |
8308 | - seq_printf(m, " %s", x86_cap_flags[i]); | |
8309 | - } | |
8310 | - | |
8311 | + for (i = 0; i < 32*NCAPINTS; i++) | |
8312 | + if (cpu_has(c, i) && x86_cap_flags[i] != NULL) | |
8313 | + seq_printf(m, " %s", x86_cap_flags[i]); | |
8314 | + | |
8315 | seq_printf(m, "\nbogomips\t: %lu.%02lu\n", | |
8316 | c->loops_per_jiffy/(500000/HZ), | |
8317 | (c->loops_per_jiffy/(5000/HZ)) % 100); | |
8318 | ||
8319 | - if (c->x86_tlbsize > 0) | |
8320 | + if (c->x86_tlbsize > 0) | |
8321 | seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); | |
8322 | seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); | |
8323 | seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); | |
8324 | ||
8325 | - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", | |
8326 | + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", | |
8327 | c->x86_phys_bits, c->x86_virt_bits); | |
8328 | ||
8329 | seq_printf(m, "power management:"); | |
8330 | - { | |
8331 | - unsigned i; | |
8332 | - for (i = 0; i < 32; i++) | |
8333 | - if (c->x86_power & (1 << i)) { | |
8334 | - if (i < ARRAY_SIZE(x86_power_flags) && | |
8335 | - x86_power_flags[i]) | |
8336 | - seq_printf(m, "%s%s", | |
8337 | - x86_power_flags[i][0]?" ":"", | |
8338 | - x86_power_flags[i]); | |
8339 | - else | |
8340 | - seq_printf(m, " [%d]", i); | |
8341 | - } | |
8342 | + for (i = 0; i < 32; i++) { | |
8343 | + if (c->x86_power & (1 << i)) { | |
8344 | + if (i < ARRAY_SIZE(x86_power_flags) && | |
8345 | + x86_power_flags[i]) | |
8346 | + seq_printf(m, "%s%s", | |
8347 | + x86_power_flags[i][0]?" ":"", | |
8348 | + x86_power_flags[i]); | |
8349 | + else | |
8350 | + seq_printf(m, " [%d]", i); | |
8351 | + } | |
8352 | } | |
8353 | ||
8354 | seq_printf(m, "\n\n"); | |
8355 | @@ -1432,8 +1416,8 @@ static void c_stop(struct seq_file *m, v | |
8356 | { | |
8357 | } | |
8358 | ||
8359 | -struct seq_operations cpuinfo_op = { | |
8360 | - .start =c_start, | |
8361 | +const struct seq_operations cpuinfo_op = { | |
8362 | + .start = c_start, | |
8363 | .next = c_next, | |
8364 | .stop = c_stop, | |
8365 | .show = show_cpuinfo, | |
8366 | --- a/arch/x86/kernel/setup64-xen.c | |
8367 | +++ b/arch/x86/kernel/setup64-xen.c | |
8368 | @@ -31,7 +31,11 @@ | |
8369 | #include <asm/hypervisor.h> | |
8370 | #endif | |
8371 | ||
8372 | +#ifndef CONFIG_DEBUG_BOOT_PARAMS | |
8373 | struct boot_params __initdata boot_params; | |
8374 | +#else | |
8375 | +struct boot_params boot_params; | |
8376 | +#endif | |
8377 | ||
8378 | cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; | |
8379 | ||
8380 | @@ -47,6 +51,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __attr | |
8381 | ||
8382 | unsigned long __supported_pte_mask __read_mostly = ~0UL; | |
8383 | EXPORT_SYMBOL(__supported_pte_mask); | |
8384 | + | |
8385 | static int do_not_nx __cpuinitdata = 0; | |
8386 | ||
8387 | /* noexec=on|off | |
8388 | @@ -90,6 +95,45 @@ static int __init nonx32_setup(char *str | |
8389 | __setup("noexec32=", nonx32_setup); | |
8390 | ||
8391 | /* | |
8392 | + * Copy data used in early init routines from the initial arrays to the | |
8393 | + * per cpu data areas. These arrays then become expendable and the | |
8394 | + * *_early_ptr's are zeroed indicating that the static arrays are gone. | |
8395 | + */ | |
8396 | +static void __init setup_per_cpu_maps(void) | |
8397 | +{ | |
8398 | +#ifndef CONFIG_XEN | |
8399 | + int cpu; | |
8400 | + | |
8401 | + for_each_possible_cpu(cpu) { | |
8402 | +#ifdef CONFIG_SMP | |
8403 | + if (per_cpu_offset(cpu)) { | |
8404 | +#endif | |
8405 | + per_cpu(x86_cpu_to_apicid, cpu) = | |
8406 | + x86_cpu_to_apicid_init[cpu]; | |
8407 | + per_cpu(x86_bios_cpu_apicid, cpu) = | |
8408 | + x86_bios_cpu_apicid_init[cpu]; | |
8409 | +#ifdef CONFIG_NUMA | |
8410 | + per_cpu(x86_cpu_to_node_map, cpu) = | |
8411 | + x86_cpu_to_node_map_init[cpu]; | |
8412 | +#endif | |
8413 | +#ifdef CONFIG_SMP | |
8414 | + } | |
8415 | + else | |
8416 | + printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n", | |
8417 | + cpu); | |
8418 | +#endif | |
8419 | + } | |
8420 | + | |
8421 | + /* indicate the early static arrays will soon be gone */ | |
8422 | + x86_cpu_to_apicid_early_ptr = NULL; | |
8423 | + x86_bios_cpu_apicid_early_ptr = NULL; | |
8424 | +#ifdef CONFIG_NUMA | |
8425 | + x86_cpu_to_node_map_early_ptr = NULL; | |
8426 | +#endif | |
8427 | +#endif | |
8428 | +} | |
8429 | + | |
8430 | +/* | |
8431 | * Great future plan: | |
8432 | * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. | |
8433 | * Always point %gs to its beginning | |
8434 | @@ -109,19 +153,24 @@ void __init setup_per_cpu_areas(void) | |
8435 | printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); | |
8436 | for_each_cpu_mask (i, cpu_possible_map) { | |
8437 | char *ptr; | |
8438 | +#ifndef CONFIG_NEED_MULTIPLE_NODES | |
8439 | + ptr = alloc_bootmem_pages(size); | |
8440 | +#else | |
8441 | + int node = early_cpu_to_node(i); | |
8442 | ||
8443 | - if (!NODE_DATA(cpu_to_node(i))) { | |
8444 | - printk("cpu with no node %d, num_online_nodes %d\n", | |
8445 | - i, num_online_nodes()); | |
8446 | + if (!node_online(node) || !NODE_DATA(node)) | |
8447 | ptr = alloc_bootmem_pages(size); | |
8448 | - } else { | |
8449 | - ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); | |
8450 | - } | |
8451 | + else | |
8452 | + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); | |
8453 | +#endif | |
8454 | if (!ptr) | |
8455 | panic("Cannot allocate cpu data for CPU %d\n", i); | |
8456 | cpu_pda(i)->data_offset = ptr - __per_cpu_start; | |
8457 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | |
8458 | } | |
8459 | + | |
8460 | + /* setup percpu data maps early */ | |
8461 | + setup_per_cpu_maps(); | |
8462 | } | |
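Each possible CPU gets a private copy of the kernel's per-cpu template section, and data_offset records how far that copy sits from the template; a per_cpu() access is then just the static symbol's address plus the owning CPU's offset. A hedged sketch of the addressing set up above (per_cpu_sketch() is illustrative, not the real macro):

	char *ptr = alloc_bootmem_pages(size);		/* CPU i's copy */
	memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
	cpu_pda(i)->data_offset = ptr - __per_cpu_start;

	/* per_cpu(var, i) then resolves, in effect, to: */
#define per_cpu_sketch(var, i) \
	(*(typeof(&(var)))((char *)&(var) + cpu_pda(i)->data_offset))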
8463 | ||
8464 | #ifdef CONFIG_XEN | |
8465 | @@ -224,7 +273,8 @@ void syscall_init(void) | |
8466 | wrmsrl(MSR_CSTAR, ignore_sysret); | |
8467 | ||
8468 | /* Flags to clear on syscall */ | |
8469 | - wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); | |
8470 | + wrmsrl(MSR_SYSCALL_MASK, | |
8471 | + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); | |
8472 | #endif | |
8473 | #ifdef CONFIG_IA32_EMULATION | |
8474 | syscall32_cpu_init (); | |
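The MSR_SYSCALL_MASK rewrite above only replaces magic numbers with named flag masks; the standard x86 EFLAGS bit values show it is the same mask the old EF_TF|EF_DF|EF_IE|0x3000 expressed:

#define X86_EFLAGS_TF	0x00000100	/* trap flag */
#define X86_EFLAGS_IF	0x00000200	/* interrupt enable (EF_IE) */
#define X86_EFLAGS_DF	0x00000400	/* direction flag */
#define X86_EFLAGS_IOPL	0x00003000	/* I/O privilege level mask */
	/* TF|IF|DF|IOPL == 0x3700: SYSCALL clears these on kernel entry */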
8475 | @@ -303,7 +353,7 @@ void __cpuinit cpu_init (void) | |
8476 | */ | |
8477 | #ifndef CONFIG_XEN | |
8478 | if (cpu) | |
8479 | - memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); | |
8480 | + memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE); | |
8481 | #endif | |
8482 | ||
8483 | cpu_gdt_descr[cpu].size = GDT_SIZE; | |
8484 | @@ -334,10 +384,10 @@ void __cpuinit cpu_init (void) | |
8485 | v, cpu); | |
8486 | } | |
8487 | estacks += PAGE_SIZE << order[v]; | |
8488 | - orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; | |
8489 | + orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; | |
8490 | } | |
8491 | ||
8492 | - t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | |
8493 | + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | |
8494 | /* | |
8495 | * <= is required because the CPU will access up to | |
8496 | * 8 bits beyond the end of the IO permission bitmap. | |
8497 | --- a/arch/x86/kernel/smp_32-xen.c | |
8498 | +++ b/arch/x86/kernel/smp_32-xen.c | |
8499 | @@ -168,7 +168,7 @@ void __send_IPI_shortcut(unsigned int sh | |
8500 | } | |
8501 | } | |
8502 | ||
8503 | -void fastcall send_IPI_self(int vector) | |
8504 | +void send_IPI_self(int vector) | |
8505 | { | |
8506 | __send_IPI_shortcut(APIC_DEST_SELF, vector); | |
8507 | } | |
8508 | @@ -224,13 +224,14 @@ static DEFINE_SPINLOCK(tlbstate_lock); | |
8509 | * We need to reload %cr3 since the page tables may be going | |
8510 | * away from under us.. | |
8511 | */ | |
8512 | -void leave_mm(unsigned long cpu) | |
8513 | +void leave_mm(int cpu) | |
8514 | { | |
8515 | if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) | |
8516 | BUG(); | |
8517 | cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); | |
8518 | load_cr3(swapper_pg_dir); | |
8519 | } | |
8520 | +EXPORT_SYMBOL_GPL(leave_mm); | |
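Exporting leave_mm() publishes the lazy-TLB exit path: a CPU that is only borrowing an mm's page tables drops itself from cpu_vm_mask, so later flush IPIs skip it, then falls back to the kernel page tables. In outline (legal only when the CPU's TLB state is not TLBSTATE_OK, i.e. no user mappings of that mm are in active use, hence the BUG() above):

	cpu_clear(cpu, mm->cpu_vm_mask);	/* stop receiving flush IPIs */
	load_cr3(swapper_pg_dir);		/* switch to kernel-only tables */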
8521 | ||
8522 | /* | |
8523 | * | |
8524 | --- a/arch/x86/kernel/smp_64-xen.c | |
8525 | +++ b/arch/x86/kernel/smp_64-xen.c | |
8526 | @@ -33,7 +33,7 @@ | |
8527 | ||
8528 | #ifndef CONFIG_XEN | |
8529 | /* | |
8530 | - * Smarter SMP flushing macros. | |
8531 | + * Smarter SMP flushing macros. | |
8532 | * c/o Linus Torvalds. | |
8533 | * | |
8534 | * These mean you can really definitely utterly forget about | |
8535 | @@ -41,15 +41,15 @@ | |
8536 | * | |
8537 | * Optimizations Manfred Spraul <manfred@colorfullife.com> | |
8538 | * | |
8539 | - * More scalable flush, from Andi Kleen | |
8540 | + * More scalable flush, from Andi Kleen | |
8541 | * | |
8542 | - * To avoid global state use 8 different call vectors. | |
8543 | - * Each CPU uses a specific vector to trigger flushes on other | |
8544 | - * CPUs. Depending on the received vector the target CPUs look into | |
8545 | + * To avoid global state use 8 different call vectors. | |
8546 | + * Each CPU uses a specific vector to trigger flushes on other | |
8547 | + * CPUs. Depending on the received vector the target CPUs look into | |
8548 | * the right per cpu variable for the flush data. | |
8549 | * | |
8550 | - * With more than 8 CPUs they are hashed to the 8 available | |
8551 | - * vectors. The limited global vector space forces us to this right now. | |
8552 | + * With more than 8 CPUs they are hashed to the 8 available | |
8553 | + * vectors. The limited global vector space forces us to this right now. | |
8554 | * In future when interrupts are split into per CPU domains this could be | |
8555 | * fixed, at the cost of triggering multiple IPIs in some cases. | |
8556 | */ | |
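Both ends agree on a flush_state slot purely through the vector number: the sender hashes its CPU id onto one of the eight vectors, and the receiver recovers the slot from the negated vector saved in orig_ax, as the hunks below show:

	/* sender side */
	int sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
	/* ... fill per_cpu(flush_state, sender), then send the IPI with
	 * vector INVALIDATE_TLB_VECTOR_START + sender ... */

	/* receiver side, in smp_invalidate_interrupt() */
	int slot = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
	union smp_flush_state *f = &per_cpu(flush_state, slot);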
8557 | @@ -59,7 +59,6 @@ union smp_flush_state { | |
8558 | cpumask_t flush_cpumask; | |
8559 | struct mm_struct *flush_mm; | |
8560 | unsigned long flush_va; | |
8561 | -#define FLUSH_ALL -1ULL | |
8562 | spinlock_t tlbstate_lock; | |
8563 | }; | |
8564 | char pad[SMP_CACHE_BYTES]; | |
8565 | @@ -71,16 +70,17 @@ union smp_flush_state { | |
8566 | static DEFINE_PER_CPU(union smp_flush_state, flush_state); | |
8567 | ||
8568 | /* | |
8569 | - * We cannot call mmdrop() because we are in interrupt context, | |
8570 | + * We cannot call mmdrop() because we are in interrupt context, | |
8571 | * instead update mm->cpu_vm_mask. | |
8572 | */ | |
8573 | -static inline void leave_mm(unsigned long cpu) | |
8574 | +void leave_mm(int cpu) | |
8575 | { | |
8576 | if (read_pda(mmu_state) == TLBSTATE_OK) | |
8577 | BUG(); | |
8578 | cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); | |
8579 | load_cr3(swapper_pg_dir); | |
8580 | } | |
8581 | +EXPORT_SYMBOL_GPL(leave_mm); | |
8582 | ||
8583 | /* | |
8584 | * | |
8585 | @@ -89,25 +89,25 @@ static inline void leave_mm(unsigned lon | |
8586 | * 1) switch_mm() either 1a) or 1b) | |
8587 | * 1a) thread switch to a different mm | |
8588 | * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); | |
8589 | - * Stop ipi delivery for the old mm. This is not synchronized with | |
8590 | + * the other cpus, but smp_invalidate_interrupt ignores flush ipis | |
8591 | - * for the wrong mm, and in the worst case we perform a superfluous | |
8592 | - * tlb flush. | |
8593 | + * Stop ipi delivery for the old mm. This is not synchronized with | |
8594 | + * the other cpus, but smp_invalidate_interrupt ignore flush ipis | |
8595 | + * for the wrong mm, and in the worst case we perform a superfluous | |
8596 | + * tlb flush. | |
8597 | * 1a2) set cpu mmu_state to TLBSTATE_OK | |
8598 | - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 | |
8599 | + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 | |
8600 | * was in lazy tlb mode. | |
8601 | * 1a3) update cpu active_mm | |
8602 | - * Now cpu0 accepts tlb flushes for the new mm. | |
8603 | + * Now cpu0 accepts tlb flushes for the new mm. | |
8604 | * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); | |
8605 | - * Now the other cpus will send tlb flush ipis. | |
8606 | + * Now the other cpus will send tlb flush ipis. | |
8607 | * 1a4) change cr3. | |
8608 | * 1b) thread switch without mm change | |
8609 | * cpu active_mm is correct, cpu0 already handles | |
8610 | * flush ipis. | |
8611 | * 1b1) set cpu mmu_state to TLBSTATE_OK | |
8612 | * 1b2) test_and_set the cpu bit in cpu_vm_mask. | |
8613 | - * Atomically set the bit [other cpus will start sending flush ipis], | |
8614 | - * and test the bit. | |
8615 | + * Atomically set the bit [other cpus will start sending flush ipis], | |
8616 | + * and test the bit. | |
8617 | * 1b3) if the bit was 0: leave_mm was called, flush the tlb. | |
8618 | * 2) switch %%esp, ie current | |
8619 | * | |
8620 | @@ -141,12 +141,12 @@ asmlinkage void smp_invalidate_interrupt | |
8621 | * orig_rax contains the negated interrupt vector. | |
8622 | * Use that to determine where the sender put the data. | |
8623 | */ | |
8624 | - sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; | |
8625 | + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; | |
8626 | f = &per_cpu(flush_state, sender); | |
8627 | ||
8628 | if (!cpu_isset(cpu, f->flush_cpumask)) | |
8629 | goto out; | |
8630 | - /* | |
8631 | + /* | |
8632 | * This was a BUG() but until someone can quote me the | |
8633 | * line from the intel manual that guarantees an IPI to | |
8634 | * multiple CPUs is retried _only_ on the erroring CPUs | |
8635 | @@ -154,10 +154,10 @@ asmlinkage void smp_invalidate_interrupt | |
8636 | * | |
8637 | * BUG(); | |
8638 | */ | |
8639 | - | |
8640 | + | |
8641 | if (f->flush_mm == read_pda(active_mm)) { | |
8642 | if (read_pda(mmu_state) == TLBSTATE_OK) { | |
8643 | - if (f->flush_va == FLUSH_ALL) | |
8644 | + if (f->flush_va == TLB_FLUSH_ALL) | |
8645 | local_flush_tlb(); | |
8646 | else | |
8647 | __flush_tlb_one(f->flush_va); | |
8648 | @@ -170,19 +170,22 @@ out: | |
8649 | add_pda(irq_tlb_count, 1); | |
8650 | } | |
8651 | ||
8652 | -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, | |
8653 | - unsigned long va) | |
8654 | +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, | |
8655 | + unsigned long va) | |
8656 | { | |
8657 | int sender; | |
8658 | union smp_flush_state *f; | |
8659 | + cpumask_t cpumask = *cpumaskp; | |
8660 | ||
8661 | /* Caller has disabled preemption */ | |
8662 | sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; | |
8663 | f = &per_cpu(flush_state, sender); | |
8664 | ||
8665 | - /* Could avoid this lock when | |
8666 | - num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is | |
8667 | - probably not worth checking this for a cache-hot lock. */ | |
8668 | + /* | |
8669 | + * Could avoid this lock when | |
8670 | + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is | |
8671 | + * probably not worth checking this for a cache-hot lock. | |
8672 | + */ | |
8673 | spin_lock(&f->tlbstate_lock); | |
8674 | ||
8675 | f->flush_mm = mm; | |
8676 | @@ -206,14 +209,14 @@ static void flush_tlb_others(cpumask_t c | |
8677 | int __cpuinit init_smp_flush(void) | |
8678 | { | |
8679 | int i; | |
8680 | + | |
8681 | for_each_cpu_mask(i, cpu_possible_map) { | |
8682 | spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); | |
8683 | } | |
8684 | return 0; | |
8685 | } | |
8686 | - | |
8687 | core_initcall(init_smp_flush); | |
8688 | - | |
8689 | + | |
8690 | void flush_tlb_current_task(void) | |
8691 | { | |
8692 | struct mm_struct *mm = current->mm; | |
8693 | @@ -225,10 +228,9 @@ void flush_tlb_current_task(void) | |
8694 | ||
8695 | local_flush_tlb(); | |
8696 | if (!cpus_empty(cpu_mask)) | |
8697 | - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | |
8698 | + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); | |
8699 | preempt_enable(); | |
8700 | } | |
8701 | -EXPORT_SYMBOL(flush_tlb_current_task); | |
8702 | ||
8703 | void flush_tlb_mm (struct mm_struct * mm) | |
8704 | { | |
8705 | @@ -245,11 +247,10 @@ void flush_tlb_mm (struct mm_struct * mm | |
8706 | leave_mm(smp_processor_id()); | |
8707 | } | |
8708 | if (!cpus_empty(cpu_mask)) | |
8709 | - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | |
8710 | + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); | |
8711 | ||
8712 | preempt_enable(); | |
8713 | } | |
8714 | -EXPORT_SYMBOL(flush_tlb_mm); | |
8715 | ||
8716 | void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) | |
8717 | { | |
8718 | @@ -263,8 +264,8 @@ void flush_tlb_page(struct vm_area_struc | |
8719 | if (current->active_mm == mm) { | |
8720 | if(current->mm) | |
8721 | __flush_tlb_one(va); | |
8722 | - else | |
8723 | - leave_mm(smp_processor_id()); | |
8724 | + else | |
8725 | + leave_mm(smp_processor_id()); | |
8726 | } | |
8727 | ||
8728 | if (!cpus_empty(cpu_mask)) | |
8729 | @@ -272,7 +273,6 @@ void flush_tlb_page(struct vm_area_struc | |
8730 | ||
8731 | preempt_enable(); | |
8732 | } | |
8733 | -EXPORT_SYMBOL(flush_tlb_page); | |
8734 | ||
8735 | static void do_flush_tlb_all(void* info) | |
8736 | { | |
8737 | @@ -330,11 +330,9 @@ void unlock_ipi_call_lock(void) | |
8738 | * this function sends a 'generic call function' IPI to all other CPU | |
8739 | * of the system defined in the mask. | |
8740 | */ | |
8741 | - | |
8742 | -static int | |
8743 | -__smp_call_function_mask(cpumask_t mask, | |
8744 | - void (*func)(void *), void *info, | |
8745 | - int wait) | |
8746 | +static int __smp_call_function_mask(cpumask_t mask, | |
8747 | + void (*func)(void *), void *info, | |
8748 | + int wait) | |
8749 | { | |
8750 | struct call_data_struct data; | |
8751 | cpumask_t allbutself; | |
8752 | @@ -422,11 +420,10 @@ EXPORT_SYMBOL(smp_call_function_mask); | |
8753 | */ | |
8754 | ||
8755 | int smp_call_function_single (int cpu, void (*func) (void *info), void *info, | |
8756 | - int nonatomic, int wait) | |
8757 | + int nonatomic, int wait) | |
8758 | { | |
8759 | /* prevent preemption and reschedule on another processor */ | |
8760 | - int ret; | |
8761 | - int me = get_cpu(); | |
8762 | + int ret, me = get_cpu(); | |
8763 | ||
8764 | /* Can deadlock when called with interrupts disabled */ | |
8765 | WARN_ON(irqs_disabled()); | |
8766 | @@ -476,9 +473,9 @@ static void stop_this_cpu(void *dummy) | |
8767 | */ | |
8768 | cpu_clear(smp_processor_id(), cpu_online_map); | |
8769 | disable_all_local_evtchn(); | |
8770 | - for (;;) | |
8771 | + for (;;) | |
8772 | halt(); | |
8773 | -} | |
8774 | +} | |
8775 | ||
8776 | void smp_send_stop(void) | |
8777 | { | |
8778 | --- a/arch/x86/kernel/time_32-xen.c | |
8779 | +++ b/arch/x86/kernel/time_32-xen.c | |
8780 | @@ -28,21 +28,9 @@ | |
8781 | * serialize accesses to xtime/lost_ticks). | |
8782 | */ | |
8783 | ||
8784 | -#include <linux/errno.h> | |
8785 | -#include <linux/sched.h> | |
8786 | -#include <linux/kernel.h> | |
8787 | -#include <linux/param.h> | |
8788 | -#include <linux/string.h> | |
8789 | -#include <linux/mm.h> | |
8790 | +#include <linux/init.h> | |
8791 | #include <linux/interrupt.h> | |
8792 | #include <linux/time.h> | |
8793 | -#include <linux/delay.h> | |
8794 | -#include <linux/init.h> | |
8795 | -#include <linux/smp.h> | |
8796 | -#include <linux/module.h> | |
8797 | -#include <linux/sysdev.h> | |
8798 | -#include <linux/bcd.h> | |
8799 | -#include <linux/efi.h> | |
8800 | #include <linux/mca.h> | |
8801 | #include <linux/sysctl.h> | |
8802 | #include <linux/percpu.h> | |
8803 | @@ -50,26 +38,10 @@ | |
8804 | #include <linux/posix-timers.h> | |
8805 | #include <linux/cpufreq.h> | |
8806 | #include <linux/clocksource.h> | |
8807 | +#include <linux/sysdev.h> | |
8808 | ||
8809 | -#include <asm/io.h> | |
8810 | -#include <asm/smp.h> | |
8811 | -#include <asm/irq.h> | |
8812 | -#include <asm/msr.h> | |
8813 | #include <asm/delay.h> | |
8814 | -#include <asm/mpspec.h> | |
8815 | -#include <asm/uaccess.h> | |
8816 | -#include <asm/processor.h> | |
8817 | -#include <asm/timer.h> | |
8818 | #include <asm/time.h> | |
8819 | -#include <asm/sections.h> | |
8820 | - | |
8821 | -#include "mach_time.h" | |
8822 | - | |
8823 | -#include <linux/timex.h> | |
8824 | - | |
8825 | -#include <asm/hpet.h> | |
8826 | - | |
8827 | -#include <asm/arch_hooks.h> | |
8828 | ||
8829 | #include <xen/evtchn.h> | |
8830 | #include <xen/sysctl.h> | |
8831 | @@ -89,9 +61,6 @@ volatile unsigned long __jiffies __secti | |
8832 | unsigned int cpu_khz; /* Detected as we calibrate the TSC */ | |
8833 | EXPORT_SYMBOL(cpu_khz); | |
8834 | ||
8835 | -DEFINE_SPINLOCK(rtc_lock); | |
8836 | -EXPORT_SYMBOL(rtc_lock); | |
8837 | - | |
8838 | /* These are periodically updated in shared_info, and then copied here. */ | |
8839 | struct shadow_time_info { | |
8840 | u64 tsc_timestamp; /* TSC at last update of time vals. */ | |
8841 | @@ -154,6 +123,11 @@ static int __init __independent_wallcloc | |
8842 | } | |
8843 | __setup("independent_wallclock", __independent_wallclock); | |
8844 | ||
8845 | +int xen_independent_wallclock(void) | |
8846 | +{ | |
8847 | + return independent_wallclock; | |
8848 | +} | |
8849 | + | |
8850 | /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */ | |
8851 | static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */ | |
8852 | static int __init __permitted_clock_jitter(char *str) | |
8853 | @@ -223,7 +197,6 @@ static inline u64 get64(volatile u64 *pt | |
8854 | return cmpxchg64(ptr, 0, 0); | |
8855 | #else | |
8856 | return *ptr; | |
8857 | -#define cmpxchg64 cmpxchg | |
8858 | #endif | |
8859 | } | |
8860 | ||
8861 | @@ -233,7 +206,6 @@ static inline u64 get64_local(volatile u | |
8862 | return cmpxchg64_local(ptr, 0, 0); | |
8863 | #else | |
8864 | return *ptr; | |
8865 | -#define cmpxchg64_local cmpxchg_local | |
8866 | #endif | |
8867 | } | |
8868 | ||
8869 | @@ -341,35 +313,6 @@ static inline int time_values_up_to_date | |
8870 | return (dst->version == src->version); | |
8871 | } | |
8872 | ||
8873 | -/* | |
8874 | - * This is a special lock that is owned by the CPU and holds the index | |
8875 | - * register we are working with. It is required for NMI access to the | |
8876 | - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. | |
8877 | - */ | |
8878 | -volatile unsigned long cmos_lock = 0; | |
8879 | -EXPORT_SYMBOL(cmos_lock); | |
8880 | - | |
8881 | -/* Routines for accessing the CMOS RAM/RTC. */ | |
8882 | -unsigned char rtc_cmos_read(unsigned char addr) | |
8883 | -{ | |
8884 | - unsigned char val; | |
8885 | - lock_cmos_prefix(addr); | |
8886 | - outb_p(addr, RTC_PORT(0)); | |
8887 | - val = inb_p(RTC_PORT(1)); | |
8888 | - lock_cmos_suffix(addr); | |
8889 | - return val; | |
8890 | -} | |
8891 | -EXPORT_SYMBOL(rtc_cmos_read); | |
8892 | - | |
8893 | -void rtc_cmos_write(unsigned char val, unsigned char addr) | |
8894 | -{ | |
8895 | - lock_cmos_prefix(addr); | |
8896 | - outb_p(addr, RTC_PORT(0)); | |
8897 | - outb_p(val, RTC_PORT(1)); | |
8898 | - lock_cmos_suffix(addr); | |
8899 | -} | |
8900 | -EXPORT_SYMBOL(rtc_cmos_write); | |
8901 | - | |
8902 | static void sync_xen_wallclock(unsigned long dummy); | |
8903 | static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0); | |
8904 | static void sync_xen_wallclock(unsigned long dummy) | |
8905 | @@ -378,7 +321,8 @@ static void sync_xen_wallclock(unsigned | |
8906 | s64 nsec; | |
8907 | struct xen_platform_op op; | |
8908 | ||
8909 | - if (!ntp_synced() || independent_wallclock || !is_initial_xendomain()) | |
8910 | + BUG_ON(!is_initial_xendomain()); | |
8911 | + if (!ntp_synced() || independent_wallclock) | |
8912 | return; | |
8913 | ||
8914 | write_seqlock_irq(&xtime_lock); | |
8915 | @@ -401,23 +345,6 @@ static void sync_xen_wallclock(unsigned | |
8916 | mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ); | |
8917 | } | |
8918 | ||
8919 | -static int set_rtc_mmss(unsigned long nowtime) | |
8920 | -{ | |
8921 | - int retval; | |
8922 | - unsigned long flags; | |
8923 | - | |
8924 | - if (independent_wallclock || !is_initial_xendomain()) | |
8925 | - return 0; | |
8926 | - | |
8927 | - /* gets recalled with irq locally disabled */ | |
8928 | - /* XXX - does irqsave resolve this? -johnstul */ | |
8929 | - spin_lock_irqsave(&rtc_lock, flags); | |
8930 | - retval = set_wallclock(nowtime); | |
8931 | - spin_unlock_irqrestore(&rtc_lock, flags); | |
8932 | - | |
8933 | - return retval; | |
8934 | -} | |
8935 | - | |
8936 | static unsigned long long local_clock(void) | |
8937 | { | |
8938 | unsigned int cpu = get_cpu(); | |
8939 | @@ -500,28 +427,24 @@ unsigned long profile_pc(struct pt_regs | |
8940 | ||
8941 | #if defined(CONFIG_SMP) || defined(__x86_64__) | |
8942 | # ifdef __i386__ | |
8943 | - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) | |
8944 | + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) | |
8945 | # else | |
8946 | if (!user_mode(regs) | |
8947 | # endif | |
8948 | && in_lock_functions(pc)) { | |
8949 | # ifdef CONFIG_FRAME_POINTER | |
8950 | -# ifdef __i386__ | |
8951 | - return ((unsigned long *)regs->ebp)[1]; | |
8952 | -# else | |
8953 | - return ((unsigned long *)regs->rbp)[1]; | |
8954 | -# endif | |
8955 | + return ((unsigned long *)regs->bp)[1]; | |
8956 | # else | |
8957 | # ifdef __i386__ | |
8958 | - unsigned long *sp = (unsigned long *)®s->esp; | |
8959 | + unsigned long *sp = (unsigned long *)®s->sp; | |
8960 | # else | |
8961 | - unsigned long *sp = (unsigned long *)regs->rsp; | |
8962 | + unsigned long *sp = (unsigned long *)regs->sp; | |
8963 | # endif | |
8964 | ||
8965 | /* Return address is either directly at stack pointer | |
8966 | - or above a saved eflags. Eflags has bits 22-31 zero, | |
8967 | + or above a saved flags word. Eflags has bits 22-31 zero, | |
8968 | kernel addresses don't. */ | |
8969 | - if (sp[0] >> 22) | |
8970 | + if (sp[0] >> 22) | |
8971 | return sp[0]; | |
8972 | if (sp[1] >> 22) | |
8973 | return sp[1]; | |
8974 | @@ -750,25 +673,32 @@ static void init_missing_ticks_accountin | |
8975 | runstate->time[RUNSTATE_offline]; | |
8976 | } | |
8977 | ||
8978 | -/* not static: needed by APM */ | |
8979 | -unsigned long read_persistent_clock(void) | |
8980 | +unsigned long xen_read_persistent_clock(void) | |
8981 | { | |
8982 | - unsigned long retval; | |
8983 | - unsigned long flags; | |
8984 | - | |
8985 | - spin_lock_irqsave(&rtc_lock, flags); | |
8986 | + const shared_info_t *s = HYPERVISOR_shared_info; | |
8987 | + u32 version, sec, nsec; | |
8988 | + u64 delta; | |
8989 | ||
8990 | - retval = get_wallclock(); | |
8991 | + do { | |
8992 | + version = s->wc_version; | |
8993 | + rmb(); | |
8994 | + sec = s->wc_sec; | |
8995 | + nsec = s->wc_nsec; | |
8996 | + rmb(); | |
8997 | + } while ((s->wc_version & 1) | (version ^ s->wc_version)); | |
8998 | ||
8999 | - spin_unlock_irqrestore(&rtc_lock, flags); | |
9000 | + delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec; | |
9001 | + do_div(delta, NSEC_PER_SEC); | |
9002 | ||
9003 | - return retval; | |
9004 | + return delta; | |
9005 | } | |
9006 | ||
9007 | -int update_persistent_clock(struct timespec now) | |
9008 | +int xen_update_persistent_clock(void) | |
9009 | { | |
9010 | + if (!is_initial_xendomain()) | |
9011 | + return -1; | |
9012 | mod_timer(&sync_xen_wallclock_timer, jiffies + 1); | |
9013 | - return set_rtc_mmss(now.tv_sec); | |
9014 | + return 0; | |
9015 | } | |
9016 | ||
9017 | extern void (*late_time_init)(void); | |
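The new xen_read_persistent_clock() above follows the hypervisor's seqlock-style convention: wc_version is incremented before and after each wallclock update, so it is odd while Xen is mid-update, and the reader retries until it sees the same even version on both sides of the copy. The loop condition checks both cases at once; an annotated copy:

	do {
		version = s->wc_version;	/* snapshot the version   */
		rmb();				/* then read the payload  */
		sec  = s->wc_sec;
		nsec = s->wc_nsec;
		rmb();				/* then re-check version  */
	} while ((s->wc_version & 1) |		/* writer mid-update, or  */
		 (version ^ s->wc_version));	/* changed during read    */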
9018 | --- a/arch/x86/kernel/traps_32-xen.c | |
9019 | +++ b/arch/x86/kernel/traps_32-xen.c | |
9020 | @@ -79,7 +79,8 @@ char ignore_fpu_irq = 0; | |
9021 | * F0 0F bug workaround.. We have a special link segment | |
9022 | * for this. | |
9023 | */ | |
9024 | -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; | |
9025 | +gate_desc idt_table[256] | |
9026 | + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; | |
9027 | #endif | |
9028 | ||
9029 | asmlinkage void divide_error(void); | |
9030 | @@ -109,6 +110,34 @@ asmlinkage void machine_check(void); | |
9031 | int kstack_depth_to_print = 24; | |
9032 | static unsigned int code_bytes = 64; | |
9033 | ||
9034 | +void printk_address(unsigned long address, int reliable) | |
9035 | +{ | |
9036 | +#ifdef CONFIG_KALLSYMS | |
9037 | + unsigned long offset = 0, symsize; | |
9038 | + const char *symname; | |
9039 | + char *modname; | |
9040 | + char *delim = ":"; | |
9041 | + char namebuf[128]; | |
9042 | + char reliab[4] = ""; | |
9043 | + | |
9044 | + symname = kallsyms_lookup(address, &symsize, &offset, | |
9045 | + &modname, namebuf); | |
9046 | + if (!symname) { | |
9047 | + printk(" [<%08lx>]\n", address); | |
9048 | + return; | |
9049 | + } | |
9050 | + if (!reliable) | |
9051 | + strcpy(reliab, "? "); | |
9052 | + | |
9053 | + if (!modname) | |
9054 | + modname = delim = ""; | |
9055 | + printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", | |
9056 | + address, reliab, delim, modname, delim, symname, offset, symsize); | |
9057 | +#else | |
9058 | + printk(" [<%08lx>]\n", address); | |
9059 | +#endif | |
9060 | +} | |
9061 | + | |
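With CONFIG_KALLSYMS the new printk_address() resolves the symbol and prefixes entries it cannot vouch for with "? "; the two output shapes look like this (address and symbol invented for illustration):

	/* [<c0123456>] do_fork+0x2c/0x1a0		reliable entry     */
	/* [<c0123456>] ? do_fork+0x2c/0x1a0		guessed from stack */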
9062 | static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) | |
9063 | { | |
9064 | return p > (void *)tinfo && | |
9065 | @@ -122,48 +151,35 @@ struct stack_frame { | |
9066 | }; | |
9067 | ||
9068 | static inline unsigned long print_context_stack(struct thread_info *tinfo, | |
9069 | - unsigned long *stack, unsigned long ebp, | |
9070 | + unsigned long *stack, unsigned long bp, | |
9071 | const struct stacktrace_ops *ops, void *data) | |
9072 | { | |
9073 | -#ifdef CONFIG_FRAME_POINTER | |
9074 | - struct stack_frame *frame = (struct stack_frame *)ebp; | |
9075 | - while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) { | |
9076 | - struct stack_frame *next; | |
9077 | - unsigned long addr; | |
9078 | + struct stack_frame *frame = (struct stack_frame *)bp; | |
9079 | ||
9080 | - addr = frame->return_address; | |
9081 | - ops->address(data, addr); | |
9082 | - /* | |
9083 | - * break out of recursive entries (such as | |
9084 | - * end_of_stack_stop_unwind_function). Also, | |
9085 | - * we can never allow a frame pointer to | |
9086 | - * move downwards! | |
9087 | - */ | |
9088 | - next = frame->next_frame; | |
9089 | - if (next <= frame) | |
9090 | - break; | |
9091 | - frame = next; | |
9092 | - } | |
9093 | -#else | |
9094 | while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { | |
9095 | unsigned long addr; | |
9096 | ||
9097 | - addr = *stack++; | |
9098 | - if (__kernel_text_address(addr)) | |
9099 | - ops->address(data, addr); | |
9100 | + addr = *stack; | |
9101 | + if (__kernel_text_address(addr)) { | |
9102 | + if ((unsigned long) stack == bp + 4) { | |
9103 | + ops->address(data, addr, 1); | |
9104 | + frame = frame->next_frame; | |
9105 | + bp = (unsigned long) frame; | |
9106 | + } else { | |
9107 | + ops->address(data, addr, bp == 0); | |
9108 | + } | |
9109 | + } | |
9110 | + stack++; | |
9111 | } | |
9112 | -#endif | |
9113 | - return ebp; | |
9114 | + return bp; | |
9115 | } | |
9116 | ||
9117 | #define MSG(msg) ops->warning(data, msg) | |
9118 | ||
9119 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | |
9120 | - unsigned long *stack, | |
9121 | + unsigned long *stack, unsigned long bp, | |
9122 | const struct stacktrace_ops *ops, void *data) | |
9123 | { | |
9124 | - unsigned long ebp = 0; | |
9125 | - | |
9126 | if (!task) | |
9127 | task = current; | |
9128 | ||
9129 | @@ -171,17 +187,17 @@ void dump_trace(struct task_struct *task | |
9130 | unsigned long dummy; | |
9131 | stack = &dummy; | |
9132 | if (task != current) | |
9133 | - stack = (unsigned long *)task->thread.esp; | |
9134 | + stack = (unsigned long *)task->thread.sp; | |
9135 | } | |
9136 | ||
9137 | #ifdef CONFIG_FRAME_POINTER | |
9138 | - if (!ebp) { | |
9139 | + if (!bp) { | |
9140 | if (task == current) { | |
9141 | - /* Grab ebp right from our regs */ | |
9142 | - asm ("movl %%ebp, %0" : "=r" (ebp) : ); | |
9143 | + /* Grab bp right from our regs */ | |
9144 | + asm ("movl %%ebp, %0" : "=r" (bp) : ); | |
9145 | } else { | |
9146 | - /* ebp is the last reg pushed by switch_to */ | |
9147 | - ebp = *(unsigned long *) task->thread.esp; | |
9148 | + /* bp is the last reg pushed by switch_to */ | |
9149 | + bp = *(unsigned long *) task->thread.sp; | |
9150 | } | |
9151 | } | |
9152 | #endif | |
9153 | @@ -190,7 +206,7 @@ void dump_trace(struct task_struct *task | |
9154 | struct thread_info *context; | |
9155 | context = (struct thread_info *) | |
9156 | ((unsigned long)stack & (~(THREAD_SIZE - 1))); | |
9157 | - ebp = print_context_stack(context, stack, ebp, ops, data); | |
9158 | + bp = print_context_stack(context, stack, bp, ops, data); | |
9159 | /* Should be after the line below, but somewhere | |
9160 | in early boot context comes out corrupted and we | |
9161 | can't reference it -AK */ | |
9162 | @@ -225,9 +241,11 @@ static int print_trace_stack(void *data, | |
9163 | /* | |
9164 | * Print one address/symbol entries per line. | |
9165 | */ | |
9166 | -static void print_trace_address(void *data, unsigned long addr) | |
9167 | +static void print_trace_address(void *data, unsigned long addr, int reliable) | |
9168 | { | |
9169 | printk("%s [<%08lx>] ", (char *)data, addr); | |
9170 | + if (!reliable) | |
9171 | + printk("? "); | |
9172 | print_symbol("%s\n", addr); | |
9173 | touch_nmi_watchdog(); | |
9174 | } | |
9175 | @@ -241,32 +259,32 @@ static const struct stacktrace_ops print | |
9176 | ||
9177 | static void | |
9178 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | |
9179 | - unsigned long * stack, char *log_lvl) | |
9180 | + unsigned long *stack, unsigned long bp, char *log_lvl) | |
9181 | { | |
9182 | - dump_trace(task, regs, stack, &print_trace_ops, log_lvl); | |
9183 | + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); | |
9184 | printk("%s =======================\n", log_lvl); | |
9185 | } | |
9186 | ||
9187 | void show_trace(struct task_struct *task, struct pt_regs *regs, | |
9188 | - unsigned long * stack) | |
9189 | + unsigned long *stack, unsigned long bp) | |
9190 | { | |
9191 | - show_trace_log_lvl(task, regs, stack, ""); | |
9192 | + show_trace_log_lvl(task, regs, stack, bp, ""); | |
9193 | } | |
9194 | ||
9195 | static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | |
9196 | - unsigned long *esp, char *log_lvl) | |
9197 | + unsigned long *sp, unsigned long bp, char *log_lvl) | |
9198 | { | |
9199 | unsigned long *stack; | |
9200 | int i; | |
9201 | ||
9202 | - if (esp == NULL) { | |
9203 | + if (sp == NULL) { | |
9204 | if (task) | |
9205 | - esp = (unsigned long*)task->thread.esp; | |
9206 | + sp = (unsigned long*)task->thread.sp; | |
9207 | else | |
9208 | - esp = (unsigned long *)&esp; | |
9209 | + sp = (unsigned long *)&sp; | |
9210 | } | |
9211 | ||
9212 | - stack = esp; | |
9213 | + stack = sp; | |
9214 | for(i = 0; i < kstack_depth_to_print; i++) { | |
9215 | if (kstack_end(stack)) | |
9216 | break; | |
9217 | @@ -275,13 +293,13 @@ static void show_stack_log_lvl(struct ta | |
9218 | printk("%08lx ", *stack++); | |
9219 | } | |
9220 | printk("\n%sCall Trace:\n", log_lvl); | |
9221 | - show_trace_log_lvl(task, regs, esp, log_lvl); | |
9222 | + show_trace_log_lvl(task, regs, sp, bp, log_lvl); | |
9223 | } | |
9224 | ||
9225 | -void show_stack(struct task_struct *task, unsigned long *esp) | |
9226 | +void show_stack(struct task_struct *task, unsigned long *sp) | |
9227 | { | |
9228 | printk(" "); | |
9229 | - show_stack_log_lvl(task, NULL, esp, ""); | |
9230 | + show_stack_log_lvl(task, NULL, sp, 0, ""); | |
9231 | } | |
9232 | ||
9233 | /* | |
9234 | @@ -290,13 +308,19 @@ void show_stack(struct task_struct *task | |
9235 | void dump_stack(void) | |
9236 | { | |
9237 | unsigned long stack; | |
9238 | + unsigned long bp = 0; | |
9239 | + | |
9240 | +#ifdef CONFIG_FRAME_POINTER | |
9241 | + if (!bp) | |
9242 | + asm("movl %%ebp, %0" : "=r" (bp):); | |
9243 | +#endif | |
9244 | ||
9245 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | |
9246 | current->pid, current->comm, print_tainted(), | |
9247 | init_utsname()->release, | |
9248 | (int)strcspn(init_utsname()->version, " "), | |
9249 | init_utsname()->version); | |
9250 | - show_trace(current, NULL, &stack); | |
9251 | + show_trace(current, NULL, &stack, bp); | |
9252 | } | |
9253 | ||
9254 | EXPORT_SYMBOL(dump_stack); | |
9255 | @@ -315,30 +339,30 @@ void show_registers(struct pt_regs *regs | |
9256 | * time of the fault.. | |
9257 | */ | |
9258 | if (!user_mode_vm(regs)) { | |
9259 | - u8 *eip; | |
9260 | + u8 *ip; | |
9261 | unsigned int code_prologue = code_bytes * 43 / 64; | |
9262 | unsigned int code_len = code_bytes; | |
9263 | unsigned char c; | |
9264 | ||
9265 | printk("\n" KERN_EMERG "Stack: "); | |
9266 | - show_stack_log_lvl(NULL, regs, ®s->esp, KERN_EMERG); | |
9267 | + show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); | |
9268 | ||
9269 | printk(KERN_EMERG "Code: "); | |
9270 | ||
9271 | - eip = (u8 *)regs->eip - code_prologue; | |
9272 | - if (eip < (u8 *)PAGE_OFFSET || | |
9273 | - probe_kernel_address(eip, c)) { | |
9274 | + ip = (u8 *)regs->ip - code_prologue; | |
9275 | + if (ip < (u8 *)PAGE_OFFSET || | |
9276 | + probe_kernel_address(ip, c)) { | |
9277 | /* try starting at EIP */ | |
9278 | - eip = (u8 *)regs->eip; | |
9279 | + ip = (u8 *)regs->ip; | |
9280 | code_len = code_len - code_prologue + 1; | |
9281 | } | |
9282 | - for (i = 0; i < code_len; i++, eip++) { | |
9283 | - if (eip < (u8 *)PAGE_OFFSET || | |
9284 | - probe_kernel_address(eip, c)) { | |
9285 | + for (i = 0; i < code_len; i++, ip++) { | |
9286 | + if (ip < (u8 *)PAGE_OFFSET || | |
9287 | + probe_kernel_address(ip, c)) { | |
9288 | printk(" Bad EIP value."); | |
9289 | break; | |
9290 | } | |
9291 | - if (eip == (u8 *)regs->eip) | |
9292 | + if (ip == (u8 *)regs->ip) | |
9293 | printk("<%02x> ", c); | |
9294 | else | |
9295 | printk("%02x ", c); | |
9296 | @@ -347,18 +371,57 @@ void show_registers(struct pt_regs *regs | |
9297 | printk("\n"); | |
9298 | } | |
9299 | ||
9300 | -int is_valid_bugaddr(unsigned long eip) | |
9301 | +int is_valid_bugaddr(unsigned long ip) | |
9302 | { | |
9303 | unsigned short ud2; | |
9304 | ||
9305 | - if (eip < PAGE_OFFSET) | |
9306 | + if (ip < PAGE_OFFSET) | |
9307 | return 0; | |
9308 | - if (probe_kernel_address((unsigned short *)eip, ud2)) | |
9309 | + if (probe_kernel_address((unsigned short *)ip, ud2)) | |
9310 | return 0; | |
9311 | ||
9312 | return ud2 == 0x0b0f; | |
9313 | } | |
9314 | ||
9315 | +static int die_counter; | |
9316 | + | |
9317 | +int __kprobes __die(const char * str, struct pt_regs * regs, long err) | |
9318 | +{ | |
9319 | + unsigned long sp; | |
9320 | + unsigned short ss; | |
9321 | + | |
9322 | + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); | |
9323 | +#ifdef CONFIG_PREEMPT | |
9324 | + printk("PREEMPT "); | |
9325 | +#endif | |
9326 | +#ifdef CONFIG_SMP | |
9327 | + printk("SMP "); | |
9328 | +#endif | |
9329 | +#ifdef CONFIG_DEBUG_PAGEALLOC | |
9330 | + printk("DEBUG_PAGEALLOC"); | |
9331 | +#endif | |
9332 | + printk("\n"); | |
9333 | + | |
9334 | + if (notify_die(DIE_OOPS, str, regs, err, | |
9335 | + current->thread.trap_no, SIGSEGV) != | |
9336 | + NOTIFY_STOP) { | |
9337 | + show_registers(regs); | |
9338 | + /* Executive summary in case the oops scrolled away */ | |
9339 | + sp = (unsigned long) (®s->sp); | |
9340 | + savesegment(ss, ss); | |
9341 | + if (user_mode(regs)) { | |
9342 | + sp = regs->sp; | |
9343 | + ss = regs->ss & 0xffff; | |
9344 | + } | |
9345 | + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); | |
9346 | + print_symbol("%s", regs->ip); | |
9347 | + printk(" SS:ESP %04x:%08lx\n", ss, sp); | |
9348 | + return 0; | |
9349 | + } else { | |
9350 | + return 1; | |
9351 | + } | |
9352 | +} | |
9353 | + | |
9354 | /* | |
9355 | * This is gone through when something in the kernel has done something bad and | |
9356 | * is about to be terminated. | |
9357 | @@ -374,7 +437,6 @@ void die(const char * str, struct pt_reg | |
9358 | .lock_owner = -1, | |
9359 | .lock_owner_depth = 0 | |
9360 | }; | |
9361 | - static int die_counter; | |
9362 | unsigned long flags; | |
9363 | ||
9364 | oops_enter(); | |
9365 | @@ -390,43 +452,13 @@ void die(const char * str, struct pt_reg | |
9366 | raw_local_irq_save(flags); | |
9367 | ||
9368 | if (++die.lock_owner_depth < 3) { | |
9369 | - unsigned long esp; | |
9370 | - unsigned short ss; | |
9371 | - | |
9372 | - report_bug(regs->eip, regs); | |
9373 | - | |
9374 | - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, | |
9375 | - ++die_counter); | |
9376 | -#ifdef CONFIG_PREEMPT | |
9377 | - printk("PREEMPT "); | |
9378 | -#endif | |
9379 | -#ifdef CONFIG_SMP | |
9380 | - printk("SMP "); | |
9381 | -#endif | |
9382 | -#ifdef CONFIG_DEBUG_PAGEALLOC | |
9383 | - printk("DEBUG_PAGEALLOC"); | |
9384 | -#endif | |
9385 | - printk("\n"); | |
9386 | + report_bug(regs->ip, regs); | |
9387 | ||
9388 | - if (notify_die(DIE_OOPS, str, regs, err, | |
9389 | - current->thread.trap_no, SIGSEGV) != | |
9390 | - NOTIFY_STOP) { | |
9391 | - show_registers(regs); | |
9392 | - /* Executive summary in case the oops scrolled away */ | |
9393 | - esp = (unsigned long) (®s->esp); | |
9394 | - savesegment(ss, ss); | |
9395 | - if (user_mode(regs)) { | |
9396 | - esp = regs->esp; | |
9397 | - ss = regs->xss & 0xffff; | |
9398 | - } | |
9399 | - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip); | |
9400 | - print_symbol("%s", regs->eip); | |
9401 | - printk(" SS:ESP %04x:%08lx\n", ss, esp); | |
9402 | - } | |
9403 | - else | |
9404 | + if (__die(str, regs, err)) | |
9405 | regs = NULL; | |
9406 | - } else | |
9407 | + } else { | |
9408 | printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); | |
9409 | + } | |
9410 | ||
9411 | bust_spinlocks(0); | |
9412 | die.lock_owner = -1; | |
9413 | @@ -462,7 +494,7 @@ static void __kprobes do_trap(int trapnr | |
9414 | { | |
9415 | struct task_struct *tsk = current; | |
9416 | ||
9417 | - if (regs->eflags & VM_MASK) { | |
9418 | + if (regs->flags & VM_MASK) { | |
9419 | if (vm86) | |
9420 | goto vm86_trap; | |
9421 | goto trap_signal; | |
9422 | @@ -508,7 +540,7 @@ static void __kprobes do_trap(int trapnr | |
9423 | } | |
9424 | ||
9425 | #define DO_ERROR(trapnr, signr, str, name) \ | |
9426 | -fastcall void do_##name(struct pt_regs * regs, long error_code) \ | |
9427 | +void do_##name(struct pt_regs * regs, long error_code) \ | |
9428 | { \ | |
9429 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | |
9430 | == NOTIFY_STOP) \ | |
9431 | @@ -517,7 +549,7 @@ fastcall void do_##name(struct pt_regs * | |
9432 | } | |
9433 | ||
9434 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ | |
9435 | -fastcall void do_##name(struct pt_regs * regs, long error_code) \ | |
9436 | +void do_##name(struct pt_regs * regs, long error_code) \ | |
9437 | { \ | |
9438 | siginfo_t info; \ | |
9439 | if (irq) \ | |
9440 | @@ -533,7 +565,7 @@ fastcall void do_##name(struct pt_regs * | |
9441 | } | |
9442 | ||
9443 | #define DO_VM86_ERROR(trapnr, signr, str, name) \ | |
9444 | -fastcall void do_##name(struct pt_regs * regs, long error_code) \ | |
9445 | +void do_##name(struct pt_regs * regs, long error_code) \ | |
9446 | { \ | |
9447 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | |
9448 | == NOTIFY_STOP) \ | |
9449 | @@ -542,7 +574,7 @@ fastcall void do_##name(struct pt_regs * | |
9450 | } | |
9451 | ||
9452 | #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | |
9453 | -fastcall void do_##name(struct pt_regs * regs, long error_code) \ | |
9454 | +void do_##name(struct pt_regs * regs, long error_code) \ | |
9455 | { \ | |
9456 | siginfo_t info; \ | |
9457 | info.si_signo = signr; \ | |
9458 | @@ -556,13 +588,13 @@ fastcall void do_##name(struct pt_regs * | |
9459 | do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ | |
9460 | } | |
9461 | ||
9462 | -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) | |
9463 | +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) | |
9464 | #ifndef CONFIG_KPROBES | |
9465 | DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) | |
9466 | #endif | |
9467 | DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) | |
9468 | DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) | |
9469 | -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) | |
9470 | +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) | |
9471 | DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | |
9472 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | |
9473 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | |
9474 | @@ -570,10 +602,10 @@ DO_ERROR(12, SIGBUS, "stack segment", s | |
9475 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) | |
9476 | DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) | |
9477 | ||
9478 | -fastcall void __kprobes do_general_protection(struct pt_regs * regs, | |
9479 | +void __kprobes do_general_protection(struct pt_regs * regs, | |
9480 | long error_code) | |
9481 | { | |
9482 | - if (regs->eflags & VM_MASK) | |
9483 | + if (regs->flags & VM_MASK) | |
9484 | goto gp_in_vm86; | |
9485 | ||
9486 | if (!user_mode(regs)) | |
9487 | @@ -582,11 +614,14 @@ fastcall void __kprobes do_general_prote | |
9488 | current->thread.error_code = error_code; | |
9489 | current->thread.trap_no = 13; | |
9490 | if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && | |
9491 | - printk_ratelimit()) | |
9492 | + printk_ratelimit()) { | |
9493 | printk(KERN_INFO | |
9494 | - "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", | |
9495 | + "%s[%d] general protection ip:%lx sp:%lx error:%lx", | |
9496 | current->comm, task_pid_nr(current), | |
9497 | - regs->eip, regs->esp, error_code); | |
9498 | + regs->ip, regs->sp, error_code); | |
9499 | + print_vma_addr(" in ", regs->ip); | |
9500 | + printk("\n"); | |
9501 | + } | |
9502 | ||
9503 | force_sig(SIGSEGV, current); | |
9504 | return; | |
9505 | @@ -675,8 +710,8 @@ void __kprobes die_nmi(struct pt_regs *r | |
9506 | */ | |
9507 | bust_spinlocks(1); | |
9508 | printk(KERN_EMERG "%s", msg); | |
9509 | - printk(" on CPU%d, eip %08lx, registers:\n", | |
9510 | - smp_processor_id(), regs->eip); | |
9511 | + printk(" on CPU%d, ip %08lx, registers:\n", | |
9512 | + smp_processor_id(), regs->ip); | |
9513 | show_registers(regs); | |
9514 | console_silent(); | |
9515 | spin_unlock(&nmi_print_lock); | |
9516 | @@ -733,7 +768,7 @@ static __kprobes void default_do_nmi(str | |
9517 | ||
9518 | static int ignore_nmis; | |
9519 | ||
9520 | -fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) | |
9521 | +__kprobes void do_nmi(struct pt_regs * regs, long error_code) | |
9522 | { | |
9523 | int cpu; | |
9524 | ||
9525 | @@ -762,7 +797,7 @@ void restart_nmi(void) | |
9526 | } | |
9527 | ||
9528 | #ifdef CONFIG_KPROBES | |
9529 | -fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) | |
9530 | +void __kprobes do_int3(struct pt_regs *regs, long error_code) | |
9531 | { | |
9532 | trace_hardirqs_fixup(); | |
9533 | ||
9534 | @@ -798,7 +833,7 @@ fastcall void __kprobes do_int3(struct p | |
9535 | * find every occurrence of the TF bit that could be saved away even | |
9536 | * by user code) | |
9537 | */ | |
9538 | -fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) | |
9539 | +void __kprobes do_debug(struct pt_regs * regs, long error_code) | |
9540 | { | |
9541 | unsigned int condition; | |
9542 | struct task_struct *tsk = current; | |
9543 | @@ -807,24 +842,30 @@ fastcall void __kprobes do_debug(struct | |
9544 | ||
9545 | get_debugreg(condition, 6); | |
9546 | ||
9547 | + /* | |
9548 | + * The processor cleared BTF, so don't mark that we need it set. | |
9549 | + */ | |
9550 | + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); | |
9551 | + tsk->thread.debugctlmsr = 0; | |
9552 | + | |
9553 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | |
9554 | SIGTRAP) == NOTIFY_STOP) | |
9555 | return; | |
9556 | /* It's safe to allow irq's after DR6 has been saved */ | |
9557 | - if (regs->eflags & X86_EFLAGS_IF) | |
9558 | + if (regs->flags & X86_EFLAGS_IF) | |
9559 | local_irq_enable(); | |
9560 | ||
9561 | /* Mask out spurious debug traps due to lazy DR7 setting */ | |
9562 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | |
9563 | - if (!tsk->thread.debugreg[7]) | |
9564 | + if (!tsk->thread.debugreg7) | |
9565 | goto clear_dr7; | |
9566 | } | |
9567 | ||
9568 | - if (regs->eflags & VM_MASK) | |
9569 | + if (regs->flags & VM_MASK) | |
9570 | goto debug_vm86; | |
9571 | ||
9572 | /* Save debug status register where ptrace can see it */ | |
9573 | - tsk->thread.debugreg[6] = condition; | |
9574 | + tsk->thread.debugreg6 = condition; | |
9575 | ||
9576 | /* | |
9577 | * Single-stepping through TF: make sure we ignore any events in | |
9578 | @@ -856,7 +897,7 @@ debug_vm86: | |
9579 | ||
9580 | clear_TF_reenable: | |
9581 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | |
9582 | - regs->eflags &= ~TF_MASK; | |
9583 | + regs->flags &= ~TF_MASK; | |
9584 | return; | |
9585 | } | |
9586 | ||
9587 | @@ -865,7 +906,7 @@ clear_TF_reenable: | |
9588 | * the correct behaviour even in the presence of the asynchronous | |
9589 | * IRQ13 behaviour | |
9590 | */ | |
9591 | -void math_error(void __user *eip) | |
9592 | +void math_error(void __user *ip) | |
9593 | { | |
9594 | struct task_struct * task; | |
9595 | siginfo_t info; | |
9596 | @@ -881,7 +922,7 @@ void math_error(void __user *eip) | |
9597 | info.si_signo = SIGFPE; | |
9598 | info.si_errno = 0; | |
9599 | info.si_code = __SI_FAULT; | |
9600 | - info.si_addr = eip; | |
9601 | + info.si_addr = ip; | |
9602 | /* | |
9603 | * (~cwd & swd) will mask out exceptions that are not set to unmasked | |
9604 | * status. 0x3f is the exception bits in these regs, 0x200 is the | |
9605 | @@ -924,13 +965,13 @@ void math_error(void __user *eip) | |
9606 | force_sig_info(SIGFPE, &info, task); | |
9607 | } | |
9608 | ||
9609 | -fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) | |
9610 | +void do_coprocessor_error(struct pt_regs * regs, long error_code) | |
9611 | { | |
9612 | ignore_fpu_irq = 1; | |
9613 | - math_error((void __user *)regs->eip); | |
9614 | + math_error((void __user *)regs->ip); | |
9615 | } | |
9616 | ||
9617 | -static void simd_math_error(void __user *eip) | |
9618 | +static void simd_math_error(void __user *ip) | |
9619 | { | |
9620 | struct task_struct * task; | |
9621 | siginfo_t info; | |
9622 | @@ -946,7 +987,7 @@ static void simd_math_error(void __user | |
9623 | info.si_signo = SIGFPE; | |
9624 | info.si_errno = 0; | |
9625 | info.si_code = __SI_FAULT; | |
9626 | - info.si_addr = eip; | |
9627 | + info.si_addr = ip; | |
9628 | /* | |
9629 | * The SIMD FPU exceptions are handled a little differently, as there | |
9630 | * is only a single status/control register. Thus, to determine which | |
9631 | @@ -978,19 +1019,19 @@ static void simd_math_error(void __user | |
9632 | force_sig_info(SIGFPE, &info, task); | |
9633 | } | |
9634 | ||
9635 | -fastcall void do_simd_coprocessor_error(struct pt_regs * regs, | |
9636 | +void do_simd_coprocessor_error(struct pt_regs * regs, | |
9637 | long error_code) | |
9638 | { | |
9639 | if (cpu_has_xmm) { | |
9640 | /* Handle SIMD FPU exceptions on PIII+ processors. */ | |
9641 | ignore_fpu_irq = 1; | |
9642 | - simd_math_error((void __user *)regs->eip); | |
9643 | + simd_math_error((void __user *)regs->ip); | |
9644 | } else { | |
9645 | /* | |
9646 | * Handle strange cache flush from user space exception | |
9647 | * in all other cases. This is undocumented behaviour. | |
9648 | */ | |
9649 | - if (regs->eflags & VM_MASK) { | |
9650 | + if (regs->flags & VM_MASK) { | |
9651 | handle_vm86_fault((struct kernel_vm86_regs *)regs, | |
9652 | error_code); | |
9653 | return; | |
9654 | @@ -1003,7 +1044,7 @@ fastcall void do_simd_coprocessor_error( | |
9655 | } | |
9656 | ||
9657 | #ifndef CONFIG_XEN | |
9658 | -fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, | |
9659 | +void do_spurious_interrupt_bug(struct pt_regs * regs, | |
9660 | long error_code) | |
9661 | { | |
9662 | #if 0 | |
9663 | @@ -1012,7 +1053,7 @@ fastcall void do_spurious_interrupt_bug( | |
9664 | #endif | |
9665 | } | |
9666 | ||
9667 | -fastcall unsigned long patch_espfix_desc(unsigned long uesp, | |
9668 | +unsigned long patch_espfix_desc(unsigned long uesp, | |
9669 | unsigned long kesp) | |
9670 | { | |
9671 | struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; | |
9672 | @@ -1072,7 +1113,7 @@ asmlinkage void math_emulate(long arg) | |
9673 | * NB. All these are "trap gates" (i.e. events_mask isn't set) except | |
9674 | * for those that specify <dpl>|4 in the second field. | |
9675 | */ | |
9676 | -static trap_info_t __cpuinitdata trap_table[] = { | |
9677 | +static const trap_info_t __cpuinitconst trap_table[] = { | |
9678 | { 0, 0, __KERNEL_CS, (unsigned long)divide_error }, | |
9679 | { 1, 0|4, __KERNEL_CS, (unsigned long)debug }, | |
9680 | { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }, | |
9681 | @@ -1105,17 +1146,12 @@ void __init trap_init(void) | |
9682 | if (ret) | |
9683 | printk("HYPERVISOR_set_trap_table failed: error %d\n", ret); | |
9684 | ||
9685 | + /* | |
9686 | + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. | |
9687 | + * Generate a build-time error if the alignment is wrong. | |
9688 | + */ | |
9689 | + BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15); | |
9690 | if (cpu_has_fxsr) { | |
9691 | - /* | |
9692 | - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. | |
9693 | - * Generates a compile-time "error: zero width for bit-field" if | |
9694 | - * the alignment is wrong. | |
9695 | - */ | |
9696 | - struct fxsrAlignAssert { | |
9697 | - int _:!(offsetof(struct task_struct, | |
9698 | - thread.i387.fxsave) & 15); | |
9699 | - }; | |
9700 | - | |
9701 | printk(KERN_INFO "Enabling fast FPU save and restore... "); | |
9702 | set_in_cr4(X86_CR4_OSFXSR); | |
9703 | printk("done.\n"); | |
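
Note on the stack walker rewrite above: print_context_stack() now scans every word of the stack but still distinguishes trustworthy entries. A text address found exactly one word above the current frame pointer (bp + 4 on 32-bit) is the return_address slot of a conventional frame, so it is reported reliable; every other hit is passed with reliable == 0 and printed with a "? " prefix by printk_address(). The same frame-chain walk can be exercised in user space; a standalone sketch, assuming gcc with frame pointers kept (-O0 -fno-omit-frame-pointer):

    #include <stdio.h>

    struct stack_frame {                 /* same layout as in the patch */
        struct stack_frame *next_frame;  /* saved caller frame pointer  */
        unsigned long return_address;    /* pushed by the CALL          */
    };

    static void dump_frames(void)
    {
        struct stack_frame *frame = __builtin_frame_address(0);

        /* As in the kernel walker, a frame pointer may never move
         * downwards; stop when the chain ends or goes backwards. */
        while (frame && frame->next_frame > frame) {
            printf("  [<%0*lx>]\n", (int)(2 * sizeof(long)),
                   frame->return_address);
            frame = frame->next_frame;
        }
    }

    int main(void)
    {
        dump_frames();
        return 0;
    }
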
9704 | --- a/arch/x86/kernel/traps_64-xen.c | |
9705 | +++ b/arch/x86/kernel/traps_64-xen.c | |
9706 | @@ -74,38 +74,41 @@ asmlinkage void alignment_check(void); | |
9707 | asmlinkage void machine_check(void); | |
9708 | asmlinkage void spurious_interrupt_bug(void); | |
9709 | ||
9710 | +static unsigned int code_bytes = 64; | |
9711 | + | |
9712 | static inline void conditional_sti(struct pt_regs *regs) | |
9713 | { | |
9714 | - if (regs->eflags & X86_EFLAGS_IF) | |
9715 | + if (regs->flags & X86_EFLAGS_IF) | |
9716 | local_irq_enable(); | |
9717 | } | |
9718 | ||
9719 | static inline void preempt_conditional_sti(struct pt_regs *regs) | |
9720 | { | |
9721 | - preempt_disable(); | |
9722 | - if (regs->eflags & X86_EFLAGS_IF) | |
9723 | + inc_preempt_count(); | |
9724 | + if (regs->flags & X86_EFLAGS_IF) | |
9725 | local_irq_enable(); | |
9726 | } | |
9727 | ||
9728 | static inline void preempt_conditional_cli(struct pt_regs *regs) | |
9729 | { | |
9730 | - if (regs->eflags & X86_EFLAGS_IF) | |
9731 | + if (regs->flags & X86_EFLAGS_IF) | |
9732 | local_irq_disable(); | |
9733 | /* Make sure to not schedule here because we could be running | |
9734 | on an exception stack. */ | |
9735 | - preempt_enable_no_resched(); | |
9736 | + dec_preempt_count(); | |
9737 | } | |
9738 | ||
9739 | int kstack_depth_to_print = 12; | |
9740 | ||
9741 | -#ifdef CONFIG_KALLSYMS | |
9742 | -void printk_address(unsigned long address) | |
9743 | +void printk_address(unsigned long address, int reliable) | |
9744 | { | |
9745 | +#ifdef CONFIG_KALLSYMS | |
9746 | unsigned long offset = 0, symsize; | |
9747 | const char *symname; | |
9748 | char *modname; | |
9749 | char *delim = ":"; | |
9750 | - char namebuf[128]; | |
9751 | + char namebuf[KSYM_NAME_LEN]; | |
9752 | + char reliab[4] = ""; | |
9753 | ||
9754 | symname = kallsyms_lookup(address, &symsize, &offset, | |
9755 | &modname, namebuf); | |
9756 | @@ -113,17 +116,17 @@ void printk_address(unsigned long addres | |
9757 | printk(" [<%016lx>]\n", address); | |
9758 | return; | |
9759 | } | |
9760 | + if (!reliable) | |
9761 | + strcpy(reliab, "? "); | |
9762 | + | |
9763 | if (!modname) | |
9764 | - modname = delim = ""; | |
9765 | - printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", | |
9766 | - address, delim, modname, delim, symname, offset, symsize); | |
9767 | -} | |
9768 | + modname = delim = ""; | |
9769 | + printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n", | |
9770 | + address, reliab, delim, modname, delim, symname, offset, symsize); | |
9771 | #else | |
9772 | -void printk_address(unsigned long address) | |
9773 | -{ | |
9774 | printk(" [<%016lx>]\n", address); | |
9775 | -} | |
9776 | #endif | |
9777 | +} | |
9778 | ||
9779 | static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | |
9780 | unsigned *usedp, char **idp) | |
9781 | @@ -210,14 +213,53 @@ static unsigned long *in_exception_stack | |
9782 | * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack | |
9783 | */ | |
9784 | ||
9785 | -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) | |
9786 | +static inline int valid_stack_ptr(struct thread_info *tinfo, | |
9787 | + void *p, unsigned int size, void *end) | |
9788 | { | |
9789 | - void *t = (void *)tinfo; | |
9790 | - return p > t && p < t + THREAD_SIZE - 3; | |
9791 | + void *t = tinfo; | |
9792 | + if (end) { | |
9793 | + if (p < end && p >= (end-THREAD_SIZE)) | |
9794 | + return 1; | |
9795 | + else | |
9796 | + return 0; | |
9797 | + } | |
9798 | + return p > t && p < t + THREAD_SIZE - size; | |
9799 | +} | |
9800 | + | |
9801 | +/* The form of the top of the frame on the stack */ | |
9802 | +struct stack_frame { | |
9803 | + struct stack_frame *next_frame; | |
9804 | + unsigned long return_address; | |
9805 | +}; | |
9806 | + | |
9807 | + | |
9808 | +static inline unsigned long print_context_stack(struct thread_info *tinfo, | |
9809 | + unsigned long *stack, unsigned long bp, | |
9810 | + const struct stacktrace_ops *ops, void *data, | |
9811 | + unsigned long *end) | |
9812 | +{ | |
9813 | + struct stack_frame *frame = (struct stack_frame *)bp; | |
9814 | + | |
9815 | + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { | |
9816 | + unsigned long addr; | |
9817 | + | |
9818 | + addr = *stack; | |
9819 | + if (__kernel_text_address(addr)) { | |
9820 | + if ((unsigned long) stack == bp + 8) { | |
9821 | + ops->address(data, addr, 1); | |
9822 | + frame = frame->next_frame; | |
9823 | + bp = (unsigned long) frame; | |
9824 | + } else { | |
9825 | + ops->address(data, addr, bp == 0); | |
9826 | + } | |
9827 | + } | |
9828 | + stack++; | |
9829 | + } | |
9830 | + return bp; | |
9831 | } | |
9832 | ||
9833 | void dump_trace(struct task_struct *tsk, struct pt_regs *regs, | |
9834 | - unsigned long *stack, | |
9835 | + unsigned long *stack, unsigned long bp, | |
9836 | const struct stacktrace_ops *ops, void *data) | |
9837 | { | |
9838 | const unsigned cpu = get_cpu(); | |
9839 | @@ -227,36 +269,28 @@ void dump_trace(struct task_struct *tsk, | |
9840 | ||
9841 | if (!tsk) | |
9842 | tsk = current; | |
9843 | + tinfo = task_thread_info(tsk); | |
9844 | ||
9845 | if (!stack) { | |
9846 | unsigned long dummy; | |
9847 | stack = &dummy; | |
9848 | if (tsk && tsk != current) | |
9849 | - stack = (unsigned long *)tsk->thread.rsp; | |
9850 | + stack = (unsigned long *)tsk->thread.sp; | |
9851 | } | |
9852 | ||
9853 | - /* | |
9854 | - * Print function call entries within a stack. 'cond' is the | |
9855 | - * "end of stackframe" condition, that the 'stack++' | |
9856 | - * iteration will eventually trigger. | |
9857 | - */ | |
9858 | -#define HANDLE_STACK(cond) \ | |
9859 | - do while (cond) { \ | |
9860 | - unsigned long addr = *stack++; \ | |
9861 | - /* Use unlocked access here because except for NMIs \ | |
9862 | - we should be already protected against module unloads */ \ | |
9863 | - if (__kernel_text_address(addr)) { \ | |
9864 | - /* \ | |
9865 | - * If the address is either in the text segment of the \ | |
9866 | - * kernel, or in the region which contains vmalloc'ed \ | |
9867 | - * memory, it *may* be the address of a calling \ | |
9868 | - * routine; if so, print it so that someone tracing \ | |
9869 | - * down the cause of the crash will be able to figure \ | |
9870 | - * out the call path that was taken. \ | |
9871 | - */ \ | |
9872 | - ops->address(data, addr); \ | |
9873 | - } \ | |
9874 | - } while (0) | |
9875 | +#ifdef CONFIG_FRAME_POINTER | |
9876 | + if (!bp) { | |
9877 | + if (tsk == current) { | |
9878 | + /* Grab bp right from our regs */ | |
9879 | + asm("movq %%rbp, %0" : "=r" (bp):); | |
9880 | + } else { | |
9881 | + /* bp is the last reg pushed by switch_to */ | |
9882 | + bp = *(unsigned long *) tsk->thread.sp; | |
9883 | + } | |
9884 | + } | |
9885 | +#endif | |
9886 | + | |
9887 | + | |
9888 | ||
9889 | /* | |
9890 | * Print function call entries in all stacks, starting at the | |
9891 | @@ -272,7 +306,9 @@ void dump_trace(struct task_struct *tsk, | |
9892 | if (estack_end) { | |
9893 | if (ops->stack(data, id) < 0) | |
9894 | break; | |
9895 | - HANDLE_STACK (stack < estack_end); | |
9896 | + | |
9897 | + bp = print_context_stack(tinfo, stack, bp, ops, | |
9898 | + data, estack_end); | |
9899 | ops->stack(data, "<EOE>"); | |
9900 | /* | |
9901 | * We link to the next stack via the | |
9902 | @@ -290,7 +326,8 @@ void dump_trace(struct task_struct *tsk, | |
9903 | if (stack >= irqstack && stack < irqstack_end) { | |
9904 | if (ops->stack(data, "IRQ") < 0) | |
9905 | break; | |
9906 | - HANDLE_STACK (stack < irqstack_end); | |
9907 | + bp = print_context_stack(tinfo, stack, bp, | |
9908 | + ops, data, irqstack_end); | |
9909 | /* | |
9910 | * We link to the next stack (which would be | |
9911 | * the process stack normally) the last | |
9912 | @@ -308,9 +345,7 @@ void dump_trace(struct task_struct *tsk, | |
9913 | /* | |
9914 | * This handles the process stack: | |
9915 | */ | |
9916 | - tinfo = task_thread_info(tsk); | |
9917 | - HANDLE_STACK (valid_stack_ptr(tinfo, stack)); | |
9918 | -#undef HANDLE_STACK | |
9919 | + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); | |
9920 | put_cpu(); | |
9921 | } | |
9922 | EXPORT_SYMBOL(dump_trace); | |
9923 | @@ -333,10 +368,10 @@ static int print_trace_stack(void *data, | |
9924 | return 0; | |
9925 | } | |
9926 | ||
9927 | -static void print_trace_address(void *data, unsigned long addr) | |
9928 | +static void print_trace_address(void *data, unsigned long addr, int reliable) | |
9929 | { | |
9930 | touch_nmi_watchdog(); | |
9931 | - printk_address(addr); | |
9932 | + printk_address(addr, reliable); | |
9933 | } | |
9934 | ||
9935 | static const struct stacktrace_ops print_trace_ops = { | |
9936 | @@ -347,15 +382,17 @@ static const struct stacktrace_ops print | |
9937 | }; | |
9938 | ||
9939 | void | |
9940 | -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) | |
9941 | +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, | |
9942 | + unsigned long bp) | |
9943 | { | |
9944 | printk("\nCall Trace:\n"); | |
9945 | - dump_trace(tsk, regs, stack, &print_trace_ops, NULL); | |
9946 | + dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); | |
9947 | printk("\n"); | |
9948 | } | |
9949 | ||
9950 | static void | |
9951 | -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) | |
9952 | +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, | |
9953 | + unsigned long bp) | |
9954 | { | |
9955 | unsigned long *stack; | |
9956 | int i; | |
9957 | @@ -366,14 +403,14 @@ _show_stack(struct task_struct *tsk, str | |
9958 | // debugging aid: "show_stack(NULL, NULL);" prints the | |
9959 | // back trace for this cpu. | |
9960 | ||
9961 | - if (rsp == NULL) { | |
9962 | + if (sp == NULL) { | |
9963 | if (tsk) | |
9964 | - rsp = (unsigned long *)tsk->thread.rsp; | |
9965 | + sp = (unsigned long *)tsk->thread.sp; | |
9966 | else | |
9967 | - rsp = (unsigned long *)&rsp; | |
9968 | + sp = (unsigned long *)&sp; | |
9969 | } | |
9970 | ||
9971 | - stack = rsp; | |
9972 | + stack = sp; | |
9973 | for(i=0; i < kstack_depth_to_print; i++) { | |
9974 | if (stack >= irqstack && stack <= irqstack_end) { | |
9975 | if (stack == irqstack_end) { | |
9976 | @@ -389,12 +426,12 @@ _show_stack(struct task_struct *tsk, str | |
9977 | printk(" %016lx", *stack++); | |
9978 | touch_nmi_watchdog(); | |
9979 | } | |
9980 | - show_trace(tsk, regs, rsp); | |
9981 | + show_trace(tsk, regs, sp, bp); | |
9982 | } | |
9983 | ||
9984 | -void show_stack(struct task_struct *tsk, unsigned long * rsp) | |
9985 | +void show_stack(struct task_struct *tsk, unsigned long * sp) | |
9986 | { | |
9987 | - _show_stack(tsk, NULL, rsp); | |
9988 | + _show_stack(tsk, NULL, sp, 0); | |
9989 | } | |
9990 | ||
9991 | /* | |
9992 | @@ -403,13 +440,19 @@ void show_stack(struct task_struct *tsk, | |
9993 | void dump_stack(void) | |
9994 | { | |
9995 | unsigned long dummy; | |
9996 | + unsigned long bp = 0; | |
9997 | + | |
9998 | +#ifdef CONFIG_FRAME_POINTER | |
9999 | + if (!bp) | |
10000 | + asm("movq %%rbp, %0" : "=r" (bp):); | |
10001 | +#endif | |
10002 | ||
10003 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | |
10004 | current->pid, current->comm, print_tainted(), | |
10005 | init_utsname()->release, | |
10006 | (int)strcspn(init_utsname()->version, " "), | |
10007 | init_utsname()->version); | |
10008 | - show_trace(NULL, NULL, &dummy); | |
10009 | + show_trace(NULL, NULL, &dummy, bp); | |
10010 | } | |
10011 | ||
10012 | EXPORT_SYMBOL(dump_stack); | |
10013 | @@ -417,12 +460,15 @@ EXPORT_SYMBOL(dump_stack); | |
10014 | void show_registers(struct pt_regs *regs) | |
10015 | { | |
10016 | int i; | |
10017 | - int in_kernel = !user_mode(regs); | |
10018 | - unsigned long rsp; | |
10019 | + unsigned long sp; | |
10020 | const int cpu = smp_processor_id(); | |
10021 | struct task_struct *cur = cpu_pda(cpu)->pcurrent; | |
10022 | + u8 *ip; | |
10023 | + unsigned int code_prologue = code_bytes * 43 / 64; | |
10024 | + unsigned int code_len = code_bytes; | |
10025 | ||
10026 | - rsp = regs->rsp; | |
10027 | + sp = regs->sp; | |
10028 | + ip = (u8 *) regs->ip - code_prologue; | |
10029 | printk("CPU %d ", cpu); | |
10030 | __show_regs(regs); | |
10031 | printk("Process %s (pid: %d, threadinfo %p, task %p)\n", | |
10032 | @@ -432,45 +478,43 @@ void show_registers(struct pt_regs *regs | |
10033 | * When in-kernel, we also print out the stack and code at the | |
10034 | * time of the fault.. | |
10035 | */ | |
10036 | - if (in_kernel) { | |
10037 | + if (!user_mode(regs)) { | |
10038 | + unsigned char c; | |
10039 | printk("Stack: "); | |
10040 | - _show_stack(NULL, regs, (unsigned long*)rsp); | |
10041 | + _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); | |
10042 | + printk("\n"); | |
10043 | ||
10044 | - printk("\nCode: "); | |
10045 | - if (regs->rip < PAGE_OFFSET) | |
10046 | - goto bad; | |
10047 | - | |
10048 | - for (i=0; i<20; i++) { | |
10049 | - unsigned char c; | |
10050 | - if (__get_user(c, &((unsigned char*)regs->rip)[i])) { | |
10051 | -bad: | |
10052 | + printk(KERN_EMERG "Code: "); | |
10053 | + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { | |
10054 | + /* try starting at RIP */ | |
10055 | + ip = (u8 *) regs->ip; | |
10056 | + code_len = code_len - code_prologue + 1; | |
10057 | + } | |
10058 | + for (i = 0; i < code_len; i++, ip++) { | |
10059 | + if (ip < (u8 *)PAGE_OFFSET || | |
10060 | + probe_kernel_address(ip, c)) { | |
10061 | printk(" Bad RIP value."); | |
10062 | break; | |
10063 | } | |
10064 | - printk("%02x ", c); | |
10065 | + if (ip == (u8 *)regs->ip) | |
10066 | + printk("<%02x> ", c); | |
10067 | + else | |
10068 | + printk("%02x ", c); | |
10069 | } | |
10070 | } | |
10071 | printk("\n"); | |
10072 | } | |
10073 | ||
10074 | -int is_valid_bugaddr(unsigned long rip) | |
10075 | +int is_valid_bugaddr(unsigned long ip) | |
10076 | { | |
10077 | unsigned short ud2; | |
10078 | ||
10079 | - if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) | |
10080 | + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) | |
10081 | return 0; | |
10082 | ||
10083 | return ud2 == 0x0b0f; | |
10084 | } | |
10085 | ||
10086 | -#ifdef CONFIG_BUG | |
10087 | -void out_of_line_bug(void) | |
10088 | -{ | |
10089 | - BUG(); | |
10090 | -} | |
10091 | -EXPORT_SYMBOL(out_of_line_bug); | |
10092 | -#endif | |
10093 | - | |
10094 | static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; | |
10095 | static int die_owner = -1; | |
10096 | static unsigned int die_nest_count; | |
10097 | @@ -498,7 +542,7 @@ unsigned __kprobes long oops_begin(void) | |
10098 | return flags; | |
10099 | } | |
10100 | ||
10101 | -void __kprobes oops_end(unsigned long flags) | |
10102 | +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | |
10103 | { | |
10104 | die_owner = -1; | |
10105 | bust_spinlocks(0); | |
10106 | @@ -507,12 +551,17 @@ void __kprobes oops_end(unsigned long fl | |
10107 | /* Nest count reaches zero, release the lock. */ | |
10108 | __raw_spin_unlock(&die_lock); | |
10109 | raw_local_irq_restore(flags); | |
10110 | + if (!regs) { | |
10111 | + oops_exit(); | |
10112 | + return; | |
10113 | + } | |
10114 | if (panic_on_oops) | |
10115 | panic("Fatal exception"); | |
10116 | oops_exit(); | |
10117 | + do_exit(signr); | |
10118 | } | |
10119 | ||
10120 | -void __kprobes __die(const char * str, struct pt_regs * regs, long err) | |
10121 | +int __kprobes __die(const char * str, struct pt_regs * regs, long err) | |
10122 | { | |
10123 | static int die_counter; | |
10124 | printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); | |
10125 | @@ -526,15 +575,17 @@ void __kprobes __die(const char * str, s | |
10126 | printk("DEBUG_PAGEALLOC"); | |
10127 | #endif | |
10128 | printk("\n"); | |
10129 | - notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); | |
10130 | + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | |
10131 | + return 1; | |
10132 | show_registers(regs); | |
10133 | add_taint(TAINT_DIE); | |
10134 | /* Executive summary in case the oops scrolled away */ | |
10135 | printk(KERN_ALERT "RIP "); | |
10136 | - printk_address(regs->rip); | |
10137 | - printk(" RSP <%016lx>\n", regs->rsp); | |
10138 | + printk_address(regs->ip, 1); | |
10139 | + printk(" RSP <%016lx>\n", regs->sp); | |
10140 | if (kexec_should_crash(current)) | |
10141 | crash_kexec(regs); | |
10142 | + return 0; | |
10143 | } | |
10144 | ||
10145 | void die(const char * str, struct pt_regs * regs, long err) | |
10146 | @@ -542,11 +593,11 @@ void die(const char * str, struct pt_reg | |
10147 | unsigned long flags = oops_begin(); | |
10148 | ||
10149 | if (!user_mode(regs)) | |
10150 | - report_bug(regs->rip, regs); | |
10151 | + report_bug(regs->ip, regs); | |
10152 | ||
10153 | - __die(str, regs, err); | |
10154 | - oops_end(flags); | |
10155 | - do_exit(SIGSEGV); | |
10156 | + if (__die(str, regs, err)) | |
10157 | + regs = NULL; | |
10158 | + oops_end(flags, regs, SIGSEGV); | |
10159 | } | |
10160 | ||
10161 | #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL) | |
10162 | @@ -564,10 +615,10 @@ void __kprobes die_nmi(char *str, struct | |
10163 | crash_kexec(regs); | |
10164 | if (do_panic || panic_on_oops) | |
10165 | panic("Non maskable interrupt"); | |
10166 | - oops_end(flags); | |
10167 | + oops_end(flags, NULL, SIGBUS); | |
10168 | nmi_exit(); | |
10169 | local_irq_enable(); | |
10170 | - do_exit(SIGSEGV); | |
10171 | + do_exit(SIGBUS); | |
10172 | } | |
10173 | #endif | |
10174 | ||
10175 | @@ -592,11 +643,14 @@ static void __kprobes do_trap(int trapnr | |
10176 | tsk->thread.trap_no = trapnr; | |
10177 | ||
10178 | if (show_unhandled_signals && unhandled_signal(tsk, signr) && | |
10179 | - printk_ratelimit()) | |
10180 | + printk_ratelimit()) { | |
10181 | printk(KERN_INFO | |
10182 | - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", | |
10183 | + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", | |
10184 | tsk->comm, tsk->pid, str, | |
10185 | - regs->rip, regs->rsp, error_code); | |
10186 | + regs->ip, regs->sp, error_code); | |
10187 | + print_vma_addr(" in ", regs->ip); | |
10188 | + printk("\n"); | |
10189 | + } | |
10190 | ||
10191 | if (info) | |
10192 | force_sig_info(signr, info, tsk); | |
10193 | @@ -606,19 +660,12 @@ static void __kprobes do_trap(int trapnr | |
10194 | } | |
10195 | ||
10196 | ||
10197 | - /* kernel trap */ | |
10198 | - { | |
10199 | - const struct exception_table_entry *fixup; | |
10200 | - fixup = search_exception_tables(regs->rip); | |
10201 | - if (fixup) | |
10202 | - regs->rip = fixup->fixup; | |
10203 | - else { | |
10204 | - tsk->thread.error_code = error_code; | |
10205 | - tsk->thread.trap_no = trapnr; | |
10206 | - die(str, regs, error_code); | |
10207 | - } | |
10208 | - return; | |
10209 | + if (!fixup_exception(regs)) { | |
10210 | + tsk->thread.error_code = error_code; | |
10211 | + tsk->thread.trap_no = trapnr; | |
10212 | + die(str, regs, error_code); | |
10213 | } | |
10214 | + return; | |
10215 | } | |
10216 | ||
10217 | #define DO_ERROR(trapnr, signr, str, name) \ | |
10218 | @@ -647,10 +694,10 @@ asmlinkage void do_##name(struct pt_regs | |
10219 | do_trap(trapnr, signr, str, regs, error_code, &info); \ | |
10220 | } | |
10221 | ||
10222 | -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) | |
10223 | +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) | |
10224 | DO_ERROR( 4, SIGSEGV, "overflow", overflow) | |
10225 | DO_ERROR( 5, SIGSEGV, "bounds", bounds) | |
10226 | -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) | |
10227 | +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) | |
10228 | DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) | |
10229 | DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | |
10230 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | |
10231 | @@ -698,32 +745,28 @@ asmlinkage void __kprobes do_general_pro | |
10232 | tsk->thread.trap_no = 13; | |
10233 | ||
10234 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | |
10235 | - printk_ratelimit()) | |
10236 | + printk_ratelimit()) { | |
10237 | printk(KERN_INFO | |
10238 | - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", | |
10239 | + "%s[%d] general protection ip:%lx sp:%lx error:%lx", | |
10240 | tsk->comm, tsk->pid, | |
10241 | - regs->rip, regs->rsp, error_code); | |
10242 | + regs->ip, regs->sp, error_code); | |
10243 | + print_vma_addr(" in ", regs->ip); | |
10244 | + printk("\n"); | |
10245 | + } | |
10246 | ||
10247 | force_sig(SIGSEGV, tsk); | |
10248 | return; | |
10249 | } | |
10250 | ||
10251 | - /* kernel gp */ | |
10252 | - { | |
10253 | - const struct exception_table_entry *fixup; | |
10254 | - fixup = search_exception_tables(regs->rip); | |
10255 | - if (fixup) { | |
10256 | - regs->rip = fixup->fixup; | |
10257 | - return; | |
10258 | - } | |
10259 | + if (fixup_exception(regs)) | |
10260 | + return; | |
10261 | ||
10262 | - tsk->thread.error_code = error_code; | |
10263 | - tsk->thread.trap_no = 13; | |
10264 | - if (notify_die(DIE_GPF, "general protection fault", regs, | |
10265 | - error_code, 13, SIGSEGV) == NOTIFY_STOP) | |
10266 | - return; | |
10267 | - die("general protection fault", regs, error_code); | |
10268 | - } | |
10269 | + tsk->thread.error_code = error_code; | |
10270 | + tsk->thread.trap_no = 13; | |
10271 | + if (notify_die(DIE_GPF, "general protection fault", regs, | |
10272 | + error_code, 13, SIGSEGV) == NOTIFY_STOP) | |
10273 | + return; | |
10274 | + die("general protection fault", regs, error_code); | |
10275 | } | |
10276 | ||
10277 | static __kprobes void | |
10278 | @@ -833,15 +876,15 @@ asmlinkage __kprobes struct pt_regs *syn | |
10279 | { | |
10280 | struct pt_regs *regs = eregs; | |
10281 | /* Did already sync */ | |
10282 | - if (eregs == (struct pt_regs *)eregs->rsp) | |
10283 | + if (eregs == (struct pt_regs *)eregs->sp) | |
10284 | ; | |
10285 | /* Exception from user space */ | |
10286 | else if (user_mode(eregs)) | |
10287 | regs = task_pt_regs(current); | |
10288 | /* Exception from kernel and interrupts are enabled. Move to | |
10289 | kernel process stack. */ | |
10290 | - else if (eregs->eflags & X86_EFLAGS_IF) | |
10291 | - regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); | |
10292 | + else if (eregs->flags & X86_EFLAGS_IF) | |
10293 | + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); | |
10294 | if (eregs != regs) | |
10295 | *regs = *eregs; | |
10296 | return regs; | |
10297 | @@ -859,6 +902,12 @@ asmlinkage void __kprobes do_debug(struc | |
10298 | ||
10299 | get_debugreg(condition, 6); | |
10300 | ||
10301 | + /* | |
10302 | + * The processor cleared BTF, so don't mark that we need it set. | |
10303 | + */ | |
10304 | + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); | |
10305 | + tsk->thread.debugctlmsr = 0; | |
10306 | + | |
10307 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | |
10308 | SIGTRAP) == NOTIFY_STOP) | |
10309 | return; | |
10310 | @@ -874,27 +923,14 @@ asmlinkage void __kprobes do_debug(struc | |
10311 | ||
10312 | tsk->thread.debugreg6 = condition; | |
10313 | ||
10314 | - /* Mask out spurious TF errors due to lazy TF clearing */ | |
10315 | + | |
10316 | + /* | |
10317 | + * Single-stepping through TF: make sure we ignore any events in | |
10318 | + * kernel space (but re-enable TF when returning to user mode). | |
10319 | + */ | |
10320 | if (condition & DR_STEP) { | |
10321 | - /* | |
10322 | - * The TF error should be masked out only if the current | |
10323 | - * process is not traced and if the TRAP flag has been set | |
10324 | - * previously by a tracing process (condition detected by | |
10325 | - * the PT_DTRACE flag); remember that the i386 TRAP flag | |
10326 | - * can be modified by the process itself in user mode, | |
10327 | - * allowing programs to debug themselves without the ptrace() | |
10328 | - * interface. | |
10329 | - */ | |
10330 | if (!user_mode(regs)) | |
10331 | goto clear_TF_reenable; | |
10332 | - /* | |
10333 | - * Was the TF flag set by a debugger? If so, clear it now, | |
10334 | - * so that register information is correct. | |
10335 | - */ | |
10336 | - if (tsk->ptrace & PT_DTRACE) { | |
10337 | - regs->eflags &= ~TF_MASK; | |
10338 | - tsk->ptrace &= ~PT_DTRACE; | |
10339 | - } | |
10340 | } | |
10341 | ||
10342 | /* Ok, finally something we can handle */ | |
10343 | @@ -903,7 +939,7 @@ asmlinkage void __kprobes do_debug(struc | |
10344 | info.si_signo = SIGTRAP; | |
10345 | info.si_errno = 0; | |
10346 | info.si_code = TRAP_BRKPT; | |
10347 | - info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; | |
10348 | + info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; | |
10349 | force_sig_info(SIGTRAP, &info, tsk); | |
10350 | ||
10351 | clear_dr7: | |
10352 | @@ -913,18 +949,15 @@ clear_dr7: | |
10353 | ||
10354 | clear_TF_reenable: | |
10355 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | |
10356 | - regs->eflags &= ~TF_MASK; | |
10357 | + regs->flags &= ~X86_EFLAGS_TF; | |
10358 | preempt_conditional_cli(regs); | |
10359 | } | |
10360 | ||
10361 | static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) | |
10362 | { | |
10363 | - const struct exception_table_entry *fixup; | |
10364 | - fixup = search_exception_tables(regs->rip); | |
10365 | - if (fixup) { | |
10366 | - regs->rip = fixup->fixup; | |
10367 | + if (fixup_exception(regs)) | |
10368 | return 1; | |
10369 | - } | |
10370 | + | |
10371 | notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); | |
10372 | /* Illegal floating point operation in the kernel */ | |
10373 | current->thread.trap_no = trapnr; | |
10374 | @@ -939,7 +972,7 @@ static int kernel_math_error(struct pt_r | |
10375 | */ | |
10376 | asmlinkage void do_coprocessor_error(struct pt_regs *regs) | |
10377 | { | |
10378 | - void __user *rip = (void __user *)(regs->rip); | |
10379 | + void __user *ip = (void __user *)(regs->ip); | |
10380 | struct task_struct * task; | |
10381 | siginfo_t info; | |
10382 | unsigned short cwd, swd; | |
10383 | @@ -959,7 +992,7 @@ asmlinkage void do_coprocessor_error(str | |
10384 | info.si_signo = SIGFPE; | |
10385 | info.si_errno = 0; | |
10386 | info.si_code = __SI_FAULT; | |
10387 | - info.si_addr = rip; | |
10388 | + info.si_addr = ip; | |
10389 | /* | |
10390 | * (~cwd & swd) will mask out exceptions that are not set to unmasked | |
10391 | * status. 0x3f is the exception bits in these regs, 0x200 is the | |
10392 | @@ -1008,7 +1041,7 @@ asmlinkage void bad_intr(void) | |
10393 | ||
10394 | asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) | |
10395 | { | |
10396 | - void __user *rip = (void __user *)(regs->rip); | |
10397 | + void __user *ip = (void __user *)(regs->ip); | |
10398 | struct task_struct * task; | |
10399 | siginfo_t info; | |
10400 | unsigned short mxcsr; | |
10401 | @@ -1028,7 +1061,7 @@ asmlinkage void do_simd_coprocessor_erro | |
10402 | info.si_signo = SIGFPE; | |
10403 | info.si_errno = 0; | |
10404 | info.si_code = __SI_FAULT; | |
10405 | - info.si_addr = rip; | |
10406 | + info.si_addr = ip; | |
10407 | /* | |
10408 | * The SIMD FPU exceptions are handled a little differently, as there | |
10409 | * is only a single status/control register. Thus, to determine which | |
10410 | @@ -1092,13 +1125,14 @@ asmlinkage void math_state_restore(void) | |
10411 | task_thread_info(me)->status |= TS_USEDFPU; | |
10412 | me->fpu_counter++; | |
10413 | } | |
10414 | +EXPORT_SYMBOL_GPL(math_state_restore); | |
10415 | ||
10416 | ||
10417 | /* | |
10418 | * NB. All these are "interrupt gates" (i.e. events_mask is set) because we | |
10419 | * specify <dpl>|4 in the second field. | |
10420 | */ | |
10421 | -static trap_info_t __cpuinitdata trap_table[] = { | |
10422 | +static const trap_info_t __cpuinitconst trap_table[] = { | |
10423 | { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error }, | |
10424 | { 1, 0|4, __KERNEL_CS, (unsigned long)debug }, | |
10425 | { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }, | |
10426 | @@ -1169,3 +1203,14 @@ static int __init kstack_setup(char *s) | |
10427 | return 0; | |
10428 | } | |
10429 | early_param("kstack", kstack_setup); | |
10430 | + | |
10431 | + | |
10432 | +static int __init code_bytes_setup(char *s) | |
10433 | +{ | |
10434 | + code_bytes = simple_strtoul(s, NULL, 0); | |
10435 | + if (code_bytes > 8192) | |
10436 | + code_bytes = 8192; | |
10437 | + | |
10438 | + return 1; | |
10439 | +} | |
10440 | +__setup("code_bytes=", code_bytes_setup); | |
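
Usage note on the code_bytes= early parameter added at the end of this file: it bounds the "Code:" dump that show_registers() prints around the faulting instruction (default 64 bytes, clamped to 8192), with code_bytes * 43 / 64 bytes shown before the instruction pointer. A standalone sketch of that window arithmetic:

    #include <stdio.h>

    int main(void)
    {
        unsigned int code_bytes = 64;                 /* boot default */
        unsigned int prologue = code_bytes * 43 / 64; /* before RIP   */

        /* The dump starts at ip - prologue and spans code_bytes, so
         * the byte at RIP (printed as <xx>) sits 43/64ths in. */
        printf("%u bytes before RIP, the RIP byte, %u bytes after\n",
               prologue, code_bytes - prologue - 1);
        return 0;
    }
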
10441 | --- a/arch/x86/kernel/vsyscall_64-xen.c | |
10442 | +++ b/arch/x86/kernel/vsyscall_64-xen.c | |
10443 | @@ -43,12 +43,7 @@ | |
10444 | #include <asm/vgtod.h> | |
10445 | ||
10446 | #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) | |
10447 | -#define __syscall_clobber "r11","rcx","memory" | |
10448 | -#define __pa_vsymbol(x) \ | |
10449 | - ({unsigned long v; \ | |
10450 | - extern char __vsyscall_0; \ | |
10451 | - asm("" : "=r" (v) : "0" (x)); \ | |
10452 | - ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); }) | |
10453 | +#define __syscall_clobber "r11","cx","memory" | |
10454 | ||
10455 | /* | |
10456 | * vsyscall_gtod_data contains data that is : | |
10457 | @@ -102,7 +97,7 @@ static __always_inline void do_get_tz(st | |
10458 | static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) | |
10459 | { | |
10460 | int ret; | |
10461 | - asm volatile("vsysc2: syscall" | |
10462 | + asm volatile("syscall" | |
10463 | : "=a" (ret) | |
10464 | : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) | |
10465 | : __syscall_clobber ); | |
10466 | @@ -112,7 +107,7 @@ static __always_inline int gettimeofday( | |
10467 | static __always_inline long time_syscall(long *t) | |
10468 | { | |
10469 | long secs; | |
10470 | - asm volatile("vsysc1: syscall" | |
10471 | + asm volatile("syscall" | |
10472 | : "=a" (secs) | |
10473 | : "0" (__NR_time),"D" (t) : __syscall_clobber); | |
10474 | return secs; | |
10475 | @@ -190,7 +185,7 @@ time_t __vsyscall(1) vtime(time_t *t) | |
10476 | long __vsyscall(2) | |
10477 | vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) | |
10478 | { | |
10479 | - unsigned int dummy, p; | |
10480 | + unsigned int p; | |
10481 | unsigned long j = 0; | |
10482 | ||
10483 | /* Fast cache - only recompute value once per jiffies and avoid | |
10484 | @@ -205,7 +200,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s | |
10485 | p = tcache->blob[1]; | |
10486 | } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { | |
10487 | /* Load per CPU data from RDTSCP */ | |
10488 | - rdtscp(dummy, dummy, p); | |
10489 | + native_read_tscp(&p); | |
10490 | } else { | |
10491 | /* Load per CPU data from GDT */ | |
10492 | asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); | |
10493 | @@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void) | |
10494 | ||
10495 | #ifdef CONFIG_SYSCTL | |
10496 | ||
10497 | -#define SYSCALL 0x050f | |
10498 | -#define NOP2 0x9090 | |
10499 | - | |
10500 | -/* | |
10501 | - * NOP out syscall in vsyscall page when not needed. | |
10502 | - */ | |
10503 | -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, | |
10504 | - void __user *buffer, size_t *lenp, loff_t *ppos) | |
10505 | +static int | |
10506 | +vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, | |
10507 | + void __user *buffer, size_t *lenp, loff_t *ppos) | |
10508 | { | |
10509 | - extern u16 vsysc1, vsysc2; | |
10510 | - u16 __iomem *map1; | |
10511 | - u16 __iomem *map2; | |
10512 | - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); | |
10513 | - if (!write) | |
10514 | - return ret; | |
10515 | - /* gcc has some trouble with __va(__pa()), so just do it this | |
10516 | - way. */ | |
10517 | - map1 = ioremap(__pa_vsymbol(&vsysc1), 2); | |
10518 | - if (!map1) | |
10519 | - return -ENOMEM; | |
10520 | - map2 = ioremap(__pa_vsymbol(&vsysc2), 2); | |
10521 | - if (!map2) { | |
10522 | - ret = -ENOMEM; | |
10523 | - goto out; | |
10524 | - } | |
10525 | - if (!vsyscall_gtod_data.sysctl_enabled) { | |
10526 | - writew(SYSCALL, map1); | |
10527 | - writew(SYSCALL, map2); | |
10528 | - } else { | |
10529 | - writew(NOP2, map1); | |
10530 | - writew(NOP2, map2); | |
10531 | - } | |
10532 | - iounmap(map2); | |
10533 | -out: | |
10534 | - iounmap(map1); | |
10535 | - return ret; | |
10536 | + return proc_dointvec(ctl, write, filp, buffer, lenp, ppos); | |
10537 | } | |
10538 | ||
10539 | static ctl_table kernel_table2[] = { | |
10540 | @@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] = | |
10541 | .child = kernel_table2 }, | |
10542 | {} | |
10543 | }; | |
10544 | - | |
10545 | #endif | |
10546 | ||
10547 | /* Assume __initcall executes before all user space. Hopefully kmod | |
10548 | @@ -301,7 +264,7 @@ static void __cpuinit vsyscall_set_cpu(i | |
10549 | d |= cpu; | |
10550 | d |= (node & 0xf) << 12; | |
10551 | d |= (node >> 4) << 48; | |
10552 | - if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu) | |
10553 | + if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu) | |
10554 | + GDT_ENTRY_PER_CPU), | |
10555 | d)) | |
10556 | BUG(); | |
10557 | @@ -322,7 +285,7 @@ cpu_vsyscall_notifier(struct notifier_bl | |
10558 | return NOTIFY_DONE; | |
10559 | } | |
10560 | ||
10561 | -static void __init map_vsyscall(void) | |
10562 | +void __init map_vsyscall(void) | |
10563 | { | |
10564 | extern char __vsyscall_0; | |
10565 | unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); | |
10566 | @@ -338,7 +301,6 @@ static int __init vsyscall_init(void) | |
10567 | BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); | |
10568 | BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); | |
10569 | BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); | |
10570 | - map_vsyscall(); | |
10571 | #ifdef CONFIG_XEN | |
10572 | vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofday() */ | |
10573 | if (boot_cpu_has(X86_FEATURE_RDTSCP)) | |
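A note for readers (illustration only, not part of the patch): whichever path vgetcpu() takes above, the tcache blob, native_read_tscp(), or the lsl on __PER_CPU_SEG, it ends up with one packed word p. A minimal sketch of the decode, assuming the upstream packing written by vsyscall_set_cpu() (CPU number in bits 0-11, node in the bits above); the helper name is ours:

	/* Sketch: decode the packed per-CPU word read by vgetcpu(). */
	static inline void vgetcpu_decode(unsigned int p,
					  unsigned int *cpu, unsigned int *node)
	{
		if (cpu)
			*cpu = p & 0xfff;	/* low 12 bits: CPU number */
		if (node)
			*node = p >> 12;	/* upper bits: NUMA node */
	}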
10574 | --- a/arch/x86/kernel/xen_entry_64.S | |
10575 | +++ /dev/null | |
10576 | @@ -1,36 +0,0 @@ | |
10577 | -/* | |
10578 | - * Copied from arch/xen/i386/kernel/entry.S | |
10579 | - */ | |
10580 | -/* Offsets into shared_info_t. */ | |
10581 | -#define evtchn_upcall_pending /* 0 */ | |
10582 | -#define evtchn_upcall_mask 1 | |
10583 | - | |
10584 | -#define sizeof_vcpu_shift 6 | |
10585 | - | |
10586 | -#ifdef CONFIG_SMP | |
10587 | -//#define preempt_disable(reg) incl threadinfo_preempt_count(reg) | |
10588 | -//#define preempt_enable(reg) decl threadinfo_preempt_count(reg) | |
10589 | -#define preempt_disable(reg) | |
10590 | -#define preempt_enable(reg) | |
10591 | -#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \ | |
10592 | - movq %gs:pda_cpunumber,reg ; \ | |
10593 | - shl $32, reg ; \ | |
10594 | - shr $32-sizeof_vcpu_shift,reg ; \ | |
10595 | - addq HYPERVISOR_shared_info,reg | |
10596 | -#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \ | |
10597 | -#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff | |
10598 | -#else | |
10599 | -#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg | |
10600 | -#define XEN_PUT_VCPU_INFO(reg) | |
10601 | -#define XEN_PUT_VCPU_INFO_fixup | |
10602 | -#endif | |
10603 | - | |
10604 | -#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg) | |
10605 | -#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg) | |
10606 | -#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ | |
10607 | - XEN_LOCKED_BLOCK_EVENTS(reg) ; \ | |
10608 | - XEN_PUT_VCPU_INFO(reg) | |
10609 | -#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ | |
10610 | - XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \ | |
10611 | - XEN_PUT_VCPU_INFO(reg) | |
10612 | -#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg) | |
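The file deleted above implemented Xen event masking in assembler. As a hedged C sketch of what the XEN_BLOCK_EVENTS/XEN_UNBLOCK_EVENTS macros did (the vcpu_info field names come from the public Xen interface headers; the helper names here are illustrative only):

	/* Sketch: C equivalent of the removed assembler macros. Masking
	 * evtchn_upcall_mask in the shared-info vcpu_info area is the
	 * paravirtual counterpart of cli/sti. */
	static inline void xen_block_events(struct vcpu_info *v)
	{
		v->evtchn_upcall_mask = 1;	/* XEN_LOCKED_BLOCK_EVENTS */
		barrier();
	}

	static inline void xen_unblock_events(struct vcpu_info *v)
	{
		v->evtchn_upcall_mask = 0;	/* XEN_LOCKED_UNBLOCK_EVENTS */
		barrier();
		/* A complete implementation must recheck
		 * v->evtchn_upcall_pending here and force an upcall
		 * if an event arrived while events were blocked. */
	}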
10613 | --- a/arch/x86/mach-xen/setup.c | |
10614 | +++ b/arch/x86/mach-xen/setup.c | |
10615 | @@ -161,15 +161,12 @@ void __init machine_specific_arch_setup( | |
10616 | ||
10617 | /* Do an early initialization of the fixmap area */ | |
10618 | { | |
10619 | - extern pte_t swapper_pg_pmd[PTRS_PER_PTE]; | |
10620 | + extern pte_t swapper_pg_fixmap[PTRS_PER_PTE]; | |
10621 | unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); | |
10622 | - pgd_t *pgd = (pgd_t *)xen_start_info->pt_base; | |
10623 | - pud_t *pud = pud_offset(pgd + pgd_index(addr), addr); | |
10624 | + pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr); | |
10625 | pmd_t *pmd = pmd_offset(pud, addr); | |
10626 | ||
10627 | - swapper_pg_dir = pgd; | |
10628 | - init_mm.pgd = pgd; | |
10629 | - make_lowmem_page_readonly(swapper_pg_pmd, XENFEAT_writable_page_tables); | |
10630 | - set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_pmd) | _PAGE_TABLE)); | |
10631 | + make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables); | |
10632 | + set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE)); | |
10633 | } | |
10634 | } | |
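The hunk above switches the early fixmap setup to walk swapper_pg_dir directly and to install swapper_pg_fixmap; note the page is made read-only before being hooked in, since Xen only accepts read-only pages as live page tables. For reference (the stock i386 fixmap layout, not something this patch introduces), fixmap slots grow downwards from FIXADDR_TOP, one page per index, which is why a single PTE page can cover FIX_EARLYCON_MEM_BASE this early in boot:

	/* Reference: the standard fixmap index-to-address arithmetic. */
	#define __fix_to_virt(idx)	(FIXADDR_TOP - ((idx) << PAGE_SHIFT))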
10635 | --- a/arch/x86/mm/fault_32-xen.c | |
10636 | +++ /dev/null | |
10637 | @@ -1,757 +0,0 @@ | |
10638 | -/* | |
10639 | - * linux/arch/i386/mm/fault.c | |
10640 | - * | |
10641 | - * Copyright (C) 1995 Linus Torvalds | |
10642 | - */ | |
10643 | - | |
10644 | -#include <linux/signal.h> | |
10645 | -#include <linux/sched.h> | |
10646 | -#include <linux/kernel.h> | |
10647 | -#include <linux/errno.h> | |
10648 | -#include <linux/string.h> | |
10649 | -#include <linux/types.h> | |
10650 | -#include <linux/ptrace.h> | |
10651 | -#include <linux/mman.h> | |
10652 | -#include <linux/mm.h> | |
10653 | -#include <linux/smp.h> | |
10654 | -#include <linux/interrupt.h> | |
10655 | -#include <linux/init.h> | |
10656 | -#include <linux/tty.h> | |
10657 | -#include <linux/vt_kern.h> /* For unblank_screen() */ | |
10658 | -#include <linux/highmem.h> | |
10659 | -#include <linux/bootmem.h> /* for max_low_pfn */ | |
10660 | -#include <linux/vmalloc.h> | |
10661 | -#include <linux/module.h> | |
10662 | -#include <linux/kprobes.h> | |
10663 | -#include <linux/uaccess.h> | |
10664 | -#include <linux/kdebug.h> | |
10665 | -#include <linux/kprobes.h> | |
10666 | - | |
10667 | -#include <asm/system.h> | |
10668 | -#include <asm/desc.h> | |
10669 | -#include <asm/segment.h> | |
10670 | - | |
10671 | -extern void die(const char *,struct pt_regs *,long); | |
10672 | - | |
10673 | -#ifdef CONFIG_KPROBES | |
10674 | -static inline int notify_page_fault(struct pt_regs *regs) | |
10675 | -{ | |
10676 | - int ret = 0; | |
10677 | - | |
10678 | - /* kprobe_running() needs smp_processor_id() */ | |
10679 | - if (!user_mode_vm(regs)) { | |
10680 | - preempt_disable(); | |
10681 | - if (kprobe_running() && kprobe_fault_handler(regs, 14)) | |
10682 | - ret = 1; | |
10683 | - preempt_enable(); | |
10684 | - } | |
10685 | - | |
10686 | - return ret; | |
10687 | -} | |
10688 | -#else | |
10689 | -static inline int notify_page_fault(struct pt_regs *regs) | |
10690 | -{ | |
10691 | - return 0; | |
10692 | -} | |
10693 | -#endif | |
10694 | - | |
10695 | -/* | |
10696 | - * Return EIP plus the CS segment base. The segment limit is also | |
10697 | - * adjusted, clamped to the kernel/user address space (whichever is | |
10698 | - * appropriate), and returned in *eip_limit. | |
10699 | - * | |
10700 | - * The segment is checked, because it might have been changed by another | |
10701 | - * task between the original faulting instruction and here. | |
10702 | - * | |
10703 | - * If CS is no longer a valid code segment, or if EIP is beyond the | |
10704 | - * limit, or if it is a kernel address when CS is not a kernel segment, | |
10705 | - * then the returned value will be greater than *eip_limit. | |
10706 | - * | |
10707 | - * This is slow, but is very rarely executed. | |
10708 | - */ | |
10709 | -static inline unsigned long get_segment_eip(struct pt_regs *regs, | |
10710 | - unsigned long *eip_limit) | |
10711 | -{ | |
10712 | - unsigned long eip = regs->eip; | |
10713 | - unsigned seg = regs->xcs & 0xffff; | |
10714 | - u32 seg_ar, seg_limit, base, *desc; | |
10715 | - | |
10716 | - /* Unlikely, but must come before segment checks. */ | |
10717 | - if (unlikely(regs->eflags & VM_MASK)) { | |
10718 | - base = seg << 4; | |
10719 | - *eip_limit = base + 0xffff; | |
10720 | - return base + (eip & 0xffff); | |
10721 | - } | |
10722 | - | |
10723 | - /* The standard kernel/user address space limit. */ | |
10724 | - *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg; | |
10725 | - | |
10726 | - /* By far the most common cases. */ | |
10727 | - if (likely(SEGMENT_IS_FLAT_CODE(seg))) | |
10728 | - return eip; | |
10729 | - | |
10730 | - /* Check the segment exists, is within the current LDT/GDT size, | |
10731 | - that kernel/user (ring 0..3) has the appropriate privilege, | |
10732 | - that it's a code segment, and get the limit. */ | |
10733 | - __asm__ ("larl %3,%0; lsll %3,%1" | |
10734 | - : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); | |
10735 | - if ((~seg_ar & 0x9800) || eip > seg_limit) { | |
10736 | - *eip_limit = 0; | |
10737 | - return 1; /* So that returned eip > *eip_limit. */ | |
10738 | - } | |
10739 | - | |
10740 | - /* Get the GDT/LDT descriptor base. | |
10741 | - When you look for races in this code remember that | |
10742 | - LDT and other horrors are only used in user space. */ | |
10743 | - if (seg & (1<<2)) { | |
10744 | - /* Must lock the LDT while reading it. */ | |
10745 | - mutex_lock(¤t->mm->context.lock); | |
10746 | - desc = current->mm->context.ldt; | |
10747 | - desc = (void *)desc + (seg & ~7); | |
10748 | - } else { | |
10749 | - /* Must disable preemption while reading the GDT. */ | |
10750 | - desc = (u32 *)get_cpu_gdt_table(get_cpu()); | |
10751 | - desc = (void *)desc + (seg & ~7); | |
10752 | - } | |
10753 | - | |
10754 | - /* Decode the code segment base from the descriptor */ | |
10755 | - base = get_desc_base((unsigned long *)desc); | |
10756 | - | |
10757 | - if (seg & (1<<2)) { | |
10758 | - mutex_unlock(¤t->mm->context.lock); | |
10759 | - } else | |
10760 | - put_cpu(); | |
10761 | - | |
10762 | - /* Adjust EIP and segment limit, and clamp at the kernel limit. | |
10763 | - It's legitimate for segments to wrap at 0xffffffff. */ | |
10764 | - seg_limit += base; | |
10765 | - if (seg_limit < *eip_limit && seg_limit >= base) | |
10766 | - *eip_limit = seg_limit; | |
10767 | - return eip + base; | |
10768 | -} | |
10769 | - | |
10770 | -/* | |
10771 | - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. | |
10772 | - * Check that here and ignore it. | |
10773 | - */ | |
10774 | -static int __is_prefetch(struct pt_regs *regs, unsigned long addr) | |
10775 | -{ | |
10776 | - unsigned long limit; | |
10777 | - unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit); | |
10778 | - int scan_more = 1; | |
10779 | - int prefetch = 0; | |
10780 | - int i; | |
10781 | - | |
10782 | - for (i = 0; scan_more && i < 15; i++) { | |
10783 | - unsigned char opcode; | |
10784 | - unsigned char instr_hi; | |
10785 | - unsigned char instr_lo; | |
10786 | - | |
10787 | - if (instr > (unsigned char *)limit) | |
10788 | - break; | |
10789 | - if (probe_kernel_address(instr, opcode)) | |
10790 | - break; | |
10791 | - | |
10792 | - instr_hi = opcode & 0xf0; | |
10793 | - instr_lo = opcode & 0x0f; | |
10794 | - instr++; | |
10795 | - | |
10796 | - switch (instr_hi) { | |
10797 | - case 0x20: | |
10798 | - case 0x30: | |
10799 | - /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ | |
10800 | - scan_more = ((instr_lo & 7) == 0x6); | |
10801 | - break; | |
10802 | - | |
10803 | - case 0x60: | |
10804 | - /* 0x64 thru 0x67 are valid prefixes in all modes. */ | |
10805 | - scan_more = (instr_lo & 0xC) == 0x4; | |
10806 | - break; | |
10807 | - case 0xF0: | |
10808 | - /* 0xF0, 0xF2, and 0xF3 are valid prefixes */ | |
10809 | - scan_more = !instr_lo || (instr_lo>>1) == 1; | |
10810 | - break; | |
10811 | - case 0x00: | |
10812 | - /* Prefetch instruction is 0x0F0D or 0x0F18 */ | |
10813 | - scan_more = 0; | |
10814 | - if (instr > (unsigned char *)limit) | |
10815 | - break; | |
10816 | - if (probe_kernel_address(instr, opcode)) | |
10817 | - break; | |
10818 | - prefetch = (instr_lo == 0xF) && | |
10819 | - (opcode == 0x0D || opcode == 0x18); | |
10820 | - break; | |
10821 | - default: | |
10822 | - scan_more = 0; | |
10823 | - break; | |
10824 | - } | |
10825 | - } | |
10826 | - return prefetch; | |
10827 | -} | |
10828 | - | |
10829 | -static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, | |
10830 | - unsigned long error_code) | |
10831 | -{ | |
10832 | - if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | |
10833 | - boot_cpu_data.x86 >= 6)) { | |
10834 | - /* Catch an obscure case of prefetch inside an NX page. */ | |
10835 | - if (nx_enabled && (error_code & 16)) | |
10836 | - return 0; | |
10837 | - return __is_prefetch(regs, addr); | |
10838 | - } | |
10839 | - return 0; | |
10840 | -} | |
10841 | - | |
10842 | -static noinline void force_sig_info_fault(int si_signo, int si_code, | |
10843 | - unsigned long address, struct task_struct *tsk) | |
10844 | -{ | |
10845 | - siginfo_t info; | |
10846 | - | |
10847 | - info.si_signo = si_signo; | |
10848 | - info.si_errno = 0; | |
10849 | - info.si_code = si_code; | |
10850 | - info.si_addr = (void __user *)address; | |
10851 | - force_sig_info(si_signo, &info, tsk); | |
10852 | -} | |
10853 | - | |
10854 | -fastcall void do_invalid_op(struct pt_regs *, unsigned long); | |
10855 | - | |
10856 | -#ifdef CONFIG_X86_PAE | |
10857 | -static void dump_fault_path(unsigned long address) | |
10858 | -{ | |
10859 | - unsigned long *p, page; | |
10860 | - unsigned long mfn; | |
10861 | - | |
10862 | - page = read_cr3(); | |
10863 | - p = (unsigned long *)__va(page); | |
10864 | - p += (address >> 30) * 2; | |
10865 | - printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]); | |
10866 | - if (p[0] & _PAGE_PRESENT) { | |
10867 | - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20); | |
10868 | - page = mfn_to_pfn(mfn) << PAGE_SHIFT; | |
10869 | - p = (unsigned long *)__va(page); | |
10870 | - address &= 0x3fffffff; | |
10871 | - p += (address >> 21) * 2; | |
10872 | - printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n", | |
10873 | - page, p[1], p[0]); | |
10874 | - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20); | |
10875 | -#ifdef CONFIG_HIGHPTE | |
10876 | - if (mfn_to_pfn(mfn) >= highstart_pfn) | |
10877 | - return; | |
10878 | -#endif | |
10879 | - if ((p[0] & _PAGE_PRESENT) && !(p[0] & _PAGE_PSE)) { | |
10880 | - page = mfn_to_pfn(mfn) << PAGE_SHIFT; | |
10881 | - p = (unsigned long *) __va(page); | |
10882 | - address &= 0x001fffff; | |
10883 | - p += (address >> 12) * 2; | |
10884 | - printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n", | |
10885 | - page, p[1], p[0]); | |
10886 | - } | |
10887 | - } | |
10888 | -} | |
10889 | -#else | |
10890 | -static void dump_fault_path(unsigned long address) | |
10891 | -{ | |
10892 | - unsigned long page; | |
10893 | - | |
10894 | - page = read_cr3(); | |
10895 | - page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT]; | |
10896 | - printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, | |
10897 | - machine_to_phys(page)); | |
10898 | - /* | |
10899 | - * We must not directly access the pte in the highpte | |
10900 | - * case if the page table is located in highmem. | |
10901 | - * And lets rather not kmap-atomic the pte, just in case | |
10902 | - * it's allocated already. | |
10903 | - */ | |
10904 | - if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn | |
10905 | - && (page & _PAGE_PRESENT) | |
10906 | - && !(page & _PAGE_PSE)) { | |
10907 | - page = machine_to_phys(page & PAGE_MASK); | |
10908 | - page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT) | |
10909 | - & (PTRS_PER_PTE - 1)]; | |
10910 | - printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page, | |
10911 | - machine_to_phys(page)); | |
10912 | - } | |
10913 | -} | |
10914 | -#endif | |
10915 | - | |
10916 | -static int spurious_fault(struct pt_regs *regs, | |
10917 | - unsigned long address, | |
10918 | - unsigned long error_code) | |
10919 | -{ | |
10920 | - pgd_t *pgd; | |
10921 | - pud_t *pud; | |
10922 | - pmd_t *pmd; | |
10923 | - pte_t *pte; | |
10924 | - | |
10925 | - /* Reserved-bit violation or user access to kernel space? */ | |
10926 | - if (error_code & 0x0c) | |
10927 | - return 0; | |
10928 | - | |
10929 | - pgd = init_mm.pgd + pgd_index(address); | |
10930 | - if (!pgd_present(*pgd)) | |
10931 | - return 0; | |
10932 | - | |
10933 | - pud = pud_offset(pgd, address); | |
10934 | - if (!pud_present(*pud)) | |
10935 | - return 0; | |
10936 | - | |
10937 | - pmd = pmd_offset(pud, address); | |
10938 | - if (!pmd_present(*pmd)) | |
10939 | - return 0; | |
10940 | - | |
10941 | - pte = pte_offset_kernel(pmd, address); | |
10942 | - if (!pte_present(*pte)) | |
10943 | - return 0; | |
10944 | - if ((error_code & 0x02) && !pte_write(*pte)) | |
10945 | - return 0; | |
10946 | -#ifdef CONFIG_X86_PAE | |
10947 | - if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX)) | |
10948 | - return 0; | |
10949 | -#endif | |
10950 | - | |
10951 | - return 1; | |
10952 | -} | |
10953 | - | |
10954 | -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) | |
10955 | -{ | |
10956 | - unsigned index = pgd_index(address); | |
10957 | - pgd_t *pgd_k; | |
10958 | - pud_t *pud, *pud_k; | |
10959 | - pmd_t *pmd, *pmd_k; | |
10960 | - | |
10961 | - pgd += index; | |
10962 | - pgd_k = init_mm.pgd + index; | |
10963 | - | |
10964 | - if (!pgd_present(*pgd_k)) | |
10965 | - return NULL; | |
10966 | - | |
10967 | - /* | |
10968 | - * set_pgd(pgd, *pgd_k); here would be useless on PAE | |
10969 | - * and redundant with the set_pmd() on non-PAE. As would | |
10970 | - * set_pud. | |
10971 | - */ | |
10972 | - | |
10973 | - pud = pud_offset(pgd, address); | |
10974 | - pud_k = pud_offset(pgd_k, address); | |
10975 | - if (!pud_present(*pud_k)) | |
10976 | - return NULL; | |
10977 | - | |
10978 | - pmd = pmd_offset(pud, address); | |
10979 | - pmd_k = pmd_offset(pud_k, address); | |
10980 | - if (!pmd_present(*pmd_k)) | |
10981 | - return NULL; | |
10982 | - if (!pmd_present(*pmd)) { | |
10983 | - bool lazy = x86_read_percpu(xen_lazy_mmu); | |
10984 | - | |
10985 | - x86_write_percpu(xen_lazy_mmu, false); | |
10986 | -#if CONFIG_XEN_COMPAT > 0x030002 | |
10987 | - set_pmd(pmd, *pmd_k); | |
10988 | -#else | |
10989 | - /* | |
10990 | - * When running on older Xen we must launder *pmd_k through | |
10991 | - * pmd_val() to ensure that _PAGE_PRESENT is correctly set. | |
10992 | - */ | |
10993 | - set_pmd(pmd, __pmd(pmd_val(*pmd_k))); | |
10994 | -#endif | |
10995 | - x86_write_percpu(xen_lazy_mmu, lazy); | |
10996 | - } else | |
10997 | - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); | |
10998 | - return pmd_k; | |
10999 | -} | |
11000 | - | |
11001 | -/* | |
11002 | - * Handle a fault on the vmalloc or module mapping area | |
11003 | - * | |
11004 | - * This assumes no large pages in there. | |
11005 | - */ | |
11006 | -static inline int vmalloc_fault(unsigned long address) | |
11007 | -{ | |
11008 | - unsigned long pgd_paddr; | |
11009 | - pmd_t *pmd_k; | |
11010 | - pte_t *pte_k; | |
11011 | - /* | |
11012 | - * Synchronize this task's top level page-table | |
11013 | - * with the 'reference' page table. | |
11014 | - * | |
11015 | - * Do _not_ use "current" here. We might be inside | |
11016 | - * an interrupt in the middle of a task switch.. | |
11017 | - */ | |
11018 | - pgd_paddr = read_cr3(); | |
11019 | - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); | |
11020 | - if (!pmd_k) | |
11021 | - return -1; | |
11022 | - pte_k = pte_offset_kernel(pmd_k, address); | |
11023 | - if (!pte_present(*pte_k)) | |
11024 | - return -1; | |
11025 | - return 0; | |
11026 | -} | |
11027 | - | |
11028 | -int show_unhandled_signals = 1; | |
11029 | - | |
11030 | -/* | |
11031 | - * This routine handles page faults. It determines the address, | |
11032 | - * and the problem, and then passes it off to one of the appropriate | |
11033 | - * routines. | |
11034 | - * | |
11035 | - * error_code: | |
11036 | - * bit 0 == 0 means no page found, 1 means protection fault | |
11037 | - * bit 1 == 0 means read, 1 means write | |
11038 | - * bit 2 == 0 means kernel, 1 means user-mode | |
11039 | - * bit 3 == 1 means use of reserved bit detected | |
11040 | - * bit 4 == 1 means fault was an instruction fetch | |
11041 | - */ | |
11042 | -fastcall void __kprobes do_page_fault(struct pt_regs *regs, | |
11043 | - unsigned long error_code) | |
11044 | -{ | |
11045 | - struct task_struct *tsk; | |
11046 | - struct mm_struct *mm; | |
11047 | - struct vm_area_struct * vma; | |
11048 | - unsigned long address; | |
11049 | - int write, si_code; | |
11050 | - int fault; | |
11051 | - | |
11052 | - /* | |
11053 | - * We can fault from pretty much anywhere, with unknown IRQ state. | |
11054 | - */ | |
11055 | - trace_hardirqs_fixup(); | |
11056 | - | |
11057 | - /* get the address */ | |
11058 | - address = read_cr2(); | |
11059 | - | |
11060 | - /* Set the "privileged fault" bit to something sane. */ | |
11061 | - error_code &= ~4; | |
11062 | - error_code |= (regs->xcs & 2) << 1; | |
11063 | - if (regs->eflags & X86_EFLAGS_VM) | |
11064 | - error_code |= 4; | |
11065 | - | |
11066 | - tsk = current; | |
11067 | - | |
11068 | - si_code = SEGV_MAPERR; | |
11069 | - | |
11070 | - /* | |
11071 | - * We fault-in kernel-space virtual memory on-demand. The | |
11072 | - * 'reference' page table is init_mm.pgd. | |
11073 | - * | |
11074 | - * NOTE! We MUST NOT take any locks for this case. We may | |
11075 | - * be in an interrupt or a critical region, and should | |
11076 | - * only copy the information from the master page table, | |
11077 | - * nothing more. | |
11078 | - * | |
11079 | - * This verifies that the fault happens in kernel space | |
11080 | - * (error_code & 4) == 0, and that the fault was not a | |
11081 | - * protection error (error_code & 9) == 0. | |
11082 | - */ | |
11083 | - if (unlikely(address >= TASK_SIZE)) { | |
11084 | -#ifdef CONFIG_XEN | |
11085 | - /* Faults in hypervisor area can never be patched up. */ | |
11086 | - if (address >= hypervisor_virt_start) | |
11087 | - goto bad_area_nosemaphore; | |
11088 | -#endif | |
11089 | - if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) | |
11090 | - return; | |
11091 | - /* Can take a spurious fault if mapping changes R/O -> R/W. */ | |
11092 | - if (spurious_fault(regs, address, error_code)) | |
11093 | - return; | |
11094 | - if (notify_page_fault(regs)) | |
11095 | - return; | |
11096 | - /* | |
11097 | - * Don't take the mm semaphore here. If we fixup a prefetch | |
11098 | - * fault we could otherwise deadlock. | |
11099 | - */ | |
11100 | - goto bad_area_nosemaphore; | |
11101 | - } | |
11102 | - | |
11103 | - if (notify_page_fault(regs)) | |
11104 | - return; | |
11105 | - | |
11106 | - /* It's safe to allow irq's after cr2 has been saved and the vmalloc | |
11107 | - fault has been handled. */ | |
11108 | - if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) | |
11109 | - local_irq_enable(); | |
11110 | - | |
11111 | - mm = tsk->mm; | |
11112 | - | |
11113 | - /* | |
11114 | - * If we're in an interrupt, have no user context or are running in an | |
11115 | - * atomic region then we must not take the fault.. | |
11116 | - */ | |
11117 | - if (in_atomic() || !mm) | |
11118 | - goto bad_area_nosemaphore; | |
11119 | - | |
11120 | - /* When running in the kernel we expect faults to occur only to | |
11121 | - * addresses in user space. All other faults represent errors in the | |
11122 | - * kernel and should generate an OOPS. Unfortunately, in the case of an | |
11123 | - * erroneous fault occurring in a code path which already holds mmap_sem | |
11124 | - * we will deadlock attempting to validate the fault against the | |
11125 | - * address space. Luckily the kernel only validly references user | |
11126 | - * space from well defined areas of code, which are listed in the | |
11127 | - * exceptions table. | |
11128 | - * | |
11129 | - * As the vast majority of faults will be valid we will only perform | |
11130 | - * the source reference check when there is a possibility of a deadlock. | |
11131 | - * Attempt to lock the address space, if we cannot we then validate the | |
11132 | - * source. If this is invalid we can skip the address space check, | |
11133 | - * thus avoiding the deadlock. | |
11134 | - */ | |
11135 | - if (!down_read_trylock(&mm->mmap_sem)) { | |
11136 | - if ((error_code & 4) == 0 && | |
11137 | - !search_exception_tables(regs->eip)) | |
11138 | - goto bad_area_nosemaphore; | |
11139 | - down_read(&mm->mmap_sem); | |
11140 | - } | |
11141 | - | |
11142 | - vma = find_vma(mm, address); | |
11143 | - if (!vma) | |
11144 | - goto bad_area; | |
11145 | - if (vma->vm_start <= address) | |
11146 | - goto good_area; | |
11147 | - if (!(vma->vm_flags & VM_GROWSDOWN)) | |
11148 | - goto bad_area; | |
11149 | - if (error_code & 4) { | |
11150 | - /* | |
11151 | - * Accessing the stack below %esp is always a bug. | |
11152 | - * The large cushion allows instructions like enter | |
11153 | - * and pusha to work. ("enter $65535,$31" pushes | |
11154 | - * 32 pointers and then decrements %esp by 65535.) | |
11155 | - */ | |
11156 | - if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp) | |
11157 | - goto bad_area; | |
11158 | - } | |
11159 | - if (expand_stack(vma, address)) | |
11160 | - goto bad_area; | |
11161 | -/* | |
11162 | - * Ok, we have a good vm_area for this memory access, so | |
11163 | - * we can handle it.. | |
11164 | - */ | |
11165 | -good_area: | |
11166 | - si_code = SEGV_ACCERR; | |
11167 | - write = 0; | |
11168 | - switch (error_code & 3) { | |
11169 | - default: /* 3: write, present */ | |
11170 | - /* fall through */ | |
11171 | - case 2: /* write, not present */ | |
11172 | - if (!(vma->vm_flags & VM_WRITE)) | |
11173 | - goto bad_area; | |
11174 | - write++; | |
11175 | - break; | |
11176 | - case 1: /* read, present */ | |
11177 | - goto bad_area; | |
11178 | - case 0: /* read, not present */ | |
11179 | - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) | |
11180 | - goto bad_area; | |
11181 | - } | |
11182 | - | |
11183 | - survive: | |
11184 | - /* | |
11185 | - * If for any reason at all we couldn't handle the fault, | |
11186 | - * make sure we exit gracefully rather than endlessly redo | |
11187 | - * the fault. | |
11188 | - */ | |
11189 | - fault = handle_mm_fault(mm, vma, address, write); | |
11190 | - if (unlikely(fault & VM_FAULT_ERROR)) { | |
11191 | - if (fault & VM_FAULT_OOM) | |
11192 | - goto out_of_memory; | |
11193 | - else if (fault & VM_FAULT_SIGBUS) | |
11194 | - goto do_sigbus; | |
11195 | - BUG(); | |
11196 | - } | |
11197 | - if (fault & VM_FAULT_MAJOR) | |
11198 | - tsk->maj_flt++; | |
11199 | - else | |
11200 | - tsk->min_flt++; | |
11201 | - | |
11202 | - /* | |
11203 | - * Did it hit the DOS screen memory VA from vm86 mode? | |
11204 | - */ | |
11205 | - if (regs->eflags & VM_MASK) { | |
11206 | - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; | |
11207 | - if (bit < 32) | |
11208 | - tsk->thread.screen_bitmap |= 1 << bit; | |
11209 | - } | |
11210 | - up_read(&mm->mmap_sem); | |
11211 | - return; | |
11212 | - | |
11213 | -/* | |
11214 | - * Something tried to access memory that isn't in our memory map.. | |
11215 | - * Fix it, but check if it's kernel or user first.. | |
11216 | - */ | |
11217 | -bad_area: | |
11218 | - up_read(&mm->mmap_sem); | |
11219 | - | |
11220 | -bad_area_nosemaphore: | |
11221 | - /* User mode accesses just cause a SIGSEGV */ | |
11222 | - if (error_code & 4) { | |
11223 | - /* | |
11224 | - * It's possible to have interrupts off here. | |
11225 | - */ | |
11226 | - local_irq_enable(); | |
11227 | - | |
11228 | - /* | |
11229 | - * Valid to do another page fault here because this one came | |
11230 | - * from user space. | |
11231 | - */ | |
11232 | - if (is_prefetch(regs, address, error_code)) | |
11233 | - return; | |
11234 | - | |
11235 | - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | |
11236 | - printk_ratelimit()) { | |
11237 | - printk("%s%s[%d]: segfault at %08lx eip %08lx " | |
11238 | - "esp %08lx error %lx\n", | |
11239 | - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, | |
11240 | - tsk->comm, task_pid_nr(tsk), address, regs->eip, | |
11241 | - regs->esp, error_code); | |
11242 | - } | |
11243 | - tsk->thread.cr2 = address; | |
11244 | - /* Kernel addresses are always protection faults */ | |
11245 | - tsk->thread.error_code = error_code | (address >= TASK_SIZE); | |
11246 | - tsk->thread.trap_no = 14; | |
11247 | - force_sig_info_fault(SIGSEGV, si_code, address, tsk); | |
11248 | - return; | |
11249 | - } | |
11250 | - | |
11251 | -#ifdef CONFIG_X86_F00F_BUG | |
11252 | - /* | |
11253 | - * Pentium F0 0F C7 C8 bug workaround. | |
11254 | - */ | |
11255 | - if (boot_cpu_data.f00f_bug) { | |
11256 | - unsigned long nr; | |
11257 | - | |
11258 | - nr = (address - idt_descr.address) >> 3; | |
11259 | - | |
11260 | - if (nr == 6) { | |
11261 | - do_invalid_op(regs, 0); | |
11262 | - return; | |
11263 | - } | |
11264 | - } | |
11265 | -#endif | |
11266 | - | |
11267 | -no_context: | |
11268 | - /* Are we prepared to handle this kernel fault? */ | |
11269 | - if (fixup_exception(regs)) | |
11270 | - return; | |
11271 | - | |
11272 | - /* | |
11273 | - * Valid to do another page fault here, because if this fault | |
11274 | - * had been triggered by is_prefetch fixup_exception would have | |
11275 | - * handled it. | |
11276 | - */ | |
11277 | - if (is_prefetch(regs, address, error_code)) | |
11278 | - return; | |
11279 | - | |
11280 | -/* | |
11281 | - * Oops. The kernel tried to access some bad page. We'll have to | |
11282 | - * terminate things with extreme prejudice. | |
11283 | - */ | |
11284 | - | |
11285 | - bust_spinlocks(1); | |
11286 | - | |
11287 | - if (oops_may_print()) { | |
11288 | -#ifdef CONFIG_X86_PAE | |
11289 | - if (error_code & 16) { | |
11290 | - pte_t *pte = lookup_address(address); | |
11291 | - | |
11292 | - if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) | |
11293 | - printk(KERN_CRIT "kernel tried to execute " | |
11294 | - "NX-protected page - exploit attempt? " | |
11295 | - "(uid: %d)\n", current->uid); | |
11296 | - } | |
11297 | -#endif | |
11298 | - if (address < PAGE_SIZE) | |
11299 | - printk(KERN_ALERT "BUG: unable to handle kernel NULL " | |
11300 | - "pointer dereference"); | |
11301 | - else | |
11302 | - printk(KERN_ALERT "BUG: unable to handle kernel paging" | |
11303 | - " request"); | |
11304 | - printk(" at virtual address %08lx\n",address); | |
11305 | - printk(KERN_ALERT "printing eip: %08lx\n", regs->eip); | |
11306 | - dump_fault_path(address); | |
11307 | - } | |
11308 | - tsk->thread.cr2 = address; | |
11309 | - tsk->thread.trap_no = 14; | |
11310 | - tsk->thread.error_code = error_code; | |
11311 | - die("Oops", regs, error_code); | |
11312 | - bust_spinlocks(0); | |
11313 | - do_exit(SIGKILL); | |
11314 | - | |
11315 | -/* | |
11316 | - * We ran out of memory, or some other thing happened to us that made | |
11317 | - * us unable to handle the page fault gracefully. | |
11318 | - */ | |
11319 | -out_of_memory: | |
11320 | - up_read(&mm->mmap_sem); | |
11321 | - if (is_global_init(tsk)) { | |
11322 | - yield(); | |
11323 | - down_read(&mm->mmap_sem); | |
11324 | - goto survive; | |
11325 | - } | |
11326 | - printk("VM: killing process %s\n", tsk->comm); | |
11327 | - if (error_code & 4) | |
11328 | - do_group_exit(SIGKILL); | |
11329 | - goto no_context; | |
11330 | - | |
11331 | -do_sigbus: | |
11332 | - up_read(&mm->mmap_sem); | |
11333 | - | |
11334 | - /* Kernel mode? Handle exceptions or die */ | |
11335 | - if (!(error_code & 4)) | |
11336 | - goto no_context; | |
11337 | - | |
11338 | - /* User space => ok to do another page fault */ | |
11339 | - if (is_prefetch(regs, address, error_code)) | |
11340 | - return; | |
11341 | - | |
11342 | - tsk->thread.cr2 = address; | |
11343 | - tsk->thread.error_code = error_code; | |
11344 | - tsk->thread.trap_no = 14; | |
11345 | - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); | |
11346 | -} | |
11347 | - | |
11348 | -void vmalloc_sync_all(void) | |
11349 | -{ | |
11350 | - /* | |
11351 | - * Note that races in the updates of insync and start aren't | |
11352 | - * problematic: insync can only get set bits added, and updates to | |
11353 | - * start are only improving performance (without affecting correctness | |
11354 | - * if undone). | |
11355 | - * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs. | |
11356 | - * This change works just fine with 2-level paging too. | |
11357 | - */ | |
11358 | -#define sync_index(a) ((a) >> PMD_SHIFT) | |
11359 | - static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD); | |
11360 | - static unsigned long start = TASK_SIZE; | |
11361 | - unsigned long address; | |
11362 | - | |
11363 | - if (SHARED_KERNEL_PMD) | |
11364 | - return; | |
11365 | - | |
11366 | - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); | |
11367 | - for (address = start; | |
11368 | - address >= TASK_SIZE && address < hypervisor_virt_start; | |
11369 | - address += 1UL << PMD_SHIFT) { | |
11370 | - if (!test_bit(sync_index(address), insync)) { | |
11371 | - unsigned long flags; | |
11372 | - struct page *page; | |
11373 | - | |
11374 | - spin_lock_irqsave(&pgd_lock, flags); | |
11375 | - /* XEN: failure path assumes non-empty pgd_list. */ | |
11376 | - if (unlikely(!pgd_list)) { | |
11377 | - spin_unlock_irqrestore(&pgd_lock, flags); | |
11378 | - return; | |
11379 | - } | |
11380 | - for (page = pgd_list; page; page = | |
11381 | - (struct page *)page->index) | |
11382 | - if (!vmalloc_sync_one(page_address(page), | |
11383 | - address)) { | |
11384 | - BUG_ON(page != pgd_list); | |
11385 | - break; | |
11386 | - } | |
11387 | - spin_unlock_irqrestore(&pgd_lock, flags); | |
11388 | - if (!page) | |
11389 | - set_bit(sync_index(address), insync); | |
11390 | - } | |
11391 | - if (address == start && test_bit(sync_index(address), insync)) | |
11392 | - start = address + (1UL << PMD_SHIFT); | |
11393 | - } | |
11394 | -} | |
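The 32-bit handler deleted above tests the page-fault error code with magic numbers (error_code & 4, & 0x0c, & 0x02, & 0x10); the 64-bit file that follows names the same hardware bits PF_*. For reference while reading both:

	/* Reference sketch: x86 page-fault error-code bits (same layout
	 * on 32-bit and 64-bit; names as in fault_64-xen.c below). */
	#define PF_PROT		(1 << 0)  /* 0: page not present, 1: protection fault */
	#define PF_WRITE	(1 << 1)  /* 0: read access, 1: write access */
	#define PF_USER		(1 << 2)  /* 0: kernel mode, 1: user mode */
	#define PF_RSVD		(1 << 3)  /* reserved bit set in a paging entry */
	#define PF_INSTR	(1 << 4)  /* fault on instruction fetch */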
11395 | --- a/arch/x86/mm/fault_64-xen.c | |
11396 | +++ /dev/null | |
11397 | @@ -1,686 +0,0 @@ | |
11398 | -/* | |
11399 | - * linux/arch/x86-64/mm/fault.c | |
11400 | - * | |
11401 | - * Copyright (C) 1995 Linus Torvalds | |
11402 | - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. | |
11403 | - */ | |
11404 | - | |
11405 | -#include <linux/signal.h> | |
11406 | -#include <linux/sched.h> | |
11407 | -#include <linux/kernel.h> | |
11408 | -#include <linux/errno.h> | |
11409 | -#include <linux/string.h> | |
11410 | -#include <linux/types.h> | |
11411 | -#include <linux/ptrace.h> | |
11412 | -#include <linux/mman.h> | |
11413 | -#include <linux/mm.h> | |
11414 | -#include <linux/smp.h> | |
11415 | -#include <linux/interrupt.h> | |
11416 | -#include <linux/init.h> | |
11417 | -#include <linux/tty.h> | |
11418 | -#include <linux/vt_kern.h> /* For unblank_screen() */ | |
11419 | -#include <linux/compiler.h> | |
11420 | -#include <linux/vmalloc.h> | |
11421 | -#include <linux/module.h> | |
11422 | -#include <linux/kprobes.h> | |
11423 | -#include <linux/uaccess.h> | |
11424 | -#include <linux/kdebug.h> | |
11425 | -#include <linux/kprobes.h> | |
11426 | - | |
11427 | -#include <asm/system.h> | |
11428 | -#include <asm/pgalloc.h> | |
11429 | -#include <asm/smp.h> | |
11430 | -#include <asm/tlbflush.h> | |
11431 | -#include <asm/proto.h> | |
11432 | -#include <asm-generic/sections.h> | |
11433 | - | |
11434 | -/* Page fault error code bits */ | |
11435 | -#define PF_PROT (1<<0) /* or no page found */ | |
11436 | -#define PF_WRITE (1<<1) | |
11437 | -#define PF_USER (1<<2) | |
11438 | -#define PF_RSVD (1<<3) | |
11439 | -#define PF_INSTR (1<<4) | |
11440 | - | |
11441 | -#ifdef CONFIG_KPROBES | |
11442 | -static inline int notify_page_fault(struct pt_regs *regs) | |
11443 | -{ | |
11444 | - int ret = 0; | |
11445 | - | |
11446 | - /* kprobe_running() needs smp_processor_id() */ | |
11447 | - if (!user_mode(regs)) { | |
11448 | - preempt_disable(); | |
11449 | - if (kprobe_running() && kprobe_fault_handler(regs, 14)) | |
11450 | - ret = 1; | |
11451 | - preempt_enable(); | |
11452 | - } | |
11453 | - | |
11454 | - return ret; | |
11455 | -} | |
11456 | -#else | |
11457 | -static inline int notify_page_fault(struct pt_regs *regs) | |
11458 | -{ | |
11459 | - return 0; | |
11460 | -} | |
11461 | -#endif | |
11462 | - | |
11463 | -/* Sometimes the CPU reports invalid exceptions on prefetch. | |
11464 | - Check that here and ignore. | |
11465 | - Opcode checker based on code by Richard Brunner */ | |
11466 | -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, | |
11467 | - unsigned long error_code) | |
11468 | -{ | |
11469 | - unsigned char *instr; | |
11470 | - int scan_more = 1; | |
11471 | - int prefetch = 0; | |
11472 | - unsigned char *max_instr; | |
11473 | - | |
11474 | - /* If it was an exec fault, ignore */ | |
11475 | - if (error_code & PF_INSTR) | |
11476 | - return 0; | |
11477 | - | |
11478 | - instr = (unsigned char __user *)convert_rip_to_linear(current, regs); | |
11479 | - max_instr = instr + 15; | |
11480 | - | |
11481 | - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) | |
11482 | - return 0; | |
11483 | - | |
11484 | - while (scan_more && instr < max_instr) { | |
11485 | - unsigned char opcode; | |
11486 | - unsigned char instr_hi; | |
11487 | - unsigned char instr_lo; | |
11488 | - | |
11489 | - if (probe_kernel_address(instr, opcode)) | |
11490 | - break; | |
11491 | - | |
11492 | - instr_hi = opcode & 0xf0; | |
11493 | - instr_lo = opcode & 0x0f; | |
11494 | - instr++; | |
11495 | - | |
11496 | - switch (instr_hi) { | |
11497 | - case 0x20: | |
11498 | - case 0x30: | |
11499 | - /* Values 0x26,0x2E,0x36,0x3E are valid x86 | |
11500 | - prefixes. In long mode, the CPU will signal | |
11501 | - invalid opcode if some of these prefixes are | |
11502 | - present so we will never get here anyway */ | |
11503 | - scan_more = ((instr_lo & 7) == 0x6); | |
11504 | - break; | |
11505 | - | |
11506 | - case 0x40: | |
11507 | - /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes | |
11508 | - Need to figure out under what instruction mode the | |
11509 | - instruction was issued ... */ | |
11510 | - /* Could check the LDT for lm, but for now it's good | |
11511 | - enough to assume that long mode only uses well known | |
11512 | - segments or kernel. */ | |
11513 | - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); | |
11514 | - break; | |
11515 | - | |
11516 | - case 0x60: | |
11517 | - /* 0x64 thru 0x67 are valid prefixes in all modes. */ | |
11518 | - scan_more = (instr_lo & 0xC) == 0x4; | |
11519 | - break; | |
11520 | - case 0xF0: | |
11521 | - /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ | |
11522 | - scan_more = !instr_lo || (instr_lo>>1) == 1; | |
11523 | - break; | |
11524 | - case 0x00: | |
11525 | - /* Prefetch instruction is 0x0F0D or 0x0F18 */ | |
11526 | - scan_more = 0; | |
11527 | - if (probe_kernel_address(instr, opcode)) | |
11528 | - break; | |
11529 | - prefetch = (instr_lo == 0xF) && | |
11530 | - (opcode == 0x0D || opcode == 0x18); | |
11531 | - break; | |
11532 | - default: | |
11533 | - scan_more = 0; | |
11534 | - break; | |
11535 | - } | |
11536 | - } | |
11537 | - return prefetch; | |
11538 | -} | |
11539 | - | |
11540 | -static int bad_address(void *p) | |
11541 | -{ | |
11542 | - unsigned long dummy; | |
11543 | - return probe_kernel_address((unsigned long *)p, dummy); | |
11544 | -} | |
11545 | - | |
11546 | -void dump_pagetable(unsigned long address) | |
11547 | -{ | |
11548 | - pgd_t *pgd; | |
11549 | - pud_t *pud; | |
11550 | - pmd_t *pmd; | |
11551 | - pte_t *pte; | |
11552 | - | |
11553 | - pgd = (pgd_t *)read_cr3(); | |
11554 | - | |
11555 | - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); | |
11556 | - pgd += pgd_index(address); | |
11557 | - if (bad_address(pgd)) goto bad; | |
11558 | - printk("PGD %lx ", pgd_val(*pgd)); | |
11559 | - if (!pgd_present(*pgd)) goto ret; | |
11560 | - | |
11561 | - pud = pud_offset(pgd, address); | |
11562 | - if (bad_address(pud)) goto bad; | |
11563 | - printk("PUD %lx ", pud_val(*pud)); | |
11564 | - if (!pud_present(*pud)) goto ret; | |
11565 | - | |
11566 | - pmd = pmd_offset(pud, address); | |
11567 | - if (bad_address(pmd)) goto bad; | |
11568 | - printk("PMD %lx ", pmd_val(*pmd)); | |
11569 | - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; | |
11570 | - | |
11571 | - pte = pte_offset_kernel(pmd, address); | |
11572 | - if (bad_address(pte)) goto bad; | |
11573 | - printk("PTE %lx", pte_val(*pte)); | |
11574 | -ret: | |
11575 | - printk("\n"); | |
11576 | - return; | |
11577 | -bad: | |
11578 | - printk("BAD\n"); | |
11579 | -} | |
11580 | - | |
11581 | -static const char errata93_warning[] = | |
11582 | -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" | |
11583 | -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" | |
11584 | -KERN_ERR "******* Please consider a BIOS update.\n" | |
11585 | -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; | |
11586 | - | |
11587 | -/* Workaround for K8 erratum #93 & buggy BIOS. | |
11588 | - BIOS SMM functions are required to use a specific workaround | |
11589 | - to avoid corruption of the 64bit RIP register on C stepping K8. | |
11590 | - A lot of BIOS that didn't get tested properly miss this. | |
11591 | - The OS sees this as a page fault with the upper 32bits of RIP cleared. | |
11592 | - Try to work around it here. | |
11593 | - Note we only handle faults in kernel here. */ | |
11594 | - | |
11595 | -static int is_errata93(struct pt_regs *regs, unsigned long address) | |
11596 | -{ | |
11597 | - static int warned; | |
11598 | - if (address != regs->rip) | |
11599 | - return 0; | |
11600 | - if ((address >> 32) != 0) | |
11601 | - return 0; | |
11602 | - address |= 0xffffffffUL << 32; | |
11603 | - if ((address >= (u64)_stext && address <= (u64)_etext) || | |
11604 | - (address >= MODULES_VADDR && address <= MODULES_END)) { | |
11605 | - if (!warned) { | |
11606 | - printk(errata93_warning); | |
11607 | - warned = 1; | |
11608 | - } | |
11609 | - regs->rip = address; | |
11610 | - return 1; | |
11611 | - } | |
11612 | - return 0; | |
11613 | -} | |
11614 | - | |
11615 | -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, | |
11616 | - unsigned long error_code) | |
11617 | -{ | |
11618 | - unsigned long flags = oops_begin(); | |
11619 | - struct task_struct *tsk; | |
11620 | - | |
11621 | - printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", | |
11622 | - current->comm, address); | |
11623 | - dump_pagetable(address); | |
11624 | - tsk = current; | |
11625 | - tsk->thread.cr2 = address; | |
11626 | - tsk->thread.trap_no = 14; | |
11627 | - tsk->thread.error_code = error_code; | |
11628 | - __die("Bad pagetable", regs, error_code); | |
11629 | - oops_end(flags); | |
11630 | - do_exit(SIGKILL); | |
11631 | -} | |
11632 | - | |
11633 | -/* | |
11634 | - * Handle a fault on the vmalloc area | |
11635 | - * | |
11636 | - * This assumes no large pages in there. | |
11637 | - */ | |
11638 | -static int vmalloc_fault(unsigned long address) | |
11639 | -{ | |
11640 | - pgd_t *pgd, *pgd_ref; | |
11641 | - pud_t *pud, *pud_ref; | |
11642 | - pmd_t *pmd, *pmd_ref; | |
11643 | - pte_t *pte, *pte_ref; | |
11644 | - | |
11645 | - /* Copy kernel mappings over when needed. This can also | |
11646 | - happen within a race in page table update. In the later | |
11647 | - case just flush. */ | |
11648 | - | |
11649 | - /* On Xen the line below does not always work. Needs investigating! */ | |
11650 | - /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/ | |
11651 | - pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); | |
11652 | - pgd += pgd_index(address); | |
11653 | - pgd_ref = pgd_offset_k(address); | |
11654 | - if (pgd_none(*pgd_ref)) | |
11655 | - return -1; | |
11656 | - if (pgd_none(*pgd)) | |
11657 | - set_pgd(pgd, *pgd_ref); | |
11658 | - else | |
11659 | - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | |
11660 | - | |
11661 | - /* Below here mismatches are bugs because these lower tables | |
11662 | - are shared */ | |
11663 | - | |
11664 | - pud = pud_offset(pgd, address); | |
11665 | - pud_ref = pud_offset(pgd_ref, address); | |
11666 | - if (pud_none(*pud_ref)) | |
11667 | - return -1; | |
11668 | - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) | |
11669 | - BUG(); | |
11670 | - pmd = pmd_offset(pud, address); | |
11671 | - pmd_ref = pmd_offset(pud_ref, address); | |
11672 | - if (pmd_none(*pmd_ref)) | |
11673 | - return -1; | |
11674 | - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) | |
11675 | - BUG(); | |
11676 | - pte_ref = pte_offset_kernel(pmd_ref, address); | |
11677 | - if (!pte_present(*pte_ref)) | |
11678 | - return -1; | |
11679 | - pte = pte_offset_kernel(pmd, address); | |
11680 | - /* Don't use pte_page here, because the mappings can point | |
11681 | - outside mem_map, and the NUMA hash lookup cannot handle | |
11682 | - that. */ | |
11683 | - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) | |
11684 | - BUG(); | |
11685 | - return 0; | |
11686 | -} | |
11687 | - | |
11688 | -int show_unhandled_signals = 1; | |
11689 | - | |
11690 | - | |
11691 | -#define MEM_VERBOSE 1 | |
11692 | - | |
11693 | -#ifdef MEM_VERBOSE | |
11694 | -#define MEM_LOG(_f, _a...) \ | |
11695 | - printk("fault.c:[%d]-> " _f "\n", \ | |
11696 | - __LINE__ , ## _a ) | |
11697 | -#else | |
11698 | -#define MEM_LOG(_f, _a...) ((void)0) | |
11699 | -#endif | |
11700 | - | |
11701 | -static int spurious_fault(struct pt_regs *regs, | |
11702 | - unsigned long address, | |
11703 | - unsigned long error_code) | |
11704 | -{ | |
11705 | - pgd_t *pgd; | |
11706 | - pud_t *pud; | |
11707 | - pmd_t *pmd; | |
11708 | - pte_t *pte; | |
11709 | - | |
11710 | -#ifdef CONFIG_XEN | |
11711 | - /* Faults in hypervisor area are never spurious. */ | |
11712 | - if ((address >= HYPERVISOR_VIRT_START) && | |
11713 | - (address < HYPERVISOR_VIRT_END)) | |
11714 | - return 0; | |
11715 | -#endif | |
11716 | - | |
11717 | - /* Reserved-bit violation or user access to kernel space? */ | |
11718 | - if (error_code & (PF_RSVD|PF_USER)) | |
11719 | - return 0; | |
11720 | - | |
11721 | - pgd = init_mm.pgd + pgd_index(address); | |
11722 | - if (!pgd_present(*pgd)) | |
11723 | - return 0; | |
11724 | - | |
11725 | - pud = pud_offset(pgd, address); | |
11726 | - if (!pud_present(*pud)) | |
11727 | - return 0; | |
11728 | - | |
11729 | - pmd = pmd_offset(pud, address); | |
11730 | - if (!pmd_present(*pmd)) | |
11731 | - return 0; | |
11732 | - | |
11733 | - pte = pte_offset_kernel(pmd, address); | |
11734 | - if (!pte_present(*pte)) | |
11735 | - return 0; | |
11736 | - if ((error_code & PF_WRITE) && !pte_write(*pte)) | |
11737 | - return 0; | |
11738 | - if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX)) | |
11739 | - return 0; | |
11740 | - | |
11741 | - return 1; | |
11742 | -} | |
11743 | - | |
11744 | -/* | |
11745 | - * This routine handles page faults. It determines the address, | |
11746 | - * and the problem, and then passes it off to one of the appropriate | |
11747 | - * routines. | |
11748 | - */ | |
11749 | -asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, | |
11750 | - unsigned long error_code) | |
11751 | -{ | |
11752 | - struct task_struct *tsk; | |
11753 | - struct mm_struct *mm; | |
11754 | - struct vm_area_struct * vma; | |
11755 | - unsigned long address; | |
11756 | - const struct exception_table_entry *fixup; | |
11757 | - int write, fault; | |
11758 | - unsigned long flags; | |
11759 | - siginfo_t info; | |
11760 | - | |
11761 | - if (!user_mode(regs)) | |
11762 | - error_code &= ~PF_USER; /* means kernel */ | |
11763 | - | |
11764 | - /* | |
11765 | - * We can fault from pretty much anywhere, with unknown IRQ state. | |
11766 | - */ | |
11767 | - trace_hardirqs_fixup(); | |
11768 | - | |
11769 | - tsk = current; | |
11770 | - mm = tsk->mm; | |
11771 | - prefetchw(&mm->mmap_sem); | |
11772 | - | |
11773 | - /* get the address */ | |
11774 | - address = read_cr2(); | |
11775 | - | |
11776 | - info.si_code = SEGV_MAPERR; | |
11777 | - | |
11778 | - | |
11779 | - /* | |
11780 | - * We fault-in kernel-space virtual memory on-demand. The | |
11781 | - * 'reference' page table is init_mm.pgd. | |
11782 | - * | |
11783 | - * NOTE! We MUST NOT take any locks for this case. We may | |
11784 | - * be in an interrupt or a critical region, and should | |
11785 | - * only copy the information from the master page table, | |
11786 | - * nothing more. | |
11787 | - * | |
11788 | - * This verifies that the fault happens in kernel space | |
11789 | - * (error_code & 4) == 0, and that the fault was not a | |
11790 | - * protection error (error_code & 9) == 0. | |
11791 | - */ | |
11792 | - if (unlikely(address >= TASK_SIZE64)) { | |
11793 | - /* | |
11794 | - * Don't check for the module range here: its PML4 | |
11795 | - * is always initialized because it's shared with the main | |
11796 | - * kernel text. Only vmalloc may need PML4 syncups. | |
11797 | - */ | |
11798 | - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && | |
11799 | - ((address >= VMALLOC_START && address < VMALLOC_END))) { | |
11800 | - if (vmalloc_fault(address) >= 0) | |
11801 | - return; | |
11802 | - } | |
11803 | - /* Can take a spurious fault if mapping changes R/O -> R/W. */ | |
11804 | - if (spurious_fault(regs, address, error_code)) | |
11805 | - return; | |
11806 | - if (notify_page_fault(regs)) | |
11807 | - return; | |
11808 | - /* | |
11809 | - * Don't take the mm semaphore here. If we fixup a prefetch | |
11810 | - * fault we could otherwise deadlock. | |
11811 | - */ | |
11812 | - goto bad_area_nosemaphore; | |
11813 | - } | |
11814 | - | |
11815 | - if (notify_page_fault(regs)) | |
11816 | - return; | |
11817 | - | |
11818 | - if (likely(regs->eflags & X86_EFLAGS_IF)) | |
11819 | - local_irq_enable(); | |
11820 | - | |
11821 | - if (unlikely(error_code & PF_RSVD)) | |
11822 | - pgtable_bad(address, regs, error_code); | |
11823 | - | |
11824 | - /* | |
11825 | - * If we're in an interrupt or have no user | |
11826 | - * context, we must not take the fault.. | |
11827 | - */ | |
11828 | - if (unlikely(in_atomic() || !mm)) | |
11829 | - goto bad_area_nosemaphore; | |
11830 | - | |
11831 | - /* | |
11832 | - * User-mode registers count as a user access even for any | |
11833 | - * potential system fault or CPU buglet. | |
11834 | - */ | |
11835 | - if (user_mode_vm(regs)) | |
11836 | - error_code |= PF_USER; | |
11837 | - | |
11838 | - again: | |
11839 | - /* When running in the kernel we expect faults to occur only to | |
11840 | - * addresses in user space. All other faults represent errors in the | |
11841 | - * kernel and should generate an OOPS. Unfortunately, in the case of an | |
11842 | - * erroneous fault occurring in a code path which already holds mmap_sem | |
11843 | - * we will deadlock attempting to validate the fault against the | |
11844 | - * address space. Luckily the kernel only validly references user | |
11845 | - * space from well defined areas of code, which are listed in the | |
11846 | - * exceptions table. | |
11847 | - * | |
11848 | - * As the vast majority of faults will be valid we will only perform | |
11849 | - * the source reference check when there is a possibility of a deadlock. | |
11850 | - * Attempt to lock the address space, if we cannot we then validate the | |
11851 | - * source. If this is invalid we can skip the address space check, | |
11852 | - * thus avoiding the deadlock. | |
11853 | - */ | |
11854 | - if (!down_read_trylock(&mm->mmap_sem)) { | |
11855 | - if ((error_code & PF_USER) == 0 && | |
11856 | - !search_exception_tables(regs->rip)) | |
11857 | - goto bad_area_nosemaphore; | |
11858 | - down_read(&mm->mmap_sem); | |
11859 | - } | |
11860 | - | |
11861 | - vma = find_vma(mm, address); | |
11862 | - if (!vma) | |
11863 | - goto bad_area; | |
11864 | - if (likely(vma->vm_start <= address)) | |
11865 | - goto good_area; | |
11866 | - if (!(vma->vm_flags & VM_GROWSDOWN)) | |
11867 | - goto bad_area; | |
11868 | - if (error_code & 4) { | |
11869 | - /* Allow userspace just enough access below the stack pointer | |
11870 | - * to let the 'enter' instruction work. | |
11871 | - */ | |
11872 | - if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) | |
11873 | - goto bad_area; | |
11874 | - } | |
11875 | - if (expand_stack(vma, address)) | |
11876 | - goto bad_area; | |
11877 | -/* | |
11878 | - * Ok, we have a good vm_area for this memory access, so | |
11879 | - * we can handle it.. | |
11880 | - */ | |
11881 | -good_area: | |
11882 | - info.si_code = SEGV_ACCERR; | |
11883 | - write = 0; | |
11884 | - switch (error_code & (PF_PROT|PF_WRITE)) { | |
11885 | - default: /* 3: write, present */ | |
11886 | - /* fall through */ | |
11887 | - case PF_WRITE: /* write, not present */ | |
11888 | - if (!(vma->vm_flags & VM_WRITE)) | |
11889 | - goto bad_area; | |
11890 | - write++; | |
11891 | - break; | |
11892 | - case PF_PROT: /* read, present */ | |
11893 | - goto bad_area; | |
11894 | - case 0: /* read, not present */ | |
11895 | - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) | |
11896 | - goto bad_area; | |
11897 | - } | |
11898 | - | |
11899 | - /* | |
11900 | - * If for any reason at all we couldn't handle the fault, | |
11901 | - * make sure we exit gracefully rather than endlessly redo | |
11902 | - * the fault. | |
11903 | - */ | |
11904 | - fault = handle_mm_fault(mm, vma, address, write); | |
11905 | - if (unlikely(fault & VM_FAULT_ERROR)) { | |
11906 | - if (fault & VM_FAULT_OOM) | |
11907 | - goto out_of_memory; | |
11908 | - else if (fault & VM_FAULT_SIGBUS) | |
11909 | - goto do_sigbus; | |
11910 | - BUG(); | |
11911 | - } | |
11912 | - if (fault & VM_FAULT_MAJOR) | |
11913 | - tsk->maj_flt++; | |
11914 | - else | |
11915 | - tsk->min_flt++; | |
11916 | - up_read(&mm->mmap_sem); | |
11917 | - return; | |
11918 | - | |
11919 | -/* | |
11920 | - * Something tried to access memory that isn't in our memory map.. | |
11921 | - * Fix it, but check if it's kernel or user first.. | |
11922 | - */ | |
11923 | -bad_area: | |
11924 | - up_read(&mm->mmap_sem); | |
11925 | - | |
11926 | -bad_area_nosemaphore: | |
11927 | - /* User mode accesses just cause a SIGSEGV */ | |
11928 | - if (error_code & PF_USER) { | |
11929 | - | |
11930 | - /* | |
11931 | - * It's possible to have interrupts off here. | |
11932 | - */ | |
11933 | - local_irq_enable(); | |
11934 | - | |
11935 | - if (is_prefetch(regs, address, error_code)) | |
11936 | - return; | |
11937 | - | |
11938 | - /* Work around K8 erratum #100 K8 in compat mode | |
11939 | - occasionally jumps to illegal addresses >4GB. We | |
11940 | - catch this here in the page fault handler because | |
11941 | - these addresses are not reachable. Just detect this | |
11942 | - case and return. Any code segment in LDT is | |
11943 | - compatibility mode. */ | |
11944 | - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && | |
11945 | - (address >> 32)) | |
11946 | - return; | |
11947 | - | |
11948 | - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | |
11949 | - printk_ratelimit()) { | |
11950 | - printk( | |
11951 | - "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n", | |
11952 | - tsk->pid > 1 ? KERN_INFO : KERN_EMERG, | |
11953 | - tsk->comm, tsk->pid, address, regs->rip, | |
11954 | - regs->rsp, error_code); | |
11955 | - } | |
11956 | - | |
11957 | - tsk->thread.cr2 = address; | |
11958 | - /* Kernel addresses are always protection faults */ | |
11959 | - tsk->thread.error_code = error_code | (address >= TASK_SIZE); | |
11960 | - tsk->thread.trap_no = 14; | |
11961 | - info.si_signo = SIGSEGV; | |
11962 | - info.si_errno = 0; | |
11963 | - /* info.si_code has been set above */ | |
11964 | - info.si_addr = (void __user *)address; | |
11965 | - force_sig_info(SIGSEGV, &info, tsk); | |
11966 | - return; | |
11967 | - } | |
11968 | - | |
11969 | -no_context: | |
11970 | - | |
11971 | - /* Are we prepared to handle this kernel fault? */ | |
11972 | - fixup = search_exception_tables(regs->rip); | |
11973 | - if (fixup) { | |
11974 | - regs->rip = fixup->fixup; | |
11975 | - return; | |
11976 | - } | |
11977 | - | |
11978 | - /* | |
11979 | - * Hall of shame of CPU/BIOS bugs. | |
11980 | - */ | |
11981 | - | |
11982 | - if (is_prefetch(regs, address, error_code)) | |
11983 | - return; | |
11984 | - | |
11985 | - if (is_errata93(regs, address)) | |
11986 | - return; | |
11987 | - | |
11988 | -/* | |
11989 | - * Oops. The kernel tried to access some bad page. We'll have to | |
11990 | - * terminate things with extreme prejudice. | |
11991 | - */ | |
11992 | - | |
11993 | - flags = oops_begin(); | |
11994 | - | |
11995 | - if (address < PAGE_SIZE) | |
11996 | - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); | |
11997 | - else | |
11998 | - printk(KERN_ALERT "Unable to handle kernel paging request"); | |
11999 | - printk(" at %016lx RIP: \n" KERN_ALERT,address); | |
12000 | - printk_address(regs->rip); | |
12001 | - dump_pagetable(address); | |
12002 | - tsk->thread.cr2 = address; | |
12003 | - tsk->thread.trap_no = 14; | |
12004 | - tsk->thread.error_code = error_code; | |
12005 | - __die("Oops", regs, error_code); | |
12006 | - /* Executive summary in case the body of the oops scrolled away */ | |
12007 | - printk(KERN_EMERG "CR2: %016lx\n", address); | |
12008 | - oops_end(flags); | |
12009 | - do_exit(SIGKILL); | |
12010 | - | |
12011 | -/* | |
12012 | - * We ran out of memory, or some other thing happened to us that made | |
12013 | - * us unable to handle the page fault gracefully. | |
12014 | - */ | |
12015 | -out_of_memory: | |
12016 | - up_read(&mm->mmap_sem); | |
12017 | - if (is_global_init(current)) { | |
12018 | - yield(); | |
12019 | - goto again; | |
12020 | - } | |
12021 | - printk("VM: killing process %s\n", tsk->comm); | |
12022 | - if (error_code & 4) | |
12023 | - do_group_exit(SIGKILL); | |
12024 | - goto no_context; | |
12025 | - | |
12026 | -do_sigbus: | |
12027 | - up_read(&mm->mmap_sem); | |
12028 | - | |
12029 | - /* Kernel mode? Handle exceptions or die */ | |
12030 | - if (!(error_code & PF_USER)) | |
12031 | - goto no_context; | |
12032 | - | |
12033 | - tsk->thread.cr2 = address; | |
12034 | - tsk->thread.error_code = error_code; | |
12035 | - tsk->thread.trap_no = 14; | |
12036 | - info.si_signo = SIGBUS; | |
12037 | - info.si_errno = 0; | |
12038 | - info.si_code = BUS_ADRERR; | |
12039 | - info.si_addr = (void __user *)address; | |
12040 | - force_sig_info(SIGBUS, &info, tsk); | |
12041 | - return; | |
12042 | -} | |
12043 | - | |
12044 | -DEFINE_SPINLOCK(pgd_lock); | |
12045 | -LIST_HEAD(pgd_list); | |
12046 | - | |
12047 | -void vmalloc_sync_all(void) | |
12048 | -{ | |
12049 | - /* Note that races in the updates of insync and start aren't | |
12050 | - problematic: | |
12051 | - insync can only get set bits added, and updates to start are only | |
12052 | - improving performance (without affecting correctness if undone). */ | |
12053 | - static DECLARE_BITMAP(insync, PTRS_PER_PGD); | |
12054 | - static unsigned long start = VMALLOC_START & PGDIR_MASK; | |
12055 | - unsigned long address; | |
12056 | - | |
12057 | - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { | |
12058 | - if (!test_bit(pgd_index(address), insync)) { | |
12059 | - const pgd_t *pgd_ref = pgd_offset_k(address); | |
12060 | - struct page *page; | |
12061 | - | |
12062 | - if (pgd_none(*pgd_ref)) | |
12063 | - continue; | |
12064 | - spin_lock(&pgd_lock); | |
12065 | - list_for_each_entry(page, &pgd_list, lru) { | |
12066 | - pgd_t *pgd; | |
12067 | - pgd = (pgd_t *)page_address(page) + pgd_index(address); | |
12068 | - if (pgd_none(*pgd)) | |
12069 | - set_pgd(pgd, *pgd_ref); | |
12070 | - else | |
12071 | - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | |
12072 | - } | |
12073 | - spin_unlock(&pgd_lock); | |
12074 | - set_bit(pgd_index(address), insync); | |
12075 | - } | |
12076 | - if (address == start) | |
12077 | - start = address + PGDIR_SIZE; | |
12078 | - } | |
12079 | - /* Check that there is no need to do the same for the modules area. */ | |
12080 | - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); | |
12081 | - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == | |
12082 | - (__START_KERNEL & PGDIR_MASK))); | |
12083 | -} | |
12084 | --- /dev/null | |
12085 | +++ b/arch/x86/mm/fault-xen.c | |
12086 | @@ -0,0 +1,1026 @@ | |
12087 | +/* | |
12088 | + * Copyright (C) 1995 Linus Torvalds | |
12089 | + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. | |
12090 | + */ | |
12091 | + | |
12092 | +#include <linux/signal.h> | |
12093 | +#include <linux/sched.h> | |
12094 | +#include <linux/kernel.h> | |
12095 | +#include <linux/errno.h> | |
12096 | +#include <linux/string.h> | |
12097 | +#include <linux/types.h> | |
12098 | +#include <linux/ptrace.h> | |
12099 | +#include <linux/mman.h> | |
12100 | +#include <linux/mm.h> | |
12101 | +#include <linux/smp.h> | |
12102 | +#include <linux/interrupt.h> | |
12103 | +#include <linux/init.h> | |
12104 | +#include <linux/tty.h> | |
12105 | +#include <linux/vt_kern.h> /* For unblank_screen() */ | |
12106 | +#include <linux/compiler.h> | |
12107 | +#include <linux/highmem.h> | |
12108 | +#include <linux/bootmem.h> /* for max_low_pfn */ | |
12109 | +#include <linux/vmalloc.h> | |
12110 | +#include <linux/module.h> | |
12111 | +#include <linux/kprobes.h> | |
12112 | +#include <linux/uaccess.h> | |
12113 | +#include <linux/kdebug.h> | |
12114 | + | |
12115 | +#include <asm/system.h> | |
12116 | +#include <asm/desc.h> | |
12117 | +#include <asm/segment.h> | |
12118 | +#include <asm/pgalloc.h> | |
12119 | +#include <asm/smp.h> | |
12120 | +#include <asm/tlbflush.h> | |
12121 | +#include <asm/proto.h> | |
12122 | +#include <asm-generic/sections.h> | |
12123 | + | |
12124 | +/* | |
12125 | + * Page fault error code bits | |
12126 | + * bit 0 == 0 means no page found, 1 means protection fault | |
12127 | + * bit 1 == 0 means read, 1 means write | |
12128 | + * bit 2 == 0 means kernel, 1 means user-mode | |
12129 | + * bit 3 == 1 means use of reserved bit detected | |
12130 | + * bit 4 == 1 means fault was an instruction fetch | |
12131 | + */ | |
12132 | +#define PF_PROT (1<<0) | |
12133 | +#define PF_WRITE (1<<1) | |
12134 | +#define PF_USER (1<<2) | |
12135 | +#define PF_RSVD (1<<3) | |
12136 | +#define PF_INSTR (1<<4) | |
12137 | + | |
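For illustration only (not part of the patch): a minimal userspace sketch that decodes the PF_* bits documented above; the decode_pf_error helper is hypothetical.

#include <stdio.h>

#define PF_PROT	 (1 << 0)
#define PF_WRITE (1 << 1)
#define PF_USER	 (1 << 2)
#define PF_RSVD	 (1 << 3)
#define PF_INSTR (1 << 4)

/* Print a human-readable summary of a page fault error code. */
static void decode_pf_error(unsigned long ec)
{
	printf("%s %s, page %s%s\n",
	       ec & PF_USER ? "user" : "kernel",
	       ec & PF_INSTR ? "instruction fetch" :
	       ec & PF_WRITE ? "write" : "read",
	       ec & PF_PROT ? "present (protection fault)" : "not present",
	       ec & PF_RSVD ? ", reserved bit set" : "");
}

int main(void)
{
	decode_pf_error(PF_USER | PF_WRITE);	/* user write, page not present */
	return 0;
}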
12138 | +static inline int notify_page_fault(struct pt_regs *regs) | |
12139 | +{ | |
12140 | +#ifdef CONFIG_KPROBES | |
12141 | + int ret = 0; | |
12142 | + | |
12143 | + /* kprobe_running() needs smp_processor_id() */ | |
12144 | +#ifdef CONFIG_X86_32 | |
12145 | + if (!user_mode_vm(regs)) { | |
12146 | +#else | |
12147 | + if (!user_mode(regs)) { | |
12148 | +#endif | |
12149 | + preempt_disable(); | |
12150 | + if (kprobe_running() && kprobe_fault_handler(regs, 14)) | |
12151 | + ret = 1; | |
12152 | + preempt_enable(); | |
12153 | + } | |
12154 | + | |
12155 | + return ret; | |
12156 | +#else | |
12157 | + return 0; | |
12158 | +#endif | |
12159 | +} | |
12160 | + | |
12161 | +/* | |
12162 | + * X86_32 | |
12163 | + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. | |
12164 | + * Check that here and ignore it. | |
12165 | + * | |
12166 | + * X86_64 | |
12167 | + * Sometimes the CPU reports invalid exceptions on prefetch. | |
12168 | + * Check that here and ignore it. | |
12169 | + * | |
12170 | + * Opcode checker based on code by Richard Brunner | |
12171 | + */ | |
12172 | +static int is_prefetch(struct pt_regs *regs, unsigned long addr, | |
12173 | + unsigned long error_code) | |
12174 | +{ | |
12175 | + unsigned char *instr; | |
12176 | + int scan_more = 1; | |
12177 | + int prefetch = 0; | |
12178 | + unsigned char *max_instr; | |
12179 | + | |
12180 | + /* | |
12181 | + * If it was an exec (instruction fetch) fault on an NX page, then | |
12182 | + * do not ignore the fault: | |
12183 | + */ | |
12184 | + if (error_code & PF_INSTR) | |
12185 | + return 0; | |
12186 | + | |
12187 | + instr = (unsigned char *)convert_ip_to_linear(current, regs); | |
12188 | + max_instr = instr + 15; | |
12189 | + | |
12190 | + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) | |
12191 | + return 0; | |
12192 | + | |
12193 | + while (scan_more && instr < max_instr) { | |
12194 | + unsigned char opcode; | |
12195 | + unsigned char instr_hi; | |
12196 | + unsigned char instr_lo; | |
12197 | + | |
12198 | + if (probe_kernel_address(instr, opcode)) | |
12199 | + break; | |
12200 | + | |
12201 | + instr_hi = opcode & 0xf0; | |
12202 | + instr_lo = opcode & 0x0f; | |
12203 | + instr++; | |
12204 | + | |
12205 | + switch (instr_hi) { | |
12206 | + case 0x20: | |
12207 | + case 0x30: | |
12208 | + /* | |
12209 | + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. | |
12210 | + * In X86_64 long mode, the CPU will signal invalid | |
12211 | + * opcode if some of these prefixes are present so | |
12212 | + * opcode if some of these prefixes are present, so | |
12213 | + * X86_64 will never get here anyway. | |
12214 | + scan_more = ((instr_lo & 7) == 0x6); | |
12215 | + break; | |
12216 | +#ifdef CONFIG_X86_64 | |
12217 | + case 0x40: | |
12218 | + * In AMD64 long mode 0x40..0x4F are valid REX prefixes. | |
12219 | + * In AMD64 long mode 0x40..0x4F are valid REX prefixes | |
12220 | + * Need to figure out under what instruction mode the | |
12221 | + * instruction was issued. Could check the LDT for lm, | |
12222 | + * but for now it's good enough to assume that long | |
12223 | + * mode only uses well known segments or kernel. | |
12224 | + */ | |
12225 | + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); | |
12226 | + break; | |
12227 | +#endif | |
12228 | + case 0x60: | |
12229 | + /* 0x64 thru 0x67 are valid prefixes in all modes. */ | |
12230 | + scan_more = (instr_lo & 0xC) == 0x4; | |
12231 | + break; | |
12232 | + case 0xF0: | |
12233 | + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ | |
12234 | + scan_more = !instr_lo || (instr_lo>>1) == 1; | |
12235 | + break; | |
12236 | + case 0x00: | |
12237 | + /* Prefetch instruction is 0x0F0D or 0x0F18 */ | |
12238 | + scan_more = 0; | |
12239 | + | |
12240 | + if (probe_kernel_address(instr, opcode)) | |
12241 | + break; | |
12242 | + prefetch = (instr_lo == 0xF) && | |
12243 | + (opcode == 0x0D || opcode == 0x18); | |
12244 | + break; | |
12245 | + default: | |
12246 | + scan_more = 0; | |
12247 | + break; | |
12248 | + } | |
12249 | + } | |
12250 | + return prefetch; | |
12251 | +} | |
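To make the scan above concrete, here is a standalone sketch of the same prefix-skipping loop run over a flat byte buffer (REX handling and segment-base math dropped for brevity; the is_prefetch_insn name is made up):

#include <stdio.h>

/*
 * The caller must guarantee at least one byte follows a 0x0F escape;
 * this sketch keeps the buffer long enough by construction.
 */
static int is_prefetch_insn(const unsigned char *instr)
{
	const unsigned char *max_instr = instr + 15;
	int scan_more = 1, prefetch = 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode = *instr++;
		unsigned char hi = opcode & 0xf0, lo = opcode & 0x0f;

		switch (hi) {
		case 0x20: case 0x30:	/* 0x26,0x2E,0x36,0x3E segment overrides */
			scan_more = ((lo & 7) == 0x6);
			break;
		case 0x60:		/* 0x64..0x67 prefixes */
			scan_more = (lo & 0xC) == 0x4;
			break;
		case 0xF0:		/* LOCK, REPNE, REP */
			scan_more = !lo || (lo >> 1) == 1;
			break;
		case 0x00:		/* possible 0x0F escape */
			scan_more = 0;
			prefetch = (lo == 0xF) &&
				   (*instr == 0x0D || *instr == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

int main(void)
{
	/* fs-prefixed prefetchnta: 0x64, 0x0F, 0x18, modrm */
	unsigned char insn[] = { 0x64, 0x0F, 0x18, 0x00 };

	printf("%d\n", is_prefetch_insn(insn));	/* prints 1 */
	return 0;
}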
12252 | + | |
12253 | +static void force_sig_info_fault(int si_signo, int si_code, | |
12254 | + unsigned long address, struct task_struct *tsk) | |
12255 | +{ | |
12256 | + siginfo_t info; | |
12257 | + | |
12258 | + info.si_signo = si_signo; | |
12259 | + info.si_errno = 0; | |
12260 | + info.si_code = si_code; | |
12261 | + info.si_addr = (void __user *)address; | |
12262 | + force_sig_info(si_signo, &info, tsk); | |
12263 | +} | |
12264 | + | |
12265 | +#ifdef CONFIG_X86_64 | |
12266 | +static int bad_address(void *p) | |
12267 | +{ | |
12268 | + unsigned long dummy; | |
12269 | + return probe_kernel_address((unsigned long *)p, dummy); | |
12270 | +} | |
12271 | +#endif | |
12272 | + | |
12273 | +static void dump_pagetable(unsigned long address) | |
12274 | +{ | |
12275 | +#ifdef CONFIG_X86_32 | |
12276 | + __typeof__(pte_val(__pte(0))) page; | |
12277 | + | |
12278 | + page = read_cr3(); | |
12279 | + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; | |
12280 | +#ifdef CONFIG_X86_PAE | |
12281 | + printk("*pdpt = %016Lx ", page); | |
12282 | + if ((page & _PAGE_PRESENT) | |
12283 | + && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) { | |
12284 | + page = mfn_to_pfn(page >> PAGE_SHIFT); | |
12285 | + page <<= PAGE_SHIFT; | |
12286 | + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) | |
12287 | + & (PTRS_PER_PMD - 1)]; | |
12288 | + printk(KERN_CONT "*pde = %016Lx ", page); | |
12289 | + page &= ~_PAGE_NX; | |
12290 | + } | |
12291 | +#else | |
12292 | + printk("*pde = %08lx ", page); | |
12293 | +#endif | |
12294 | + | |
12295 | + /* | |
12296 | + * We must not directly access the pte in the highpte | |
12297 | + * case if the page table is located in highmem. | |
12298 | + * And let's rather not kmap-atomic the pte, just in case | |
12299 | + * it's allocated already. | |
12300 | + */ | |
12301 | + if ((page & _PAGE_PRESENT) | |
12302 | + && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn | |
12303 | + && !(page & _PAGE_PSE)) { | |
12304 | + page = mfn_to_pfn(page >> PAGE_SHIFT); | |
12305 | + page <<= PAGE_SHIFT; | |
12306 | + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) | |
12307 | + & (PTRS_PER_PTE - 1)]; | |
12308 | + printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page); | |
12309 | + } | |
12310 | + | |
12311 | + printk(KERN_CONT "\n"); | |
12312 | +#else /* CONFIG_X86_64 */ | |
12313 | + pgd_t *pgd; | |
12314 | + pud_t *pud; | |
12315 | + pmd_t *pmd; | |
12316 | + pte_t *pte; | |
12317 | + | |
12318 | + pgd = (pgd_t *)read_cr3(); | |
12319 | + | |
12320 | + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); | |
12321 | + pgd += pgd_index(address); | |
12322 | + if (bad_address(pgd)) goto bad; | |
12323 | + printk("PGD %lx ", pgd_val(*pgd)); | |
12324 | + if (!pgd_present(*pgd)) goto ret; | |
12325 | + | |
12326 | + pud = pud_offset(pgd, address); | |
12327 | + if (bad_address(pud)) goto bad; | |
12328 | + printk(KERN_CONT "PUD %lx ", pud_val(*pud)); | |
12329 | + if (!pud_present(*pud) || pud_large(*pud)) | |
12330 | + goto ret; | |
12331 | + | |
12332 | + pmd = pmd_offset(pud, address); | |
12333 | + if (bad_address(pmd)) goto bad; | |
12334 | + printk(KERN_CONT "PMD %lx ", pmd_val(*pmd)); | |
12335 | + if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; | |
12336 | + | |
12337 | + pte = pte_offset_kernel(pmd, address); | |
12338 | + if (bad_address(pte)) goto bad; | |
12339 | + printk(KERN_CONT "PTE %lx", pte_val(*pte)); | |
12340 | +ret: | |
12341 | + printk(KERN_CONT "\n"); | |
12342 | + return; | |
12343 | +bad: | |
12344 | + printk("BAD\n"); | |
12345 | +#endif | |
12346 | +} | |
12347 | + | |
12348 | +#ifdef CONFIG_X86_32 | |
12349 | +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) | |
12350 | +{ | |
12351 | + unsigned index = pgd_index(address); | |
12352 | + pgd_t *pgd_k; | |
12353 | + pud_t *pud, *pud_k; | |
12354 | + pmd_t *pmd, *pmd_k; | |
12355 | + | |
12356 | + pgd += index; | |
12357 | + pgd_k = init_mm.pgd + index; | |
12358 | + | |
12359 | + if (!pgd_present(*pgd_k)) | |
12360 | + return NULL; | |
12361 | + | |
12362 | + /* | |
12363 | + * set_pgd(pgd, *pgd_k); here would be useless on PAE | |
12364 | + * and redundant with the set_pmd() on non-PAE. As would | |
12365 | + * set_pud. | |
12366 | + */ | |
12367 | + | |
12368 | + pud = pud_offset(pgd, address); | |
12369 | + pud_k = pud_offset(pgd_k, address); | |
12370 | + if (!pud_present(*pud_k)) | |
12371 | + return NULL; | |
12372 | + | |
12373 | + pmd = pmd_offset(pud, address); | |
12374 | + pmd_k = pmd_offset(pud_k, address); | |
12375 | + if (!pmd_present(*pmd_k)) | |
12376 | + return NULL; | |
12377 | + if (!pmd_present(*pmd)) { | |
12378 | + bool lazy = x86_read_percpu(xen_lazy_mmu); | |
12379 | + | |
12380 | + x86_write_percpu(xen_lazy_mmu, false); | |
12381 | +#if CONFIG_XEN_COMPAT > 0x030002 | |
12382 | + set_pmd(pmd, *pmd_k); | |
12383 | +#else | |
12384 | + /* | |
12385 | + * When running on older Xen we must launder *pmd_k through | |
12386 | + * pmd_val() to ensure that _PAGE_PRESENT is correctly set. | |
12387 | + */ | |
12388 | + set_pmd(pmd, __pmd(pmd_val(*pmd_k))); | |
12389 | +#endif | |
12390 | + x86_write_percpu(xen_lazy_mmu, lazy); | |
12391 | + } else | |
12392 | + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); | |
12393 | + return pmd_k; | |
12394 | +} | |
12395 | +#endif | |
12396 | + | |
12397 | +#ifdef CONFIG_X86_64 | |
12398 | +static const char errata93_warning[] = | |
12399 | +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" | |
12400 | +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" | |
12401 | +KERN_ERR "******* Please consider a BIOS update.\n" | |
12402 | +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; | |
12403 | +#endif | |
12404 | + | |
12405 | +/* Workaround for K8 erratum #93 & buggy BIOS. | |
12406 | + BIOS SMM functions are required to use a specific workaround | |
12407 | + to avoid corruption of the 64-bit RIP register on C stepping K8. | |
12408 | + A lot of BIOSes that didn't get tested properly miss this. | |
12409 | + The OS sees this as a page fault with the upper 32 bits of RIP cleared. | |
12410 | + Try to work around it here. | |
12411 | + Note we only handle faults in the kernel here. | |
12412 | + Does nothing on X86_32. | |
12413 | + */ | |
12414 | +static int is_errata93(struct pt_regs *regs, unsigned long address) | |
12415 | +{ | |
12416 | +#ifdef CONFIG_X86_64 | |
12417 | + static int warned; | |
12418 | + if (address != regs->ip) | |
12419 | + return 0; | |
12420 | + if ((address >> 32) != 0) | |
12421 | + return 0; | |
12422 | + address |= 0xffffffffUL << 32; | |
12423 | + if ((address >= (u64)_stext && address <= (u64)_etext) || | |
12424 | + (address >= MODULES_VADDR && address <= MODULES_END)) { | |
12425 | + if (!warned) { | |
12426 | + printk(errata93_warning); | |
12427 | + warned = 1; | |
12428 | + } | |
12429 | + regs->ip = address; | |
12430 | + return 1; | |
12431 | + } | |
12432 | +#endif | |
12433 | + return 0; | |
12434 | +} | |
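The fixup arithmetic above, worked through on a hypothetical kernel text address (an LP64 userspace is assumed so unsigned long is 64 bits):

#include <stdio.h>

int main(void)
{
	unsigned long rip = 0xffffffff8010be60UL;  /* hypothetical kernel address */
	unsigned long fault = rip & 0xffffffffUL;  /* upper 32 bits lost, per erratum #93 */

	fault |= 0xffffffffUL << 32;               /* the workaround's sign extension */
	printf("%d\n", fault == rip);              /* prints 1 */
	return 0;
}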
12435 | + | |
12436 | +/* | |
12437 | + * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal | |
12438 | + * addresses >4GB. We catch this in the page fault handler because these | |
12439 | + * addresses are not reachable. Just detect this case and return. Any code | |
12440 | + * segment in the LDT is compatibility mode. | |
12441 | + */ | |
12442 | +static int is_errata100(struct pt_regs *regs, unsigned long address) | |
12443 | +{ | |
12444 | +#ifdef CONFIG_X86_64 | |
12445 | + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && | |
12446 | + (address >> 32)) | |
12447 | + return 1; | |
12448 | +#endif | |
12449 | + return 0; | |
12450 | +} | |
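Why `regs->cs & (1<<2)` spots an LDT selector: bit 2 of an x86 segment selector is the Table Indicator (1 = LDT). A tiny decoding sketch, with a made-up selector value:

#include <stdio.h>

int main(void)
{
	unsigned short cs = 0x0007;		/* hypothetical LDT selector, RPL 3 */

	printf("TI=%d RPL=%d index=%d\n",
	       (cs >> 2) & 1, cs & 3, cs >> 3);	/* prints TI=1 RPL=3 index=0 */
	return 0;
}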
12451 | + | |
12452 | +void do_invalid_op(struct pt_regs *, unsigned long); | |
12453 | + | |
12454 | +static int is_f00f_bug(struct pt_regs *regs, unsigned long address) | |
12455 | +{ | |
12456 | +#ifdef CONFIG_X86_F00F_BUG | |
12457 | + unsigned long nr; | |
12458 | + /* | |
12459 | + * Pentium F0 0F C7 C8 bug workaround. | |
12460 | + */ | |
12461 | + if (boot_cpu_data.f00f_bug) { | |
12462 | + nr = (address - idt_descr.address) >> 3; | |
12463 | + | |
12464 | + if (nr == 6) { | |
12465 | + do_invalid_op(regs, 0); | |
12466 | + return 1; | |
12467 | + } | |
12468 | + } | |
12469 | +#endif | |
12470 | + return 0; | |
12471 | +} | |
12472 | + | |
12473 | +static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, | |
12474 | + unsigned long address) | |
12475 | +{ | |
12476 | +#ifdef CONFIG_X86_32 | |
12477 | + if (!oops_may_print()) | |
12478 | + return; | |
12479 | +#endif | |
12480 | + | |
12481 | +#ifdef CONFIG_X86_PAE | |
12482 | + if (error_code & PF_INSTR) { | |
12483 | + unsigned int level; | |
12484 | + pte_t *pte = lookup_address(address, &level); | |
12485 | + | |
12486 | + if (pte && pte_present(*pte) && !pte_exec(*pte)) | |
12487 | + printk(KERN_CRIT "kernel tried to execute " | |
12488 | + "NX-protected page - exploit attempt? " | |
12489 | + "(uid: %d)\n", current->uid); | |
12490 | + } | |
12491 | +#endif | |
12492 | + | |
12493 | + printk(KERN_ALERT "BUG: unable to handle kernel "); | |
12494 | + if (address < PAGE_SIZE) | |
12495 | + printk(KERN_CONT "NULL pointer dereference"); | |
12496 | + else | |
12497 | + printk(KERN_CONT "paging request"); | |
12498 | +#ifdef CONFIG_X86_32 | |
12499 | + printk(KERN_CONT " at %08lx\n", address); | |
12500 | +#else | |
12501 | + printk(KERN_CONT " at %016lx\n", address); | |
12502 | +#endif | |
12503 | + printk(KERN_ALERT "IP:"); | |
12504 | + printk_address(regs->ip, 1); | |
12505 | + dump_pagetable(address); | |
12506 | +} | |
12507 | + | |
12508 | +#ifdef CONFIG_X86_64 | |
12509 | +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, | |
12510 | + unsigned long error_code) | |
12511 | +{ | |
12512 | + unsigned long flags = oops_begin(); | |
12513 | + struct task_struct *tsk; | |
12514 | + | |
12515 | + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", | |
12516 | + current->comm, address); | |
12517 | + dump_pagetable(address); | |
12518 | + tsk = current; | |
12519 | + tsk->thread.cr2 = address; | |
12520 | + tsk->thread.trap_no = 14; | |
12521 | + tsk->thread.error_code = error_code; | |
12522 | + if (__die("Bad pagetable", regs, error_code)) | |
12523 | + regs = NULL; | |
12524 | + oops_end(flags, regs, SIGKILL); | |
12525 | +} | |
12526 | +#endif | |
12527 | + | |
12528 | +static int spurious_fault_check(unsigned long error_code, pte_t *pte) | |
12529 | +{ | |
12530 | + if ((error_code & PF_WRITE) && !pte_write(*pte)) | |
12531 | + return 0; | |
12532 | + if ((error_code & PF_INSTR) && !pte_exec(*pte)) | |
12533 | + return 0; | |
12534 | + | |
12535 | + return 1; | |
12536 | +} | |
12537 | + | |
12538 | +/* | |
12539 | + * Handle a spurious fault caused by a stale TLB entry. This allows | |
12540 | + * us to lazily refresh the TLB when increasing the permissions of a | |
12541 | + * kernel page (RO -> RW or NX -> X). Doing it eagerly is very | |
12542 | + * expensive since that implies doing a full cross-processor TLB | |
12543 | + * flush, even if no stale TLB entries exist on other processors. | |
12544 | + * There are no security implications to leaving a stale TLB when | |
12545 | + * increasing the permissions on a page. | |
12546 | + */ | |
12547 | +static int spurious_fault(unsigned long address, | |
12548 | + unsigned long error_code) | |
12549 | +{ | |
12550 | + pgd_t *pgd; | |
12551 | + pud_t *pud; | |
12552 | + pmd_t *pmd; | |
12553 | + pte_t *pte; | |
12554 | + | |
12555 | + /* Reserved-bit violation or user access to kernel space? */ | |
12556 | + if (error_code & (PF_USER | PF_RSVD)) | |
12557 | + return 0; | |
12558 | + | |
12559 | + pgd = init_mm.pgd + pgd_index(address); | |
12560 | + if (!pgd_present(*pgd)) | |
12561 | + return 0; | |
12562 | + | |
12563 | + pud = pud_offset(pgd, address); | |
12564 | + if (!pud_present(*pud)) | |
12565 | + return 0; | |
12566 | + | |
12567 | + if (pud_large(*pud)) | |
12568 | + return spurious_fault_check(error_code, (pte_t *) pud); | |
12569 | + | |
12570 | + pmd = pmd_offset(pud, address); | |
12571 | + if (!pmd_present(*pmd)) | |
12572 | + return 0; | |
12573 | + | |
12574 | + if (pmd_large(*pmd)) | |
12575 | + return spurious_fault_check(error_code, (pte_t *) pmd); | |
12576 | + | |
12577 | + pte = pte_offset_kernel(pmd, address); | |
12578 | + if (!pte_present(*pte)) | |
12579 | + return 0; | |
12580 | + | |
12581 | + return spurious_fault_check(error_code, pte); | |
12582 | +} | |
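A sketch of the interleaving this check tolerates (not from the patch; the CPU numbering is illustrative):

/*
 *   CPU0: upgrades a kernel PTE RO -> RW, skips the cross-CPU TLB flush
 *   CPU1: still holds the stale RO entry; a write faults with PF_PROT|PF_WRITE
 *   CPU1: spurious_fault() walks the tables and sees pte_write() is now true
 *   CPU1: returns 1; the handler bails out and the retried access succeeds
 */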
12583 | + | |
12584 | +/* | |
12585 | + * X86_32 | |
12586 | + * Handle a fault on the vmalloc or module mapping area | |
12587 | + * | |
12588 | + * X86_64 | |
12589 | + * Handle a fault on the vmalloc area | |
12590 | + * | |
12591 | + * This assumes no large pages in there. | |
12592 | + */ | |
12593 | +static int vmalloc_fault(unsigned long address) | |
12594 | +{ | |
12595 | +#ifdef CONFIG_X86_32 | |
12596 | + unsigned long pgd_paddr; | |
12597 | + pmd_t *pmd_k; | |
12598 | + pte_t *pte_k; | |
12599 | + /* | |
12600 | + * Synchronize this task's top level page-table | |
12601 | + * with the 'reference' page table. | |
12602 | + * | |
12603 | + * Do _not_ use "current" here. We might be inside | |
12604 | + * an interrupt in the middle of a task switch.. | |
12605 | + */ | |
12606 | + pgd_paddr = read_cr3(); | |
12607 | + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); | |
12608 | + if (!pmd_k) | |
12609 | + return -1; | |
12610 | + pte_k = pte_offset_kernel(pmd_k, address); | |
12611 | + if (!pte_present(*pte_k)) | |
12612 | + return -1; | |
12613 | + return 0; | |
12614 | +#else | |
12615 | + pgd_t *pgd, *pgd_ref; | |
12616 | + pud_t *pud, *pud_ref; | |
12617 | + pmd_t *pmd, *pmd_ref; | |
12618 | + pte_t *pte, *pte_ref; | |
12619 | + | |
12620 | + /* Make sure we are in vmalloc area */ | |
12621 | + if (!(address >= VMALLOC_START && address < VMALLOC_END)) | |
12622 | + return -1; | |
12623 | + | |
12624 | + /* Copy kernel mappings over when needed. This can also | |
12625 | + happen due to a race in a page table update. In the latter | |
12626 | + case just flush. */ | |
12627 | + | |
12628 | + /* On Xen the line below does not always work. Needs investigating! */ | |
12629 | + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/ | |
12630 | + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); | |
12631 | + pgd += pgd_index(address); | |
12632 | + pgd_ref = pgd_offset_k(address); | |
12633 | + if (pgd_none(*pgd_ref)) | |
12634 | + return -1; | |
12635 | + if (pgd_none(*pgd)) | |
12636 | + set_pgd(pgd, *pgd_ref); | |
12637 | + else | |
12638 | + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | |
12639 | + | |
12640 | + /* Below here mismatches are bugs because these lower tables | |
12641 | + are shared */ | |
12642 | + | |
12643 | + pud = pud_offset(pgd, address); | |
12644 | + pud_ref = pud_offset(pgd_ref, address); | |
12645 | + if (pud_none(*pud_ref)) | |
12646 | + return -1; | |
12647 | + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) | |
12648 | + BUG(); | |
12649 | + pmd = pmd_offset(pud, address); | |
12650 | + pmd_ref = pmd_offset(pud_ref, address); | |
12651 | + if (pmd_none(*pmd_ref)) | |
12652 | + return -1; | |
12653 | + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) | |
12654 | + BUG(); | |
12655 | + pte_ref = pte_offset_kernel(pmd_ref, address); | |
12656 | + if (!pte_present(*pte_ref)) | |
12657 | + return -1; | |
12658 | + pte = pte_offset_kernel(pmd, address); | |
12659 | + /* Don't use pte_page here, because the mappings can point | |
12660 | + outside mem_map, and the NUMA hash lookup cannot handle | |
12661 | + that. */ | |
12662 | + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) | |
12663 | + BUG(); | |
12664 | + return 0; | |
12665 | +#endif | |
12666 | +} | |
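For context, a sketch of the sequence that makes the 64-bit branch above necessary (step numbering is illustrative):

/*
 *   1. vmalloc() installs a new top-level entry in init_mm.pgd only.
 *   2. A task whose private pgd predates that entry touches the new area.
 *   3. The hardware walk misses and faults with a vmalloc-range address.
 *   4. vmalloc_fault() copies the entry from the reference pgd; the
 *      retried access then succeeds without any global synchronization.
 */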
12667 | + | |
12668 | +int show_unhandled_signals = 1; | |
12669 | + | |
12670 | +/* | |
12671 | + * This routine handles page faults. It determines the address, | |
12672 | + * and the problem, and then passes it off to one of the appropriate | |
12673 | + * routines. | |
12674 | + */ | |
12675 | +#ifdef CONFIG_X86_64 | |
12676 | +asmlinkage | |
12677 | +#endif | |
12678 | +void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |
12679 | +{ | |
12680 | + struct task_struct *tsk; | |
12681 | + struct mm_struct *mm; | |
12682 | + struct vm_area_struct *vma; | |
12683 | + unsigned long address; | |
12684 | + int write, si_code; | |
12685 | + int fault; | |
12686 | +#ifdef CONFIG_X86_64 | |
12687 | + unsigned long flags; | |
12688 | +#endif | |
12689 | + | |
12690 | + /* | |
12691 | + * We can fault from pretty much anywhere, with unknown IRQ state. | |
12692 | + */ | |
12693 | + trace_hardirqs_fixup(); | |
12694 | + | |
12695 | + /* Set the "privileged fault" bit to something sane. */ | |
12696 | + if (user_mode_vm(regs)) | |
12697 | + error_code |= PF_USER; | |
12698 | + else | |
12699 | + error_code &= ~PF_USER; | |
12700 | + | |
12701 | + tsk = current; | |
12702 | + mm = tsk->mm; | |
12703 | + prefetchw(&mm->mmap_sem); | |
12704 | + | |
12705 | + /* get the address */ | |
12706 | + address = read_cr2(); | |
12707 | + | |
12708 | + si_code = SEGV_MAPERR; | |
12709 | + | |
12710 | + if (notify_page_fault(regs)) | |
12711 | + return; | |
12712 | + | |
12713 | + /* | |
12714 | + * We fault-in kernel-space virtual memory on-demand. The | |
12715 | + * 'reference' page table is init_mm.pgd. | |
12716 | + * | |
12717 | + * NOTE! We MUST NOT take any locks for this case. We may | |
12718 | + * be in an interrupt or a critical region, and should | |
12719 | + * only copy the information from the master page table, | |
12720 | + * nothing more. | |
12721 | + * | |
12722 | + * This verifies that the fault happens in kernel space | |
12723 | + * ((error_code & PF_USER) == 0), and that the fault was not a | |
12724 | + * protection error ((error_code & (PF_PROT|PF_RSVD)) == 0). | |
12725 | + */ | |
12726 | +#ifdef CONFIG_X86_32 | |
12727 | + if (unlikely(address >= TASK_SIZE)) { | |
12728 | +#else | |
12729 | + if (unlikely(address >= TASK_SIZE64)) { | |
12730 | +#endif | |
12731 | + /* Faults in the hypervisor area can never be patched up. | |
12732 | +#if defined(CONFIG_X86_XEN) | |
12733 | + if (address >= hypervisor_virt_start) | |
12734 | + goto bad_area_nosemaphore; | |
12735 | +#elif defined(CONFIG_X86_64_XEN) | |
12736 | + /* Faults in the hypervisor area are never spurious. | |
12737 | + if (address >= HYPERVISOR_VIRT_START | |
12738 | + && address < HYPERVISOR_VIRT_END) | |
12739 | + goto bad_area_nosemaphore; | |
12740 | +#endif | |
12741 | + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && | |
12742 | + vmalloc_fault(address) >= 0) | |
12743 | + return; | |
12744 | + | |
12745 | + /* Can handle a stale RO->RW TLB */ | |
12746 | + if (spurious_fault(address, error_code)) | |
12747 | + return; | |
12748 | + | |
12749 | + /* | |
12750 | + * Don't take the mm semaphore here. If we fixup a prefetch | |
12751 | + * fault we could otherwise deadlock. | |
12752 | + */ | |
12753 | + goto bad_area_nosemaphore; | |
12754 | + } | |
12755 | + | |
12756 | + | |
12757 | +#ifdef CONFIG_X86_32 | |
12758 | + /* It's safe to allow irqs after cr2 has been saved and the vmalloc | |
12759 | + fault has been handled. */ | |
12760 | + if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) | |
12761 | + local_irq_enable(); | |
12762 | + | |
12763 | + /* | |
12764 | + * If we're in an interrupt, have no user context or are running in an | |
12765 | + * atomic region then we must not take the fault. | |
12766 | + */ | |
12767 | + if (in_atomic() || !mm) | |
12768 | + goto bad_area_nosemaphore; | |
12769 | +#else /* CONFIG_X86_64 */ | |
12770 | + if (likely(regs->flags & X86_EFLAGS_IF)) | |
12771 | + local_irq_enable(); | |
12772 | + | |
12773 | + if (unlikely(error_code & PF_RSVD)) | |
12774 | + pgtable_bad(address, regs, error_code); | |
12775 | + | |
12776 | + /* | |
12777 | + * If we're in an interrupt, have no user context or are running in an | |
12778 | + * atomic region then we must not take the fault. | |
12779 | + */ | |
12780 | + if (unlikely(in_atomic() || !mm)) | |
12781 | + goto bad_area_nosemaphore; | |
12782 | + | |
12783 | + /* | |
12784 | + * User-mode registers count as a user access even for any | |
12785 | + * potential system fault or CPU buglet. | |
12786 | + */ | |
12787 | + if (user_mode_vm(regs)) | |
12788 | + error_code |= PF_USER; | |
12789 | +again: | |
12790 | +#endif | |
12791 | + /* When running in the kernel we expect faults to occur only to | |
12792 | + * addresses in user space. All other faults represent errors in the | |
12793 | + * kernel and should generate an OOPS. Unfortunately, in the case of an | |
12794 | + * erroneous fault occurring in a code path which already holds mmap_sem | |
12795 | + * we will deadlock attempting to validate the fault against the | |
12796 | + * address space. Luckily the kernel only validly references user | |
12797 | + * space from well defined areas of code, which are listed in the | |
12798 | + * exceptions table. | |
12799 | + * | |
12800 | + * As the vast majority of faults will be valid we will only perform | |
12801 | + * the source reference check when there is a possibility of a deadlock. | |
12802 | + * Attempt to lock the address space, if we cannot we then validate the | |
12803 | + * Attempt to lock the address space; if we cannot, we then validate the | |
12804 | + * thus avoiding the deadlock. | |
12805 | + */ | |
12806 | + if (!down_read_trylock(&mm->mmap_sem)) { | |
12807 | + if ((error_code & PF_USER) == 0 && | |
12808 | + !search_exception_tables(regs->ip)) | |
12809 | + goto bad_area_nosemaphore; | |
12810 | + down_read(&mm->mmap_sem); | |
12811 | + } | |
12812 | + | |
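The trylock-then-validate pattern above, reduced to a standalone userspace sketch (pthreads stand in for mmap_sem, and the caller_is_known_safe flag stands in for the exception-table lookup):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static int handle(int caller_is_known_safe)
{
	if (pthread_rwlock_tryrdlock(&lock) != 0) {
		if (!caller_is_known_safe)
			return -1;		/* could be a self-deadlock: bail */
		pthread_rwlock_rdlock(&lock);	/* provably safe: wait for it */
	}
	/* ... do the work that needs the lock ... */
	pthread_rwlock_unlock(&lock);
	return 0;
}

int main(void)
{
	printf("%d\n", handle(1));	/* uncontended: prints 0 */
	return 0;
}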
12813 | + vma = find_vma(mm, address); | |
12814 | + if (!vma) | |
12815 | + goto bad_area; | |
12816 | + if (vma->vm_start <= address) | |
12817 | + goto good_area; | |
12818 | + if (!(vma->vm_flags & VM_GROWSDOWN)) | |
12819 | + goto bad_area; | |
12820 | + if (error_code & PF_USER) { | |
12821 | + /* | |
12822 | + * Accessing the stack below %sp is always a bug. | |
12823 | + * The large cushion allows instructions like enter | |
12824 | + * and pusha to work. ("enter $65535,$31" pushes | |
12825 | + * 32 pointers and then decrements %sp by 65535.) | |
12826 | + */ | |
12827 | + if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) | |
12828 | + goto bad_area; | |
12829 | + } | |
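The cushion above, worked out (LP64 sizes assumed): "enter $65535,$31" pushes 32 pointers and then subtracts 65535, so a reach of up to 65536 + 32*8 = 65792 bytes below %sp can still be legitimate.

#include <stdio.h>

int main(void)
{
	/* largest legitimate reach below %sp, per the check above */
	printf("%lu\n", (unsigned long)(65536 + 32 * sizeof(unsigned long)));
	return 0;	/* prints 65792 on LP64 */
}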
12830 | + if (expand_stack(vma, address)) | |
12831 | + goto bad_area; | |
12832 | +/* | |
12833 | + * Ok, we have a good vm_area for this memory access, so | |
12834 | + * we can handle it.. | |
12835 | + */ | |
12836 | +good_area: | |
12837 | + si_code = SEGV_ACCERR; | |
12838 | + write = 0; | |
12839 | + switch (error_code & (PF_PROT|PF_WRITE)) { | |
12840 | + default: /* 3: write, present */ | |
12841 | + /* fall through */ | |
12842 | + case PF_WRITE: /* write, not present */ | |
12843 | + if (!(vma->vm_flags & VM_WRITE)) | |
12844 | + goto bad_area; | |
12845 | + write++; | |
12846 | + break; | |
12847 | + case PF_PROT: /* read, present */ | |
12848 | + goto bad_area; | |
12849 | + case 0: /* read, not present */ | |
12850 | + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) | |
12851 | + goto bad_area; | |
12852 | + } | |
12853 | + | |
12854 | +#ifdef CONFIG_X86_32 | |
12855 | +survive: | |
12856 | +#endif | |
12857 | + /* | |
12858 | + * If for any reason at all we couldn't handle the fault, | |
12859 | + * make sure we exit gracefully rather than endlessly redo | |
12860 | + * the fault. | |
12861 | + */ | |
12862 | + fault = handle_mm_fault(mm, vma, address, write); | |
12863 | + if (unlikely(fault & VM_FAULT_ERROR)) { | |
12864 | + if (fault & VM_FAULT_OOM) | |
12865 | + goto out_of_memory; | |
12866 | + else if (fault & VM_FAULT_SIGBUS) | |
12867 | + goto do_sigbus; | |
12868 | + BUG(); | |
12869 | + } | |
12870 | + if (fault & VM_FAULT_MAJOR) | |
12871 | + tsk->maj_flt++; | |
12872 | + else | |
12873 | + tsk->min_flt++; | |
12874 | + | |
12875 | +#ifdef CONFIG_X86_32 | |
12876 | + /* | |
12877 | + * Did it hit the DOS screen memory VA from vm86 mode? | |
12878 | + */ | |
12879 | + if (v8086_mode(regs)) { | |
12880 | + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; | |
12881 | + if (bit < 32) | |
12882 | + tsk->thread.screen_bitmap |= 1 << bit; | |
12883 | + } | |
12884 | +#endif | |
12885 | + up_read(&mm->mmap_sem); | |
12886 | + return; | |
12887 | + | |
12888 | +/* | |
12889 | + * Something tried to access memory that isn't in our memory map.. | |
12890 | + * Fix it, but check if it's kernel or user first.. | |
12891 | + */ | |
12892 | +bad_area: | |
12893 | + up_read(&mm->mmap_sem); | |
12894 | + | |
12895 | +bad_area_nosemaphore: | |
12896 | + /* User mode accesses just cause a SIGSEGV */ | |
12897 | + if (error_code & PF_USER) { | |
12898 | + /* | |
12899 | + * It's possible to have interrupts off here. | |
12900 | + */ | |
12901 | + local_irq_enable(); | |
12902 | + | |
12903 | + /* | |
12904 | + * Valid to do another page fault here because this one came | |
12905 | + * from user space. | |
12906 | + */ | |
12907 | + if (is_prefetch(regs, address, error_code)) | |
12908 | + return; | |
12909 | + | |
12910 | + if (is_errata100(regs, address)) | |
12911 | + return; | |
12912 | + | |
12913 | + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | |
12914 | + printk_ratelimit()) { | |
12915 | + printk( | |
12916 | +#ifdef CONFIG_X86_32 | |
12917 | + "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx", | |
12918 | +#else | |
12919 | + "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx", | |
12920 | +#endif | |
12921 | + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, | |
12922 | + tsk->comm, task_pid_nr(tsk), address, regs->ip, | |
12923 | + regs->sp, error_code); | |
12924 | + print_vma_addr(" in ", regs->ip); | |
12925 | + printk("\n"); | |
12926 | + } | |
12927 | + | |
12928 | + tsk->thread.cr2 = address; | |
12929 | + /* Kernel addresses are always protection faults */ | |
12930 | + tsk->thread.error_code = error_code | (address >= TASK_SIZE); | |
12931 | + tsk->thread.trap_no = 14; | |
12932 | + force_sig_info_fault(SIGSEGV, si_code, address, tsk); | |
12933 | + return; | |
12934 | + } | |
12935 | + | |
12936 | + if (is_f00f_bug(regs, address)) | |
12937 | + return; | |
12938 | + | |
12939 | +no_context: | |
12940 | + /* Are we prepared to handle this kernel fault? */ | |
12941 | + if (fixup_exception(regs)) | |
12942 | + return; | |
12943 | + | |
12944 | + /* | |
12945 | + * X86_32 | |
12946 | + * Valid to do another page fault here, because if this fault | |
12947 | + * had been triggered by is_prefetch fixup_exception would have | |
12948 | + * handled it. | |
12949 | + * | |
12950 | + * X86_64 | |
12951 | + * Hall of shame of CPU/BIOS bugs. | |
12952 | + */ | |
12953 | + if (is_prefetch(regs, address, error_code)) | |
12954 | + return; | |
12955 | + | |
12956 | + if (is_errata93(regs, address)) | |
12957 | + return; | |
12958 | + | |
12959 | +/* | |
12960 | + * Oops. The kernel tried to access some bad page. We'll have to | |
12961 | + * terminate things with extreme prejudice. | |
12962 | + */ | |
12963 | +#ifdef CONFIG_X86_32 | |
12964 | + bust_spinlocks(1); | |
12965 | +#else | |
12966 | + flags = oops_begin(); | |
12967 | +#endif | |
12968 | + | |
12969 | + show_fault_oops(regs, error_code, address); | |
12970 | + | |
12971 | + tsk->thread.cr2 = address; | |
12972 | + tsk->thread.trap_no = 14; | |
12973 | + tsk->thread.error_code = error_code; | |
12974 | + | |
12975 | +#ifdef CONFIG_X86_32 | |
12976 | + die("Oops", regs, error_code); | |
12977 | + bust_spinlocks(0); | |
12978 | + do_exit(SIGKILL); | |
12979 | +#else | |
12980 | + if (__die("Oops", regs, error_code)) | |
12981 | + regs = NULL; | |
12982 | + /* Executive summary in case the body of the oops scrolled away */ | |
12983 | + printk(KERN_EMERG "CR2: %016lx\n", address); | |
12984 | + oops_end(flags, regs, SIGKILL); | |
12985 | +#endif | |
12986 | + | |
12987 | +/* | |
12988 | + * We ran out of memory, or some other thing happened to us that made | |
12989 | + * us unable to handle the page fault gracefully. | |
12990 | + */ | |
12991 | +out_of_memory: | |
12992 | + up_read(&mm->mmap_sem); | |
12993 | + if (is_global_init(tsk)) { | |
12994 | + yield(); | |
12995 | +#ifdef CONFIG_X86_32 | |
12996 | + down_read(&mm->mmap_sem); | |
12997 | + goto survive; | |
12998 | +#else | |
12999 | + goto again; | |
13000 | +#endif | |
13001 | + } | |
13002 | + | |
13003 | + printk("VM: killing process %s\n", tsk->comm); | |
13004 | + if (error_code & PF_USER) | |
13005 | + do_group_exit(SIGKILL); | |
13006 | + goto no_context; | |
13007 | + | |
13008 | +do_sigbus: | |
13009 | + up_read(&mm->mmap_sem); | |
13010 | + | |
13011 | + /* Kernel mode? Handle exceptions or die */ | |
13012 | + if (!(error_code & PF_USER)) | |
13013 | + goto no_context; | |
13014 | +#ifdef CONFIG_X86_32 | |
13015 | + /* User space => ok to do another page fault */ | |
13016 | + if (is_prefetch(regs, address, error_code)) | |
13017 | + return; | |
13018 | +#endif | |
13019 | + tsk->thread.cr2 = address; | |
13020 | + tsk->thread.error_code = error_code; | |
13021 | + tsk->thread.trap_no = 14; | |
13022 | + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); | |
13023 | +} | |
13024 | + | |
13025 | +DEFINE_SPINLOCK(pgd_lock); | |
13026 | +LIST_HEAD(pgd_list); | |
13027 | + | |
13028 | +void vmalloc_sync_all(void) | |
13029 | +{ | |
13030 | +#ifdef CONFIG_X86_32 | |
13031 | + /* | |
13032 | + * Note that races in the updates of insync and start aren't | |
13033 | + * problematic: insync can only get set bits added, and updates to | |
13034 | + * start are only improving performance (without affecting correctness | |
13035 | + * if undone). | |
13036 | + * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs. | |
13037 | + * This change works just fine with 2-level paging too. | |
13038 | + */ | |
13039 | +#define sync_index(a) ((a) >> PMD_SHIFT) | |
13040 | + static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD); | |
13041 | + static unsigned long start = TASK_SIZE; | |
13042 | + unsigned long address; | |
13043 | + | |
13044 | + if (SHARED_KERNEL_PMD) | |
13045 | + return; | |
13046 | + | |
13047 | + BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK); | |
13048 | + for (address = start; | |
13049 | + address < hypervisor_virt_start; | |
13050 | + address += PMD_SIZE) { | |
13051 | + if (!test_bit(sync_index(address), insync)) { | |
13052 | + unsigned long flags; | |
13053 | + struct page *page; | |
13054 | + | |
13055 | + spin_lock_irqsave(&pgd_lock, flags); | |
13056 | + /* XEN: failure path assumes non-empty pgd_list. */ | |
13057 | + if (unlikely(list_empty(&pgd_list))) { | |
13058 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
13059 | + return; | |
13060 | + } | |
13061 | + list_for_each_entry(page, &pgd_list, lru) { | |
13062 | + if (!vmalloc_sync_one(page_address(page), | |
13063 | + address)) | |
13064 | + break; | |
13065 | + } | |
13066 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
13067 | + if (!page) | |
13068 | + set_bit(sync_index(address), insync); | |
13069 | + } | |
13070 | + if (address == start && test_bit(sync_index(address), insync)) | |
13071 | + start = address + PMD_SIZE; | |
13072 | + } | |
13073 | +#else /* CONFIG_X86_64 */ | |
13074 | + /* | |
13075 | + * Note that races in the updates of insync and start aren't | |
13076 | + * problematic: insync can only get set bits added, and updates to | |
13077 | + * start are only improving performance (without affecting correctness | |
13078 | + * if undone). | |
13079 | + */ | |
13080 | + static DECLARE_BITMAP(insync, PTRS_PER_PGD); | |
13081 | + static unsigned long start = VMALLOC_START & PGDIR_MASK; | |
13082 | + unsigned long address; | |
13083 | + | |
13084 | + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { | |
13085 | + if (!test_bit(pgd_index(address), insync)) { | |
13086 | + const pgd_t *pgd_ref = pgd_offset_k(address); | |
13087 | + unsigned long flags; | |
13088 | + struct page *page; | |
13089 | + | |
13090 | + if (pgd_none(*pgd_ref)) | |
13091 | + continue; | |
13092 | + spin_lock_irqsave(&pgd_lock, flags); | |
13093 | + list_for_each_entry(page, &pgd_list, lru) { | |
13094 | + pgd_t *pgd; | |
13095 | + pgd = (pgd_t *)page_address(page) + pgd_index(address); | |
13096 | + if (pgd_none(*pgd)) | |
13097 | + set_pgd(pgd, *pgd_ref); | |
13098 | + else | |
13099 | + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | |
13100 | + } | |
13101 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
13102 | + set_bit(pgd_index(address), insync); | |
13103 | + } | |
13104 | + if (address == start) | |
13105 | + start = address + PGDIR_SIZE; | |
13106 | + } | |
13107 | + /* Check that there is no need to do the same for the modules area. */ | |
13108 | + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); | |
13109 | + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == | |
13110 | + (__START_KERNEL & PGDIR_MASK))); | |
13111 | +#endif | |
13112 | +} | |
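A toy model of the insync/start bookkeeping used by both halves above (SLOTS stands in for PTRS_PER_PGD; the actual propagate step is elided):

#include <stdio.h>

#define SLOTS 8	/* stands in for PTRS_PER_PGD */

static unsigned char insync[SLOTS];
static int start;

static void sync_all(void)
{
	int i;

	for (i = start; i < SLOTS; i++) {
		if (!insync[i]) {
			/* ... copy the reference entry into every pgd ... */
			insync[i] = 1;
		}
		if (i == start)	/* mirrors "if (address == start)" above */
			start = i + 1;
	}
}

int main(void)
{
	sync_all();
	printf("start=%d\n", start);	/* prints start=8: later calls scan nothing */
	return 0;
}

Because insync bits are only ever set and start only ever moves forward, a racing caller can at worst repeat work, never skip it; that is why the comment calls the races harmless.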
13113 | --- a/arch/x86/mm/highmem_32-xen.c | |
13114 | +++ b/arch/x86/mm/highmem_32-xen.c | |
13115 | @@ -18,6 +18,49 @@ void kunmap(struct page *page) | |
13116 | kunmap_high(page); | |
13117 | } | |
13118 | ||
13119 | +static void debug_kmap_atomic_prot(enum km_type type) | |
13120 | +{ | |
13121 | +#ifdef CONFIG_DEBUG_HIGHMEM | |
13122 | + static unsigned warn_count = 10; | |
13123 | + | |
13124 | + if (unlikely(warn_count == 0)) | |
13125 | + return; | |
13126 | + | |
13127 | + if (unlikely(in_interrupt())) { | |
13128 | + if (in_irq()) { | |
13129 | + if (type != KM_IRQ0 && type != KM_IRQ1 && | |
13130 | + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && | |
13131 | + type != KM_BOUNCE_READ) { | |
13132 | + WARN_ON(1); | |
13133 | + warn_count--; | |
13134 | + } | |
13135 | + } else if (!irqs_disabled()) { /* softirq */ | |
13136 | + if (type != KM_IRQ0 && type != KM_IRQ1 && | |
13137 | + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && | |
13138 | + type != KM_SKB_SUNRPC_DATA && | |
13139 | + type != KM_SKB_DATA_SOFTIRQ && | |
13140 | + type != KM_BOUNCE_READ) { | |
13141 | + WARN_ON(1); | |
13142 | + warn_count--; | |
13143 | + } | |
13144 | + } | |
13145 | + } | |
13146 | + | |
13147 | + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || | |
13148 | + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { | |
13149 | + if (!irqs_disabled()) { | |
13150 | + WARN_ON(1); | |
13151 | + warn_count--; | |
13152 | + } | |
13153 | + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { | |
13154 | + if (irq_count() == 0 && !irqs_disabled()) { | |
13155 | + WARN_ON(1); | |
13156 | + warn_count--; | |
13157 | + } | |
13158 | + } | |
13159 | +#endif | |
13160 | +} | |
13161 | + | |
13162 | /* | |
13163 | * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because | |
13164 | * no global lock is needed and because the kmap code must perform a global TLB | |
13165 | @@ -37,6 +80,8 @@ void *kmap_atomic_prot(struct page *page | |
13166 | if (!PageHighMem(page)) | |
13167 | return page_address(page); | |
13168 | ||
13169 | + debug_kmap_atomic_prot(type); | |
13170 | + | |
13171 | idx = type + KM_TYPE_NR*smp_processor_id(); | |
13172 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | |
13173 | BUG_ON(!pte_none(*(kmap_pte-idx))); | |
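The per-CPU slot arithmetic behind the kmap_atomic path above, as a standalone sketch (the KM_TYPE_NR value here is illustrative, not the real arch constant):

#include <stdio.h>

#define KM_TYPE_NR 4	/* illustrative; the real value is arch-specific */

int main(void)
{
	int type = 2, cpu = 3;
	int idx = type + KM_TYPE_NR * cpu;	/* as in kmap_atomic_prot() */

	printf("idx=%d\n", idx);	/* prints idx=14: unique per (type, cpu) */
	return 0;
}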
13174 | --- a/arch/x86/mm/hypervisor.c | |
13175 | +++ b/arch/x86/mm/hypervisor.c | |
13176 | @@ -831,15 +831,11 @@ int xen_limit_pages_to_max_mfn( | |
13177 | } | |
13178 | EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn); | |
13179 | ||
13180 | -#ifdef __i386__ | |
13181 | -int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b) | |
13182 | +int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc) | |
13183 | { | |
13184 | - __u32 *lp = (__u32 *)((char *)ldt + entry * 8); | |
13185 | - maddr_t mach_lp = arbitrary_virt_to_machine(lp); | |
13186 | - return HYPERVISOR_update_descriptor( | |
13187 | - mach_lp, (u64)entry_a | ((u64)entry_b<<32)); | |
13188 | + maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry); | |
13189 | + return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc); | |
13190 | } | |
13191 | -#endif | |
13192 | ||
13193 | #define MAX_BATCHED_FULL_PTES 32 | |
13194 | ||
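The interface change above replaces the explicit two-word packing with a direct 64-bit copy of the descriptor. The old packing, sketched standalone (the descriptor halves are made-up values):

#include <stdio.h>

int main(void)
{
	unsigned int entry_a = 0x0000ffff, entry_b = 0x00cf9a00;	/* hypothetical halves */
	unsigned long long desc = (unsigned long long)entry_a |
				  ((unsigned long long)entry_b << 32);

	printf("%016llx\n", desc);	/* prints 00cf9a000000ffff */
	return 0;
}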
13195 | --- a/arch/x86/mm/init_32-xen.c | |
13196 | +++ b/arch/x86/mm/init_32-xen.c | |
13197 | @@ -27,13 +27,13 @@ | |
13198 | #include <linux/bootmem.h> | |
13199 | #include <linux/slab.h> | |
13200 | #include <linux/proc_fs.h> | |
13201 | -#include <linux/efi.h> | |
13202 | #include <linux/memory_hotplug.h> | |
13203 | #include <linux/initrd.h> | |
13204 | #include <linux/cpumask.h> | |
13205 | #include <linux/dma-mapping.h> | |
13206 | #include <linux/scatterlist.h> | |
13207 | ||
13208 | +#include <asm/asm.h> | |
13209 | #include <asm/processor.h> | |
13210 | #include <asm/system.h> | |
13211 | #include <asm/uaccess.h> | |
13212 | @@ -42,18 +42,22 @@ | |
13213 | #include <asm/fixmap.h> | |
13214 | #include <asm/e820.h> | |
13215 | #include <asm/apic.h> | |
13216 | +#include <asm/bugs.h> | |
13217 | #include <asm/tlb.h> | |
13218 | #include <asm/tlbflush.h> | |
13219 | +#include <asm/pgalloc.h> | |
13220 | #include <asm/sections.h> | |
13221 | #include <asm/hypervisor.h> | |
13222 | #include <asm/swiotlb.h> | |
13223 | +#include <asm/setup.h> | |
13224 | +#include <asm/cacheflush.h> | |
13225 | ||
13226 | unsigned int __VMALLOC_RESERVE = 128 << 20; | |
13227 | ||
13228 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | |
13229 | unsigned long highstart_pfn, highend_pfn; | |
13230 | ||
13231 | -static int noinline do_test_wp_bit(void); | |
13232 | +static noinline int do_test_wp_bit(void); | |
13233 | ||
13234 | /* | |
13235 | * Creates a middle page table and puts a pointer to it in the | |
13236 | @@ -64,17 +68,16 @@ static pmd_t * __init one_md_table_init( | |
13237 | { | |
13238 | pud_t *pud; | |
13239 | pmd_t *pmd_table; | |
13240 | - | |
13241 | + | |
13242 | #ifdef CONFIG_X86_PAE | |
13243 | if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) { | |
13244 | pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); | |
13245 | ||
13246 | - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); | |
13247 | + paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); | |
13248 | make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); | |
13249 | set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | |
13250 | pud = pud_offset(pgd, 0); | |
13251 | - if (pmd_table != pmd_offset(pud, 0)) | |
13252 | - BUG(); | |
13253 | + BUG_ON(pmd_table != pmd_offset(pud, 0)); | |
13254 | } | |
13255 | #endif | |
13256 | pud = pud_offset(pgd, 0); | |
13257 | @@ -85,7 +88,7 @@ static pmd_t * __init one_md_table_init( | |
13258 | ||
13259 | /* | |
13260 | * Create a page table and place a pointer to it in a middle page | |
13261 | - * directory entry. | |
13262 | + * directory entry: | |
13263 | */ | |
13264 | static pte_t * __init one_page_table_init(pmd_t *pmd) | |
13265 | { | |
13266 | @@ -99,9 +102,10 @@ static pte_t * __init one_page_table_ini | |
13267 | #ifdef CONFIG_DEBUG_PAGEALLOC | |
13268 | page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); | |
13269 | #endif | |
13270 | - if (!page_table) | |
13271 | + if (!page_table) { | |
13272 | page_table = | |
13273 | (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); | |
13274 | + } | |
13275 | ||
13276 | paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); | |
13277 | make_lowmem_page_readonly(page_table, | |
13278 | @@ -114,22 +118,21 @@ static pte_t * __init one_page_table_ini | |
13279 | } | |
13280 | ||
13281 | /* | |
13282 | - * This function initializes a certain range of kernel virtual memory | |
13283 | + * This function initializes a certain range of kernel virtual memory | |
13284 | * with new bootmem page tables, everywhere page tables are missing in | |
13285 | * the given range. | |
13286 | - */ | |
13287 | - | |
13288 | -/* | |
13289 | - * NOTE: The pagetables are allocated contiguous on the physical space | |
13290 | - * so we can cache the place of the first one and move around without | |
13291 | + * | |
13292 | + * NOTE: The pagetables are allocated contiguously in physical memory, | |
13293 | + * so we can cache the location of the first one and move around without | |
13294 | * checking the pgd every time. | |
13295 | */ | |
13296 | -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) | |
13297 | +static void __init | |
13298 | +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) | |
13299 | { | |
13300 | - pgd_t *pgd; | |
13301 | - pmd_t *pmd; | |
13302 | int pgd_idx, pmd_idx; | |
13303 | unsigned long vaddr; | |
13304 | + pgd_t *pgd; | |
13305 | + pmd_t *pmd; | |
13306 | ||
13307 | vaddr = start; | |
13308 | pgd_idx = pgd_index(vaddr); | |
13309 | @@ -139,7 +142,8 @@ static void __init page_table_range_init | |
13310 | for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { | |
13311 | pmd = one_md_table_init(pgd); | |
13312 | pmd = pmd + pmd_index(vaddr); | |
13313 | - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { | |
13314 | + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); | |
13315 | + pmd++, pmd_idx++) { | |
13316 | if (vaddr < hypervisor_virt_start) | |
13317 | one_page_table_init(pmd); | |
13318 | ||
13319 | @@ -157,17 +161,17 @@ static inline int is_kernel_text(unsigne | |
13320 | } | |
13321 | ||
13322 | /* | |
13323 | - * This maps the physical memory to kernel virtual address space, a total | |
13324 | - * of max_low_pfn pages, by creating page tables starting from address | |
13325 | - * PAGE_OFFSET. | |
13326 | + * This maps the physical memory to kernel virtual address space, a total | |
13327 | + * of max_low_pfn pages, by creating page tables starting from address | |
13328 | + * PAGE_OFFSET: | |
13329 | */ | |
13330 | static void __init kernel_physical_mapping_init(pgd_t *pgd_base) | |
13331 | { | |
13332 | + int pgd_idx, pmd_idx, pte_ofs; | |
13333 | unsigned long pfn; | |
13334 | pgd_t *pgd; | |
13335 | pmd_t *pmd; | |
13336 | pte_t *pte; | |
13337 | - int pgd_idx, pmd_idx, pte_ofs; | |
13338 | ||
13339 | unsigned long max_ram_pfn = xen_start_info->nr_pages; | |
13340 | if (max_ram_pfn > max_low_pfn) | |
13341 | @@ -195,36 +199,49 @@ static void __init kernel_physical_mappi | |
13342 | if (pfn >= max_low_pfn) | |
13343 | continue; | |
13344 | pmd += pmd_idx; | |
13345 | - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { | |
13346 | - unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; | |
13347 | - if (address >= hypervisor_virt_start) | |
13348 | + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; | |
13349 | + pmd++, pmd_idx++) { | |
13350 | + unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; | |
13351 | + | |
13352 | + if (addr >= hypervisor_virt_start) | |
13353 | continue; | |
13354 | ||
13355 | - /* Map with big pages if possible, otherwise create normal page tables. */ | |
13356 | + /* | |
13357 | + * Map with big pages if possible, otherwise | |
13358 | + * create normal page tables: | |
13359 | + */ | |
13360 | if (cpu_has_pse) { | |
13361 | - unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; | |
13362 | - if (is_kernel_text(address) || is_kernel_text(address2)) | |
13363 | - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); | |
13364 | - else | |
13365 | - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); | |
13366 | + unsigned int addr2; | |
13367 | + pgprot_t prot = PAGE_KERNEL_LARGE; | |
13368 | + | |
13369 | + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + | |
13370 | + PAGE_OFFSET + PAGE_SIZE-1; | |
13371 | + | |
13372 | + if (is_kernel_text(addr) || | |
13373 | + is_kernel_text(addr2)) | |
13374 | + prot = PAGE_KERNEL_LARGE_EXEC; | |
13375 | + | |
13376 | + set_pmd(pmd, pfn_pmd(pfn, prot)); | |
13377 | ||
13378 | pfn += PTRS_PER_PTE; | |
13379 | - } else { | |
13380 | - pte = one_page_table_init(pmd); | |
13381 | + continue; | |
13382 | + } | |
13383 | + pte = one_page_table_init(pmd); | |
13384 | + | |
13385 | + for (pte += pte_ofs; | |
13386 | + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; | |
13387 | + pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { | |
13388 | + pgprot_t prot = PAGE_KERNEL; | |
13389 | + | |
13390 | + /* XEN: Only map initial RAM allocation. */ | |
13391 | + if ((pfn >= max_ram_pfn) || pte_present(*pte)) | |
13392 | + continue; | |
13393 | + if (is_kernel_text(addr)) | |
13394 | + prot = PAGE_KERNEL_EXEC; | |
13395 | ||
13396 | - for (pte += pte_ofs; | |
13397 | - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; | |
13398 | - pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { | |
13399 | - /* XEN: Only map initial RAM allocation. */ | |
13400 | - if ((pfn >= max_ram_pfn) || pte_present(*pte)) | |
13401 | - continue; | |
13402 | - if (is_kernel_text(address)) | |
13403 | - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); | |
13404 | - else | |
13405 | - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); | |
13406 | - } | |
13407 | - pte_ofs = 0; | |
13408 | + set_pte(pte, pfn_pte(pfn, prot)); | |
13409 | } | |
13410 | + pte_ofs = 0; | |
13411 | } | |
13412 | pmd_idx = 0; | |
13413 | } | |
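A quick check of the big-page stride in the PSE branch above (non-PAE ia32 constants assumed): one pmd maps PTRS_PER_PTE small pages, which is why pfn advances by PTRS_PER_PTE per large mapping.

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096, ptrs_per_pte = 1024;	/* non-PAE ia32 */

	printf("%lu MiB\n", page_size * ptrs_per_pte >> 20);	/* prints 4 MiB per pmd */
	return 0;
}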
13414 | @@ -245,57 +262,23 @@ static inline int page_kills_ppro(unsign | |
13415 | ||
13416 | #endif | |
13417 | ||
13418 | -int page_is_ram(unsigned long pagenr) | |
13419 | -{ | |
13420 | - int i; | |
13421 | - unsigned long addr, end; | |
13422 | - | |
13423 | - if (efi_enabled) { | |
13424 | - efi_memory_desc_t *md; | |
13425 | - void *p; | |
13426 | - | |
13427 | - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { | |
13428 | - md = p; | |
13429 | - if (!is_available_memory(md)) | |
13430 | - continue; | |
13431 | - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; | |
13432 | - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; | |
13433 | - | |
13434 | - if ((pagenr >= addr) && (pagenr < end)) | |
13435 | - return 1; | |
13436 | - } | |
13437 | - return 0; | |
13438 | - } | |
13439 | - | |
13440 | - for (i = 0; i < e820.nr_map; i++) { | |
13441 | - | |
13442 | - if (e820.map[i].type != E820_RAM) /* not usable memory */ | |
13443 | - continue; | |
13444 | - /* | |
13445 | - * !!!FIXME!!! Some BIOSen report areas as RAM that | |
13446 | - * are not. Notably the 640->1Mb area. We need a sanity | |
13447 | - * check here. | |
13448 | - */ | |
13449 | - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; | |
13450 | - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; | |
13451 | - if ((pagenr >= addr) && (pagenr < end)) | |
13452 | - return 1; | |
13453 | - } | |
13454 | - return 0; | |
13455 | -} | |
13456 | - | |
13457 | #ifdef CONFIG_HIGHMEM | |
13458 | pte_t *kmap_pte; | |
13459 | pgprot_t kmap_prot; | |
13460 | ||
13461 | -#define kmap_get_fixmap_pte(vaddr) \ | |
13462 | - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) | |
13463 | +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) | |
13464 | +{ | |
13465 | + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), | |
13466 | + vaddr), vaddr), vaddr); | |
13467 | +} | |
13468 | ||
13469 | static void __init kmap_init(void) | |
13470 | { | |
13471 | unsigned long kmap_vstart; | |
13472 | ||
13473 | - /* cache the first kmap pte */ | |
13474 | + /* | |
13475 | + * Cache the first kmap pte: | |
13476 | + */ | |
13477 | kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); | |
13478 | kmap_pte = kmap_get_fixmap_pte(kmap_vstart); | |
13479 | ||
13480 | @@ -304,11 +287,11 @@ static void __init kmap_init(void) | |
13481 | ||
13482 | static void __init permanent_kmaps_init(pgd_t *pgd_base) | |
13483 | { | |
13484 | + unsigned long vaddr; | |
13485 | pgd_t *pgd; | |
13486 | pud_t *pud; | |
13487 | pmd_t *pmd; | |
13488 | pte_t *pte; | |
13489 | - unsigned long vaddr; | |
13490 | ||
13491 | vaddr = PKMAP_BASE; | |
13492 | page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); | |
13493 | @@ -317,7 +300,7 @@ static void __init permanent_kmaps_init( | |
13494 | pud = pud_offset(pgd, vaddr); | |
13495 | pmd = pmd_offset(pud, vaddr); | |
13496 | pte = pte_offset_kernel(pmd, vaddr); | |
13497 | - pkmap_page_table = pte; | |
13498 | + pkmap_page_table = pte; | |
13499 | } | |
13500 | ||
13501 | static void __meminit free_new_highpage(struct page *page, int pfn) | |
13502 | @@ -337,7 +320,8 @@ void __init add_one_highpage_init(struct | |
13503 | SetPageReserved(page); | |
13504 | } | |
13505 | ||
13506 | -static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn) | |
13507 | +static int __meminit | |
13508 | +add_one_highpage_hotplug(struct page *page, unsigned long pfn) | |
13509 | { | |
13510 | free_new_highpage(page, pfn); | |
13511 | totalram_pages++; | |
13512 | @@ -345,6 +329,7 @@ static int __meminit add_one_highpage_ho | |
13513 | max_mapnr = max(pfn, max_mapnr); | |
13514 | #endif | |
13515 | num_physpages++; | |
13516 | + | |
13517 | return 0; | |
13518 | } | |
13519 | ||
13520 | @@ -352,7 +337,7 @@ static int __meminit add_one_highpage_ho | |
13521 | * Not currently handling the NUMA case. | |
13522 | * Assuming single node and all memory that | |
13523 | * has been added dynamically and would be | |
13524 | - * onlined here is in HIGHMEM | |
13525 | + * onlined here is in HIGHMEM. | |
13526 | */ | |
13527 | void __meminit online_page(struct page *page) | |
13528 | { | |
13529 | @@ -360,13 +345,11 @@ void __meminit online_page(struct page * | |
13530 | add_one_highpage_hotplug(page, page_to_pfn(page)); | |
13531 | } | |
13532 | ||
13533 | - | |
13534 | -#ifdef CONFIG_NUMA | |
13535 | -extern void set_highmem_pages_init(int); | |
13536 | -#else | |
13537 | +#ifndef CONFIG_NUMA | |
13538 | static void __init set_highmem_pages_init(int bad_ppro) | |
13539 | { | |
13540 | int pfn; | |
13541 | + | |
13542 | for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { | |
13543 | /* | |
13544 | * Holes under sparsemem might have no mem_map[]: | |
13545 | @@ -376,23 +359,18 @@ static void __init set_highmem_pages_ini | |
13546 | } | |
13547 | totalram_pages += totalhigh_pages; | |
13548 | } | |
13549 | -#endif /* CONFIG_FLATMEM */ | |
13550 | +#endif /* !CONFIG_NUMA */ | |
13551 | ||
13552 | #else | |
13553 | -#define kmap_init() do { } while (0) | |
13554 | -#define permanent_kmaps_init(pgd_base) do { } while (0) | |
13555 | -#define set_highmem_pages_init(bad_ppro) do { } while (0) | |
13556 | +# define kmap_init() do { } while (0) | |
13557 | +# define permanent_kmaps_init(pgd_base) do { } while (0) | |
13558 | +# define set_highmem_pages_init(bad_ppro) do { } while (0) | |
13559 | #endif /* CONFIG_HIGHMEM */ | |
13560 | ||
13561 | -unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; | |
13562 | +pteval_t __PAGE_KERNEL = _PAGE_KERNEL; | |
13563 | EXPORT_SYMBOL(__PAGE_KERNEL); | |
13564 | -unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; | |
13565 | ||
13566 | -#ifdef CONFIG_NUMA | |
13567 | -extern void __init remap_numa_kva(void); | |
13568 | -#else | |
13569 | -#define remap_numa_kva() do {} while (0) | |
13570 | -#endif | |
13571 | +pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; | |
13572 | ||
13573 | pgd_t *swapper_pg_dir; | |
13574 | ||
13575 | @@ -410,9 +388,8 @@ static void __init xen_pagetable_setup_d | |
13576 | * the boot process. | |
13577 | * | |
13578 | * If we're booting on native hardware, this will be a pagetable | |
13579 | - * constructed in arch/i386/kernel/head.S, and not running in PAE mode | |
13580 | - * (even if we'll end up running in PAE). The root of the pagetable | |
13581 | - * will be swapper_pg_dir. | |
13582 | + * constructed in arch/x86/kernel/head_32.S. The root of the | |
13583 | + * pagetable will be swapper_pg_dir. | |
13584 | * | |
13585 | * If we're booting paravirtualized under a hypervisor, then there are | |
13586 | * more options: we may already be running PAE, and the pagetable may | |
13587 | @@ -424,10 +401,10 @@ static void __init xen_pagetable_setup_d | |
13588 | * be partially populated, and so it avoids stomping on any existing | |
13589 | * mappings. | |
13590 | */ | |
13591 | -static void __init pagetable_init (void) | |
13592 | +static void __init pagetable_init(void) | |
13593 | { | |
13594 | - unsigned long vaddr, end; | |
13595 | pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base; | |
13596 | + unsigned long vaddr, end; | |
13597 | ||
13598 | xen_pagetable_setup_start(pgd_base); | |
13599 | ||
13600 | @@ -449,34 +426,36 @@ static void __init pagetable_init (void) | |
13601 | * Fixed mappings, only the page table structure has to be | |
13602 | * created - mappings will be set by set_fixmap(): | |
13603 | */ | |
13604 | + early_ioremap_clear(); | |
13605 | vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; | |
13606 | end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; | |
13607 | page_table_range_init(vaddr, end, pgd_base); | |
13608 | + early_ioremap_reset(); | |
13609 | ||
13610 | permanent_kmaps_init(pgd_base); | |
13611 | ||
13612 | xen_pagetable_setup_done(pgd_base); | |
13613 | } | |
13614 | ||
13615 | -#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI) | |
13616 | +#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN) | |
13617 | /* | |
13618 | - * Swap suspend & friends need this for resume because things like the intel-agp | |
13619 | + * ACPI suspend needs this for resume, because things like the intel-agp | |
13620 | * driver might have split up a kernel 4MB mapping. | |
13621 | */ | |
13622 | -char __nosavedata swsusp_pg_dir[PAGE_SIZE] | |
13623 | - __attribute__ ((aligned (PAGE_SIZE))); | |
13624 | +char swsusp_pg_dir[PAGE_SIZE] | |
13625 | + __attribute__ ((aligned(PAGE_SIZE))); | |
13626 | ||
13627 | static inline void save_pg_dir(void) | |
13628 | { | |
13629 | memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); | |
13630 | } | |
13631 | -#else | |
13632 | +#else /* !CONFIG_ACPI_SLEEP */ | |
13633 | static inline void save_pg_dir(void) | |
13634 | { | |
13635 | } | |
13636 | -#endif | |
13637 | +#endif /* !CONFIG_ACPI_SLEEP */ | |
13638 | ||
13639 | -void zap_low_mappings (void) | |
13640 | +void zap_low_mappings(void) | |
13641 | { | |
13642 | int i; | |
13643 | ||
13644 | @@ -488,22 +467,24 @@ void zap_low_mappings (void) | |
13645 | * Note that "pgd_clear()" doesn't do it for | |
13646 | * us, because pgd_clear() is a no-op on i386. | |
13647 | */ | |
13648 | - for (i = 0; i < USER_PTRS_PER_PGD; i++) | |
13649 | + for (i = 0; i < USER_PTRS_PER_PGD; i++) { | |
13650 | #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) | |
13651 | set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); | |
13652 | #else | |
13653 | set_pgd(swapper_pg_dir+i, __pgd(0)); | |
13654 | #endif | |
13655 | + } | |
13656 | flush_tlb_all(); | |
13657 | } | |
13658 | ||
13659 | -int nx_enabled = 0; | |
13660 | +int nx_enabled; | |
13661 | + | |
13662 | +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; | |
13663 | +EXPORT_SYMBOL_GPL(__supported_pte_mask); | |
13664 | ||
13665 | #ifdef CONFIG_X86_PAE | |
13666 | ||
13667 | -static int disable_nx __initdata = 0; | |
13668 | -u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; | |
13669 | -EXPORT_SYMBOL_GPL(__supported_pte_mask); | |
13670 | +static int disable_nx __initdata; | |
13671 | ||
13672 | /* | |
13673 | * noexec = on|off | |
13674 | @@ -520,11 +501,14 @@ static int __init noexec_setup(char *str | |
13675 | __supported_pte_mask |= _PAGE_NX; | |
13676 | disable_nx = 0; | |
13677 | } | |
13678 | - } else if (!strcmp(str,"off")) { | |
13679 | - disable_nx = 1; | |
13680 | - __supported_pte_mask &= ~_PAGE_NX; | |
13681 | - } else | |
13682 | - return -EINVAL; | |
13683 | + } else { | |
13684 | + if (!strcmp(str, "off")) { | |
13685 | + disable_nx = 1; | |
13686 | + __supported_pte_mask &= ~_PAGE_NX; | |
13687 | + } else { | |
13688 | + return -EINVAL; | |
13689 | + } | |
13690 | + } | |
13691 | ||
13692 | return 0; | |
13693 | } | |
13694 | @@ -536,6 +520,7 @@ static void __init set_nx(void) | |
13695 | ||
13696 | if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { | |
13697 | cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); | |
13698 | + | |
13699 | if ((v[3] & (1 << 20)) && !disable_nx) { | |
13700 | rdmsr(MSR_EFER, l, h); | |
13701 | l |= EFER_NX; | |
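set_nx() probes the NX capability exactly as shown: extended CPUID leaf 0x80000001, EDX bit 20, then EFER.NX via rdmsr/wrmsr. The CPUID half can be reproduced from userspace; a sketch for x86 with GCC inline asm (the MSR write is kernel-only and omitted):

    #include <stdio.h>

    static void cpuid(unsigned int op, unsigned int *a, unsigned int *b,
                      unsigned int *c, unsigned int *d)
    {
            __asm__ volatile("cpuid"
                             : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
                             : "0" (op));
    }

    int main(void)
    {
            unsigned int a, b, c, d;

            cpuid(0x80000000, &a, &b, &c, &d);
            if (a < 0x80000001) {
                    printf("no extended CPUID leaves\n");
                    return 0;
            }
            cpuid(0x80000001, &a, &b, &c, &d);
            printf("NX %ssupported\n", (d & (1u << 20)) ? "" : "not ");
            return 0;
    }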
13702 | @@ -545,35 +530,6 @@ static void __init set_nx(void) | |
13703 | } | |
13704 | } | |
13705 | } | |
13706 | - | |
13707 | -/* | |
13708 | - * Enables/disables executability of a given kernel page and | |
13709 | - * returns the previous setting. | |
13710 | - */ | |
13711 | -int __init set_kernel_exec(unsigned long vaddr, int enable) | |
13712 | -{ | |
13713 | - pte_t *pte; | |
13714 | - int ret = 1; | |
13715 | - | |
13716 | - if (!nx_enabled) | |
13717 | - goto out; | |
13718 | - | |
13719 | - pte = lookup_address(vaddr); | |
13720 | - BUG_ON(!pte); | |
13721 | - | |
13722 | - if (!pte_exec_kernel(*pte)) | |
13723 | - ret = 0; | |
13724 | - | |
13725 | - if (enable) | |
13726 | - pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); | |
13727 | - else | |
13728 | - pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); | |
13729 | - pte_update_defer(&init_mm, vaddr, pte); | |
13730 | - __flush_tlb_all(); | |
13731 | -out: | |
13732 | - return ret; | |
13733 | -} | |
13734 | - | |
13735 | #endif | |
13736 | ||
13737 | /* | |
13738 | @@ -590,21 +546,10 @@ void __init paging_init(void) | |
13739 | #ifdef CONFIG_X86_PAE | |
13740 | set_nx(); | |
13741 | if (nx_enabled) | |
13742 | - printk("NX (Execute Disable) protection: active\n"); | |
13743 | + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); | |
13744 | #endif | |
13745 | - | |
13746 | pagetable_init(); | |
13747 | ||
13748 | -#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) | |
13749 | - /* | |
13750 | - * We will bail out later - printk doesn't work right now so | |
13751 | - * the user would just see a hanging kernel. | |
13752 | - * when running as xen domain we are already in PAE mode at | |
13753 | - * this point. | |
13754 | - */ | |
13755 | - if (cpu_has_pae) | |
13756 | - set_in_cr4(X86_CR4_PAE); | |
13757 | -#endif | |
13758 | __flush_tlb_all(); | |
13759 | ||
13760 | kmap_init(); | |
13761 | @@ -631,10 +576,10 @@ void __init paging_init(void) | |
13762 | * used to involve black magic jumps to work around some nasty CPU bugs, | |
13763 | * but fortunately the switch to using exceptions got rid of all that. | |
13764 | */ | |
13765 | - | |
13766 | static void __init test_wp_bit(void) | |
13767 | { | |
13768 | - printk("Checking if this processor honours the WP bit even in supervisor mode... "); | |
13769 | + printk(KERN_INFO | |
13770 | + "Checking if this processor honours the WP bit even in supervisor mode..."); | |
13771 | ||
13772 | /* Any page-aligned address will do, the test is non-destructive */ | |
13773 | __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); | |
13774 | @@ -642,23 +587,22 @@ static void __init test_wp_bit(void) | |
13775 | clear_fixmap(FIX_WP_TEST); | |
13776 | ||
13777 | if (!boot_cpu_data.wp_works_ok) { | |
13778 | - printk("No.\n"); | |
13779 | + printk(KERN_CONT "No.\n"); | |
13780 | #ifdef CONFIG_X86_WP_WORKS_OK | |
13781 | - panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); | |
13782 | + panic( | |
13783 | + "This kernel doesn't support CPUs with broken WP. Recompile it for a 386!"); | |
13784 | #endif | |
13785 | } else { | |
13786 | - printk("Ok.\n"); | |
13787 | + printk(KERN_CONT "Ok.\n"); | |
13788 | } | |
13789 | } | |
13790 | ||
13791 | -static struct kcore_list kcore_mem, kcore_vmalloc; | |
13792 | +static struct kcore_list kcore_mem, kcore_vmalloc; | |
13793 | ||
13794 | void __init mem_init(void) | |
13795 | { | |
13796 | - extern int ppro_with_ram_bug(void); | |
13797 | int codesize, reservedpages, datasize, initsize; | |
13798 | - int tmp; | |
13799 | - int bad_ppro; | |
13800 | + int tmp, bad_ppro; | |
13801 | unsigned long pfn; | |
13802 | ||
13803 | #if defined(CONFIG_SWIOTLB) | |
13804 | @@ -668,19 +612,19 @@ void __init mem_init(void) | |
13805 | #ifdef CONFIG_FLATMEM | |
13806 | BUG_ON(!mem_map); | |
13807 | #endif | |
13808 | - | |
13809 | bad_ppro = ppro_with_ram_bug(); | |
13810 | ||
13811 | #ifdef CONFIG_HIGHMEM | |
13812 | /* check that fixmap and pkmap do not overlap */ | |
13813 | - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { | |
13814 | - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); | |
13815 | + if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { | |
13816 | + printk(KERN_ERR | |
13817 | + "fixmap and kmap areas overlap - this will crash\n"); | |
13818 | printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", | |
13819 | - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); | |
13820 | + PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE, | |
13821 | + FIXADDR_START); | |
13822 | BUG(); | |
13823 | } | |
13824 | #endif | |
13825 | - | |
13826 | /* this will put all low memory onto the freelists */ | |
13827 | totalram_pages += free_all_bootmem(); | |
13828 | /* XEN: init and count low-mem pages outside initial allocation. */ | |
13829 | @@ -693,7 +637,7 @@ void __init mem_init(void) | |
13830 | reservedpages = 0; | |
13831 | for (tmp = 0; tmp < max_low_pfn; tmp++) | |
13832 | /* | |
13833 | - * Only count reserved RAM pages | |
13834 | + * Only count reserved RAM pages: | |
13835 | */ | |
13836 | if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) | |
13837 | reservedpages++; | |
13838 | @@ -704,11 +648,12 @@ void __init mem_init(void) | |
13839 | datasize = (unsigned long) &_edata - (unsigned long) &_etext; | |
13840 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | |
13841 | ||
13842 | - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | |
13843 | - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | |
13844 | + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | |
13845 | + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | |
13846 | VMALLOC_END-VMALLOC_START); | |
13847 | ||
13848 | - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", | |
13849 | + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " | |
13850 | + "%dk reserved, %dk data, %dk init, %ldk highmem)\n", | |
13851 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | |
13852 | num_physpages << (PAGE_SHIFT-10), | |
13853 | codesize >> 10, | |
13854 | @@ -719,54 +664,53 @@ void __init mem_init(void) | |
13855 | ); | |
13856 | ||
13857 | #if 1 /* double-sanity-check paranoia */ | |
13858 | - printk("virtual kernel memory layout:\n" | |
13859 | - " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" | |
13860 | + printk(KERN_INFO "virtual kernel memory layout:\n" | |
13861 | + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" | |
13862 | #ifdef CONFIG_HIGHMEM | |
13863 | - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" | |
13864 | + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" | |
13865 | #endif | |
13866 | - " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" | |
13867 | - " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" | |
13868 | - " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" | |
13869 | - " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" | |
13870 | - " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", | |
13871 | - FIXADDR_START, FIXADDR_TOP, | |
13872 | - (FIXADDR_TOP - FIXADDR_START) >> 10, | |
13873 | + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" | |
13874 | + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" | |
13875 | + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" | |
13876 | + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" | |
13877 | + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", | |
13878 | + FIXADDR_START, FIXADDR_TOP, | |
13879 | + (FIXADDR_TOP - FIXADDR_START) >> 10, | |
13880 | ||
13881 | #ifdef CONFIG_HIGHMEM | |
13882 | - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, | |
13883 | - (LAST_PKMAP*PAGE_SIZE) >> 10, | |
13884 | + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, | |
13885 | + (LAST_PKMAP*PAGE_SIZE) >> 10, | |
13886 | #endif | |
13887 | ||
13888 | - VMALLOC_START, VMALLOC_END, | |
13889 | - (VMALLOC_END - VMALLOC_START) >> 20, | |
13890 | + VMALLOC_START, VMALLOC_END, | |
13891 | + (VMALLOC_END - VMALLOC_START) >> 20, | |
13892 | ||
13893 | - (unsigned long)__va(0), (unsigned long)high_memory, | |
13894 | - ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, | |
13895 | + (unsigned long)__va(0), (unsigned long)high_memory, | |
13896 | + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, | |
13897 | ||
13898 | - (unsigned long)&__init_begin, (unsigned long)&__init_end, | |
13899 | - ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, | |
13900 | + (unsigned long)&__init_begin, (unsigned long)&__init_end, | |
13901 | + ((unsigned long)&__init_end - | |
13902 | + (unsigned long)&__init_begin) >> 10, | |
13903 | ||
13904 | - (unsigned long)&_etext, (unsigned long)&_edata, | |
13905 | - ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, | |
13906 | + (unsigned long)&_etext, (unsigned long)&_edata, | |
13907 | + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, | |
13908 | ||
13909 | - (unsigned long)&_text, (unsigned long)&_etext, | |
13910 | - ((unsigned long)&_etext - (unsigned long)&_text) >> 10); | |
13911 | + (unsigned long)&_text, (unsigned long)&_etext, | |
13912 | + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); | |
13913 | ||
13914 | #ifdef CONFIG_HIGHMEM | |
13915 | - BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); | |
13916 | - BUG_ON(VMALLOC_END > PKMAP_BASE); | |
13917 | + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); | |
13918 | + BUG_ON(VMALLOC_END > PKMAP_BASE); | |
13919 | #endif | |
13920 | - BUG_ON(VMALLOC_START > VMALLOC_END); | |
13921 | - BUG_ON((unsigned long)high_memory > VMALLOC_START); | |
13922 | + BUG_ON(VMALLOC_START > VMALLOC_END); | |
13923 | + BUG_ON((unsigned long)high_memory > VMALLOC_START); | |
13924 | #endif /* double-sanity-check paranoia */ | |
13925 | ||
13926 | -#ifdef CONFIG_X86_PAE | |
13927 | - if (!cpu_has_pae) | |
13928 | - panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); | |
13929 | -#endif | |
13930 | if (boot_cpu_data.wp_works_ok < 0) | |
13931 | test_wp_bit(); | |
13932 | ||
13933 | + cpa_init(); | |
13934 | + | |
13935 | /* | |
13936 | * Subtle. SMP is doing its boot stuff late (because it has to | |
13937 | * fork idle threads) - but it also needs low mappings for the | |
13938 | @@ -790,49 +734,35 @@ int arch_add_memory(int nid, u64 start, | |
13939 | ||
13940 | return __add_pages(zone, start_pfn, nr_pages); | |
13941 | } | |
13942 | - | |
13943 | #endif | |
13944 | ||
13945 | -struct kmem_cache *pmd_cache; | |
13946 | - | |
13947 | -void __init pgtable_cache_init(void) | |
13948 | -{ | |
13949 | - if (PTRS_PER_PMD > 1) | |
13950 | - pmd_cache = kmem_cache_create("pmd", | |
13951 | - PTRS_PER_PMD*sizeof(pmd_t), | |
13952 | - PTRS_PER_PMD*sizeof(pmd_t), | |
13953 | - SLAB_PANIC, | |
13954 | - pmd_ctor); | |
13955 | -} | |
13956 | - | |
13957 | /* | |
13958 | * This function cannot be __init, since exceptions don't work in that | |
13959 | * section. Put this after the callers, so that it cannot be inlined. | |
13960 | */ | |
13961 | -static int noinline do_test_wp_bit(void) | |
13962 | +static noinline int do_test_wp_bit(void) | |
13963 | { | |
13964 | char tmp_reg; | |
13965 | int flag; | |
13966 | ||
13967 | __asm__ __volatile__( | |
13968 | - " movb %0,%1 \n" | |
13969 | - "1: movb %1,%0 \n" | |
13970 | - " xorl %2,%2 \n" | |
13971 | + " movb %0, %1 \n" | |
13972 | + "1: movb %1, %0 \n" | |
13973 | + " xorl %2, %2 \n" | |
13974 | "2: \n" | |
13975 | - ".section __ex_table,\"a\"\n" | |
13976 | - " .align 4 \n" | |
13977 | - " .long 1b,2b \n" | |
13978 | - ".previous \n" | |
13979 | + _ASM_EXTABLE(1b,2b) | |
13980 | :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), | |
13981 | "=q" (tmp_reg), | |
13982 | "=r" (flag) | |
13983 | :"2" (1) | |
13984 | :"memory"); | |
13985 | - | |
13986 | + | |
13987 | return flag; | |
13988 | } | |
13989 | ||
13990 | #ifdef CONFIG_DEBUG_RODATA | |
13991 | +const int rodata_test_data = 0xC3; | |
13992 | +EXPORT_SYMBOL_GPL(rodata_test_data); | |
13993 | ||
13994 | void mark_rodata_ro(void) | |
13995 | { | |
13996 | @@ -845,32 +775,58 @@ void mark_rodata_ro(void) | |
13997 | if (num_possible_cpus() <= 1) | |
13998 | #endif | |
13999 | { | |
14000 | - change_page_attr(virt_to_page(start), | |
14001 | - size >> PAGE_SHIFT, PAGE_KERNEL_RX); | |
14002 | - printk("Write protecting the kernel text: %luk\n", size >> 10); | |
14003 | + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | |
14004 | + printk(KERN_INFO "Write protecting the kernel text: %luk\n", | |
14005 | + size >> 10); | |
14006 | + | |
14007 | +#ifdef CONFIG_CPA_DEBUG | |
14008 | + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", | |
14009 | + start, start+size); | |
14010 | + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); | |
14011 | + | |
14012 | + printk(KERN_INFO "Testing CPA: write protecting again\n"); | |
14013 | + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); | |
14014 | +#endif | |
14015 | } | |
14016 | #endif | |
14017 | start += size; | |
14018 | size = (unsigned long)__end_rodata - start; | |
14019 | - change_page_attr(virt_to_page(start), | |
14020 | - size >> PAGE_SHIFT, PAGE_KERNEL_RO); | |
14021 | - printk("Write protecting the kernel read-only data: %luk\n", | |
14022 | - size >> 10); | |
14023 | + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | |
14024 | + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", | |
14025 | + size >> 10); | |
14026 | + rodata_test(); | |
14027 | + | |
14028 | +#ifdef CONFIG_CPA_DEBUG | |
14029 | + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); | |
14030 | + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); | |
14031 | ||
14032 | - /* | |
14033 | - * change_page_attr() requires a global_flush_tlb() call after it. | |
14034 | - * We do this after the printk so that if something went wrong in the | |
14035 | - * change, the printk gets out at least to give a better debug hint | |
14036 | - * of who is the culprit. | |
14037 | - */ | |
14038 | - global_flush_tlb(); | |
14039 | + printk(KERN_INFO "Testing CPA: write protecting again\n"); | |
14040 | + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | |
14041 | +#endif | |
14042 | } | |
14043 | #endif | |
14044 | ||
14045 | void free_init_pages(char *what, unsigned long begin, unsigned long end) | |
14046 | { | |
14047 | +#ifdef CONFIG_DEBUG_PAGEALLOC | |
14048 | + /* | |
14049 | + * If debugging page accesses then do not free this memory but | |
14050 | + * mark it not present - any buggy init-section access will | |
14051 | + * create a kernel page fault: | |
14052 | + */ | |
14053 | + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", | |
14054 | + begin, PAGE_ALIGN(end)); | |
14055 | + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); | |
14056 | +#else | |
14057 | unsigned long addr; | |
14058 | ||
14059 | + /* | |
14060 | + * We just marked the kernel text read only above, now that | |
14061 | + * we are going to free part of that, we need to make that | |
14062 | + * writeable first. | |
14063 | + */ | |
14064 | + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); | |
14065 | + | |
14066 | for (addr = begin; addr < end; addr += PAGE_SIZE) { | |
14067 | ClearPageReserved(virt_to_page(addr)); | |
14068 | init_page_count(virt_to_page(addr)); | |
14069 | @@ -879,6 +835,7 @@ void free_init_pages(char *what, unsigne | |
14070 | totalram_pages++; | |
14071 | } | |
14072 | printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); | |
14073 | +#endif | |
14074 | } | |
14075 | ||
14076 | void free_initmem(void) | |
14077 | @@ -894,4 +851,3 @@ void free_initrd_mem(unsigned long start | |
14078 | free_init_pages("initrd memory", start, end); | |
14079 | } | |
14080 | #endif | |
14081 | - | |
14082 | --- a/arch/x86/mm/init_64-xen.c | |
14083 | +++ b/arch/x86/mm/init_64-xen.c | |
14084 | @@ -46,14 +46,13 @@ | |
14085 | #include <asm/proto.h> | |
14086 | #include <asm/smp.h> | |
14087 | #include <asm/sections.h> | |
14088 | +#include <asm/kdebug.h> | |
14089 | +#include <asm/numa.h> | |
14090 | +#include <asm/cacheflush.h> | |
14091 | ||
14092 | #include <xen/features.h> | |
14093 | ||
14094 | -#ifndef Dprintk | |
14095 | -#define Dprintk(x...) | |
14096 | -#endif | |
14097 | - | |
14098 | -const struct dma_mapping_ops* dma_ops; | |
14099 | +const struct dma_mapping_ops *dma_ops; | |
14100 | EXPORT_SYMBOL(dma_ops); | |
14101 | ||
14102 | #if CONFIG_XEN_COMPAT <= 0x030002 | |
14103 | @@ -80,7 +79,21 @@ extern pte_t level1_fixmap_pgt[PTRS_PER_ | |
14104 | (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \ | |
14105 | __START_KERNEL_map))) | |
14106 | ||
14107 | -static void __meminit early_make_page_readonly(void *va, unsigned int feature) | |
14108 | +pmd_t *__init early_get_pmd(unsigned long va) | |
14109 | +{ | |
14110 | + unsigned long addr; | |
14111 | + unsigned long *page = (unsigned long *)init_level4_pgt; | |
14112 | + | |
14113 | + addr = page[pgd_index(va)]; | |
14114 | + addr_to_page(addr, page); | |
14115 | + | |
14116 | + addr = page[pud_index(va)]; | |
14117 | + addr_to_page(addr, page); | |
14118 | + | |
14119 | + return (pmd_t *)&page[pmd_index(va)]; | |
14120 | +} | |
14121 | + | |
14122 | +void __meminit early_make_page_readonly(void *va, unsigned int feature) | |
14123 | { | |
14124 | unsigned long addr, _va = (unsigned long)va; | |
14125 | pte_t pte, *ptep; | |
14126 | @@ -107,76 +120,6 @@ static void __meminit early_make_page_re | |
14127 | BUG(); | |
14128 | } | |
14129 | ||
14130 | -static void __make_page_readonly(void *va) | |
14131 | -{ | |
14132 | - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep; | |
14133 | - unsigned long addr = (unsigned long) va; | |
14134 | - | |
14135 | - pgd = pgd_offset_k(addr); | |
14136 | - pud = pud_offset(pgd, addr); | |
14137 | - pmd = pmd_offset(pud, addr); | |
14138 | - ptep = pte_offset_kernel(pmd, addr); | |
14139 | - | |
14140 | - pte.pte = ptep->pte & ~_PAGE_RW; | |
14141 | - if (HYPERVISOR_update_va_mapping(addr, pte, 0)) | |
14142 | - xen_l1_entry_update(ptep, pte); /* fallback */ | |
14143 | - | |
14144 | - if ((addr >= VMALLOC_START) && (addr < VMALLOC_END)) | |
14145 | - __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT)); | |
14146 | -} | |
14147 | - | |
14148 | -static void __make_page_writable(void *va) | |
14149 | -{ | |
14150 | - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep; | |
14151 | - unsigned long addr = (unsigned long) va; | |
14152 | - | |
14153 | - pgd = pgd_offset_k(addr); | |
14154 | - pud = pud_offset(pgd, addr); | |
14155 | - pmd = pmd_offset(pud, addr); | |
14156 | - ptep = pte_offset_kernel(pmd, addr); | |
14157 | - | |
14158 | - pte.pte = ptep->pte | _PAGE_RW; | |
14159 | - if (HYPERVISOR_update_va_mapping(addr, pte, 0)) | |
14160 | - xen_l1_entry_update(ptep, pte); /* fallback */ | |
14161 | - | |
14162 | - if ((addr >= VMALLOC_START) && (addr < VMALLOC_END)) | |
14163 | - __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT)); | |
14164 | -} | |
14165 | - | |
14166 | -void make_page_readonly(void *va, unsigned int feature) | |
14167 | -{ | |
14168 | - if (!xen_feature(feature)) | |
14169 | - __make_page_readonly(va); | |
14170 | -} | |
14171 | - | |
14172 | -void make_page_writable(void *va, unsigned int feature) | |
14173 | -{ | |
14174 | - if (!xen_feature(feature)) | |
14175 | - __make_page_writable(va); | |
14176 | -} | |
14177 | - | |
14178 | -void make_pages_readonly(void *va, unsigned nr, unsigned int feature) | |
14179 | -{ | |
14180 | - if (xen_feature(feature)) | |
14181 | - return; | |
14182 | - | |
14183 | - while (nr-- != 0) { | |
14184 | - __make_page_readonly(va); | |
14185 | - va = (void*)((unsigned long)va + PAGE_SIZE); | |
14186 | - } | |
14187 | -} | |
14188 | - | |
14189 | -void make_pages_writable(void *va, unsigned nr, unsigned int feature) | |
14190 | -{ | |
14191 | - if (xen_feature(feature)) | |
14192 | - return; | |
14193 | - | |
14194 | - while (nr-- != 0) { | |
14195 | - __make_page_writable(va); | |
14196 | - va = (void*)((unsigned long)va + PAGE_SIZE); | |
14197 | - } | |
14198 | -} | |
14199 | - | |
14200 | /* | |
14201 | * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the | |
14202 | * physical space so we can cache the place of the first one and move | |
14203 | @@ -187,22 +130,26 @@ void show_mem(void) | |
14204 | { | |
14205 | long i, total = 0, reserved = 0; | |
14206 | long shared = 0, cached = 0; | |
14207 | - pg_data_t *pgdat; | |
14208 | struct page *page; | |
14209 | + pg_data_t *pgdat; | |
14210 | ||
14211 | printk(KERN_INFO "Mem-info:\n"); | |
14212 | show_free_areas(); | |
14213 | - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | |
14214 | + printk(KERN_INFO "Free swap: %6ldkB\n", | |
14215 | + nr_swap_pages << (PAGE_SHIFT-10)); | |
14216 | ||
14217 | for_each_online_pgdat(pgdat) { | |
14218 | - for (i = 0; i < pgdat->node_spanned_pages; ++i) { | |
14219 | - /* this loop can take a while with 256 GB and 4k pages | |
14220 | - so update the NMI watchdog */ | |
14221 | - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { | |
14222 | + for (i = 0; i < pgdat->node_spanned_pages; ++i) { | |
14223 | + /* | |
14224 | + * This loop can take a while with 256 GB and | |
14225 | + * 4k pages so defer the NMI watchdog: | |
14226 | + */ | |
14227 | + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) | |
14228 | touch_nmi_watchdog(); | |
14229 | - } | |
14230 | + | |
14231 | if (!pfn_valid(pgdat->node_start_pfn + i)) | |
14232 | continue; | |
14233 | + | |
14234 | page = pfn_to_page(pgdat->node_start_pfn + i); | |
14235 | total++; | |
14236 | if (PageReserved(page)) | |
14237 | @@ -211,58 +158,67 @@ void show_mem(void) | |
14238 | cached++; | |
14239 | else if (page_count(page)) | |
14240 | shared += page_count(page) - 1; | |
14241 | - } | |
14242 | + } | |
14243 | } | |
14244 | - printk(KERN_INFO "%lu pages of RAM\n", total); | |
14245 | - printk(KERN_INFO "%lu reserved pages\n",reserved); | |
14246 | - printk(KERN_INFO "%lu pages shared\n",shared); | |
14247 | - printk(KERN_INFO "%lu pages swap cached\n",cached); | |
14248 | + printk(KERN_INFO "%lu pages of RAM\n", total); | |
14249 | + printk(KERN_INFO "%lu reserved pages\n", reserved); | |
14250 | + printk(KERN_INFO "%lu pages shared\n", shared); | |
14251 | + printk(KERN_INFO "%lu pages swap cached\n", cached); | |
14252 | } | |
14253 | ||
14254 | +static unsigned long __meminitdata table_start; | |
14255 | +static unsigned long __meminitdata table_end; | |
14256 | ||
14257 | static __init void *spp_getpage(void) | |
14258 | -{ | |
14259 | +{ | |
14260 | void *ptr; | |
14261 | + | |
14262 | if (after_bootmem) | |
14263 | - ptr = (void *) get_zeroed_page(GFP_ATOMIC); | |
14264 | + ptr = (void *) get_zeroed_page(GFP_ATOMIC); | |
14265 | else if (start_pfn < table_end) { | |
14266 | ptr = __va(start_pfn << PAGE_SHIFT); | |
14267 | start_pfn++; | |
14268 | memset(ptr, 0, PAGE_SIZE); | |
14269 | } else | |
14270 | ptr = alloc_bootmem_pages(PAGE_SIZE); | |
14271 | - if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) | |
14272 | - panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); | |
14273 | ||
14274 | - Dprintk("spp_getpage %p\n", ptr); | |
14275 | + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { | |
14276 | + panic("set_pte_phys: cannot allocate page data %s\n", | |
14277 | + after_bootmem ? "after bootmem" : ""); | |
14278 | + } | |
14279 | + | |
14280 | + pr_debug("spp_getpage %p\n", ptr); | |
14281 | + | |
14282 | return ptr; | |
14283 | -} | |
14284 | +} | |
14285 | ||
14286 | #define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address)) | |
14287 | #define pud_offset_u(address) (level3_user_pgt + pud_index(address)) | |
14288 | ||
14289 | -static __init void set_pte_phys(unsigned long vaddr, | |
14290 | - unsigned long phys, pgprot_t prot, int user_mode) | |
14291 | +static __init void | |
14292 | +set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode) | |
14293 | { | |
14294 | pgd_t *pgd; | |
14295 | pud_t *pud; | |
14296 | pmd_t *pmd; | |
14297 | pte_t *pte, new_pte; | |
14298 | ||
14299 | - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); | |
14300 | + pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys); | |
14301 | ||
14302 | pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr)); | |
14303 | if (pgd_none(*pgd)) { | |
14304 | - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); | |
14305 | + printk(KERN_ERR | |
14306 | + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); | |
14307 | return; | |
14308 | } | |
14309 | pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr)); | |
14310 | if (pud_none(*pud)) { | |
14311 | - pmd = (pmd_t *) spp_getpage(); | |
14312 | + pmd = (pmd_t *) spp_getpage(); | |
14313 | make_page_readonly(pmd, XENFEAT_writable_page_tables); | |
14314 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); | |
14315 | if (pmd != pmd_offset(pud, 0)) { | |
14316 | - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); | |
14317 | + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", | |
14318 | + pmd, pmd_offset(pud, 0)); | |
14319 | return; | |
14320 | } | |
14321 | } | |
14322 | @@ -272,7 +228,7 @@ static __init void set_pte_phys(unsigned | |
14323 | make_page_readonly(pte, XENFEAT_writable_page_tables); | |
14324 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); | |
14325 | if (pte != pte_offset_kernel(pmd, 0)) { | |
14326 | - printk("PAGETABLE BUG #02!\n"); | |
14327 | + printk(KERN_ERR "PAGETABLE BUG #02!\n"); | |
14328 | return; | |
14329 | } | |
14330 | } | |
14331 | @@ -294,30 +250,30 @@ static __init void set_pte_phys(unsigned | |
14332 | __flush_tlb_one(vaddr); | |
14333 | } | |
14334 | ||
14335 | -static __init void set_pte_phys_ma(unsigned long vaddr, | |
14336 | - unsigned long phys, pgprot_t prot) | |
14337 | +static __init void | |
14338 | +set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot) | |
14339 | { | |
14340 | pgd_t *pgd; | |
14341 | pud_t *pud; | |
14342 | pmd_t *pmd; | |
14343 | pte_t *pte, new_pte; | |
14344 | ||
14345 | - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); | |
14346 | + pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys); | |
14347 | ||
14348 | pgd = pgd_offset_k(vaddr); | |
14349 | if (pgd_none(*pgd)) { | |
14350 | - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); | |
14351 | + printk(KERN_ERR | |
14352 | + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); | |
14353 | return; | |
14354 | } | |
14355 | pud = pud_offset(pgd, vaddr); | |
14356 | if (pud_none(*pud)) { | |
14357 | - | |
14358 | - pmd = (pmd_t *) spp_getpage(); | |
14359 | + pmd = (pmd_t *) spp_getpage(); | |
14360 | make_page_readonly(pmd, XENFEAT_writable_page_tables); | |
14361 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); | |
14362 | if (pmd != pmd_offset(pud, 0)) { | |
14363 | - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); | |
14364 | - return; | |
14365 | + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", | |
14366 | + pmd, pmd_offset(pud, 0)); | |
14367 | } | |
14368 | } | |
14369 | pmd = pmd_offset(pud, vaddr); | |
14370 | @@ -326,7 +282,7 @@ static __init void set_pte_phys_ma(unsig | |
14371 | make_page_readonly(pte, XENFEAT_writable_page_tables); | |
14372 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); | |
14373 | if (pte != pte_offset_kernel(pmd, 0)) { | |
14374 | - printk("PAGETABLE BUG #02!\n"); | |
14375 | + printk(KERN_ERR "PAGETABLE BUG #02!\n"); | |
14376 | return; | |
14377 | } | |
14378 | } | |
14379 | @@ -350,14 +306,44 @@ static __init void set_pte_phys_ma(unsig | |
14380 | __flush_tlb_one(vaddr); | |
14381 | } | |
14382 | ||
14383 | +#ifndef CONFIG_XEN | |
14384 | +/* | |
14385 | + * The head.S code sets up the kernel high mapping: | |
14386 | + * | |
14387 | + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) | |
14388 | + * | |
14389 | + * phys_addr holds the negative offset to the kernel, which is added | |
14390 | + * to the compile time generated pmds. This results in invalid pmds up | |
14391 | + * to the point where we hit the physaddr 0 mapping. | |
14392 | + * | |
14393 | + * We limit the mappings to the region from _text to _end. _end is | |
14394 | + * rounded up to the 2MB boundary. This catches the invalid pmds as | |
14395 | + * well, as they are located before _text: | |
14396 | + */ | |
14397 | +void __init cleanup_highmap(void) | |
14398 | +{ | |
14399 | + unsigned long vaddr = __START_KERNEL_map; | |
14400 | + unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; | |
14401 | + pmd_t *pmd = level2_kernel_pgt; | |
14402 | + pmd_t *last_pmd = pmd + PTRS_PER_PMD; | |
14403 | + | |
14404 | + for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { | |
14405 | + if (!pmd_present(*pmd)) | |
14406 | + continue; | |
14407 | + if (vaddr < (unsigned long) _text || vaddr > end) | |
14408 | + set_pmd(pmd, __pmd(0)); | |
14409 | + } | |
14410 | +} | |
14411 | +#endif | |
14412 | + | |
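cleanup_highmap() walks the 512-entry level2_kernel_pgt and clears every pmd that maps an address outside [_text, round_up(_end, PMD_SIZE)]. A userspace sketch of the same loop over stand-in values (all addresses here are illustrative; __START_KERNEL_map is 0xffffffff80000000 on this era of x86-64):

    #include <stdio.h>

    #define PMD_SIZE      (1UL << 21)
    #define PTRS_PER_PMD  512
    #define START_MAP     0xffffffff80000000UL
    #define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

    int main(void)
    {
            int pmd[PTRS_PER_PMD];  /* stand-in for level2_kernel_pgt */
            unsigned long text = START_MAP + 0x200000UL;    /* pretend _text */
            unsigned long end  = round_up(START_MAP + 0xa12345UL, PMD_SIZE) - 1;
            unsigned long vaddr = START_MAP;
            int i, kept = 0;

            for (i = 0; i < PTRS_PER_PMD; i++)
                    pmd[i] = 1;     /* pretend every entry is present */

            for (i = 0; i < PTRS_PER_PMD; i++, vaddr += PMD_SIZE) {
                    if (vaddr < text || vaddr > end)
                            pmd[i] = 0;     /* outside [_text, end]: clear */
                    else
                            kept++;
            }
            printf("kept %d pmd entries\n", kept);  /* 5 with these values */
            return 0;
    }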
14413 | /* NOTE: this is meant to be run only at boot */ | |
14414 | -void __init | |
14415 | -__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) | |
14416 | +void __init | |
14417 | +__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) | |
14418 | { | |
14419 | unsigned long address = __fix_to_virt(idx); | |
14420 | ||
14421 | if (idx >= __end_of_fixed_addresses) { | |
14422 | - printk("Invalid __set_fixmap\n"); | |
14423 | + printk(KERN_ERR "Invalid __set_fixmap\n"); | |
14424 | return; | |
14425 | } | |
14426 | switch (idx) { | |
14427 | @@ -375,16 +361,14 @@ __set_fixmap (enum fixed_addresses idx, | |
14428 | } | |
14429 | } | |
14430 | ||
14431 | -unsigned long __meminitdata table_start, table_end; | |
14432 | - | |
14433 | static __meminit void *alloc_static_page(unsigned long *phys) | |
14434 | { | |
14435 | unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map; | |
14436 | ||
14437 | if (after_bootmem) { | |
14438 | void *adr = (void *)get_zeroed_page(GFP_ATOMIC); | |
14439 | - | |
14440 | *phys = __pa(adr); | |
14441 | + | |
14442 | return adr; | |
14443 | } | |
14444 | ||
14445 | @@ -396,7 +380,7 @@ static __meminit void *alloc_static_page | |
14446 | ||
14447 | #define PTE_SIZE PAGE_SIZE | |
14448 | ||
14449 | -static inline int make_readonly(unsigned long paddr) | |
14450 | +static inline int __meminit make_readonly(unsigned long paddr) | |
14451 | { | |
14452 | extern char __vsyscall_0; | |
14453 | int readonly = 0; | |
14454 | @@ -430,33 +414,38 @@ static inline int make_readonly(unsigned | |
14455 | /* Must run before zap_low_mappings */ | |
14456 | __meminit void *early_ioremap(unsigned long addr, unsigned long size) | |
14457 | { | |
14458 | - unsigned long vaddr; | |
14459 | pmd_t *pmd, *last_pmd; | |
14460 | + unsigned long vaddr; | |
14461 | int i, pmds; | |
14462 | ||
14463 | pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; | |
14464 | vaddr = __START_KERNEL_map; | |
14465 | pmd = level2_kernel_pgt; | |
14466 | last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; | |
14467 | + | |
14468 | for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { | |
14469 | for (i = 0; i < pmds; i++) { | |
14470 | if (pmd_present(pmd[i])) | |
14471 | - goto next; | |
14472 | + goto continue_outer_loop; | |
14473 | } | |
14474 | vaddr += addr & ~PMD_MASK; | |
14475 | addr &= PMD_MASK; | |
14476 | + | |
14477 | for (i = 0; i < pmds; i++, addr += PMD_SIZE) | |
14478 | - set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); | |
14479 | - __flush_tlb(); | |
14480 | + set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); | |
14481 | + __flush_tlb_all(); | |
14482 | + | |
14483 | return (void *)vaddr; | |
14484 | - next: | |
14485 | +continue_outer_loop: | |
14486 | ; | |
14487 | } | |
14488 | printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); | |
14489 | return NULL; | |
14490 | } | |
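The pmds count in early_ioremap() - ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE - is "offset within the first 2 MiB slot, plus the size, rounded up to whole slots". A worked userspace example of that arithmetic (the address and size are made up):

    #include <stdio.h>

    #define PMD_SHIFT 21
    #define PMD_SIZE  (1UL << PMD_SHIFT)
    #define PMD_MASK  (~(PMD_SIZE - 1))

    int main(void)
    {
            unsigned long addr = 0xfedf0000UL;  /* hypothetical MMIO base */
            unsigned long size = 0x20000UL;     /* 128 KiB */
            unsigned long pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK)
                                 / PMD_SIZE;

            /* 128 KiB starting 64 KiB below a 2 MiB boundary -> 2 slots */
            printf("%lu pmd slot(s)\n", pmds);
            return 0;
    }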
14491 | ||
14492 | -/* To avoid virtual aliases later */ | |
14493 | +/* | |
14494 | + * To avoid virtual aliases later: | |
14495 | + */ | |
14496 | __meminit void early_iounmap(void *addr, unsigned long size) | |
14497 | { | |
14498 | unsigned long vaddr; | |
14499 | @@ -466,9 +455,11 @@ __meminit void early_iounmap(void *addr, | |
14500 | vaddr = (unsigned long)addr; | |
14501 | pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; | |
14502 | pmd = level2_kernel_pgt + pmd_index(vaddr); | |
14503 | + | |
14504 | for (i = 0; i < pmds; i++) | |
14505 | pmd_clear(pmd + i); | |
14506 | - __flush_tlb(); | |
14507 | + | |
14508 | + __flush_tlb_all(); | |
14509 | } | |
14510 | #endif | |
14511 | ||
14512 | @@ -517,18 +508,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned | |
14513 | static void __meminit | |
14514 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) | |
14515 | { | |
14516 | - pmd_t *pmd = pmd_offset(pud,0); | |
14517 | + pmd_t *pmd = pmd_offset(pud, 0); | |
14518 | spin_lock(&init_mm.page_table_lock); | |
14519 | phys_pmd_init(pmd, address, end); | |
14520 | spin_unlock(&init_mm.page_table_lock); | |
14521 | __flush_tlb_all(); | |
14522 | } | |
14523 | ||
14524 | -static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) | |
14525 | -{ | |
14526 | +static void __meminit | |
14527 | +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) | |
14528 | +{ | |
14529 | int i = pud_index(addr); | |
14530 | ||
14531 | - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { | |
14532 | + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { | |
14533 | unsigned long pmd_phys; | |
14534 | pud_t *pud = pud_page + pud_index(addr); | |
14535 | pmd_t *pmd; | |
14536 | @@ -550,8 +542,8 @@ static void __meminit phys_pud_init(pud_ | |
14537 | ||
14538 | early_make_page_readonly(pmd, XENFEAT_writable_page_tables); | |
14539 | } | |
14540 | - __flush_tlb(); | |
14541 | -} | |
14542 | + __flush_tlb_all(); | |
14543 | +} | |
14544 | ||
14545 | void __init xen_init_pt(void) | |
14546 | { | |
14547 | @@ -632,6 +624,7 @@ void __init xen_init_pt(void) | |
14548 | static void __init extend_init_mapping(unsigned long tables_space) | |
14549 | { | |
14550 | unsigned long va = __START_KERNEL_map; | |
14551 | + unsigned long start = start_pfn; | |
14552 | unsigned long phys, addr, *pte_page; | |
14553 | pmd_t *pmd; | |
14554 | pte_t *pte, new_pte; | |
14555 | @@ -682,6 +675,10 @@ static void __init extend_init_mapping(u | |
14556 | BUG(); | |
14557 | va += PAGE_SIZE; | |
14558 | } | |
14559 | + | |
14560 | + if (start_pfn > start) | |
14561 | + reserve_early(start << PAGE_SHIFT, | |
14562 | + start_pfn << PAGE_SHIFT, "INITMAP"); | |
14563 | } | |
14564 | ||
14565 | static void __init find_early_table_space(unsigned long end) | |
14566 | @@ -706,7 +703,7 @@ static void __init find_early_table_spac | |
14567 | (table_start << PAGE_SHIFT) + tables); | |
14568 | } | |
14569 | ||
14570 | -static void xen_finish_init_mapping(void) | |
14571 | +static void __init xen_finish_init_mapping(void) | |
14572 | { | |
14573 | unsigned long i, start, end; | |
14574 | ||
14575 | @@ -742,8 +739,10 @@ static void xen_finish_init_mapping(void | |
14576 | * Prefetch pte's for the bt_ioremap() area. It gets used before the | |
14577 | * boot-time allocator is online, so allocate-on-demand would fail. | |
14578 | */ | |
14579 | + early_ioremap_clear(); | |
14580 | for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++) | |
14581 | __set_fixmap(i, 0, __pgprot(0)); | |
14582 | + early_ioremap_reset(); | |
14583 | ||
14584 | /* Switch to the real shared_info page, and clear the dummy page. */ | |
14585 | set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); | |
14586 | @@ -764,20 +763,23 @@ static void xen_finish_init_mapping(void | |
14587 | table_end = start_pfn; | |
14588 | } | |
14589 | ||
14590 | -/* Setup the direct mapping of the physical memory at PAGE_OFFSET. | |
14591 | - This runs before bootmem is initialized and gets pages directly from the | |
14592 | - physical memory. To access them they are temporarily mapped. */ | |
14593 | +/* | |
14594 | + * Setup the direct mapping of the physical memory at PAGE_OFFSET. | |
14595 | + * This runs before bootmem is initialized and gets pages directly from | |
14596 | + * the physical memory. To access them they are temporarily mapped. | |
14597 | + */ | |
14598 | void __init_refok init_memory_mapping(unsigned long start, unsigned long end) | |
14599 | -{ | |
14600 | +{ | |
14601 | unsigned long next; | |
14602 | ||
14603 | - Dprintk("init_memory_mapping\n"); | |
14604 | + pr_debug("init_memory_mapping\n"); | |
14605 | ||
14606 | - /* | |
14607 | + /* | |
14608 | * Find space for the kernel direct mapping tables. | |
14609 | - * Later we should allocate these tables in the local node of the memory | |
14610 | - * mapped. Unfortunately this is done currently before the nodes are | |
14611 | - * discovered. | |
14612 | + * | |
14613 | + * Later we should allocate these tables in the local node of the | |
14614 | + * memory mapped. Unfortunately this is done currently before the | |
14615 | + * nodes are discovered. | |
14616 | */ | |
14617 | if (!after_bootmem) | |
14618 | find_early_table_space(end); | |
14619 | @@ -786,8 +788,8 @@ void __init_refok init_memory_mapping(un | |
14620 | end = (unsigned long)__va(end); | |
14621 | ||
14622 | for (; start < end; start = next) { | |
14623 | - unsigned long pud_phys; | |
14624 | pgd_t *pgd = pgd_offset_k(start); | |
14625 | + unsigned long pud_phys; | |
14626 | pud_t *pud; | |
14627 | ||
14628 | if (after_bootmem) | |
14629 | @@ -795,8 +797,8 @@ void __init_refok init_memory_mapping(un | |
14630 | else | |
14631 | pud = alloc_static_page(&pud_phys); | |
14632 | next = start + PGDIR_SIZE; | |
14633 | - if (next > end) | |
14634 | - next = end; | |
14635 | + if (next > end) | |
14636 | + next = end; | |
14637 | phys_pud_init(pud, __pa(start), __pa(next)); | |
14638 | if (!after_bootmem) { | |
14639 | early_make_page_readonly(pud, XENFEAT_writable_page_tables); | |
14640 | @@ -810,12 +812,17 @@ void __init_refok init_memory_mapping(un | |
14641 | } | |
14642 | ||
14643 | __flush_tlb_all(); | |
14644 | + | |
14645 | + if (!after_bootmem) | |
14646 | + reserve_early(table_start << PAGE_SHIFT, | |
14647 | + table_end << PAGE_SHIFT, "PGTABLE"); | |
14648 | } | |
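init_memory_mapping() advances through the range one pgd entry (one pud page) at a time, clamping the final step to end. A sketch of just that stepping, assuming 2.6.25-era x86-64 constants (PGDIR_SHIFT of 39 and a direct-map base of 0xffff810000000000 - both assumptions, not taken from the patch):

    #include <stdio.h>

    #define PGDIR_SHIFT 39
    #define PGDIR_SIZE  (1UL << PGDIR_SHIFT)

    int main(void)
    {
            unsigned long start = 0xffff810000000000UL;   /* __va(0) */
            unsigned long end   = start + 0x100000000UL;  /* 4 GiB of RAM */
            unsigned long next;

            for (; start < end; start = next) {
                    next = start + PGDIR_SIZE;
                    if (next > end)
                            next = end;
                    printf("one pud page covers %#lx..%#lx\n", start, next);
            }
            return 0;
    }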
14649 | ||
14650 | #ifndef CONFIG_NUMA | |
14651 | void __init paging_init(void) | |
14652 | { | |
14653 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | |
14654 | + | |
14655 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | |
14656 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; | |
14657 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; | |
14658 | @@ -829,40 +836,6 @@ void __init paging_init(void) | |
14659 | } | |
14660 | #endif | |
14661 | ||
14662 | -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches | |
14663 | - from the CPU leading to inconsistent cache lines. address and size | |
14664 | - must be aligned to 2MB boundaries. | |
14665 | - Does nothing when the mapping doesn't exist. */ | |
14666 | -void __init clear_kernel_mapping(unsigned long address, unsigned long size) | |
14667 | -{ | |
14668 | - unsigned long end = address + size; | |
14669 | - | |
14670 | - BUG_ON(address & ~LARGE_PAGE_MASK); | |
14671 | - BUG_ON(size & ~LARGE_PAGE_MASK); | |
14672 | - | |
14673 | - for (; address < end; address += LARGE_PAGE_SIZE) { | |
14674 | - pgd_t *pgd = pgd_offset_k(address); | |
14675 | - pud_t *pud; | |
14676 | - pmd_t *pmd; | |
14677 | - if (pgd_none(*pgd)) | |
14678 | - continue; | |
14679 | - pud = pud_offset(pgd, address); | |
14680 | - if (pud_none(*pud)) | |
14681 | - continue; | |
14682 | - pmd = pmd_offset(pud, address); | |
14683 | - if (!pmd || pmd_none(*pmd)) | |
14684 | - continue; | |
14685 | - if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) { | |
14686 | - /* Could handle this, but it should not happen currently. */ | |
14687 | - printk(KERN_ERR | |
14688 | - "clear_kernel_mapping: mapping has been split. will leak memory\n"); | |
14689 | - pmd_ERROR(*pmd); | |
14690 | - } | |
14691 | - set_pmd(pmd, __pmd(0)); | |
14692 | - } | |
14693 | - __flush_tlb_all(); | |
14694 | -} | |
14695 | - | |
14696 | /* | |
14697 | * Memory hotplug specific functions | |
14698 | */ | |
14699 | @@ -888,16 +861,12 @@ int arch_add_memory(int nid, u64 start, | |
14700 | unsigned long nr_pages = size >> PAGE_SHIFT; | |
14701 | int ret; | |
14702 | ||
14703 | - init_memory_mapping(start, (start + size -1)); | |
14704 | + init_memory_mapping(start, start + size-1); | |
14705 | ||
14706 | ret = __add_pages(zone, start_pfn, nr_pages); | |
14707 | - if (ret) | |
14708 | - goto error; | |
14709 | + WARN_ON(1); | |
14710 | ||
14711 | return ret; | |
14712 | -error: | |
14713 | - printk("%s: Problem encountered in __add_pages!\n", __func__); | |
14714 | - return ret; | |
14715 | } | |
14716 | EXPORT_SYMBOL_GPL(arch_add_memory); | |
14717 | ||
14718 | @@ -911,36 +880,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to | |
14719 | ||
14720 | #endif /* CONFIG_MEMORY_HOTPLUG */ | |
14721 | ||
14722 | -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | |
14723 | -/* | |
14724 | - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, | |
14725 | - * just online the pages. | |
14726 | - */ | |
14727 | -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) | |
14728 | -{ | |
14729 | - int err = -EIO; | |
14730 | - unsigned long pfn; | |
14731 | - unsigned long total = 0, mem = 0; | |
14732 | - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { | |
14733 | - if (pfn_valid(pfn)) { | |
14734 | - online_page(pfn_to_page(pfn)); | |
14735 | - err = 0; | |
14736 | - mem++; | |
14737 | - } | |
14738 | - total++; | |
14739 | - } | |
14740 | - if (!err) { | |
14741 | - z->spanned_pages += total; | |
14742 | - z->present_pages += mem; | |
14743 | - z->zone_pgdat->node_spanned_pages += total; | |
14744 | - z->zone_pgdat->node_present_pages += mem; | |
14745 | - } | |
14746 | - return err; | |
14747 | -} | |
14748 | -#endif | |
14749 | - | |
14750 | -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, | |
14751 | - kcore_vsyscall; | |
14752 | +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, | |
14753 | + kcore_modules, kcore_vsyscall; | |
14754 | ||
14755 | void __init mem_init(void) | |
14756 | { | |
14757 | @@ -949,8 +890,7 @@ void __init mem_init(void) | |
14758 | ||
14759 | pci_iommu_alloc(); | |
14760 | ||
14761 | - /* clear the zero-page */ | |
14762 | - memset(empty_zero_page, 0, PAGE_SIZE); | |
14763 | + /* clear_bss() already cleared the empty_zero_page */ | |
14764 | ||
14765 | reservedpages = 0; | |
14766 | ||
14767 | @@ -968,7 +908,6 @@ void __init mem_init(void) | |
14768 | } | |
14769 | reservedpages = end_pfn - totalram_pages - | |
14770 | absent_pages_in_range(0, end_pfn); | |
14771 | - | |
14772 | after_bootmem = 1; | |
14773 | ||
14774 | codesize = (unsigned long) &_etext - (unsigned long) &_text; | |
14775 | @@ -976,46 +915,64 @@ void __init mem_init(void) | |
14776 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | |
14777 | ||
14778 | /* Register memory areas for /proc/kcore */ | |
14779 | - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | |
14780 | - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | |
14781 | + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | |
14782 | + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | |
14783 | VMALLOC_END-VMALLOC_START); | |
14784 | kclist_add(&kcore_kernel, &_stext, _end - _stext); | |
14785 | kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); | |
14786 | - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, | |
14787 | + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, | |
14788 | VSYSCALL_END - VSYSCALL_START); | |
14789 | ||
14790 | - printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", | |
14791 | + printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " | |
14792 | + "%ldk reserved, %ldk data, %ldk init)\n", | |
14793 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | |
14794 | end_pfn << (PAGE_SHIFT-10), | |
14795 | codesize >> 10, | |
14796 | reservedpages << (PAGE_SHIFT-10), | |
14797 | datasize >> 10, | |
14798 | initsize >> 10); | |
14799 | + | |
14800 | + cpa_init(); | |
14801 | } | |
14802 | ||
14803 | void free_init_pages(char *what, unsigned long begin, unsigned long end) | |
14804 | { | |
14805 | - unsigned long addr; | |
14806 | + unsigned long addr = begin; | |
14807 | ||
14808 | - if (begin >= end) | |
14809 | + if (addr >= end) | |
14810 | return; | |
14811 | ||
14812 | + /* | |
14813 | + * If debugging page accesses then do not free this memory but | |
14814 | + * mark it not present - | |
14815 | + * create a kernel page fault: | |
14816 | + */ | |
14817 | +#ifdef CONFIG_DEBUG_PAGEALLOC | |
14818 | + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", | |
14819 | + begin, PAGE_ALIGN(end)); | |
14820 | + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); | |
14821 | +#else | |
14822 | printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); | |
14823 | - for (addr = begin; addr < end; addr += PAGE_SIZE) { | |
14824 | + | |
14825 | + for (; addr < end; addr += PAGE_SIZE) { | |
14826 | ClearPageReserved(virt_to_page(addr)); | |
14827 | init_page_count(virt_to_page(addr)); | |
14828 | memset((void *)(addr & ~(PAGE_SIZE-1)), | |
14829 | POISON_FREE_INITMEM, PAGE_SIZE); | |
14830 | if (addr >= __START_KERNEL_map) { | |
14831 | /* make_readonly() reports all kernel addresses. */ | |
14832 | - __make_page_writable(__va(__pa(addr))); | |
14833 | - change_page_attr_addr(addr, 1, __pgprot(0)); | |
14834 | + if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)), | |
14835 | + pfn_pte(__pa(addr) >> PAGE_SHIFT, | |
14836 | + PAGE_KERNEL), | |
14837 | + 0)) | |
14838 | + BUG(); | |
14839 | + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) | |
14840 | + BUG(); | |
14841 | } | |
14842 | free_page(addr); | |
14843 | totalram_pages++; | |
14844 | } | |
14845 | - if (addr > __START_KERNEL_map) | |
14846 | - global_flush_tlb(); | |
14847 | +#endif | |
14848 | } | |
14849 | ||
14850 | void free_initmem(void) | |
14851 | @@ -1026,6 +983,8 @@ void free_initmem(void) | |
14852 | } | |
14853 | ||
14854 | #ifdef CONFIG_DEBUG_RODATA | |
14855 | +const int rodata_test_data = 0xC3; | |
14856 | +EXPORT_SYMBOL_GPL(rodata_test_data); | |
14857 | ||
14858 | void mark_rodata_ro(void) | |
14859 | { | |
14860 | @@ -1047,18 +1006,27 @@ void mark_rodata_ro(void) | |
14861 | if (end <= start) | |
14862 | return; | |
14863 | ||
14864 | - change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); | |
14865 | ||
14866 | printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", | |
14867 | (end - start) >> 10); | |
14868 | + set_memory_ro(start, (end - start) >> PAGE_SHIFT); | |
14869 | ||
14870 | /* | |
14871 | - * change_page_attr_addr() requires a global_flush_tlb() call after it. | |
14872 | - * We do this after the printk so that if something went wrong in the | |
14873 | - * change, the printk gets out at least to give a better debug hint | |
14874 | - * of who is the culprit. | |
14875 | + * The rodata section (but not the kernel text!) should also be | |
14876 | + * not-executable. | |
14877 | */ | |
14878 | - global_flush_tlb(); | |
14879 | + start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; | |
14880 | + set_memory_nx(start, (end - start) >> PAGE_SHIFT); | |
14881 | + | |
14882 | + rodata_test(); | |
14883 | + | |
14884 | +#ifdef CONFIG_CPA_DEBUG | |
14885 | + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); | |
14886 | + set_memory_rw(start, (end-start) >> PAGE_SHIFT); | |
14887 | + | |
14888 | + printk(KERN_INFO "Testing CPA: again\n"); | |
14889 | + set_memory_ro(start, (end-start) >> PAGE_SHIFT); | |
14890 | +#endif | |
14891 | } | |
14892 | #endif | |
14893 | ||
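The hunk above is one instance of the tree-wide 2.6.25 switch from change_page_attr_addr() plus global_flush_tlb() to the set_memory_*() family, which flushes internally. A side-by-side sketch, with nrpages standing for (end - start) >> PAGE_SHIFT as computed above:

	/* Pre-2.6.25 CPA API (removed above): explicit flush required. */
	change_page_attr_addr(start, nrpages, PAGE_KERNEL_RO);
	global_flush_tlb();

	/* 2.6.25 API (added above): the flush is issued internally. */
	set_memory_ro(start, nrpages);	/* clears _PAGE_RW */
	set_memory_nx(start, nrpages);	/* sets _PAGE_NX   */

set_memory_rw(), exercised by the CONFIG_CPA_DEBUG self-test above, is the inverse of set_memory_ro().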
14894 | @@ -1069,17 +1037,21 @@ void free_initrd_mem(unsigned long start | |
14895 | } | |
14896 | #endif | |
14897 | ||
14898 | -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) | |
14899 | -{ | |
14900 | +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) | |
14901 | +{ | |
14902 | #ifdef CONFIG_NUMA | |
14903 | int nid = phys_to_nid(phys); | |
14904 | #endif | |
14905 | unsigned long pfn = phys >> PAGE_SHIFT; | |
14906 | + | |
14907 | if (pfn >= end_pfn) { | |
14908 | - /* This can happen with kdump kernels when accessing firmware | |
14909 | - tables. */ | |
14910 | + /* | |
14911 | + * This can happen with kdump kernels when accessing | |
14912 | + * firmware tables: | |
14913 | + */ | |
14914 | if (pfn < end_pfn_map) | |
14915 | return; | |
14916 | + | |
14917 | printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", | |
14918 | phys, len); | |
14919 | return; | |
14920 | @@ -1087,9 +1059,9 @@ void __init reserve_bootmem_generic(unsi | |
14921 | ||
14922 | /* Should check here against the e820 map to avoid double free */ | |
14923 | #ifdef CONFIG_NUMA | |
14924 | - reserve_bootmem_node(NODE_DATA(nid), phys, len); | |
14925 | -#else | |
14926 | - reserve_bootmem(phys, len); | |
14927 | + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); | |
14928 | +#else | |
14929 | + reserve_bootmem(phys, len, BOOTMEM_DEFAULT); | |
14930 | #endif | |
14931 | #ifndef CONFIG_XEN | |
14932 | if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { | |
14933 | @@ -1101,46 +1073,49 @@ void __init reserve_bootmem_generic(unsi | |
14934 | #endif | |
14935 | } | |
14936 | ||
14937 | -int kern_addr_valid(unsigned long addr) | |
14938 | -{ | |
14939 | +int kern_addr_valid(unsigned long addr) | |
14940 | +{ | |
14941 | unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; | |
14942 | - pgd_t *pgd; | |
14943 | - pud_t *pud; | |
14944 | - pmd_t *pmd; | |
14945 | - pte_t *pte; | |
14946 | + pgd_t *pgd; | |
14947 | + pud_t *pud; | |
14948 | + pmd_t *pmd; | |
14949 | + pte_t *pte; | |
14950 | ||
14951 | if (above != 0 && above != -1UL) | |
14952 | - return 0; | |
14953 | - | |
14954 | + return 0; | |
14955 | + | |
14956 | pgd = pgd_offset_k(addr); | |
14957 | if (pgd_none(*pgd)) | |
14958 | return 0; | |
14959 | ||
14960 | pud = pud_offset(pgd, addr); | |
14961 | if (pud_none(*pud)) | |
14962 | - return 0; | |
14963 | + return 0; | |
14964 | ||
14965 | pmd = pmd_offset(pud, addr); | |
14966 | if (pmd_none(*pmd)) | |
14967 | return 0; | |
14968 | + | |
14969 | if (pmd_large(*pmd)) | |
14970 | return pfn_valid(pmd_pfn(*pmd)); | |
14971 | ||
14972 | pte = pte_offset_kernel(pmd, addr); | |
14973 | if (pte_none(*pte)) | |
14974 | return 0; | |
14975 | + | |
14976 | return pfn_valid(pte_pfn(*pte)); | |
14977 | } | |
14978 | ||
14979 | -/* A pseudo VMA to allow ptrace access for the vsyscall page. This only | |
14980 | - covers the 64bit vsyscall page now. 32bit has a real VMA now and does | |
14981 | - not need special handling anymore. */ | |
14982 | - | |
14983 | +/* | |
14984 | + * A pseudo VMA to allow ptrace access for the vsyscall page. This only | |
14985 | + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does | |
14986 | + * not need special handling anymore: | |
14987 | + */ | |
14988 | static struct vm_area_struct gate_vma = { | |
14989 | - .vm_start = VSYSCALL_START, | |
14990 | - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), | |
14991 | - .vm_page_prot = PAGE_READONLY_EXEC, | |
14992 | - .vm_flags = VM_READ | VM_EXEC | |
14993 | + .vm_start = VSYSCALL_START, | |
14994 | + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), | |
14995 | + .vm_page_prot = PAGE_READONLY_EXEC, | |
14996 | + .vm_flags = VM_READ | VM_EXEC | |
14997 | }; | |
14998 | ||
14999 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | |
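The pseudo-VMA above is what lets generic mm code (ptrace and other get_user_pages() callers) treat the fixed 64-bit vsyscall page like an ordinary mapping. A conceptual sketch of the consumer side, loosely modelled on the fallback in mm/memory.c; error handling elided:

	/* Sketch: fall back to the gate VMA when no real VMA covers addr. */
	vma = find_vma(mm, addr);
	if (!vma && in_gate_area(tsk, addr))
		vma = get_gate_vma(tsk);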
15000 | @@ -1155,14 +1130,17 @@ struct vm_area_struct *get_gate_vma(stru | |
15001 | int in_gate_area(struct task_struct *task, unsigned long addr) | |
15002 | { | |
15003 | struct vm_area_struct *vma = get_gate_vma(task); | |
15004 | + | |
15005 | if (!vma) | |
15006 | return 0; | |
15007 | + | |
15008 | return (addr >= vma->vm_start) && (addr < vma->vm_end); | |
15009 | } | |
15010 | ||
15011 | -/* Use this when you have no reliable task/vma, typically from interrupt | |
15012 | - * context. It is less reliable than using the task's vma and may give | |
15013 | - * false positives. | |
15014 | +/* | |
15015 | + * Use this when you have no reliable task/vma, typically from interrupt | |
15016 | + * context. It is less reliable than using the task's vma and may give | |
15017 | + * false positives: | |
15018 | */ | |
15019 | int in_gate_area_no_task(unsigned long addr) | |
15020 | { | |
15021 | @@ -1182,8 +1160,8 @@ const char *arch_vma_name(struct vm_area | |
15022 | /* | |
15023 | * Initialise the sparsemem vmemmap using huge-pages at the PMD level. | |
15024 | */ | |
15025 | -int __meminit vmemmap_populate(struct page *start_page, | |
15026 | - unsigned long size, int node) | |
15027 | +int __meminit | |
15028 | +vmemmap_populate(struct page *start_page, unsigned long size, int node) | |
15029 | { | |
15030 | unsigned long addr = (unsigned long)start_page; | |
15031 | unsigned long end = (unsigned long)(start_page + size); | |
15032 | @@ -1198,6 +1176,7 @@ int __meminit vmemmap_populate(struct pa | |
15033 | pgd = vmemmap_pgd_populate(addr, node); | |
15034 | if (!pgd) | |
15035 | return -ENOMEM; | |
15036 | + | |
15037 | pud = vmemmap_pud_populate(pgd, addr, node); | |
15038 | if (!pud) | |
15039 | return -ENOMEM; | |
15040 | @@ -1205,20 +1184,22 @@ int __meminit vmemmap_populate(struct pa | |
15041 | pmd = pmd_offset(pud, addr); | |
15042 | if (pmd_none(*pmd)) { | |
15043 | pte_t entry; | |
15044 | - void *p = vmemmap_alloc_block(PMD_SIZE, node); | |
15045 | + void *p; | |
15046 | + | |
15047 | + p = vmemmap_alloc_block(PMD_SIZE, node); | |
15048 | if (!p) | |
15049 | return -ENOMEM; | |
15050 | ||
15051 | - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); | |
15052 | - mk_pte_huge(entry); | |
15053 | - set_pmd(pmd, __pmd(pte_val(entry))); | |
15054 | + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, | |
15055 | + PAGE_KERNEL_LARGE); | |
15056 | + set_pmd(pmd, __pmd_ma(__pte_val(entry))); | |
15057 | ||
15058 | printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", | |
15059 | addr, addr + PMD_SIZE - 1, p, node); | |
15060 | - } else | |
15061 | + } else { | |
15062 | vmemmap_verify((pte_t *)pmd, node, addr, next); | |
15063 | + } | |
15064 | } | |
15065 | - | |
15066 | return 0; | |
15067 | } | |
15068 | #endif | |
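Two details in the vmemmap hunk above are easy to miss: PAGE_KERNEL_LARGE already carries _PAGE_PSE, which is why the separate mk_pte_huge() step disappears, and the Xen build writes the pmd through the machine-address variants so the hypervisor sees machine frame numbers. Condensed as a sketch only, with p, pmd and node as in the function above:

	void *p = vmemmap_alloc_block(PMD_SIZE, node);	/* one 2 MiB block */
	pte_t entry = pfn_pte(__pa(p) >> PAGE_SHIFT,	/* PSE bit included */
			      PAGE_KERNEL_LARGE);
	set_pmd(pmd, __pmd_ma(__pte_val(entry)));	/* machine-address form */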
15069 | --- a/arch/x86/mm/ioremap_32-xen.c | |
15070 | +++ /dev/null | |
15071 | @@ -1,445 +0,0 @@ | |
15072 | -/* | |
15073 | - * arch/i386/mm/ioremap.c | |
15074 | - * | |
15075 | - * Re-map IO memory to kernel address space so that we can access it. | |
15076 | - * This is needed for high PCI addresses that aren't mapped in the | |
15077 | - * 640k-1MB IO memory area on PC's | |
15078 | - * | |
15079 | - * (C) Copyright 1995 1996 Linus Torvalds | |
15080 | - */ | |
15081 | - | |
15082 | -#include <linux/vmalloc.h> | |
15083 | -#include <linux/init.h> | |
15084 | -#include <linux/slab.h> | |
15085 | -#include <linux/module.h> | |
15086 | -#include <linux/io.h> | |
15087 | -#include <linux/sched.h> | |
15088 | -#include <asm/fixmap.h> | |
15089 | -#include <asm/cacheflush.h> | |
15090 | -#include <asm/tlbflush.h> | |
15091 | -#include <asm/pgtable.h> | |
15092 | -#include <asm/pgalloc.h> | |
15093 | - | |
15094 | -#define ISA_START_ADDRESS 0x0 | |
15095 | -#define ISA_END_ADDRESS 0x100000 | |
15096 | - | |
15097 | -static int direct_remap_area_pte_fn(pte_t *pte, | |
15098 | - struct page *pmd_page, | |
15099 | - unsigned long address, | |
15100 | - void *data) | |
15101 | -{ | |
15102 | - mmu_update_t **v = (mmu_update_t **)data; | |
15103 | - | |
15104 | - BUG_ON(!pte_none(*pte)); | |
15105 | - | |
15106 | - (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) << | |
15107 | - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); | |
15108 | - (*v)++; | |
15109 | - | |
15110 | - return 0; | |
15111 | -} | |
15112 | - | |
15113 | -static int __direct_remap_pfn_range(struct mm_struct *mm, | |
15114 | - unsigned long address, | |
15115 | - unsigned long mfn, | |
15116 | - unsigned long size, | |
15117 | - pgprot_t prot, | |
15118 | - domid_t domid) | |
15119 | -{ | |
15120 | - int rc; | |
15121 | - unsigned long i, start_address; | |
15122 | - mmu_update_t *u, *v, *w; | |
15123 | - | |
15124 | - u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); | |
15125 | - if (u == NULL) | |
15126 | - return -ENOMEM; | |
15127 | - | |
15128 | - start_address = address; | |
15129 | - | |
15130 | - flush_cache_all(); | |
15131 | - | |
15132 | - for (i = 0; i < size; i += PAGE_SIZE) { | |
15133 | - if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) { | |
15134 | - /* Flush a full batch after filling in the PTE ptrs. */ | |
15135 | - rc = apply_to_page_range(mm, start_address, | |
15136 | - address - start_address, | |
15137 | - direct_remap_area_pte_fn, &w); | |
15138 | - if (rc) | |
15139 | - goto out; | |
15140 | - rc = -EFAULT; | |
15141 | - if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) | |
15142 | - goto out; | |
15143 | - v = w = u; | |
15144 | - start_address = address; | |
15145 | - } | |
15146 | - | |
15147 | - /* | |
15148 | - * Fill in the machine address: PTE ptr is done later by | |
15149 | - * apply_to_page_range(). | |
15150 | - */ | |
15151 | - v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO; | |
15152 | - | |
15153 | - mfn++; | |
15154 | - address += PAGE_SIZE; | |
15155 | - v++; | |
15156 | - } | |
15157 | - | |
15158 | - if (v != u) { | |
15159 | - /* Final batch. */ | |
15160 | - rc = apply_to_page_range(mm, start_address, | |
15161 | - address - start_address, | |
15162 | - direct_remap_area_pte_fn, &w); | |
15163 | - if (rc) | |
15164 | - goto out; | |
15165 | - rc = -EFAULT; | |
15166 | - if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) | |
15167 | - goto out; | |
15168 | - } | |
15169 | - | |
15170 | - rc = 0; | |
15171 | - | |
15172 | - out: | |
15173 | - flush_tlb_all(); | |
15174 | - | |
15175 | - free_page((unsigned long)u); | |
15176 | - | |
15177 | - return rc; | |
15178 | -} | |
15179 | - | |
15180 | -int direct_remap_pfn_range(struct vm_area_struct *vma, | |
15181 | - unsigned long address, | |
15182 | - unsigned long mfn, | |
15183 | - unsigned long size, | |
15184 | - pgprot_t prot, | |
15185 | - domid_t domid) | |
15186 | -{ | |
15187 | - if (xen_feature(XENFEAT_auto_translated_physmap)) | |
15188 | - return remap_pfn_range(vma, address, mfn, size, prot); | |
15189 | - | |
15190 | - if (domid == DOMID_SELF) | |
15191 | - return -EINVAL; | |
15192 | - | |
15193 | - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | |
15194 | - | |
15195 | - vma->vm_mm->context.has_foreign_mappings = 1; | |
15196 | - | |
15197 | - return __direct_remap_pfn_range( | |
15198 | - vma->vm_mm, address, mfn, size, prot, domid); | |
15199 | -} | |
15200 | -EXPORT_SYMBOL(direct_remap_pfn_range); | |
15201 | - | |
15202 | -int direct_kernel_remap_pfn_range(unsigned long address, | |
15203 | - unsigned long mfn, | |
15204 | - unsigned long size, | |
15205 | - pgprot_t prot, | |
15206 | - domid_t domid) | |
15207 | -{ | |
15208 | - return __direct_remap_pfn_range( | |
15209 | - &init_mm, address, mfn, size, prot, domid); | |
15210 | -} | |
15211 | -EXPORT_SYMBOL(direct_kernel_remap_pfn_range); | |
15212 | - | |
15213 | -static int lookup_pte_fn( | |
15214 | - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) | |
15215 | -{ | |
15216 | - uint64_t *ptep = (uint64_t *)data; | |
15217 | - if (ptep) | |
15218 | - *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << | |
15219 | - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); | |
15220 | - return 0; | |
15221 | -} | |
15222 | - | |
15223 | -int create_lookup_pte_addr(struct mm_struct *mm, | |
15224 | - unsigned long address, | |
15225 | - uint64_t *ptep) | |
15226 | -{ | |
15227 | - return apply_to_page_range(mm, address, PAGE_SIZE, | |
15228 | - lookup_pte_fn, ptep); | |
15229 | -} | |
15230 | - | |
15231 | -EXPORT_SYMBOL(create_lookup_pte_addr); | |
15232 | - | |
15233 | -static int noop_fn( | |
15234 | - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) | |
15235 | -{ | |
15236 | - return 0; | |
15237 | -} | |
15238 | - | |
15239 | -int touch_pte_range(struct mm_struct *mm, | |
15240 | - unsigned long address, | |
15241 | - unsigned long size) | |
15242 | -{ | |
15243 | - return apply_to_page_range(mm, address, size, noop_fn, NULL); | |
15244 | -} | |
15245 | - | |
15246 | -EXPORT_SYMBOL(touch_pte_range); | |
15247 | - | |
15248 | -/* | |
15249 | - * Does @address reside within a non-highmem page that is local to this virtual | |
15250 | - * machine (i.e., not an I/O page, nor a memory page belonging to another VM). | |
15251 | - * See the comment that accompanies mfn_to_local_pfn() in page.h to understand | |
15252 | - * why this works. | |
15253 | - */ | |
15254 | -static inline int is_local_lowmem(unsigned long address) | |
15255 | -{ | |
15256 | - extern unsigned long max_low_pfn; | |
15257 | - return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn); | |
15258 | -} | |
15259 | - | |
15260 | -/* | |
15261 | - * Generic mapping function (not visible outside): | |
15262 | - */ | |
15263 | - | |
15264 | -/* | |
15265 | - * Remap an arbitrary physical address space into the kernel virtual | |
15266 | - * address space. Needed when the kernel wants to access high addresses | |
15267 | - * directly. | |
15268 | - * | |
15269 | - * NOTE! We need to allow non-page-aligned mappings too: we will obviously | |
15270 | - * have to convert them into an offset in a page-aligned mapping, but the | |
15271 | - * caller shouldn't need to know that small detail. | |
15272 | - */ | |
15273 | -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) | |
15274 | -{ | |
15275 | - void __iomem * addr; | |
15276 | - struct vm_struct * area; | |
15277 | - unsigned long offset, last_addr; | |
15278 | - pgprot_t prot; | |
15279 | - domid_t domid = DOMID_IO; | |
15280 | - | |
15281 | - /* Don't allow wraparound or zero size */ | |
15282 | - last_addr = phys_addr + size - 1; | |
15283 | - if (!size || last_addr < phys_addr) | |
15284 | - return NULL; | |
15285 | - | |
15286 | - /* | |
15287 | - * Don't remap the low PCI/ISA area, it's always mapped.. | |
15288 | - */ | |
15289 | - if (is_initial_xendomain() && | |
15290 | - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | |
15291 | - return (void __iomem *) isa_bus_to_virt(phys_addr); | |
15292 | - | |
15293 | - /* | |
15294 | - * Don't allow anybody to remap normal RAM that we're using.. | |
15295 | - */ | |
15296 | - if (is_local_lowmem(phys_addr)) { | |
15297 | - char *t_addr, *t_end; | |
15298 | - struct page *page; | |
15299 | - | |
15300 | - t_addr = bus_to_virt(phys_addr); | |
15301 | - t_end = t_addr + (size - 1); | |
15302 | - | |
15303 | - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) | |
15304 | - if(!PageReserved(page)) | |
15305 | - return NULL; | |
15306 | - | |
15307 | - domid = DOMID_SELF; | |
15308 | - } | |
15309 | - | |
15310 | - prot = __pgprot(_KERNPG_TABLE | flags); | |
15311 | - | |
15312 | - /* | |
15313 | - * Mappings have to be page-aligned | |
15314 | - */ | |
15315 | - offset = phys_addr & ~PAGE_MASK; | |
15316 | - phys_addr &= PAGE_MASK; | |
15317 | - size = PAGE_ALIGN(last_addr+1) - phys_addr; | |
15318 | - | |
15319 | - /* | |
15320 | - * Ok, go for it.. | |
15321 | - */ | |
15322 | - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); | |
15323 | - if (!area) | |
15324 | - return NULL; | |
15325 | - area->phys_addr = phys_addr; | |
15326 | - addr = (void __iomem *) area->addr; | |
15327 | - if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr, | |
15328 | - phys_addr>>PAGE_SHIFT, | |
15329 | - size, prot, domid)) { | |
15330 | - vunmap((void __force *) addr); | |
15331 | - return NULL; | |
15332 | - } | |
15333 | - return (void __iomem *) (offset + (char __iomem *)addr); | |
15334 | -} | |
15335 | -EXPORT_SYMBOL(__ioremap); | |
15336 | - | |
15337 | -/** | |
15338 | - * ioremap_nocache - map bus memory into CPU space | |
15339 | - * @offset: bus address of the memory | |
15340 | - * @size: size of the resource to map | |
15341 | - * | |
15342 | - * ioremap_nocache performs a platform specific sequence of operations to | |
15343 | - * make bus memory CPU accessible via the readb/readw/readl/writeb/ | |
15344 | - * writew/writel functions and the other mmio helpers. The returned | |
15345 | - * address is not guaranteed to be usable directly as a virtual | |
15346 | - * address. | |
15347 | - * | |
15348 | - * This version of ioremap ensures that the memory is marked uncachable | |
15349 | - * on the CPU as well as honouring existing caching rules from things like | |
15350 | - * the PCI bus. Note that there are other caches and buffers on many | |
15351 | - * busses. In particular driver authors should read up on PCI writes | |
15352 | - * | |
15353 | - * It's useful if some control registers are in such an area and | |
15354 | - * write combining or read caching is not desirable: | |
15355 | - * | |
15356 | - * Must be freed with iounmap. | |
15357 | - */ | |
15358 | - | |
15359 | -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) | |
15360 | -{ | |
15361 | - unsigned long last_addr; | |
15362 | - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); | |
15363 | - if (!p) | |
15364 | - return p; | |
15365 | - | |
15366 | - /* Guaranteed to be > phys_addr, as per __ioremap() */ | |
15367 | - last_addr = phys_addr + size - 1; | |
15368 | - | |
15369 | - if (is_local_lowmem(last_addr)) { | |
15370 | - struct page *ppage = virt_to_page(bus_to_virt(phys_addr)); | |
15371 | - unsigned long npages; | |
15372 | - | |
15373 | - phys_addr &= PAGE_MASK; | |
15374 | - | |
15375 | - /* This might overflow and become zero.. */ | |
15376 | - last_addr = PAGE_ALIGN(last_addr); | |
15377 | - | |
15378 | - /* .. but that's ok, because modulo-2**n arithmetic will make | |
15379 | - * the page-aligned "last - first" come out right. | |
15380 | - */ | |
15381 | - npages = (last_addr - phys_addr) >> PAGE_SHIFT; | |
15382 | - | |
15383 | - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { | |
15384 | - iounmap(p); | |
15385 | - p = NULL; | |
15386 | - } | |
15387 | - global_flush_tlb(); | |
15388 | - } | |
15389 | - | |
15390 | - return p; | |
15391 | -} | |
15392 | -EXPORT_SYMBOL(ioremap_nocache); | |
15393 | - | |
15394 | -/** | |
15395 | - * iounmap - Free a IO remapping | |
15396 | - * @addr: virtual address from ioremap_* | |
15397 | - * | |
15398 | - * Caller must ensure there is only one unmapping for the same pointer. | |
15399 | - */ | |
15400 | -void iounmap(volatile void __iomem *addr) | |
15401 | -{ | |
15402 | - struct vm_struct *p, *o; | |
15403 | - | |
15404 | - if ((void __force *)addr <= high_memory) | |
15405 | - return; | |
15406 | - | |
15407 | - /* | |
15408 | - * __ioremap special-cases the PCI/ISA range by not instantiating a | |
15409 | - * vm_area and by simply returning an address into the kernel mapping | |
15410 | - * of ISA space. So handle that here. | |
15411 | - */ | |
15412 | - if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) | |
15413 | - return; | |
15414 | - | |
15415 | - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); | |
15416 | - | |
15417 | - /* Use the vm area unlocked, assuming the caller | |
15418 | - ensures there isn't another iounmap for the same address | |
15419 | - in parallel. Reuse of the virtual address is prevented by | |
15420 | - leaving it in the global lists until we're done with it. | |
15421 | - cpa takes care of the direct mappings. */ | |
15422 | - read_lock(&vmlist_lock); | |
15423 | - for (p = vmlist; p; p = p->next) { | |
15424 | - if (p->addr == addr) | |
15425 | - break; | |
15426 | - } | |
15427 | - read_unlock(&vmlist_lock); | |
15428 | - | |
15429 | - if (!p) { | |
15430 | - printk("iounmap: bad address %p\n", addr); | |
15431 | - dump_stack(); | |
15432 | - return; | |
15433 | - } | |
15434 | - | |
15435 | - /* Reset the direct mapping. Can block */ | |
15436 | - if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) { | |
15437 | - change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)), | |
15438 | - get_vm_area_size(p) >> PAGE_SHIFT, | |
15439 | - PAGE_KERNEL); | |
15440 | - global_flush_tlb(); | |
15441 | - } | |
15442 | - | |
15443 | - /* Finally remove it */ | |
15444 | - o = remove_vm_area((void *)addr); | |
15445 | - BUG_ON(p != o || o == NULL); | |
15446 | - kfree(p); | |
15447 | -} | |
15448 | -EXPORT_SYMBOL(iounmap); | |
15449 | - | |
15450 | -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) | |
15451 | -{ | |
15452 | - unsigned long offset, last_addr; | |
15453 | - unsigned int nrpages; | |
15454 | - enum fixed_addresses idx; | |
15455 | - | |
15456 | - /* Don't allow wraparound or zero size */ | |
15457 | - last_addr = phys_addr + size - 1; | |
15458 | - if (!size || last_addr < phys_addr) | |
15459 | - return NULL; | |
15460 | - | |
15461 | - /* | |
15462 | - * Don't remap the low PCI/ISA area, it's always mapped.. | |
15463 | - */ | |
15464 | - if (is_initial_xendomain() && | |
15465 | - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | |
15466 | - return isa_bus_to_virt(phys_addr); | |
15467 | - | |
15468 | - /* | |
15469 | - * Mappings have to be page-aligned | |
15470 | - */ | |
15471 | - offset = phys_addr & ~PAGE_MASK; | |
15472 | - phys_addr &= PAGE_MASK; | |
15473 | - size = PAGE_ALIGN(last_addr) - phys_addr; | |
15474 | - | |
15475 | - /* | |
15476 | - * Mappings have to fit in the FIX_BTMAP area. | |
15477 | - */ | |
15478 | - nrpages = size >> PAGE_SHIFT; | |
15479 | - if (nrpages > NR_FIX_BTMAPS) | |
15480 | - return NULL; | |
15481 | - | |
15482 | - /* | |
15483 | - * Ok, go for it.. | |
15484 | - */ | |
15485 | - idx = FIX_BTMAP_BEGIN; | |
15486 | - while (nrpages > 0) { | |
15487 | - set_fixmap(idx, phys_addr); | |
15488 | - phys_addr += PAGE_SIZE; | |
15489 | - --idx; | |
15490 | - --nrpages; | |
15491 | - } | |
15492 | - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); | |
15493 | -} | |
15494 | - | |
15495 | -void __init bt_iounmap(void *addr, unsigned long size) | |
15496 | -{ | |
15497 | - unsigned long virt_addr; | |
15498 | - unsigned long offset; | |
15499 | - unsigned int nrpages; | |
15500 | - enum fixed_addresses idx; | |
15501 | - | |
15502 | - virt_addr = (unsigned long)addr; | |
15503 | - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) | |
15504 | - return; | |
15505 | - if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) | |
15506 | - return; | |
15507 | - offset = virt_addr & ~PAGE_MASK; | |
15508 | - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; | |
15509 | - | |
15510 | - idx = FIX_BTMAP_BEGIN; | |
15511 | - while (nrpages > 0) { | |
15512 | - clear_fixmap(idx); | |
15513 | - --idx; | |
15514 | - --nrpages; | |
15515 | - } | |
15516 | -} | |
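The file removed above is not lost: its batched-hypercall remapping core reappears unchanged in the unified arch/x86/mm/ioremap-xen.c added next. For orientation, a hedged example of how a driver consumes the exported entry point; the mmap handler and starting mfn below are illustrative, not taken from this patch:

	/* Illustrative mmap() handler mapping foreign I/O frames. */
	static int example_mmap(struct file *file, struct vm_area_struct *vma)
	{
		unsigned long mfn  = 0x12345;	/* placeholder machine frame */
		unsigned long size = vma->vm_end - vma->vm_start;	/* bytes */

		return direct_remap_pfn_range(vma, vma->vm_start, mfn, size,
					      vma->vm_page_prot, DOMID_IO);
	}

direct_remap_pfn_range() marks the VMA VM_IO | VM_RESERVED | VM_PFNMAP itself and rejects DOMID_SELF, so callers pass a foreign domain id or DOMID_IO.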
15517 | --- /dev/null | |
15518 | +++ b/arch/x86/mm/ioremap-xen.c | |
15519 | @@ -0,0 +1,685 @@ | |
15520 | +/* | |
15521 | + * Re-map IO memory to kernel address space so that we can access it. | |
15522 | + * This is needed for high PCI addresses that aren't mapped in the | |
15523 | + * 640k-1MB IO memory area on PC's | |
15524 | + * | |
15525 | + * (C) Copyright 1995 1996 Linus Torvalds | |
15526 | + */ | |
15527 | + | |
15528 | +#include <linux/bootmem.h> | |
15529 | +#include <linux/init.h> | |
15530 | +#include <linux/io.h> | |
15531 | +#include <linux/module.h> | |
15532 | +#include <linux/pfn.h> | |
15533 | +#include <linux/slab.h> | |
15534 | +#include <linux/vmalloc.h> | |
15535 | + | |
15536 | +#include <asm/cacheflush.h> | |
15537 | +#include <asm/e820.h> | |
15538 | +#include <asm/fixmap.h> | |
15539 | +#include <asm/pgtable.h> | |
15540 | +#include <asm/tlbflush.h> | |
15541 | +#include <asm/pgalloc.h> | |
15542 | + | |
15543 | +enum ioremap_mode { | |
15544 | + IOR_MODE_UNCACHED, | |
15545 | + IOR_MODE_CACHED, | |
15546 | +}; | |
15547 | + | |
15548 | +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) | |
15549 | + | |
15550 | +unsigned long __phys_addr(unsigned long x) | |
15551 | +{ | |
15552 | + if (x >= __START_KERNEL_map) | |
15553 | + return x - __START_KERNEL_map + phys_base; | |
15554 | + return x - PAGE_OFFSET; | |
15555 | +} | |
15556 | +EXPORT_SYMBOL(__phys_addr); | |
15557 | + | |
15558 | +#endif | |
15559 | + | |
15560 | +static int direct_remap_area_pte_fn(pte_t *pte, | |
15561 | + struct page *pmd_page, | |
15562 | + unsigned long address, | |
15563 | + void *data) | |
15564 | +{ | |
15565 | + mmu_update_t **v = (mmu_update_t **)data; | |
15566 | + | |
15567 | + BUG_ON(!pte_none(*pte)); | |
15568 | + | |
15569 | + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) << | |
15570 | + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); | |
15571 | + (*v)++; | |
15572 | + | |
15573 | + return 0; | |
15574 | +} | |
15575 | + | |
15576 | +static int __direct_remap_pfn_range(struct mm_struct *mm, | |
15577 | + unsigned long address, | |
15578 | + unsigned long mfn, | |
15579 | + unsigned long size, | |
15580 | + pgprot_t prot, | |
15581 | + domid_t domid) | |
15582 | +{ | |
15583 | + int rc; | |
15584 | + unsigned long i, start_address; | |
15585 | + mmu_update_t *u, *v, *w; | |
15586 | + | |
15587 | + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); | |
15588 | + if (u == NULL) | |
15589 | + return -ENOMEM; | |
15590 | + | |
15591 | + start_address = address; | |
15592 | + | |
15593 | + flush_cache_all(); | |
15594 | + | |
15595 | + for (i = 0; i < size; i += PAGE_SIZE) { | |
15596 | + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) { | |
15597 | + /* Flush a full batch after filling in the PTE ptrs. */ | |
15598 | + rc = apply_to_page_range(mm, start_address, | |
15599 | + address - start_address, | |
15600 | + direct_remap_area_pte_fn, &w); | |
15601 | + if (rc) | |
15602 | + goto out; | |
15603 | + rc = -EFAULT; | |
15604 | + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) | |
15605 | + goto out; | |
15606 | + v = w = u; | |
15607 | + start_address = address; | |
15608 | + } | |
15609 | + | |
15610 | + /* | |
15611 | + * Fill in the machine address: PTE ptr is done later by | |
15612 | + * apply_to_page_range(). | |
15613 | + */ | |
15614 | + v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO; | |
15615 | + | |
15616 | + mfn++; | |
15617 | + address += PAGE_SIZE; | |
15618 | + v++; | |
15619 | + } | |
15620 | + | |
15621 | + if (v != u) { | |
15622 | + /* Final batch. */ | |
15623 | + rc = apply_to_page_range(mm, start_address, | |
15624 | + address - start_address, | |
15625 | + direct_remap_area_pte_fn, &w); | |
15626 | + if (rc) | |
15627 | + goto out; | |
15628 | + rc = -EFAULT; | |
15629 | + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) | |
15630 | + goto out; | |
15631 | + } | |
15632 | + | |
15633 | + rc = 0; | |
15634 | + | |
15635 | + out: | |
15636 | + flush_tlb_all(); | |
15637 | + | |
15638 | + free_page((unsigned long)u); | |
15639 | + | |
15640 | + return rc; | |
15641 | +} | |
15642 | + | |
15643 | +int direct_remap_pfn_range(struct vm_area_struct *vma, | |
15644 | + unsigned long address, | |
15645 | + unsigned long mfn, | |
15646 | + unsigned long size, | |
15647 | + pgprot_t prot, | |
15648 | + domid_t domid) | |
15649 | +{ | |
15650 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
15651 | + return remap_pfn_range(vma, address, mfn, size, prot); | |
15652 | + | |
15653 | + if (domid == DOMID_SELF) | |
15654 | + return -EINVAL; | |
15655 | + | |
15656 | + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | |
15657 | + | |
15658 | + vma->vm_mm->context.has_foreign_mappings = 1; | |
15659 | + | |
15660 | + return __direct_remap_pfn_range( | |
15661 | + vma->vm_mm, address, mfn, size, prot, domid); | |
15662 | +} | |
15663 | +EXPORT_SYMBOL(direct_remap_pfn_range); | |
15664 | + | |
15665 | +int direct_kernel_remap_pfn_range(unsigned long address, | |
15666 | + unsigned long mfn, | |
15667 | + unsigned long size, | |
15668 | + pgprot_t prot, | |
15669 | + domid_t domid) | |
15670 | +{ | |
15671 | + return __direct_remap_pfn_range( | |
15672 | + &init_mm, address, mfn, size, prot, domid); | |
15673 | +} | |
15674 | +EXPORT_SYMBOL(direct_kernel_remap_pfn_range); | |
15675 | + | |
15676 | +static int lookup_pte_fn( | |
15677 | + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) | |
15678 | +{ | |
15679 | + uint64_t *ptep = (uint64_t *)data; | |
15680 | + if (ptep) | |
15681 | + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << | |
15682 | + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); | |
15683 | + return 0; | |
15684 | +} | |
15685 | + | |
15686 | +int create_lookup_pte_addr(struct mm_struct *mm, | |
15687 | + unsigned long address, | |
15688 | + uint64_t *ptep) | |
15689 | +{ | |
15690 | + return apply_to_page_range(mm, address, PAGE_SIZE, | |
15691 | + lookup_pte_fn, ptep); | |
15692 | +} | |
15693 | + | |
15694 | +EXPORT_SYMBOL(create_lookup_pte_addr); | |
15695 | + | |
15696 | +static int noop_fn( | |
15697 | + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) | |
15698 | +{ | |
15699 | + return 0; | |
15700 | +} | |
15701 | + | |
15702 | +int touch_pte_range(struct mm_struct *mm, | |
15703 | + unsigned long address, | |
15704 | + unsigned long size) | |
15705 | +{ | |
15706 | + return apply_to_page_range(mm, address, size, noop_fn, NULL); | |
15707 | +} | |
15708 | + | |
15709 | +EXPORT_SYMBOL(touch_pte_range); | |
15710 | + | |
15711 | +#ifdef CONFIG_X86_32 | |
15712 | +int page_is_ram(unsigned long pagenr) | |
15713 | +{ | |
15714 | + unsigned long addr, end; | |
15715 | + int i; | |
15716 | + | |
15717 | +#ifndef CONFIG_XEN | |
15718 | + /* | |
15719 | + * A special case is the first 4Kb of memory: this is a | |
15720 | + * BIOS-owned area, not kernel RAM, but generally not | |
15721 | + * listed as such in the E820 table. | |
15722 | + */ | |
15723 | + if (pagenr == 0) | |
15724 | + return 0; | |
15725 | + | |
15726 | + /* | |
15727 | + * Second special case: Some BIOSen report the PC BIOS | |
15728 | + * area (640->1Mb) as RAM even though it is not. | |
15729 | + */ | |
15730 | + if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) && | |
15731 | + pagenr < (BIOS_END >> PAGE_SHIFT)) | |
15732 | + return 0; | |
15733 | +#endif | |
15734 | + | |
15735 | + for (i = 0; i < e820.nr_map; i++) { | |
15736 | + /* | |
15737 | + * Not usable memory: | |
15738 | + */ | |
15739 | + if (e820.map[i].type != E820_RAM) | |
15740 | + continue; | |
15741 | + addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT; | |
15742 | + end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT; | |
15743 | + | |
15744 | + | |
15745 | + if ((pagenr >= addr) && (pagenr < end)) | |
15746 | + return 1; | |
15747 | + } | |
15748 | + return 0; | |
15749 | +} | |
15750 | +#endif | |
15751 | + | |
15752 | +/* | |
15753 | + * Fix up the linear direct mapping of the kernel to avoid cache attribute | |
15754 | + * conflicts. | |
15755 | + */ | |
15756 | +static int ioremap_change_attr(unsigned long vaddr, unsigned long size, | |
15757 | + enum ioremap_mode mode) | |
15758 | +{ | |
15759 | + unsigned long nrpages = size >> PAGE_SHIFT; | |
15760 | + int err; | |
15761 | + | |
15762 | + switch (mode) { | |
15763 | + case IOR_MODE_UNCACHED: | |
15764 | + default: | |
15765 | + err = set_memory_uc(vaddr, nrpages); | |
15766 | + break; | |
15767 | + case IOR_MODE_CACHED: | |
15768 | + err = set_memory_wb(vaddr, nrpages); | |
15769 | + break; | |
15770 | + } | |
15771 | + | |
15772 | + return err; | |
15773 | +} | |
15774 | + | |
15775 | +/* | |
15776 | + * Remap an arbitrary physical address space into the kernel virtual | |
15777 | + * address space. Needed when the kernel wants to access high addresses | |
15778 | + * directly. | |
15779 | + * | |
15780 | + * NOTE! We need to allow non-page-aligned mappings too: we will obviously | |
15781 | + * have to convert them into an offset in a page-aligned mapping, but the | |
15782 | + * caller shouldn't need to know that small detail. | |
15783 | + */ | |
15784 | +static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, | |
15785 | + enum ioremap_mode mode) | |
15786 | +{ | |
15787 | + unsigned long mfn, offset, last_addr, vaddr; | |
15788 | + struct vm_struct *area; | |
15789 | + pgprot_t prot; | |
15790 | + domid_t domid = DOMID_IO; | |
15791 | + | |
15792 | + /* Don't allow wraparound or zero size */ | |
15793 | + last_addr = phys_addr + size - 1; | |
15794 | + if (!size || last_addr < phys_addr) | |
15795 | + return NULL; | |
15796 | + | |
15797 | + /* | |
15798 | + * Don't remap the low PCI/ISA area, it's always mapped.. | |
15799 | + */ | |
15800 | + if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS) | |
15801 | + return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr); | |
15802 | + | |
15803 | + /* | |
15804 | + * Don't allow anybody to remap normal RAM that we're using.. | |
15805 | + */ | |
15806 | + for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) { | |
15807 | + unsigned long pfn = mfn_to_local_pfn(mfn); | |
15808 | + | |
15809 | + if (pfn >= max_pfn) | |
15810 | + continue; | |
15811 | + | |
15812 | + domid = DOMID_SELF; | |
15813 | + | |
15814 | + if (pfn >= max_pfn_mapped) /* bogus */ | |
15815 | + continue; | |
15816 | + | |
15817 | + if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) | |
15818 | + return NULL; | |
15819 | + } | |
15820 | + | |
15821 | + switch (mode) { | |
15822 | + case IOR_MODE_UNCACHED: | |
15823 | + default: | |
15824 | + /* | |
15825 | + * FIXME: we will use UC MINUS for now, as video fb drivers | |
15826 | + * depend on it. Upcoming ioremap_wc() will fix this behavior. | |
15827 | + */ | |
15828 | + prot = PAGE_KERNEL_UC_MINUS; | |
15829 | + break; | |
15830 | + case IOR_MODE_CACHED: | |
15831 | + prot = PAGE_KERNEL; | |
15832 | + break; | |
15833 | + } | |
15834 | + | |
15835 | + /* | |
15836 | + * Mappings have to be page-aligned | |
15837 | + */ | |
15838 | + offset = phys_addr & ~PAGE_MASK; | |
15839 | + phys_addr &= PAGE_MASK; | |
15840 | + size = PAGE_ALIGN(last_addr+1) - phys_addr; | |
15841 | + | |
15842 | + /* | |
15843 | + * Ok, go for it.. | |
15844 | + */ | |
15845 | + area = get_vm_area(size, VM_IOREMAP | (mode << 20)); | |
15846 | + if (!area) | |
15847 | + return NULL; | |
15848 | + area->phys_addr = phys_addr; | |
15849 | + vaddr = (unsigned long) area->addr; | |
15850 | + if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr), | |
15851 | + size, prot, domid)) { | |
15852 | + free_vm_area(area); | |
15853 | + return NULL; | |
15854 | + } | |
15855 | + | |
15856 | + if (ioremap_change_attr(vaddr, size, mode) < 0) { | |
15857 | + iounmap((void __iomem *) vaddr); | |
15858 | + return NULL; | |
15859 | + } | |
15860 | + | |
15861 | + return (void __iomem *) (vaddr + offset); | |
15862 | +} | |
15863 | + | |
15864 | +/** | |
15865 | + * ioremap_nocache - map bus memory into CPU space | |
15866 | + * @offset: bus address of the memory | |
15867 | + * @size: size of the resource to map | |
15868 | + * | |
15869 | + * ioremap_nocache performs a platform specific sequence of operations to | |
15870 | + * make bus memory CPU accessible via the readb/readw/readl/writeb/ | |
15871 | + * writew/writel functions and the other mmio helpers. The returned | |
15872 | + * address is not guaranteed to be usable directly as a virtual | |
15873 | + * address. | |
15874 | + * | |
15875 | + * This version of ioremap ensures that the memory is marked uncachable | |
15876 | + * on the CPU as well as honouring existing caching rules from things like | |
15877 | + * the PCI bus. Note that there are other caches and buffers on many | |
15878 | + * busses. In particular, driver authors should read up on PCI writes. | |
15879 | + * | |
15880 | + * It's useful if some control registers are in such an area and | |
15881 | + * write combining or read caching is not desirable: | |
15882 | + * | |
15883 | + * Must be freed with iounmap. | |
15884 | + */ | |
15885 | +void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) | |
15886 | +{ | |
15887 | + return __ioremap(phys_addr, size, IOR_MODE_UNCACHED); | |
15888 | +} | |
15889 | +EXPORT_SYMBOL(ioremap_nocache); | |
15890 | + | |
15891 | +void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) | |
15892 | +{ | |
15893 | + return __ioremap(phys_addr, size, IOR_MODE_CACHED); | |
15894 | +} | |
15895 | +EXPORT_SYMBOL(ioremap_cache); | |
15896 | + | |
15897 | +/** | |
15898 | + * iounmap - Free an IO remapping | |
15899 | + * @addr: virtual address from ioremap_* | |
15900 | + * | |
15901 | + * Caller must ensure there is only one unmapping for the same pointer. | |
15902 | + */ | |
15903 | +void iounmap(volatile void __iomem *addr) | |
15904 | +{ | |
15905 | + struct vm_struct *p, *o; | |
15906 | + | |
15907 | + if ((void __force *)addr <= high_memory) | |
15908 | + return; | |
15909 | + | |
15910 | + /* | |
15911 | + * __ioremap special-cases the PCI/ISA range by not instantiating a | |
15912 | + * vm_area and by simply returning an address into the kernel mapping | |
15913 | + * of ISA space. So handle that here. | |
15914 | + */ | |
15915 | + if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) | |
15916 | + return; | |
15917 | + | |
15918 | + addr = (volatile void __iomem *) | |
15919 | + (PAGE_MASK & (unsigned long __force)addr); | |
15920 | + | |
15921 | + /* Use the vm area unlocked, assuming the caller ensures | |
15922 | + * there isn't another iounmap for the same address in parallel. | |
15923 | + * Reuse of the virtual address is prevented by leaving it in | |
15924 | + * the global lists until we're done with it. cpa takes care | |
15925 | + * of the direct mappings. */ | |
15926 | + read_lock(&vmlist_lock); | |
15927 | + for (p = vmlist; p; p = p->next) { | |
15928 | + if (p->addr == addr) | |
15929 | + break; | |
15930 | + } | |
15931 | + read_unlock(&vmlist_lock); | |
15932 | + | |
15933 | + if (!p) { | |
15934 | + printk(KERN_ERR "iounmap: bad address %p\n", addr); | |
15935 | + dump_stack(); | |
15936 | + return; | |
15937 | + } | |
15938 | + | |
15939 | + if ((p->flags >> 20) != IOR_MODE_CACHED) { | |
15940 | + unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT; | |
15941 | + unsigned long mfn = p->phys_addr; | |
15942 | + unsigned long va = (unsigned long)addr; | |
15943 | + | |
15944 | + for (; n > 0; n--, mfn++, va += PAGE_SIZE) | |
15945 | + if (mfn_to_local_pfn(mfn) < max_pfn) | |
15946 | + set_memory_wb(va, 1); | |
15947 | + } | |
15948 | + | |
15949 | + /* Finally remove it */ | |
15950 | + o = remove_vm_area((void *)addr); | |
15951 | + BUG_ON(p != o || o == NULL); | |
15952 | + kfree(p); | |
15953 | +} | |
15954 | +EXPORT_SYMBOL(iounmap); | |
15955 | + | |
15956 | +int __initdata early_ioremap_debug; | |
15957 | + | |
15958 | +static int __init early_ioremap_debug_setup(char *str) | |
15959 | +{ | |
15960 | + early_ioremap_debug = 1; | |
15961 | + | |
15962 | + return 0; | |
15963 | +} | |
15964 | +early_param("early_ioremap_debug", early_ioremap_debug_setup); | |
15965 | + | |
15966 | +static __initdata int after_paging_init; | |
15967 | +static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] | |
15968 | + __attribute__((aligned(PAGE_SIZE))); | |
15969 | + | |
15970 | +#ifdef CONFIG_X86_32 | |
15971 | +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) | |
15972 | +{ | |
15973 | + /* Don't assume we're using swapper_pg_dir at this point */ | |
15974 | + pgd_t *base = __va(read_cr3()); | |
15975 | + pgd_t *pgd = &base[pgd_index(addr)]; | |
15976 | + pud_t *pud = pud_offset(pgd, addr); | |
15977 | + pmd_t *pmd = pmd_offset(pud, addr); | |
15978 | + | |
15979 | + return pmd; | |
15980 | +} | |
15981 | +#else | |
15982 | +#define early_ioremap_pmd early_get_pmd | |
15983 | +#define make_lowmem_page_readonly early_make_page_readonly | |
15984 | +#define make_lowmem_page_writable make_page_writable | |
15985 | +#endif | |
15986 | + | |
15987 | +static inline pte_t * __init early_ioremap_pte(unsigned long addr) | |
15988 | +{ | |
15989 | + return &bm_pte[pte_index(addr)]; | |
15990 | +} | |
15991 | + | |
15992 | +void __init early_ioremap_init(void) | |
15993 | +{ | |
15994 | + pmd_t *pmd; | |
15995 | + | |
15996 | + if (early_ioremap_debug) | |
15997 | + printk(KERN_INFO "early_ioremap_init()\n"); | |
15998 | + | |
15999 | + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); | |
16000 | + memset(bm_pte, 0, sizeof(bm_pte)); | |
16001 | + make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables); | |
16002 | + pmd_populate_kernel(&init_mm, pmd, bm_pte); | |
16003 | + | |
16004 | + /* | |
16005 | + * The boot-ioremap range spans multiple pmds, for which | |
16006 | + * we are not prepared: | |
16007 | + */ | |
16008 | + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { | |
16009 | + WARN_ON(1); | |
16010 | + printk(KERN_WARNING "pmd %p != %p\n", | |
16011 | + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))); | |
16012 | + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", | |
16013 | + fix_to_virt(FIX_BTMAP_BEGIN)); | |
16014 | + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", | |
16015 | + fix_to_virt(FIX_BTMAP_END)); | |
16016 | + | |
16017 | + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); | |
16018 | + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", | |
16019 | + FIX_BTMAP_BEGIN); | |
16020 | + } | |
16021 | +} | |
16022 | + | |
16023 | +void __init early_ioremap_clear(void) | |
16024 | +{ | |
16025 | + pmd_t *pmd; | |
16026 | + | |
16027 | + if (early_ioremap_debug) | |
16028 | + printk(KERN_INFO "early_ioremap_clear()\n"); | |
16029 | + | |
16030 | + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); | |
16031 | + pmd_clear(pmd); | |
16032 | + make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables); | |
16033 | + /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */ | |
16034 | + __flush_tlb_all(); | |
16035 | +} | |
16036 | + | |
16037 | +void __init early_ioremap_reset(void) | |
16038 | +{ | |
16039 | + enum fixed_addresses idx; | |
16040 | + unsigned long addr, phys; | |
16041 | + pte_t *pte; | |
16042 | + | |
16043 | + after_paging_init = 1; | |
16044 | + for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) { | |
16045 | + addr = fix_to_virt(idx); | |
16046 | + pte = early_ioremap_pte(addr); | |
16047 | + if (pte_present(*pte)) { | |
16048 | + phys = __pte_val(*pte) & PAGE_MASK; | |
16049 | + set_fixmap(idx, phys); | |
16050 | + } | |
16051 | + } | |
16052 | +} | |
16053 | + | |
16054 | +static void __init __early_set_fixmap(enum fixed_addresses idx, | |
16055 | + unsigned long phys, pgprot_t flags) | |
16056 | +{ | |
16057 | + unsigned long addr = __fix_to_virt(idx); | |
16058 | + pte_t *pte; | |
16059 | + | |
16060 | + if (idx >= __end_of_fixed_addresses) { | |
16061 | + BUG(); | |
16062 | + return; | |
16063 | + } | |
16064 | + pte = early_ioremap_pte(addr); | |
16065 | + if (pgprot_val(flags)) | |
16066 | + set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags)); | |
16067 | + else | |
16068 | + pte_clear(NULL, addr, pte); | |
16069 | + __flush_tlb_one(addr); | |
16070 | +} | |
16071 | + | |
16072 | +static inline void __init early_set_fixmap(enum fixed_addresses idx, | |
16073 | + unsigned long phys) | |
16074 | +{ | |
16075 | + if (after_paging_init) | |
16076 | + set_fixmap(idx, phys); | |
16077 | + else | |
16078 | + __early_set_fixmap(idx, phys, PAGE_KERNEL); | |
16079 | +} | |
16080 | + | |
16081 | +static inline void __init early_clear_fixmap(enum fixed_addresses idx) | |
16082 | +{ | |
16083 | + if (after_paging_init) | |
16084 | + clear_fixmap(idx); | |
16085 | + else | |
16086 | + __early_set_fixmap(idx, 0, __pgprot(0)); | |
16087 | +} | |
16088 | + | |
16089 | + | |
16090 | +int __initdata early_ioremap_nested; | |
16091 | + | |
16092 | +static int __init check_early_ioremap_leak(void) | |
16093 | +{ | |
16094 | + if (!early_ioremap_nested) | |
16095 | + return 0; | |
16096 | + | |
16097 | + printk(KERN_WARNING | |
16098 | + "Debug warning: early ioremap leak of %d areas detected.\n", | |
16099 | + early_ioremap_nested); | |
16100 | + printk(KERN_WARNING | |
16101 | + "please boot with early_ioremap_debug and report the dmesg.\n"); | |
16102 | + WARN_ON(1); | |
16103 | + | |
16104 | + return 1; | |
16105 | +} | |
16106 | +late_initcall(check_early_ioremap_leak); | |
16107 | + | |
16108 | +void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | |
16109 | +{ | |
16110 | + unsigned long offset, last_addr; | |
16111 | + unsigned int nrpages, nesting; | |
16112 | + enum fixed_addresses idx0, idx; | |
16113 | + | |
16114 | + WARN_ON(system_state != SYSTEM_BOOTING); | |
16115 | + | |
16116 | + nesting = early_ioremap_nested; | |
16117 | + if (early_ioremap_debug) { | |
16118 | + printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", | |
16119 | + phys_addr, size, nesting); | |
16120 | + dump_stack(); | |
16121 | + } | |
16122 | + | |
16123 | + /* Don't allow wraparound or zero size */ | |
16124 | + last_addr = phys_addr + size - 1; | |
16125 | + if (!size || last_addr < phys_addr) { | |
16126 | + WARN_ON(1); | |
16127 | + return NULL; | |
16128 | + } | |
16129 | + | |
16130 | + if (nesting >= FIX_BTMAPS_NESTING) { | |
16131 | + WARN_ON(1); | |
16132 | + return NULL; | |
16133 | + } | |
16134 | + early_ioremap_nested++; | |
16135 | + /* | |
16136 | + * Mappings have to be page-aligned | |
16137 | + */ | |
16138 | + offset = phys_addr & ~PAGE_MASK; | |
16139 | + phys_addr &= PAGE_MASK; | |
16140 | + size = PAGE_ALIGN(last_addr) - phys_addr; | |
16141 | + | |
16142 | + /* | |
16143 | + * Mappings have to fit in the FIX_BTMAP area. | |
16144 | + */ | |
16145 | + nrpages = size >> PAGE_SHIFT; | |
16146 | + if (nrpages > NR_FIX_BTMAPS) { | |
16147 | + WARN_ON(1); | |
16148 | + return NULL; | |
16149 | + } | |
16150 | + | |
16151 | + /* | |
16152 | + * Ok, go for it.. | |
16153 | + */ | |
16154 | + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; | |
16155 | + idx = idx0; | |
16156 | + while (nrpages > 0) { | |
16157 | + early_set_fixmap(idx, phys_addr); | |
16158 | + phys_addr += PAGE_SIZE; | |
16159 | + --idx; | |
16160 | + --nrpages; | |
16161 | + } | |
16162 | + if (early_ioremap_debug) | |
16163 | + printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); | |
16164 | + | |
16165 | + return (void *) (offset + fix_to_virt(idx0)); | |
16166 | +} | |
16167 | + | |
16168 | +void __init early_iounmap(void *addr, unsigned long size) | |
16169 | +{ | |
16170 | + unsigned long virt_addr; | |
16171 | + unsigned long offset; | |
16172 | + unsigned int nrpages; | |
16173 | + enum fixed_addresses idx; | |
16174 | + unsigned int nesting; | |
16175 | + | |
16176 | + nesting = --early_ioremap_nested; | |
16177 | + WARN_ON(nesting < 0); | |
16178 | + | |
16179 | + if (early_ioremap_debug) { | |
16180 | + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, | |
16181 | + size, nesting); | |
16182 | + dump_stack(); | |
16183 | + } | |
16184 | + | |
16185 | + virt_addr = (unsigned long)addr; | |
16186 | + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { | |
16187 | + WARN_ON(1); | |
16188 | + return; | |
16189 | + } | |
16190 | + offset = virt_addr & ~PAGE_MASK; | |
16191 | + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; | |
16192 | + | |
16193 | + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; | |
16194 | + while (nrpages > 0) { | |
16195 | + early_clear_fixmap(idx); | |
16196 | + --idx; | |
16197 | + --nrpages; | |
16198 | + } | |
16199 | +} | |
16200 | + | |
16201 | +void __this_fixmap_does_not_exist(void) | |
16202 | +{ | |
16203 | + WARN_ON(1); | |
16204 | +} | |
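The early_ioremap() machinery above hands out fixmap slots in blocks of NR_FIX_BTMAPS per nesting level, up to FIX_BTMAPS_NESTING concurrent mappings, and expects balanced, last-in-first-out teardown; anything still mapped at late_initcall time trips check_early_ioremap_leak(). A minimal boot-time usage sketch; the physical address and length are placeholders:

	/* Sketch: peek at a firmware table before the real ioremap works. */
	void *p = early_ioremap(0xfec00000, 0x100);	/* placeholder range */
	if (p) {
		/* ... copy out whatever is needed ... */
		early_iounmap(p, 0x100);	/* balanced, LIFO order */
	}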
16205 | --- a/arch/x86/mm/pageattr_64-xen.c | |
16206 | +++ /dev/null | |
16207 | @@ -1,542 +0,0 @@ | |
16208 | -/* | |
16209 | - * Copyright 2002 Andi Kleen, SuSE Labs. | |
16210 | - * Thanks to Ben LaHaise for precious feedback. | |
16211 | - */ | |
16212 | - | |
16213 | -#include <linux/mm.h> | |
16214 | -#include <linux/sched.h> | |
16215 | -#include <linux/highmem.h> | |
16216 | -#include <linux/module.h> | |
16217 | -#include <linux/slab.h> | |
16218 | -#include <asm/uaccess.h> | |
16219 | -#include <asm/processor.h> | |
16220 | -#include <asm/tlbflush.h> | |
16221 | -#include <asm/io.h> | |
16222 | - | |
16223 | -#ifdef CONFIG_XEN | |
16224 | -#include <asm/pgalloc.h> | |
16225 | -#include <asm/mmu_context.h> | |
16226 | - | |
16227 | -static void _pin_lock(struct mm_struct *mm, int lock) { | |
16228 | - if (lock) | |
16229 | - spin_lock(&mm->page_table_lock); | |
16230 | -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | |
16231 | - /* While mm->page_table_lock protects us against insertions and | |
16232 | - * removals of higher level page table pages, it doesn't protect | |
16233 | - * against updates of pte-s. Such updates, however, require the | |
16234 | - * pte pages to be in consistent state (unpinned+writable or | |
16235 | - * pinned+readonly). The pinning and attribute changes, however | |
16236 | - * cannot be done atomically, which is why such updates must be | |
16237 | - * prevented from happening concurrently. | |
16238 | - * Note that no pte lock can ever elsewhere be acquired nesting | |
16239 | - * with an already acquired one in the same mm, or with the mm's | |
16240 | - * page_table_lock already acquired, as that would break in the | |
16241 | - * non-split case (where all these are actually resolving to the | |
16242 | - * one page_table_lock). Thus acquiring all of them here is not | |
16243 | - * going to result in dead locks, and the order of acquires | |
16244 | - * doesn't matter. | |
16245 | - */ | |
16246 | - { | |
16247 | - pgd_t *pgd = mm->pgd; | |
16248 | - unsigned g; | |
16249 | - | |
16250 | - for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { | |
16251 | - pud_t *pud; | |
16252 | - unsigned u; | |
16253 | - | |
16254 | - if (pgd_none(*pgd)) | |
16255 | - continue; | |
16256 | - pud = pud_offset(pgd, 0); | |
16257 | - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { | |
16258 | - pmd_t *pmd; | |
16259 | - unsigned m; | |
16260 | - | |
16261 | - if (pud_none(*pud)) | |
16262 | - continue; | |
16263 | - pmd = pmd_offset(pud, 0); | |
16264 | - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { | |
16265 | - spinlock_t *ptl; | |
16266 | - | |
16267 | - if (pmd_none(*pmd)) | |
16268 | - continue; | |
16269 | - ptl = pte_lockptr(0, pmd); | |
16270 | - if (lock) | |
16271 | - spin_lock(ptl); | |
16272 | - else | |
16273 | - spin_unlock(ptl); | |
16274 | - } | |
16275 | - } | |
16276 | - } | |
16277 | - } | |
16278 | -#endif | |
16279 | - if (!lock) | |
16280 | - spin_unlock(&mm->page_table_lock); | |
16281 | -} | |
16282 | -#define pin_lock(mm) _pin_lock(mm, 1) | |
16283 | -#define pin_unlock(mm) _pin_lock(mm, 0) | |
16284 | - | |
16285 | -#define PIN_BATCH 8 | |
16286 | -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); | |
16287 | - | |
16288 | -static inline unsigned int pgd_walk_set_prot(void *pt, pgprot_t flags, | |
16289 | - unsigned int cpu, unsigned int seq) | |
16290 | -{ | |
16291 | - struct page *page = virt_to_page(pt); | |
16292 | - unsigned long pfn = page_to_pfn(page); | |
16293 | - | |
16294 | - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, | |
16295 | - (unsigned long)__va(pfn << PAGE_SHIFT), | |
16296 | - pfn_pte(pfn, flags), 0); | |
16297 | - if (unlikely(++seq == PIN_BATCH)) { | |
16298 | - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), | |
16299 | - PIN_BATCH, NULL))) | |
16300 | - BUG(); | |
16301 | - seq = 0; | |
16302 | - } | |
16303 | - | |
16304 | - return seq; | |
16305 | -} | |
16306 | - | |
16307 | -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) | |
16308 | -{ | |
16309 | - pgd_t *pgd = pgd_base; | |
16310 | - pud_t *pud; | |
16311 | - pmd_t *pmd; | |
16312 | - pte_t *pte; | |
16313 | - int g,u,m; | |
16314 | - unsigned int cpu, seq; | |
16315 | - multicall_entry_t *mcl; | |
16316 | - | |
16317 | - cpu = get_cpu(); | |
16318 | - | |
16319 | - /* | |
16320 | - * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not | |
16321 | - * be the 'current' task's pagetables (e.g., current may be 32-bit, | |
16322 | - * but the pagetables may be for a 64-bit task). | |
16323 | - * Subtracting 1 from TASK_SIZE64 means the loop limit is correct | |
16324 | - * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE. | |
16325 | - */ | |
16326 | - for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { | |
16327 | - if (pgd_none(*pgd)) | |
16328 | - continue; | |
16329 | - pud = pud_offset(pgd, 0); | |
16330 | - if (PTRS_PER_PUD > 1) /* not folded */ | |
16331 | - seq = pgd_walk_set_prot(pud,flags,cpu,seq); | |
16332 | - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { | |
16333 | - if (pud_none(*pud)) | |
16334 | - continue; | |
16335 | - pmd = pmd_offset(pud, 0); | |
16336 | - if (PTRS_PER_PMD > 1) /* not folded */ | |
16337 | - seq = pgd_walk_set_prot(pmd,flags,cpu,seq); | |
16338 | - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { | |
16339 | - if (pmd_none(*pmd)) | |
16340 | - continue; | |
16341 | - pte = pte_offset_kernel(pmd,0); | |
16342 | - seq = pgd_walk_set_prot(pte,flags,cpu,seq); | |
16343 | - } | |
16344 | - } | |
16345 | - } | |
16346 | - | |
16347 | - mcl = per_cpu(pb_mcl, cpu); | |
16348 | - if (unlikely(seq > PIN_BATCH - 2)) { | |
16349 | - if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL))) | |
16350 | - BUG(); | |
16351 | - seq = 0; | |
16352 | - } | |
16353 | - MULTI_update_va_mapping(mcl + seq, | |
16354 | - (unsigned long)__user_pgd(pgd_base), | |
16355 | - pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags), | |
16356 | - 0); | |
16357 | - MULTI_update_va_mapping(mcl + seq + 1, | |
16358 | - (unsigned long)pgd_base, | |
16359 | - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), | |
16360 | - UVMF_TLB_FLUSH); | |
16361 | - if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL))) | |
16362 | - BUG(); | |
16363 | - | |
16364 | - put_cpu(); | |
16365 | -} | |
16366 | - | |
16367 | -static void __pgd_pin(pgd_t *pgd) | |
16368 | -{ | |
16369 | - pgd_walk(pgd, PAGE_KERNEL_RO); | |
16370 | - xen_pgd_pin(__pa(pgd)); /* kernel */ | |
16371 | - xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */ | |
16372 | - SetPagePinned(virt_to_page(pgd)); | |
16373 | -} | |
16374 | - | |
16375 | -static void __pgd_unpin(pgd_t *pgd) | |
16376 | -{ | |
16377 | - xen_pgd_unpin(__pa(pgd)); | |
16378 | - xen_pgd_unpin(__pa(__user_pgd(pgd))); | |
16379 | - pgd_walk(pgd, PAGE_KERNEL); | |
16380 | - ClearPagePinned(virt_to_page(pgd)); | |
16381 | -} | |
16382 | - | |
16383 | -void pgd_test_and_unpin(pgd_t *pgd) | |
16384 | -{ | |
16385 | - if (PagePinned(virt_to_page(pgd))) | |
16386 | - __pgd_unpin(pgd); | |
16387 | -} | |
16388 | - | |
16389 | -void mm_pin(struct mm_struct *mm) | |
16390 | -{ | |
16391 | - if (xen_feature(XENFEAT_writable_page_tables)) | |
16392 | - return; | |
16393 | - | |
16394 | - pin_lock(mm); | |
16395 | - __pgd_pin(mm->pgd); | |
16396 | - pin_unlock(mm); | |
16397 | -} | |
16398 | - | |
16399 | -void mm_unpin(struct mm_struct *mm) | |
16400 | -{ | |
16401 | - if (xen_feature(XENFEAT_writable_page_tables)) | |
16402 | - return; | |
16403 | - | |
16404 | - pin_lock(mm); | |
16405 | - __pgd_unpin(mm->pgd); | |
16406 | - pin_unlock(mm); | |
16407 | -} | |
16408 | - | |
16409 | -void mm_pin_all(void) | |
16410 | -{ | |
16411 | - struct page *page; | |
16412 | - unsigned long flags; | |
16413 | - | |
16414 | - if (xen_feature(XENFEAT_writable_page_tables)) | |
16415 | - return; | |
16416 | - | |
16417 | - /* | |
16418 | - * Allow uninterrupted access to the pgd_list. Also protects | |
16419 | - * __pgd_pin() by disabling preemption. | |
16420 | - * All other CPUs must be at a safe point (e.g., in stop_machine | |
16421 | - * or offlined entirely). | |
16422 | - */ | |
16423 | - spin_lock_irqsave(&pgd_lock, flags); | |
16424 | - list_for_each_entry(page, &pgd_list, lru) { | |
16425 | - if (!PagePinned(page)) | |
16426 | - __pgd_pin((pgd_t *)page_address(page)); | |
16427 | - } | |
16428 | - spin_unlock_irqrestore(&pgd_lock, flags); | |
16429 | -} | |
16430 | - | |
16431 | -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | |
16432 | -{ | |
16433 | - if (!PagePinned(virt_to_page(mm->pgd))) | |
16434 | - mm_pin(mm); | |
16435 | -} | |
16436 | - | |
16437 | -void arch_exit_mmap(struct mm_struct *mm) | |
16438 | -{ | |
16439 | - struct task_struct *tsk = current; | |
16440 | - | |
16441 | - task_lock(tsk); | |
16442 | - | |
16443 | - /* | |
16444 | - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() | |
16445 | - * *much* faster this way, as no tlb flushes means bigger wrpt batches. | |
16446 | - */ | |
16447 | - if (tsk->active_mm == mm) { | |
16448 | - tsk->active_mm = &init_mm; | |
16449 | - atomic_inc(&init_mm.mm_count); | |
16450 | - | |
16451 | - switch_mm(mm, &init_mm, tsk); | |
16452 | - | |
16453 | - atomic_dec(&mm->mm_count); | |
16454 | - BUG_ON(atomic_read(&mm->mm_count) == 0); | |
16455 | - } | |
16456 | - | |
16457 | - task_unlock(tsk); | |
16458 | - | |
16459 | - if (PagePinned(virt_to_page(mm->pgd)) | |
16460 | - && (atomic_read(&mm->mm_count) == 1) | |
16461 | - && !mm->context.has_foreign_mappings) | |
16462 | - mm_unpin(mm); | |
16463 | -} | |
16464 | - | |
16465 | -static void _pte_free(struct page *page, unsigned int order) | |
16466 | -{ | |
16467 | - BUG_ON(order); | |
16468 | - pte_free(page); | |
16469 | -} | |
16470 | - | |
16471 | -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) | |
16472 | -{ | |
16473 | - struct page *pte; | |
16474 | - | |
16475 | - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); | |
16476 | - if (pte) { | |
16477 | - SetPageForeign(pte, _pte_free); | |
16478 | - init_page_count(pte); | |
16479 | - } | |
16480 | - return pte; | |
16481 | -} | |
16482 | - | |
16483 | -void pte_free(struct page *pte) | |
16484 | -{ | |
16485 | - unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT); | |
16486 | - | |
16487 | - if (!pte_write(*virt_to_ptep(va))) | |
16488 | - if (HYPERVISOR_update_va_mapping( | |
16489 | - va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0)) | |
16490 | - BUG(); | |
16491 | - | |
16492 | - ClearPageForeign(pte); | |
16493 | - init_page_count(pte); | |
16494 | - | |
16495 | - __free_page(pte); | |
16496 | -} | |
16497 | -#endif /* CONFIG_XEN */ | |
16498 | - | |
16499 | -pte_t *lookup_address(unsigned long address) | |
16500 | -{ | |
16501 | - pgd_t *pgd = pgd_offset_k(address); | |
16502 | - pud_t *pud; | |
16503 | - pmd_t *pmd; | |
16504 | - pte_t *pte; | |
16505 | - if (pgd_none(*pgd)) | |
16506 | - return NULL; | |
16507 | - pud = pud_offset(pgd, address); | |
16508 | - if (!pud_present(*pud)) | |
16509 | - return NULL; | |
16510 | - pmd = pmd_offset(pud, address); | |
16511 | - if (!pmd_present(*pmd)) | |
16512 | - return NULL; | |
16513 | - if (pmd_large(*pmd)) | |
16514 | - return (pte_t *)pmd; | |
16515 | - pte = pte_offset_kernel(pmd, address); | |
16516 | - if (pte && !pte_present(*pte)) | |
16517 | - pte = NULL; | |
16518 | - return pte; | |
16519 | -} | |
16520 | - | |
16521 | -static struct page *split_large_page(unsigned long address, pgprot_t prot, | |
16522 | - pgprot_t ref_prot) | |
16523 | -{ | |
16524 | - int i; | |
16525 | - unsigned long addr; | |
16526 | - struct page *base = alloc_pages(GFP_KERNEL, 0); | |
16527 | - pte_t *pbase; | |
16528 | - if (!base) | |
16529 | - return NULL; | |
16530 | - /* | |
16531 | - * page_private is used to track the number of entries in | |
16532 | - * the page table page have non standard attributes. | |
16533 | - */ | |
16534 | - SetPagePrivate(base); | |
16535 | - page_private(base) = 0; | |
16536 | - | |
16537 | - address = __pa(address); | |
16538 | - addr = address & LARGE_PAGE_MASK; | |
16539 | - pbase = (pte_t *)page_address(base); | |
16540 | - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { | |
16541 | - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, | |
16542 | - addr == address ? prot : ref_prot); | |
16543 | - } | |
16544 | - return base; | |
16545 | -} | |
16546 | - | |
16547 | -void clflush_cache_range(void *adr, int size) | |
16548 | -{ | |
16549 | - int i; | |
16550 | - for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size) | |
16551 | - clflush(adr+i); | |
16552 | -} | |
16553 | - | |
16554 | -static void flush_kernel_map(void *arg) | |
16555 | -{ | |
16556 | - struct list_head *l = (struct list_head *)arg; | |
16557 | - struct page *pg; | |
16558 | - | |
16559 | - /* When clflush is available always use it because it is | |
16560 | - much cheaper than WBINVD. */ | |
16561 | - /* clflush is still broken. Disable for now. */ | |
16562 | - if (1 || !cpu_has_clflush) | |
16563 | - asm volatile("wbinvd" ::: "memory"); | |
16564 | - else list_for_each_entry(pg, l, lru) { | |
16565 | - void *adr = page_address(pg); | |
16566 | - clflush_cache_range(adr, PAGE_SIZE); | |
16567 | - } | |
16568 | - __flush_tlb_all(); | |
16569 | -} | |
16570 | - | |
16571 | -static inline void flush_map(struct list_head *l) | |
16572 | -{ | |
16573 | - on_each_cpu(flush_kernel_map, l, 1, 1); | |
16574 | -} | |
16575 | - | |
16576 | -static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ | |
16577 | - | |
16578 | -static inline void save_page(struct page *fpage) | |
16579 | -{ | |
16580 | - if (!test_and_set_bit(PG_arch_1, &fpage->flags)) | |
16581 | - list_add(&fpage->lru, &deferred_pages); | |
16582 | -} | |
16583 | - | |
16584 | -/* | |
16585 | - * No more special protections in this 2/4MB area - revert to a | |
16586 | - * large page again. | |
16587 | - */ | |
16588 | -static void revert_page(unsigned long address, pgprot_t ref_prot) | |
16589 | -{ | |
16590 | - pgd_t *pgd; | |
16591 | - pud_t *pud; | |
16592 | - pmd_t *pmd; | |
16593 | - pte_t large_pte; | |
16594 | - unsigned long pfn; | |
16595 | - | |
16596 | - pgd = pgd_offset_k(address); | |
16597 | - BUG_ON(pgd_none(*pgd)); | |
16598 | - pud = pud_offset(pgd,address); | |
16599 | - BUG_ON(pud_none(*pud)); | |
16600 | - pmd = pmd_offset(pud, address); | |
16601 | - BUG_ON(__pmd_val(*pmd) & _PAGE_PSE); | |
16602 | - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; | |
16603 | - large_pte = pfn_pte(pfn, ref_prot); | |
16604 | - large_pte = pte_mkhuge(large_pte); | |
16605 | - set_pte((pte_t *)pmd, large_pte); | |
16606 | -} | |
16607 | - | |
16608 | -static int | |
16609 | -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, | |
16610 | - pgprot_t ref_prot) | |
16611 | -{ | |
16612 | - pte_t *kpte; | |
16613 | - struct page *kpte_page; | |
16614 | - pgprot_t ref_prot2; | |
16615 | - | |
16616 | - kpte = lookup_address(address); | |
16617 | - if (!kpte) return 0; | |
16618 | - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); | |
16619 | - BUG_ON(PageLRU(kpte_page)); | |
16620 | - BUG_ON(PageCompound(kpte_page)); | |
16621 | - if (pgprot_val(prot) != pgprot_val(ref_prot)) { | |
16622 | - if (!pte_huge(*kpte)) { | |
16623 | - set_pte(kpte, pfn_pte(pfn, prot)); | |
16624 | - } else { | |
16625 | - /* | |
16626 | - * split_large_page will take the reference for this | |
16627 | - * change_page_attr on the split page. | |
16628 | - */ | |
16629 | - struct page *split; | |
16630 | - ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); | |
16631 | - split = split_large_page(address, prot, ref_prot2); | |
16632 | - if (!split) | |
16633 | - return -ENOMEM; | |
16634 | - pgprot_val(ref_prot2) &= ~_PAGE_NX; | |
16635 | - set_pte(kpte, mk_pte(split, ref_prot2)); | |
16636 | - kpte_page = split; | |
16637 | - } | |
16638 | - page_private(kpte_page)++; | |
16639 | - } else if (!pte_huge(*kpte)) { | |
16640 | - set_pte(kpte, pfn_pte(pfn, ref_prot)); | |
16641 | - BUG_ON(page_private(kpte_page) == 0); | |
16642 | - page_private(kpte_page)--; | |
16643 | - } else | |
16644 | - BUG(); | |
16645 | - | |
16646 | - /* on x86-64 the direct mapping set at boot is not using 4k pages */ | |
16647 | - /* | |
16648 | - * ..., but the XEN guest kernels (currently) do: | |
16649 | - * If the pte was reserved, it means it was created at boot | |
16650 | - * time (not via split_large_page) and in turn we must not | |
16651 | - * replace it with a large page. | |
16652 | - */ | |
16653 | -#ifndef CONFIG_XEN | |
16654 | - BUG_ON(PageReserved(kpte_page)); | |
16655 | -#else | |
16656 | - if (PageReserved(kpte_page)) | |
16657 | - return 0; | |
16658 | -#endif | |
16659 | - | |
16660 | - save_page(kpte_page); | |
16661 | - if (page_private(kpte_page) == 0) | |
16662 | - revert_page(address, ref_prot); | |
16663 | - return 0; | |
16664 | -} | |
16665 | - | |
16666 | -/* | |
16667 | - * Change the page attributes of an page in the linear mapping. | |
16668 | - * | |
16669 | - * This should be used when a page is mapped with a different caching policy | |
16670 | - * than write-back somewhere - some CPUs do not like it when mappings with | |
16671 | - * different caching policies exist. This changes the page attributes of the | |
16672 | - * in kernel linear mapping too. | |
16673 | - * | |
16674 | - * The caller needs to ensure that there are no conflicting mappings elsewhere. | |
16675 | - * This function only deals with the kernel linear map. | |
16676 | - * | |
16677 | - * Caller must call global_flush_tlb() after this. | |
16678 | - */ | |
16679 | -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) | |
16680 | -{ | |
16681 | - int err = 0, kernel_map = 0; | |
16682 | - int i; | |
16683 | - | |
16684 | - if (address >= __START_KERNEL_map | |
16685 | - && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { | |
16686 | - address = (unsigned long)__va(__pa(address)); | |
16687 | - kernel_map = 1; | |
16688 | - } | |
16689 | - | |
16690 | - down_write(&init_mm.mmap_sem); | |
16691 | - for (i = 0; i < numpages; i++, address += PAGE_SIZE) { | |
16692 | - unsigned long pfn = __pa(address) >> PAGE_SHIFT; | |
16693 | - | |
16694 | - if (!kernel_map || pte_present(pfn_pte(0, prot))) { | |
16695 | - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); | |
16696 | - if (err) | |
16697 | - break; | |
16698 | - } | |
16699 | - /* Handle kernel mapping too which aliases part of the | |
16700 | - * lowmem */ | |
16701 | - if (__pa(address) < KERNEL_TEXT_SIZE) { | |
16702 | - unsigned long addr2; | |
16703 | - pgprot_t prot2; | |
16704 | - addr2 = __START_KERNEL_map + __pa(address); | |
16705 | - /* Make sure the kernel mappings stay executable */ | |
16706 | - prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); | |
16707 | - err = __change_page_attr(addr2, pfn, prot2, | |
16708 | - PAGE_KERNEL_EXEC); | |
16709 | - } | |
16710 | - } | |
16711 | - up_write(&init_mm.mmap_sem); | |
16712 | - return err; | |
16713 | -} | |
16714 | - | |
16715 | -/* Don't call this for MMIO areas that may not have a mem_map entry */ | |
16716 | -int change_page_attr(struct page *page, int numpages, pgprot_t prot) | |
16717 | -{ | |
16718 | - unsigned long addr = (unsigned long)page_address(page); | |
16719 | - return change_page_attr_addr(addr, numpages, prot); | |
16720 | -} | |
16721 | - | |
16722 | -void global_flush_tlb(void) | |
16723 | -{ | |
16724 | - struct page *pg, *next; | |
16725 | - struct list_head l; | |
16726 | - | |
16727 | - /* | |
16728 | - * Write-protect the semaphore, to exclude two contexts | |
16729 | - * doing a list_replace_init() call in parallel and to | |
16730 | - * exclude new additions to the deferred_pages list: | |
16731 | - */ | |
16732 | - down_write(&init_mm.mmap_sem); | |
16733 | - list_replace_init(&deferred_pages, &l); | |
16734 | - up_write(&init_mm.mmap_sem); | |
16735 | - | |
16736 | - flush_map(&l); | |
16737 | - | |
16738 | - list_for_each_entry_safe(pg, next, &l, lru) { | |
16739 | - list_del(&pg->lru); | |
16740 | - clear_bit(PG_arch_1, &pg->flags); | |
16741 | - if (page_private(pg) != 0) | |
16742 | - continue; | |
16743 | - ClearPagePrivate(pg); | |
16744 | - __free_page(pg); | |
16745 | - } | |
16746 | -} | |
16747 | - | |
16748 | -EXPORT_SYMBOL(change_page_attr); | |
16749 | -EXPORT_SYMBOL(global_flush_tlb); | |
16750 | --- /dev/null | |
16751 | +++ b/arch/x86/mm/pageattr-xen.c | |
16752 | @@ -0,0 +1,1412 @@ | |
16753 | +/* | |
16754 | + * Copyright 2002 Andi Kleen, SuSE Labs. | |
16755 | + * Thanks to Ben LaHaise for precious feedback. | |
16756 | + */ | |
16757 | +#include <linux/highmem.h> | |
16758 | +#include <linux/bootmem.h> | |
16759 | +#include <linux/module.h> | |
16760 | +#include <linux/sched.h> | |
16761 | +#include <linux/slab.h> | |
16762 | +#include <linux/mm.h> | |
16763 | +#include <linux/interrupt.h> | |
16764 | + | |
16765 | +#include <asm/e820.h> | |
16766 | +#include <asm/processor.h> | |
16767 | +#include <asm/tlbflush.h> | |
16768 | +#include <asm/sections.h> | |
16769 | +#include <asm/uaccess.h> | |
16770 | +#include <asm/pgalloc.h> | |
16771 | +#include <asm/proto.h> | |
16772 | +#include <asm/mmu_context.h> | |
16773 | + | |
16774 | +#ifndef CONFIG_X86_64 | |
16775 | +#define TASK_SIZE64 TASK_SIZE | |
16776 | +#endif | |
16777 | + | |
16778 | +static void _pin_lock(struct mm_struct *mm, int lock) { | |
16779 | + if (lock) | |
16780 | + spin_lock(&mm->page_table_lock); | |
16781 | +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | |
16782 | + /* While mm->page_table_lock protects us against insertions and | |
16783 | + * removals of higher level page table pages, it doesn't protect | |
16784 | + * against updates of PTEs. Such updates, however, require the | |
16785 | + * pte pages to be in a consistent state (unpinned+writable or | |
16786 | + * pinned+readonly). The pinning and attribute changes, however, | |
16787 | + * cannot be done atomically, which is why such updates must be | |
16788 | + * prevented from happening concurrently. | |
16789 | + * Note that no pte lock is ever acquired elsewhere nesting | |
16790 | + * with an already acquired one in the same mm, or with the mm's | |
16791 | + * page_table_lock already acquired, as that would break in the | |
16792 | + * non-split case (where all of these actually resolve to the | |
16793 | + * one page_table_lock). Thus acquiring all of them here cannot | |
16794 | + * result in deadlocks, and the order of acquisition | |
16795 | + * doesn't matter. | |
16796 | + */ | |
16797 | + { | |
16798 | + pgd_t *pgd = mm->pgd; | |
16799 | + unsigned g; | |
16800 | + | |
16801 | + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { | |
16802 | + pud_t *pud; | |
16803 | + unsigned u; | |
16804 | + | |
16805 | + if (pgd_none(*pgd)) | |
16806 | + continue; | |
16807 | + pud = pud_offset(pgd, 0); | |
16808 | + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { | |
16809 | + pmd_t *pmd; | |
16810 | + unsigned m; | |
16811 | + | |
16812 | + if (pud_none(*pud)) | |
16813 | + continue; | |
16814 | + pmd = pmd_offset(pud, 0); | |
16815 | + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { | |
16816 | + spinlock_t *ptl; | |
16817 | + | |
16818 | + if (pmd_none(*pmd)) | |
16819 | + continue; | |
16820 | + ptl = pte_lockptr(0, pmd); | |
16821 | + if (lock) | |
16822 | + spin_lock(ptl); | |
16823 | + else | |
16824 | + spin_unlock(ptl); | |
16825 | + } | |
16826 | + } | |
16827 | + } | |
16828 | + } | |
16829 | +#endif | |
16830 | + if (!lock) | |
16831 | + spin_unlock(&mm->page_table_lock); | |
16832 | +} | |
16833 | +#define pin_lock(mm) _pin_lock(mm, 1) | |
16834 | +#define pin_unlock(mm) _pin_lock(mm, 0) | |
16835 | + | |
16836 | +#define PIN_BATCH sizeof(void *) | |
16837 | +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); | |
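| + | |
| +/* | |
| + * PIN_BATCH is sizeof(void *): pb_mcl buffers up to 8 multicall entries | |
| + * per CPU on 64-bit (4 on 32-bit), so pgd_walk_set_prot() below issues a | |
| + * single hypercall whenever that many MMU updates have accumulated. | |
| + */ | |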
16838 | + | |
16839 | +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags, | |
16840 | + unsigned int cpu, unsigned int seq) | |
16841 | +{ | |
16842 | + unsigned long pfn = page_to_pfn(page); | |
16843 | + | |
16844 | + if (PageHighMem(page)) { | |
16845 | + if (pgprot_val(flags) & _PAGE_RW) | |
16846 | + ClearPagePinned(page); | |
16847 | + else | |
16848 | + SetPagePinned(page); | |
16849 | + } else { | |
16850 | + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, | |
16851 | + (unsigned long)__va(pfn << PAGE_SHIFT), | |
16852 | + pfn_pte(pfn, flags), 0); | |
16853 | + if (unlikely(++seq == PIN_BATCH)) { | |
16854 | + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), | |
16855 | + PIN_BATCH, NULL))) | |
16856 | + BUG(); | |
16857 | + seq = 0; | |
16858 | + } | |
16859 | + } | |
16860 | + | |
16861 | + return seq; | |
16862 | +} | |
16863 | + | |
16864 | +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) | |
16865 | +{ | |
16866 | + pgd_t *pgd = pgd_base; | |
16867 | + pud_t *pud; | |
16868 | + pmd_t *pmd; | |
16869 | + int g, u, m; | |
16870 | + unsigned int cpu, seq; | |
16871 | + multicall_entry_t *mcl; | |
16872 | + | |
16873 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
16874 | + return; | |
16875 | + | |
16876 | + cpu = get_cpu(); | |
16877 | + | |
16878 | + /* | |
16879 | + * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables | |
16880 | + * may not be the 'current' task's pagetables (e.g., current may be | |
16881 | + * 32-bit, but the pagetables may be for a 64-bit task). | |
16882 | + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct | |
16883 | + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE. | |
16884 | + */ | |
16885 | + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { | |
16886 | + if (pgd_none(*pgd)) | |
16887 | + continue; | |
16888 | + pud = pud_offset(pgd, 0); | |
16889 | + if (PTRS_PER_PUD > 1) /* not folded */ | |
16890 | + seq = pgd_walk_set_prot(virt_to_page(pud), flags, cpu, seq); | |
16891 | + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { | |
16892 | + if (pud_none(*pud)) | |
16893 | + continue; | |
16894 | + pmd = pmd_offset(pud, 0); | |
16895 | + if (PTRS_PER_PMD > 1) /* not folded */ | |
16896 | + seq = pgd_walk_set_prot(virt_to_page(pmd), flags, cpu, seq); | |
16897 | + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { | |
16898 | + if (pmd_none(*pmd)) | |
16899 | + continue; | |
16900 | + seq = pgd_walk_set_prot(pmd_page(*pmd), flags, cpu, seq); | |
16901 | + } | |
16902 | + } | |
16903 | + } | |
16904 | + | |
16905 | + mcl = per_cpu(pb_mcl, cpu); | |
16906 | +#ifdef CONFIG_X86_64 | |
16907 | + if (unlikely(seq > PIN_BATCH - 2)) { | |
16908 | + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL))) | |
16909 | + BUG(); | |
16910 | + seq = 0; | |
16911 | + } | |
16912 | + MULTI_update_va_mapping(mcl + seq, | |
16913 | + (unsigned long)__user_pgd(pgd_base), | |
16914 | + pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags), | |
16915 | + 0); | |
16916 | + MULTI_update_va_mapping(mcl + seq + 1, | |
16917 | + (unsigned long)pgd_base, | |
16918 | + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), | |
16919 | + UVMF_TLB_FLUSH); | |
16920 | + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL))) | |
16921 | + BUG(); | |
16922 | +#else | |
16923 | + if (likely(seq != 0)) { | |
16924 | + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, | |
16925 | + (unsigned long)pgd_base, | |
16926 | + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), | |
16927 | + UVMF_TLB_FLUSH); | |
16928 | + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), | |
16929 | + seq + 1, NULL))) | |
16930 | + BUG(); | |
16931 | + } else if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base, | |
16932 | + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), | |
16933 | + UVMF_TLB_FLUSH)) | |
16934 | + BUG(); | |
16935 | +#endif | |
16936 | + | |
16937 | + put_cpu(); | |
16938 | +} | |
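| + | |
| +/* | |
| + * pgd_walk() above re-protects every pagetable page reachable from | |
| + * pgd_base and finishes by remapping the pgd page(s) themselves; the | |
| + * final multicall entry carries UVMF_TLB_FLUSH so that a single flush | |
| + * covers the whole batch. | |
| + */ | |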
16939 | + | |
16940 | +static void __pgd_pin(pgd_t *pgd) | |
16941 | +{ | |
16942 | + pgd_walk(pgd, PAGE_KERNEL_RO); | |
16943 | + kmap_flush_unused(); | |
16944 | + xen_pgd_pin(__pa(pgd)); /* kernel */ | |
16945 | +#ifdef CONFIG_X86_64 | |
16946 | + xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */ | |
16947 | +#endif | |
16948 | + SetPagePinned(virt_to_page(pgd)); | |
16949 | +} | |
16950 | + | |
16951 | +static void __pgd_unpin(pgd_t *pgd) | |
16952 | +{ | |
16953 | + xen_pgd_unpin(__pa(pgd)); | |
16954 | +#ifdef CONFIG_X86_64 | |
16955 | + xen_pgd_unpin(__pa(__user_pgd(pgd))); | |
16956 | +#endif | |
16957 | + pgd_walk(pgd, PAGE_KERNEL); | |
16958 | + ClearPagePinned(virt_to_page(pgd)); | |
16959 | +} | |
16960 | + | |
16961 | +void pgd_test_and_unpin(pgd_t *pgd) | |
16962 | +{ | |
16963 | + if (PagePinned(virt_to_page(pgd))) | |
16964 | + __pgd_unpin(pgd); | |
16965 | +} | |
16966 | + | |
16967 | +void mm_pin(struct mm_struct *mm) | |
16968 | +{ | |
16969 | + if (xen_feature(XENFEAT_writable_page_tables)) | |
16970 | + return; | |
16971 | + | |
16972 | + pin_lock(mm); | |
16973 | + __pgd_pin(mm->pgd); | |
16974 | + pin_unlock(mm); | |
16975 | +} | |
16976 | + | |
16977 | +void mm_unpin(struct mm_struct *mm) | |
16978 | +{ | |
16979 | + if (xen_feature(XENFEAT_writable_page_tables)) | |
16980 | + return; | |
16981 | + | |
16982 | + pin_lock(mm); | |
16983 | + __pgd_unpin(mm->pgd); | |
16984 | + pin_unlock(mm); | |
16985 | +} | |
16986 | + | |
16987 | +void mm_pin_all(void) | |
16988 | +{ | |
16989 | + struct page *page; | |
16990 | + unsigned long flags; | |
16991 | + | |
16992 | + if (xen_feature(XENFEAT_writable_page_tables)) | |
16993 | + return; | |
16994 | + | |
16995 | + /* | |
16996 | + * Allow uninterrupted access to the pgd_list. Also protects | |
16997 | + * __pgd_pin() by disabling preemption. | |
16998 | + * All other CPUs must be at a safe point (e.g., in stop_machine | |
16999 | + * or offlined entirely). | |
17000 | + */ | |
17001 | + spin_lock_irqsave(&pgd_lock, flags); | |
17002 | + list_for_each_entry(page, &pgd_list, lru) { | |
17003 | + if (!PagePinned(page)) | |
17004 | + __pgd_pin((pgd_t *)page_address(page)); | |
17005 | + } | |
17006 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
17007 | +} | |
17008 | + | |
17009 | +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | |
17010 | +{ | |
17011 | + if (!PagePinned(virt_to_page(mm->pgd))) | |
17012 | + mm_pin(mm); | |
17013 | +} | |
17014 | + | |
17015 | +void arch_exit_mmap(struct mm_struct *mm) | |
17016 | +{ | |
17017 | + struct task_struct *tsk = current; | |
17018 | + | |
17019 | + task_lock(tsk); | |
17020 | + | |
17021 | + /* | |
17022 | + * We aggressively remove the defunct pgd from cr3: unmap_vmas() runs | |
17023 | + * *much* faster this way, as no TLB flushes means bigger wrpt batches. | |
17024 | + */ | |
17025 | + if (tsk->active_mm == mm) { | |
17026 | + tsk->active_mm = &init_mm; | |
17027 | + atomic_inc(&init_mm.mm_count); | |
17028 | + | |
17029 | + switch_mm(mm, &init_mm, tsk); | |
17030 | + | |
17031 | + atomic_dec(&mm->mm_count); | |
17032 | + BUG_ON(atomic_read(&mm->mm_count) == 0); | |
17033 | + } | |
17034 | + | |
17035 | + task_unlock(tsk); | |
17036 | + | |
17037 | + if (PagePinned(virt_to_page(mm->pgd)) | |
17038 | + && atomic_read(&mm->mm_count) == 1 | |
17039 | + && !mm->context.has_foreign_mappings) | |
17040 | + mm_unpin(mm); | |
17041 | +} | |
17042 | + | |
17043 | +static void _pte_free(struct page *page, unsigned int order) | |
17044 | +{ | |
17045 | + BUG_ON(order); | |
17046 | + __pte_free(page); | |
17047 | +} | |
17048 | + | |
17049 | +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) | |
17050 | +{ | |
17051 | + struct page *pte; | |
17052 | + | |
17053 | +#ifdef CONFIG_HIGHPTE | |
17054 | + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); | |
17055 | +#else | |
17056 | + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); | |
17057 | +#endif | |
17058 | + if (pte) { | |
17059 | + pgtable_page_ctor(pte); | |
17060 | + SetPageForeign(pte, _pte_free); | |
17061 | + init_page_count(pte); | |
17062 | + } | |
17063 | + return pte; | |
17064 | +} | |
17065 | + | |
17066 | +void __pte_free(pgtable_t pte) | |
17067 | +{ | |
17068 | + if (!PageHighMem(pte)) { | |
17069 | + unsigned long va = (unsigned long)page_address(pte); | |
17070 | + unsigned int level; | |
17071 | + pte_t *ptep = lookup_address(va, &level); | |
17072 | + | |
17073 | + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep)); | |
17074 | + if (!pte_write(*ptep) | |
17075 | + && HYPERVISOR_update_va_mapping(va, | |
17076 | + mk_pte(pte, PAGE_KERNEL), | |
17077 | + 0)) | |
17078 | + BUG(); | |
17079 | + } else | |
17080 | +#ifdef CONFIG_HIGHPTE | |
17081 | + ClearPagePinned(pte); | |
17082 | +#else | |
17083 | + BUG(); | |
17084 | +#endif | |
17085 | + | |
17086 | + ClearPageForeign(pte); | |
17087 | + init_page_count(pte); | |
17088 | + pgtable_page_dtor(pte); | |
17089 | + __free_page(pte); | |
17090 | +} | |
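| + | |
| +/* | |
| + * Note on __pte_free(): a pte page may still be mapped read-only | |
| + * (pinned) when it is freed, so it is remapped PAGE_KERNEL first and | |
| + * goes back to the allocator writable; highmem pte pages have no | |
| + * kernel mapping to fix up and only need their Pinned flag cleared. | |
| + */ | |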
17091 | + | |
17092 | +#if PAGETABLE_LEVELS >= 3 | |
17093 | +static void _pmd_free(struct page *page, unsigned int order) | |
17094 | +{ | |
17095 | + BUG_ON(order); | |
17096 | + __pmd_free(page); | |
17097 | +} | |
17098 | + | |
17099 | +pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) | |
17100 | +{ | |
17101 | + struct page *pmd; | |
17102 | + | |
17103 | + pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); | |
17104 | + if (!pmd) | |
17105 | + return NULL; | |
17106 | + SetPageForeign(pmd, _pmd_free); | |
17107 | + init_page_count(pmd); | |
17108 | + return page_address(pmd); | |
17109 | +} | |
17110 | + | |
17111 | +void __pmd_free(pgtable_t pmd) | |
17112 | +{ | |
17113 | + unsigned long va = (unsigned long)page_address(pmd); | |
17114 | + unsigned int level; | |
17115 | + pte_t *ptep = lookup_address(va, &level); | |
17116 | + | |
17117 | + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep)); | |
17118 | + if (!pte_write(*ptep) | |
17119 | + && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0)) | |
17120 | + BUG(); | |
17121 | + | |
17122 | + ClearPageForeign(pmd); | |
17123 | + init_page_count(pmd); | |
17124 | + __free_page(pmd); | |
17125 | +} | |
17126 | +#endif | |
17127 | + | |
17128 | +/* blktap and gntdev need this, as otherwise they would implicitly (and | |
17129 | + * needlessly, as they never use it) reference init_mm. */ | |
17130 | +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma, | |
17131 | + unsigned long addr, pte_t *ptep, int full) | |
17132 | +{ | |
17133 | + return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full); | |
17134 | +} | |
17135 | +EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full); | |
17136 | + | |
17137 | +/* | |
17138 | + * The current flushing context - we pass it instead of 5 arguments: | |
17139 | + */ | |
17140 | +struct cpa_data { | |
17141 | + unsigned long vaddr; | |
17142 | + pgprot_t mask_set; | |
17143 | + pgprot_t mask_clr; | |
17144 | + int numpages; | |
17145 | + int flushtlb; | |
17146 | + unsigned long pfn; | |
17147 | +}; | |
17148 | + | |
17149 | +#ifdef CONFIG_X86_64 | |
17150 | + | |
17151 | +static inline unsigned long highmap_start_pfn(void) | |
17152 | +{ | |
17153 | + return __pa(_text) >> PAGE_SHIFT; | |
17154 | +} | |
17155 | + | |
17156 | +static inline unsigned long highmap_end_pfn(void) | |
17157 | +{ | |
17158 | + return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; | |
17159 | +} | |
17160 | + | |
17161 | +#endif | |
17162 | + | |
17163 | +#ifdef CONFIG_DEBUG_PAGEALLOC | |
17164 | +# define debug_pagealloc 1 | |
17165 | +#else | |
17166 | +# define debug_pagealloc 0 | |
17167 | +#endif | |
17168 | + | |
17169 | +static inline int | |
17170 | +within(unsigned long addr, unsigned long start, unsigned long end) | |
17171 | +{ | |
17172 | + return addr >= start && addr < end; | |
17173 | +} | |
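| + | |
| +/* | |
| + * Example: within(0x1000, 0x1000, 0x2000) is 1, while | |
| + * within(0x2000, 0x1000, 0x2000) is 0: the end is exclusive. | |
| + */ | |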
17174 | + | |
17175 | +/* | |
17176 | + * Flushing functions | |
17177 | + */ | |
17178 | + | |
17179 | +/** | |
17180 | + * clflush_cache_range - flush a cache range with clflush | |
17181 | + * @addr: virtual start address | |
17182 | + * @size: number of bytes to flush | |
17183 | + * | |
17184 | + * clflush is an unordered instruction which needs fencing with mfence | |
17185 | + * to avoid ordering issues. | |
17186 | + */ | |
17187 | +void clflush_cache_range(void *vaddr, unsigned int size) | |
17188 | +{ | |
17189 | + void *vend = vaddr + size - 1; | |
17190 | + | |
17191 | + mb(); | |
17192 | + | |
17193 | + for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) | |
17194 | + clflush(vaddr); | |
17195 | + /* | |
17196 | + * Flush any possible final partial cacheline: | |
17197 | + */ | |
17198 | + clflush(vend); | |
17199 | + | |
17200 | + mb(); | |
17201 | +} | |
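| + | |
| +/* | |
| + * Worked example: with x86_clflush_size == 64, flushing 100 bytes at | |
| + * vaddr issues clflush(vaddr) and clflush(vaddr + 64) from the loop, | |
| + * plus clflush(vend) for the line holding the final byte; the mb() | |
| + * pair fences the otherwise unordered clflushes. | |
| + */ | |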
17202 | + | |
17203 | +static void __cpa_flush_all(void *arg) | |
17204 | +{ | |
17205 | + unsigned long cache = (unsigned long)arg; | |
17206 | + | |
17207 | + /* | |
17208 | + * Flush all to work around errata in early Athlons regarding | |
17209 | + * large page flushing. | |
17210 | + */ | |
17211 | + __flush_tlb_all(); | |
17212 | + | |
17213 | + if (cache && boot_cpu_data.x86_model >= 4) | |
17214 | + wbinvd(); | |
17215 | +} | |
17216 | + | |
17217 | +static void cpa_flush_all(unsigned long cache) | |
17218 | +{ | |
17219 | + BUG_ON(irqs_disabled()); | |
17220 | + | |
17221 | + on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); | |
17222 | +} | |
17223 | + | |
17224 | +static void __cpa_flush_range(void *arg) | |
17225 | +{ | |
17226 | + /* | |
17227 | + * We could optimize this further and do individual per-page | |
17228 | + * TLB invalidates for a low number of pages. Caveat: we must | |
17229 | + * flush the high aliases on 64-bit as well. | |
17230 | + */ | |
17231 | + __flush_tlb_all(); | |
17232 | +} | |
17233 | + | |
17234 | +static void cpa_flush_range(unsigned long start, int numpages, int cache) | |
17235 | +{ | |
17236 | + unsigned int i, level; | |
17237 | + unsigned long addr; | |
17238 | + | |
17239 | + BUG_ON(irqs_disabled()); | |
17240 | + WARN_ON(PAGE_ALIGN(start) != start); | |
17241 | + | |
17242 | + on_each_cpu(__cpa_flush_range, NULL, 1, 1); | |
17243 | + | |
17244 | + if (!cache) | |
17245 | + return; | |
17246 | + | |
17247 | + /* | |
17248 | + * We only need to flush on one CPU: | |
17249 | + * clflush is a MESI-coherent instruction that | |
17250 | + * will cause all other CPUs to flush the same | |
17251 | + * cachelines: | |
17252 | + */ | |
17253 | + for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { | |
17254 | + pte_t *pte = lookup_address(addr, &level); | |
17255 | + | |
17256 | + /* | |
17257 | + * Only flush present addresses: | |
17258 | + */ | |
17259 | + if (pte && (__pte_val(*pte) & _PAGE_PRESENT)) | |
17260 | + clflush_cache_range((void *) addr, PAGE_SIZE); | |
17261 | + } | |
17262 | +} | |
17263 | + | |
17264 | +/* | |
17265 | + * Certain areas of memory on x86 require very specific protection flags, | |
17266 | + * for example the BIOS area or kernel text. Callers don't always get this | |
17267 | + * right (again, ioremap() on BIOS memory is not uncommon) so this function | |
17268 | + * checks and fixes these known static required protection bits. | |
17269 | + */ | |
17270 | +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, | |
17271 | + unsigned long pfn) | |
17272 | +{ | |
17273 | + pgprot_t forbidden = __pgprot(0); | |
17274 | + | |
17275 | +#ifndef CONFIG_XEN | |
17276 | + /* | |
17277 | + * The BIOS area between 640k and 1Mb needs to be executable for | |
17278 | + * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. | |
17279 | + */ | |
17280 | + if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) | |
17281 | + pgprot_val(forbidden) |= _PAGE_NX; | |
17282 | +#endif | |
17283 | + | |
17284 | + /* | |
17285 | + * The kernel text needs to be executable for obvious reasons. | |
17286 | + * This does not cover __inittext, since that is gone later on. On | |
17287 | + * 64-bit we do not enforce !NX on the low mapping. | |
17288 | + */ | |
17289 | + if (within(address, (unsigned long)_text, (unsigned long)_etext)) | |
17290 | + pgprot_val(forbidden) |= _PAGE_NX; | |
17291 | + | |
17292 | + /* | |
17293 | + * The .rodata section needs to be read-only. Using the pfn | |
17294 | + * catches all aliases. | |
17295 | + */ | |
17296 | + if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, | |
17297 | + __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) | |
17298 | + pgprot_val(forbidden) |= _PAGE_RW; | |
17299 | + | |
17300 | + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); | |
17301 | + | |
17302 | + return prot; | |
17303 | +} | |
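| + | |
| +/* | |
| + * Example: a request to set _PAGE_RW on a pfn inside .rodata is | |
| + * silently dropped here, since _PAGE_RW is accumulated into | |
| + * 'forbidden' for that range and masked out of the returned pgprot. | |
| + */ | |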
17304 | + | |
17305 | +/* | |
17306 | + * Lookup the page table entry for a virtual address. Return a pointer | |
17307 | + * to the entry and the level of the mapping. | |
17308 | + * | |
17309 | + * Note: We return pud and pmd either when the entry is marked large | |
17310 | + * or when the present bit is not set. Otherwise we would return a | |
17311 | + * pointer to a nonexistent mapping. | |
17312 | + */ | |
17313 | +pte_t *lookup_address(unsigned long address, unsigned int *level) | |
17314 | +{ | |
17315 | + pgd_t *pgd = pgd_offset_k(address); | |
17316 | + pud_t *pud; | |
17317 | + pmd_t *pmd; | |
17318 | + | |
17319 | + *level = PG_LEVEL_NONE; | |
17320 | + | |
17321 | + if (pgd_none(*pgd)) | |
17322 | + return NULL; | |
17323 | + | |
17324 | + pud = pud_offset(pgd, address); | |
17325 | + if (pud_none(*pud)) | |
17326 | + return NULL; | |
17327 | + | |
17328 | + *level = PG_LEVEL_1G; | |
17329 | + if (pud_large(*pud) || !pud_present(*pud)) | |
17330 | + return (pte_t *)pud; | |
17331 | + | |
17332 | + pmd = pmd_offset(pud, address); | |
17333 | + if (pmd_none(*pmd)) | |
17334 | + return NULL; | |
17335 | + | |
17336 | + *level = PG_LEVEL_2M; | |
17337 | + if (pmd_large(*pmd) || !pmd_present(*pmd)) | |
17338 | + return (pte_t *)pmd; | |
17339 | + | |
17340 | + *level = PG_LEVEL_4K; | |
17341 | + | |
17342 | + return pte_offset_kernel(pmd, address); | |
17343 | +} | |
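| + | |
| +/* | |
| + * Typical use (sketch; 'vaddr' is an arbitrary kernel virtual address): | |
| + * | |
| + *	unsigned int level; | |
| + *	pte_t *pte = lookup_address(vaddr, &level); | |
| + * | |
| + *	if (pte && level == PG_LEVEL_4K && (__pte_val(*pte) & _PAGE_PRESENT)) | |
| + *		... operate on the 4k entry ... | |
| + * | |
| + * Callers must check 'level' before dereferencing, since for 2M/1G | |
| + * mappings the returned pointer actually refers to a pmd/pud entry. | |
| + */ | |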
17344 | + | |
17345 | +/* | |
17346 | + * Set the new pmd in all the pgds we know about: | |
17347 | + */ | |
17348 | +static void __set_pmd_pte(pte_t *kpte, unsigned long address, | |
17349 | + unsigned int level, pte_t pte) | |
17350 | +{ | |
17351 | + /* change init_mm */ | |
17352 | + switch (level) { | |
17353 | + case PG_LEVEL_2M: | |
17354 | + xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte))); | |
17355 | + break; | |
17356 | +#ifdef CONFIG_X86_64 | |
17357 | + case PG_LEVEL_1G: | |
17358 | + xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte))); | |
17359 | + break; | |
17360 | +#endif | |
17361 | + default: | |
17362 | + BUG(); | |
17363 | + } | |
17364 | +#ifdef CONFIG_X86_32 | |
17365 | + if (!SHARED_KERNEL_PMD) { | |
17366 | + struct page *page; | |
17367 | + | |
17368 | + list_for_each_entry(page, &pgd_list, lru) { | |
17369 | + pgd_t *pgd; | |
17370 | + pud_t *pud; | |
17371 | + pmd_t *pmd; | |
17372 | + | |
17373 | + pgd = (pgd_t *)page_address(page) + pgd_index(address); | |
17374 | + pud = pud_offset(pgd, address); | |
17375 | + pmd = pmd_offset(pud, address); | |
17376 | + xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte))); | |
17377 | + } | |
17378 | + } | |
17379 | +#endif | |
17380 | +} | |
17381 | + | |
17382 | +static int | |
17383 | +try_preserve_large_page(pte_t *kpte, unsigned long address, | |
17384 | + struct cpa_data *cpa) | |
17385 | +{ | |
17386 | + unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; | |
17387 | + pte_t new_pte, old_pte, *tmp; | |
17388 | + pgprot_t old_prot, new_prot; | |
17389 | + int i, do_split = 1; | |
17390 | + unsigned int level; | |
17391 | + | |
17392 | + spin_lock_irqsave(&pgd_lock, flags); | |
17393 | + /* | |
17394 | + * Check for races; another CPU might have split this page | |
17395 | + * up already: | |
17396 | + */ | |
17397 | + tmp = lookup_address(address, &level); | |
17398 | + if (tmp != kpte) | |
17399 | + goto out_unlock; | |
17400 | + | |
17401 | + switch (level) { | |
17402 | + case PG_LEVEL_2M: | |
17403 | + psize = PMD_PAGE_SIZE; | |
17404 | + pmask = PMD_PAGE_MASK; | |
17405 | + break; | |
17406 | +#ifdef CONFIG_X86_64 | |
17407 | + case PG_LEVEL_1G: | |
17408 | + psize = PUD_PAGE_SIZE; | |
17409 | + pmask = PUD_PAGE_MASK; | |
17410 | + break; | |
17411 | +#endif | |
17412 | + default: | |
17413 | + do_split = -EINVAL; | |
17414 | + goto out_unlock; | |
17415 | + } | |
17416 | + | |
17417 | + /* | |
17418 | + * Calculate the number of pages that fit into this large | |
17419 | + * page starting at address: | |
17420 | + */ | |
17421 | + nextpage_addr = (address + psize) & pmask; | |
17422 | + numpages = (nextpage_addr - address) >> PAGE_SHIFT; | |
17423 | + if (numpages < cpa->numpages) | |
17424 | + cpa->numpages = numpages; | |
17425 | + | |
17426 | + /* | |
17427 | + * We are safe now. Check whether the new pgprot is the same: | |
17428 | + */ | |
17429 | + old_pte = *kpte; | |
17430 | + old_prot = new_prot = pte_pgprot(old_pte); | |
17431 | + | |
17432 | + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); | |
17433 | + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); | |
17434 | + | |
17435 | + /* | |
17436 | + * old_pte points to the large page base address. So we need | |
17437 | + * to add the offset of the virtual address: | |
17438 | + */ | |
17439 | + pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); | |
17440 | + cpa->pfn = pfn; | |
17441 | + | |
17442 | + new_prot = static_protections(new_prot, address, pfn); | |
17443 | + | |
17444 | + /* | |
17445 | + * We need to check the full range to see whether | |
17446 | + * static_protections() requires a different pgprot for one of | |
17447 | + * the pages in the range we are trying to preserve: | |
17448 | + */ | |
17449 | + if (pfn < max_mapnr) { | |
17450 | + addr = address + PAGE_SIZE; | |
17451 | + for (i = 1; i < cpa->numpages && ++pfn < max_mapnr; | |
17452 | + i++, addr += PAGE_SIZE) { | |
17453 | + pgprot_t chk_prot = static_protections(new_prot, addr, pfn); | |
17454 | + | |
17455 | + if (pgprot_val(chk_prot) != pgprot_val(new_prot)) | |
17456 | + goto out_unlock; | |
17457 | + } | |
17458 | + } | |
17459 | + | |
17460 | + /* | |
17461 | + * If there are no changes, return. cpa->numpages has been updated | |
17462 | + * above: | |
17463 | + */ | |
17464 | + if (pgprot_val(new_prot) == pgprot_val(old_prot)) { | |
17465 | + do_split = 0; | |
17466 | + goto out_unlock; | |
17467 | + } | |
17468 | + | |
17469 | + /* | |
17470 | + * We need to change the attributes. Check whether we can | |
17471 | + * change the large page in one go. We request a split when | |
17472 | + * the address is not aligned or the number of pages is | |
17473 | + * smaller than the number of pages in the large page. Note | |
17474 | + * that we limited the number of possible pages already to | |
17475 | + * the number of pages in the large page. | |
17476 | + */ | |
17477 | + if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { | |
17478 | + /* | |
17479 | + * The address is aligned and the number of pages | |
17480 | + * covers the full page. | |
17481 | + */ | |
17482 | + new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot)); | |
17483 | + __set_pmd_pte(kpte, address, level, new_pte); | |
17484 | + cpa->flushtlb = 1; | |
17485 | + do_split = 0; | |
17486 | + } | |
17487 | + | |
17488 | +out_unlock: | |
17489 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
17490 | + | |
17491 | + return do_split; | |
17492 | +} | |
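| + | |
| +/* | |
| + * Example: a 2 MiB mapping covers 512 4k pages, so only a request that | |
| + * starts on the 2 MiB boundary and (after clamping) spans all 512 pages | |
| + * rewrites the pmd in place; anything smaller or misaligned returns | |
| + * do_split == 1 and forces split_large_page(). | |
| + */ | |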
17493 | + | |
17494 | +static LIST_HEAD(page_pool); | |
17495 | +static unsigned long pool_size, pool_pages, pool_low; | |
17496 | +static unsigned long pool_used, pool_failed; | |
17497 | + | |
17498 | +static void cpa_fill_pool(struct page **ret) | |
17499 | +{ | |
17500 | + gfp_t gfp = GFP_KERNEL; | |
17501 | + unsigned long flags; | |
17502 | + struct page *p; | |
17503 | + | |
17504 | + /* | |
17505 | + * Avoid recursion (on debug-pagealloc) and also signal | |
17506 | + * our priority to get to these pagetables: | |
17507 | + */ | |
17508 | + if (current->flags & PF_MEMALLOC) | |
17509 | + return; | |
17510 | + current->flags |= PF_MEMALLOC; | |
17511 | + | |
17512 | + /* | |
17513 | + * Allocate atomically from atomic contexts: | |
17514 | + */ | |
17515 | + if (in_atomic() || irqs_disabled() || debug_pagealloc) | |
17516 | + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; | |
17517 | + | |
17518 | + while (pool_pages < pool_size || (ret && !*ret)) { | |
17519 | + p = alloc_pages(gfp, 0); | |
17520 | + if (!p) { | |
17521 | + pool_failed++; | |
17522 | + break; | |
17523 | + } | |
17524 | + /* | |
17525 | + * If the call site needs a page right now, provide it: | |
17526 | + */ | |
17527 | + if (ret && !*ret) { | |
17528 | + *ret = p; | |
17529 | + continue; | |
17530 | + } | |
17531 | + spin_lock_irqsave(&pgd_lock, flags); | |
17532 | + list_add(&p->lru, &page_pool); | |
17533 | + pool_pages++; | |
17534 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
17535 | + } | |
17536 | + | |
17537 | + current->flags &= ~PF_MEMALLOC; | |
17538 | +} | |
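| + | |
| +/* | |
| + * The pool is refilled opportunistically: split_large_page() may drain | |
| + * it (possibly in atomic context), and callers such as | |
| + * change_page_attr_set_clr() and kernel_map_pages() top it back up to | |
| + * pool_size afterwards via cpa_fill_pool(NULL). | |
| + */ | |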
17539 | + | |
17540 | +#define SHIFT_MB (20 - PAGE_SHIFT) | |
17541 | +#define ROUND_MB_GB ((1 << 10) - 1) | |
17542 | +#define SHIFT_MB_GB 10 | |
17543 | +#define POOL_PAGES_PER_GB 16 | |
17544 | + | |
17545 | +void __init cpa_init(void) | |
17546 | +{ | |
17547 | + struct sysinfo si; | |
17548 | + unsigned long gb; | |
17549 | + | |
17550 | + si_meminfo(&si); | |
17551 | + /* | |
17552 | + * Calculate the number of pool pages: | |
17553 | + * | |
17554 | + * Convert totalram (nr of pages) to MiB and round to the next | |
17555 | + * GiB. Shift MiB to Gib and multiply the result by | |
17556 | + * POOL_PAGES_PER_GB: | |
17557 | + */ | |
17558 | + if (debug_pagealloc) { | |
17559 | + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; | |
17560 | + pool_size = POOL_PAGES_PER_GB * gb; | |
17561 | + } else { | |
17562 | + pool_size = 1; | |
17563 | + } | |
17564 | + pool_low = pool_size; | |
17565 | + | |
17566 | + cpa_fill_pool(NULL); | |
17567 | + printk(KERN_DEBUG | |
17568 | + "CPA: page pool initialized %lu of %lu pages preallocated\n", | |
17569 | + pool_pages, pool_size); | |
17570 | +} | |
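| + | |
| +/* | |
| + * Worked example: with debug_pagealloc and 4 GiB of RAM (si.totalram == | |
| + * 1048576 4k pages), 1048576 >> 8 == 4096 MiB, (4096 + 1023) >> 10 == | |
| + * 4 GiB, so pool_size == 16 * 4 == 64 pages; without debug_pagealloc a | |
| + * single pool page suffices. | |
| + */ | |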
17571 | + | |
17572 | +static int split_large_page(pte_t *kpte, unsigned long address) | |
17573 | +{ | |
17574 | + unsigned long flags, mfn, mfninc = 1; | |
17575 | + unsigned int i, level; | |
17576 | + pte_t *pbase, *tmp; | |
17577 | + pgprot_t ref_prot; | |
17578 | + struct page *base; | |
17579 | + | |
17580 | + /* | |
17581 | + * Get a page from the pool. The pool list is protected by the | |
17582 | + * pgd_lock, which we have to take anyway for the split | |
17583 | + * operation: | |
17584 | + */ | |
17585 | + spin_lock_irqsave(&pgd_lock, flags); | |
17586 | + if (list_empty(&page_pool)) { | |
17587 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
17588 | + base = NULL; | |
17589 | + cpa_fill_pool(&base); | |
17590 | + if (!base) | |
17591 | + return -ENOMEM; | |
17592 | + spin_lock_irqsave(&pgd_lock, flags); | |
17593 | + } else { | |
17594 | + base = list_first_entry(&page_pool, struct page, lru); | |
17595 | + list_del(&base->lru); | |
17596 | + pool_pages--; | |
17597 | + | |
17598 | + if (pool_pages < pool_low) | |
17599 | + pool_low = pool_pages; | |
17600 | + } | |
17601 | + | |
17602 | + /* | |
17603 | + * Check for races; another CPU might have split this page | |
17604 | + * up for us already: | |
17605 | + */ | |
17606 | + tmp = lookup_address(address, &level); | |
17607 | + if (tmp != kpte) | |
17608 | + goto out_unlock; | |
17609 | + | |
17610 | + pbase = (pte_t *)page_address(base); | |
17611 | +#ifdef CONFIG_X86_32 | |
17612 | + paravirt_alloc_pt(&init_mm, page_to_pfn(base)); | |
17613 | +#endif | |
17614 | + ref_prot = pte_pgprot(pte_clrhuge(*kpte)); | |
17615 | + | |
17616 | +#ifdef CONFIG_X86_64 | |
17617 | + if (level == PG_LEVEL_1G) { | |
17618 | + mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; | |
17619 | + pgprot_val(ref_prot) |= _PAGE_PSE; | |
17620 | + } | |
17621 | +#endif | |
17622 | + | |
17623 | + /* | |
17624 | + * Get the target mfn from the original entry: | |
17625 | + */ | |
17626 | + mfn = __pte_mfn(*kpte); | |
17627 | + for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc) | |
17628 | + set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot)); | |
17629 | + | |
17630 | + /* | |
17631 | + * Install the new, split up pagetable. Important details here: | |
17632 | + * | |
17633 | + * On Intel the NX bit of all levels must be cleared to make a | |
17634 | + * page executable. See section 4.13.2 of Intel 64 and IA-32 | |
17635 | + * Architectures Software Developer's Manual). | |
17636 | + * | |
17637 | + * Mark the entry present. The current mapping might be | |
17638 | + * set to not present, which we preserved above. | |
17639 | + */ | |
17640 | + if (HYPERVISOR_update_va_mapping((unsigned long)pbase, | |
17641 | + mk_pte(base, PAGE_KERNEL_RO), 0)) | |
17642 | + BUG(); | |
17643 | + ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); | |
17644 | + pgprot_val(ref_prot) |= _PAGE_PRESENT; | |
17645 | + __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot)); | |
17646 | + base = NULL; | |
17647 | + | |
17648 | +out_unlock: | |
17649 | + /* | |
17650 | + * If we dropped out via the lookup_address check under | |
17651 | + * pgd_lock, stick the page back into the pool: | |
17652 | + */ | |
17653 | + if (base) { | |
17654 | + list_add(&base->lru, &page_pool); | |
17655 | + pool_pages++; | |
17656 | + } else | |
17657 | + pool_used++; | |
17658 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
17659 | + | |
17660 | + return 0; | |
17661 | +} | |
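| + | |
| +/* | |
| + * Example: splitting a 2 MiB pmd fills the new pte page with 512 | |
| + * entries whose mfns step by 1; for a 1 GiB pud, mfninc is | |
| + * PMD_PAGE_SIZE >> PAGE_SHIFT == 512, yielding 512 2 MiB entries | |
| + * with _PAGE_PSE kept set. | |
| + */ | |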
17662 | + | |
17663 | +static int __change_page_attr(struct cpa_data *cpa, int primary) | |
17664 | +{ | |
17665 | + unsigned long address = cpa->vaddr; | |
17666 | + int do_split, err; | |
17667 | + unsigned int level; | |
17668 | + pte_t *kpte, old_pte; | |
17669 | + | |
17670 | +repeat: | |
17671 | + kpte = lookup_address(address, &level); | |
17672 | + if (!kpte) | |
17673 | + return primary ? -EINVAL : 0; | |
17674 | + | |
17675 | + old_pte = *kpte; | |
17676 | + if (!__pte_val(old_pte)) { | |
17677 | + if (!primary) | |
17678 | + return 0; | |
17679 | + printk(KERN_WARNING "CPA: called for zero pte. " | |
17680 | + "vaddr = %lx cpa->vaddr = %lx\n", address, | |
17681 | + cpa->vaddr); | |
17682 | + WARN_ON(1); | |
17683 | + return -EINVAL; | |
17684 | + } | |
17685 | + | |
17686 | + if (level == PG_LEVEL_4K) { | |
17687 | + pte_t new_pte; | |
17688 | + pgprot_t new_prot = pte_pgprot(old_pte); | |
17689 | + unsigned long mfn = __pte_mfn(old_pte); | |
17690 | + | |
17691 | + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); | |
17692 | + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); | |
17693 | + | |
17694 | + new_prot = static_protections(new_prot, address, | |
17695 | + mfn_to_local_pfn(mfn)); | |
17696 | + | |
17697 | + /* | |
17698 | + * We need to keep the mfn from the existing PTE; | |
17699 | + * after all, we're only going to change its attributes, | |
17700 | + * not the memory it points to. | |
17701 | + */ | |
17702 | + new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot)); | |
17703 | + cpa->pfn = mfn_to_local_pfn(mfn); | |
17704 | + /* | |
17705 | + * Do we really change anything? | |
17706 | + */ | |
17707 | + if (__pte_val(old_pte) != __pte_val(new_pte)) { | |
17708 | + set_pte_atomic(kpte, new_pte); | |
17709 | + cpa->flushtlb = 1; | |
17710 | + } | |
17711 | + cpa->numpages = 1; | |
17712 | + return 0; | |
17713 | + } | |
17714 | + | |
17715 | + /* | |
17716 | + * Check whether we can keep the large page intact | |
17717 | + * and just change the pte: | |
17718 | + */ | |
17719 | + do_split = try_preserve_large_page(kpte, address, cpa); | |
17720 | + /* | |
17721 | + * When the range fits into the existing large page, | |
17722 | + * return. cpa->numpages and cpa->flushtlb have been updated in | |
17723 | + * try_preserve_large_page(): | |
17724 | + */ | |
17725 | + if (do_split <= 0) | |
17726 | + return do_split; | |
17727 | + | |
17728 | + /* | |
17729 | + * We have to split the large page: | |
17730 | + */ | |
17731 | + err = split_large_page(kpte, address); | |
17732 | + if (!err) { | |
17733 | + cpa->flushtlb = 1; | |
17734 | + goto repeat; | |
17735 | + } | |
17736 | + | |
17737 | + return err; | |
17738 | +} | |
17739 | + | |
17740 | +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); | |
17741 | + | |
17742 | +static int cpa_process_alias(struct cpa_data *cpa) | |
17743 | +{ | |
17744 | + struct cpa_data alias_cpa; | |
17745 | + int ret = 0; | |
17746 | + | |
17747 | + if (cpa->pfn > max_pfn_mapped) | |
17748 | + return 0; | |
17749 | + | |
17750 | + /* | |
17751 | + * No need to redo when the primary call already touched the | |
17752 | + * direct mapping: | |
17753 | + */ | |
17754 | + if (!within(cpa->vaddr, PAGE_OFFSET, | |
17755 | + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { | |
17756 | + | |
17757 | + alias_cpa = *cpa; | |
17758 | + alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); | |
17759 | + | |
17760 | + ret = __change_page_attr_set_clr(&alias_cpa, 0); | |
17761 | + } | |
17762 | + | |
17763 | +#ifdef CONFIG_X86_64 | |
17764 | + if (ret) | |
17765 | + return ret; | |
17766 | + /* | |
17767 | + * No need to redo when the primary call already touched the | |
17768 | + * high mapping: | |
17769 | + */ | |
17770 | + if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) | |
17771 | + return 0; | |
17772 | + | |
17773 | + /* | |
17774 | + * If the physical address is inside the kernel map, we need | |
17775 | + * to touch the high mapped kernel as well: | |
17776 | + */ | |
17777 | + if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) | |
17778 | + return 0; | |
17779 | + | |
17780 | + alias_cpa = *cpa; | |
17781 | + alias_cpa.vaddr = | |
17782 | + (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map; | |
17783 | + | |
17784 | + /* | |
17785 | + * The high mapping range is imprecise, so ignore the return value. | |
17786 | + */ | |
17787 | + __change_page_attr_set_clr(&alias_cpa, 0); | |
17788 | +#endif | |
17789 | + return ret; | |
17790 | +} | |
17791 | + | |
17792 | +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) | |
17793 | +{ | |
17794 | + int ret, numpages = cpa->numpages; | |
17795 | + | |
17796 | + while (numpages) { | |
17797 | + /* | |
17798 | + * Store the remaining nr of pages for the large page | |
17799 | + * preservation check. | |
17800 | + */ | |
17801 | + cpa->numpages = numpages; | |
17802 | + | |
17803 | + ret = __change_page_attr(cpa, checkalias); | |
17804 | + if (ret) | |
17805 | + return ret; | |
17806 | + | |
17807 | + if (checkalias) { | |
17808 | + ret = cpa_process_alias(cpa); | |
17809 | + if (ret) | |
17810 | + return ret; | |
17811 | + } | |
17812 | + | |
17813 | + /* | |
17814 | + * Adjust the number of pages with the result of the | |
17815 | + * CPA operation. Either a large page has been | |
17816 | + * preserved or a single page update happened. | |
17817 | + */ | |
17818 | + BUG_ON(cpa->numpages > numpages); | |
17819 | + numpages -= cpa->numpages; | |
17820 | + cpa->vaddr += cpa->numpages * PAGE_SIZE; | |
17821 | + } | |
17822 | + return 0; | |
17823 | +} | |
17824 | + | |
17825 | +static inline int cache_attr(pgprot_t attr) | |
17826 | +{ | |
17827 | + return pgprot_val(attr) & | |
17828 | + (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); | |
17829 | +} | |
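| + | |
| +/* | |
| + * cache_attr() is nonzero whenever a memory-type bit (PAT/PWT/PCD) is | |
| + * involved, e.g. set_memory_uc() setting _PAGE_PCD; only then does | |
| + * change_page_attr_set_clr() need a cache flush in addition to the | |
| + * TLB flush. | |
| + */ | |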
17830 | + | |
17831 | +static int change_page_attr_set_clr(unsigned long addr, int numpages, | |
17832 | + pgprot_t mask_set, pgprot_t mask_clr) | |
17833 | +{ | |
17834 | + struct cpa_data cpa; | |
17835 | + int ret, cache, checkalias; | |
17836 | + | |
17837 | + /* | |
17838 | + * Check whether we are being asked to change an unsupported | |
17839 | + * feature: | |
17840 | + */ | |
17841 | + mask_set = canon_pgprot(mask_set); | |
17842 | + mask_clr = canon_pgprot(mask_clr); | |
17843 | + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) | |
17844 | + return 0; | |
17845 | + | |
17846 | + /* Ensure we are PAGE_SIZE aligned */ | |
17847 | + if (addr & ~PAGE_MASK) { | |
17848 | + addr &= PAGE_MASK; | |
17849 | + /* | |
17850 | + * People should not be passing in unaligned addresses: | |
17851 | + */ | |
17852 | + WARN_ON_ONCE(1); | |
17853 | + } | |
17854 | + | |
17855 | + cpa.vaddr = addr; | |
17856 | + cpa.numpages = numpages; | |
17857 | + cpa.mask_set = mask_set; | |
17858 | + cpa.mask_clr = mask_clr; | |
17859 | + cpa.flushtlb = 0; | |
17860 | + | |
17861 | + /* No alias checking for _NX bit modifications */ | |
17862 | + checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; | |
17863 | + | |
17864 | + ret = __change_page_attr_set_clr(&cpa, checkalias); | |
17865 | + | |
17866 | + /* | |
17867 | + * Check whether we really changed something: | |
17868 | + */ | |
17869 | + if (!cpa.flushtlb) | |
17870 | + goto out; | |
17871 | + | |
17872 | + /* | |
17873 | + * No need to flush when we did not set any of the caching | |
17874 | + * attributes: | |
17875 | + */ | |
17876 | + cache = cache_attr(mask_set); | |
17877 | + | |
17878 | + /* | |
17879 | + * On success we use clflush, when the CPU supports it, to | |
17880 | + * avoid wbinvd. If the CPU does not support clflush, and in the | |
17881 | + * error case, we fall back to cpa_flush_all() (which uses | |
17882 | + * wbinvd): | |
17883 | + */ | |
17884 | + if (!ret && cpu_has_clflush) | |
17885 | + cpa_flush_range(addr, numpages, cache); | |
17886 | + else | |
17887 | + cpa_flush_all(cache); | |
17888 | + | |
17889 | +out: | |
17890 | + cpa_fill_pool(NULL); | |
17891 | + | |
17892 | + return ret; | |
17893 | +} | |
17894 | + | |
17895 | +static inline int change_page_attr_set(unsigned long addr, int numpages, | |
17896 | + pgprot_t mask) | |
17897 | +{ | |
17898 | + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); | |
17899 | +} | |
17900 | + | |
17901 | +static inline int change_page_attr_clear(unsigned long addr, int numpages, | |
17902 | + pgprot_t mask) | |
17903 | +{ | |
17904 | + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); | |
17905 | +} | |
17906 | + | |
17907 | +int set_memory_uc(unsigned long addr, int numpages) | |
17908 | +{ | |
17909 | + return change_page_attr_set(addr, numpages, | |
17910 | + __pgprot(_PAGE_PCD)); | |
17911 | +} | |
17912 | +EXPORT_SYMBOL(set_memory_uc); | |
17913 | + | |
17914 | +int set_memory_wb(unsigned long addr, int numpages) | |
17915 | +{ | |
17916 | + return change_page_attr_clear(addr, numpages, | |
17917 | + __pgprot(_PAGE_PCD | _PAGE_PWT)); | |
17918 | +} | |
17919 | +EXPORT_SYMBOL(set_memory_wb); | |
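| + | |
| +/* | |
| + * Usage sketch ('buf' and 'nr_pages' are illustrative names): | |
| + * | |
| + *	set_memory_uc((unsigned long)buf, nr_pages); | |
| + *	... access that must not be cached ... | |
| + *	set_memory_wb((unsigned long)buf, nr_pages); | |
| + * | |
| + * No separate flush call is needed; change_page_attr_set_clr() | |
| + * performs the necessary TLB/cache flushing itself. | |
| + */ | |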
17920 | + | |
17921 | +int set_memory_x(unsigned long addr, int numpages) | |
17922 | +{ | |
17923 | + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); | |
17924 | +} | |
17925 | +EXPORT_SYMBOL(set_memory_x); | |
17926 | + | |
17927 | +int set_memory_nx(unsigned long addr, int numpages) | |
17928 | +{ | |
17929 | + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); | |
17930 | +} | |
17931 | +EXPORT_SYMBOL(set_memory_nx); | |
17932 | + | |
17933 | +int set_memory_ro(unsigned long addr, int numpages) | |
17934 | +{ | |
17935 | + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); | |
17936 | +} | |
17937 | + | |
17938 | +int set_memory_rw(unsigned long addr, int numpages) | |
17939 | +{ | |
17940 | + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); | |
17941 | +} | |
17942 | + | |
17943 | +int set_memory_np(unsigned long addr, int numpages) | |
17944 | +{ | |
17945 | + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); | |
17946 | +} | |
17947 | + | |
17948 | +int set_pages_uc(struct page *page, int numpages) | |
17949 | +{ | |
17950 | + unsigned long addr = (unsigned long)page_address(page); | |
17951 | + | |
17952 | + return set_memory_uc(addr, numpages); | |
17953 | +} | |
17954 | +EXPORT_SYMBOL(set_pages_uc); | |
17955 | + | |
17956 | +int set_pages_wb(struct page *page, int numpages) | |
17957 | +{ | |
17958 | + unsigned long addr = (unsigned long)page_address(page); | |
17959 | + | |
17960 | + return set_memory_wb(addr, numpages); | |
17961 | +} | |
17962 | +EXPORT_SYMBOL(set_pages_wb); | |
17963 | + | |
17964 | +int set_pages_x(struct page *page, int numpages) | |
17965 | +{ | |
17966 | + unsigned long addr = (unsigned long)page_address(page); | |
17967 | + | |
17968 | + return set_memory_x(addr, numpages); | |
17969 | +} | |
17970 | +EXPORT_SYMBOL(set_pages_x); | |
17971 | + | |
17972 | +int set_pages_nx(struct page *page, int numpages) | |
17973 | +{ | |
17974 | + unsigned long addr = (unsigned long)page_address(page); | |
17975 | + | |
17976 | + return set_memory_nx(addr, numpages); | |
17977 | +} | |
17978 | +EXPORT_SYMBOL(set_pages_nx); | |
17979 | + | |
17980 | +int set_pages_ro(struct page *page, int numpages) | |
17981 | +{ | |
17982 | + unsigned long addr = (unsigned long)page_address(page); | |
17983 | + | |
17984 | + return set_memory_ro(addr, numpages); | |
17985 | +} | |
17986 | + | |
17987 | +int set_pages_rw(struct page *page, int numpages) | |
17988 | +{ | |
17989 | + unsigned long addr = (unsigned long)page_address(page); | |
17990 | + | |
17991 | + return set_memory_rw(addr, numpages); | |
17992 | +} | |
17993 | + | |
17994 | +#ifdef CONFIG_DEBUG_PAGEALLOC | |
17995 | + | |
17996 | +static int __set_pages_p(struct page *page, int numpages) | |
17997 | +{ | |
17998 | + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), | |
17999 | + .numpages = numpages, | |
18000 | + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), | |
18001 | + .mask_clr = __pgprot(0)}; | |
18002 | + | |
18003 | + return __change_page_attr_set_clr(&cpa, 1); | |
18004 | +} | |
18005 | + | |
18006 | +static int __set_pages_np(struct page *page, int numpages) | |
18007 | +{ | |
18008 | + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), | |
18009 | + .numpages = numpages, | |
18010 | + .mask_set = __pgprot(0), | |
18011 | + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; | |
18012 | + | |
18013 | + return __change_page_attr_set_clr(&cpa, 1); | |
18014 | +} | |
18015 | + | |
18016 | +void kernel_map_pages(struct page *page, int numpages, int enable) | |
18017 | +{ | |
18018 | + if (PageHighMem(page)) | |
18019 | + return; | |
18020 | + if (!enable) { | |
18021 | + debug_check_no_locks_freed(page_address(page), | |
18022 | + numpages * PAGE_SIZE); | |
18023 | + } | |
18024 | + | |
18025 | + /* | |
18026 | + * If the page allocator is not up yet then do not call c_p_a(): | |
18027 | + */ | |
18028 | + if (!debug_pagealloc_enabled) | |
18029 | + return; | |
18030 | + | |
18031 | + /* | |
18032 | + * The return value is ignored, as the calls cannot fail. | |
18033 | + * Large pages are kept enabled at boot time, and are | |
18034 | + * split up quickly with DEBUG_PAGEALLOC. If a split-up | |
18035 | + * fails here (due to a temporary memory shortage) no damage | |
18036 | + * is done, because we just keep the large page intact up | |
18037 | + * to the next attempt, when it will likely be split up: | |
18038 | + */ | |
18039 | + if (enable) | |
18040 | + __set_pages_p(page, numpages); | |
18041 | + else | |
18042 | + __set_pages_np(page, numpages); | |
18043 | + | |
18044 | + /* | |
18045 | + * We should perform an IPI and flush all tlbs, | |
18046 | + * but that can deadlock, so we flush only the current cpu: | |
18047 | + */ | |
18048 | + __flush_tlb_all(); | |
18049 | + | |
18050 | + /* | |
18051 | + * Try to refill the page pool here. We can do this only after | |
18052 | + * the tlb flush. | |
18053 | + */ | |
18054 | + cpa_fill_pool(NULL); | |
18055 | +} | |
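
kernel_map_pages() is the hook that lets DEBUG_PAGEALLOC catch use-after-free: freed pages lose their mapping (present bit cleared), so a stale access faults immediately instead of silently corrupting memory. A rough userspace analogue of the same trick on Linux, revoking and restoring a page with mprotect():

        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                long sz = sysconf(_SC_PAGESIZE);
                char *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                if (p == MAP_FAILED)
                        return 1;
                p[0] = 42;

                /* "free" the page: like __set_pages_np(), revoke access */
                mprotect(p, sz, PROT_NONE);
                /* a use-after-free touch of p[0] would now fault loudly */

                /* "allocate" it again: like __set_pages_p() */
                mprotect(p, sz, PROT_READ | PROT_WRITE);
                p[0] = 43;
                printf("%d\n", p[0]);
                return 0;
        }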
18056 | + | |
18057 | +#ifdef CONFIG_HIBERNATION | |
18058 | + | |
18059 | +bool kernel_page_present(struct page *page) | |
18060 | +{ | |
18061 | + unsigned int level; | |
18062 | + pte_t *pte; | |
18063 | + | |
18064 | + if (PageHighMem(page)) | |
18065 | + return false; | |
18066 | + | |
18067 | + pte = lookup_address((unsigned long)page_address(page), &level); | |
18068 | + return (__pte_val(*pte) & _PAGE_PRESENT); | |
18069 | +} | |
18070 | + | |
18071 | +#endif /* CONFIG_HIBERNATION */ | |
18072 | + | |
18073 | +#endif /* CONFIG_DEBUG_PAGEALLOC */ | |
18074 | + | |
18075 | +static inline int in_secondary_range(unsigned long va) | |
18076 | +{ | |
18077 | +#ifdef CONFIG_X86_64 | |
18078 | + return va >= VMALLOC_START && va < VMALLOC_END; | |
18079 | +#else | |
18080 | + return va >= (unsigned long)high_memory; | |
18081 | +#endif | |
18082 | +} | |
18083 | + | |
18084 | +static void __make_page_readonly(unsigned long va) | |
18085 | +{ | |
18086 | + pte_t *pte; | |
18087 | + unsigned int level; | |
18088 | + | |
18089 | + pte = lookup_address(va, &level); | |
18090 | + BUG_ON(!pte || level != PG_LEVEL_4K); | |
18091 | + if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0)) | |
18092 | + BUG(); | |
18093 | + if (in_secondary_range(va)) { | |
18094 | + unsigned long pfn = pte_pfn(*pte); | |
18095 | + | |
18096 | +#ifdef CONFIG_HIGHMEM | |
18097 | + if (pfn >= highstart_pfn) | |
18098 | + kmap_flush_unused(); /* flush stale writable kmaps */ | |
18099 | + else | |
18100 | +#endif | |
18101 | + __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT)); | |
18102 | + } | |
18103 | +} | |
18104 | + | |
18105 | +static void __make_page_writable(unsigned long va) | |
18106 | +{ | |
18107 | + pte_t *pte; | |
18108 | + unsigned int level; | |
18109 | + | |
18110 | + pte = lookup_address(va, &level); | |
18111 | + BUG_ON(!pte || level != PG_LEVEL_4K); | |
18112 | + if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0)) | |
18113 | + BUG(); | |
18114 | + if (in_secondary_range(va)) { | |
18115 | + unsigned long pfn = pte_pfn(*pte); | |
18116 | + | |
18117 | +#ifdef CONFIG_HIGHMEM | |
18118 | + if (pfn < highstart_pfn) | |
18119 | +#endif | |
18120 | + __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT)); | |
18121 | + } | |
18122 | +} | |
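
Both helpers recurse because the hypervisor refuses to accept a page-table page while any writable alias of it remains: a vmalloc or highmem address and the lowmem direct mapping are two views of the same frame, so both must be protected together. The "protect every alias" rule in a userspace sketch (assumes a Linux with memfd_create(); error handling trimmed):

        #define _GNU_SOURCE
        #include <stdio.h>
        #include <unistd.h>
        #include <sys/mman.h>

        int main(void)
        {
                long sz = sysconf(_SC_PAGESIZE);
                int fd = memfd_create("frame", 0);

                if (fd < 0 || ftruncate(fd, sz))
                        return 1;

                /* two virtual aliases of the same physical page */
                char *a = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
                char *b = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

                mprotect(a, sz, PROT_READ);
                b[0] = 1;       /* still writable through the other alias! */

                /* only after *every* alias is protected is the frame r/o */
                mprotect(b, sz, PROT_READ);
                printf("%p and %p now both read-only\n", (void *)a, (void *)b);
                return 0;
        }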
18123 | + | |
18124 | +void make_page_readonly(void *va, unsigned int feature) | |
18125 | +{ | |
18126 | + if (!xen_feature(feature)) | |
18127 | + __make_page_readonly((unsigned long)va); | |
18128 | +} | |
18129 | + | |
18130 | +void make_page_writable(void *va, unsigned int feature) | |
18131 | +{ | |
18132 | + if (!xen_feature(feature)) | |
18133 | + __make_page_writable((unsigned long)va); | |
18134 | +} | |
18135 | + | |
18136 | +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature) | |
18137 | +{ | |
18138 | + unsigned long addr; | |
18139 | + | |
18140 | + if (xen_feature(feature)) | |
18141 | + return; | |
18142 | + | |
18143 | + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE) | |
18144 | + __make_page_readonly(addr); | |
18145 | +} | |
18146 | + | |
18147 | +void make_pages_writable(void *va, unsigned int nr, unsigned int feature) | |
18148 | +{ | |
18149 | + unsigned long addr; | |
18150 | + | |
18151 | + if (xen_feature(feature)) | |
18152 | + return; | |
18153 | + | |
18154 | + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE) | |
18155 | + __make_page_writable(addr); | |
18156 | +} | |
18157 | + | |
18158 | +/* | |
18159 | + * The testcases use internal knowledge of the implementation that shouldn't | |
18160 | + * be exposed to the rest of the kernel. Include these directly here. | |
18161 | + */ | |
18162 | +#ifdef CONFIG_CPA_DEBUG | |
18163 | +#include "pageattr-test.c" | |
18164 | +#endif | |
18165 | --- a/arch/x86/mm/pgtable_32-xen.c | |
18166 | +++ b/arch/x86/mm/pgtable_32-xen.c | |
18167 | @@ -29,8 +29,6 @@ | |
18168 | #include <xen/features.h> | |
18169 | #include <asm/hypervisor.h> | |
18170 | ||
18171 | -static void pgd_test_and_unpin(pgd_t *pgd); | |
18172 | - | |
18173 | void show_mem(void) | |
18174 | { | |
18175 | int total = 0, reserved = 0; | |
18176 | @@ -167,53 +165,6 @@ pte_t *pte_alloc_one_kernel(struct mm_st | |
18177 | return pte; | |
18178 | } | |
18179 | ||
18180 | -static void _pte_free(struct page *page, unsigned int order) | |
18181 | -{ | |
18182 | - BUG_ON(order); | |
18183 | - pte_free(page); | |
18184 | -} | |
18185 | - | |
18186 | -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) | |
18187 | -{ | |
18188 | - struct page *pte; | |
18189 | - | |
18190 | -#ifdef CONFIG_HIGHPTE | |
18191 | - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); | |
18192 | -#else | |
18193 | - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); | |
18194 | -#endif | |
18195 | - if (pte) { | |
18196 | - SetPageForeign(pte, _pte_free); | |
18197 | - init_page_count(pte); | |
18198 | - } | |
18199 | - return pte; | |
18200 | -} | |
18201 | - | |
18202 | -void pte_free(struct page *pte) | |
18203 | -{ | |
18204 | - unsigned long pfn = page_to_pfn(pte); | |
18205 | - | |
18206 | - if (!PageHighMem(pte)) { | |
18207 | - unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT); | |
18208 | - | |
18209 | - if (!pte_write(*virt_to_ptep(va))) | |
18210 | - if (HYPERVISOR_update_va_mapping( | |
18211 | - va, pfn_pte(pfn, PAGE_KERNEL), 0)) | |
18212 | - BUG(); | |
18213 | - } else | |
18214 | - ClearPagePinned(pte); | |
18215 | - | |
18216 | - ClearPageForeign(pte); | |
18217 | - init_page_count(pte); | |
18218 | - | |
18219 | - __free_page(pte); | |
18220 | -} | |
18221 | - | |
18222 | -void pmd_ctor(struct kmem_cache *cache, void *pmd) | |
18223 | -{ | |
18224 | - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); | |
18225 | -} | |
18226 | - | |
18227 | /* | |
18228 | * List of all pgd's needed for non-PAE so it can invalidate entries | |
18229 | * in both cached and uncached pgd's; not needed for PAE since the | |
18230 | @@ -224,224 +175,191 @@ void pmd_ctor(struct kmem_cache *cache, | |
18231 | * vmalloc faults work because attached pagetables are never freed. | |
18232 | * -- wli | |
18233 | */ | |
18234 | -DEFINE_SPINLOCK(pgd_lock); | |
18235 | -struct page *pgd_list; | |
18236 | - | |
18237 | static inline void pgd_list_add(pgd_t *pgd) | |
18238 | { | |
18239 | struct page *page = virt_to_page(pgd); | |
18240 | - page->index = (unsigned long)pgd_list; | |
18241 | - if (pgd_list) | |
18242 | - set_page_private(pgd_list, (unsigned long)&page->index); | |
18243 | - pgd_list = page; | |
18244 | - set_page_private(page, (unsigned long)&pgd_list); | |
18245 | + | |
18246 | + list_add(&page->lru, &pgd_list); | |
18247 | } | |
18248 | ||
18249 | static inline void pgd_list_del(pgd_t *pgd) | |
18250 | { | |
18251 | - struct page *next, **pprev, *page = virt_to_page(pgd); | |
18252 | - next = (struct page *)page->index; | |
18253 | - pprev = (struct page **)page_private(page); | |
18254 | - *pprev = next; | |
18255 | - if (next) | |
18256 | - set_page_private(next, (unsigned long)pprev); | |
18257 | -} | |
18258 | + struct page *page = virt_to_page(pgd); | |
18259 | ||
18260 | + list_del(&page->lru); | |
18261 | +} | |
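
The rewritten pgd_list_add()/pgd_list_del() drop the hand-rolled next/pprev chain threaded through page->index and page_private() in favour of the stock struct list_head embedded in struct page (the lru field). The intrusive-list idiom this relies on, in miniature (a hedged sketch, not the kernel's list.h):

        #include <stddef.h>
        #include <stdio.h>

        struct list_head { struct list_head *next, *prev; };

        #define LIST_HEAD_INIT(name) { &(name), &(name) }

        static void list_add(struct list_head *new, struct list_head *head)
        {
                new->next = head->next;
                new->prev = head;
                head->next->prev = new;
                head->next = new;
        }

        static void list_del(struct list_head *entry)
        {
                entry->prev->next = entry->next;
                entry->next->prev = entry->prev;
        }

        /* stand-in for struct page with its embedded lru node */
        struct page { int id; struct list_head lru; };

        int main(void)
        {
                static struct list_head pgd_list = LIST_HEAD_INIT(pgd_list);
                struct page p1 = { 1, { NULL, NULL } }, p2 = { 2, { NULL, NULL } };
                struct list_head *pos;

                list_add(&p1.lru, &pgd_list);
                list_add(&p2.lru, &pgd_list);
                list_del(&p1.lru);

                for (pos = pgd_list.next; pos != &pgd_list; pos = pos->next)
                        printf("page %d is on pgd_list\n", ((struct page *)
                               ((char *)pos - offsetof(struct page, lru)))->id);
                return 0;
        }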
18262 | ||
18263 | +#define UNSHARED_PTRS_PER_PGD \ | |
18264 | + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) | |
18265 | ||
18266 | -#if (PTRS_PER_PMD == 1) | |
18267 | -/* Non-PAE pgd constructor */ | |
18268 | -static void pgd_ctor(void *pgd) | |
18269 | +static void pgd_ctor(void *p) | |
18270 | { | |
18271 | + pgd_t *pgd = p; | |
18272 | unsigned long flags; | |
18273 | ||
18274 | - /* !PAE, no pagetable sharing */ | |
18275 | + pgd_test_and_unpin(pgd); | |
18276 | + | |
18277 | + /* Clear usermode parts of PGD */ | |
18278 | memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); | |
18279 | ||
18280 | spin_lock_irqsave(&pgd_lock, flags); | |
18281 | ||
18282 | - /* must happen under lock */ | |
18283 | - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, | |
18284 | - swapper_pg_dir + USER_PTRS_PER_PGD, | |
18285 | - KERNEL_PGD_PTRS); | |
18286 | - | |
18287 | - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, | |
18288 | - __pa(swapper_pg_dir) >> PAGE_SHIFT, | |
18289 | - USER_PTRS_PER_PGD, | |
18290 | - KERNEL_PGD_PTRS); | |
18291 | - pgd_list_add(pgd); | |
18292 | - spin_unlock_irqrestore(&pgd_lock, flags); | |
18293 | -} | |
18294 | -#else /* PTRS_PER_PMD > 1 */ | |
18295 | -/* PAE pgd constructor */ | |
18296 | -static void pgd_ctor(void *pgd) | |
18297 | -{ | |
18298 | - /* PAE, kernel PMD may be shared */ | |
18299 | - | |
18300 | - if (SHARED_KERNEL_PMD) { | |
18301 | - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, | |
18302 | + /* If the pgd points to a shared pagetable level (either the | |
18303 | + ptes in non-PAE, or shared PMD in PAE), then just copy the | |
18304 | + references from swapper_pg_dir. */ | |
18305 | + if (PAGETABLE_LEVELS == 2 || | |
18306 | + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { | |
18307 | + clone_pgd_range(pgd + USER_PTRS_PER_PGD, | |
18308 | swapper_pg_dir + USER_PTRS_PER_PGD, | |
18309 | KERNEL_PGD_PTRS); | |
18310 | - } else { | |
18311 | - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); | |
18312 | + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, | |
18313 | + __pa(swapper_pg_dir) >> PAGE_SHIFT, | |
18314 | + USER_PTRS_PER_PGD, | |
18315 | + KERNEL_PGD_PTRS); | |
18316 | } | |
18317 | + | |
18318 | + /* list required to sync kernel mapping updates */ | |
18319 | + if (PAGETABLE_LEVELS == 2) | |
18320 | + pgd_list_add(pgd); | |
18321 | + | |
18322 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
18323 | } | |
18324 | -#endif /* PTRS_PER_PMD */ | |
18325 | ||
18326 | static void pgd_dtor(void *pgd) | |
18327 | { | |
18328 | unsigned long flags; /* can be called from interrupt context */ | |
18329 | ||
18330 | - if (SHARED_KERNEL_PMD) | |
18331 | - return; | |
18332 | - | |
18333 | - paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); | |
18334 | - spin_lock_irqsave(&pgd_lock, flags); | |
18335 | - pgd_list_del(pgd); | |
18336 | - spin_unlock_irqrestore(&pgd_lock, flags); | |
18337 | + if (!SHARED_KERNEL_PMD) { | |
18338 | + spin_lock_irqsave(&pgd_lock, flags); | |
18339 | + pgd_list_del(pgd); | |
18340 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
18341 | + } | |
18342 | ||
18343 | pgd_test_and_unpin(pgd); | |
18344 | } | |
18345 | ||
18346 | -#define UNSHARED_PTRS_PER_PGD \ | |
18347 | - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) | |
18348 | - | |
18349 | -/* If we allocate a pmd for part of the kernel address space, then | |
18350 | - make sure its initialized with the appropriate kernel mappings. | |
18351 | - Otherwise use a cached zeroed pmd. */ | |
18352 | -static pmd_t *pmd_cache_alloc(int idx) | |
18353 | +#ifdef CONFIG_X86_PAE | |
18354 | +/* | |
18355 | + * Mop up any pmd pages which may still be attached to the pgd. | |
18356 | + * Normally they will be freed by munmap/exit_mmap, but any pmd we | |
18357 | + * preallocate which never got a corresponding vma will need to be | |
18358 | + * freed manually. | |
18359 | + */ | |
18360 | +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) | |
18361 | { | |
18362 | - pmd_t *pmd; | |
18363 | + int i; | |
18364 | ||
18365 | - if (idx >= USER_PTRS_PER_PGD) { | |
18366 | - pmd = (pmd_t *)__get_free_page(GFP_KERNEL); | |
18367 | + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { | |
18368 | + pgd_t pgd = pgdp[i]; | |
18369 | ||
18370 | -#ifndef CONFIG_XEN | |
18371 | - if (pmd) | |
18372 | - memcpy(pmd, | |
18373 | - (void *)pgd_page_vaddr(swapper_pg_dir[idx]), | |
18374 | - sizeof(pmd_t) * PTRS_PER_PMD); | |
18375 | -#endif | |
18376 | - } else | |
18377 | - pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); | |
18378 | + if (__pgd_val(pgd) != 0) { | |
18379 | + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); | |
18380 | ||
18381 | - return pmd; | |
18382 | -} | |
18383 | + pgdp[i] = xen_make_pgd(0); | |
18384 | ||
18385 | -static void pmd_cache_free(pmd_t *pmd, int idx) | |
18386 | -{ | |
18387 | - if (idx >= USER_PTRS_PER_PGD) { | |
18388 | - make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables); | |
18389 | - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); | |
18390 | - free_page((unsigned long)pmd); | |
18391 | - } else | |
18392 | - kmem_cache_free(pmd_cache, pmd); | |
18393 | + paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); | |
18394 | + pmd_free(mm, pmd); | |
18395 | + } | |
18396 | + } | |
18397 | } | |
18398 | ||
18399 | -pgd_t *pgd_alloc(struct mm_struct *mm) | |
18400 | +/* | |
18401 | + * In PAE mode, we need to do a cr3 reload (=tlb flush) when | |
18402 | + * updating the top-level pagetable entries to guarantee the | |
18403 | + * processor notices the update. Since this is expensive, and | |
18404 | + * all 4 top-level entries are used almost immediately in a | |
18405 | + * new process's life, we just pre-populate them here. | |
18406 | + * | |
18407 | + * Also, if we're in a paravirt environment where the kernel pmd is | |
18408 | + * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate | |
18409 | + * and initialize the kernel pmds here. | |
18410 | + */ | |
18411 | +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) | |
18412 | { | |
18413 | + pud_t *pud; | |
18414 | + pmd_t *pmds[UNSHARED_PTRS_PER_PGD]; | |
18415 | + unsigned long addr, flags; | |
18416 | int i; | |
18417 | - pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); | |
18418 | - pmd_t **pmds = NULL; | |
18419 | - unsigned long flags; | |
18420 | - | |
18421 | - pgd_test_and_unpin(pgd); | |
18422 | - | |
18423 | - if (PTRS_PER_PMD == 1 || !pgd) | |
18424 | - return pgd; | |
18425 | - | |
18426 | -#ifdef CONFIG_XEN | |
18427 | - if (!SHARED_KERNEL_PMD) { | |
18428 | - /* | |
18429 | - * We can race save/restore (if we sleep during a GFP_KERNEL memory | |
18430 | - * allocation). We therefore store virtual addresses of pmds as they | |
18431 | - * do not change across save/restore, and poke the machine addresses | |
18432 | - * into the pgdir under the pgd_lock. | |
18433 | - */ | |
18434 | - pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL); | |
18435 | - if (!pmds) { | |
18436 | - quicklist_free(0, pgd_dtor, pgd); | |
18437 | - return NULL; | |
18438 | - } | |
18439 | - } | |
18440 | -#endif | |
18441 | ||
18442 | - /* Allocate pmds, remember virtual addresses. */ | |
18443 | - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { | |
18444 | - pmd_t *pmd = pmd_cache_alloc(i); | |
18445 | - | |
18446 | - if (!pmd) | |
18447 | + /* | |
18448 | + * We can race save/restore (if we sleep during a GFP_KERNEL memory | |
18449 | + * allocation). We therefore store virtual addresses of pmds as they | |
18450 | + * do not change across save/restore, and poke the machine addresses | |
18451 | + * into the pgdir under the pgd_lock. | |
18452 | + */ | |
18453 | + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) { | |
18454 | + pmds[i] = pmd_alloc_one(mm, addr); | |
18455 | + if (!pmds[i]) | |
18456 | goto out_oom; | |
18457 | - | |
18458 | - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); | |
18459 | - if (pmds) | |
18460 | - pmds[i] = pmd; | |
18461 | - else | |
18462 | - set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); | |
18463 | } | |
18464 | ||
18465 | -#ifdef CONFIG_XEN | |
18466 | - if (SHARED_KERNEL_PMD) | |
18467 | - return pgd; | |
18468 | - | |
18469 | spin_lock_irqsave(&pgd_lock, flags); | |
18470 | ||
18471 | /* Protect against save/restore: move below 4GB under pgd_lock. */ | |
18472 | - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) { | |
18473 | - int rc = xen_create_contiguous_region( | |
18474 | - (unsigned long)pgd, 0, 32); | |
18475 | - if (rc) { | |
18476 | - spin_unlock_irqrestore(&pgd_lock, flags); | |
18477 | - goto out_oom; | |
18478 | - } | |
18479 | + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb) | |
18480 | + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) { | |
18481 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
18482 | +out_oom: | |
18483 | + while (i--) | |
18484 | + pmd_free(mm, pmds[i]); | |
18485 | + return 0; | |
18486 | } | |
18487 | ||
18488 | /* Copy kernel pmd contents and write-protect the new pmds. */ | |
18489 | - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { | |
18490 | - memcpy(pmds[i], | |
18491 | - (void *)pgd_page_vaddr(swapper_pg_dir[i]), | |
18492 | - sizeof(pmd_t) * PTRS_PER_PMD); | |
18493 | - make_lowmem_page_readonly( | |
18494 | - pmds[i], XENFEAT_writable_page_tables); | |
18495 | - } | |
18496 | + pud = pud_offset(pgd, 0); | |
18497 | + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; | |
18498 | + i++, pud++, addr += PUD_SIZE) { | |
18499 | + if (i >= USER_PTRS_PER_PGD) { | |
18500 | + memcpy(pmds[i], | |
18501 | + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), | |
18502 | + sizeof(pmd_t) * PTRS_PER_PMD); | |
18503 | + make_lowmem_page_readonly( | |
18504 | + pmds[i], XENFEAT_writable_page_tables); | |
18505 | + } | |
18506 | ||
18507 | - /* It is safe to poke machine addresses of pmds under the pmd_lock. */ | |
18508 | - for (i = 0; i < PTRS_PER_PGD; i++) | |
18509 | - set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i]))); | |
18510 | + /* It is safe to poke machine addresses of pmds under the pgd_lock. */ | |
18511 | + pud_populate(mm, pud, pmds[i]); | |
18512 | + } | |
18513 | ||
18514 | - /* Ensure this pgd gets picked up and pinned on save/restore. */ | |
18515 | + /* List required to sync kernel mapping updates and | |
18516 | + * to pin/unpin on save/restore. */ | |
18517 | pgd_list_add(pgd); | |
18518 | ||
18519 | spin_unlock_irqrestore(&pgd_lock, flags); | |
18520 | ||
18521 | - kfree(pmds); | |
18522 | -#endif | |
18523 | + return 1; | |
18524 | +} | |
18525 | +#else /* !CONFIG_X86_PAE */ | |
18526 | +/* No need to prepopulate any pagetable entries in non-PAE modes. */ | |
18527 | +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) | |
18528 | +{ | |
18529 | + return 1; | |
18530 | +} | |
18531 | ||
18532 | - return pgd; | |
18533 | +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) | |
18534 | +{ | |
18535 | +} | |
18536 | +#endif /* CONFIG_X86_PAE */ | |
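
pgd_prepopulate_pmd() above is shaped as an all-or-nothing allocation: every pmd is allocated first, and any failure unwinds with while (i--) so the caller never sees a half-populated pgd. The same idiom in plain C (hypothetical names):

        #include <stdlib.h>

        #define NSLOTS 4

        /* allocate everything up front; on failure, unwind and report */
        static int prepopulate(void *slots[NSLOTS], size_t size)
        {
                int i;

                for (i = 0; i < NSLOTS; i++) {
                        slots[i] = malloc(size);
                        if (!slots[i])
                                goto out_oom;
                }
                return 1;       /* caller now owns all NSLOTS buffers */

        out_oom:
                while (i--)     /* free only what was actually allocated */
                        free(slots[i]);
                return 0;
        }

        int main(void)
        {
                void *slots[NSLOTS];
                int i;

                if (!prepopulate(slots, 4096))
                        return 1;
                for (i = 0; i < NSLOTS; i++)
                        free(slots[i]);
                return 0;
        }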
18537 | ||
18538 | -out_oom: | |
18539 | - if (!pmds) { | |
18540 | - for (i--; i >= 0; i--) { | |
18541 | - pgd_t pgdent = pgd[i]; | |
18542 | - void* pmd = (void *)__va(pgd_val(pgdent)-1); | |
18543 | - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); | |
18544 | - pmd_cache_free(pmd, i); | |
18545 | - } | |
18546 | - } else { | |
18547 | - for (i--; i >= 0; i--) { | |
18548 | - paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT); | |
18549 | - pmd_cache_free(pmds[i], i); | |
18550 | - } | |
18551 | - kfree(pmds); | |
18552 | +pgd_t *pgd_alloc(struct mm_struct *mm) | |
18553 | +{ | |
18554 | + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); | |
18555 | + | |
18556 | + /* so that alloc_pd can use it */ | |
18557 | + mm->pgd = pgd; | |
18558 | + if (pgd) | |
18559 | + pgd_ctor(pgd); | |
18560 | + | |
18561 | + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { | |
18562 | + free_page((unsigned long)pgd); | |
18563 | + pgd = NULL; | |
18564 | } | |
18565 | - quicklist_free(0, pgd_dtor, pgd); | |
18566 | - return NULL; | |
18567 | + | |
18568 | + return pgd; | |
18569 | } | |
18570 | ||
18571 | -void pgd_free(pgd_t *pgd) | |
18572 | +void pgd_free(struct mm_struct *mm, pgd_t *pgd) | |
18573 | { | |
18574 | - int i; | |
18575 | - | |
18576 | /* | |
18577 | * After this the pgd should not be pinned for the duration of this | |
18578 | * function's execution. We should never sleep and thus never race: | |
18579 | @@ -450,39 +368,43 @@ void pgd_free(pgd_t *pgd) | |
18580 | * 2. The machine addresses in PGD entries will not become invalid | |
18581 | * due to a concurrent save/restore. | |
18582 | */ | |
18583 | - pgd_test_and_unpin(pgd); | |
18584 | + pgd_dtor(pgd); | |
18585 | ||
18586 | - /* in the PAE case user pgd entries are overwritten before usage */ | |
18587 | - if (PTRS_PER_PMD > 1) { | |
18588 | - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { | |
18589 | - pgd_t pgdent = pgd[i]; | |
18590 | - void* pmd = (void *)__va(pgd_val(pgdent)-1); | |
18591 | - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); | |
18592 | - pmd_cache_free(pmd, i); | |
18593 | - } | |
18594 | + if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb)) | |
18595 | + xen_destroy_contiguous_region((unsigned long)pgd, 0); | |
18596 | ||
18597 | - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) | |
18598 | - xen_destroy_contiguous_region((unsigned long)pgd, 0); | |
18599 | - } | |
18600 | + pgd_mop_up_pmds(mm, pgd); | |
18601 | + free_page((unsigned long)pgd); | |
18602 | +} | |
18603 | ||
18604 | - /* in the non-PAE case, free_pgtables() clears user pgd entries */ | |
18605 | - quicklist_free(0, pgd_dtor, pgd); | |
18606 | +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) | |
18607 | +{ | |
18608 | + pgtable_page_dtor(pte); | |
18609 | + paravirt_release_pt(page_to_pfn(pte)); | |
18610 | + tlb_remove_page(tlb, pte); | |
18611 | } | |
18612 | ||
18613 | -void check_pgt_cache(void) | |
18614 | +#ifdef CONFIG_X86_PAE | |
18615 | + | |
18616 | +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) | |
18617 | { | |
18618 | - quicklist_trim(0, pgd_dtor, 25, 16); | |
18619 | + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); | |
18620 | + tlb_remove_page(tlb, virt_to_page(pmd)); | |
18621 | } | |
18622 | ||
18623 | +#endif | |
18624 | + | |
18625 | void make_lowmem_page_readonly(void *va, unsigned int feature) | |
18626 | { | |
18627 | pte_t *pte; | |
18628 | + unsigned int level; | |
18629 | int rc; | |
18630 | ||
18631 | if (xen_feature(feature)) | |
18632 | return; | |
18633 | ||
18634 | - pte = virt_to_ptep(va); | |
18635 | + pte = lookup_address((unsigned long)va, &level); | |
18636 | + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte)); | |
18637 | rc = HYPERVISOR_update_va_mapping( | |
18638 | (unsigned long)va, pte_wrprotect(*pte), 0); | |
18639 | BUG_ON(rc); | |
18640 | @@ -491,313 +413,15 @@ void make_lowmem_page_readonly(void *va, | |
18641 | void make_lowmem_page_writable(void *va, unsigned int feature) | |
18642 | { | |
18643 | pte_t *pte; | |
18644 | + unsigned int level; | |
18645 | int rc; | |
18646 | ||
18647 | if (xen_feature(feature)) | |
18648 | return; | |
18649 | ||
18650 | - pte = virt_to_ptep(va); | |
18651 | + pte = lookup_address((unsigned long)va, &level); | |
18652 | + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte)); | |
18653 | rc = HYPERVISOR_update_va_mapping( | |
18654 | (unsigned long)va, pte_mkwrite(*pte), 0); | |
18655 | BUG_ON(rc); | |
18656 | } | |
18657 | - | |
18658 | -void make_page_readonly(void *va, unsigned int feature) | |
18659 | -{ | |
18660 | - pte_t *pte; | |
18661 | - int rc; | |
18662 | - | |
18663 | - if (xen_feature(feature)) | |
18664 | - return; | |
18665 | - | |
18666 | - pte = virt_to_ptep(va); | |
18667 | - rc = HYPERVISOR_update_va_mapping( | |
18668 | - (unsigned long)va, pte_wrprotect(*pte), 0); | |
18669 | - if (rc) /* fallback? */ | |
18670 | - xen_l1_entry_update(pte, pte_wrprotect(*pte)); | |
18671 | - if ((unsigned long)va >= (unsigned long)high_memory) { | |
18672 | - unsigned long pfn = pte_pfn(*pte); | |
18673 | -#ifdef CONFIG_HIGHMEM | |
18674 | - if (pfn >= highstart_pfn) | |
18675 | - kmap_flush_unused(); /* flush stale writable kmaps */ | |
18676 | - else | |
18677 | -#endif | |
18678 | - make_lowmem_page_readonly( | |
18679 | - phys_to_virt(pfn << PAGE_SHIFT), feature); | |
18680 | - } | |
18681 | -} | |
18682 | - | |
18683 | -void make_page_writable(void *va, unsigned int feature) | |
18684 | -{ | |
18685 | - pte_t *pte; | |
18686 | - int rc; | |
18687 | - | |
18688 | - if (xen_feature(feature)) | |
18689 | - return; | |
18690 | - | |
18691 | - pte = virt_to_ptep(va); | |
18692 | - rc = HYPERVISOR_update_va_mapping( | |
18693 | - (unsigned long)va, pte_mkwrite(*pte), 0); | |
18694 | - if (rc) /* fallback? */ | |
18695 | - xen_l1_entry_update(pte, pte_mkwrite(*pte)); | |
18696 | - if ((unsigned long)va >= (unsigned long)high_memory) { | |
18697 | - unsigned long pfn = pte_pfn(*pte); | |
18698 | -#ifdef CONFIG_HIGHMEM | |
18699 | - if (pfn < highstart_pfn) | |
18700 | -#endif | |
18701 | - make_lowmem_page_writable( | |
18702 | - phys_to_virt(pfn << PAGE_SHIFT), feature); | |
18703 | - } | |
18704 | -} | |
18705 | - | |
18706 | -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature) | |
18707 | -{ | |
18708 | - if (xen_feature(feature)) | |
18709 | - return; | |
18710 | - | |
18711 | - while (nr-- != 0) { | |
18712 | - make_page_readonly(va, feature); | |
18713 | - va = (void *)((unsigned long)va + PAGE_SIZE); | |
18714 | - } | |
18715 | -} | |
18716 | - | |
18717 | -void make_pages_writable(void *va, unsigned int nr, unsigned int feature) | |
18718 | -{ | |
18719 | - if (xen_feature(feature)) | |
18720 | - return; | |
18721 | - | |
18722 | - while (nr-- != 0) { | |
18723 | - make_page_writable(va, feature); | |
18724 | - va = (void *)((unsigned long)va + PAGE_SIZE); | |
18725 | - } | |
18726 | -} | |
18727 | - | |
18728 | -static void _pin_lock(struct mm_struct *mm, int lock) { | |
18729 | - if (lock) | |
18730 | - spin_lock(&mm->page_table_lock); | |
18731 | -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | |
18732 | - /* While mm->page_table_lock protects us against insertions and | |
18733 | - * removals of higher level page table pages, it doesn't protect | |
18734 | - * against updates of pte-s. Such updates, however, require the | |
18735 | - * pte pages to be in consistent state (unpinned+writable or | |
18736 | - * pinned+readonly). The pinning and attribute changes, however | |
18737 | - * cannot be done atomically, which is why such updates must be | |
18738 | - * prevented from happening concurrently. | |
18739 | - * Note that no pte lock can ever elsewhere be acquired nesting | |
18740 | - * with an already acquired one in the same mm, or with the mm's | |
18741 | - * page_table_lock already acquired, as that would break in the | |
18742 | - * non-split case (where all these are actually resolving to the | |
18743 | - * one page_table_lock). Thus acquiring all of them here is not | |
18744 | - * going to result in dead locks, and the order of acquires | |
18745 | - * doesn't matter. | |
18746 | - */ | |
18747 | - { | |
18748 | - pgd_t *pgd = mm->pgd; | |
18749 | - unsigned g; | |
18750 | - | |
18751 | - for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { | |
18752 | - pud_t *pud; | |
18753 | - unsigned u; | |
18754 | - | |
18755 | - if (pgd_none(*pgd)) | |
18756 | - continue; | |
18757 | - pud = pud_offset(pgd, 0); | |
18758 | - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { | |
18759 | - pmd_t *pmd; | |
18760 | - unsigned m; | |
18761 | - | |
18762 | - if (pud_none(*pud)) | |
18763 | - continue; | |
18764 | - pmd = pmd_offset(pud, 0); | |
18765 | - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { | |
18766 | - spinlock_t *ptl; | |
18767 | - | |
18768 | - if (pmd_none(*pmd)) | |
18769 | - continue; | |
18770 | - ptl = pte_lockptr(0, pmd); | |
18771 | - if (lock) | |
18772 | - spin_lock(ptl); | |
18773 | - else | |
18774 | - spin_unlock(ptl); | |
18775 | - } | |
18776 | - } | |
18777 | - } | |
18778 | - } | |
18779 | -#endif | |
18780 | - if (!lock) | |
18781 | - spin_unlock(&mm->page_table_lock); | |
18782 | -} | |
18783 | -#define pin_lock(mm) _pin_lock(mm, 1) | |
18784 | -#define pin_unlock(mm) _pin_lock(mm, 0) | |
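
The deleted comment above argues that taking every pte lock in an mm cannot deadlock, because no other path ever nests two of them, so there is no conflicting lock order to collide with. The shape of that "lock them all" pattern in pthread form (a toy sketch; link with -lpthread):

        #include <pthread.h>
        #include <stdio.h>

        #define NPTL 3

        static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t ptl[NPTL] = {
                PTHREAD_MUTEX_INITIALIZER,
                PTHREAD_MUTEX_INITIALIZER,
                PTHREAD_MUTEX_INITIALIZER
        };

        /* safe only because no other code path ever holds two ptl[]
         * locks at once, so any acquisition order here will do */
        static void pin_lock_all(void)
        {
                int i;

                pthread_mutex_lock(&outer);
                for (i = 0; i < NPTL; i++)
                        pthread_mutex_lock(&ptl[i]);
        }

        static void pin_unlock_all(void)
        {
                int i;

                for (i = 0; i < NPTL; i++)
                        pthread_mutex_unlock(&ptl[i]);
                pthread_mutex_unlock(&outer);
        }

        int main(void)
        {
                pin_lock_all();
                puts("all pte-style locks held");
                pin_unlock_all();
                return 0;
        }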
18785 | - | |
18786 | -#define PIN_BATCH 4 | |
18787 | -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); | |
18788 | - | |
18789 | -static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags, | |
18790 | - unsigned int cpu, unsigned seq) | |
18791 | -{ | |
18792 | - unsigned long pfn = page_to_pfn(page); | |
18793 | - | |
18794 | - if (PageHighMem(page)) { | |
18795 | - if (pgprot_val(flags) & _PAGE_RW) | |
18796 | - ClearPagePinned(page); | |
18797 | - else | |
18798 | - SetPagePinned(page); | |
18799 | - } else { | |
18800 | - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, | |
18801 | - (unsigned long)__va(pfn << PAGE_SHIFT), | |
18802 | - pfn_pte(pfn, flags), 0); | |
18803 | - if (unlikely(++seq == PIN_BATCH)) { | |
18804 | - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), | |
18805 | - PIN_BATCH, NULL))) | |
18806 | - BUG(); | |
18807 | - seq = 0; | |
18808 | - } | |
18809 | - } | |
18810 | - | |
18811 | - return seq; | |
18812 | -} | |
18813 | - | |
18814 | -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) | |
18815 | -{ | |
18816 | - pgd_t *pgd = pgd_base; | |
18817 | - pud_t *pud; | |
18818 | - pmd_t *pmd; | |
18819 | - int g, u, m; | |
18820 | - unsigned int cpu, seq; | |
18821 | - | |
18822 | - if (xen_feature(XENFEAT_auto_translated_physmap)) | |
18823 | - return; | |
18824 | - | |
18825 | - cpu = get_cpu(); | |
18826 | - | |
18827 | - for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { | |
18828 | - if (pgd_none(*pgd)) | |
18829 | - continue; | |
18830 | - pud = pud_offset(pgd, 0); | |
18831 | - if (PTRS_PER_PUD > 1) /* not folded */ | |
18832 | - seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq); | |
18833 | - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { | |
18834 | - if (pud_none(*pud)) | |
18835 | - continue; | |
18836 | - pmd = pmd_offset(pud, 0); | |
18837 | - if (PTRS_PER_PMD > 1) /* not folded */ | |
18838 | - seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq); | |
18839 | - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { | |
18840 | - if (pmd_none(*pmd)) | |
18841 | - continue; | |
18842 | - seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq); | |
18843 | - } | |
18844 | - } | |
18845 | - } | |
18846 | - | |
18847 | - if (likely(seq != 0)) { | |
18848 | - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, | |
18849 | - (unsigned long)pgd_base, | |
18850 | - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), | |
18851 | - UVMF_TLB_FLUSH); | |
18852 | - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), | |
18853 | - seq + 1, NULL))) | |
18854 | - BUG(); | |
18855 | - } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base, | |
18856 | - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), | |
18857 | - UVMF_TLB_FLUSH)) | |
18858 | - BUG(); | |
18859 | - | |
18860 | - put_cpu(); | |
18861 | -} | |
18862 | - | |
18863 | -static void __pgd_pin(pgd_t *pgd) | |
18864 | -{ | |
18865 | - pgd_walk(pgd, PAGE_KERNEL_RO); | |
18866 | - kmap_flush_unused(); | |
18867 | - xen_pgd_pin(__pa(pgd)); | |
18868 | - SetPagePinned(virt_to_page(pgd)); | |
18869 | -} | |
18870 | - | |
18871 | -static void __pgd_unpin(pgd_t *pgd) | |
18872 | -{ | |
18873 | - xen_pgd_unpin(__pa(pgd)); | |
18874 | - pgd_walk(pgd, PAGE_KERNEL); | |
18875 | - ClearPagePinned(virt_to_page(pgd)); | |
18876 | -} | |
18877 | - | |
18878 | -static void pgd_test_and_unpin(pgd_t *pgd) | |
18879 | -{ | |
18880 | - if (PagePinned(virt_to_page(pgd))) | |
18881 | - __pgd_unpin(pgd); | |
18882 | -} | |
18883 | - | |
18884 | -void mm_pin(struct mm_struct *mm) | |
18885 | -{ | |
18886 | - if (xen_feature(XENFEAT_writable_page_tables)) | |
18887 | - return; | |
18888 | - pin_lock(mm); | |
18889 | - __pgd_pin(mm->pgd); | |
18890 | - pin_unlock(mm); | |
18891 | -} | |
18892 | - | |
18893 | -void mm_unpin(struct mm_struct *mm) | |
18894 | -{ | |
18895 | - if (xen_feature(XENFEAT_writable_page_tables)) | |
18896 | - return; | |
18897 | - pin_lock(mm); | |
18898 | - __pgd_unpin(mm->pgd); | |
18899 | - pin_unlock(mm); | |
18900 | -} | |
18901 | - | |
18902 | -void mm_pin_all(void) | |
18903 | -{ | |
18904 | - struct page *page; | |
18905 | - unsigned long flags; | |
18906 | - | |
18907 | - if (xen_feature(XENFEAT_writable_page_tables)) | |
18908 | - return; | |
18909 | - | |
18910 | - /* | |
18911 | - * Allow uninterrupted access to the pgd_list. Also protects | |
18912 | - * __pgd_pin() by disabling preemption. | |
18913 | - * All other CPUs must be at a safe point (e.g., in stop_machine | |
18914 | - * or offlined entirely). | |
18915 | - */ | |
18916 | - spin_lock_irqsave(&pgd_lock, flags); | |
18917 | - for (page = pgd_list; page; page = (struct page *)page->index) { | |
18918 | - if (!PagePinned(page)) | |
18919 | - __pgd_pin((pgd_t *)page_address(page)); | |
18920 | - } | |
18921 | - spin_unlock_irqrestore(&pgd_lock, flags); | |
18922 | -} | |
18923 | - | |
18924 | -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | |
18925 | -{ | |
18926 | - if (!PagePinned(virt_to_page(mm->pgd))) | |
18927 | - mm_pin(mm); | |
18928 | -} | |
18929 | - | |
18930 | -void arch_exit_mmap(struct mm_struct *mm) | |
18931 | -{ | |
18932 | - struct task_struct *tsk = current; | |
18933 | - | |
18934 | - task_lock(tsk); | |
18935 | - | |
18936 | - /* | |
18937 | - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() | |
18938 | - * *much* faster this way, as avoiding tlb flushes allows bigger wrpt batches. | |
18939 | - */ | |
18940 | - if (tsk->active_mm == mm) { | |
18941 | - tsk->active_mm = &init_mm; | |
18942 | - atomic_inc(&init_mm.mm_count); | |
18943 | - | |
18944 | - switch_mm(mm, &init_mm, tsk); | |
18945 | - | |
18946 | - atomic_dec(&mm->mm_count); | |
18947 | - BUG_ON(atomic_read(&mm->mm_count) == 0); | |
18948 | - } | |
18949 | - | |
18950 | - task_unlock(tsk); | |
18951 | - | |
18952 | - if (PagePinned(virt_to_page(mm->pgd)) && | |
18953 | - (atomic_read(&mm->mm_count) == 1) && | |
18954 | - !mm->context.has_foreign_mappings) | |
18955 | - mm_unpin(mm); | |
18956 | -} | |
18957 | --- a/arch/x86/pci/irq-xen.c | |
18958 | +++ b/arch/x86/pci/irq-xen.c | |
18959 | @@ -204,6 +204,7 @@ static int pirq_ali_get(struct pci_dev * | |
18960 | { | |
18961 | static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; | |
18962 | ||
18963 | + WARN_ON_ONCE(pirq >= 16); | |
18964 | return irqmap[read_config_nybble(router, 0x48, pirq-1)]; | |
18965 | } | |
18966 | ||
18967 | @@ -211,7 +212,8 @@ static int pirq_ali_set(struct pci_dev * | |
18968 | { | |
18969 | static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; | |
18970 | unsigned int val = irqmap[irq]; | |
18971 | - | |
18972 | + | |
18973 | + WARN_ON_ONCE(pirq >= 16); | |
18974 | if (val) { | |
18975 | write_config_nybble(router, 0x48, pirq-1, val); | |
18976 | return 1; | |
18977 | @@ -261,12 +263,16 @@ static int pirq_via_set(struct pci_dev * | |
18978 | static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18979 | { | |
18980 | static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; | |
18981 | + | |
18982 | + WARN_ON_ONCE(pirq >= 5); | |
18983 | return read_config_nybble(router, 0x55, pirqmap[pirq-1]); | |
18984 | } | |
18985 | ||
18986 | static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18987 | { | |
18988 | static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; | |
18989 | + | |
18990 | + WARN_ON_ONCE(pirq >= 5); | |
18991 | write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); | |
18992 | return 1; | |
18993 | } | |
18994 | @@ -279,12 +285,16 @@ static int pirq_via586_set(struct pci_de | |
18995 | static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18996 | { | |
18997 | static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; | |
18998 | + | |
18999 | + WARN_ON_ONCE(pirq >= 4); | |
19000 | return read_config_nybble(router,0x43, pirqmap[pirq-1]); | |
19001 | } | |
19002 | ||
19003 | static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
19004 | { | |
19005 | static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; | |
19006 | + | |
19007 | + WARN_ON_ONCE(pirq >= 4); | |
19008 | write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); | |
19009 | return 1; | |
19010 | } | |
19011 | @@ -423,6 +433,7 @@ static int pirq_sis_set(struct pci_dev * | |
19012 | ||
19013 | static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
19014 | { | |
19015 | + WARN_ON_ONCE(pirq >= 9); | |
19016 | if (pirq > 8) { | |
19017 | printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); | |
19018 | return 0; | |
19019 | @@ -432,6 +443,7 @@ static int pirq_vlsi_get(struct pci_dev | |
19020 | ||
19021 | static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
19022 | { | |
19023 | + WARN_ON_ONCE(pirq >= 9); | |
19024 | if (pirq > 8) { | |
19025 | printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); | |
19026 | return 0; | |
19027 | @@ -453,14 +465,14 @@ static int pirq_vlsi_set(struct pci_dev | |
19028 | */ | |
19029 | static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
19030 | { | |
19031 | - outb_p(pirq, 0xc00); | |
19032 | + outb(pirq, 0xc00); | |
19033 | return inb(0xc01) & 0xf; | |
19034 | } | |
19035 | ||
19036 | static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
19037 | { | |
19038 | - outb_p(pirq, 0xc00); | |
19039 | - outb_p(irq, 0xc01); | |
19040 | + outb(pirq, 0xc00); | |
19041 | + outb(irq, 0xc01); | |
19042 | return 1; | |
19043 | } | |
19044 | ||
19045 | @@ -575,6 +587,10 @@ static __init int intel_router_probe(str | |
19046 | case PCI_DEVICE_ID_INTEL_ICH9_4: | |
19047 | case PCI_DEVICE_ID_INTEL_ICH9_5: | |
19048 | case PCI_DEVICE_ID_INTEL_TOLAPAI_0: | |
19049 | + case PCI_DEVICE_ID_INTEL_ICH10_0: | |
19050 | + case PCI_DEVICE_ID_INTEL_ICH10_1: | |
19051 | + case PCI_DEVICE_ID_INTEL_ICH10_2: | |
19052 | + case PCI_DEVICE_ID_INTEL_ICH10_3: | |
19053 | r->name = "PIIX/ICH"; | |
19054 | r->get = pirq_piix_get; | |
19055 | r->set = pirq_piix_set; | |
19056 | --- a/arch/x86/vdso/Makefile | |
19057 | +++ b/arch/x86/vdso/Makefile | |
19058 | @@ -66,6 +66,7 @@ vdso32.so-$(VDSO32-y) += int80 | |
19059 | vdso32.so-$(CONFIG_COMPAT) += syscall | |
19060 | vdso32.so-$(VDSO32-y) += sysenter | |
19061 | xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80 | |
19062 | +xen-vdso32-$(CONFIG_X86_32) += syscall | |
19063 | vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y) | |
19064 | ||
19065 | vdso32-images = $(vdso32.so-y:%=vdso32-%.so) | |
19066 | --- a/arch/x86/vdso/vdso32.S | |
19067 | +++ b/arch/x86/vdso/vdso32.S | |
19068 | @@ -19,4 +19,16 @@ vdso32_sysenter_start: | |
19069 | .incbin "arch/x86/vdso/vdso32-sysenter.so" | |
19070 | vdso32_sysenter_end: | |
19071 | ||
19072 | +#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200 | |
19073 | + .globl vdso32_int80_start, vdso32_int80_end | |
19074 | +vdso32_int80_start: | |
19075 | + .incbin "arch/x86/vdso/vdso32-int80.so" | |
19076 | +vdso32_int80_end: | |
19077 | +#elif defined(CONFIG_X86_XEN) | |
19078 | + .globl vdso32_syscall_start, vdso32_syscall_end | |
19079 | +vdso32_syscall_start: | |
19080 | + .incbin "arch/x86/vdso/vdso32-syscall.so" | |
19081 | +vdso32_syscall_end: | |
19082 | +#endif | |
19083 | + | |
19084 | __FINIT | |
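
Each image above is pulled in with .incbin and bracketed by start/end labels, so C code can later size the blob as &end - &start, exactly as sysenter_setup() does further down in this patch. The embed-and-measure pattern in a self-contained sketch (the blob here is a placeholder array, not a real .incbin image):

        #include <stdio.h>

        /* stand-ins for the labels the assembler emits around .incbin */
        static const char vdso_blob_start[] = { 0x7f, 'E', 'L', 'F' };
        #define vdso_blob_end (vdso_blob_start + sizeof(vdso_blob_start))

        int main(void)
        {
                size_t len = (size_t)(vdso_blob_end - vdso_blob_start);

                printf("embedded image: %zu bytes\n", len);
                return 0;
        }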
19085 | --- a/arch/x86/vdso/vdso32-setup.c | |
19086 | +++ b/arch/x86/vdso/vdso32-setup.c | |
19087 | @@ -26,10 +26,6 @@ | |
19088 | #include <asm/vdso.h> | |
19089 | #include <asm/proto.h> | |
19090 | ||
19091 | -#ifdef CONFIG_XEN | |
19092 | -#include <xen/interface/callback.h> | |
19093 | -#endif | |
19094 | - | |
19095 | enum { | |
19096 | VDSO_DISABLED = 0, | |
19097 | VDSO_ENABLED = 1, | |
19098 | @@ -229,7 +225,6 @@ static inline void map_compat_vdso(int m | |
19099 | ||
19100 | void enable_sep_cpu(void) | |
19101 | { | |
19102 | -#ifndef CONFIG_XEN | |
19103 | int cpu = get_cpu(); | |
19104 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | |
19105 | ||
19106 | @@ -244,35 +239,6 @@ void enable_sep_cpu(void) | |
19107 | wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); | |
19108 | wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); | |
19109 | put_cpu(); | |
19110 | -#else | |
19111 | - extern asmlinkage void ia32pv_sysenter_target(void); | |
19112 | - static struct callback_register sysenter = { | |
19113 | - .type = CALLBACKTYPE_sysenter, | |
19114 | - .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target }, | |
19115 | - }; | |
19116 | - | |
19117 | - if (!boot_cpu_has(X86_FEATURE_SEP)) | |
19118 | - return; | |
19119 | - | |
19120 | - get_cpu(); | |
19121 | - | |
19122 | - if (xen_feature(XENFEAT_supervisor_mode_kernel)) | |
19123 | - sysenter.address.eip = (unsigned long)ia32_sysenter_target; | |
19124 | - | |
19125 | - switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) { | |
19126 | - case 0: | |
19127 | - break; | |
19128 | -#if CONFIG_XEN_COMPAT < 0x030200 | |
19129 | - case -ENOSYS: | |
19130 | - sysenter.type = CALLBACKTYPE_sysenter_deprecated; | |
19131 | - if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0) | |
19132 | - break; | |
19133 | -#endif | |
19134 | - default: | |
19135 | - clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); | |
19136 | - break; | |
19137 | - } | |
19138 | -#endif | |
19139 | } | |
19140 | ||
19141 | static struct vm_area_struct gate_vma; | |
19142 | --- /dev/null | |
19143 | +++ b/arch/x86/vdso/vdso32-setup-xen.c | |
19144 | @@ -0,0 +1,506 @@ | |
19145 | +/* | |
19146 | + * (C) Copyright 2002 Linus Torvalds | |
19147 | + * Portions based on the vdso-randomization code from exec-shield: | |
19148 | + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar | |
19149 | + * | |
19150 | + * This file contains the needed initializations to support sysenter. | |
19151 | + */ | |
19152 | + | |
19153 | +#include <linux/init.h> | |
19154 | +#include <linux/smp.h> | |
19155 | +#include <linux/thread_info.h> | |
19156 | +#include <linux/sched.h> | |
19157 | +#include <linux/gfp.h> | |
19158 | +#include <linux/string.h> | |
19159 | +#include <linux/elf.h> | |
19160 | +#include <linux/mm.h> | |
19161 | +#include <linux/err.h> | |
19162 | +#include <linux/module.h> | |
19163 | + | |
19164 | +#include <asm/cpufeature.h> | |
19165 | +#include <asm/msr.h> | |
19166 | +#include <asm/pgtable.h> | |
19167 | +#include <asm/unistd.h> | |
19168 | +#include <asm/elf.h> | |
19169 | +#include <asm/tlbflush.h> | |
19170 | +#include <asm/vdso.h> | |
19171 | +#include <asm/proto.h> | |
19172 | + | |
19173 | +#include <xen/interface/callback.h> | |
19174 | + | |
19175 | +enum { | |
19176 | + VDSO_DISABLED = 0, | |
19177 | + VDSO_ENABLED = 1, | |
19178 | + VDSO_COMPAT = 2, | |
19179 | +}; | |
19180 | + | |
19181 | +#ifdef CONFIG_COMPAT_VDSO | |
19182 | +#define VDSO_DEFAULT VDSO_COMPAT | |
19183 | +#else | |
19184 | +#define VDSO_DEFAULT VDSO_ENABLED | |
19185 | +#endif | |
19186 | + | |
19187 | +#ifdef CONFIG_X86_64 | |
19188 | +#define vdso_enabled sysctl_vsyscall32 | |
19189 | +#define arch_setup_additional_pages syscall32_setup_pages | |
19190 | +#endif | |
19191 | + | |
19192 | +/* | |
19193 | + * This is the difference between the prelinked addresses in the vDSO images | |
19194 | + * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO | |
19195 | + * in the user address space. | |
19196 | + */ | |
19197 | +#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK) | |
19198 | + | |
19199 | +/* | |
19200 | + * Should the kernel map a VDSO page into processes and pass its | |
19201 | + * address down to glibc upon exec()? | |
19202 | + */ | |
19203 | +unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; | |
19204 | + | |
19205 | +static int __init vdso_setup(char *s) | |
19206 | +{ | |
19207 | + vdso_enabled = simple_strtoul(s, NULL, 0); | |
19208 | + | |
19209 | + return 1; | |
19210 | +} | |
19211 | + | |
19212 | +/* | |
19213 | + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO | |
19214 | + * behavior on both 64-bit and 32-bit kernels. | |
19215 | + * On 32-bit kernels, vdso=[012] means the same thing. | |
19216 | + */ | |
19217 | +__setup("vdso32=", vdso_setup); | |
19218 | + | |
19219 | +#ifdef CONFIG_X86_32 | |
19220 | +__setup_param("vdso=", vdso32_setup, vdso_setup, 0); | |
19221 | + | |
19222 | +EXPORT_SYMBOL_GPL(vdso_enabled); | |
19223 | +#endif | |
19224 | + | |
19225 | +static __init void reloc_symtab(Elf32_Ehdr *ehdr, | |
19226 | + unsigned offset, unsigned size) | |
19227 | +{ | |
19228 | + Elf32_Sym *sym = (void *)ehdr + offset; | |
19229 | + unsigned nsym = size / sizeof(*sym); | |
19230 | + unsigned i; | |
19231 | + | |
19232 | + for(i = 0; i < nsym; i++, sym++) { | |
19233 | + if (sym->st_shndx == SHN_UNDEF || | |
19234 | + sym->st_shndx == SHN_ABS) | |
19235 | + continue; /* skip */ | |
19236 | + | |
19237 | + if (sym->st_shndx > SHN_LORESERVE) { | |
19238 | + printk(KERN_INFO "VDSO: unexpected st_shndx %x\n", | |
19239 | + sym->st_shndx); | |
19240 | + continue; | |
19241 | + } | |
19242 | + | |
19243 | + switch(ELF_ST_TYPE(sym->st_info)) { | |
19244 | + case STT_OBJECT: | |
19245 | + case STT_FUNC: | |
19246 | + case STT_SECTION: | |
19247 | + case STT_FILE: | |
19248 | + sym->st_value += VDSO_ADDR_ADJUST; | |
19249 | + } | |
19250 | + } | |
19251 | +} | |
19252 | + | |
19253 | +static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) | |
19254 | +{ | |
19255 | + Elf32_Dyn *dyn = (void *)ehdr + offset; | |
19256 | + | |
19257 | + for(; dyn->d_tag != DT_NULL; dyn++) | |
19258 | + switch(dyn->d_tag) { | |
19259 | + case DT_PLTGOT: | |
19260 | + case DT_HASH: | |
19261 | + case DT_STRTAB: | |
19262 | + case DT_SYMTAB: | |
19263 | + case DT_RELA: | |
19264 | + case DT_INIT: | |
19265 | + case DT_FINI: | |
19266 | + case DT_REL: | |
19267 | + case DT_DEBUG: | |
19268 | + case DT_JMPREL: | |
19269 | + case DT_VERSYM: | |
19270 | + case DT_VERDEF: | |
19271 | + case DT_VERNEED: | |
19272 | + case DT_ADDRRNGLO ... DT_ADDRRNGHI: | |
19273 | + /* definitely pointers needing relocation */ | |
19274 | + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; | |
19275 | + break; | |
19276 | + | |
19277 | + case DT_ENCODING ... OLD_DT_LOOS-1: | |
19278 | + case DT_LOOS ... DT_HIOS-1: | |
19279 | + /* Tags at or above DT_ENCODING are | |
19280 | + pointers if they're even */ | |
19281 | + if (dyn->d_tag >= DT_ENCODING && | |
19282 | + (dyn->d_tag & 1) == 0) | |
19283 | + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; | |
19284 | + break; | |
19285 | + | |
19286 | + case DT_VERDEFNUM: | |
19287 | + case DT_VERNEEDNUM: | |
19288 | + case DT_FLAGS_1: | |
19289 | + case DT_RELACOUNT: | |
19290 | + case DT_RELCOUNT: | |
19291 | + case DT_VALRNGLO ... DT_VALRNGHI: | |
19292 | + /* definitely not pointers */ | |
19293 | + break; | |
19294 | + | |
19295 | + case OLD_DT_LOOS ... DT_LOOS-1: | |
19296 | + case DT_HIOS ... DT_VALRNGLO-1: | |
19297 | + default: | |
19298 | + if (dyn->d_tag > DT_ENCODING) | |
19299 | + printk(KERN_INFO "VDSO: unexpected DT_tag %x\n", | |
19300 | + dyn->d_tag); | |
19301 | + break; | |
19302 | + } | |
19303 | +} | |
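
reloc_dyn() leans on the ELF gABI encoding rule for tags it does not list explicitly: within the DT_ENCODING ranges, even tags hold addresses (d_un.d_ptr) that need rebasing, while odd tags hold plain values (d_un.d_val) that must be left alone. Tags below DT_ENCODING, such as DT_STRTAB, follow no parity rule, which is why they appear as explicit cases. A small demonstration of the rule using glibc's <elf.h>:

        #include <elf.h>
        #include <stdio.h>

        /* the gABI rule: in the DT_ENCODING ranges, even tags are
         * addresses (d_ptr), odd tags are plain values (d_val) */
        static int tag_is_pointer(Elf32_Sword tag)
        {
                return tag >= DT_ENCODING && (tag & 1) == 0;
        }

        int main(void)
        {
                Elf32_Dyn dyn[] = {
                        { .d_tag = DT_ENCODING },       /* even => d_ptr */
                        { .d_tag = DT_ENCODING + 1 },   /* odd  => d_val */
                        { .d_tag = DT_NULL },
                };
                Elf32_Dyn *d;

                for (d = dyn; d->d_tag != DT_NULL; d++)
                        printf("tag %ld: %s\n", (long)d->d_tag,
                               tag_is_pointer(d->d_tag) ? "d_ptr" : "d_val");
                return 0;
        }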
19304 | + | |
19305 | +static __init void relocate_vdso(Elf32_Ehdr *ehdr) | |
19306 | +{ | |
19307 | + Elf32_Phdr *phdr; | |
19308 | + Elf32_Shdr *shdr; | |
19309 | + int i; | |
19310 | + | |
19311 | + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || | |
19312 | + !elf_check_arch_ia32(ehdr) || | |
19313 | + ehdr->e_type != ET_DYN); | |
19314 | + | |
19315 | + ehdr->e_entry += VDSO_ADDR_ADJUST; | |
19316 | + | |
19317 | + /* rebase phdrs */ | |
19318 | + phdr = (void *)ehdr + ehdr->e_phoff; | |
19319 | + for (i = 0; i < ehdr->e_phnum; i++) { | |
19320 | + phdr[i].p_vaddr += VDSO_ADDR_ADJUST; | |
19321 | + | |
19322 | + /* relocate dynamic stuff */ | |
19323 | + if (phdr[i].p_type == PT_DYNAMIC) | |
19324 | + reloc_dyn(ehdr, phdr[i].p_offset); | |
19325 | + } | |
19326 | + | |
19327 | + /* rebase sections */ | |
19328 | + shdr = (void *)ehdr + ehdr->e_shoff; | |
19329 | + for(i = 0; i < ehdr->e_shnum; i++) { | |
19330 | + if (!(shdr[i].sh_flags & SHF_ALLOC)) | |
19331 | + continue; | |
19332 | + | |
19333 | + shdr[i].sh_addr += VDSO_ADDR_ADJUST; | |
19334 | + | |
19335 | + if (shdr[i].sh_type == SHT_SYMTAB || | |
19336 | + shdr[i].sh_type == SHT_DYNSYM) | |
19337 | + reloc_symtab(ehdr, shdr[i].sh_offset, | |
19338 | + shdr[i].sh_size); | |
19339 | + } | |
19340 | +} | |
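
relocate_vdso() rebases a prelinked image in place: e_entry, every p_vaddr, every SHF_ALLOC sh_addr, and the dynamic and symbol tables all move by the same VDSO_ADDR_ADJUST constant. A reduced sketch of the header-walking part against a fake in-memory image (assumes glibc's <elf.h>; the adjust value is a stand-in):

        #include <elf.h>
        #include <stdio.h>
        #include <string.h>

        #define ADDR_ADJUST 0x1000000UL  /* stand-in for VDSO_ADDR_ADJUST */

        int main(void)
        {
                /* a fake prelinked image: ELF header plus one phdr */
                struct { Elf32_Ehdr ehdr; Elf32_Phdr phdr; } im;
                Elf32_Phdr *phdr;
                int i;

                memset(&im, 0, sizeof(im));
                memcpy(im.ehdr.e_ident, ELFMAG, SELFMAG);
                im.ehdr.e_type = ET_DYN;
                im.ehdr.e_entry = 0x400;
                im.ehdr.e_phoff = sizeof(im.ehdr);
                im.ehdr.e_phnum = 1;

                /* the rebase step: every address moves by one constant */
                im.ehdr.e_entry += ADDR_ADJUST;
                phdr = (Elf32_Phdr *)((char *)&im + im.ehdr.e_phoff);
                for (i = 0; i < im.ehdr.e_phnum; i++)
                        phdr[i].p_vaddr += ADDR_ADJUST;

                printf("entry %#lx, phdr0 vaddr %#lx\n",
                       (unsigned long)im.ehdr.e_entry,
                       (unsigned long)im.phdr.p_vaddr);
                return 0;
        }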
19341 | + | |
19342 | +/* | |
19343 | + * These symbols are defined by vdso32.S to mark the bounds | |
19344 | + * of the ELF DSO images included therein. | |
19345 | + */ | |
19346 | +extern const char vdso32_default_start, vdso32_default_end; | |
19347 | +extern const char vdso32_sysenter_start, vdso32_sysenter_end; | |
19348 | +static struct page *vdso32_pages[1]; | |
19349 | + | |
19350 | +#ifdef CONFIG_X86_64 | |
19351 | + | |
19352 | +#if CONFIG_XEN_COMPAT < 0x030200 | |
19353 | +static int use_int80 = 1; | |
19354 | +#endif | |
19355 | +static int use_sysenter __read_mostly = -1; | |
19356 | + | |
19357 | +#define vdso32_sysenter() (use_sysenter > 0) | |
19358 | + | |
19359 | +/* May not be __init: called during resume */ | |
19360 | +void syscall32_cpu_init(void) | |
19361 | +{ | |
19362 | + static const struct callback_register cstar = { | |
19363 | + .type = CALLBACKTYPE_syscall32, | |
19364 | + .address = (unsigned long)ia32_cstar_target | |
19365 | + }; | |
19366 | + static const struct callback_register sysenter = { | |
19367 | + .type = CALLBACKTYPE_sysenter, | |
19368 | + .address = (unsigned long)ia32_sysenter_target | |
19369 | + }; | |
19370 | + | |
19371 | + if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) || | |
19372 | + (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)) | |
19373 | +#if CONFIG_XEN_COMPAT < 0x030200 | |
19374 | + return; | |
19375 | + use_int80 = 0; | |
19376 | +#else | |
19377 | + BUG(); | |
19378 | +#endif | |
19379 | + | |
19380 | + if (use_sysenter < 0) | |
19381 | + use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL); | |
19382 | +} | |
19383 | + | |
19384 | +#define compat_uses_vma 1 | |
19385 | + | |
19386 | +static inline void map_compat_vdso(int map) | |
19387 | +{ | |
19388 | +} | |
19389 | + | |
19390 | +#else /* CONFIG_X86_32 */ | |
19391 | + | |
19392 | +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) | |
19393 | + | |
19394 | +extern asmlinkage void ia32pv_cstar_target(void); | |
19395 | +static /*const*/ struct callback_register __cpuinitdata cstar = { | |
19396 | + .type = CALLBACKTYPE_syscall32, | |
19397 | + .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target }, | |
19398 | +}; | |
19399 | + | |
19400 | +void __cpuinit enable_sep_cpu(void) | |
19401 | +{ | |
19402 | + extern asmlinkage void ia32pv_sysenter_target(void); | |
19403 | + static struct callback_register __cpuinitdata sysenter = { | |
19404 | + .type = CALLBACKTYPE_sysenter, | |
19405 | + .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target }, | |
19406 | + }; | |
19407 | + | |
19408 | + if (boot_cpu_has(X86_FEATURE_SYSCALL)) { | |
19409 | + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0) | |
19410 | + BUG(); | |
19411 | + return; | |
19412 | + } | |
19413 | + | |
19414 | + if (!boot_cpu_has(X86_FEATURE_SEP)) | |
19415 | + return; | |
19416 | + | |
19417 | + if (xen_feature(XENFEAT_supervisor_mode_kernel)) | |
19418 | + sysenter.address.eip = (unsigned long)ia32_sysenter_target; | |
19419 | + | |
19420 | + switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) { | |
19421 | + case 0: | |
19422 | + break; | |
19423 | +#if CONFIG_XEN_COMPAT < 0x030200 | |
19424 | + case -ENOSYS: | |
19425 | + sysenter.type = CALLBACKTYPE_sysenter_deprecated; | |
19426 | + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0) | |
19427 | + break; | |
19428 | +#endif | |
19429 | + default: | |
19430 | + setup_clear_cpu_cap(X86_FEATURE_SEP); | |
19431 | + break; | |
19432 | + } | |
19433 | +} | |
19434 | + | |
19435 | +static struct vm_area_struct gate_vma; | |
19436 | + | |
19437 | +static int __init gate_vma_init(void) | |
19438 | +{ | |
19439 | + gate_vma.vm_mm = NULL; | |
19440 | + gate_vma.vm_start = FIXADDR_USER_START; | |
19441 | + gate_vma.vm_end = FIXADDR_USER_END; | |
19442 | + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; | |
19443 | + gate_vma.vm_page_prot = __P101; | |
19444 | + /* | |
19445 | + * Make sure the vDSO gets into every core dump. | |
19446 | + * Dumping its contents makes post-mortem fully interpretable later | |
19447 | + * without matching up the same kernel and hardware config to see | |
19448 | + * what PC values meant. | |
19449 | + */ | |
19450 | + gate_vma.vm_flags |= VM_ALWAYSDUMP; | |
19451 | + return 0; | |
19452 | +} | |
19453 | + | |
19454 | +#define compat_uses_vma 0 | |
19455 | + | |
19456 | +static void map_compat_vdso(int map) | |
19457 | +{ | |
19458 | + static int vdso_mapped; | |
19459 | + | |
19460 | + if (map == vdso_mapped) | |
19461 | + return; | |
19462 | + | |
19463 | + vdso_mapped = map; | |
19464 | + | |
19465 | + __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT, | |
19466 | + map ? PAGE_READONLY_EXEC : PAGE_NONE); | |
19467 | + | |
19468 | + /* flush stray tlbs */ | |
19469 | + flush_tlb_all(); | |
19470 | +} | |
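
map_compat_vdso() keeps a static flag so the fixmap is only rewritten (and the TLB only flushed) when the requested state actually changes. The toggle-guard shape in isolation (the printf is an illustrative stand-in for the fixmap update):

        #include <stdio.h>

        /* only touch the mapping (and flush TLBs) on a state change */
        static void set_mapping(int map)
        {
                static int mapped;

                if (map == mapped)
                        return;
                mapped = map;
                /* __set_fixmap() + flush_tlb_all() would go here */
                printf("fixmap %s\n", map ? "mapped" : "unmapped");
        }

        int main(void)
        {
                set_mapping(1);
                set_mapping(1);         /* no-op: already mapped */
                set_mapping(0);
                return 0;
        }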
19471 | + | |
19472 | +#endif /* CONFIG_X86_64 */ | |
19473 | + | |
19474 | +int __init sysenter_setup(void) | |
19475 | +{ | |
19476 | + void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); | |
19477 | + const void *vsyscall; | |
19478 | + size_t vsyscall_len; | |
19479 | + | |
19480 | + vdso32_pages[0] = virt_to_page(syscall_page); | |
19481 | + | |
19482 | +#ifdef CONFIG_X86_32 | |
19483 | + gate_vma_init(); | |
19484 | + | |
19485 | + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); | |
19486 | +#endif | |
19487 | + | |
19488 | +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200 | |
19489 | + if (use_int80) { | |
19490 | + extern const char vdso32_int80_start, vdso32_int80_end; | |
19491 | + | |
19492 | + vsyscall = &vdso32_int80_start; | |
19493 | + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start; | |
19494 | + } else | |
19495 | +#elif defined(CONFIG_X86_32) | |
19496 | + if (boot_cpu_has(X86_FEATURE_SYSCALL) | |
19497 | + && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD | |
19498 | + || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)) | |
19499 | + setup_clear_cpu_cap(X86_FEATURE_SYSCALL); | |
19500 | + barrier(); /* until clear_bit()'s constraints are correct ... */ | |
19501 | + if (boot_cpu_has(X86_FEATURE_SYSCALL)) { | |
19502 | + extern const char vdso32_syscall_start, vdso32_syscall_end; | |
19503 | + | |
19504 | + vsyscall = &vdso32_syscall_start; | |
19505 | + vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start; | |
19506 | + } else | |
19507 | +#endif | |
19508 | + if (!vdso32_sysenter()) { | |
19509 | + vsyscall = &vdso32_default_start; | |
19510 | + vsyscall_len = &vdso32_default_end - &vdso32_default_start; | |
19511 | + } else { | |
19512 | + vsyscall = &vdso32_sysenter_start; | |
19513 | + vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; | |
19514 | + } | |
19515 | + | |
19516 | + memcpy(syscall_page, vsyscall, vsyscall_len); | |
19517 | + relocate_vdso(syscall_page); | |
19518 | + | |
19519 | + return 0; | |
19520 | +} | |
19521 | + | |
19522 | +/* Setup a VMA at program startup for the vsyscall page */ | |
19523 | +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) | |
19524 | +{ | |
19525 | + struct mm_struct *mm = current->mm; | |
19526 | + unsigned long addr; | |
19527 | + int ret = 0; | |
19528 | + bool compat; | |
19529 | + | |
19530 | + down_write(&mm->mmap_sem); | |
19531 | + | |
19532 | + /* Test compat mode once here, in case someone | |
19533 | + changes it via sysctl */ | |
19534 | + compat = (vdso_enabled == VDSO_COMPAT); | |
19535 | + | |
19536 | + map_compat_vdso(compat); | |
19537 | + | |
19538 | + if (compat) | |
19539 | + addr = VDSO_HIGH_BASE; | |
19540 | + else { | |
19541 | + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); | |
19542 | + if (IS_ERR_VALUE(addr)) { | |
19543 | + ret = addr; | |
19544 | + goto up_fail; | |
19545 | + } | |
19546 | + } | |
19547 | + | |
19548 | + if (compat_uses_vma || !compat) { | |
19549 | + /* | |
19550 | + * MAYWRITE to allow gdb to COW and set breakpoints | |
19551 | + * | |
19552 | + * Make sure the vDSO gets into every core dump. | |
19553 | + * Dumping its contents makes post-mortem fully | |
19554 | + * interpretable later without matching up the same | |
19555 | + * kernel and hardware config to see what PC values | |
19556 | + * meant. | |
19557 | + */ | |
19558 | + ret = install_special_mapping(mm, addr, PAGE_SIZE, | |
19559 | + VM_READ|VM_EXEC| | |
19560 | + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| | |
19561 | + VM_ALWAYSDUMP, | |
19562 | + vdso32_pages); | |
19563 | + | |
19564 | + if (ret) | |
19565 | + goto up_fail; | |
19566 | + } | |
19567 | + | |
19568 | + current->mm->context.vdso = (void *)addr; | |
19569 | + current_thread_info()->sysenter_return = | |
19570 | + VDSO32_SYMBOL(addr, SYSENTER_RETURN); | |
19571 | + | |
19572 | + up_fail: | |
19573 | + up_write(&mm->mmap_sem); | |
19574 | + | |
19575 | + return ret; | |
19576 | +} | |
19577 | + | |
19578 | +#ifdef CONFIG_X86_64 | |
19579 | + | |
19580 | +/* | |
19581 | + * This must be done early in case we have an initrd containing 32-bit | |
19582 | + * binaries (e.g., hotplug). This could be pushed upstream. | |
19583 | + */ | |
19584 | +core_initcall(sysenter_setup); | |
19585 | + | |
19586 | +#ifdef CONFIG_SYSCTL | |
19587 | +/* Register vsyscall32 into the ABI table */ | |
19588 | +#include <linux/sysctl.h> | |
19589 | + | |
19590 | +static ctl_table abi_table2[] = { | |
19591 | + { | |
19592 | + .procname = "vsyscall32", | |
19593 | + .data = &sysctl_vsyscall32, | |
19594 | + .maxlen = sizeof(int), | |
19595 | + .mode = 0644, | |
19596 | + .proc_handler = proc_dointvec | |
19597 | + }, | |
19598 | + {} | |
19599 | +}; | |
19600 | + | |
19601 | +static ctl_table abi_root_table2[] = { | |
19602 | + { | |
19603 | + .ctl_name = CTL_ABI, | |
19604 | + .procname = "abi", | |
19605 | + .mode = 0555, | |
19606 | + .child = abi_table2 | |
19607 | + }, | |
19608 | + {} | |
19609 | +}; | |
19610 | + | |
19611 | +static __init int ia32_binfmt_init(void) | |
19612 | +{ | |
19613 | + register_sysctl_table(abi_root_table2); | |
19614 | + return 0; | |
19615 | +} | |
19616 | +__initcall(ia32_binfmt_init); | |
19617 | +#endif | |
19618 | + | |
19619 | +#else /* CONFIG_X86_32 */ | |
19620 | + | |
19621 | +const char *arch_vma_name(struct vm_area_struct *vma) | |
19622 | +{ | |
19623 | + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) | |
19624 | + return "[vdso]"; | |
19625 | + return NULL; | |
19626 | +} | |
19627 | + | |
19628 | +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | |
19629 | +{ | |
19630 | + struct mm_struct *mm = tsk->mm; | |
19631 | + | |
19632 | + /* Check to see if this task was created in compat vdso mode */ | |
19633 | + if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) | |
19634 | + return &gate_vma; | |
19635 | + return NULL; | |
19636 | +} | |
19637 | + | |
19638 | +int in_gate_area(struct task_struct *task, unsigned long addr) | |
19639 | +{ | |
19640 | + const struct vm_area_struct *vma = get_gate_vma(task); | |
19641 | + | |
19642 | + return vma && addr >= vma->vm_start && addr < vma->vm_end; | |
19643 | +} | |
19644 | + | |
19645 | +int in_gate_area_no_task(unsigned long addr) | |
19646 | +{ | |
19647 | + return 0; | |
19648 | +} | |
19649 | + | |
19650 | +#endif /* CONFIG_X86_64 */ | |
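
The sysenter registration switch near the top of this file is a compatibility probe: try the current hypercall ABI first, retry the deprecated sub-op on -ENOSYS, and clear the CPU feature if neither registration succeeds. A minimal stand-alone sketch of that pattern, assuming a mocked-up callback_register() — the CB_* constants and the ABI version below are hypothetical stand-ins, not Xen's real interface:

    /* Probe-and-fall-back: new ABI first, deprecated sub-op on -ENOSYS,
     * feature off otherwise.  Mocked hypervisor, not the real interface. */
    #include <errno.h>
    #include <stdio.h>

    enum cb_type { CB_SYSENTER, CB_SYSENTER_DEPRECATED };

    static int hypervisor_abi = 0x030100;   /* pretend: pre-3.2 hypervisor */

    static int callback_register(enum cb_type type)
    {
        /* An old hypervisor only understands the deprecated sub-op. */
        if (hypervisor_abi < 0x030200 && type == CB_SYSENTER)
            return -ENOSYS;
        return 0;
    }

    int main(void)
    {
        int have_sep = 1;

        switch (callback_register(CB_SYSENTER)) {
        case 0:
            break;
        case -ENOSYS:           /* old ABI: retry the deprecated type */
            if (callback_register(CB_SYSENTER_DEPRECATED) == 0)
                break;
            /* fall through */
        default:
            have_sep = 0;       /* neither worked: turn the feature off */
            break;
        }
        printf("SYSENTER %s\n", have_sep ? "enabled" : "disabled");
        return 0;
    }
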
19651 | --- a/arch/x86/vdso/vdso32/syscall.S | |
19652 | +++ b/arch/x86/vdso/vdso32/syscall.S | |
19653 | @@ -19,8 +19,10 @@ __kernel_vsyscall: | |
19654 | .Lpush_ebp: | |
19655 | movl %ecx, %ebp | |
19656 | syscall | |
19657 | +#ifndef CONFIG_XEN | |
19658 | movl $__USER32_DS, %ecx | |
19659 | movl %ecx, %ss | |
19660 | +#endif | |
19661 | movl %ebp, %ecx | |
19662 | popl %ebp | |
19663 | .Lpop_ebp: | |
19664 | --- a/drivers/pci/msi-xen.c | |
19665 | +++ b/drivers/pci/msi-xen.c | |
19666 | @@ -43,6 +43,53 @@ struct msi_pirq_entry { | |
19667 | int entry_nr; | |
19668 | }; | |
19669 | ||
19670 | +/* Arch hooks */ | |
19671 | + | |
19672 | +int __attribute__ ((weak)) | |
19673 | +arch_msi_check_device(struct pci_dev *dev, int nvec, int type) | |
19674 | +{ | |
19675 | + return 0; | |
19676 | +} | |
19677 | + | |
19678 | +#ifndef CONFIG_XEN | |
19679 | +int __attribute__ ((weak)) | |
19680 | +arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry) | |
19681 | +{ | |
19682 | + return 0; | |
19683 | +} | |
19684 | + | |
19685 | +int __attribute__ ((weak)) | |
19686 | +arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) | |
19687 | +{ | |
19688 | + struct msi_desc *entry; | |
19689 | + int ret; | |
19690 | + | |
19691 | + list_for_each_entry(entry, &dev->msi_list, list) { | |
19692 | + ret = arch_setup_msi_irq(dev, entry); | |
19693 | + if (ret) | |
19694 | + return ret; | |
19695 | + } | |
19696 | + | |
19697 | + return 0; | |
19698 | +} | |
19699 | + | |
19700 | +void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq) | |
19701 | +{ | |
19702 | + return; | |
19703 | +} | |
19704 | + | |
19705 | +void __attribute__ ((weak)) | |
19706 | +arch_teardown_msi_irqs(struct pci_dev *dev) | |
19707 | +{ | |
19708 | + struct msi_desc *entry; | |
19709 | + | |
19710 | + list_for_each_entry(entry, &dev->msi_list, list) { | |
19711 | + if (entry->irq != 0) | |
19712 | + arch_teardown_msi_irq(entry->irq); | |
19713 | + } | |
19714 | +} | |
19715 | +#endif | |
19716 | + | |
19717 | static void msi_set_enable(struct pci_dev *dev, int enable) | |
19718 | { | |
19719 | int pos; | |
19720 | @@ -270,7 +317,6 @@ static void pci_intx_for_msi(struct pci_ | |
19721 | pci_intx(dev, enable); | |
19722 | } | |
19723 | ||
19724 | -#ifdef CONFIG_PM | |
19725 | static void __pci_restore_msi_state(struct pci_dev *dev) | |
19726 | { | |
19727 | int pirq; | |
19728 | @@ -328,7 +374,7 @@ void pci_restore_msi_state(struct pci_de | |
19729 | __pci_restore_msi_state(dev); | |
19730 | __pci_restore_msix_state(dev); | |
19731 | } | |
19732 | -#endif /* CONFIG_PM */ | |
19733 | +EXPORT_SYMBOL_GPL(pci_restore_msi_state); | |
19734 | ||
19735 | /** | |
19736 | * msi_capability_init - configure device's MSI capability structure | |
19737 | @@ -760,51 +806,3 @@ void pci_msi_init_pci_dev(struct pci_dev | |
19738 | INIT_LIST_HEAD(&dev->msi_list); | |
19739 | #endif | |
19740 | } | |
19741 | - | |
19742 | - | |
19743 | -/* Arch hooks */ | |
19744 | - | |
19745 | -int __attribute__ ((weak)) | |
19746 | -arch_msi_check_device(struct pci_dev* dev, int nvec, int type) | |
19747 | -{ | |
19748 | - return 0; | |
19749 | -} | |
19750 | - | |
19751 | -#ifndef CONFIG_XEN | |
19752 | -int __attribute__ ((weak)) | |
19753 | -arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry) | |
19754 | -{ | |
19755 | - return 0; | |
19756 | -} | |
19757 | - | |
19758 | -int __attribute__ ((weak)) | |
19759 | -arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) | |
19760 | -{ | |
19761 | - struct msi_desc *entry; | |
19762 | - int ret; | |
19763 | - | |
19764 | - list_for_each_entry(entry, &dev->msi_list, list) { | |
19765 | - ret = arch_setup_msi_irq(dev, entry); | |
19766 | - if (ret) | |
19767 | - return ret; | |
19768 | - } | |
19769 | - | |
19770 | - return 0; | |
19771 | -} | |
19772 | - | |
19773 | -void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq) | |
19774 | -{ | |
19775 | - return; | |
19776 | -} | |
19777 | - | |
19778 | -void __attribute__ ((weak)) | |
19779 | -arch_teardown_msi_irqs(struct pci_dev *dev) | |
19780 | -{ | |
19781 | - struct msi_desc *entry; | |
19782 | - | |
19783 | - list_for_each_entry(entry, &dev->msi_list, list) { | |
19784 | - if (entry->irq != 0) | |
19785 | - arch_teardown_msi_irq(entry->irq); | |
19786 | - } | |
19787 | -} | |
19788 | -#endif | |
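
The hooks hoisted to the top of this file follow the standard weak-symbol idiom: the generic MSI code supplies do-nothing defaults, and an architecture overrides one simply by linking a strong definition of the same symbol. A minimal single-file sketch, assuming GCC or clang — arch_check_device() is a made-up name, not the kernel hook:

    #include <stdio.h>

    /* Weak default: accepts everything.  Any non-weak definition of the
     * same symbol elsewhere in the link silently replaces it, with no
     * registration step needed. */
    int __attribute__((weak)) arch_check_device(int nvec)
    {
        (void)nvec;
        return 0;
    }

    int main(void)
    {
        printf("arch_check_device(1) = %d\n", arch_check_device(1));
        return 0;
    }
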
19789 | --- a/drivers/pci/pci.c | |
19790 | +++ b/drivers/pci/pci.c | |
19791 | @@ -353,7 +353,12 @@ pci_find_parent_resource(const struct pc | |
19792 | * Restore the BAR values for a given device, so as to make it | |
19793 | * accessible by its driver. | |
19794 | */ | |
19795 | +#ifndef CONFIG_XEN | |
19796 | static void | |
19797 | +#else | |
19798 | +EXPORT_SYMBOL_GPL(pci_restore_bars); | |
19799 | +void | |
19800 | +#endif | |
19801 | pci_restore_bars(struct pci_dev *dev) | |
19802 | { | |
19803 | int i, numres; | |
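
The pci_restore_bars() change above widens the function's linkage only when Xen needs to call it from elsewhere, leaving it static otherwise. A userspace analogue of the preprocessor trick, with the hypothetical HAVE_EXTERNAL_USERS standing in for CONFIG_XEN:

    #include <stdio.h>

    #ifdef HAVE_EXTERNAL_USERS
    #define HOOK_LINKAGE            /* external: other files may call it */
    #else
    #define HOOK_LINKAGE static     /* internal: the symbol stays file-local */
    #endif

    HOOK_LINKAGE void restore_state(void)
    {
        puts("state restored");
    }

    int main(void)
    {
        restore_state();
        return 0;
    }

Compiling with -DHAVE_EXTERNAL_USERS flips restore_state() to external linkage, mirroring what CONFIG_XEN does to pci_restore_bars() here.
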
19804 | --- a/drivers/xen/balloon/sysfs.c | |
19805 | +++ b/drivers/xen/balloon/sysfs.c | |
19806 | @@ -108,7 +108,7 @@ static struct attribute_group balloon_in | |
19807 | }; | |
19808 | ||
19809 | static struct sysdev_class balloon_sysdev_class = { | |
19810 | - set_kset_name(BALLOON_CLASS_NAME), | |
19811 | + .name = BALLOON_CLASS_NAME, | |
19812 | }; | |
19813 | ||
19814 | static struct sys_device balloon_sysdev; | |
19815 | --- a/drivers/xen/blkback/blkback.c | |
19816 | +++ b/drivers/xen/blkback/blkback.c | |
19817 | @@ -148,7 +148,7 @@ static void unplug_queue(blkif_t *blkif) | |
19818 | return; | |
19819 | if (blkif->plug->unplug_fn) | |
19820 | blkif->plug->unplug_fn(blkif->plug); | |
19821 | - blk_put_queue(blkif->plug); | |
19822 | + kobject_put(&blkif->plug->kobj); | |
19823 | blkif->plug = NULL; | |
19824 | } | |
19825 | ||
19826 | @@ -159,7 +159,8 @@ static void plug_queue(blkif_t *blkif, s | |
19827 | if (q == blkif->plug) | |
19828 | return; | |
19829 | unplug_queue(blkif); | |
19830 | - blk_get_queue(q); | |
19831 | + WARN_ON(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)); | |
19832 | + kobject_get(&q->kobj); | |
19833 | blkif->plug = q; | |
19834 | } | |
19835 | ||
19836 | --- a/drivers/xen/blkfront/blkfront.c | |
19837 | +++ b/drivers/xen/blkfront/blkfront.c | |
19838 | @@ -716,7 +716,6 @@ static irqreturn_t blkif_int(int irq, vo | |
19839 | RING_IDX i, rp; | |
19840 | unsigned long flags; | |
19841 | struct blkfront_info *info = (struct blkfront_info *)dev_id; | |
19842 | - int uptodate; | |
19843 | ||
19844 | spin_lock_irqsave(&blkif_io_lock, flags); | |
19845 | ||
19846 | @@ -741,13 +740,13 @@ static irqreturn_t blkif_int(int irq, vo | |
19847 | ||
19848 | ADD_ID_TO_FREELIST(info, id); | |
19849 | ||
19850 | - uptodate = (bret->status == BLKIF_RSP_OKAY); | |
19851 | + ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO; | |
19852 | switch (bret->operation) { | |
19853 | case BLKIF_OP_WRITE_BARRIER: | |
19854 | if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { | |
19855 | printk("blkfront: %s: write barrier op failed\n", | |
19856 | info->gd->disk_name); | |
19857 | - uptodate = -EOPNOTSUPP; | |
19858 | + ret = -EOPNOTSUPP; | |
19859 | info->feature_barrier = 0; | |
19860 | xlvbd_barrier(info); | |
19861 | } | |
19862 | @@ -758,10 +757,8 @@ static irqreturn_t blkif_int(int irq, vo | |
19863 | DPRINTK("Bad return from blkdev data " | |
19864 | "request: %x\n", bret->status); | |
19865 | ||
19866 | - ret = end_that_request_first(req, uptodate, | |
19867 | - req->hard_nr_sectors); | |
19868 | + ret = __blk_end_request(req, ret, blk_rq_bytes(req)); | |
19869 | BUG_ON(ret); | |
19870 | - end_that_request_last(req, uptodate); | |
19871 | break; | |
19872 | default: | |
19873 | BUG(); | |
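
This hunk folds the old two-step completion — end_that_request_first() followed by end_that_request_last() — into a single __blk_end_request() call that accounts for the completed bytes and finishes the request once none remain, which is why a non-zero return now trips BUG_ON(). A toy model of that consolidation; the struct and function below are illustrative stand-ins, not the block layer's API:

    #include <assert.h>
    #include <stdio.h>

    struct toy_req { unsigned int remaining; int done; };

    /* One call: account for `bytes`, finish when nothing is left.
     * Returns nonzero while the request is still partially pending. */
    static int toy_end_request(struct toy_req *rq, int error, unsigned int bytes)
    {
        (void)error;
        rq->remaining -= bytes < rq->remaining ? bytes : rq->remaining;
        if (rq->remaining)
            return 1;
        rq->done = 1;
        return 0;
    }

    int main(void)
    {
        struct toy_req rq = { .remaining = 4096 };
        int ret = toy_end_request(&rq, 0, 4096);   /* complete in one shot */

        assert(ret == 0 && rq.done);
        puts("request completed in a single call");
        return 0;
    }
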
19874 | --- a/drivers/xen/blktap/blktap.c | |
19875 | +++ b/drivers/xen/blktap/blktap.c | |
19876 | @@ -327,8 +327,8 @@ static pte_t blktap_clear_pte(struct vm_ | |
19877 | * if vm_file is NULL (meaning mmap failed and we have nothing to do) | |
19878 | */ | |
19879 | if (uvaddr < uvstart || vma->vm_file == NULL) | |
19880 | - return ptep_get_and_clear_full(vma->vm_mm, uvaddr, | |
19881 | - ptep, is_fullmm); | |
19882 | + return xen_ptep_get_and_clear_full(vma, uvaddr, ptep, | |
19883 | + is_fullmm); | |
19884 | ||
19885 | info = vma->vm_file->private_data; | |
19886 | map = vma->vm_private_data; | |
19887 | @@ -375,8 +375,8 @@ static pte_t blktap_clear_pte(struct vm_ | |
19888 | BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap)); | |
19889 | ||
19890 | /* USING SHADOW PAGE TABLES. */ | |
19891 | - copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep, | |
19892 | - is_fullmm); | |
19893 | + copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep, | |
19894 | + is_fullmm); | |
19895 | } | |
19896 | ||
19897 | if (count) { | |
19898 | --- a/drivers/xen/core/evtchn.c | |
19899 | +++ b/drivers/xen/core/evtchn.c | |
19900 | @@ -193,7 +193,7 @@ static inline unsigned int cpu_from_evtc | |
19901 | ||
19902 | /* Upcall to generic IRQ layer. */ | |
19903 | #ifdef CONFIG_X86 | |
19904 | -extern fastcall unsigned int do_IRQ(struct pt_regs *regs); | |
19905 | +extern unsigned int do_IRQ(struct pt_regs *regs); | |
19906 | void __init xen_init_IRQ(void); | |
19907 | void __init init_IRQ(void) | |
19908 | { | |
19909 | @@ -202,13 +202,11 @@ void __init init_IRQ(void) | |
19910 | } | |
19911 | #if defined (__i386__) | |
19912 | static inline void exit_idle(void) {} | |
19913 | -#define IRQ_REG orig_eax | |
19914 | #elif defined (__x86_64__) | |
19915 | #include <asm/idle.h> | |
19916 | -#define IRQ_REG orig_rax | |
19917 | #endif | |
19918 | #define do_IRQ(irq, regs) do { \ | |
19919 | - (regs)->IRQ_REG = ~(irq); \ | |
19920 | + (regs)->orig_ax = ~(irq); \ | |
19921 | do_IRQ((regs)); \ | |
19922 | } while (0) | |
19923 | #endif | |
19924 | @@ -669,13 +667,12 @@ static void set_affinity_irq(unsigned in | |
19925 | int resend_irq_on_evtchn(unsigned int irq) | |
19926 | { | |
19927 | int masked, evtchn = evtchn_from_irq(irq); | |
19928 | - shared_info_t *s = HYPERVISOR_shared_info; | |
19929 | ||
19930 | if (!VALID_EVTCHN(evtchn)) | |
19931 | return 1; | |
19932 | ||
19933 | masked = test_and_set_evtchn_mask(evtchn); | |
19934 | - synch_set_bit(evtchn, s->evtchn_pending); | |
19935 | + set_evtchn(evtchn); | |
19936 | if (!masked) | |
19937 | unmask_evtchn(evtchn); | |
19938 | ||
19939 | @@ -968,6 +965,43 @@ void disable_all_local_evtchn(void) | |
19940 | synch_set_bit(i, &s->evtchn_mask[0]); | |
19941 | } | |
19942 | ||
19943 | +/* Clear an irq's pending state, in preparation for polling on it. */ | |
19944 | +void xen_clear_irq_pending(int irq) | |
19945 | +{ | |
19946 | + int evtchn = evtchn_from_irq(irq); | |
19947 | + | |
19948 | + if (VALID_EVTCHN(evtchn)) | |
19949 | + clear_evtchn(evtchn); | |
19950 | +} | |
19951 | + | |
19952 | +/* Set an irq's pending state, to avoid blocking on it. */ | |
19953 | +void xen_set_irq_pending(int irq) | |
19954 | +{ | |
19955 | + int evtchn = evtchn_from_irq(irq); | |
19956 | + | |
19957 | + if (VALID_EVTCHN(evtchn)) | |
19958 | + set_evtchn(evtchn); | |
19959 | +} | |
19960 | + | |
19961 | +/* Test an irq's pending state. */ | |
19962 | +int xen_test_irq_pending(int irq) | |
19963 | +{ | |
19964 | + int evtchn = evtchn_from_irq(irq); | |
19965 | + | |
19966 | + return VALID_EVTCHN(evtchn) && test_evtchn(evtchn); | |
19967 | +} | |
19968 | + | |
19969 | +/* Poll waiting for an irq to become pending. In the usual case, the | |
19970 | + irq will be disabled so it won't deliver an interrupt. */ | |
19971 | +void xen_poll_irq(int irq) | |
19972 | +{ | |
19973 | + evtchn_port_t evtchn = evtchn_from_irq(irq); | |
19974 | + | |
19975 | + if (VALID_EVTCHN(evtchn) | |
19976 | + && HYPERVISOR_poll_no_timeout(&evtchn, 1)) | |
19977 | + BUG(); | |
19978 | +} | |
19979 | + | |
19980 | static void restore_cpu_virqs(unsigned int cpu) | |
19981 | { | |
19982 | struct evtchn_bind_virq bind_virq; | |
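
The xen_clear/set/test_irq_pending() and xen_poll_irq() helpers added above exist to support a race-free block-until-kicked loop: clear the pending bit first, re-check the condition, then block, so a kick landing between the re-check and the block re-sets the bit and the poll returns immediately. A userspace analogue of that ordering, assuming a POSIX semaphore in place of the event channel (names and timing are illustrative only):

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static sem_t kick;                 /* stands in for the event channel */
    static atomic_int condition;       /* stands in for "lock became free" */

    static void *kicker(void *arg)
    {
        (void)arg;
        usleep(100 * 1000);
        atomic_store(&condition, 1);   /* make the condition true ...      */
        sem_post(&kick);               /* ... and only then raise the kick */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        sem_init(&kick, 0, 0);
        pthread_create(&t, NULL, kicker, NULL);

        while (sem_trywait(&kick) == 0)     /* 1. clear stale pending state */
            ;
        while (!atomic_load(&condition))    /* 2. re-check before blocking  */
            sem_wait(&kick);                /* 3. block until kicked        */

        puts("woken without a lost-wakeup window");
        pthread_join(t, NULL);
        sem_destroy(&kick);
        return 0;
    }
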
19983 | --- a/drivers/xen/core/hypervisor_sysfs.c | |
19984 | +++ b/drivers/xen/core/hypervisor_sysfs.c | |
19985 | @@ -50,7 +50,7 @@ static int __init hypervisor_subsys_init | |
19986 | if (!is_running_on_xen()) | |
19987 | return -ENODEV; | |
19988 | ||
19989 | - hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type; | |
19990 | + hypervisor_kobj->ktype = &hyp_sysfs_kobj_type; | |
19991 | return 0; | |
19992 | } | |
19993 | ||
19994 | --- a/drivers/xen/core/Makefile | |
19995 | +++ b/drivers/xen/core/Makefile | |
19996 | @@ -10,5 +10,6 @@ obj-$(CONFIG_SYS_HYPERVISOR) += hypervis | |
19997 | obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o | |
19998 | obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o | |
19999 | obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o | |
20000 | +obj-$(CONFIG_X86_SMP) += spinlock.o | |
20001 | obj-$(CONFIG_KEXEC) += machine_kexec.o | |
20002 | obj-$(CONFIG_XEN_XENCOMM) += xencomm.o | |
20003 | --- a/drivers/xen/core/smpboot.c | |
20004 | +++ b/drivers/xen/core/smpboot.c | |
20005 | @@ -139,6 +139,10 @@ static int __cpuinit xen_smp_intr_init(u | |
20006 | goto fail; | |
20007 | per_cpu(callfunc_irq, cpu) = rc; | |
20008 | ||
20009 | + rc = xen_spinlock_init(cpu); | |
20010 | + if (rc < 0) | |
20011 | + goto fail; | |
20012 | + | |
20013 | if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0)) | |
20014 | goto fail; | |
20015 | ||
20016 | @@ -149,6 +153,7 @@ static int __cpuinit xen_smp_intr_init(u | |
20017 | unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); | |
20018 | if (per_cpu(callfunc_irq, cpu) >= 0) | |
20019 | unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); | |
20020 | + xen_spinlock_cleanup(cpu); | |
20021 | return rc; | |
20022 | } | |
20023 | ||
20024 | @@ -160,6 +165,7 @@ static void xen_smp_intr_exit(unsigned i | |
20025 | ||
20026 | unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); | |
20027 | unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); | |
20028 | + xen_spinlock_cleanup(cpu); | |
20029 | } | |
20030 | #endif | |
20031 | ||
20032 | @@ -212,36 +218,25 @@ static void __cpuinit cpu_initialize_con | |
20033 | smp_trap_init(ctxt.trap_ctxt); | |
20034 | ||
20035 | ctxt.ldt_ents = 0; | |
20036 | - ctxt.gdt_ents = GDT_SIZE / 8; | |
20037 | - | |
20038 | -#ifdef __i386__ | |
20039 | ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu)); | |
20040 | + ctxt.gdt_ents = GDT_SIZE / 8; | |
20041 | ||
20042 | ctxt.user_regs.cs = __KERNEL_CS; | |
20043 | - ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); | |
20044 | + ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); | |
20045 | ||
20046 | ctxt.kernel_ss = __KERNEL_DS; | |
20047 | - ctxt.kernel_sp = idle->thread.esp0; | |
20048 | + ctxt.kernel_sp = idle->thread.sp0; | |
20049 | ||
20050 | - ctxt.event_callback_cs = __KERNEL_CS; | |
20051 | ctxt.event_callback_eip = (unsigned long)hypervisor_callback; | |
20052 | - ctxt.failsafe_callback_cs = __KERNEL_CS; | |
20053 | ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; | |
20054 | +#ifdef __i386__ | |
20055 | + ctxt.event_callback_cs = __KERNEL_CS; | |
20056 | + ctxt.failsafe_callback_cs = __KERNEL_CS; | |
20057 | ||
20058 | ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); | |
20059 | ||
20060 | ctxt.user_regs.fs = __KERNEL_PERCPU; | |
20061 | #else /* __x86_64__ */ | |
20062 | - ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address); | |
20063 | - | |
20064 | - ctxt.user_regs.cs = __KERNEL_CS; | |
20065 | - ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); | |
20066 | - | |
20067 | - ctxt.kernel_ss = __KERNEL_DS; | |
20068 | - ctxt.kernel_sp = idle->thread.rsp0; | |
20069 | - | |
20070 | - ctxt.event_callback_eip = (unsigned long)hypervisor_callback; | |
20071 | - ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; | |
20072 | ctxt.syscall_callback_eip = (unsigned long)system_call; | |
20073 | ||
20074 | ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); | |
20075 | --- /dev/null | |
20076 | +++ b/drivers/xen/core/spinlock.c | |
20077 | @@ -0,0 +1,161 @@ | |
20078 | +/* | |
20079 | + * Xen spinlock functions | |
20080 | + * | |
20081 | + * See arch/x86/xen/smp.c for copyright and credits for derived | |
20082 | + * portions of this file. | |
20083 | + */ | |
20084 | + | |
20085 | +#include <linux/init.h> | |
20086 | +#include <linux/irq.h> | |
20087 | +#include <linux/kernel.h> | |
20088 | +#include <linux/kernel_stat.h> | |
20089 | +#include <linux/module.h> | |
20090 | +#include <xen/evtchn.h> | |
20091 | + | |
20092 | +extern irqreturn_t smp_reschedule_interrupt(int, void *); | |
20093 | + | |
20094 | +static DEFINE_PER_CPU(int, spinlock_irq) = -1; | |
20095 | +static char spinlock_name[NR_CPUS][15]; | |
20096 | + | |
20097 | +struct spinning { | |
20098 | + raw_spinlock_t *lock; | |
20099 | + unsigned int ticket; | |
20100 | + struct spinning *prev; | |
20101 | +}; | |
20102 | +static DEFINE_PER_CPU(struct spinning *, spinning); | |
20103 | +/* | |
20104 | + * Protect removal of objects: addition can be done locklessly, and even | |
20105 | + * removal itself doesn't need protection - what must be prevented is | |
20106 | + * removed objects going out of scope (as they're allocated on the stack). | |
20107 | + */ | |
20108 | +static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED; | |
20109 | + | |
20110 | +int __cpuinit xen_spinlock_init(unsigned int cpu) | |
20111 | +{ | |
20112 | + int rc; | |
20113 | + | |
20114 | + sprintf(spinlock_name[cpu], "spinlock%u", cpu); | |
20115 | + rc = bind_ipi_to_irqhandler(SPIN_UNLOCK_VECTOR, | |
20116 | + cpu, | |
20117 | + smp_reschedule_interrupt, | |
20118 | + IRQF_DISABLED|IRQF_NOBALANCING, | |
20119 | + spinlock_name[cpu], | |
20120 | + NULL); | |
20121 | + if (rc < 0) | |
20122 | + return rc; | |
20123 | + | |
20124 | + disable_irq(rc); /* make sure it's never delivered */ | |
20125 | + per_cpu(spinlock_irq, cpu) = rc; | |
20126 | + | |
20127 | + return 0; | |
20128 | +} | |
20129 | + | |
20130 | +void __cpuinit xen_spinlock_cleanup(unsigned int cpu) | |
20131 | +{ | |
20132 | + if (per_cpu(spinlock_irq, cpu) >= 0) | |
20133 | + unbind_from_irqhandler(per_cpu(spinlock_irq, cpu), NULL); | |
20134 | + per_cpu(spinlock_irq, cpu) = -1; | |
20135 | +} | |
20136 | + | |
20137 | +int xen_spin_wait(raw_spinlock_t *lock, unsigned int token) | |
20138 | +{ | |
20139 | + int rc = 0, irq = __get_cpu_var(spinlock_irq); | |
20140 | + raw_rwlock_t *rm_lock; | |
20141 | + unsigned long flags; | |
20142 | + struct spinning spinning; | |
20143 | + | |
20144 | + /* If kicker interrupt not initialized yet, just spin. */ | |
20145 | + if (unlikely(irq < 0)) | |
20146 | + return 0; | |
20147 | + | |
20148 | + token >>= TICKET_SHIFT; | |
20149 | + | |
20150 | + /* announce we're spinning */ | |
20151 | + spinning.ticket = token; | |
20152 | + spinning.lock = lock; | |
20153 | + spinning.prev = __get_cpu_var(spinning); | |
20154 | + smp_wmb(); | |
20155 | + __get_cpu_var(spinning) = &spinning; | |
20156 | + | |
20157 | + /* clear pending */ | |
20158 | + xen_clear_irq_pending(irq); | |
20159 | + | |
20160 | + do { | |
20161 | + /* Check again to make sure it didn't become free while | |
20162 | + * we weren't looking. */ | |
20163 | + if ((lock->slock & ((1U << TICKET_SHIFT) - 1)) == token) { | |
20164 | + /* If we interrupted another spinlock while it was | |
20165 | + * blocking, make sure it doesn't block (again) | |
20166 | + * without rechecking the lock. */ | |
20167 | + if (spinning.prev) | |
20168 | + xen_set_irq_pending(irq); | |
20169 | + rc = 1; | |
20170 | + break; | |
20171 | + } | |
20172 | + | |
20173 | + /* block until irq becomes pending */ | |
20174 | + xen_poll_irq(irq); | |
20175 | + } while (!xen_test_irq_pending(irq)); | |
20176 | + | |
20177 | + /* Leave the irq pending so that any interrupted blocker will | |
20178 | + * re-check. */ | |
20179 | + kstat_this_cpu.irqs[irq] += !rc; | |
20180 | + | |
20181 | + /* announce we're done */ | |
20182 | + __get_cpu_var(spinning) = spinning.prev; | |
20183 | + rm_lock = &__get_cpu_var(spinning_rm_lock); | |
20184 | + raw_local_irq_save(flags); | |
20185 | + __raw_write_lock(rm_lock); | |
20186 | + __raw_write_unlock(rm_lock); | |
20187 | + raw_local_irq_restore(flags); | |
20188 | + | |
20189 | + return rc; | |
20190 | +} | |
20191 | +EXPORT_SYMBOL(xen_spin_wait); | |
20192 | + | |
20193 | +unsigned int xen_spin_adjust(raw_spinlock_t *lock, unsigned int token) | |
20194 | +{ | |
20195 | + return token; /* TODO */ | |
20196 | +} | |
20197 | +EXPORT_SYMBOL(xen_spin_adjust); | |
20198 | + | |
20199 | +int xen_spin_wait_flags(raw_spinlock_t *lock, unsigned int *token, | |
20200 | + unsigned int flags) | |
20201 | +{ | |
20202 | + return xen_spin_wait(lock, *token); /* TODO */ | |
20203 | +} | |
20204 | +EXPORT_SYMBOL(xen_spin_wait_flags); | |
20205 | + | |
20206 | +void xen_spin_kick(raw_spinlock_t *lock, unsigned int token) | |
20207 | +{ | |
20208 | + unsigned int cpu; | |
20209 | + | |
20210 | + token &= (1U << TICKET_SHIFT) - 1; | |
20211 | + for_each_online_cpu(cpu) { | |
20212 | + raw_rwlock_t *rm_lock; | |
20213 | + unsigned long flags; | |
20214 | + struct spinning *spinning; | |
20215 | + | |
20216 | + if (cpu == raw_smp_processor_id()) | |
20217 | + continue; | |
20218 | + | |
20219 | + rm_lock = &per_cpu(spinning_rm_lock, cpu); | |
20220 | + raw_local_irq_save(flags); | |
20221 | + __raw_read_lock(rm_lock); | |
20222 | + | |
20223 | + spinning = per_cpu(spinning, cpu); | |
20224 | + smp_rmb(); | |
20225 | + if (spinning | |
20226 | + && (spinning->lock != lock || spinning->ticket != token)) | |
20227 | + spinning = NULL; | |
20228 | + | |
20229 | + __raw_read_unlock(rm_lock); | |
20230 | + raw_local_irq_restore(flags); | |
20231 | + | |
20232 | + if (unlikely(spinning)) { | |
20233 | + notify_remote_via_irq(per_cpu(spinlock_irq, cpu)); | |
20234 | + return; | |
20235 | + } | |
20236 | + } | |
20237 | +} | |
20238 | +EXPORT_SYMBOL(xen_spin_kick); | |
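
xen_spin_wait() above publishes an on-stack struct spinning, and before letting the frame die it takes and immediately drops the per-CPU rm_lock write side; because xen_spin_kick() only dereferences entries under the read side, that empty write-lock/unlock pair waits out any reader still holding a pointer into the stack frame. A stand-alone pthread_rwlock sketch of the idiom — all names below are illustrative:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_rwlock_t rm_lock = PTHREAD_RWLOCK_INITIALIZER;
    static _Atomic(int *) published;   /* entry a reader may dereference */

    static void *reader(void *arg)
    {
        int *p;

        (void)arg;
        pthread_rwlock_rdlock(&rm_lock);   /* pin the entry while we use it */
        p = atomic_load(&published);
        if (p)
            printf("reader saw %d\n", *p);
        usleep(50 * 1000);
        pthread_rwlock_unlock(&rm_lock);
        return NULL;
    }

    int main(void)
    {
        int on_stack = 42;             /* like the on-stack struct spinning */
        pthread_t t;

        atomic_store(&published, &on_stack);
        pthread_create(&t, NULL, reader, NULL);
        usleep(10 * 1000);

        atomic_store(&published, NULL);    /* unpublish ...                 */
        pthread_rwlock_wrlock(&rm_lock);   /* ... then wait out in-flight   */
        pthread_rwlock_unlock(&rm_lock);   /* readers before the frame dies */
        /* No reader can still hold &on_stack past this point. */

        pthread_join(t, NULL);
        return 0;
    }
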
20239 | --- a/drivers/xen/core/xen_sysfs.c | |
20240 | +++ b/drivers/xen/core/xen_sysfs.c | |
20241 | @@ -29,12 +29,12 @@ HYPERVISOR_ATTR_RO(type); | |
20242 | ||
20243 | static int __init xen_sysfs_type_init(void) | |
20244 | { | |
20245 | - return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr); | |
20246 | + return sysfs_create_file(hypervisor_kobj, &type_attr.attr); | |
20247 | } | |
20248 | ||
20249 | static void xen_sysfs_type_destroy(void) | |
20250 | { | |
20251 | - sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr); | |
20252 | + sysfs_remove_file(hypervisor_kobj, &type_attr.attr); | |
20253 | } | |
20254 | ||
20255 | /* xen version attributes */ | |
20256 | @@ -90,13 +90,12 @@ static struct attribute_group version_gr | |
20257 | ||
20258 | static int __init xen_sysfs_version_init(void) | |
20259 | { | |
20260 | - return sysfs_create_group(&hypervisor_subsys.kobj, | |
20261 | - &version_group); | |
20262 | + return sysfs_create_group(hypervisor_kobj, &version_group); | |
20263 | } | |
20264 | ||
20265 | static void xen_sysfs_version_destroy(void) | |
20266 | { | |
20267 | - sysfs_remove_group(&hypervisor_subsys.kobj, &version_group); | |
20268 | + sysfs_remove_group(hypervisor_kobj, &version_group); | |
20269 | } | |
20270 | ||
20271 | /* UUID */ | |
20272 | @@ -126,12 +125,12 @@ HYPERVISOR_ATTR_RO(uuid); | |
20273 | ||
20274 | static int __init xen_sysfs_uuid_init(void) | |
20275 | { | |
20276 | - return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr); | |
20277 | + return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr); | |
20278 | } | |
20279 | ||
20280 | static void xen_sysfs_uuid_destroy(void) | |
20281 | { | |
20282 | - sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr); | |
20283 | + sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr); | |
20284 | } | |
20285 | ||
20286 | /* xen compilation attributes */ | |
20287 | @@ -204,14 +203,12 @@ static struct attribute_group xen_compil | |
20288 | ||
20289 | int __init static xen_compilation_init(void) | |
20290 | { | |
20291 | - return sysfs_create_group(&hypervisor_subsys.kobj, | |
20292 | - &xen_compilation_group); | |
20293 | + return sysfs_create_group(hypervisor_kobj, &xen_compilation_group); | |
20294 | } | |
20295 | ||
20296 | static void xen_compilation_destroy(void) | |
20297 | { | |
20298 | - sysfs_remove_group(&hypervisor_subsys.kobj, | |
20299 | - &xen_compilation_group); | |
20300 | + sysfs_remove_group(hypervisor_kobj, &xen_compilation_group); | |
20301 | } | |
20302 | ||
20303 | /* xen properties info */ | |
20304 | @@ -325,14 +322,12 @@ static struct attribute_group xen_proper | |
20305 | ||
20306 | static int __init xen_properties_init(void) | |
20307 | { | |
20308 | - return sysfs_create_group(&hypervisor_subsys.kobj, | |
20309 | - &xen_properties_group); | |
20310 | + return sysfs_create_group(hypervisor_kobj, &xen_properties_group); | |
20311 | } | |
20312 | ||
20313 | static void xen_properties_destroy(void) | |
20314 | { | |
20315 | - sysfs_remove_group(&hypervisor_subsys.kobj, | |
20316 | - &xen_properties_group); | |
20317 | + sysfs_remove_group(hypervisor_kobj, &xen_properties_group); | |
20318 | } | |
20319 | ||
20320 | #ifdef CONFIG_KEXEC | |
20321 | @@ -350,13 +345,12 @@ HYPERVISOR_ATTR_RO(vmcoreinfo); | |
20322 | ||
20323 | static int __init xen_sysfs_vmcoreinfo_init(void) | |
20324 | { | |
20325 | - return sysfs_create_file(&hypervisor_subsys.kobj, | |
20326 | - &vmcoreinfo_attr.attr); | |
20327 | + return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr); | |
20328 | } | |
20329 | ||
20330 | static void xen_sysfs_vmcoreinfo_destroy(void) | |
20331 | { | |
20332 | - sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr); | |
20333 | + sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr); | |
20334 | } | |
20335 | ||
20336 | #endif | |
20337 | --- a/drivers/xen/gntdev/gntdev.c | |
20338 | +++ b/drivers/xen/gntdev/gntdev.c | |
20339 | @@ -782,7 +782,7 @@ static pte_t gntdev_clear_pte(struct vm_ | |
20340 | op.status); | |
20341 | } else { | |
20342 | /* USING SHADOW PAGE TABLES. */ | |
20343 | - copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm); | |
20344 | + copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm); | |
20345 | } | |
20346 | ||
20347 | /* Finally, we unmap the grant from kernel space. */ | |
20348 | @@ -810,7 +810,7 @@ static pte_t gntdev_clear_pte(struct vm_ | |
20349 | >> PAGE_SHIFT, INVALID_P2M_ENTRY); | |
20350 | ||
20351 | } else { | |
20352 | - copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm); | |
20353 | + copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm); | |
20354 | } | |
20355 | ||
20356 | return copy; | |
20357 | --- a/drivers/xen/scsifront/scsifront.c | |
20358 | +++ b/drivers/xen/scsifront/scsifront.c | |
20359 | @@ -260,19 +260,19 @@ static int map_data_for_request(struct v | |
20360 | return -ENOMEM; | |
20361 | } | |
20362 | ||
20363 | - if (sc->use_sg) { | |
20364 | + if (scsi_bufflen(sc)) { | |
20365 | /* quoted scsi_lib.c/scsi_req_map_sg . */ | |
20366 | - struct scatterlist *sg, *sgl = (struct scatterlist *)sc->request_buffer; | |
20367 | - unsigned int data_len = sc->request_bufflen; | |
20368 | + struct scatterlist *sg, *sgl = scsi_sglist(sc); | |
20369 | + unsigned int data_len = scsi_bufflen(sc); | |
20370 | ||
20371 | - nr_pages = (sc->request_bufflen + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT; | |
20372 | + nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT; | |
20373 | if (nr_pages > VSCSIIF_SG_TABLESIZE) { | |
20374 | printk(KERN_ERR "scsifront: Unable to map request_buffer for command!\n"); | |
20375 | ref_cnt = (-E2BIG); | |
20376 | goto big_to_sg; | |
20377 | } | |
20378 | ||
20379 | - for_each_sg (sgl, sg, sc->use_sg, i) { | |
20380 | + for_each_sg (sgl, sg, scsi_sg_count(sc), i) { | |
20381 | page = sg_page(sg); | |
20382 | off = sg->offset; | |
20383 | len = sg->length; | |
20384 | @@ -306,45 +306,6 @@ static int map_data_for_request(struct v | |
20385 | ref_cnt++; | |
20386 | } | |
20387 | } | |
20388 | - } else if (sc->request_bufflen) { | |
20389 | - unsigned long end = ((unsigned long)sc->request_buffer | |
20390 | - + sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT; | |
20391 | - unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT; | |
20392 | - | |
20393 | - page = virt_to_page(sc->request_buffer); | |
20394 | - nr_pages = end - start; | |
20395 | - len = sc->request_bufflen; | |
20396 | - | |
20397 | - if (nr_pages > VSCSIIF_SG_TABLESIZE) { | |
20398 | - ref_cnt = (-E2BIG); | |
20399 | - goto big_to_sg; | |
20400 | - } | |
20401 | - | |
20402 | - buffer_pfn = page_to_phys(page) >> PAGE_SHIFT; | |
20403 | - | |
20404 | - off = offset_in_page((unsigned long)sc->request_buffer); | |
20405 | - for (i = 0; i < nr_pages; i++) { | |
20406 | - bytes = PAGE_SIZE - off; | |
20407 | - | |
20408 | - if (bytes > len) | |
20409 | - bytes = len; | |
20410 | - | |
20411 | - ref = gnttab_claim_grant_reference(&gref_head); | |
20412 | - BUG_ON(ref == -ENOSPC); | |
20413 | - | |
20414 | - gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id, | |
20415 | - buffer_pfn, write); | |
20416 | - | |
20417 | - info->shadow[id].gref[i] = ref; | |
20418 | - ring_req->seg[i].gref = ref; | |
20419 | - ring_req->seg[i].offset = (uint16_t)off; | |
20420 | - ring_req->seg[i].length = (uint16_t)bytes; | |
20421 | - | |
20422 | - buffer_pfn++; | |
20423 | - len -= bytes; | |
20424 | - off = 0; | |
20425 | - ref_cnt++; | |
20426 | - } | |
20427 | } | |
20428 | ||
20429 | big_to_sg: | |
20430 | --- a/drivers/xen/xenoprof/xenoprofile.c | |
20431 | +++ b/drivers/xen/xenoprof/xenoprofile.c | |
20432 | @@ -79,7 +79,7 @@ static int xenoprof_resume(struct sys_de | |
20433 | ||
20434 | ||
20435 | static struct sysdev_class oprofile_sysclass = { | |
20436 | - set_kset_name("oprofile"), | |
20437 | + .name = "oprofile", | |
20438 | .resume = xenoprof_resume, | |
20439 | .suspend = xenoprof_suspend | |
20440 | }; | |
20441 | --- a/include/asm-x86/mach-xen/asm/agp.h | |
20442 | +++ b/include/asm-x86/mach-xen/asm/agp.h | |
20443 | @@ -13,18 +13,13 @@ | |
20444 | * page. This avoids data corruption on some CPUs. | |
20445 | */ | |
20446 | ||
20447 | -/* | |
20448 | - * Caller's responsibility to call global_flush_tlb() for performance | |
20449 | - * reasons | |
20450 | - */ | |
20451 | #define map_page_into_agp(page) ( \ | |
20452 | xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \ | |
20453 | - ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE)) | |
20454 | + ?: set_pages_uc(page, 1)) | |
20455 | #define unmap_page_from_agp(page) ( \ | |
20456 | xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \ | |
20457 | /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ | |
20458 | - change_page_attr(page, 1, PAGE_KERNEL)) | |
20459 | -#define flush_agp_mappings() global_flush_tlb() | |
20460 | + set_pages_wb(page, 1)) | |
20461 | ||
20462 | /* | |
20463 | * Could use CLFLUSH here if the cpu supports it. But then it would | |
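
The reworked map_page_into_agp() above chains its two steps with GCC's binary "?:" extension: a ?: b evaluates a once and yields it when non-zero, otherwise b — in effect "return the first error, else the second step's result". A tiny stand-alone illustration (GCC/clang extension, not ISO C; the step functions are made up):

    #include <stdio.h>

    static int step1(void) { return 0; }    /* 0 = success */
    static int step2(void) { return -22; }  /* pretend -EINVAL */

    int main(void)
    {
        /* Equivalent to: int rc = step1(); if (!rc) rc = step2(); */
        int rc = step1() ?: step2();

        printf("rc = %d\n", rc);
        return 0;
    }
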
20464 | --- a/include/asm-x86/mach-xen/asm/desc_32.h | |
20465 | +++ /dev/null | |
20466 | @@ -1,262 +0,0 @@ | |
20467 | -#ifndef __ARCH_DESC_H | |
20468 | -#define __ARCH_DESC_H | |
20469 | - | |
20470 | -#include <asm/ldt.h> | |
20471 | -#include <asm/segment.h> | |
20472 | - | |
20473 | -#ifndef __ASSEMBLY__ | |
20474 | - | |
20475 | -#include <linux/preempt.h> | |
20476 | -#include <linux/smp.h> | |
20477 | - | |
20478 | -#include <asm/mmu.h> | |
20479 | - | |
20480 | -struct Xgt_desc_struct { | |
20481 | - unsigned short size; | |
20482 | - unsigned long address __attribute__((packed)); | |
20483 | - unsigned short pad; | |
20484 | -} __attribute__ ((packed)); | |
20485 | - | |
20486 | -struct gdt_page | |
20487 | -{ | |
20488 | - struct desc_struct gdt[GDT_ENTRIES]; | |
20489 | -} __attribute__((aligned(PAGE_SIZE))); | |
20490 | -DECLARE_PER_CPU(struct gdt_page, gdt_page); | |
20491 | - | |
20492 | -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) | |
20493 | -{ | |
20494 | - return per_cpu(gdt_page, cpu).gdt; | |
20495 | -} | |
20496 | - | |
20497 | -extern struct Xgt_desc_struct idt_descr; | |
20498 | -extern struct desc_struct idt_table[]; | |
20499 | -extern void set_intr_gate(unsigned int irq, void * addr); | |
20500 | - | |
20501 | -static inline void pack_descriptor(__u32 *a, __u32 *b, | |
20502 | - unsigned long base, unsigned long limit, unsigned char type, unsigned char flags) | |
20503 | -{ | |
20504 | - *a = ((base & 0xffff) << 16) | (limit & 0xffff); | |
20505 | - *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | | |
20506 | - (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20); | |
20507 | -} | |
20508 | - | |
20509 | -static inline void pack_gate(__u32 *a, __u32 *b, | |
20510 | - unsigned long base, unsigned short seg, unsigned char type, unsigned char flags) | |
20511 | -{ | |
20512 | - *a = (seg << 16) | (base & 0xffff); | |
20513 | - *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff); | |
20514 | -} | |
20515 | - | |
20516 | -#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */ | |
20517 | -#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */ | |
20518 | -#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */ | |
20519 | -#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */ | |
20520 | -#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */ | |
20521 | -#define DESCTYPE_DPL3 0x60 /* DPL-3 */ | |
20522 | -#define DESCTYPE_S 0x10 /* !system */ | |
20523 | - | |
20524 | -#ifndef CONFIG_XEN | |
20525 | -#define load_TR_desc() native_load_tr_desc() | |
20526 | -#define load_gdt(dtr) native_load_gdt(dtr) | |
20527 | -#define load_idt(dtr) native_load_idt(dtr) | |
20528 | -#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr)) | |
20529 | -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt)) | |
20530 | - | |
20531 | -#define store_gdt(dtr) native_store_gdt(dtr) | |
20532 | -#define store_idt(dtr) native_store_idt(dtr) | |
20533 | -#define store_tr(tr) (tr = native_store_tr()) | |
20534 | -#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt)) | |
20535 | - | |
20536 | -#define load_TLS(t, cpu) native_load_tls(t, cpu) | |
20537 | -#define set_ldt native_set_ldt | |
20538 | - | |
20539 | -#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) | |
20540 | -#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) | |
20541 | -#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) | |
20542 | - | |
20543 | -static inline void write_dt_entry(struct desc_struct *dt, | |
20544 | - int entry, u32 entry_low, u32 entry_high) | |
20545 | -{ | |
20546 | - dt[entry].a = entry_low; | |
20547 | - dt[entry].b = entry_high; | |
20548 | -} | |
20549 | - | |
20550 | -static inline void native_set_ldt(const void *addr, unsigned int entries) | |
20551 | -{ | |
20552 | - if (likely(entries == 0)) | |
20553 | - __asm__ __volatile__("lldt %w0"::"q" (0)); | |
20554 | - else { | |
20555 | - unsigned cpu = smp_processor_id(); | |
20556 | - __u32 a, b; | |
20557 | - | |
20558 | - pack_descriptor(&a, &b, (unsigned long)addr, | |
20559 | - entries * sizeof(struct desc_struct) - 1, | |
20560 | - DESCTYPE_LDT, 0); | |
20561 | - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b); | |
20562 | - __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); | |
20563 | - } | |
20564 | -} | |
20565 | - | |
20566 | - | |
20567 | -static inline void native_load_tr_desc(void) | |
20568 | -{ | |
20569 | - asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); | |
20570 | -} | |
20571 | - | |
20572 | -static inline void native_load_gdt(const struct Xgt_desc_struct *dtr) | |
20573 | -{ | |
20574 | - asm volatile("lgdt %0"::"m" (*dtr)); | |
20575 | -} | |
20576 | - | |
20577 | -static inline void native_load_idt(const struct Xgt_desc_struct *dtr) | |
20578 | -{ | |
20579 | - asm volatile("lidt %0"::"m" (*dtr)); | |
20580 | -} | |
20581 | - | |
20582 | -static inline void native_store_gdt(struct Xgt_desc_struct *dtr) | |
20583 | -{ | |
20584 | - asm ("sgdt %0":"=m" (*dtr)); | |
20585 | -} | |
20586 | - | |
20587 | -static inline void native_store_idt(struct Xgt_desc_struct *dtr) | |
20588 | -{ | |
20589 | - asm ("sidt %0":"=m" (*dtr)); | |
20590 | -} | |
20591 | - | |
20592 | -static inline unsigned long native_store_tr(void) | |
20593 | -{ | |
20594 | - unsigned long tr; | |
20595 | - asm ("str %0":"=r" (tr)); | |
20596 | - return tr; | |
20597 | -} | |
20598 | - | |
20599 | -static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) | |
20600 | -{ | |
20601 | - unsigned int i; | |
20602 | - struct desc_struct *gdt = get_cpu_gdt_table(cpu); | |
20603 | - | |
20604 | - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) | |
20605 | - gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; | |
20606 | -} | |
20607 | -#else | |
20608 | -#define load_TLS(t, cpu) xen_load_tls(t, cpu) | |
20609 | -#define set_ldt xen_set_ldt | |
20610 | - | |
20611 | -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b); | |
20612 | -extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b); | |
20613 | - | |
20614 | -static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu) | |
20615 | -{ | |
20616 | - unsigned int i; | |
20617 | - struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN; | |
20618 | - | |
20619 | - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) | |
20620 | - if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]), | |
20621 | - *(u64 *)&t->tls_array[i])) | |
20622 | - BUG(); | |
20623 | -} | |
20624 | -#endif | |
20625 | - | |
20626 | -#ifndef CONFIG_X86_NO_IDT | |
20627 | -static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg) | |
20628 | -{ | |
20629 | - __u32 a, b; | |
20630 | - pack_gate(&a, &b, (unsigned long)addr, seg, type, 0); | |
20631 | - write_idt_entry(idt_table, gate, a, b); | |
20632 | -} | |
20633 | -#endif | |
20634 | - | |
20635 | -#ifndef CONFIG_X86_NO_TSS | |
20636 | -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr) | |
20637 | -{ | |
20638 | - __u32 a, b; | |
20639 | - pack_descriptor(&a, &b, (unsigned long)addr, | |
20640 | - offsetof(struct tss_struct, __cacheline_filler) - 1, | |
20641 | - DESCTYPE_TSS, 0); | |
20642 | - write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b); | |
20643 | -} | |
20644 | -#endif | |
20645 | - | |
20646 | - | |
20647 | -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) | |
20648 | - | |
20649 | -#define LDT_entry_a(info) \ | |
20650 | - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) | |
20651 | - | |
20652 | -#define LDT_entry_b(info) \ | |
20653 | - (((info)->base_addr & 0xff000000) | \ | |
20654 | - (((info)->base_addr & 0x00ff0000) >> 16) | \ | |
20655 | - ((info)->limit & 0xf0000) | \ | |
20656 | - (((info)->read_exec_only ^ 1) << 9) | \ | |
20657 | - ((info)->contents << 10) | \ | |
20658 | - (((info)->seg_not_present ^ 1) << 15) | \ | |
20659 | - ((info)->seg_32bit << 22) | \ | |
20660 | - ((info)->limit_in_pages << 23) | \ | |
20661 | - ((info)->useable << 20) | \ | |
20662 | - 0x7000) | |
20663 | - | |
20664 | -#define LDT_empty(info) (\ | |
20665 | - (info)->base_addr == 0 && \ | |
20666 | - (info)->limit == 0 && \ | |
20667 | - (info)->contents == 0 && \ | |
20668 | - (info)->read_exec_only == 1 && \ | |
20669 | - (info)->seg_32bit == 0 && \ | |
20670 | - (info)->limit_in_pages == 0 && \ | |
20671 | - (info)->seg_not_present == 1 && \ | |
20672 | - (info)->useable == 0 ) | |
20673 | - | |
20674 | -static inline void clear_LDT(void) | |
20675 | -{ | |
20676 | - set_ldt(NULL, 0); | |
20677 | -} | |
20678 | - | |
20679 | -/* | |
20680 | - * load one particular LDT into the current CPU | |
20681 | - */ | |
20682 | -static inline void load_LDT_nolock(mm_context_t *pc) | |
20683 | -{ | |
20684 | - set_ldt(pc->ldt, pc->size); | |
20685 | -} | |
20686 | - | |
20687 | -static inline void load_LDT(mm_context_t *pc) | |
20688 | -{ | |
20689 | - preempt_disable(); | |
20690 | - load_LDT_nolock(pc); | |
20691 | - preempt_enable(); | |
20692 | -} | |
20693 | - | |
20694 | -static inline unsigned long get_desc_base(unsigned long *desc) | |
20695 | -{ | |
20696 | - unsigned long base; | |
20697 | - base = ((desc[0] >> 16) & 0x0000ffff) | | |
20698 | - ((desc[1] << 16) & 0x00ff0000) | | |
20699 | - (desc[1] & 0xff000000); | |
20700 | - return base; | |
20701 | -} | |
20702 | - | |
20703 | -#else /* __ASSEMBLY__ */ | |
20704 | - | |
20705 | -/* | |
20706 | - * GET_DESC_BASE reads the descriptor base of the specified segment. | |
20707 | - * | |
20708 | - * Args: | |
20709 | - * idx - descriptor index | |
20710 | - * gdt - GDT pointer | |
20711 | - * base - 32bit register to which the base will be written | |
20712 | - * lo_w - lo word of the "base" register | |
20713 | - * lo_b - lo byte of the "base" register | |
20714 | - * hi_b - hi byte of the low word of the "base" register | |
20715 | - * | |
20716 | - * Example: | |
20717 | - * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) | |
20718 | - * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. | |
20719 | - */ | |
20720 | -#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ | |
20721 | - movb idx*8+4(gdt), lo_b; \ | |
20722 | - movb idx*8+7(gdt), hi_b; \ | |
20723 | - shll $16, base; \ | |
20724 | - movw idx*8+2(gdt), lo_w; | |
20725 | - | |
20726 | -#endif /* !__ASSEMBLY__ */ | |
20727 | - | |
20728 | -#endif | |
20729 | --- a/include/asm-x86/mach-xen/asm/desc_64.h | |
20730 | +++ /dev/null | |
20731 | @@ -1,228 +0,0 @@ | |
20732 | -/* Written 2000 by Andi Kleen */ | |
20733 | -#ifndef __ARCH_DESC_H | |
20734 | -#define __ARCH_DESC_H | |
20735 | - | |
20736 | -#include <linux/threads.h> | |
20737 | -#include <asm/ldt.h> | |
20738 | - | |
20739 | -#ifndef __ASSEMBLY__ | |
20740 | - | |
20741 | -#include <linux/string.h> | |
20742 | -#include <linux/smp.h> | |
20743 | -#include <asm/desc_defs.h> | |
20744 | - | |
20745 | -#include <asm/segment.h> | |
20746 | -#include <asm/mmu.h> | |
20747 | - | |
20748 | -extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS]; | |
20749 | - | |
20750 | -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; | |
20751 | - | |
20752 | -#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8)) | |
20753 | -#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8)) | |
20754 | - | |
20755 | -static inline void clear_LDT(void) | |
20756 | -{ | |
20757 | - int cpu = get_cpu(); | |
20758 | - | |
20759 | - /* | |
20760 | - * NB. We load the default_ldt for lcall7/27 handling on demand, as | |
20761 | - * it slows down context switching. Noone uses it anyway. | |
20762 | - */ | |
20763 | - cpu = cpu; /* XXX avoid compiler warning */ | |
20764 | - xen_set_ldt(NULL, 0); | |
20765 | - put_cpu(); | |
20766 | -} | |
20767 | - | |
20768 | -#ifndef CONFIG_X86_NO_TSS | |
20769 | -static inline unsigned long __store_tr(void) | |
20770 | -{ | |
20771 | - unsigned long tr; | |
20772 | - | |
20773 | - asm volatile ("str %w0":"=r" (tr)); | |
20774 | - return tr; | |
20775 | -} | |
20776 | - | |
20777 | -#define store_tr(tr) (tr) = __store_tr() | |
20778 | -#endif | |
20779 | - | |
20780 | -/* | |
20781 | - * This is the ldt that every process will get unless we need | |
20782 | - * something other than this. | |
20783 | - */ | |
20784 | -extern struct desc_struct default_ldt[]; | |
20785 | -#ifndef CONFIG_X86_NO_IDT | |
20786 | -extern struct gate_struct idt_table[]; | |
20787 | -#endif | |
20788 | -extern struct desc_ptr cpu_gdt_descr[]; | |
20789 | - | |
20790 | -/* the cpu gdt accessor */ | |
20791 | -#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address) | |
20792 | - | |
20793 | -#ifndef CONFIG_XEN | |
20794 | -static inline void load_gdt(const struct desc_ptr *ptr) | |
20795 | -{ | |
20796 | - asm volatile("lgdt %w0"::"m" (*ptr)); | |
20797 | -} | |
20798 | - | |
20799 | -static inline void store_gdt(struct desc_ptr *ptr) | |
20800 | -{ | |
20801 | - asm("sgdt %w0":"=m" (*ptr)); | |
20802 | -} | |
20803 | -#endif | |
20804 | - | |
20805 | -static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist) | |
20806 | -{ | |
20807 | - struct gate_struct s; | |
20808 | - s.offset_low = PTR_LOW(func); | |
20809 | - s.segment = __KERNEL_CS; | |
20810 | - s.ist = ist; | |
20811 | - s.p = 1; | |
20812 | - s.dpl = dpl; | |
20813 | - s.zero0 = 0; | |
20814 | - s.zero1 = 0; | |
20815 | - s.type = type; | |
20816 | - s.offset_middle = PTR_MIDDLE(func); | |
20817 | - s.offset_high = PTR_HIGH(func); | |
20818 | - /* does not need to be atomic because it is only done once at setup time */ | |
20819 | - memcpy(adr, &s, 16); | |
20820 | -} | |
20821 | - | |
20822 | -#ifndef CONFIG_X86_NO_IDT | |
20823 | -static inline void set_intr_gate(int nr, void *func) | |
20824 | -{ | |
20825 | - BUG_ON((unsigned)nr > 0xFF); | |
20826 | - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); | |
20827 | -} | |
20828 | - | |
20829 | -static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) | |
20830 | -{ | |
20831 | - BUG_ON((unsigned)nr > 0xFF); | |
20832 | - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); | |
20833 | -} | |
20834 | - | |
20835 | -static inline void set_system_gate(int nr, void *func) | |
20836 | -{ | |
20837 | - BUG_ON((unsigned)nr > 0xFF); | |
20838 | - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); | |
20839 | -} | |
20840 | - | |
20841 | -static inline void set_system_gate_ist(int nr, void *func, unsigned ist) | |
20842 | -{ | |
20843 | - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist); | |
20844 | -} | |
20845 | - | |
20846 | -static inline void load_idt(const struct desc_ptr *ptr) | |
20847 | -{ | |
20848 | - asm volatile("lidt %w0"::"m" (*ptr)); | |
20849 | -} | |
20850 | - | |
20851 | -static inline void store_idt(struct desc_ptr *dtr) | |
20852 | -{ | |
20853 | - asm("sidt %w0":"=m" (*dtr)); | |
20854 | -} | |
20855 | -#endif | |
20856 | - | |
20857 | -static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, | |
20858 | - unsigned size) | |
20859 | -{ | |
20860 | - struct ldttss_desc d; | |
20861 | - memset(&d,0,sizeof(d)); | |
20862 | - d.limit0 = size & 0xFFFF; | |
20863 | - d.base0 = PTR_LOW(tss); | |
20864 | - d.base1 = PTR_MIDDLE(tss) & 0xFF; | |
20865 | - d.type = type; | |
20866 | - d.p = 1; | |
20867 | - d.limit1 = (size >> 16) & 0xF; | |
20868 | - d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF; | |
20869 | - d.base3 = PTR_HIGH(tss); | |
20870 | - memcpy(ptr, &d, 16); | |
20871 | -} | |
20872 | - | |
20873 | -#ifndef CONFIG_X86_NO_TSS | |
20874 | -static inline void set_tss_desc(unsigned cpu, void *addr) | |
20875 | -{ | |
20876 | - /* | |
20877 | - * sizeof(unsigned long) coming from an extra "long" at the end | |
20878 | - * of the iobitmap. See tss_struct definition in processor.h | |
20879 | - * | |
20880 | - * -1? seg base+limit should be pointing to the address of the | |
20881 | - * last valid byte | |
20882 | - */ | |
20883 | - set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS], | |
20884 | - (unsigned long)addr, DESC_TSS, | |
20885 | - IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1); | |
20886 | -} | |
20887 | -#endif | |
20888 | - | |
20889 | -static inline void set_ldt_desc(unsigned cpu, void *addr, int size) | |
20890 | -{ | |
20891 | - set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr, | |
20892 | - DESC_LDT, size * 8 - 1); | |
20893 | -} | |
20894 | - | |
20895 | -#define LDT_entry_a(info) \ | |
20896 | - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) | |
20897 | -/* Don't allow setting of the lm bit. It is useless anyways because | |
20898 | - 64bit system calls require __USER_CS. */ | |
20899 | -#define LDT_entry_b(info) \ | |
20900 | - (((info)->base_addr & 0xff000000) | \ | |
20901 | - (((info)->base_addr & 0x00ff0000) >> 16) | \ | |
20902 | - ((info)->limit & 0xf0000) | \ | |
20903 | - (((info)->read_exec_only ^ 1) << 9) | \ | |
20904 | - ((info)->contents << 10) | \ | |
20905 | - (((info)->seg_not_present ^ 1) << 15) | \ | |
20906 | - ((info)->seg_32bit << 22) | \ | |
20907 | - ((info)->limit_in_pages << 23) | \ | |
20908 | - ((info)->useable << 20) | \ | |
20909 | - /* ((info)->lm << 21) | */ \ | |
20910 | - 0x7000) | |
20911 | - | |
20912 | -#define LDT_empty(info) (\ | |
20913 | - (info)->base_addr == 0 && \ | |
20914 | - (info)->limit == 0 && \ | |
20915 | - (info)->contents == 0 && \ | |
20916 | - (info)->read_exec_only == 1 && \ | |
20917 | - (info)->seg_32bit == 0 && \ | |
20918 | - (info)->limit_in_pages == 0 && \ | |
20919 | - (info)->seg_not_present == 1 && \ | |
20920 | - (info)->useable == 0 && \ | |
20921 | - (info)->lm == 0) | |
20922 | - | |
20923 | -static inline void load_TLS(struct thread_struct *t, unsigned int cpu) | |
20924 | -{ | |
20925 | - unsigned int i; | |
20926 | - u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN); | |
20927 | - | |
20928 | - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) | |
20929 | - if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]), | |
20930 | - t->tls_array[i])) | |
20931 | - BUG(); | |
20932 | -} | |
20933 | - | |
20934 | -/* | |
20935 | - * load one particular LDT into the current CPU | |
20936 | - */ | |
20937 | -static inline void load_LDT_nolock (mm_context_t *pc, int cpu) | |
20938 | -{ | |
20939 | - void *segments = pc->ldt; | |
20940 | - int count = pc->size; | |
20941 | - | |
20942 | - if (likely(!count)) | |
20943 | - segments = NULL; | |
20944 | - | |
20945 | - xen_set_ldt(segments, count); | |
20946 | -} | |
20947 | - | |
20948 | -static inline void load_LDT(mm_context_t *pc) | |
20949 | -{ | |
20950 | - int cpu = get_cpu(); | |
20951 | - load_LDT_nolock(pc, cpu); | |
20952 | - put_cpu(); | |
20953 | -} | |
20954 | - | |
20955 | -extern struct desc_ptr idt_descr; | |
20956 | - | |
20957 | -#endif /* !__ASSEMBLY__ */ | |
20958 | - | |
20959 | -#endif | |
20960 | --- a/include/asm-x86/mach-xen/asm/desc.h | |
20961 | +++ b/include/asm-x86/mach-xen/asm/desc.h | |
20962 | @@ -1,5 +1,404 @@ | |
20963 | +#ifndef _ASM_DESC_H_ | |
20964 | +#define _ASM_DESC_H_ | |
20965 | + | |
20966 | +#ifndef __ASSEMBLY__ | |
20967 | +#include <asm/desc_defs.h> | |
20968 | +#include <asm/ldt.h> | |
20969 | +#include <asm/mmu.h> | |
20970 | +#include <linux/smp.h> | |
20971 | + | |
20972 | +static inline void fill_ldt(struct desc_struct *desc, | |
20973 | + const struct user_desc *info) | |
20974 | +{ | |
20975 | + desc->limit0 = info->limit & 0x0ffff; | |
20976 | + desc->base0 = info->base_addr & 0x0000ffff; | |
20977 | + | |
20978 | + desc->base1 = (info->base_addr & 0x00ff0000) >> 16; | |
20979 | + desc->type = (info->read_exec_only ^ 1) << 1; | |
20980 | + desc->type |= info->contents << 2; | |
20981 | + desc->s = 1; | |
20982 | + desc->dpl = 0x3; | |
20983 | + desc->p = info->seg_not_present ^ 1; | |
20984 | + desc->limit = (info->limit & 0xf0000) >> 16; | |
20985 | + desc->avl = info->useable; | |
20986 | + desc->d = info->seg_32bit; | |
20987 | + desc->g = info->limit_in_pages; | |
20988 | + desc->base2 = (info->base_addr & 0xff000000) >> 24; | |
20989 | +} | |
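
fill_ldt() above scatters a user_desc into the packed bit layout of a hardware segment descriptor, inverting the active-low fields (read_exec_only, seg_not_present) on the way. A cut-down, runnable sketch of the same packing for three of the fields; the struct layouts below are simplified stand-ins, not the kernel's definitions:

    #include <stdio.h>

    struct mini_user_desc {
        unsigned int limit;
        unsigned int read_exec_only;
        unsigned int seg_not_present;
    };

    struct mini_desc {                 /* low bits of a descriptor word */
        unsigned limit0 : 16;
        unsigned type   : 4;
        unsigned p      : 1;
    };

    static void mini_fill(struct mini_desc *d, const struct mini_user_desc *u)
    {
        d->limit0 = u->limit & 0x0ffff;
        d->type   = (u->read_exec_only ^ 1) << 1;   /* invert: 1 = writable */
        d->p      = u->seg_not_present ^ 1;         /* invert: 1 = present  */
    }

    int main(void)
    {
        struct mini_user_desc u = { .limit = 0xfffff, .read_exec_only = 0,
                                    .seg_not_present = 0 };
        struct mini_desc d;

        mini_fill(&d, &u);
        printf("limit0=%#x type=%u present=%u\n", d.limit0, d.type, d.p);
        return 0;
    }
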
20990 | + | |
20991 | +#ifndef CONFIG_X86_NO_IDT | |
20992 | +extern struct desc_ptr idt_descr; | |
20993 | +extern gate_desc idt_table[]; | |
20994 | +#endif | |
20995 | + | |
20996 | +#ifdef CONFIG_X86_64 | |
20997 | +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; | |
20998 | +extern struct desc_ptr cpu_gdt_descr[]; | |
20999 | +/* the cpu gdt accessor */ | |
21000 | +#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address) | |
21001 | + | |
21002 | +static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, | |
21003 | + unsigned dpl, unsigned ist, unsigned seg) | |
21004 | +{ | |
21005 | + gate->offset_low = PTR_LOW(func); | |
21006 | + gate->segment = __KERNEL_CS; | |
21007 | + gate->ist = ist; | |
21008 | + gate->p = 1; | |
21009 | + gate->dpl = dpl; | |
21010 | + gate->zero0 = 0; | |
21011 | + gate->zero1 = 0; | |
21012 | + gate->type = type; | |
21013 | + gate->offset_middle = PTR_MIDDLE(func); | |
21014 | + gate->offset_high = PTR_HIGH(func); | |
21015 | +} | |
21016 | + | |
21017 | +#else | |
21018 | +struct gdt_page { | |
21019 | + struct desc_struct gdt[GDT_ENTRIES]; | |
21020 | +} __attribute__((aligned(PAGE_SIZE))); | |
21021 | +DECLARE_PER_CPU(struct gdt_page, gdt_page); | |
21022 | + | |
21023 | +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) | |
21024 | +{ | |
21025 | + return per_cpu(gdt_page, cpu).gdt; | |
21026 | +} | |
21027 | + | |
21028 | +static inline void pack_gate(gate_desc *gate, unsigned char type, | |
21029 | + unsigned long base, unsigned dpl, unsigned flags, unsigned short seg) | |
21030 | + | |
21031 | +{ | |
21032 | + gate->a = (seg << 16) | (base & 0xffff); | |
21033 | + gate->b = (base & 0xffff0000) | | |
21034 | + (((0x80 | type | (dpl << 5)) & 0xff) << 8); | |
21035 | +} | |
21036 | + | |
21037 | +#endif | |
21038 | + | |
21039 | +static inline int desc_empty(const void *ptr) | |
21040 | +{ | |
21041 | + const u32 *desc = ptr; | |
21042 | + return !(desc[0] | desc[1]); | |
21043 | +} | |
21044 | + | |
21045 | +#ifndef CONFIG_XEN | |
21046 | +#define load_TR_desc() native_load_tr_desc() | |
21047 | +#define load_gdt(dtr) native_load_gdt(dtr) | |
21048 | +#define load_idt(dtr) native_load_idt(dtr) | |
21049 | +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr)) | |
21050 | +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt)) | |
21051 | + | |
21052 | +#define store_gdt(dtr) native_store_gdt(dtr) | |
21053 | +#define store_idt(dtr) native_store_idt(dtr) | |
21054 | +#define store_tr(tr) (tr = native_store_tr()) | |
21055 | +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt)) | |
21056 | + | |
21057 | +#define load_TLS(t, cpu) native_load_tls(t, cpu) | |
21058 | +#define set_ldt native_set_ldt | |
21059 | + | |
21060 | +#define write_ldt_entry(dt, entry, desc) \ | |
21061 | + native_write_ldt_entry(dt, entry, desc) | |
21062 | +#define write_gdt_entry(dt, entry, desc, type) \ | |
21063 | + native_write_gdt_entry(dt, entry, desc, type) | |
21064 | +#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g) | |
21065 | + | |
21066 | +static inline void native_write_idt_entry(gate_desc *idt, int entry, | |
21067 | + const gate_desc *gate) | |
21068 | +{ | |
21069 | + memcpy(&idt[entry], gate, sizeof(*gate)); | |
21070 | +} | |
21071 | + | |
21072 | +static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, | |
21073 | + const void *desc) | |
21074 | +{ | |
21075 | + memcpy(&ldt[entry], desc, 8); | |
21076 | +} | |
21077 | + | |
21078 | +static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry, | |
21079 | + const void *desc, int type) | |
21080 | +{ | |
21081 | + unsigned int size; | |
21082 | + switch (type) { | |
21083 | + case DESC_TSS: | |
21084 | + size = sizeof(tss_desc); | |
21085 | + break; | |
21086 | + case DESC_LDT: | |
21087 | + size = sizeof(ldt_desc); | |
21088 | + break; | |
21089 | + default: | |
21090 | + size = sizeof(struct desc_struct); | |
21091 | + break; | |
21092 | + } | |
21093 | + memcpy(&gdt[entry], desc, size); | |
21094 | +} | |
21095 | +#endif | |
21096 | + | |
21097 | +static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, | |
21098 | + unsigned long limit, unsigned char type, | |
21099 | + unsigned char flags) | |
21100 | +{ | |
21101 | + desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); | |
21102 | + desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | | |
21103 | + (limit & 0x000f0000) | ((type & 0xff) << 8) | | |
21104 | + ((flags & 0xf) << 20); | |
21105 | + desc->p = 1; | |
21106 | +} | |
21107 | + | |
21108 | + | |
21109 | +#ifndef CONFIG_XEN | |
21110 | +static inline void set_tssldt_descriptor(void *d, unsigned long addr, | |
21111 | + unsigned type, unsigned size) | |
21112 | +{ | |
21113 | +#ifdef CONFIG_X86_64 | |
21114 | + struct ldttss_desc64 *desc = d; | |
21115 | + memset(desc, 0, sizeof(*desc)); | |
21116 | + desc->limit0 = size & 0xFFFF; | |
21117 | + desc->base0 = PTR_LOW(addr); | |
21118 | + desc->base1 = PTR_MIDDLE(addr) & 0xFF; | |
21119 | + desc->type = type; | |
21120 | + desc->p = 1; | |
21121 | + desc->limit1 = (size >> 16) & 0xF; | |
21122 | + desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; | |
21123 | + desc->base3 = PTR_HIGH(addr); | |
21124 | +#else | |
21125 | + | |
21126 | + pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); | |
21127 | +#endif | |
21128 | +} | |
21129 | + | |
21130 | +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) | |
21131 | +{ | |
21132 | + struct desc_struct *d = get_cpu_gdt_table(cpu); | |
21133 | + tss_desc tss; | |
21134 | + | |
21135 | + /* | |
21136 | + * sizeof(unsigned long) coming from an extra "long" at the end | |
21137 | + * of the iobitmap. See tss_struct definition in processor.h | |
21138 | + * | |
21139 | + * -1? seg base+limit should be pointing to the address of the | |
21140 | + * last valid byte | |
21141 | + */ | |
21142 | + set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, | |
21143 | + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1); | |
21144 | + write_gdt_entry(d, entry, &tss, DESC_TSS); | |
21145 | +} | |
21146 | + | |
21147 | +#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) | |
21148 | + | |
21149 | +static inline void native_set_ldt(const void *addr, unsigned int entries) | |
21150 | +{ | |
21151 | + if (likely(entries == 0)) | |
21152 | + __asm__ __volatile__("lldt %w0"::"q" (0)); | |
21153 | + else { | |
21154 | + unsigned cpu = smp_processor_id(); | |
21155 | + ldt_desc ldt; | |
21156 | + | |
21157 | + set_tssldt_descriptor(&ldt, (unsigned long)addr, | |
21158 | + DESC_LDT, entries * sizeof(ldt) - 1); | |
21159 | + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, | |
21160 | + &ldt, DESC_LDT); | |
21161 | + __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); | |
21162 | + } | |
21163 | +} | |
21164 | + | |
21165 | +static inline void native_load_tr_desc(void) | |
21166 | +{ | |
21167 | + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); | |
21168 | +} | |
21169 | + | |
21170 | +static inline void native_load_gdt(const struct desc_ptr *dtr) | |
21171 | +{ | |
21172 | + asm volatile("lgdt %0"::"m" (*dtr)); | |
21173 | +} | |
21174 | + | |
21175 | +static inline void native_load_idt(const struct desc_ptr *dtr) | |
21176 | +{ | |
21177 | + asm volatile("lidt %0"::"m" (*dtr)); | |
21178 | +} | |
21179 | + | |
21180 | +static inline void native_store_gdt(struct desc_ptr *dtr) | |
21181 | +{ | |
21182 | + asm volatile("sgdt %0":"=m" (*dtr)); | |
21183 | +} | |
21184 | + | |
21185 | +static inline void native_store_idt(struct desc_ptr *dtr) | |
21186 | +{ | |
21187 | + asm volatile("sidt %0":"=m" (*dtr)); | |
21188 | +} | |
21189 | + | |
21190 | +static inline unsigned long native_store_tr(void) | |
21191 | +{ | |
21192 | + unsigned long tr; | |
21193 | + asm volatile("str %0":"=r" (tr)); | |
21194 | + return tr; | |
21195 | +} | |
21196 | + | |
21197 | +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) | |
21198 | +{ | |
21199 | + unsigned int i; | |
21200 | + struct desc_struct *gdt = get_cpu_gdt_table(cpu); | |
21201 | + | |
21202 | + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) | |
21203 | + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; | |
21204 | +} | |
21205 | +#else | |
21206 | +#define load_TLS(t, cpu) xen_load_tls(t, cpu) | |
21207 | +#define set_ldt xen_set_ldt | |
21208 | + | |
21209 | +extern int write_ldt_entry(struct desc_struct *ldt, int entry, | |
21210 | + const void *desc); | |
21211 | +extern int write_gdt_entry(struct desc_struct *gdt, int entry, | |
21212 | + const void *desc, int type); | |
21213 | + | |
21214 | +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu) | |
21215 | +{ | |
21216 | + unsigned int i; | |
21217 | + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN; | |
21218 | + | |
21219 | + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) | |
21220 | + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]), | |
21221 | + *(u64 *)&t->tls_array[i])) | |
21222 | + BUG(); | |
21223 | +} | |
21224 | +#endif | |
21225 | + | |
21226 | +#define _LDT_empty(info) (\ | |
21227 | + (info)->base_addr == 0 && \ | |
21228 | + (info)->limit == 0 && \ | |
21229 | + (info)->contents == 0 && \ | |
21230 | + (info)->read_exec_only == 1 && \ | |
21231 | + (info)->seg_32bit == 0 && \ | |
21232 | + (info)->limit_in_pages == 0 && \ | |
21233 | + (info)->seg_not_present == 1 && \ | |
21234 | + (info)->useable == 0) | |
21235 | + | |
21236 | +#ifdef CONFIG_X86_64 | |
21237 | +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) | |
21238 | +#else | |
21239 | +#define LDT_empty(info) (_LDT_empty(info)) | |
21240 | +#endif | |
21241 | + | |
21242 | +static inline void clear_LDT(void) | |
21243 | +{ | |
21244 | + set_ldt(NULL, 0); | |
21245 | +} | |
21246 | + | |
21247 | +/* | |
21248 | + * load one particular LDT into the current CPU | |
21249 | + */ | |
21250 | +static inline void load_LDT_nolock(mm_context_t *pc) | |
21251 | +{ | |
21252 | + set_ldt(pc->ldt, pc->size); | |
21253 | +} | |
21254 | + | |
21255 | +static inline void load_LDT(mm_context_t *pc) | |
21256 | +{ | |
21257 | + preempt_disable(); | |
21258 | + load_LDT_nolock(pc); | |
21259 | + preempt_enable(); | |
21260 | +} | |
21261 | + | |
21262 | +static inline unsigned long get_desc_base(const struct desc_struct *desc) | |
21263 | +{ | |
21264 | + return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); | |
21265 | +} | |
21266 | + | |
21267 | +static inline unsigned long get_desc_limit(const struct desc_struct *desc) | |
21268 | +{ | |
21269 | + return desc->limit0 | (desc->limit << 16); | |
21270 | +} | |
21271 | + | |
21272 | +#ifndef CONFIG_X86_NO_IDT | |
21273 | +static inline void _set_gate(int gate, unsigned type, void *addr, | |
21274 | + unsigned dpl, unsigned ist, unsigned seg) | |
21275 | +{ | |
21276 | + gate_desc s; | |
21277 | + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); | |
21278 | + /* | |
21279 | + * does not need to be atomic because it is only done once at | |
21280 | + * setup time | |
21281 | + */ | |
21282 | + write_idt_entry(idt_table, gate, &s); | |
21283 | +} | |
21284 | + | |
21285 | +/* | |
21286 | + * This needs to use 'idt_table' rather than 'idt', and | |
21287 | + * thus use the _nonmapped_ version of the IDT, as the | |
21288 | + * Pentium F0 0F bugfix can have resulted in the mapped | |
21289 | + * IDT being write-protected. | |
21290 | + */ | |
21291 | +static inline void set_intr_gate(unsigned int n, void *addr) | |
21292 | +{ | |
21293 | + BUG_ON((unsigned)n > 0xFF); | |
21294 | + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); | |
21295 | +} | |
21296 | + | |
21297 | +/* | |
21298 | + * This routine sets up an interrupt gate at directory privilege level 3. | |
21299 | + */ | |
21300 | +static inline void set_system_intr_gate(unsigned int n, void *addr) | |
21301 | +{ | |
21302 | + BUG_ON((unsigned)n > 0xFF); | |
21303 | + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); | |
21304 | +} | |
21305 | + | |
21306 | +static inline void set_trap_gate(unsigned int n, void *addr) | |
21307 | +{ | |
21308 | + BUG_ON((unsigned)n > 0xFF); | |
21309 | + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); | |
21310 | +} | |
21311 | + | |
21312 | +static inline void set_system_gate(unsigned int n, void *addr) | |
21313 | +{ | |
21314 | + BUG_ON((unsigned)n > 0xFF); | |
21315 | #ifdef CONFIG_X86_32 | |
21316 | -# include "desc_32.h" | |
21317 | + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); | |
21318 | +#else | |
21319 | + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); | |
21320 | +#endif | |
21321 | +} | |
21322 | + | |
21323 | +static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) | |
21324 | +{ | |
21325 | + BUG_ON((unsigned)n > 0xFF); | |
21326 | + _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); | |
21327 | +} | |
21328 | + | |
21329 | +static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) | |
21330 | +{ | |
21331 | + BUG_ON((unsigned)n > 0xFF); | |
21332 | + _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); | |
21333 | +} | |
21334 | + | |
21335 | +static inline void set_system_gate_ist(int n, void *addr, unsigned ist) | |
21336 | +{ | |
21337 | + BUG_ON((unsigned)n > 0xFF); | |
21338 | + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); | |
21339 | +} | |
21340 | +#endif | |
21341 | + | |
21342 | #else | |
21343 | -# include "desc_64.h" | |
21344 | +/* | |
21345 | + * GET_DESC_BASE reads the descriptor base of the specified segment. | |
21346 | + * | |
21347 | + * Args: | |
21348 | + * idx - descriptor index | |
21349 | + * gdt - GDT pointer | |
21350 | + * base - 32bit register to which the base will be written | |
21351 | + * lo_w - lo word of the "base" register | |
21352 | + * lo_b - lo byte of the "base" register | |
21353 | + * hi_b - hi byte of the low word of the "base" register | |
21354 | + * | |
21355 | + * Example: | |
21356 | + * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) | |
21357 | + * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. | |
21358 | + */ | |
21359 | +#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ | |
21360 | + movb idx*8+4(gdt), lo_b; \ | |
21361 | + movb idx*8+7(gdt), hi_b; \ | |
21362 | + shll $16, base; \ | |
21363 | + movw idx*8+2(gdt), lo_w; | |
21364 | + | |
21365 | + | |
21366 | +#endif /* __ASSEMBLY__ */ | |
21367 | + | |
21368 | #endif | |
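
For orientation, a minimal sketch (not part of the patch) of how the helpers in the unified desc.h above fit together when a TLS slot changes. The function name and call shape are hypothetical, loosely modelled on the kernel's TLS-setting path; under CONFIG_XEN, load_TLS() resolves to xen_load_tls(), which issues one HYPERVISOR_update_descriptor() per slot instead of writing the GDT directly:

    /* Hypothetical caller -- illustrates fill_ldt()/LDT_empty()/load_TLS(). */
    static void example_set_tls(struct thread_struct *t, int idx,
                                const struct user_desc *u_info)
    {
            struct desc_struct d;
            int cpu;

            if (LDT_empty(u_info))
                    memset(&d, 0, sizeof(d));       /* clear the slot */
            else
                    fill_ldt(&d, u_info);           /* pack user_desc bitfields */

            t->tls_array[idx] = d;

            cpu = get_cpu();                        /* pin the CPU while loading */
            load_TLS(t, cpu);
            put_cpu();
    }
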
21369 | --- a/include/asm-x86/mach-xen/asm/dma-mapping_32.h | |
21370 | +++ b/include/asm-x86/mach-xen/asm/dma-mapping_32.h | |
21371 | @@ -84,23 +84,13 @@ dma_sync_single_range_for_device(struct | |
21372 | dma_sync_single_for_device(dev, dma_handle+offset, size, direction); | |
21373 | } | |
21374 | ||
21375 | -static inline void | |
21376 | +extern void | |
21377 | dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, | |
21378 | - enum dma_data_direction direction) | |
21379 | -{ | |
21380 | - if (swiotlb) | |
21381 | - swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction); | |
21382 | - flush_write_buffers(); | |
21383 | -} | |
21384 | + enum dma_data_direction direction); | |
21385 | ||
21386 | -static inline void | |
21387 | +extern void | |
21388 | dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, | |
21389 | - enum dma_data_direction direction) | |
21390 | -{ | |
21391 | - if (swiotlb) | |
21392 | - swiotlb_sync_sg_for_device(dev,sg,nelems,direction); | |
21393 | - flush_write_buffers(); | |
21394 | -} | |
21395 | + enum dma_data_direction direction); | |
21396 | ||
21397 | extern int | |
21398 | dma_mapping_error(dma_addr_t dma_addr); | |
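
The two scatterlist sync helpers above are converted from inlines to extern declarations; the removed bodies show the expected behaviour. A sketch of a matching out-of-line definition (the real one lives in a .c file outside this hunk) would be:

    void
    dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
                        enum dma_data_direction direction)
    {
            if (swiotlb)                    /* bounce-buffer case first */
                    swiotlb_sync_sg_for_cpu(dev, sg, nelems, direction);
            flush_write_buffers();
    }
    /* dma_sync_sg_for_device() mirrors this, calling
     * swiotlb_sync_sg_for_device() instead. */
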
21399 | --- a/include/asm-x86/mach-xen/asm/fixmap_32.h | |
21400 | +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h | |
21401 | @@ -64,7 +64,7 @@ enum fixed_addresses { | |
21402 | #endif | |
21403 | #ifdef CONFIG_X86_VISWS_APIC | |
21404 | FIX_CO_CPU, /* Cobalt timer */ | |
21405 | - FIX_CO_APIC, /* Cobalt APIC Redirection Table */ | |
21406 | + FIX_CO_APIC, /* Cobalt APIC Redirection Table */ | |
21407 | FIX_LI_PCIA, /* Lithium PCI Bridge A */ | |
21408 | FIX_LI_PCIB, /* Lithium PCI Bridge B */ | |
21409 | #endif | |
21410 | @@ -73,7 +73,7 @@ enum fixed_addresses { | |
21411 | #endif | |
21412 | #ifdef CONFIG_X86_CYCLONE_TIMER | |
21413 | FIX_CYCLONE_TIMER, /*cyclone timer register*/ | |
21414 | -#endif | |
21415 | +#endif | |
21416 | #ifdef CONFIG_HIGHMEM | |
21417 | FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ | |
21418 | FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, | |
21419 | @@ -93,11 +93,23 @@ enum fixed_addresses { | |
21420 | FIX_ISAMAP_END, | |
21421 | FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, | |
21422 | __end_of_permanent_fixed_addresses, | |
21423 | - /* temporary boot-time mappings, used before ioremap() is functional */ | |
21424 | -#define NR_FIX_BTMAPS 16 | |
21425 | - FIX_BTMAP_END = __end_of_permanent_fixed_addresses, | |
21426 | - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, | |
21427 | + /* | |
21428 | + * 256 temporary boot-time mappings, used by early_ioremap(), | |
21429 | + * before ioremap() is functional. | |
21430 | + * | |
21431 | + * We round it up to the next 512 pages boundary so that we | |
21432 | + * can have a single pgd entry and a single pte table: | |
21433 | + */ | |
21434 | +#define NR_FIX_BTMAPS 64 | |
21435 | +#define FIX_BTMAPS_NESTING 4 | |
21436 | + FIX_BTMAP_END = | |
21437 | + __end_of_permanent_fixed_addresses + 512 - | |
21438 | + (__end_of_permanent_fixed_addresses & 511), | |
21439 | + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, | |
21440 | FIX_WP_TEST, | |
21441 | +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT | |
21442 | + FIX_OHCI1394_BASE, | |
21443 | +#endif | |
21444 | __end_of_fixed_addresses | |
21445 | }; | |
21446 | ||
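
The FIX_BTMAP_END expression rounds up to the next multiple of 512 fixmap slots. With a hypothetical __end_of_permanent_fixed_addresses of 1100: 1100 & 511 = 76, so FIX_BTMAP_END = 1100 + 512 - 76 = 1536 (3 * 512), and FIX_BTMAP_BEGIN = 1536 + 64 * 4 - 1 = 1791. The 256 boot-time slots 1536..1791 therefore sit inside one aligned 512-slot block, which is what lets them share a single pgd entry and a single pte table, as the comment promises. The identical computation recurs in fixmap_64.h just below.
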
21447 | --- a/include/asm-x86/mach-xen/asm/fixmap_64.h | |
21448 | +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h | |
21449 | @@ -15,6 +15,7 @@ | |
21450 | #include <asm/apicdef.h> | |
21451 | #include <asm/page.h> | |
21452 | #include <asm/vsyscall.h> | |
21453 | +#include <asm/efi.h> | |
21454 | #include <asm/acpi.h> | |
21455 | ||
21456 | /* | |
21457 | @@ -46,6 +47,10 @@ enum fixed_addresses { | |
21458 | FIX_IO_APIC_BASE_0, | |
21459 | FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, | |
21460 | #endif | |
21461 | +#ifdef CONFIG_EFI | |
21462 | + FIX_EFI_IO_MAP_LAST_PAGE, | |
21463 | + FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1, | |
21464 | +#endif | |
21465 | #ifdef CONFIG_ACPI | |
21466 | FIX_ACPI_BEGIN, | |
21467 | FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, | |
21468 | @@ -55,10 +60,22 @@ enum fixed_addresses { | |
21469 | FIX_ISAMAP_END, | |
21470 | FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, | |
21471 | __end_of_permanent_fixed_addresses, | |
21472 | - /* temporary boot-time mappings, used before ioremap() is functional */ | |
21473 | -#define NR_FIX_BTMAPS 16 | |
21474 | - FIX_BTMAP_END = __end_of_permanent_fixed_addresses, | |
21475 | - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, | |
21476 | + /* | |
21477 | + * 256 temporary boot-time mappings, used by early_ioremap(), | |
21478 | + * before ioremap() is functional. | |
21479 | + * | |
21480 | + * We round it up to the next 512 pages boundary so that we | |
21481 | + * can have a single pgd entry and a single pte table: | |
21482 | + */ | |
21483 | +#define NR_FIX_BTMAPS 64 | |
21484 | +#define FIX_BTMAPS_NESTING 4 | |
21485 | + FIX_BTMAP_END = | |
21486 | + __end_of_permanent_fixed_addresses + 512 - | |
21487 | + (__end_of_permanent_fixed_addresses & 511), | |
21488 | + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, | |
21489 | +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT | |
21490 | + FIX_OHCI1394_BASE, | |
21491 | +#endif | |
21492 | __end_of_fixed_addresses | |
21493 | }; | |
21494 | ||
21495 | --- a/include/asm-x86/mach-xen/asm/highmem.h | |
21496 | +++ b/include/asm-x86/mach-xen/asm/highmem.h | |
21497 | @@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table; | |
21498 | * easily, subsequent pte tables have to be allocated in one physical | |
21499 | * chunk of RAM. | |
21500 | */ | |
21501 | -#ifdef CONFIG_X86_PAE | |
21502 | -#define LAST_PKMAP 512 | |
21503 | -#else | |
21504 | -#define LAST_PKMAP 1024 | |
21505 | -#endif | |
21506 | /* | |
21507 | * Ordering is: | |
21508 | * | |
21509 | @@ -57,13 +52,12 @@ extern pte_t *pkmap_page_table; | |
21510 | * VMALLOC_START | |
21511 | * high_memory | |
21512 | */ | |
21513 | -#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK ) | |
21514 | #define LAST_PKMAP_MASK (LAST_PKMAP-1) | |
21515 | #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) | |
21516 | #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) | |
21517 | ||
21518 | -extern void * FASTCALL(kmap_high(struct page *page)); | |
21519 | -extern void FASTCALL(kunmap_high(struct page *page)); | |
21520 | +extern void *kmap_high(struct page *page); | |
21521 | +extern void kunmap_high(struct page *page); | |
21522 | ||
21523 | void *kmap(struct page *page); | |
21524 | void kunmap(struct page *page); | |
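
For a concrete feel of the kmap arithmetic left in place above, assuming PAGE_SHIFT = 12 and the PAE value LAST_PKMAP = 512 that this hunk removes (presumably in favour of a shared definition elsewhere): PKMAP_ADDR(0) = PKMAP_BASE, PKMAP_ADDR(511) = PKMAP_BASE + 511 * 4096, and PKMAP_NR() inverts this by subtracting PKMAP_BASE and shifting right by 12.
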
21525 | --- a/include/asm-x86/mach-xen/asm/hypervisor.h | |
21526 | +++ b/include/asm-x86/mach-xen/asm/hypervisor.h | |
21527 | @@ -264,6 +264,25 @@ HYPERVISOR_poll( | |
21528 | return rc; | |
21529 | } | |
21530 | ||
21531 | +static inline int __must_check | |
21532 | +HYPERVISOR_poll_no_timeout( | |
21533 | + evtchn_port_t *ports, unsigned int nr_ports) | |
21534 | +{ | |
21535 | + int rc; | |
21536 | + struct sched_poll sched_poll = { | |
21537 | + .nr_ports = nr_ports | |
21538 | + }; | |
21539 | + set_xen_guest_handle(sched_poll.ports, ports); | |
21540 | + | |
21541 | + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); | |
21542 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
21543 | + if (rc == -ENOSYS) | |
21544 | + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); | |
21545 | +#endif | |
21546 | + | |
21547 | + return rc; | |
21548 | +} | |
21549 | + | |
21550 | #ifdef CONFIG_XEN | |
21551 | ||
21552 | static inline void | |
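
A minimal, hypothetical caller of the new hypercall wrapper added above, blocking the vCPU until one of the listed event channels is pending; unlike HYPERVISOR_poll() there is no wakeup deadline:

    static int example_wait_for_port(evtchn_port_t port)
    {
            evtchn_port_t ports[1] = { port };

            /* 0 on success; __must_check forces callers to inspect rc. */
            return HYPERVISOR_poll_no_timeout(ports, 1);
    }
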
21553 | --- a/include/asm-x86/mach-xen/asm/io_32.h | |
21554 | +++ b/include/asm-x86/mach-xen/asm/io_32.h | |
21555 | @@ -113,8 +113,6 @@ static inline void * phys_to_virt(unsign | |
21556 | ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \ | |
21557 | bvec_to_pseudophys((vec2)))) | |
21558 | ||
21559 | -extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); | |
21560 | - | |
21561 | /** | |
21562 | * ioremap - map bus memory into CPU space | |
21563 | * @offset: bus address of the memory | |
21564 | @@ -124,32 +122,39 @@ extern void __iomem * __ioremap(unsigned | |
21565 | * make bus memory CPU accessible via the readb/readw/readl/writeb/ | |
21566 | * writew/writel functions and the other mmio helpers. The returned | |
21567 | * address is not guaranteed to be usable directly as a virtual | |
21568 | - * address. | |
21569 | + * address. | |
21570 | * | |
21571 | * If the area you are trying to map is a PCI BAR you should have a | |
21572 | * look at pci_iomap(). | |
21573 | */ | |
21574 | +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); | |
21575 | +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); | |
21576 | ||
21577 | -static inline void __iomem * ioremap(unsigned long offset, unsigned long size) | |
21578 | +/* | |
21579 | + * The default ioremap() behavior is non-cached: | |
21580 | + */ | |
21581 | +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) | |
21582 | { | |
21583 | - return __ioremap(offset, size, 0); | |
21584 | + return ioremap_nocache(offset, size); | |
21585 | } | |
21586 | ||
21587 | -extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size); | |
21588 | extern void iounmap(volatile void __iomem *addr); | |
21589 | ||
21590 | /* | |
21591 | - * bt_ioremap() and bt_iounmap() are for temporary early boot-time | |
21592 | + * early_ioremap() and early_iounmap() are for temporary early boot-time | |
21593 | * mappings, before the real ioremap() is functional. | |
21594 | * A boot-time mapping is currently limited to at most 16 pages. | |
21595 | */ | |
21596 | -extern void *bt_ioremap(unsigned long offset, unsigned long size); | |
21597 | -extern void bt_iounmap(void *addr, unsigned long size); | |
21598 | +extern void early_ioremap_init(void); | |
21599 | +extern void early_ioremap_clear(void); | |
21600 | +extern void early_ioremap_reset(void); | |
21601 | +extern void *early_ioremap(unsigned long offset, unsigned long size); | |
21602 | +extern void early_iounmap(void *addr, unsigned long size); | |
21603 | extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); | |
21604 | ||
21605 | /* Use early IO mappings for DMI because it's initialized early */ | |
21606 | -#define dmi_ioremap bt_ioremap | |
21607 | -#define dmi_iounmap bt_iounmap | |
21608 | +#define dmi_ioremap early_ioremap | |
21609 | +#define dmi_iounmap early_iounmap | |
21610 | #define dmi_alloc alloc_bootmem | |
21611 | ||
21612 | /* | |
21613 | @@ -263,43 +268,21 @@ static inline void flush_write_buffers(v | |
21614 | ||
21615 | #endif /* __KERNEL__ */ | |
21616 | ||
21617 | -static inline void xen_io_delay(void) | |
21618 | -{ | |
21619 | - asm volatile("outb %%al,$0x80" : : : "memory"); | |
21620 | -} | |
21621 | +extern void xen_io_delay(void); | |
21622 | +#define native_io_delay xen_io_delay | |
21623 | + | |
21624 | +extern int io_delay_type; | |
21625 | +extern void io_delay_init(void); | |
21626 | ||
21627 | static inline void slow_down_io(void) { | |
21628 | - xen_io_delay(); | |
21629 | + native_io_delay(); | |
21630 | #ifdef REALLY_SLOW_IO | |
21631 | - xen_io_delay(); | |
21632 | - xen_io_delay(); | |
21633 | - xen_io_delay(); | |
21634 | + native_io_delay(); | |
21635 | + native_io_delay(); | |
21636 | + native_io_delay(); | |
21637 | #endif | |
21638 | } | |
21639 | ||
21640 | -#ifdef CONFIG_X86_NUMAQ | |
21641 | -extern void *xquad_portio; /* Where the IO area was mapped */ | |
21642 | -#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) | |
21643 | -#define __BUILDIO(bwl,bw,type) \ | |
21644 | -static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \ | |
21645 | - if (xquad_portio) \ | |
21646 | - write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \ | |
21647 | - else \ | |
21648 | - out##bwl##_local(value, port); \ | |
21649 | -} \ | |
21650 | -static inline void out##bwl(unsigned type value, int port) { \ | |
21651 | - out##bwl##_quad(value, port, 0); \ | |
21652 | -} \ | |
21653 | -static inline unsigned type in##bwl##_quad(int port, int quad) { \ | |
21654 | - if (xquad_portio) \ | |
21655 | - return read##bwl(XQUAD_PORT_ADDR(port, quad)); \ | |
21656 | - else \ | |
21657 | - return in##bwl##_local(port); \ | |
21658 | -} \ | |
21659 | -static inline unsigned type in##bwl(int port) { \ | |
21660 | - return in##bwl##_quad(port, 0); \ | |
21661 | -} | |
21662 | -#else | |
21663 | #define __BUILDIO(bwl,bw,type) \ | |
21664 | static inline void out##bwl(unsigned type value, int port) { \ | |
21665 | out##bwl##_local(value, port); \ | |
21666 | @@ -307,8 +290,6 @@ static inline void out##bwl(unsigned typ | |
21667 | static inline unsigned type in##bwl(int port) { \ | |
21668 | return in##bwl##_local(port); \ | |
21669 | } | |
21670 | -#endif | |
21671 | - | |
21672 | ||
21673 | #define BUILDIO(bwl,bw,type) \ | |
21674 | static inline void out##bwl##_local(unsigned type value, int port) { \ | |
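
Typical early-boot use of the renamed mapping interface declared above (a hedged sketch; early_ioremap()/early_iounmap() are from this header, the caller is hypothetical):

    static void __init example_scan_rom(unsigned long phys, unsigned long len)
    {
            void *va = early_ioremap(phys, len);    /* pre-ioremap() mapping */

            if (va) {
                    /* ... inspect the table at va ... */
                    early_iounmap(va, len);         /* always release btmap slots */
            }
    }
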
21675 | --- a/include/asm-x86/mach-xen/asm/io_64.h | |
21676 | +++ b/include/asm-x86/mach-xen/asm/io_64.h | |
21677 | @@ -36,13 +36,21 @@ | |
21678 | * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> | |
21679 | */ | |
21680 | ||
21681 | -#define __SLOW_DOWN_IO "\noutb %%al,$0x80" | |
21682 | +extern void xen_io_delay(void); | |
21683 | +#define native_io_delay xen_io_delay | |
21684 | ||
21685 | +extern int io_delay_type; | |
21686 | +extern void io_delay_init(void); | |
21687 | + | |
21688 | +static inline void slow_down_io(void) | |
21689 | +{ | |
21690 | + native_io_delay(); | |
21691 | #ifdef REALLY_SLOW_IO | |
21692 | -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO | |
21693 | -#else | |
21694 | -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO | |
21695 | + native_io_delay(); | |
21696 | + native_io_delay(); | |
21697 | + native_io_delay(); | |
21698 | #endif | |
21699 | +} | |
21700 | ||
21701 | /* | |
21702 | * Talk about misusing macros.. | |
21703 | @@ -53,9 +61,15 @@ static inline void out##s(unsigned x val | |
21704 | #define __OUT2(s,s1,s2) \ | |
21705 | __asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1" | |
21706 | ||
21707 | +#ifndef REALLY_SLOW_IO | |
21708 | +#define REALLY_SLOW_IO | |
21709 | +#define UNSET_REALLY_SLOW_IO | |
21710 | +#endif | |
21711 | + | |
21712 | #define __OUT(s,s1,x) \ | |
21713 | __OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \ | |
21714 | -__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \ | |
21715 | +__OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \ | |
21716 | + slow_down_io(); } | |
21717 | ||
21718 | #define __IN1(s) \ | |
21719 | static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v; | |
21720 | @@ -64,8 +78,13 @@ static inline RETURN_TYPE in##s(unsigned | |
21721 | __asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0" | |
21722 | ||
21723 | #define __IN(s,s1,i...) \ | |
21724 | -__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ | |
21725 | -__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ | |
21726 | +__IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); return _v; } \ | |
21727 | +__IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \ | |
21728 | + slow_down_io(); return _v; } | |
21729 | + | |
21730 | +#ifdef UNSET_REALLY_SLOW_IO | |
21731 | +#undef REALLY_SLOW_IO | |
21732 | +#endif | |
21733 | ||
21734 | #define __INS(s) \ | |
21735 | static inline void ins##s(unsigned short port, void * addr, unsigned long count) \ | |
21736 | @@ -143,25 +162,30 @@ static inline void * phys_to_virt(unsign | |
21737 | ||
21738 | #include <asm-generic/iomap.h> | |
21739 | ||
21740 | -extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags); | |
21741 | - | |
21742 | -static inline void __iomem * ioremap (unsigned long offset, unsigned long size) | |
21743 | -{ | |
21744 | - return __ioremap(offset, size, 0); | |
21745 | -} | |
21746 | - | |
21747 | -extern void *bt_ioremap(unsigned long addr, unsigned long size); | |
21748 | -extern void bt_iounmap(void *addr, unsigned long size); | |
21749 | -#define early_ioremap bt_ioremap | |
21750 | -#define early_iounmap bt_iounmap | |
21751 | +extern void early_ioremap_init(void); | |
21752 | +extern void early_ioremap_clear(void); | |
21753 | +extern void early_ioremap_reset(void); | |
21754 | +extern void *early_ioremap(unsigned long addr, unsigned long size); | |
21755 | +extern void early_iounmap(void *addr, unsigned long size); | |
21756 | ||
21757 | /* | |
21758 | * This one maps high address device memory and turns off caching for that area. | |
21759 | * it's useful if some control registers are in such an area and write combining | |
21760 | * or read caching is not desirable: | |
21761 | */ | |
21762 | -extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); | |
21763 | +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); | |
21764 | +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); | |
21765 | + | |
21766 | +/* | |
21767 | + * The default ioremap() behavior is non-cached: | |
21768 | + */ | |
21769 | +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) | |
21770 | +{ | |
21771 | + return ioremap_nocache(offset, size); | |
21772 | +} | |
21773 | + | |
21774 | extern void iounmap(volatile void __iomem *addr); | |
21775 | + | |
21776 | extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); | |
21777 | ||
21778 | /* | |
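
After this change the port-access macros no longer embed the delay in the asm string; the _p variants call slow_down_io() as ordinary C. Roughly, __OUT(b,"b",char) now expands to the following (a sketch, modulo the exact operand constraints visible in the hunk):

    static inline void outb(unsigned char value, unsigned short port)
    {
            __asm__ __volatile__("outb %b0,%w1" : : "a" (value), "Nd" (port));
    }

    static inline void outb_p(unsigned char value, unsigned short port)
    {
            __asm__ __volatile__("outb %b0,%w1" : : "a" (value), "Nd" (port));
            slow_down_io();         /* one or more delays via native_io_delay() */
    }
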
21779 | --- a/include/asm-x86/mach-xen/asm/irqflags_32.h | |
21780 | +++ /dev/null | |
21781 | @@ -1,212 +0,0 @@ | |
21782 | -/* | |
21783 | - * include/asm-i386/irqflags.h | |
21784 | - * | |
21785 | - * IRQ flags handling | |
21786 | - * | |
21787 | - * This file gets included from lowlevel asm headers too, to provide | |
21788 | - * wrapped versions of the local_irq_*() APIs, based on the | |
21789 | - * raw_local_irq_*() functions from the lowlevel headers. | |
21790 | - */ | |
21791 | -#ifndef _ASM_IRQFLAGS_H | |
21792 | -#define _ASM_IRQFLAGS_H | |
21793 | - | |
21794 | -#ifndef __ASSEMBLY__ | |
21795 | -#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask) | |
21796 | - | |
21797 | -#define xen_restore_fl(f) \ | |
21798 | -do { \ | |
21799 | - vcpu_info_t *_vcpu; \ | |
21800 | - barrier(); \ | |
21801 | - _vcpu = current_vcpu_info(); \ | |
21802 | - if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \ | |
21803 | - barrier(); /* unmask then check (avoid races) */\ | |
21804 | - if (unlikely(_vcpu->evtchn_upcall_pending)) \ | |
21805 | - force_evtchn_callback(); \ | |
21806 | - } \ | |
21807 | -} while (0) | |
21808 | - | |
21809 | -#define xen_irq_disable() \ | |
21810 | -do { \ | |
21811 | - current_vcpu_info()->evtchn_upcall_mask = 1; \ | |
21812 | - barrier(); \ | |
21813 | -} while (0) | |
21814 | - | |
21815 | -#define xen_irq_enable() \ | |
21816 | -do { \ | |
21817 | - vcpu_info_t *_vcpu; \ | |
21818 | - barrier(); \ | |
21819 | - _vcpu = current_vcpu_info(); \ | |
21820 | - _vcpu->evtchn_upcall_mask = 0; \ | |
21821 | - barrier(); /* unmask then check (avoid races) */ \ | |
21822 | - if (unlikely(_vcpu->evtchn_upcall_pending)) \ | |
21823 | - force_evtchn_callback(); \ | |
21824 | -} while (0) | |
21825 | - | |
21826 | -void xen_safe_halt(void); | |
21827 | - | |
21828 | -void xen_halt(void); | |
21829 | - | |
21830 | -/* | |
21831 | - * The use of 'barrier' in the following reflects their use as local-lock | |
21832 | - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following | |
21833 | - * critical operations are executed. All critical operations must complete | |
21834 | - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also | |
21835 | - * includes these barriers, for example. | |
21836 | - */ | |
21837 | - | |
21838 | -#define __raw_local_save_flags() xen_save_fl() | |
21839 | - | |
21840 | -#define raw_local_irq_restore(flags) xen_restore_fl(flags) | |
21841 | - | |
21842 | -#define raw_local_irq_disable() xen_irq_disable() | |
21843 | - | |
21844 | -#define raw_local_irq_enable() xen_irq_enable() | |
21845 | - | |
21846 | -/* | |
21847 | - * Used in the idle loop; sti takes one instruction cycle | |
21848 | - * to complete: | |
21849 | - */ | |
21850 | -static inline void raw_safe_halt(void) | |
21851 | -{ | |
21852 | - xen_safe_halt(); | |
21853 | -} | |
21854 | - | |
21855 | -/* | |
21856 | - * Used when interrupts are already enabled or to | |
21857 | - * shutdown the processor: | |
21858 | - */ | |
21859 | -static inline void halt(void) | |
21860 | -{ | |
21861 | - xen_halt(); | |
21862 | -} | |
21863 | - | |
21864 | -/* | |
21865 | - * For spinlocks, etc: | |
21866 | - */ | |
21867 | -#define __raw_local_irq_save() \ | |
21868 | -({ \ | |
21869 | - unsigned long flags = __raw_local_save_flags(); \ | |
21870 | - \ | |
21871 | - raw_local_irq_disable(); \ | |
21872 | - \ | |
21873 | - flags; \ | |
21874 | -}) | |
21875 | - | |
21876 | -#else | |
21877 | -/* Offsets into shared_info_t. */ | |
21878 | -#define evtchn_upcall_pending /* 0 */ | |
21879 | -#define evtchn_upcall_mask 1 | |
21880 | - | |
21881 | -#define sizeof_vcpu_shift 6 | |
21882 | - | |
21883 | -#ifdef CONFIG_SMP | |
21884 | -#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \ | |
21885 | - shl $sizeof_vcpu_shift,%esi ; \ | |
21886 | - addl HYPERVISOR_shared_info,%esi | |
21887 | -#else | |
21888 | -#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi | |
21889 | -#endif | |
21890 | - | |
21891 | -#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi) | |
21892 | -#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi) | |
21893 | -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) | |
21894 | -#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ | |
21895 | - __DISABLE_INTERRUPTS | |
21896 | -#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ | |
21897 | - __ENABLE_INTERRUPTS | |
21898 | -#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \ | |
21899 | -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \ | |
21900 | - __TEST_PENDING ; \ | |
21901 | - jnz 14f /* process more events if necessary... */ ; \ | |
21902 | - movl PT_ESI(%esp), %esi ; \ | |
21903 | - sysexit ; \ | |
21904 | -14: __DISABLE_INTERRUPTS ; \ | |
21905 | - TRACE_IRQS_OFF ; \ | |
21906 | -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \ | |
21907 | - push %esp ; \ | |
21908 | - call evtchn_do_upcall ; \ | |
21909 | - add $4,%esp ; \ | |
21910 | - jmp ret_from_intr | |
21911 | -#define INTERRUPT_RETURN iret | |
21912 | -#endif /* __ASSEMBLY__ */ | |
21913 | - | |
21914 | -#ifndef __ASSEMBLY__ | |
21915 | -#define raw_local_save_flags(flags) \ | |
21916 | - do { (flags) = __raw_local_save_flags(); } while (0) | |
21917 | - | |
21918 | -#define raw_local_irq_save(flags) \ | |
21919 | - do { (flags) = __raw_local_irq_save(); } while (0) | |
21920 | - | |
21921 | -static inline int raw_irqs_disabled_flags(unsigned long flags) | |
21922 | -{ | |
21923 | - return (flags != 0); | |
21924 | -} | |
21925 | - | |
21926 | -#define raw_irqs_disabled() \ | |
21927 | -({ \ | |
21928 | - unsigned long flags = __raw_local_save_flags(); \ | |
21929 | - \ | |
21930 | - raw_irqs_disabled_flags(flags); \ | |
21931 | -}) | |
21932 | - | |
21933 | -/* | |
21934 | - * makes the traced hardirq state match with the machine state | |
21935 | - * | |
21936 | - * should be a rarely used function, only in places where its | |
21937 | - * otherwise impossible to know the irq state, like in traps. | |
21938 | - */ | |
21939 | -static inline void trace_hardirqs_fixup_flags(unsigned long flags) | |
21940 | -{ | |
21941 | - if (raw_irqs_disabled_flags(flags)) | |
21942 | - trace_hardirqs_off(); | |
21943 | - else | |
21944 | - trace_hardirqs_on(); | |
21945 | -} | |
21946 | - | |
21947 | -#define trace_hardirqs_fixup() \ | |
21948 | - trace_hardirqs_fixup_flags(__raw_local_save_flags()) | |
21949 | -#endif /* __ASSEMBLY__ */ | |
21950 | - | |
21951 | -/* | |
21952 | - * Do the CPU's IRQ-state tracing from assembly code. We call a | |
21953 | - * C function, so save all the C-clobbered registers: | |
21954 | - */ | |
21955 | -#ifdef CONFIG_TRACE_IRQFLAGS | |
21956 | - | |
21957 | -# define TRACE_IRQS_ON \ | |
21958 | - pushl %eax; \ | |
21959 | - pushl %ecx; \ | |
21960 | - pushl %edx; \ | |
21961 | - call trace_hardirqs_on; \ | |
21962 | - popl %edx; \ | |
21963 | - popl %ecx; \ | |
21964 | - popl %eax; | |
21965 | - | |
21966 | -# define TRACE_IRQS_OFF \ | |
21967 | - pushl %eax; \ | |
21968 | - pushl %ecx; \ | |
21969 | - pushl %edx; \ | |
21970 | - call trace_hardirqs_off; \ | |
21971 | - popl %edx; \ | |
21972 | - popl %ecx; \ | |
21973 | - popl %eax; | |
21974 | - | |
21975 | -#else | |
21976 | -# define TRACE_IRQS_ON | |
21977 | -# define TRACE_IRQS_OFF | |
21978 | -#endif | |
21979 | - | |
21980 | -#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
21981 | -# define LOCKDEP_SYS_EXIT \ | |
21982 | - pushl %eax; \ | |
21983 | - pushl %ecx; \ | |
21984 | - pushl %edx; \ | |
21985 | - call lockdep_sys_exit; \ | |
21986 | - popl %edx; \ | |
21987 | - popl %ecx; \ | |
21988 | - popl %eax; | |
21989 | -#else | |
21990 | -# define LOCKDEP_SYS_EXIT | |
21991 | -#endif | |
21992 | - | |
21993 | -#endif | |
21994 | --- a/include/asm-x86/mach-xen/asm/irqflags_64.h | |
21995 | +++ /dev/null | |
21996 | @@ -1,178 +0,0 @@ | |
21997 | -/* | |
21998 | - * include/asm-x86_64/irqflags.h | |
21999 | - * | |
22000 | - * IRQ flags handling | |
22001 | - * | |
22002 | - * This file gets included from lowlevel asm headers too, to provide | |
22003 | - * wrapped versions of the local_irq_*() APIs, based on the | |
22004 | - * raw_local_irq_*() functions from the lowlevel headers. | |
22005 | - */ | |
22006 | -#ifndef _ASM_IRQFLAGS_H | |
22007 | -#define _ASM_IRQFLAGS_H | |
22008 | -#include <asm/processor-flags.h> | |
22009 | - | |
22010 | -#ifndef __ASSEMBLY__ | |
22011 | -/* | |
22012 | - * Interrupt control: | |
22013 | - */ | |
22014 | - | |
22015 | -/* | |
22016 | - * The use of 'barrier' in the following reflects their use as local-lock | |
22017 | - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following | |
22018 | - * critical operations are executed. All critical operations must complete | |
22019 | - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also | |
22020 | - * includes these barriers, for example. | |
22021 | - */ | |
22022 | - | |
22023 | -#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask) | |
22024 | - | |
22025 | -#define raw_local_save_flags(flags) \ | |
22026 | - do { (flags) = __raw_local_save_flags(); } while (0) | |
22027 | - | |
22028 | -#define raw_local_irq_restore(x) \ | |
22029 | -do { \ | |
22030 | - vcpu_info_t *_vcpu; \ | |
22031 | - barrier(); \ | |
22032 | - _vcpu = current_vcpu_info(); \ | |
22033 | - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ | |
22034 | - barrier(); /* unmask then check (avoid races) */ \ | |
22035 | - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ | |
22036 | - force_evtchn_callback(); \ | |
22037 | - } \ | |
22038 | -} while (0) | |
22039 | - | |
22040 | -#ifdef CONFIG_X86_VSMP | |
22041 | - | |
22042 | -/* | |
22043 | - * Interrupt control for the VSMP architecture: | |
22044 | - */ | |
22045 | - | |
22046 | -static inline void raw_local_irq_disable(void) | |
22047 | -{ | |
22048 | - unsigned long flags = __raw_local_save_flags(); | |
22049 | - | |
22050 | - raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); | |
22051 | -} | |
22052 | - | |
22053 | -static inline void raw_local_irq_enable(void) | |
22054 | -{ | |
22055 | - unsigned long flags = __raw_local_save_flags(); | |
22056 | - | |
22057 | - raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); | |
22058 | -} | |
22059 | - | |
22060 | -static inline int raw_irqs_disabled_flags(unsigned long flags) | |
22061 | -{ | |
22062 | - return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC); | |
22063 | -} | |
22064 | - | |
22065 | -#else /* CONFIG_X86_VSMP */ | |
22066 | - | |
22067 | -#define raw_local_irq_disable() \ | |
22068 | -do { \ | |
22069 | - current_vcpu_info()->evtchn_upcall_mask = 1; \ | |
22070 | - barrier(); \ | |
22071 | -} while (0) | |
22072 | - | |
22073 | -#define raw_local_irq_enable() \ | |
22074 | -do { \ | |
22075 | - vcpu_info_t *_vcpu; \ | |
22076 | - barrier(); \ | |
22077 | - _vcpu = current_vcpu_info(); \ | |
22078 | - _vcpu->evtchn_upcall_mask = 0; \ | |
22079 | - barrier(); /* unmask then check (avoid races) */ \ | |
22080 | - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ | |
22081 | - force_evtchn_callback(); \ | |
22082 | -} while (0) | |
22083 | - | |
22084 | -static inline int raw_irqs_disabled_flags(unsigned long flags) | |
22085 | -{ | |
22086 | - return (flags != 0); | |
22087 | -} | |
22088 | - | |
22089 | -#endif | |
22090 | - | |
22091 | -/* | |
22092 | - * For spinlocks, etc.: | |
22093 | - */ | |
22094 | - | |
22095 | -#define __raw_local_irq_save() \ | |
22096 | -({ \ | |
22097 | - unsigned long flags = __raw_local_save_flags(); \ | |
22098 | - \ | |
22099 | - raw_local_irq_disable(); \ | |
22100 | - \ | |
22101 | - flags; \ | |
22102 | -}) | |
22103 | - | |
22104 | -#define raw_local_irq_save(flags) \ | |
22105 | - do { (flags) = __raw_local_irq_save(); } while (0) | |
22106 | - | |
22107 | -#define raw_irqs_disabled() \ | |
22108 | -({ \ | |
22109 | - unsigned long flags = __raw_local_save_flags(); \ | |
22110 | - \ | |
22111 | - raw_irqs_disabled_flags(flags); \ | |
22112 | -}) | |
22113 | - | |
22114 | -/* | |
22115 | - * makes the traced hardirq state match with the machine state | |
22116 | - * | |
22117 | - * should be a rarely used function, only in places where its | |
22118 | - * otherwise impossible to know the irq state, like in traps. | |
22119 | - */ | |
22120 | -static inline void trace_hardirqs_fixup_flags(unsigned long flags) | |
22121 | -{ | |
22122 | - if (raw_irqs_disabled_flags(flags)) | |
22123 | - trace_hardirqs_off(); | |
22124 | - else | |
22125 | - trace_hardirqs_on(); | |
22126 | -} | |
22127 | - | |
22128 | -#define trace_hardirqs_fixup() \ | |
22129 | - trace_hardirqs_fixup_flags(__raw_local_save_flags()) | |
22130 | -/* | |
22131 | - * Used in the idle loop; sti takes one instruction cycle | |
22132 | - * to complete: | |
22133 | - */ | |
22134 | -void xen_safe_halt(void); | |
22135 | -static inline void raw_safe_halt(void) | |
22136 | -{ | |
22137 | - xen_safe_halt(); | |
22138 | -} | |
22139 | - | |
22140 | -/* | |
22141 | - * Used when interrupts are already enabled or to | |
22142 | - * shutdown the processor: | |
22143 | - */ | |
22144 | -void xen_halt(void); | |
22145 | -static inline void halt(void) | |
22146 | -{ | |
22147 | - xen_halt(); | |
22148 | -} | |
22149 | - | |
22150 | -#else /* __ASSEMBLY__: */ | |
22151 | -# ifdef CONFIG_TRACE_IRQFLAGS | |
22152 | -# define TRACE_IRQS_ON call trace_hardirqs_on_thunk | |
22153 | -# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk | |
22154 | -# else | |
22155 | -# define TRACE_IRQS_ON | |
22156 | -# define TRACE_IRQS_OFF | |
22157 | -# endif | |
22158 | -# ifdef CONFIG_DEBUG_LOCK_ALLOC | |
22159 | -# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk | |
22160 | -# define LOCKDEP_SYS_EXIT_IRQ \ | |
22161 | - TRACE_IRQS_ON; \ | |
22162 | - sti; \ | |
22163 | - SAVE_REST; \ | |
22164 | - LOCKDEP_SYS_EXIT; \ | |
22165 | - RESTORE_REST; \ | |
22166 | - cli; \ | |
22167 | - TRACE_IRQS_OFF; | |
22168 | -# else | |
22169 | -# define LOCKDEP_SYS_EXIT | |
22170 | -# define LOCKDEP_SYS_EXIT_IRQ | |
22171 | -# endif | |
22172 | -#endif | |
22173 | - | |
22174 | -#endif | |
22175 | --- a/include/asm-x86/mach-xen/asm/irqflags.h | |
22176 | +++ b/include/asm-x86/mach-xen/asm/irqflags.h | |
22177 | @@ -1,5 +1,247 @@ | |
22178 | -#ifdef CONFIG_X86_32 | |
22179 | -# include "irqflags_32.h" | |
22180 | +#ifndef _X86_IRQFLAGS_H_ | |
22181 | +#define _X86_IRQFLAGS_H_ | |
22182 | + | |
22183 | +#include <asm/processor-flags.h> | |
22184 | + | |
22185 | +#ifndef __ASSEMBLY__ | |
22186 | +/* | |
22187 | + * The use of 'barrier' in the following reflects their use as local-lock | |
22188 | + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following | |
22189 | + * critical operations are executed. All critical operations must complete | |
22190 | + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also | |
22191 | + * includes these barriers, for example. | |
22192 | + */ | |
22193 | + | |
22194 | +#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask) | |
22195 | + | |
22196 | +#define xen_restore_fl(f) \ | |
22197 | +do { \ | |
22198 | + vcpu_info_t *_vcpu; \ | |
22199 | + barrier(); \ | |
22200 | + _vcpu = current_vcpu_info(); \ | |
22201 | + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \ | |
22202 | + barrier(); /* unmask then check (avoid races) */\ | |
22203 | + if (unlikely(_vcpu->evtchn_upcall_pending)) \ | |
22204 | + force_evtchn_callback(); \ | |
22205 | + } \ | |
22206 | +} while (0) | |
22207 | + | |
22208 | +#define xen_irq_disable() \ | |
22209 | +do { \ | |
22210 | + current_vcpu_info()->evtchn_upcall_mask = 1; \ | |
22211 | + barrier(); \ | |
22212 | +} while (0) | |
22213 | + | |
22214 | +#define xen_irq_enable() \ | |
22215 | +do { \ | |
22216 | + vcpu_info_t *_vcpu; \ | |
22217 | + barrier(); \ | |
22218 | + _vcpu = current_vcpu_info(); \ | |
22219 | + _vcpu->evtchn_upcall_mask = 0; \ | |
22220 | + barrier(); /* unmask then check (avoid races) */ \ | |
22221 | + if (unlikely(_vcpu->evtchn_upcall_pending)) \ | |
22222 | + force_evtchn_callback(); \ | |
22223 | +} while (0) | |
22224 | + | |
22225 | +void xen_safe_halt(void); | |
22226 | + | |
22227 | +void xen_halt(void); | |
22228 | + | |
22229 | +#define __raw_local_save_flags() xen_save_fl() | |
22230 | + | |
22231 | +#define raw_local_irq_restore(flags) xen_restore_fl(flags) | |
22232 | + | |
22233 | +#define raw_local_irq_disable() xen_irq_disable() | |
22234 | + | |
22235 | +#define raw_local_irq_enable() xen_irq_enable() | |
22236 | + | |
22237 | +/* | |
22238 | + * Used in the idle loop; sti takes one instruction cycle | |
22239 | + * to complete: | |
22240 | + */ | |
22241 | +static inline void raw_safe_halt(void) | |
22242 | +{ | |
22243 | + xen_safe_halt(); | |
22244 | +} | |
22245 | + | |
22246 | +/* | |
22247 | + * Used when interrupts are already enabled or to | |
22248 | + * shutdown the processor: | |
22249 | + */ | |
22250 | +static inline void halt(void) | |
22251 | +{ | |
22252 | + xen_halt(); | |
22253 | +} | |
22254 | + | |
22255 | +/* | |
22256 | + * For spinlocks, etc: | |
22257 | + */ | |
22258 | +#define __raw_local_irq_save() \ | |
22259 | +({ \ | |
22260 | + unsigned long flags = __raw_local_save_flags(); \ | |
22261 | + \ | |
22262 | + raw_local_irq_disable(); \ | |
22263 | + \ | |
22264 | + flags; \ | |
22265 | +}) | |
22266 | #else | |
22267 | -# include "irqflags_64.h" | |
22268 | + | |
22269 | +/* Offsets into shared_info_t. */ | |
22270 | +#define evtchn_upcall_pending /* 0 */ | |
22271 | +#define evtchn_upcall_mask 1 | |
22272 | + | |
22273 | +#define sizeof_vcpu_shift 6 | |
22274 | + | |
22275 | +#ifdef CONFIG_X86_64 | |
22276 | +# define __REG_si %rsi | |
22277 | +# define __CPU_num %gs:pda_cpunumber | |
22278 | +#else | |
22279 | +# define __REG_si %esi | |
22280 | +# define __CPU_num TI_cpu(%ebp) | |
22281 | +#endif | |
22282 | + | |
22283 | +#ifdef CONFIG_SMP | |
22284 | +#define GET_VCPU_INFO movl __CPU_num,%esi ; \ | |
22285 | + shl $sizeof_vcpu_shift,%esi ; \ | |
22286 | + add HYPERVISOR_shared_info,__REG_si | |
22287 | +#else | |
22288 | +#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si | |
22289 | +#endif | |
22290 | + | |
22291 | +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si) | |
22292 | +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si) | |
22293 | +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si) | |
22294 | +#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ | |
22295 | + __DISABLE_INTERRUPTS | |
22296 | +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ | |
22297 | + __ENABLE_INTERRUPTS | |
22298 | + | |
22299 | +#ifndef CONFIG_X86_64 | |
22300 | +#define INTERRUPT_RETURN iret | |
22301 | +#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \ | |
22302 | +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \ | |
22303 | + __TEST_PENDING ; \ | |
22304 | + jnz 14f /* process more events if necessary... */ ; \ | |
22305 | + movl PT_ESI(%esp), %esi ; \ | |
22306 | + sysexit ; \ | |
22307 | +14: __DISABLE_INTERRUPTS ; \ | |
22308 | + TRACE_IRQS_OFF ; \ | |
22309 | +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \ | |
22310 | + push %esp ; \ | |
22311 | + call evtchn_do_upcall ; \ | |
22312 | + add $4,%esp ; \ | |
22313 | + jmp ret_from_intr | |
22314 | +#endif | |
22315 | + | |
22316 | + | |
22317 | +#endif /* __ASSEMBLY__ */ | |
22318 | + | |
22319 | +#ifndef __ASSEMBLY__ | |
22320 | +#define raw_local_save_flags(flags) \ | |
22321 | + do { (flags) = __raw_local_save_flags(); } while (0) | |
22322 | + | |
22323 | +#define raw_local_irq_save(flags) \ | |
22324 | + do { (flags) = __raw_local_irq_save(); } while (0) | |
22325 | + | |
22326 | +static inline int raw_irqs_disabled_flags(unsigned long flags) | |
22327 | +{ | |
22328 | + return (flags != 0); | |
22329 | +} | |
22330 | + | |
22331 | +#define raw_irqs_disabled() \ | |
22332 | +({ \ | |
22333 | + unsigned long flags = __raw_local_save_flags(); \ | |
22334 | + \ | |
22335 | + raw_irqs_disabled_flags(flags); \ | |
22336 | +}) | |
22337 | + | |
22338 | +/* | |
22339 | + * makes the traced hardirq state match with the machine state | |
22340 | + * | |
22341 | + * should be a rarely used function, only in places where its | |
22342 | + * otherwise impossible to know the irq state, like in traps. | |
22343 | + */ | |
22344 | +static inline void trace_hardirqs_fixup_flags(unsigned long flags) | |
22345 | +{ | |
22346 | + if (raw_irqs_disabled_flags(flags)) | |
22347 | + trace_hardirqs_off(); | |
22348 | + else | |
22349 | + trace_hardirqs_on(); | |
22350 | +} | |
22351 | + | |
22352 | +#define trace_hardirqs_fixup() \ | |
22353 | + trace_hardirqs_fixup_flags(__raw_local_save_flags()) | |
22354 | + | |
22355 | +#else | |
22356 | + | |
22357 | +#ifdef CONFIG_X86_64 | |
22358 | +/* | |
22359 | + * Currently paravirt can't handle swapgs nicely when we | |
22360 | + * don't have a stack we can rely on (such as a user space | |
22361 | + * stack). So we either find a way around these or just fault | |
22362 | + * and emulate if a guest tries to call swapgs directly. | |
22363 | + * | |
22364 | + * Either way, this is a good way to document that we don't | |
22365 | + * have a reliable stack. x86_64 only. | |
22366 | + */ | |
22367 | +#define SWAPGS_UNSAFE_STACK swapgs | |
22368 | +#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk | |
22369 | +#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk | |
22370 | +#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk | |
22371 | +#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ | |
22372 | + TRACE_IRQS_ON; \ | |
22373 | + ENABLE_INTERRUPTS(CLBR_NONE); \ | |
22374 | + SAVE_REST; \ | |
22375 | + LOCKDEP_SYS_EXIT; \ | |
22376 | + RESTORE_REST; \ | |
22377 | + __DISABLE_INTERRUPTS; \ | |
22378 | + TRACE_IRQS_OFF; | |
22379 | + | |
22380 | +#else | |
22381 | +#define ARCH_TRACE_IRQS_ON \ | |
22382 | + pushl %eax; \ | |
22383 | + pushl %ecx; \ | |
22384 | + pushl %edx; \ | |
22385 | + call trace_hardirqs_on; \ | |
22386 | + popl %edx; \ | |
22387 | + popl %ecx; \ | |
22388 | + popl %eax; | |
22389 | + | |
22390 | +#define ARCH_TRACE_IRQS_OFF \ | |
22391 | + pushl %eax; \ | |
22392 | + pushl %ecx; \ | |
22393 | + pushl %edx; \ | |
22394 | + call trace_hardirqs_off; \ | |
22395 | + popl %edx; \ | |
22396 | + popl %ecx; \ | |
22397 | + popl %eax; | |
22398 | + | |
22399 | +#define ARCH_LOCKDEP_SYS_EXIT \ | |
22400 | + pushl %eax; \ | |
22401 | + pushl %ecx; \ | |
22402 | + pushl %edx; \ | |
22403 | + call lockdep_sys_exit; \ | |
22404 | + popl %edx; \ | |
22405 | + popl %ecx; \ | |
22406 | + popl %eax; | |
22407 | + | |
22408 | +#define ARCH_LOCKDEP_SYS_EXIT_IRQ | |
22409 | +#endif | |
22410 | + | |
22411 | +#ifdef CONFIG_TRACE_IRQFLAGS | |
22412 | +# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON | |
22413 | +# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF | |
22414 | +#else | |
22415 | +# define TRACE_IRQS_ON | |
22416 | +# define TRACE_IRQS_OFF | |
22417 | +#endif | |
22418 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
22419 | +# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT | |
22420 | +# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ | |
22421 | +# else | |
22422 | +# define LOCKDEP_SYS_EXIT | |
22423 | +# define LOCKDEP_SYS_EXIT_IRQ | |
22424 | +# endif | |
22425 | + | |
22426 | +#endif /* __ASSEMBLY__ */ | |
22427 | #endif | |
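
What the merged header means in practice: the "flags" word handled by these macros is the vCPU's event-channel upcall mask (non-zero = masked), not a saved EFLAGS image, which is why raw_irqs_disabled_flags() is simply flags != 0. A small illustrative (hypothetical) user:

    static void example_pcpu_update(void)
    {
            unsigned long flags;

            raw_local_irq_save(flags);      /* set evtchn_upcall_mask, keep old */
            /* ... manipulate per-CPU state without upcall reentry ... */
            raw_local_irq_restore(flags);   /* clear the mask; if events became
                                             * pending meanwhile, xen_restore_fl()
                                             * forces the upcall callback */
    }
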
22428 | --- a/include/asm-x86/mach-xen/asm/maddr_32.h | |
22429 | +++ b/include/asm-x86/mach-xen/asm/maddr_32.h | |
22430 | @@ -1,6 +1,7 @@ | |
22431 | #ifndef _I386_MADDR_H | |
22432 | #define _I386_MADDR_H | |
22433 | ||
22434 | +#include <asm/bug.h> | |
22435 | #include <xen/features.h> | |
22436 | #include <xen/interface/xen.h> | |
22437 | ||
22438 | @@ -151,25 +152,9 @@ static inline paddr_t pte_machine_to_phy | |
22439 | phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); | |
22440 | return phys; | |
22441 | } | |
22442 | -#endif | |
22443 | - | |
22444 | -#ifdef CONFIG_X86_PAE | |
22445 | -#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } ) | |
22446 | -extern unsigned long long __supported_pte_mask; | |
22447 | -static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) | |
22448 | -{ | |
22449 | - pte_t pte; | |
22450 | - | |
22451 | - pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \ | |
22452 | - (pgprot_val(pgprot) >> 32); | |
22453 | - pte.pte_high &= (__supported_pte_mask >> 32); | |
22454 | - pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \ | |
22455 | - __supported_pte_mask; | |
22456 | - return pte; | |
22457 | -} | |
22458 | #else | |
22459 | -#define __pte_ma(x) ((pte_t) { (x) } ) | |
22460 | -#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) | |
22461 | +#define pte_phys_to_machine phys_to_machine | |
22462 | +#define pte_machine_to_phys machine_to_phys | |
22463 | #endif | |
22464 | ||
22465 | #else /* !CONFIG_XEN */ | |
22466 | --- a/include/asm-x86/mach-xen/asm/maddr_64.h | |
22467 | +++ b/include/asm-x86/mach-xen/asm/maddr_64.h | |
22468 | @@ -1,6 +1,7 @@ | |
22469 | #ifndef _X86_64_MADDR_H | |
22470 | #define _X86_64_MADDR_H | |
22471 | ||
22472 | +#include <asm/bug.h> | |
22473 | #include <xen/features.h> | |
22474 | #include <xen/interface/xen.h> | |
22475 | ||
22476 | @@ -16,6 +17,7 @@ typedef unsigned long maddr_t; | |
22477 | #ifdef CONFIG_XEN | |
22478 | ||
22479 | extern unsigned long *phys_to_machine_mapping; | |
22480 | +extern unsigned long max_mapnr; | |
22481 | ||
22482 | #undef machine_to_phys_mapping | |
22483 | extern unsigned long *machine_to_phys_mapping; | |
22484 | @@ -25,7 +27,7 @@ static inline unsigned long pfn_to_mfn(u | |
22485 | { | |
22486 | if (xen_feature(XENFEAT_auto_translated_physmap)) | |
22487 | return pfn; | |
22488 | - BUG_ON(end_pfn && pfn >= end_pfn); | |
22489 | + BUG_ON(max_mapnr && pfn >= max_mapnr); | |
22490 | return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT; | |
22491 | } | |
22492 | ||
22493 | @@ -33,7 +35,7 @@ static inline int phys_to_machine_mappin | |
22494 | { | |
22495 | if (xen_feature(XENFEAT_auto_translated_physmap)) | |
22496 | return 1; | |
22497 | - BUG_ON(end_pfn && pfn >= end_pfn); | |
22498 | + BUG_ON(max_mapnr && pfn >= max_mapnr); | |
22499 | return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); | |
22500 | } | |
22501 | ||
22502 | @@ -45,7 +47,7 @@ static inline unsigned long mfn_to_pfn(u | |
22503 | return mfn; | |
22504 | ||
22505 | if (unlikely((mfn >> machine_to_phys_order) != 0)) | |
22506 | - return end_pfn; | |
22507 | + return max_mapnr; | |
22508 | ||
22509 | /* The array access can fail (e.g., device space beyond end of RAM). */ | |
22510 | asm ( | |
22511 | @@ -60,7 +62,7 @@ static inline unsigned long mfn_to_pfn(u | |
22512 | " .quad 1b,3b\n" | |
22513 | ".previous" | |
22514 | : "=r" (pfn) | |
22515 | - : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) ); | |
22516 | + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) ); | |
22517 | ||
22518 | return pfn; | |
22519 | } | |
22520 | @@ -88,16 +90,16 @@ static inline unsigned long mfn_to_pfn(u | |
22521 | static inline unsigned long mfn_to_local_pfn(unsigned long mfn) | |
22522 | { | |
22523 | unsigned long pfn = mfn_to_pfn(mfn); | |
22524 | - if ((pfn < end_pfn) | |
22525 | + if ((pfn < max_mapnr) | |
22526 | && !xen_feature(XENFEAT_auto_translated_physmap) | |
22527 | && (phys_to_machine_mapping[pfn] != mfn)) | |
22528 | - return end_pfn; /* force !pfn_valid() */ | |
22529 | + return max_mapnr; /* force !pfn_valid() */ | |
22530 | return pfn; | |
22531 | } | |
22532 | ||
22533 | static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) | |
22534 | { | |
22535 | - BUG_ON(end_pfn && pfn >= end_pfn); | |
22536 | + BUG_ON(max_mapnr && pfn >= max_mapnr); | |
22537 | if (xen_feature(XENFEAT_auto_translated_physmap)) { | |
22538 | BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); | |
22539 | return; | |
22540 | @@ -135,9 +137,6 @@ static inline paddr_t pte_machine_to_phy | |
22541 | return phys; | |
22542 | } | |
22543 | ||
22544 | -#define __pte_ma(x) ((pte_t) { (x) } ) | |
22545 | -#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask) | |
22546 | - | |
22547 | #else /* !CONFIG_XEN */ | |
22548 | ||
22549 | #define pfn_to_mfn(pfn) (pfn) | |
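The end_pfn → max_mapnr substitution above aligns the p2m bounds checks with pfn_valid(), which under CONFIG_FLATMEM (see the page_64.h changes below) now tests against max_mapnr; the `max_mapnr &&` half of each BUG_ON() keeps the assertion inert during early boot, before the value is set. Note also that mfn_to_pfn() deliberately returns max_mapnr when the m2p lookup faults, so pfn_valid() fails for the result. A hypothetical caller relying on that convention:

	/* Hypothetical example: MFN -> struct page under the rules above. */
	static inline struct page *mfn_to_page_checked(unsigned long mfn)
	{
		unsigned long pfn = mfn_to_pfn(mfn);

		if (!pfn_valid(pfn))	/* pfn == max_mapnr for unknown frames */
			return NULL;	/* device or foreign memory: no struct page */
		return pfn_to_page(pfn);
	}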
22550 | --- a/include/asm-x86/mach-xen/asm/mmu_context_32.h | |
22551 | +++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h | |
22552 | @@ -51,8 +51,6 @@ static inline void __prepare_arch_switch | |
22553 | : : "r" (0) ); | |
22554 | } | |
22555 | ||
22556 | -void leave_mm(unsigned long cpu); | |
22557 | - | |
22558 | static inline void switch_mm(struct mm_struct *prev, | |
22559 | struct mm_struct *next, | |
22560 | struct task_struct *tsk) | |
22561 | --- a/include/asm-x86/mach-xen/asm/mmu_context_64.h | |
22562 | +++ b/include/asm-x86/mach-xen/asm/mmu_context_64.h | |
22563 | @@ -62,12 +62,6 @@ extern void mm_pin(struct mm_struct *mm) | |
22564 | extern void mm_unpin(struct mm_struct *mm); | |
22565 | void mm_pin_all(void); | |
22566 | ||
22567 | -static inline void load_cr3(pgd_t *pgd) | |
22568 | -{ | |
22569 | - asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) : | |
22570 | - "memory"); | |
22571 | -} | |
22572 | - | |
22573 | static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, | |
22574 | struct task_struct *tsk) | |
22575 | { | |
22576 | @@ -97,7 +91,7 @@ static inline void switch_mm(struct mm_s | |
22577 | op++; | |
22578 | ||
22579 | if (unlikely(next->context.ldt != prev->context.ldt)) { | |
22580 | - /* load_LDT_nolock(&next->context, cpu) */ | |
22581 | + /* load_LDT_nolock(&next->context) */ | |
22582 | op->cmd = MMUEXT_SET_LDT; | |
22583 | op->arg1.linear_addr = (unsigned long)next->context.ldt; | |
22584 | op->arg2.nr_ents = next->context.size; | |
22585 | @@ -110,7 +104,7 @@ static inline void switch_mm(struct mm_s | |
22586 | else { | |
22587 | write_pda(mmu_state, TLBSTATE_OK); | |
22588 | if (read_pda(active_mm) != next) | |
22589 | - out_of_line_bug(); | |
22590 | + BUG(); | |
22591 | if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { | |
22592 | /* We were in lazy tlb mode and leave_mm disabled | |
22593 | * tlb flush IPI delivery. We must reload CR3 | |
22594 | @@ -118,7 +112,7 @@ static inline void switch_mm(struct mm_s | |
22595 | */ | |
22596 | load_cr3(next->pgd); | |
22597 | xen_new_user_pt(__pa(__user_pgd(next->pgd))); | |
22598 | - load_LDT_nolock(&next->context, cpu); | |
22599 | + load_LDT_nolock(&next->context); | |
22600 | } | |
22601 | } | |
22602 | #endif | |
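Besides dropping the local load_cr3() definition from this header, the hunk tracks two mainline interface changes: out_of_line_bug() is replaced by plain BUG(), and load_LDT_nolock() no longer takes a cpu argument. The surrounding switch_mm() keeps its batching idiom — mmuext_op requests are accumulated and submitted in a single hypercall. Reduced to its core, the idiom looks like this (a sketch against the Xen interface headers, with prev/next as the switch_mm() arguments, not a copy of the real function):

	struct mmuext_op ops[2], *op = ops;

	op->cmd = MMUEXT_NEW_BASEPTR;
	op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
	op++;

	if (next->context.ldt != prev->context.ldt) {
		op->cmd = MMUEXT_SET_LDT;
		op->arg1.linear_addr = (unsigned long)next->context.ldt;
		op->arg2.nr_ents = next->context.size;
		op++;
	}

	/* one trap into the hypervisor instead of one per operation */
	BUG_ON(HYPERVISOR_mmuext_op(ops, op - ops, NULL, DOMID_SELF));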
22603 | --- a/include/asm-x86/mach-xen/asm/page_64.h | |
22604 | +++ b/include/asm-x86/mach-xen/asm/page_64.h | |
22605 | @@ -1,37 +1,9 @@ | |
22606 | #ifndef _X86_64_PAGE_H | |
22607 | #define _X86_64_PAGE_H | |
22608 | ||
22609 | -/* #include <linux/string.h> */ | |
22610 | -#ifndef __ASSEMBLY__ | |
22611 | -#include <linux/kernel.h> | |
22612 | -#include <linux/types.h> | |
22613 | -#include <asm/bug.h> | |
22614 | -#endif | |
22615 | -#include <linux/const.h> | |
22616 | -#include <xen/interface/xen.h> | |
22617 | - | |
22618 | -/* | |
22619 | - * Need to repeat this here in order to not include pgtable.h (which in turn | |
22620 | - * depends on definitions made here), but to be able to use the symbolic | |
22621 | - * below. The preprocessor will warn if the two definitions aren't identical. | |
22622 | - */ | |
22623 | -#define _PAGE_PRESENT 0x001 | |
22624 | -#define _PAGE_IO 0x200 | |
22625 | - | |
22626 | -/* PAGE_SHIFT determines the page size */ | |
22627 | -#define PAGE_SHIFT 12 | |
22628 | -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) | |
22629 | -#define PAGE_MASK (~(PAGE_SIZE-1)) | |
22630 | - | |
22631 | -/* See Documentation/x86_64/mm.txt for a description of the memory map. */ | |
22632 | -#define __PHYSICAL_MASK_SHIFT 46 | |
22633 | -#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1) | |
22634 | -#define __VIRTUAL_MASK_SHIFT 48 | |
22635 | -#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1) | |
22636 | - | |
22637 | -#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK) | |
22638 | +#define PAGETABLE_LEVELS 4 | |
22639 | ||
22640 | -#define THREAD_ORDER 1 | |
22641 | +#define THREAD_ORDER 1 | |
22642 | #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) | |
22643 | #define CURRENT_MASK (~(THREAD_SIZE-1)) | |
22644 | ||
22645 | @@ -51,106 +23,10 @@ | |
22646 | #define MCE_STACK 5 | |
22647 | #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ | |
22648 | ||
22649 | -#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) | |
22650 | -#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT) | |
22651 | - | |
22652 | -#define HPAGE_SHIFT PMD_SHIFT | |
22653 | -#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) | |
22654 | -#define HPAGE_MASK (~(HPAGE_SIZE - 1)) | |
22655 | -#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) | |
22656 | - | |
22657 | -#ifdef __KERNEL__ | |
22658 | -#ifndef __ASSEMBLY__ | |
22659 | - | |
22660 | -extern unsigned long end_pfn; | |
22661 | - | |
22662 | -#include <asm/maddr.h> | |
22663 | - | |
22664 | -void clear_page(void *); | |
22665 | -void copy_page(void *, void *); | |
22666 | - | |
22667 | -#define clear_user_page(page, vaddr, pg) clear_page(page) | |
22668 | -#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) | |
22669 | - | |
22670 | -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ | |
22671 | - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) | |
22672 | -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE | |
22673 | - | |
22674 | -/* | |
22675 | - * These are used to make use of C type-checking.. | |
22676 | - */ | |
22677 | -typedef struct { unsigned long pte; } pte_t; | |
22678 | -typedef struct { unsigned long pmd; } pmd_t; | |
22679 | -typedef struct { unsigned long pud; } pud_t; | |
22680 | -typedef struct { unsigned long pgd; } pgd_t; | |
22681 | -#define PTE_MASK PHYSICAL_PAGE_MASK | |
22682 | - | |
22683 | -typedef struct { unsigned long pgprot; } pgprot_t; | |
22684 | - | |
22685 | -#define __pte_val(x) ((x).pte) | |
22686 | -#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \ | |
22687 | - == _PAGE_PRESENT ? \ | |
22688 | - pte_machine_to_phys(__pte_val(x)) : \ | |
22689 | - __pte_val(x)) | |
22690 | - | |
22691 | -#define __pmd_val(x) ((x).pmd) | |
22692 | -static inline unsigned long pmd_val(pmd_t x) | |
22693 | -{ | |
22694 | - unsigned long ret = __pmd_val(x); | |
22695 | -#if CONFIG_XEN_COMPAT <= 0x030002 | |
22696 | - if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; | |
22697 | -#else | |
22698 | - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret); | |
22699 | -#endif | |
22700 | - return ret; | |
22701 | -} | |
22702 | - | |
22703 | -#define __pud_val(x) ((x).pud) | |
22704 | -static inline unsigned long pud_val(pud_t x) | |
22705 | -{ | |
22706 | - unsigned long ret = __pud_val(x); | |
22707 | - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret); | |
22708 | - return ret; | |
22709 | -} | |
22710 | - | |
22711 | -#define __pgd_val(x) ((x).pgd) | |
22712 | -static inline unsigned long pgd_val(pgd_t x) | |
22713 | -{ | |
22714 | - unsigned long ret = __pgd_val(x); | |
22715 | - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret); | |
22716 | - return ret; | |
22717 | -} | |
22718 | - | |
22719 | -#define pgprot_val(x) ((x).pgprot) | |
22720 | - | |
22721 | -static inline pte_t __pte(unsigned long x) | |
22722 | -{ | |
22723 | - if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT) | |
22724 | - x = pte_phys_to_machine(x); | |
22725 | - return ((pte_t) { (x) }); | |
22726 | -} | |
22727 | - | |
22728 | -static inline pmd_t __pmd(unsigned long x) | |
22729 | -{ | |
22730 | - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x); | |
22731 | - return ((pmd_t) { (x) }); | |
22732 | -} | |
22733 | - | |
22734 | -static inline pud_t __pud(unsigned long x) | |
22735 | -{ | |
22736 | - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x); | |
22737 | - return ((pud_t) { (x) }); | |
22738 | -} | |
22739 | - | |
22740 | -static inline pgd_t __pgd(unsigned long x) | |
22741 | -{ | |
22742 | - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x); | |
22743 | - return ((pgd_t) { (x) }); | |
22744 | -} | |
22745 | - | |
22746 | -#define __pgprot(x) ((pgprot_t) { (x) } ) | |
22747 | +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) | |
22748 | +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) | |
22749 | ||
22750 | -#endif /* !__ASSEMBLY__ */ | |
22751 | +#define __PAGE_OFFSET _AC(0xffff880000000000, UL) | |
22752 | ||
22753 | #define __PHYSICAL_START CONFIG_PHYSICAL_START | |
22754 | #define __KERNEL_ALIGN 0x200000 | |
22755 | @@ -166,52 +42,58 @@ static inline pgd_t __pgd(unsigned long | |
22756 | ||
22757 | #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) | |
22758 | #define __START_KERNEL_map _AC(0xffffffff80000000, UL) | |
22759 | -#define __PAGE_OFFSET _AC(0xffff880000000000, UL) | |
22760 | ||
22761 | #if CONFIG_XEN_COMPAT <= 0x030002 | |
22762 | #undef LOAD_OFFSET | |
22763 | #define LOAD_OFFSET 0 | |
22764 | #endif | |
22765 | ||
22766 | -/* to align the pointer to the (next) page boundary */ | |
22767 | -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) | |
22768 | - | |
22769 | -#define KERNEL_TEXT_SIZE (40*1024*1024) | |
22770 | -#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL) | |
22771 | +/* See Documentation/x86_64/mm.txt for a description of the memory map. */ | |
22772 | +#define __PHYSICAL_MASK_SHIFT 46 | |
22773 | +#define __VIRTUAL_MASK_SHIFT 48 | |
22774 | ||
22775 | -#define PAGE_OFFSET __PAGE_OFFSET | |
22776 | +/* | |
22777 | + * Kernel image size is limited to 128 MB (see level2_kernel_pgt in | |
22778 | + * arch/x86/kernel/head_64.S), and it is mapped here: | |
22779 | + */ | |
22780 | +#define KERNEL_IMAGE_SIZE (128*1024*1024) | |
22781 | +#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL) | |
22782 | ||
22783 | #ifndef __ASSEMBLY__ | |
22784 | +void clear_page(void *page); | |
22785 | +void copy_page(void *to, void *from); | |
22786 | + | |
22787 | +extern unsigned long end_pfn; | |
22788 | +extern unsigned long end_pfn_map; | |
22789 | + | |
22790 | static inline unsigned long __phys_addr(unsigned long x) | |
22791 | { | |
22792 | - return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET); | |
22793 | + return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : __PAGE_OFFSET); | |
22794 | } | |
22795 | -#endif | |
22796 | ||
22797 | -#define __pa(x) __phys_addr((unsigned long)(x)) | |
22798 | -#define __pa_symbol(x) __phys_addr((unsigned long)(x)) | |
22799 | +#define __phys_reloc_hide(x) (x) | |
22800 | ||
22801 | -#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) | |
22802 | -#define __boot_va(x) __va(x) | |
22803 | -#define __boot_pa(x) __pa(x) | |
22804 | -#ifdef CONFIG_FLATMEM | |
22805 | -#define pfn_valid(pfn) ((pfn) < end_pfn) | |
22806 | -#endif | |
22807 | +/* | |
22808 | + * These are used to make use of C type-checking.. | |
22809 | + */ | |
22810 | +typedef unsigned long pteval_t; | |
22811 | +typedef unsigned long pmdval_t; | |
22812 | +typedef unsigned long pudval_t; | |
22813 | +typedef unsigned long pgdval_t; | |
22814 | +typedef unsigned long pgprotval_t; | |
22815 | +typedef unsigned long phys_addr_t; | |
22816 | ||
22817 | -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) | |
22818 | -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) | |
22819 | -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) | |
22820 | - | |
22821 | -#define VM_DATA_DEFAULT_FLAGS \ | |
22822 | - (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ | |
22823 | - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) | |
22824 | +typedef struct page *pgtable_t; | |
22825 | + | |
22826 | +typedef union { pteval_t pte; unsigned int pte_low; } pte_t; | |
22827 | ||
22828 | -#define __HAVE_ARCH_GATE_AREA 1 | |
22829 | #define vmemmap ((struct page *)VMEMMAP_START) | |
22830 | ||
22831 | -#include <asm-generic/memory_model.h> | |
22832 | -#include <asm-generic/page.h> | |
22833 | +#endif /* !__ASSEMBLY__ */ | |
22834 | + | |
22835 | +#ifdef CONFIG_FLATMEM | |
22836 | +#define pfn_valid(pfn) ((pfn) < max_mapnr) | |
22837 | +#endif | |
22838 | ||
22839 | -#endif /* __KERNEL__ */ | |
22840 | ||
22841 | #endif /* _X86_64_PAGE_H */ | |
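page_64.h is pared down to genuinely 64-bit-specific material, with the shared definitions consolidated into the rewritten page.h that follows. The retained __phys_addr() handles the two distinct virtual views of physical memory: the direct map at __PAGE_OFFSET and the kernel image mapping at __START_KERNEL_map. A worked example using the constants above (illustrative only):

	/* The same 16 MB physical address, seen through both mappings: */
	unsigned long via_direct = __PAGE_OFFSET      + 0x1000000;
	unsigned long via_kimage = __START_KERNEL_map + 0x1000000;

	/* __phys_addr(via_direct) == 0x1000000  (subtracts __PAGE_OFFSET)      */
	/* __phys_addr(via_kimage) == 0x1000000  (subtracts __START_KERNEL_map) */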
22842 | --- a/include/asm-x86/mach-xen/asm/page.h | |
22843 | +++ b/include/asm-x86/mach-xen/asm/page.h | |
22844 | @@ -1,13 +1,231 @@ | |
22845 | +#ifndef _ASM_X86_PAGE_H | |
22846 | +#define _ASM_X86_PAGE_H | |
22847 | + | |
22848 | +#include <linux/const.h> | |
22849 | + | |
22850 | +/* PAGE_SHIFT determines the page size */ | |
22851 | +#define PAGE_SHIFT 12 | |
22852 | +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) | |
22853 | +#define PAGE_MASK (~(PAGE_SIZE-1)) | |
22854 | + | |
22855 | #ifdef __KERNEL__ | |
22856 | -# ifdef CONFIG_X86_32 | |
22857 | -# include "page_32.h" | |
22858 | -# else | |
22859 | -# include "page_64.h" | |
22860 | -# endif | |
22861 | + | |
22862 | +/* | |
22863 | + * Need to repeat this here in order to not include pgtable.h (which in turn | |
22864 | + * depends on definitions made here), but to be able to use the symbols | |
22865 | + * below. The preprocessor will warn if the two definitions aren't identical. | |
22866 | + */ | |
22867 | +#define _PAGE_BIT_PRESENT 0 | |
22868 | +#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT) | |
22869 | +#define _PAGE_BIT_IO 9 | |
22870 | +#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO) | |
22871 | + | |
22872 | +#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK) | |
22873 | +#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK) | |
22874 | + | |
22875 | +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) | |
22876 | +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) | |
22877 | + | |
22878 | +#define HPAGE_SHIFT PMD_SHIFT | |
22879 | +#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) | |
22880 | +#define HPAGE_MASK (~(HPAGE_SIZE - 1)) | |
22881 | +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) | |
22882 | + | |
22883 | +/* to align the pointer to the (next) page boundary */ | |
22884 | +#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) | |
22885 | + | |
22886 | +#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1) | |
22887 | +#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1) | |
22888 | + | |
22889 | +#ifndef __ASSEMBLY__ | |
22890 | +#include <linux/types.h> | |
22891 | +#endif | |
22892 | + | |
22893 | +#ifdef CONFIG_X86_64 | |
22894 | +#include <asm/page_64.h> | |
22895 | +#define max_pfn_mapped end_pfn_map | |
22896 | +#else | |
22897 | +#include <asm/page_32.h> | |
22898 | +#define max_pfn_mapped max_low_pfn | |
22899 | +#endif /* CONFIG_X86_64 */ | |
22900 | + | |
22901 | +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) | |
22902 | + | |
22903 | +#define VM_DATA_DEFAULT_FLAGS \ | |
22904 | + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ | |
22905 | + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) | |
22906 | + | |
22907 | + | |
22908 | +#ifndef __ASSEMBLY__ | |
22909 | + | |
22910 | +extern int page_is_ram(unsigned long pagenr); | |
22911 | + | |
22912 | +struct page; | |
22913 | + | |
22914 | +static inline void clear_user_page(void *page, unsigned long vaddr, | |
22915 | + struct page *pg) | |
22916 | +{ | |
22917 | + clear_page(page); | |
22918 | +} | |
22919 | + | |
22920 | +static inline void copy_user_page(void *to, void *from, unsigned long vaddr, | |
22921 | + struct page *topage) | |
22922 | +{ | |
22923 | + copy_page(to, from); | |
22924 | +} | |
22925 | + | |
22926 | +#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ | |
22927 | + alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) | |
22928 | +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE | |
22929 | + | |
22930 | +typedef struct { pgprotval_t pgprot; } pgprot_t; | |
22931 | + | |
22932 | +#define pgprot_val(x) ((x).pgprot) | |
22933 | +#define __pgprot(x) ((pgprot_t) { (x) } ) | |
22934 | + | |
22935 | +#include <asm/maddr.h> | |
22936 | + | |
22937 | +typedef struct { pgdval_t pgd; } pgd_t; | |
22938 | + | |
22939 | +#define __pgd_ma(x) ((pgd_t) { (x) } ) | |
22940 | +static inline pgd_t xen_make_pgd(pgdval_t val) | |
22941 | +{ | |
22942 | + if (val & _PAGE_PRESENT) | |
22943 | + val = pte_phys_to_machine(val); | |
22944 | + return (pgd_t) { val }; | |
22945 | +} | |
22946 | + | |
22947 | +#define __pgd_val(x) ((x).pgd) | |
22948 | +static inline pgdval_t xen_pgd_val(pgd_t pgd) | |
22949 | +{ | |
22950 | + pgdval_t ret = __pgd_val(pgd); | |
22951 | +#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002 | |
22952 | + if (ret) | |
22953 | + ret = machine_to_phys(ret) | _PAGE_PRESENT; | |
22954 | +#else | |
22955 | + if (ret & _PAGE_PRESENT) | |
22956 | + ret = pte_machine_to_phys(ret); | |
22957 | +#endif | |
22958 | + return ret; | |
22959 | +} | |
22960 | + | |
22961 | +#if PAGETABLE_LEVELS >= 3 | |
22962 | +#if PAGETABLE_LEVELS == 4 | |
22963 | +typedef struct { pudval_t pud; } pud_t; | |
22964 | + | |
22965 | +#define __pud_ma(x) ((pud_t) { (x) } ) | |
22966 | +static inline pud_t xen_make_pud(pudval_t val) | |
22967 | +{ | |
22968 | + if (val & _PAGE_PRESENT) | |
22969 | + val = pte_phys_to_machine(val); | |
22970 | + return (pud_t) { val }; | |
22971 | +} | |
22972 | + | |
22973 | +#define __pud_val(x) ((x).pud) | |
22974 | +static inline pudval_t xen_pud_val(pud_t pud) | |
22975 | +{ | |
22976 | + pudval_t ret = __pud_val(pud); | |
22977 | + if (ret & _PAGE_PRESENT) | |
22978 | + ret = pte_machine_to_phys(ret); | |
22979 | + return ret; | |
22980 | +} | |
22981 | +#else /* PAGETABLE_LEVELS == 3 */ | |
22982 | +#include <asm-generic/pgtable-nopud.h> | |
22983 | + | |
22984 | +#define __pud_val(x) __pgd_val((x).pgd) | |
22985 | +static inline pudval_t xen_pud_val(pud_t pud) | |
22986 | +{ | |
22987 | + return xen_pgd_val(pud.pgd); | |
22988 | +} | |
22989 | +#endif /* PAGETABLE_LEVELS == 4 */ | |
22990 | + | |
22991 | +typedef struct { pmdval_t pmd; } pmd_t; | |
22992 | + | |
22993 | +#define __pmd_ma(x) ((pmd_t) { (x) } ) | |
22994 | +static inline pmd_t xen_make_pmd(pmdval_t val) | |
22995 | +{ | |
22996 | + if (val & _PAGE_PRESENT) | |
22997 | + val = pte_phys_to_machine(val); | |
22998 | + return (pmd_t) { val }; | |
22999 | +} | |
23000 | + | |
23001 | +#define __pmd_val(x) ((x).pmd) | |
23002 | +static inline pmdval_t xen_pmd_val(pmd_t pmd) | |
23003 | +{ | |
23004 | + pmdval_t ret = __pmd_val(pmd); | |
23005 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
23006 | + if (ret) | |
23007 | + ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; | |
23008 | #else | |
23009 | -# ifdef __i386__ | |
23010 | -# include "page_32.h" | |
23011 | -# else | |
23012 | -# include "page_64.h" | |
23013 | -# endif | |
23014 | + if (ret & _PAGE_PRESENT) | |
23015 | + ret = pte_machine_to_phys(ret); | |
23016 | +#endif | |
23017 | + return ret; | |
23018 | +} | |
23019 | +#else /* PAGETABLE_LEVELS == 2 */ | |
23020 | +#include <asm-generic/pgtable-nopmd.h> | |
23021 | + | |
23022 | +#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } ) | |
23023 | +#define __pmd_val(x) __pgd_val((x).pud.pgd) | |
23024 | +static inline pmdval_t xen_pmd_val(pmd_t pmd) | |
23025 | +{ | |
23026 | + return xen_pgd_val(pmd.pud.pgd); | |
23027 | +} | |
23028 | +#endif /* PAGETABLE_LEVELS >= 3 */ | |
23029 | + | |
23030 | +#define __pte_ma(x) ((pte_t) { .pte = (x) } ) | |
23031 | +static inline pte_t xen_make_pte(pteval_t val) | |
23032 | +{ | |
23033 | + if ((val & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT) | |
23034 | + val = pte_phys_to_machine(val); | |
23035 | + return (pte_t) { .pte = val }; | |
23036 | +} | |
23037 | + | |
23038 | +#define __pte_val(x) ((x).pte) | |
23039 | +static inline pteval_t xen_pte_val(pte_t pte) | |
23040 | +{ | |
23041 | + pteval_t ret = __pte_val(pte); | |
23042 | + if ((pte.pte_low & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT) | |
23043 | + ret = pte_machine_to_phys(ret); | |
23044 | + return ret; | |
23045 | +} | |
23046 | + | |
23047 | +#define pgd_val(x) xen_pgd_val(x) | |
23048 | +#define __pgd(x) xen_make_pgd(x) | |
23049 | + | |
23050 | +#ifndef __PAGETABLE_PUD_FOLDED | |
23051 | +#define pud_val(x) xen_pud_val(x) | |
23052 | +#define __pud(x) xen_make_pud(x) | |
23053 | +#endif | |
23054 | + | |
23055 | +#ifndef __PAGETABLE_PMD_FOLDED | |
23056 | +#define pmd_val(x) xen_pmd_val(x) | |
23057 | +#define __pmd(x) xen_make_pmd(x) | |
23058 | #endif | |
23059 | + | |
23060 | +#define pte_val(x) xen_pte_val(x) | |
23061 | +#define __pte(x) xen_make_pte(x) | |
23062 | + | |
23063 | +#define __pa(x) __phys_addr((unsigned long)(x)) | |
23064 | +/* __pa_symbol should be used for C visible symbols. | |
23065 | + This seems to be the official gcc blessed way to do such arithmetic. */ | |
23066 | +#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) | |
23067 | + | |
23068 | +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) | |
23069 | + | |
23070 | +#define __boot_va(x) __va(x) | |
23071 | +#define __boot_pa(x) __pa(x) | |
23072 | + | |
23073 | +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) | |
23074 | +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) | |
23075 | +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) | |
23076 | + | |
23077 | +#endif /* __ASSEMBLY__ */ | |
23078 | + | |
23079 | +#include <asm-generic/memory_model.h> | |
23080 | +#include <asm-generic/page.h> | |
23081 | + | |
23082 | +#define __HAVE_ARCH_GATE_AREA 1 | |
23083 | + | |
23084 | +#endif /* __KERNEL__ */ | |
23085 | +#endif /* _ASM_X86_PAGE_H */ | |
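The unified page.h now hosts the whole pgd/pud/pmd/pte wrapper family for every PAGETABLE_LEVELS value, with the folded-level cases delegating to the next level up via the pgtable-nopud/nopmd headers. Each make/val pair converts between pseudo-physical and machine addresses exactly once per direction, and only for entries that are present and not marked _PAGE_IO; this is the invariant that lets generic mm code stay oblivious to Xen's p2m layer. An illustrative check of the round trip (not from the patch; pfn is assumed to be a valid local RAM frame):

	static void pte_roundtrip_check(unsigned long pfn)
	{
		pteval_t val = ((pteval_t)pfn << PAGE_SHIFT) | _PAGE_PRESENT;
		pte_t pte = __pte(val);		/* pseudo-phys -> machine going in */

		BUG_ON(pte_val(pte) != val);	/* machine -> pseudo-phys coming out */
	}

An I/O mapping opts out of both conversions: __pte_val() of a pte created with _PAGE_IO set still holds the untranslated machine (bus) address, and pte_val() returns it unchanged.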
23086 | --- a/include/asm-x86/mach-xen/asm/pci_64.h | |
23087 | +++ b/include/asm-x86/mach-xen/asm/pci_64.h | |
23088 | @@ -26,7 +26,6 @@ extern int (*pci_config_write)(int seg, | |
23089 | ||
23090 | ||
23091 | extern void pci_iommu_alloc(void); | |
23092 | -extern int iommu_setup(char *opt); | |
23093 | ||
23094 | /* The PCI address space does equal the physical memory | |
23095 | * address space. The networking and block device layers use | |
23096 | --- a/include/asm-x86/mach-xen/asm/pci.h | |
23097 | +++ b/include/asm-x86/mach-xen/asm/pci.h | |
23098 | @@ -71,6 +71,7 @@ extern int pci_mmap_page_range(struct pc | |
23099 | ||
23100 | ||
23101 | #ifdef CONFIG_PCI | |
23102 | +extern void early_quirks(void); | |
23103 | static inline void pci_dma_burst_advice(struct pci_dev *pdev, | |
23104 | enum pci_dma_burst_strategy *strat, | |
23105 | unsigned long *strategy_parameter) | |
23106 | @@ -78,9 +79,10 @@ static inline void pci_dma_burst_advice( | |
23107 | *strat = PCI_DMA_BURST_INFINITY; | |
23108 | *strategy_parameter = ~0UL; | |
23109 | } | |
23110 | +#else | |
23111 | +static inline void early_quirks(void) { } | |
23112 | #endif | |
23113 | ||
23114 | - | |
23115 | #endif /* __KERNEL__ */ | |
23116 | ||
23117 | #ifdef CONFIG_X86_32 | |
23118 | @@ -95,6 +97,19 @@ static inline void pci_dma_burst_advice( | |
23119 | /* generic pci stuff */ | |
23120 | #include <asm-generic/pci.h> | |
23121 | ||
23122 | +#ifdef CONFIG_NUMA | |
23123 | +/* Returns the node based on pci bus */ | |
23124 | +static inline int __pcibus_to_node(struct pci_bus *bus) | |
23125 | +{ | |
23126 | + struct pci_sysdata *sd = bus->sysdata; | |
23127 | + | |
23128 | + return sd->node; | |
23129 | +} | |
23130 | ||
23131 | +static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus) | |
23132 | +{ | |
23133 | + return node_to_cpumask(__pcibus_to_node(bus)); | |
23134 | +} | |
23135 | +#endif | |
23136 | ||
23137 | #endif | |
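Two additions here: early_quirks() gains a !CONFIG_PCI stub so callers can drop their #ifdefs, and the NUMA helpers expose the node recorded in struct pci_sysdata. A hypothetical use of the node mapping, keeping a device's buffers local to its NUMA node (pdev and size are stand-ins, and the double-underscore helper is called directly purely for illustration):

	struct page *page = alloc_pages_node(__pcibus_to_node(pdev->bus),
					     GFP_KERNEL, get_order(size));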
23138 | --- a/include/asm-x86/mach-xen/asm/pgalloc_32.h | |
23139 | +++ b/include/asm-x86/mach-xen/asm/pgalloc_32.h | |
23140 | @@ -3,69 +3,109 @@ | |
23141 | ||
23142 | #include <linux/threads.h> | |
23143 | #include <linux/mm.h> /* for struct page */ | |
23144 | +#include <linux/pagemap.h> | |
23145 | +#include <asm/tlb.h> | |
23146 | +#include <asm-generic/tlb.h> | |
23147 | #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ | |
23148 | ||
23149 | #define paravirt_alloc_pt(mm, pfn) do { } while (0) | |
23150 | -#define paravirt_alloc_pd(pfn) do { } while (0) | |
23151 | -#define paravirt_alloc_pd(pfn) do { } while (0) | |
23152 | +#define paravirt_alloc_pd(mm, pfn) do { } while (0) | |
23153 | #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0) | |
23154 | #define paravirt_release_pt(pfn) do { } while (0) | |
23155 | #define paravirt_release_pd(pfn) do { } while (0) | |
23156 | ||
23157 | -#define pmd_populate_kernel(mm, pmd, pte) \ | |
23158 | -do { \ | |
23159 | - paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); \ | |
23160 | - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \ | |
23161 | -} while (0) | |
23162 | - | |
23163 | -#define pmd_populate(mm, pmd, pte) \ | |
23164 | -do { \ | |
23165 | - unsigned long pfn = page_to_pfn(pte); \ | |
23166 | - paravirt_alloc_pt(mm, pfn); \ | |
23167 | - if (PagePinned(virt_to_page((mm)->pgd))) { \ | |
23168 | - if (!PageHighMem(pte)) \ | |
23169 | - BUG_ON(HYPERVISOR_update_va_mapping( \ | |
23170 | - (unsigned long)__va(pfn << PAGE_SHIFT), \ | |
23171 | - pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \ | |
23172 | - else if (!test_and_set_bit(PG_pinned, &pte->flags)) \ | |
23173 | - kmap_flush_unused(); \ | |
23174 | - set_pmd(pmd, \ | |
23175 | - __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \ | |
23176 | - } else \ | |
23177 | - *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \ | |
23178 | -} while (0) | |
23179 | +static inline void pmd_populate_kernel(struct mm_struct *mm, | |
23180 | + pmd_t *pmd, pte_t *pte) | |
23181 | +{ | |
23182 | + paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); | |
23183 | + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); | |
23184 | +} | |
23185 | + | |
23186 | +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) | |
23187 | +{ | |
23188 | + unsigned long pfn = page_to_pfn(pte); | |
23189 | + | |
23190 | + paravirt_alloc_pt(mm, pfn); | |
23191 | + if (PagePinned(virt_to_page(mm->pgd))) { | |
23192 | + if (!PageHighMem(pte)) | |
23193 | + BUG_ON(HYPERVISOR_update_va_mapping( | |
23194 | + (unsigned long)__va(pfn << PAGE_SHIFT), | |
23195 | + pfn_pte(pfn, PAGE_KERNEL_RO), 0)); | |
23196 | + else if (!test_and_set_bit(PG_pinned, &pte->flags)) | |
23197 | + kmap_flush_unused(); | |
23198 | + set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)); | |
23199 | + } else | |
23200 | + *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE); | |
23201 | +} | |
23202 | +#define pmd_pgtable(pmd) pmd_page(pmd) | |
23203 | ||
23204 | /* | |
23205 | * Allocate and free page tables. | |
23206 | */ | |
23207 | +extern void pgd_test_and_unpin(pgd_t *); | |
23208 | extern pgd_t *pgd_alloc(struct mm_struct *); | |
23209 | -extern void pgd_free(pgd_t *pgd); | |
23210 | +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); | |
23211 | ||
23212 | extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); | |
23213 | -extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); | |
23214 | +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); | |
23215 | ||
23216 | -static inline void pte_free_kernel(pte_t *pte) | |
23217 | +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) | |
23218 | { | |
23219 | make_lowmem_page_writable(pte, XENFEAT_writable_page_tables); | |
23220 | free_page((unsigned long)pte); | |
23221 | } | |
23222 | ||
23223 | -extern void pte_free(struct page *pte); | |
23224 | +extern void __pte_free(pgtable_t); | |
23225 | +static inline void pte_free(struct mm_struct *mm, pgtable_t pte) | |
23226 | +{ | |
23227 | + __pte_free(pte); | |
23228 | +} | |
23229 | + | |
23230 | ||
23231 | -#define __pte_free_tlb(tlb,pte) \ | |
23232 | -do { \ | |
23233 | - paravirt_release_pt(page_to_pfn(pte)); \ | |
23234 | - tlb_remove_page((tlb),(pte)); \ | |
23235 | -} while (0) | |
23236 | +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); | |
23237 | ||
23238 | #ifdef CONFIG_X86_PAE | |
23239 | /* | |
23240 | * In the PAE case we free the pmds as part of the pgd. | |
23241 | */ | |
23242 | -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) | |
23243 | -#define pmd_free(x) do { } while (0) | |
23244 | -#define __pmd_free_tlb(tlb,x) do { } while (0) | |
23245 | -#define pud_populate(mm, pmd, pte) BUG() | |
23246 | -#endif | |
23247 | +extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long); | |
23248 | + | |
23249 | +extern void __pmd_free(pgtable_t); | |
23250 | +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) | |
23251 | +{ | |
23252 | + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); | |
23253 | + __pmd_free(virt_to_page(pmd)); | |
23254 | +} | |
23255 | + | |
23256 | +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); | |
23257 | + | |
23258 | +static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) | |
23259 | +{ | |
23260 | + struct page *page = virt_to_page(pmd); | |
23261 | + unsigned long pfn = page_to_pfn(page); | |
23262 | + | |
23263 | + paravirt_alloc_pd(mm, pfn); | |
23264 | + | |
23265 | + /* Note: almost everything apart from _PAGE_PRESENT is | |
23266 | + reserved at the pmd (PDPT) level. */ | |
23267 | + if (PagePinned(virt_to_page(mm->pgd))) { | |
23268 | + BUG_ON(PageHighMem(page)); | |
23269 | + BUG_ON(HYPERVISOR_update_va_mapping( | |
23270 | + (unsigned long)__va(pfn << PAGE_SHIFT), | |
23271 | + pfn_pte(pfn, PAGE_KERNEL_RO), 0)); | |
23272 | + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); | |
23273 | + } else | |
23274 | + *pudp = __pud(__pa(pmd) | _PAGE_PRESENT); | |
23275 | + | |
23276 | + /* | |
23277 | + * According to Intel App note "TLBs, Paging-Structure Caches, | |
23278 | + * and Their Invalidation", April 2007, document 317080-001, | |
23279 | + * section 8.1: in PAE mode we explicitly have to flush the | |
23280 | + * TLB via cr3 if the top-level pgd is changed... | |
23281 | + */ | |
23282 | + if (mm == current->active_mm) | |
23283 | + xen_tlb_flush(); | |
23284 | +} | |
23285 | +#endif /* CONFIG_X86_PAE */ | |
23286 | ||
23287 | #endif /* _I386_PGALLOC_H */ | |
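The conversion of pgalloc_32.h from multi-line macros to inline functions changes no behaviour but restores type checking and readable control flow. The Intel reference quoted in pud_populate() is the key subtlety: PAE PDPT entries are cached by the CPU when CR3 is loaded, so after rewriting the top level an invlpg is not enough. On native hardware the required flush reduces to a CR3 reload, sketched below; under Xen, xen_tlb_flush() asks the hypervisor for the equivalent.

	static inline void reload_pdptes(void)	/* hypothetical native analogue */
	{
		write_cr3(read_cr3());	/* reloading CR3 re-reads all four PDPTEs */
	}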
23288 | --- a/include/asm-x86/mach-xen/asm/pgalloc_64.h | |
23289 | +++ b/include/asm-x86/mach-xen/asm/pgalloc_64.h | |
23290 | @@ -6,30 +6,13 @@ | |
23291 | #include <linux/mm.h> | |
23292 | #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ | |
23293 | ||
23294 | -#include <xen/features.h> | |
23295 | -void make_page_readonly(void *va, unsigned int feature); | |
23296 | -void make_page_writable(void *va, unsigned int feature); | |
23297 | -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); | |
23298 | -void make_pages_writable(void *va, unsigned int nr, unsigned int feature); | |
23299 | +pmd_t *early_get_pmd(unsigned long va); | |
23300 | +void early_make_page_readonly(void *va, unsigned int feature); | |
23301 | ||
23302 | #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) | |
23303 | ||
23304 | -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) | |
23305 | -{ | |
23306 | - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))); | |
23307 | -} | |
23308 | - | |
23309 | -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) | |
23310 | -{ | |
23311 | - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) { | |
23312 | - BUG_ON(HYPERVISOR_update_va_mapping( | |
23313 | - (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT), | |
23314 | - pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0)); | |
23315 | - set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); | |
23316 | - } else { | |
23317 | - *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)); | |
23318 | - } | |
23319 | -} | |
23320 | +#define pmd_populate_kernel(mm, pmd, pte) \ | |
23321 | + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) | |
23322 | ||
23323 | static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) | |
23324 | { | |
23325 | @@ -63,53 +46,58 @@ static inline void pgd_populate(struct m | |
23326 | } | |
23327 | } | |
23328 | ||
23329 | -extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr); | |
23330 | -extern void pte_free(struct page *pte); | |
23331 | +#define pmd_pgtable(pmd) pmd_page(pmd) | |
23332 | ||
23333 | -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) | |
23334 | +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) | |
23335 | { | |
23336 | - struct page *pg; | |
23337 | - | |
23338 | - pg = pte_alloc_one(mm, addr); | |
23339 | - return pg ? page_address(pg) : NULL; | |
23340 | + if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) { | |
23341 | + BUG_ON(HYPERVISOR_update_va_mapping( | |
23342 | + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT), | |
23343 | + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0)); | |
23344 | + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); | |
23345 | + } else { | |
23346 | + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)); | |
23347 | + } | |
23348 | } | |
23349 | ||
23350 | -static inline void pmd_free(pmd_t *pmd) | |
23351 | +extern void __pmd_free(pgtable_t); | |
23352 | +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) | |
23353 | { | |
23354 | BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); | |
23355 | - pte_free(virt_to_page(pmd)); | |
23356 | + __pmd_free(virt_to_page(pmd)); | |
23357 | } | |
23358 | ||
23359 | +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr); | |
23360 | + | |
23361 | static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) | |
23362 | { | |
23363 | - struct page *pg; | |
23364 | - | |
23365 | - pg = pte_alloc_one(mm, addr); | |
23366 | - return pg ? page_address(pg) : NULL; | |
23367 | + return (pud_t *)pmd_alloc_one(mm, addr); | |
23368 | } | |
23369 | ||
23370 | -static inline void pud_free(pud_t *pud) | |
23371 | +static inline void pud_free(struct mm_struct *mm, pud_t *pud) | |
23372 | { | |
23373 | BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); | |
23374 | - pte_free(virt_to_page(pud)); | |
23375 | + __pmd_free(virt_to_page(pud)); | |
23376 | } | |
23377 | ||
23378 | static inline void pgd_list_add(pgd_t *pgd) | |
23379 | { | |
23380 | struct page *page = virt_to_page(pgd); | |
23381 | + unsigned long flags; | |
23382 | ||
23383 | - spin_lock(&pgd_lock); | |
23384 | + spin_lock_irqsave(&pgd_lock, flags); | |
23385 | list_add(&page->lru, &pgd_list); | |
23386 | - spin_unlock(&pgd_lock); | |
23387 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
23388 | } | |
23389 | ||
23390 | static inline void pgd_list_del(pgd_t *pgd) | |
23391 | { | |
23392 | struct page *page = virt_to_page(pgd); | |
23393 | + unsigned long flags; | |
23394 | ||
23395 | - spin_lock(&pgd_lock); | |
23396 | + spin_lock_irqsave(&pgd_lock, flags); | |
23397 | list_del(&page->lru); | |
23398 | - spin_unlock(&pgd_lock); | |
23399 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
23400 | } | |
23401 | ||
23402 | extern void pgd_test_and_unpin(pgd_t *); | |
23403 | @@ -145,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm | |
23404 | return pgd; | |
23405 | } | |
23406 | ||
23407 | -static inline void pgd_free(pgd_t *pgd) | |
23408 | +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) | |
23409 | { | |
23410 | pgd_test_and_unpin(pgd); | |
23411 | pgd_list_del(pgd); | |
23412 | @@ -161,17 +149,30 @@ static inline pte_t *pte_alloc_one_kerne | |
23413 | return pte; | |
23414 | } | |
23415 | ||
23416 | +extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); | |
23417 | + | |
23418 | /* Should really implement gc for free page table pages. This could be | |
23419 | done with a reference count in struct page. */ | |
23420 | ||
23421 | -static inline void pte_free_kernel(pte_t *pte) | |
23422 | +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) | |
23423 | { | |
23424 | BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); | |
23425 | make_page_writable(pte, XENFEAT_writable_page_tables); | |
23426 | free_page((unsigned long)pte); | |
23427 | } | |
23428 | ||
23429 | -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) | |
23430 | +extern void __pte_free(pgtable_t); | |
23431 | +static inline void pte_free(struct mm_struct *mm, pgtable_t pte) | |
23432 | +{ | |
23433 | + __pte_free(pte); | |
23434 | +} | |
23435 | + | |
23436 | +#define __pte_free_tlb(tlb,pte) \ | |
23437 | +do { \ | |
23438 | + pgtable_page_dtor((pte)); \ | |
23439 | + tlb_remove_page((tlb), (pte)); \ | |
23440 | +} while (0) | |
23441 | + | |
23442 | #define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) | |
23443 | #define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) | |
23444 | ||
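pgd_list_add()/pgd_list_del() switch to the irqsave spinlock forms because pgd_lock, as in mainline, can now be taken from paths where interrupts must be excluded while the pgd list is walked (the new pageattr code elsewhere in this patch is a likely user). The walking side that pairs with this looks roughly like the following sketch (hypothetical walker; the per-pgd body is elided):

	static void sync_all_pgds(void)
	{
		unsigned long flags;
		struct page *page;

		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd = (pgd_t *)page_address(page);
			/* ... apply the kernel-mapping update to this pgd ... */
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}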
23445 | --- a/include/asm-x86/mach-xen/asm/pgtable_32.h | |
23446 | +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h | |
23447 | @@ -1,8 +1,6 @@ | |
23448 | #ifndef _I386_PGTABLE_H | |
23449 | #define _I386_PGTABLE_H | |
23450 | ||
23451 | -#include <asm/hypervisor.h> | |
23452 | - | |
23453 | /* | |
23454 | * The Linux memory management assumes a three-level page table setup. On | |
23455 | * the i386, we use that, but "fold" the mid level into the top-level page | |
23456 | @@ -25,20 +23,10 @@ | |
23457 | ||
23458 | struct vm_area_struct; | |
23459 | ||
23460 | -/* | |
23461 | - * ZERO_PAGE is a global shared page that is always zero: used | |
23462 | - * for zero-mapped memory areas etc.. | |
23463 | - */ | |
23464 | -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) | |
23465 | -extern unsigned long empty_zero_page[1024]; | |
23466 | extern pgd_t *swapper_pg_dir; | |
23467 | -extern struct kmem_cache *pmd_cache; | |
23468 | -extern spinlock_t pgd_lock; | |
23469 | -extern struct page *pgd_list; | |
23470 | -void check_pgt_cache(void); | |
23471 | ||
23472 | -void pmd_ctor(struct kmem_cache *, void *); | |
23473 | -void pgtable_cache_init(void); | |
23474 | +static inline void pgtable_cache_init(void) { } | |
23475 | +static inline void check_pgt_cache(void) { } | |
23476 | void paging_init(void); | |
23477 | ||
23478 | ||
23479 | @@ -58,16 +46,9 @@ void paging_init(void); | |
23480 | #define PGDIR_SIZE (1UL << PGDIR_SHIFT) | |
23481 | #define PGDIR_MASK (~(PGDIR_SIZE-1)) | |
23482 | ||
23483 | -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) | |
23484 | -#define FIRST_USER_ADDRESS 0 | |
23485 | - | |
23486 | #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) | |
23487 | #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) | |
23488 | ||
23489 | -#define TWOLEVEL_PGDIR_SHIFT 22 | |
23490 | -#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT) | |
23491 | -#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS) | |
23492 | - | |
23493 | /* Just any arbitrary offset to the start of the vmalloc VM area: the | |
23494 | * current 8MB value just means that there will be an 8MB "hole" after the | |
23495 | * physical memory until the kernel virtual memory starts. That means that | |
23496 | @@ -78,121 +59,19 @@ void paging_init(void); | |
23497 | #define VMALLOC_OFFSET (8*1024*1024) | |
23498 | #define VMALLOC_START (((unsigned long) high_memory + \ | |
23499 | 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) | |
23500 | -#ifdef CONFIG_HIGHMEM | |
23501 | -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) | |
23502 | -#else | |
23503 | -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) | |
23504 | -#endif | |
23505 | - | |
23506 | -/* | |
23507 | - * _PAGE_PSE set in the page directory entry just means that | |
23508 | - * the page directory entry points directly to a 4MB-aligned block of | |
23509 | - * memory. | |
23510 | - */ | |
23511 | -#define _PAGE_BIT_PRESENT 0 | |
23512 | -#define _PAGE_BIT_RW 1 | |
23513 | -#define _PAGE_BIT_USER 2 | |
23514 | -#define _PAGE_BIT_PWT 3 | |
23515 | -#define _PAGE_BIT_PCD 4 | |
23516 | -#define _PAGE_BIT_ACCESSED 5 | |
23517 | -#define _PAGE_BIT_DIRTY 6 | |
23518 | -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */ | |
23519 | -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ | |
23520 | -/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */ | |
23521 | -#define _PAGE_BIT_UNUSED2 10 | |
23522 | -#define _PAGE_BIT_UNUSED3 11 | |
23523 | -#define _PAGE_BIT_NX 63 | |
23524 | - | |
23525 | -#define _PAGE_PRESENT 0x001 | |
23526 | -#define _PAGE_RW 0x002 | |
23527 | -#define _PAGE_USER 0x004 | |
23528 | -#define _PAGE_PWT 0x008 | |
23529 | -#define _PAGE_PCD 0x010 | |
23530 | -#define _PAGE_ACCESSED 0x020 | |
23531 | -#define _PAGE_DIRTY 0x040 | |
23532 | -#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */ | |
23533 | -#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */ | |
23534 | -/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */ | |
23535 | -#define _PAGE_UNUSED2 0x400 | |
23536 | -#define _PAGE_UNUSED3 0x800 | |
23537 | - | |
23538 | -/* If _PAGE_PRESENT is clear, we use these: */ | |
23539 | -#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ | |
23540 | -#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE; | |
23541 | - pte_present gives true */ | |
23542 | #ifdef CONFIG_X86_PAE | |
23543 | -#define _PAGE_NX (1ULL<<_PAGE_BIT_NX) | |
23544 | +#define LAST_PKMAP 512 | |
23545 | #else | |
23546 | -#define _PAGE_NX 0 | |
23547 | +#define LAST_PKMAP 1024 | |
23548 | #endif | |
23549 | ||
23550 | -/* Mapped page is I/O or foreign and has no associated page struct. */ | |
23551 | -#define _PAGE_IO 0x200 | |
23552 | +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK) | |
23553 | ||
23554 | -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) | |
23555 | -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | |
23556 | -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO) | |
23557 | - | |
23558 | -#define PAGE_NONE \ | |
23559 | - __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) | |
23560 | -#define PAGE_SHARED \ | |
23561 | - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) | |
23562 | - | |
23563 | -#define PAGE_SHARED_EXEC \ | |
23564 | - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) | |
23565 | -#define PAGE_COPY_NOEXEC \ | |
23566 | - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
23567 | -#define PAGE_COPY_EXEC \ | |
23568 | - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
23569 | -#define PAGE_COPY \ | |
23570 | - PAGE_COPY_NOEXEC | |
23571 | -#define PAGE_READONLY \ | |
23572 | - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
23573 | -#define PAGE_READONLY_EXEC \ | |
23574 | - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
23575 | - | |
23576 | -#define _PAGE_KERNEL \ | |
23577 | - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX) | |
23578 | -#define _PAGE_KERNEL_EXEC \ | |
23579 | - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) | |
23580 | - | |
23581 | -extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; | |
23582 | -#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) | |
23583 | -#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) | |
23584 | -#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD) | |
23585 | -#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) | |
23586 | -#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) | |
23587 | - | |
23588 | -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) | |
23589 | -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) | |
23590 | -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) | |
23591 | -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) | |
23592 | -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) | |
23593 | -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) | |
23594 | -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) | |
23595 | - | |
23596 | -/* | |
23597 | - * The i386 can't do page protection for execute, and considers that | |
23598 | - * the same are read. Also, write permissions imply read permissions. | |
23599 | - * This is the closest we can get.. | |
23600 | - */ | |
23601 | -#define __P000 PAGE_NONE | |
23602 | -#define __P001 PAGE_READONLY | |
23603 | -#define __P010 PAGE_COPY | |
23604 | -#define __P011 PAGE_COPY | |
23605 | -#define __P100 PAGE_READONLY_EXEC | |
23606 | -#define __P101 PAGE_READONLY_EXEC | |
23607 | -#define __P110 PAGE_COPY_EXEC | |
23608 | -#define __P111 PAGE_COPY_EXEC | |
23609 | - | |
23610 | -#define __S000 PAGE_NONE | |
23611 | -#define __S001 PAGE_READONLY | |
23612 | -#define __S010 PAGE_SHARED | |
23613 | -#define __S011 PAGE_SHARED | |
23614 | -#define __S100 PAGE_READONLY_EXEC | |
23615 | -#define __S101 PAGE_READONLY_EXEC | |
23616 | -#define __S110 PAGE_SHARED_EXEC | |
23617 | -#define __S111 PAGE_SHARED_EXEC | |
23618 | +#ifdef CONFIG_HIGHMEM | |
23619 | +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) | |
23620 | +#else | |
23621 | +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) | |
23622 | +#endif | |
23623 | ||
23624 | /* | |
23625 | * Define this if things work differently on an i386 and an i486: | |
23626 | @@ -221,28 +100,6 @@ extern unsigned long pg0[]; | |
23627 | ||
23628 | #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) | |
23629 | ||
23630 | -/* | |
23631 | - * The following only work if pte_present() is true. | |
23632 | - * Undefined behaviour if not.. | |
23633 | - */ | |
23634 | -static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } | |
23635 | -static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } | |
23636 | -static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } | |
23637 | -static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; } | |
23638 | - | |
23639 | -/* | |
23640 | - * The following only works if pte_present() is not true. | |
23641 | - */ | |
23642 | -static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; } | |
23643 | - | |
23644 | -static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; } | |
23645 | -static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; } | |
23646 | -static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; } | |
23647 | -static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } | |
23648 | -static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } | |
23649 | -static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } | |
23650 | -static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } | |
23651 | - | |
23652 | #ifdef CONFIG_X86_PAE | |
23653 | # include <asm/pgtable-3level.h> | |
23654 | #else | |
23655 | @@ -250,111 +107,6 @@ static inline pte_t pte_mkhuge(pte_t pte | |
23656 | #endif | |
23657 | ||
23658 | /* | |
23659 | - * Rules for using pte_update - it must be called after any PTE update which | |
23660 | - * has not been done using the set_pte / clear_pte interfaces. It is used by | |
23661 | - * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE | |
23662 | - * updates should either be sets, clears, or set_pte_atomic for P->P | |
23663 | - * transitions, which means this hook should only be called for user PTEs. | |
23664 | - * This hook implies a P->P protection or access change has taken place, which | |
23665 | - * requires a subsequent TLB flush. The notification can optionally be delayed | |
23666 | - * until the TLB flush event by using the pte_update_defer form of the | |
23667 | - * interface, but care must be taken to assure that the flush happens while | |
23668 | - * still holding the same page table lock so that the shadow and primary pages | |
23669 | - * do not become out of sync on SMP. | |
23670 | - */ | |
23671 | -#define pte_update(mm, addr, ptep) do { } while (0) | |
23672 | -#define pte_update_defer(mm, addr, ptep) do { } while (0) | |
23673 | - | |
23674 | -/* local pte updates need not use xchg for locking */ | |
23675 | -static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) | |
23676 | -{ | |
23677 | - xen_set_pte(ptep, __pte(0)); | |
23678 | - return res; | |
23679 | -} | |
23680 | - | |
23681 | -/* | |
23682 | - * We only update the dirty/accessed state if we set | |
23683 | - * the dirty bit by hand in the kernel, since the hardware | |
23684 | - * will do the accessed bit for us, and we don't want to | |
23685 | - * race with other CPU's that might be updating the dirty | |
23686 | - * bit at the same time. | |
23687 | - */ | |
23688 | -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | |
23689 | -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ | |
23690 | -({ \ | |
23691 | - int __changed = !pte_same(*(ptep), entry); \ | |
23692 | - if (__changed && (dirty)) { \ | |
23693 | - if ( likely((vma)->vm_mm == current->mm) ) { \ | |
23694 | - BUG_ON(HYPERVISOR_update_va_mapping(address, \ | |
23695 | - entry, \ | |
23696 | - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ | |
23697 | - UVMF_INVLPG|UVMF_MULTI)); \ | |
23698 | - } else { \ | |
23699 | - xen_l1_entry_update(ptep, entry); \ | |
23700 | - flush_tlb_page(vma, address); \ | |
23701 | - } \ | |
23702 | - } \ | |
23703 | - __changed; \ | |
23704 | -}) | |
23705 | - | |
23706 | -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG | |
23707 | -#define ptep_test_and_clear_young(vma, addr, ptep) ({ \ | |
23708 | - int __ret = 0; \ | |
23709 | - if (pte_young(*(ptep))) \ | |
23710 | - __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \ | |
23711 | - &(ptep)->pte_low); \ | |
23712 | - if (__ret) \ | |
23713 | - pte_update((vma)->vm_mm, addr, ptep); \ | |
23714 | - __ret; \ | |
23715 | -}) | |
23716 | - | |
23717 | -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH | |
23718 | -#define ptep_clear_flush_young(vma, address, ptep) \ | |
23719 | -({ \ | |
23720 | - pte_t __pte = *(ptep); \ | |
23721 | - int __young = pte_young(__pte); \ | |
23722 | - __pte = pte_mkold(__pte); \ | |
23723 | - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \ | |
23724 | - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \ | |
23725 | - else if (__young) \ | |
23726 | - (ptep)->pte_low = __pte.pte_low; \ | |
23727 | - __young; \ | |
23728 | -}) | |
23729 | - | |
23730 | -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR | |
23731 | -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
23732 | -{ | |
23733 | - pte_t pte = *ptep; | |
23734 | - if (!pte_none(pte) | |
23735 | - && (mm != &init_mm | |
23736 | - || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) { | |
23737 | - pte = xen_ptep_get_and_clear(ptep, pte); | |
23738 | - pte_update(mm, addr, ptep); | |
23739 | - } | |
23740 | - return pte; | |
23741 | -} | |
23742 | - | |
23743 | -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL | |
23744 | -#define ptep_get_and_clear_full(mm, addr, ptep, full) \ | |
23745 | - ((full) ? ({ \ | |
23746 | - pte_t __res = *(ptep); \ | |
23747 | - if (PagePinned(virt_to_page((mm)->pgd))) \ | |
23748 | - xen_l1_entry_update(ptep, __pte(0)); \ | |
23749 | - else \ | |
23750 | - *(ptep) = __pte(0); \ | |
23751 | - __res; \ | |
23752 | - }) : \ | |
23753 | - ptep_get_and_clear(mm, addr, ptep)) | |
23754 | - | |
23755 | -#define __HAVE_ARCH_PTEP_SET_WRPROTECT | |
23756 | -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
23757 | -{ | |
23758 | - pte_t pte = *ptep; | |
23759 | - if (pte_write(pte)) | |
23760 | - set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); | |
23761 | -} | |
23762 | - | |
23763 | -/* | |
23764 | * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); | |
23765 | * | |
23766 | * dst - pointer to pgd range anywhere on a pgd page | |
23767 | @@ -383,26 +135,6 @@ static inline void clone_pgd_range(pgd_t | |
23768 | ||
23769 | #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) | |
23770 | ||
23771 | -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) | |
23772 | -{ | |
23773 | - /* | |
23774 | - * Since this might change the present bit (which controls whether | |
23775 | - * a pte_t object has undergone p2m translation), we must use | |
23776 | - * pte_val() on the input pte and __pte() for the return value. | |
23777 | - */ | |
23778 | - paddr_t pteval = pte_val(pte); | |
23779 | - | |
23780 | - pteval &= _PAGE_CHG_MASK; | |
23781 | - pteval |= pgprot_val(newprot); | |
23782 | -#ifdef CONFIG_X86_PAE | |
23783 | - pteval &= __supported_pte_mask; | |
23784 | -#endif | |
23785 | - return __pte(pteval); | |
23786 | -} | |
23787 | - | |
23788 | -#define pmd_large(pmd) \ | |
23789 | -((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT)) | |
23790 | - | |
23791 | /* | |
23792 | * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] | |
23793 | * | |
23794 | @@ -424,6 +156,8 @@ static inline pte_t pte_modify(pte_t pte | |
23795 | */ | |
23796 | #define pgd_offset_k(address) pgd_offset(&init_mm, address) | |
23797 | ||
23798 | +static inline int pud_large(pud_t pud) { return 0; } | |
23799 | + | |
23800 | /* | |
23801 | * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] | |
23802 | * | |
23803 | @@ -449,26 +183,6 @@ static inline pte_t pte_modify(pte_t pte | |
23804 | #define pmd_page_vaddr(pmd) \ | |
23805 | ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) | |
23806 | ||
23807 | -/* | |
23808 | - * Helper function that returns the kernel pagetable entry controlling | |
23809 | - * the virtual address 'address'. NULL means no pagetable entry present. | |
23810 | - * NOTE: the return type is pte_t but if the pmd is PSE then we return it | |
23811 | - * as a pte too. | |
23812 | - */ | |
23813 | -extern pte_t *lookup_address(unsigned long address); | |
23814 | - | |
23815 | -/* | |
23816 | - * Make a given kernel text page executable/non-executable. | |
23817 | - * Returns the previous executability setting of that page (which | |
23818 | - * is used to restore the previous state). Used by the SMP bootup code. | |
23819 | - * NOTE: this is an __init function for security reasons. | |
23820 | - */ | |
23821 | -#ifdef CONFIG_X86_PAE | |
23822 | - extern int set_kernel_exec(unsigned long vaddr, int enable); | |
23823 | -#else | |
23824 | - static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;} | |
23825 | -#endif | |
23826 | - | |
23827 | #if defined(CONFIG_HIGHPTE) | |
23828 | #define pte_offset_map(dir, address) \ | |
23829 | ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) | |
23830 | @@ -496,59 +210,22 @@ extern pte_t *lookup_address(unsigned lo | |
23831 | */ | |
23832 | #define update_mmu_cache(vma,address,pte) do { } while (0) | |
23833 | ||
23834 | -#include <xen/features.h> | |
23835 | void make_lowmem_page_readonly(void *va, unsigned int feature); | |
23836 | void make_lowmem_page_writable(void *va, unsigned int feature); | |
23837 | -void make_page_readonly(void *va, unsigned int feature); | |
23838 | -void make_page_writable(void *va, unsigned int feature); | |
23839 | -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); | |
23840 | -void make_pages_writable(void *va, unsigned int nr, unsigned int feature); | |
23841 | - | |
23842 | -#define virt_to_ptep(va) \ | |
23843 | -({ \ | |
23844 | - pte_t *__ptep = lookup_address((unsigned long)(va)); \ | |
23845 | - BUG_ON(!__ptep || !pte_present(*__ptep)); \ | |
23846 | - __ptep; \ | |
23847 | -}) | |
23848 | - | |
23849 | -#define arbitrary_virt_to_machine(va) \ | |
23850 | - (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \ | |
23851 | - | ((unsigned long)(va) & (PAGE_SIZE - 1))) | |
23852 | ||
23853 | #endif /* !__ASSEMBLY__ */ | |
23854 | ||
23855 | +/* | |
23856 | + * kern_addr_valid() is (1) for FLATMEM and (0) for | |
23857 | + * SPARSEMEM and DISCONTIGMEM | |
23858 | + */ | |
23859 | #ifdef CONFIG_FLATMEM | |
23860 | #define kern_addr_valid(addr) (1) | |
23861 | -#endif /* CONFIG_FLATMEM */ | |
23862 | - | |
23863 | -int direct_remap_pfn_range(struct vm_area_struct *vma, | |
23864 | - unsigned long address, | |
23865 | - unsigned long mfn, | |
23866 | - unsigned long size, | |
23867 | - pgprot_t prot, | |
23868 | - domid_t domid); | |
23869 | -int direct_kernel_remap_pfn_range(unsigned long address, | |
23870 | - unsigned long mfn, | |
23871 | - unsigned long size, | |
23872 | - pgprot_t prot, | |
23873 | - domid_t domid); | |
23874 | -int create_lookup_pte_addr(struct mm_struct *mm, | |
23875 | - unsigned long address, | |
23876 | - uint64_t *ptep); | |
23877 | -int touch_pte_range(struct mm_struct *mm, | |
23878 | - unsigned long address, | |
23879 | - unsigned long size); | |
23880 | - | |
23881 | -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |
23882 | - unsigned long addr, unsigned long end, pgprot_t newprot, | |
23883 | - int dirty_accountable); | |
23884 | - | |
23885 | -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \ | |
23886 | - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) | |
23887 | +#else | |
23888 | +#define kern_addr_valid(kaddr) (0) | |
23889 | +#endif | |
23890 | ||
23891 | #define io_remap_pfn_range(vma,from,pfn,size,prot) \ | |
23892 | direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO) | |
23893 | ||
23894 | -#include <asm-generic/pgtable.h> | |
23895 | - | |
23896 | #endif /* _I386_PGTABLE_H */ | |
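The io_remap_pfn_range() definition above routes driver I/O mappings through direct_remap_pfn_range() with DOMID_IO, so an mmap handler needs no Xen-specific code of its own. A minimal sketch of such a handler; mydrv_mmap and MYDRV_MMIO_PFN are hypothetical names, not part of this patch:

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <asm/pgtable.h>

    #define MYDRV_MMIO_PFN 0xfd000UL        /* hypothetical device frame */

    static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
    {
            unsigned long size = vma->vm_end - vma->vm_start;

            /* Device memory must not be cached. */
            vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

            /* On mach-xen this expands to
             * direct_remap_pfn_range(vma, ..., DOMID_IO). */
            return io_remap_pfn_range(vma, vma->vm_start, MYDRV_MMIO_PFN,
                                      size, vma->vm_page_prot);
    }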
23897 | --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h | |
23898 | +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h | |
23899 | @@ -18,16 +18,18 @@ | |
23900 | printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \ | |
23901 | &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT) | |
23902 | ||
23903 | -#define pud_none(pud) 0 | |
23904 | -#define pud_bad(pud) 0 | |
23905 | -#define pud_present(pud) 1 | |
23906 | ||
23907 | -/* | |
23908 | - * All present pages with !NX bit are kernel-executable: | |
23909 | - */ | |
23910 | -static inline int pte_exec_kernel(pte_t pte) | |
23911 | +static inline int pud_none(pud_t pud) | |
23912 | +{ | |
23913 | + return __pud_val(pud) == 0; | |
23914 | +} | |
23915 | +static inline int pud_bad(pud_t pud) | |
23916 | { | |
23917 | - return !(__pte_val(pte) & _PAGE_NX); | |
23918 | + return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0; | |
23919 | +} | |
23920 | +static inline int pud_present(pud_t pud) | |
23921 | +{ | |
23922 | + return __pud_val(pud) & _PAGE_PRESENT; | |
23923 | } | |
23924 | ||
23925 | /* Rules for using set_pte: the pte being assigned *must* be | |
23926 | @@ -44,14 +46,6 @@ static inline void xen_set_pte(pte_t *pt | |
23927 | ptep->pte_low = pte.pte_low; | |
23928 | } | |
23929 | ||
23930 | -static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | |
23931 | - pte_t *ptep , pte_t pte) | |
23932 | -{ | |
23933 | - if ((mm != current->mm && mm != &init_mm) || | |
23934 | - HYPERVISOR_update_va_mapping(addr, pte, 0)) | |
23935 | - xen_set_pte(ptep, pte); | |
23936 | -} | |
23937 | - | |
23938 | static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte) | |
23939 | { | |
23940 | set_64bit((unsigned long long *)(ptep),__pte_val(pte)); | |
23941 | @@ -70,14 +64,11 @@ static inline void xen_set_pud(pud_t *pu | |
23942 | * entry, so clear the bottom half first and enforce ordering with a compiler | |
23943 | * barrier. | |
23944 | */ | |
23945 | -static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
23946 | +static inline void __xen_pte_clear(pte_t *ptep) | |
23947 | { | |
23948 | - if ((mm != current->mm && mm != &init_mm) | |
23949 | - || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { | |
23950 | - ptep->pte_low = 0; | |
23951 | - smp_wmb(); | |
23952 | - ptep->pte_high = 0; | |
23953 | - } | |
23954 | + ptep->pte_low = 0; | |
23955 | + smp_wmb(); | |
23956 | + ptep->pte_high = 0; | |
23957 | } | |
23958 | ||
23959 | static inline void xen_pmd_clear(pmd_t *pmd) | |
23960 | @@ -85,21 +76,25 @@ static inline void xen_pmd_clear(pmd_t * | |
23961 | xen_l2_entry_update(pmd, __pmd(0)); | |
23962 | } | |
23963 | ||
23964 | -#define set_pte(ptep, pte) xen_set_pte(ptep, pte) | |
23965 | -#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) | |
23966 | -#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte) | |
23967 | -#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) | |
23968 | -#define set_pud(pudp, pud) xen_set_pud(pudp, pud) | |
23969 | -#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) | |
23970 | -#define pmd_clear(pmd) xen_pmd_clear(pmd) | |
23971 | +static inline void pud_clear(pud_t *pudp) | |
23972 | +{ | |
23973 | + pgdval_t pgd; | |
23974 | + | |
23975 | + set_pud(pudp, __pud(0)); | |
23976 | ||
23977 | -/* | |
23978 | - * Pentium-II erratum A13: in PAE mode we explicitly have to flush | |
23979 | - * the TLB via cr3 if the top-level pgd is changed... | |
23980 | - * We do not let the generic code free and clear pgd entries due to | |
23981 | - * this erratum. | |
23982 | - */ | |
23983 | -static inline void pud_clear (pud_t * pud) { } | |
23984 | + /* | |
23985 | + * According to Intel App note "TLBs, Paging-Structure Caches, | |
23986 | + * and Their Invalidation", April 2007, document 317080-001, | |
23987 | + * section 8.1: in PAE mode we explicitly have to flush the | |
23988 | + * TLB via cr3 if the top-level pgd is changed... | |
23989 | + * | |
23990 | + * Make sure the pud entry we're updating is within the | |
23991 | + * current pgd to avoid unnecessary TLB flushes. | |
23992 | + */ | |
23993 | + pgd = read_cr3(); | |
23994 | + if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) | |
23995 | + xen_tlb_flush(); | |
23996 | +} | |
23997 | ||
23998 | #define pud_page(pud) \ | |
23999 | ((struct page *) __va(pud_val(pud) & PAGE_MASK)) | |
24000 | @@ -128,24 +123,6 @@ static inline pte_t xen_ptep_get_and_cle | |
24001 | #define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) | |
24002 | #endif | |
24003 | ||
24004 | -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH | |
24005 | -#define ptep_clear_flush(vma, addr, ptep) \ | |
24006 | -({ \ | |
24007 | - pte_t *__ptep = (ptep); \ | |
24008 | - pte_t __res = *__ptep; \ | |
24009 | - if (!pte_none(__res) && \ | |
24010 | - ((vma)->vm_mm != current->mm || \ | |
24011 | - HYPERVISOR_update_va_mapping(addr, __pte(0), \ | |
24012 | - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ | |
24013 | - UVMF_INVLPG|UVMF_MULTI))) { \ | |
24014 | - __ptep->pte_low = 0; \ | |
24015 | - smp_wmb(); \ | |
24016 | - __ptep->pte_high = 0; \ | |
24017 | - flush_tlb_page(vma, addr); \ | |
24018 | - } \ | |
24019 | - __res; \ | |
24020 | -}) | |
24021 | - | |
24022 | #define __HAVE_ARCH_PTE_SAME | |
24023 | static inline int pte_same(pte_t a, pte_t b) | |
24024 | { | |
24025 | @@ -168,26 +145,12 @@ static inline int pte_none(pte_t pte) | |
24026 | mfn_to_local_pfn(__pte_mfn(_pte)) : \ | |
24027 | __pte_mfn(_pte)) | |
24028 | ||
24029 | -extern unsigned long long __supported_pte_mask; | |
24030 | - | |
24031 | -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) | |
24032 | -{ | |
24033 | - return __pte((((unsigned long long)page_nr << PAGE_SHIFT) | | |
24034 | - pgprot_val(pgprot)) & __supported_pte_mask); | |
24035 | -} | |
24036 | - | |
24037 | -static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) | |
24038 | -{ | |
24039 | - return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | | |
24040 | - pgprot_val(pgprot)) & __supported_pte_mask); | |
24041 | -} | |
24042 | - | |
24043 | /* | |
24044 | * Bits 0, 6 and 7 are taken in the low part of the pte, | |
24045 | * put the 32 bits of offset into the high part. | |
24046 | */ | |
24047 | #define pte_to_pgoff(pte) ((pte).pte_high) | |
24048 | -#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) | |
24049 | +#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) | |
24050 | #define PTE_FILE_MAX_BITS 32 | |
24051 | ||
24052 | /* Encode and de-code a swap entry */ | |
24053 | @@ -195,8 +158,6 @@ static inline pmd_t pfn_pmd(unsigned lon | |
24054 | #define __swp_offset(x) ((x).val >> 5) | |
24055 | #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) | |
24056 | #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) | |
24057 | -#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val }) | |
24058 | - | |
24059 | -#define __pmd_free_tlb(tlb, x) do { } while (0) | |
24060 | +#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) | |
24061 | ||
24062 | #endif /* _I386_PGTABLE_3LEVEL_H */ | |
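The __xen_pte_clear() ordering above (low word, smp_wmb(), high word) exists because a PAE pte is two 32-bit words that the hardware walker can read at any moment: clearing pte_low first drops _PAGE_PRESENT (bit 0), so no walk can combine a stale high word with a half-cleared entry. A stand-alone user-space sketch of the same ordering, with a compiler barrier standing in for smp_wmb():

    #include <stdint.h>
    #include <stdio.h>

    struct pae_pte { volatile uint32_t pte_low, pte_high; };

    /* stands in for smp_wmb(); real code needs the SMP write barrier */
    #define wmb() __asm__ __volatile__("" ::: "memory")

    static void pae_pte_clear(struct pae_pte *ptep)
    {
            ptep->pte_low = 0;      /* present bit goes away first ... */
            wmb();
            ptep->pte_high = 0;     /* ... then the upper PFN bits */
    }

    int main(void)
    {
            struct pae_pte pte = { .pte_low = 0x1e3, .pte_high = 0x12345 };
            pae_pte_clear(&pte);
            printf("%08x:%08x\n", pte.pte_high, pte.pte_low);
            return 0;
    }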
24063 | --- a/include/asm-x86/mach-xen/asm/pgtable_64.h | |
24064 | +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h | |
24065 | @@ -13,47 +13,26 @@ | |
24066 | #include <linux/threads.h> | |
24067 | #include <linux/sched.h> | |
24068 | #include <asm/pda.h> | |
24069 | -#ifdef CONFIG_XEN | |
24070 | -#include <asm/hypervisor.h> | |
24071 | ||
24072 | +#ifdef CONFIG_XEN | |
24073 | extern pud_t level3_user_pgt[512]; | |
24074 | ||
24075 | extern void xen_init_pt(void); | |
24076 | - | |
24077 | -extern pte_t *lookup_address(unsigned long address); | |
24078 | - | |
24079 | -#define virt_to_ptep(va) \ | |
24080 | -({ \ | |
24081 | - pte_t *__ptep = lookup_address((unsigned long)(va)); \ | |
24082 | - BUG_ON(!__ptep || !pte_present(*__ptep)); \ | |
24083 | - __ptep; \ | |
24084 | -}) | |
24085 | - | |
24086 | -#define arbitrary_virt_to_machine(va) \ | |
24087 | - (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \ | |
24088 | - | ((unsigned long)(va) & (PAGE_SIZE - 1))) | |
24089 | #endif | |
24090 | ||
24091 | extern pud_t level3_kernel_pgt[512]; | |
24092 | extern pud_t level3_ident_pgt[512]; | |
24093 | extern pmd_t level2_kernel_pgt[512]; | |
24094 | extern pgd_t init_level4_pgt[]; | |
24095 | -extern unsigned long __supported_pte_mask; | |
24096 | ||
24097 | #define swapper_pg_dir init_level4_pgt | |
24098 | ||
24099 | extern void paging_init(void); | |
24100 | -extern void clear_kernel_mapping(unsigned long addr, unsigned long size); | |
24101 | - | |
24102 | -/* | |
24103 | - * ZERO_PAGE is a global shared page that is always zero: used | |
24104 | - * for zero-mapped memory areas etc.. | |
24105 | - */ | |
24106 | -extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; | |
24107 | -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) | |
24108 | ||
24109 | #endif /* !__ASSEMBLY__ */ | |
24110 | ||
24111 | +#define SHARED_KERNEL_PMD 1 | |
24112 | + | |
24113 | /* | |
24114 | * PGDIR_SHIFT determines what a top-level page table entry can map | |
24115 | */ | |
24116 | @@ -96,31 +75,63 @@ extern unsigned long empty_zero_page[PAG | |
24117 | #define pgd_none(x) (!__pgd_val(x)) | |
24118 | #define pud_none(x) (!__pud_val(x)) | |
24119 | ||
24120 | -static inline void set_pte(pte_t *dst, pte_t val) | |
24121 | +struct mm_struct; | |
24122 | + | |
24123 | +#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0)) | |
24124 | + | |
24125 | +static inline void xen_set_pte(pte_t *ptep, pte_t pte) | |
24126 | +{ | |
24127 | + *ptep = pte; | |
24128 | +} | |
24129 | + | |
24130 | +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte) | |
24131 | +{ | |
24132 | + xen_set_pte(ptep, pte); | |
24133 | +} | |
24134 | + | |
24135 | +#ifdef CONFIG_SMP | |
24136 | +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret) | |
24137 | +{ | |
24138 | + return __pte_ma(xchg(&xp->pte, 0)); | |
24139 | +} | |
24140 | +#else | |
24141 | +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) | |
24142 | +#endif | |
24143 | + | |
24144 | +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) | |
24145 | { | |
24146 | - *dst = val; | |
24147 | + xen_l2_entry_update(pmdp, pmd); | |
24148 | } | |
24149 | ||
24150 | -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval)) | |
24151 | -#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval)) | |
24152 | -#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval)) | |
24153 | +static inline void xen_pmd_clear(pmd_t *pmd) | |
24154 | +{ | |
24155 | + xen_set_pmd(pmd, xen_make_pmd(0)); | |
24156 | +} | |
24157 | + | |
24158 | +static inline void xen_set_pud(pud_t *pudp, pud_t pud) | |
24159 | +{ | |
24160 | + xen_l3_entry_update(pudp, pud); | |
24161 | +} | |
24162 | ||
24163 | -static inline void pud_clear (pud_t * pud) | |
24164 | +static inline void xen_pud_clear(pud_t *pud) | |
24165 | { | |
24166 | - set_pud(pud, __pud(0)); | |
24167 | + xen_set_pud(pud, xen_make_pud(0)); | |
24168 | } | |
24169 | ||
24170 | #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) | |
24171 | ||
24172 | -static inline void pgd_clear (pgd_t * pgd) | |
24173 | +static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd) | |
24174 | { | |
24175 | - set_pgd(pgd, __pgd(0)); | |
24176 | - set_pgd(__user_pgd(pgd), __pgd(0)); | |
24177 | + xen_l4_entry_update(pgdp, pgd); | |
24178 | } | |
24179 | ||
24180 | -#define pte_same(a, b) ((a).pte == (b).pte) | |
24181 | +static inline void xen_pgd_clear(pgd_t * pgd) | |
24182 | +{ | |
24183 | + xen_set_pgd(pgd, xen_make_pgd(0)); | |
24184 | + xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0)); | |
24185 | +} | |
24186 | ||
24187 | -#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK)) | |
24188 | +#define pte_same(a, b) ((a).pte == (b).pte) | |
24189 | ||
24190 | #endif /* !__ASSEMBLY__ */ | |
24191 | ||
24192 | @@ -131,8 +142,6 @@ static inline void pgd_clear (pgd_t * pg | |
24193 | #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) | |
24194 | #define PGDIR_MASK (~(PGDIR_SIZE-1)) | |
24195 | ||
24196 | -#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) | |
24197 | -#define FIRST_USER_ADDRESS 0 | |
24198 | ||
24199 | #define MAXMEM _AC(0x3fffffffffff, UL) | |
24200 | #define VMALLOC_START _AC(0xffffc20000000000, UL) | |
24201 | @@ -142,105 +151,6 @@ static inline void pgd_clear (pgd_t * pg | |
24202 | #define MODULES_END _AC(0xfffffffffff00000, UL) | |
24203 | #define MODULES_LEN (MODULES_END - MODULES_VADDR) | |
24204 | ||
24205 | -#define _PAGE_BIT_PRESENT 0 | |
24206 | -#define _PAGE_BIT_RW 1 | |
24207 | -#define _PAGE_BIT_USER 2 | |
24208 | -#define _PAGE_BIT_PWT 3 | |
24209 | -#define _PAGE_BIT_PCD 4 | |
24210 | -#define _PAGE_BIT_ACCESSED 5 | |
24211 | -#define _PAGE_BIT_DIRTY 6 | |
24212 | -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ | |
24213 | -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ | |
24214 | -#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ | |
24215 | - | |
24216 | -#define _PAGE_PRESENT 0x001 | |
24217 | -#define _PAGE_RW 0x002 | |
24218 | -#define _PAGE_USER 0x004 | |
24219 | -#define _PAGE_PWT 0x008 | |
24220 | -#define _PAGE_PCD 0x010 | |
24221 | -#define _PAGE_ACCESSED 0x020 | |
24222 | -#define _PAGE_DIRTY 0x040 | |
24223 | -#define _PAGE_PSE 0x080 /* 2MB page */ | |
24224 | -#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ | |
24225 | -#define _PAGE_GLOBAL 0x100 /* Global TLB entry */ | |
24226 | - | |
24227 | -#define _PAGE_PROTNONE 0x080 /* If not present */ | |
24228 | -#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX) | |
24229 | - | |
24230 | -/* Mapped page is I/O or foreign and has no associated page struct. */ | |
24231 | -#define _PAGE_IO 0x200 | |
24232 | - | |
24233 | -#ifndef __ASSEMBLY__ | |
24234 | -#if CONFIG_XEN_COMPAT <= 0x030002 | |
24235 | -extern unsigned int __kernel_page_user; | |
24236 | -#else | |
24237 | -#define __kernel_page_user 0 | |
24238 | -#endif | |
24239 | -#endif | |
24240 | - | |
24241 | -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) | |
24242 | -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user) | |
24243 | - | |
24244 | -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO) | |
24245 | - | |
24246 | -#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) | |
24247 | -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
24248 | -#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) | |
24249 | -#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
24250 | -#define PAGE_COPY PAGE_COPY_NOEXEC | |
24251 | -#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
24252 | -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
24253 | -#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
24254 | -#define __PAGE_KERNEL \ | |
24255 | - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) | |
24256 | -#define __PAGE_KERNEL_EXEC \ | |
24257 | - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) | |
24258 | -#define __PAGE_KERNEL_NOCACHE \ | |
24259 | - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) | |
24260 | -#define __PAGE_KERNEL_RO \ | |
24261 | - (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) | |
24262 | -#define __PAGE_KERNEL_VSYSCALL \ | |
24263 | - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
24264 | -#define __PAGE_KERNEL_VSYSCALL_NOCACHE \ | |
24265 | - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD) | |
24266 | -#define __PAGE_KERNEL_LARGE \ | |
24267 | - (__PAGE_KERNEL | _PAGE_PSE) | |
24268 | -#define __PAGE_KERNEL_LARGE_EXEC \ | |
24269 | - (__PAGE_KERNEL_EXEC | _PAGE_PSE) | |
24270 | - | |
24271 | -/* | |
24272 | - * We don't support GLOBAL page in xenolinux64 | |
24273 | - */ | |
24274 | -#define MAKE_GLOBAL(x) __pgprot((x)) | |
24275 | - | |
24276 | -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL) | |
24277 | -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) | |
24278 | -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) | |
24279 | -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) | |
24280 | -#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL) | |
24281 | -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL) | |
24282 | -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE) | |
24283 | -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE) | |
24284 | - | |
24285 | -/* xwr */ | |
24286 | -#define __P000 PAGE_NONE | |
24287 | -#define __P001 PAGE_READONLY | |
24288 | -#define __P010 PAGE_COPY | |
24289 | -#define __P011 PAGE_COPY | |
24290 | -#define __P100 PAGE_READONLY_EXEC | |
24291 | -#define __P101 PAGE_READONLY_EXEC | |
24292 | -#define __P110 PAGE_COPY_EXEC | |
24293 | -#define __P111 PAGE_COPY_EXEC | |
24294 | - | |
24295 | -#define __S000 PAGE_NONE | |
24296 | -#define __S001 PAGE_READONLY | |
24297 | -#define __S010 PAGE_SHARED | |
24298 | -#define __S011 PAGE_SHARED | |
24299 | -#define __S100 PAGE_READONLY_EXEC | |
24300 | -#define __S101 PAGE_READONLY_EXEC | |
24301 | -#define __S110 PAGE_SHARED_EXEC | |
24302 | -#define __S111 PAGE_SHARED_EXEC | |
24303 | - | |
24304 | #ifndef __ASSEMBLY__ | |
24305 | ||
24306 | static inline unsigned long pgd_bad(pgd_t pgd) | |
24307 | @@ -258,119 +168,26 @@ static inline unsigned long pmd_bad(pmd_ | |
24308 | return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); | |
24309 | } | |
24310 | ||
24311 | -#define set_pte_at(_mm,addr,ptep,pteval) do { \ | |
24312 | - if (((_mm) != current->mm && (_mm) != &init_mm) || \ | |
24313 | - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \ | |
24314 | - set_pte((ptep), (pteval)); \ | |
24315 | -} while (0) | |
24316 | - | |
24317 | #define pte_none(x) (!(x).pte) | |
24318 | #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE)) | |
24319 | -#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) | |
24320 | ||
24321 | -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) | |
24322 | +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */ | |
24323 | ||
24324 | #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT) | |
24325 | #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \ | |
24326 | __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) | |
24327 | -#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \ | |
24328 | +#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \ | |
24329 | (_pte).pte & _PAGE_PRESENT ? \ | |
24330 | mfn_to_local_pfn(__pte_mfn(_pte)) : \ | |
24331 | __pte_mfn(_pte)) | |
24332 | ||
24333 | #define pte_page(x) pfn_to_page(pte_pfn(x)) | |
24334 | ||
24335 | -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) | |
24336 | -{ | |
24337 | - unsigned long pte = page_nr << PAGE_SHIFT; | |
24338 | - pte |= pgprot_val(pgprot); | |
24339 | - pte &= __supported_pte_mask; | |
24340 | - return __pte(pte); | |
24341 | -} | |
24342 | - | |
24343 | -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
24344 | -{ | |
24345 | - pte_t pte = *ptep; | |
24346 | - if (!pte_none(pte)) { | |
24347 | - if ((mm != &init_mm) || | |
24348 | - HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) | |
24349 | - pte = __pte_ma(xchg(&ptep->pte, 0)); | |
24350 | - } | |
24351 | - return pte; | |
24352 | -} | |
24353 | - | |
24354 | -static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) | |
24355 | -{ | |
24356 | - if (full) { | |
24357 | - pte_t pte = *ptep; | |
24358 | - if (PagePinned(virt_to_page(mm->pgd))) | |
24359 | - xen_l1_entry_update(ptep, __pte(0)); | |
24360 | - else | |
24361 | - *ptep = __pte(0); | |
24362 | - return pte; | |
24363 | - } | |
24364 | - return ptep_get_and_clear(mm, addr, ptep); | |
24365 | -} | |
24366 | - | |
24367 | -#define ptep_clear_flush(vma, addr, ptep) \ | |
24368 | -({ \ | |
24369 | - pte_t *__ptep = (ptep); \ | |
24370 | - pte_t __res = *__ptep; \ | |
24371 | - if (!pte_none(__res) && \ | |
24372 | - ((vma)->vm_mm != current->mm || \ | |
24373 | - HYPERVISOR_update_va_mapping(addr, __pte(0), \ | |
24374 | - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ | |
24375 | - UVMF_INVLPG|UVMF_MULTI))) { \ | |
24376 | - __ptep->pte = 0; \ | |
24377 | - flush_tlb_page(vma, addr); \ | |
24378 | - } \ | |
24379 | - __res; \ | |
24380 | -}) | |
24381 | - | |
24382 | -/* | |
24383 | - * The following only work if pte_present() is true. | |
24384 | - * Undefined behaviour if not.. | |
24385 | - */ | |
24386 | -#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) | |
24387 | -static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; } | |
24388 | -static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; } | |
24389 | -static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; } | |
24390 | -static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; } | |
24391 | -static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; } | |
24392 | - | |
24393 | -static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; } | |
24394 | -static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; } | |
24395 | -static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; } | |
24396 | -static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; } | |
24397 | -static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; } | |
24398 | -static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; } | |
24399 | -static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; } | |
24400 | -static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; } | |
24401 | -static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; } | |
24402 | - | |
24403 | -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) | |
24404 | -{ | |
24405 | - if (!pte_young(*ptep)) | |
24406 | - return 0; | |
24407 | - return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte); | |
24408 | -} | |
24409 | - | |
24410 | -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
24411 | -{ | |
24412 | - pte_t pte = *ptep; | |
24413 | - if (pte_write(pte)) | |
24414 | - set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); | |
24415 | -} | |
24416 | - | |
24417 | /* | |
24418 | * Macro to mark a page protection value as "uncacheable". | |
24419 | */ | |
24420 | #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) | |
24421 | ||
24422 | -static inline int pmd_large(pmd_t pte) { | |
24423 | - return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; | |
24424 | -} | |
24425 | - | |
24426 | ||
24427 | /* | |
24428 | * Conversion functions: convert a page and protection to a page entry, | |
24429 | @@ -386,6 +203,7 @@ static inline int pmd_large(pmd_t pte) { | |
24430 | #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr)) | |
24431 | #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address)) | |
24432 | #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT) | |
24433 | +static inline int pgd_large(pgd_t pgd) { return 0; } | |
24434 | #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) | |
24435 | ||
24436 | /* PUD - Level3 access */ | |
24437 | @@ -396,6 +214,12 @@ static inline int pmd_large(pmd_t pte) { | |
24438 | #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address)) | |
24439 | #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT) | |
24440 | ||
24441 | +static inline int pud_large(pud_t pte) | |
24442 | +{ | |
24443 | + return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) == | |
24444 | + (_PAGE_PSE|_PAGE_PRESENT); | |
24445 | +} | |
24446 | + | |
24447 | /* PMD - Level 2 access */ | |
24448 | #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK)) | |
24449 | #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) | |
24450 | @@ -411,36 +235,18 @@ static inline int pmd_large(pmd_t pte) { | |
24451 | #else | |
24452 | #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT) | |
24453 | #endif | |
24454 | -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) | |
24455 | #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot))) | |
24456 | #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT) | |
24457 | ||
24458 | #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) | |
24459 | -#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE }) | |
24460 | +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE }) | |
24461 | #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT | |
24462 | ||
24463 | /* PTE - Level 1 access. */ | |
24464 | ||
24465 | /* page, protection -> pte */ | |
24466 | #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) | |
24467 | -#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE) | |
24468 | ||
24469 | -/* Change flags of a PTE */ | |
24470 | -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) | |
24471 | -{ | |
24472 | - /* | |
24473 | - * Since this might change the present bit (which controls whether | |
24474 | - * a pte_t object has undergone p2m translation), we must use | |
24475 | - * pte_val() on the input pte and __pte() for the return value. | |
24476 | - */ | |
24477 | - unsigned long pteval = pte_val(pte); | |
24478 | - | |
24479 | - pteval &= _PAGE_CHG_MASK; | |
24480 | - pteval |= pgprot_val(newprot); | |
24481 | - pteval &= __supported_pte_mask; | |
24482 | - return __pte(pteval); | |
24483 | -} | |
24484 | - | |
24485 | #define pte_index(address) \ | |
24486 | (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) | |
24487 | #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \ | |
24488 | @@ -454,101 +260,21 @@ static inline pte_t pte_modify(pte_t pte | |
24489 | ||
24490 | #define update_mmu_cache(vma,address,pte) do { } while (0) | |
24491 | ||
24492 | -/* | |
24493 | - * Rules for using ptep_establish: the pte MUST be a user pte, and | |
24494 | - * must be a present->present transition. | |
24495 | - */ | |
24496 | -#define __HAVE_ARCH_PTEP_ESTABLISH | |
24497 | -#define ptep_establish(vma, address, ptep, pteval) \ | |
24498 | - do { \ | |
24499 | - if ( likely((vma)->vm_mm == current->mm) ) { \ | |
24500 | - BUG_ON(HYPERVISOR_update_va_mapping(address, \ | |
24501 | - pteval, \ | |
24502 | - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ | |
24503 | - UVMF_INVLPG|UVMF_MULTI)); \ | |
24504 | - } else { \ | |
24505 | - xen_l1_entry_update(ptep, pteval); \ | |
24506 | - flush_tlb_page(vma, address); \ | |
24507 | - } \ | |
24508 | - } while (0) | |
24509 | - | |
24510 | -/* We only update the dirty/accessed state if we set | |
24511 | - * the dirty bit by hand in the kernel, since the hardware | |
24512 | - * will do the accessed bit for us, and we don't want to | |
24513 | - * race with other CPU's that might be updating the dirty | |
24514 | - * bit at the same time. */ | |
24515 | -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | |
24516 | -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ | |
24517 | -({ \ | |
24518 | - int __changed = !pte_same(*(ptep), entry); \ | |
24519 | - if (__changed && (dirty)) \ | |
24520 | - ptep_establish(vma, address, ptep, entry); \ | |
24521 | - __changed; \ | |
24522 | -}) | |
24523 | - | |
24524 | -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH | |
24525 | -#define ptep_clear_flush_young(vma, address, ptep) \ | |
24526 | -({ \ | |
24527 | - pte_t __pte = *(ptep); \ | |
24528 | - int __young = pte_young(__pte); \ | |
24529 | - __pte = pte_mkold(__pte); \ | |
24530 | - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \ | |
24531 | - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \ | |
24532 | - else if (__young) \ | |
24533 | - set_pte(ptep, __pte); \ | |
24534 | - __young; \ | |
24535 | -}) | |
24536 | - | |
24537 | /* Encode and de-code a swap entry */ | |
24538 | #define __swp_type(x) (((x).val >> 1) & 0x3f) | |
24539 | #define __swp_offset(x) ((x).val >> 8) | |
24540 | #define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) | |
24541 | #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) }) | |
24542 | -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) | |
24543 | - | |
24544 | -extern spinlock_t pgd_lock; | |
24545 | -extern struct list_head pgd_list; | |
24546 | +#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) | |
24547 | ||
24548 | extern int kern_addr_valid(unsigned long addr); | |
24549 | - | |
24550 | -#define DOMID_LOCAL (0xFFFFU) | |
24551 | - | |
24552 | -struct vm_area_struct; | |
24553 | - | |
24554 | -int direct_remap_pfn_range(struct vm_area_struct *vma, | |
24555 | - unsigned long address, | |
24556 | - unsigned long mfn, | |
24557 | - unsigned long size, | |
24558 | - pgprot_t prot, | |
24559 | - domid_t domid); | |
24560 | - | |
24561 | -int direct_kernel_remap_pfn_range(unsigned long address, | |
24562 | - unsigned long mfn, | |
24563 | - unsigned long size, | |
24564 | - pgprot_t prot, | |
24565 | - domid_t domid); | |
24566 | - | |
24567 | -int create_lookup_pte_addr(struct mm_struct *mm, | |
24568 | - unsigned long address, | |
24569 | - uint64_t *ptep); | |
24570 | - | |
24571 | -int touch_pte_range(struct mm_struct *mm, | |
24572 | - unsigned long address, | |
24573 | - unsigned long size); | |
24574 | - | |
24575 | -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |
24576 | - unsigned long addr, unsigned long end, pgprot_t newprot, | |
24577 | - int dirty_accountable); | |
24578 | - | |
24579 | -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \ | |
24580 | - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) | |
24581 | - | |
24582 | -pte_t *lookup_address(unsigned long addr); | |
24583 | +extern void cleanup_highmap(void); | |
24584 | ||
24585 | #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ | |
24586 | direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO) | |
24587 | ||
24588 | #define HAVE_ARCH_UNMAPPED_AREA | |
24589 | +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN | |
24590 | ||
24591 | #define pgtable_cache_init() do { } while (0) | |
24592 | #define check_pgt_cache() do { } while (0) | |
24593 | @@ -561,13 +287,7 @@ pte_t *lookup_address(unsigned long addr | |
24594 | #define kc_offset_to_vaddr(o) \ | |
24595 | (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o)) | |
24596 | ||
24597 | -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG | |
24598 | -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR | |
24599 | -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL | |
24600 | -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH | |
24601 | -#define __HAVE_ARCH_PTEP_SET_WRPROTECT | |
24602 | #define __HAVE_ARCH_PTE_SAME | |
24603 | -#include <asm-generic/pgtable.h> | |
24604 | #endif /* !__ASSEMBLY__ */ | |
24605 | ||
24606 | #endif /* _X86_64_PGTABLE_H */ | |
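The swap-entry encoding above keeps bit 0 (_PAGE_PRESENT) and bit 7 (_PAGE_PROTNONE) clear so a swapped-out pte can never read as present: the swap type lives in bits 1-6 and the offset starts at bit 8. A quick user-space round-trip check of those three macros, copied from the hunk above:

    #include <assert.h>
    #include <stdio.h>

    typedef struct { unsigned long val; } swp_entry_t;

    #define __swp_type(x)             (((x).val >> 1) & 0x3f)
    #define __swp_offset(x)           ((x).val >> 8)
    #define __swp_entry(type, offset) \
            ((swp_entry_t){ ((type) << 1) | ((offset) << 8) })

    int main(void)
    {
            swp_entry_t e = __swp_entry(5UL, 0x1234UL);

            /* 5 -> bits 1-6, 0x1234 -> bits 8+, giving val 0x12340a */
            assert(__swp_type(e) == 5 && __swp_offset(e) == 0x1234);
            printf("val=%#lx type=%lu offset=%#lx\n",
                   e.val, __swp_type(e), __swp_offset(e));
            return 0;
    }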
24607 | --- a/include/asm-x86/mach-xen/asm/pgtable.h | |
24608 | +++ b/include/asm-x86/mach-xen/asm/pgtable.h | |
24609 | @@ -1,5 +1,454 @@ | |
24610 | +#ifndef _ASM_X86_PGTABLE_H | |
24611 | +#define _ASM_X86_PGTABLE_H | |
24612 | + | |
24613 | +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) | |
24614 | +#define FIRST_USER_ADDRESS 0 | |
24615 | + | |
24616 | +#define _PAGE_BIT_PRESENT 0 | |
24617 | +#define _PAGE_BIT_RW 1 | |
24618 | +#define _PAGE_BIT_USER 2 | |
24619 | +#define _PAGE_BIT_PWT 3 | |
24620 | +#define _PAGE_BIT_PCD 4 | |
24621 | +#define _PAGE_BIT_ACCESSED 5 | |
24622 | +#define _PAGE_BIT_DIRTY 6 | |
24623 | +#define _PAGE_BIT_FILE 6 | |
24624 | +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ | |
24625 | +#define _PAGE_BIT_PAT 7 /* on 4KB pages */ | |
24626 | +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ | |
24627 | +#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and | |
24628 | + * has no associated page struct. */ | |
24629 | +#define _PAGE_BIT_UNUSED2 10 /* available for programmer */ | |
24630 | +#define _PAGE_BIT_UNUSED3 11 | |
24631 | +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ | |
24632 | +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ | |
24633 | + | |
24634 | +/* | |
24635 | + * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a | |
24636 | + * sign-extended value on 32-bit with all 1's in the upper word, | |
24637 | + * which preserves the upper pte values on 64-bit ptes: | |
24638 | + */ | |
24639 | +#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT) | |
24640 | +#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW) | |
24641 | +#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER) | |
24642 | +#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT) | |
24643 | +#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD) | |
24644 | +#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED) | |
24645 | +#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY) | |
24646 | +#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */ | |
24647 | +#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */ | |
24648 | +#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO) | |
24649 | +#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2) | |
24650 | +#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3) | |
24651 | +#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT) | |
24652 | +#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE) | |
24653 | + | |
24654 | +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | |
24655 | +#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX) | |
24656 | +#else | |
24657 | +#define _PAGE_NX 0 | |
24658 | +#endif | |
24659 | + | |
24660 | +/* If _PAGE_PRESENT is clear, we use these: */ | |
24661 | +#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */ | |
24662 | +#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE; | |
24663 | + pte_present gives true */ | |
24664 | + | |
24665 | +#ifndef __ASSEMBLY__ | |
24666 | +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002 | |
24667 | +extern unsigned int __kernel_page_user; | |
24668 | +#else | |
24669 | +#define __kernel_page_user 0 | |
24670 | +#endif | |
24671 | +#endif | |
24672 | + | |
24673 | +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) | |
24674 | +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user) | |
24675 | + | |
24676 | +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO) | |
24677 | + | |
24678 | +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) | |
24679 | +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
24680 | + | |
24681 | +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) | |
24682 | +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
24683 | +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
24684 | +#define PAGE_COPY PAGE_COPY_NOEXEC | |
24685 | +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
24686 | +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
24687 | + | |
24688 | +#ifdef CONFIG_X86_32 | |
24689 | +#define _PAGE_KERNEL_EXEC \ | |
24690 | + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) | |
24691 | +#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX) | |
24692 | + | |
24693 | +#ifndef __ASSEMBLY__ | |
24694 | +extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC; | |
24695 | +#endif /* __ASSEMBLY__ */ | |
24696 | +#else | |
24697 | +#define __PAGE_KERNEL_EXEC \ | |
24698 | + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) | |
24699 | +#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) | |
24700 | +#endif | |
24701 | + | |
24702 | +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) | |
24703 | +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) | |
24704 | +#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) | |
24705 | +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) | |
24706 | +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) | |
24707 | +#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) | |
24708 | +#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) | |
24709 | +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) | |
24710 | +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) | |
24711 | + | |
24712 | +/* | |
24713 | + * We don't support GLOBAL page in xenolinux64 | |
24714 | + */ | |
24715 | +#define MAKE_GLOBAL(x) __pgprot((x)) | |
24716 | + | |
24717 | +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL) | |
24718 | +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) | |
24719 | +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) | |
24720 | +#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX) | |
24721 | +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) | |
24722 | +#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS) | |
24723 | +#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE) | |
24724 | +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE) | |
24725 | +#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC) | |
24726 | +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL) | |
24727 | +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE) | |
24728 | + | |
24729 | +/* xwr */ | |
24730 | +#define __P000 PAGE_NONE | |
24731 | +#define __P001 PAGE_READONLY | |
24732 | +#define __P010 PAGE_COPY | |
24733 | +#define __P011 PAGE_COPY | |
24734 | +#define __P100 PAGE_READONLY_EXEC | |
24735 | +#define __P101 PAGE_READONLY_EXEC | |
24736 | +#define __P110 PAGE_COPY_EXEC | |
24737 | +#define __P111 PAGE_COPY_EXEC | |
24738 | + | |
24739 | +#define __S000 PAGE_NONE | |
24740 | +#define __S001 PAGE_READONLY | |
24741 | +#define __S010 PAGE_SHARED | |
24742 | +#define __S011 PAGE_SHARED | |
24743 | +#define __S100 PAGE_READONLY_EXEC | |
24744 | +#define __S101 PAGE_READONLY_EXEC | |
24745 | +#define __S110 PAGE_SHARED_EXEC | |
24746 | +#define __S111 PAGE_SHARED_EXEC | |
24747 | + | |
24748 | +#ifndef __ASSEMBLY__ | |
24749 | + | |
24750 | +/* | |
24751 | + * ZERO_PAGE is a global shared page that is always zero: used | |
24752 | + * for zero-mapped memory areas etc.. | |
24753 | + */ | |
24754 | +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; | |
24755 | +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) | |
24756 | + | |
24757 | +extern spinlock_t pgd_lock; | |
24758 | +extern struct list_head pgd_list; | |
24759 | + | |
24760 | +/* | |
24761 | + * The following only work if pte_present() is true. | |
24762 | + * Undefined behaviour if not.. | |
24763 | + */ | |
24764 | +static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; } | |
24765 | +static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; } | |
24766 | +static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; } | |
24767 | +static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; } | |
24768 | +static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; } | |
24769 | +static inline int pte_global(pte_t pte) { return 0; } | |
24770 | +static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); } | |
24771 | + | |
24772 | +static inline int pmd_large(pmd_t pte) { | |
24773 | + return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) == | |
24774 | + (_PAGE_PSE|_PAGE_PRESENT); | |
24775 | +} | |
24776 | + | |
24777 | +static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); } | |
24778 | +static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); } | |
24779 | +static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); } | |
24780 | +static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); } | |
24781 | +static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); } | |
24782 | +static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); } | |
24783 | +static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); } | |
24784 | +static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); } | |
24785 | +static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); } | |
24786 | +static inline pte_t pte_mkglobal(pte_t pte) { return pte; } | |
24787 | +static inline pte_t pte_clrglobal(pte_t pte) { return pte; } | |
24788 | + | |
24789 | +extern pteval_t __supported_pte_mask; | |
24790 | + | |
24791 | +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) | |
24792 | +{ | |
24793 | + return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) | | |
24794 | + pgprot_val(pgprot)) & __supported_pte_mask); | |
24795 | +} | |
24796 | + | |
24797 | +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) | |
24798 | +{ | |
24799 | + return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) | | |
24800 | + pgprot_val(pgprot)) & __supported_pte_mask); | |
24801 | +} | |
24802 | + | |
24803 | +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) | |
24804 | +{ | |
24805 | + return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) | | |
24806 | + pgprot_val(pgprot)) & __supported_pte_mask); | |
24807 | +} | |
24808 | + | |
24809 | +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) | |
24810 | +{ | |
24811 | + pteval_t val = pte_val(pte); | |
24812 | + | |
24813 | + val &= _PAGE_CHG_MASK; | |
24814 | + val |= pgprot_val(newprot) & __supported_pte_mask; | |
24815 | + | |
24816 | + return __pte(val); | |
24817 | +} | |
24818 | + | |
24819 | +#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX)) | |
24820 | + | |
24821 | +#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) | |
24822 | + | |
24823 | +#define set_pte(ptep, pte) xen_set_pte(ptep, pte) | |
24824 | +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) | |
24825 | + | |
24826 | +#define set_pte_atomic(ptep, pte) \ | |
24827 | + xen_set_pte_atomic(ptep, pte) | |
24828 | + | |
24829 | +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) | |
24830 | + | |
24831 | +#ifndef __PAGETABLE_PUD_FOLDED | |
24832 | +#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd) | |
24833 | +#define pgd_clear(pgd) xen_pgd_clear(pgd) | |
24834 | +#endif | |
24835 | + | |
24836 | +#ifndef set_pud | |
24837 | +# define set_pud(pudp, pud) xen_set_pud(pudp, pud) | |
24838 | +#endif | |
24839 | + | |
24840 | +#ifndef __PAGETABLE_PMD_FOLDED | |
24841 | +#define pud_clear(pud) xen_pud_clear(pud) | |
24842 | +#endif | |
24843 | + | |
24844 | +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) | |
24845 | +#define pmd_clear(pmd) xen_pmd_clear(pmd) | |
24846 | + | |
24847 | +#define pte_update(mm, addr, ptep) do { } while (0) | |
24848 | +#define pte_update_defer(mm, addr, ptep) do { } while (0) | |
24849 | + | |
24850 | +#endif /* __ASSEMBLY__ */ | |
24851 | + | |
24852 | #ifdef CONFIG_X86_32 | |
24853 | # include "pgtable_32.h" | |
24854 | #else | |
24855 | # include "pgtable_64.h" | |
24856 | #endif | |
24857 | + | |
24858 | +#ifndef __ASSEMBLY__ | |
24859 | + | |
24860 | +enum { | |
24861 | + PG_LEVEL_NONE, | |
24862 | + PG_LEVEL_4K, | |
24863 | + PG_LEVEL_2M, | |
24864 | + PG_LEVEL_1G, | |
24865 | +}; | |
24866 | + | |
24867 | +/* | |
24868 | + * Helper function that returns the kernel pagetable entry controlling | |
24869 | + * the virtual address 'address'. NULL means no pagetable entry present. | |
24870 | + * NOTE: the return type is pte_t but if the pmd is PSE then we return it | |
24871 | + * as a pte too. | |
24872 | + */ | |
24873 | +extern pte_t *lookup_address(unsigned long address, unsigned int *level); | |
24874 | + | |
24875 | +/* local pte updates need not use xchg for locking */ | |
24876 | +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) | |
24877 | +{ | |
24878 | + xen_set_pte(ptep, __pte(0)); | |
24879 | + return res; | |
24880 | +} | |
24881 | + | |
24882 | +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | |
24883 | + pte_t *ptep, pte_t pte) | |
24884 | +{ | |
24885 | + if ((mm != current->mm && mm != &init_mm) || | |
24886 | + HYPERVISOR_update_va_mapping(addr, pte, 0)) | |
24887 | + xen_set_pte(ptep, pte); | |
24888 | +} | |
24889 | + | |
24890 | +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, | |
24891 | + pte_t *ptep) | |
24892 | +{ | |
24893 | + if ((mm != current->mm && mm != &init_mm) | |
24894 | + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) | |
24895 | + __xen_pte_clear(ptep); | |
24896 | +} | |
24897 | + | |
24898 | +#ifndef CONFIG_PARAVIRT | |
24899 | +/* | |
24900 | + * Rules for using pte_update - it must be called after any PTE update which | |
24901 | + * has not been done using the set_pte / clear_pte interfaces. It is used by | |
24902 | + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE | |
24903 | + * updates should either be sets, clears, or set_pte_atomic for P->P | |
24904 | + * transitions, which means this hook should only be called for user PTEs. | |
24905 | + * This hook implies a P->P protection or access change has taken place, which | |
24906 | + * requires a subsequent TLB flush. The notification can optionally be delayed | |
24907 | + * until the TLB flush event by using the pte_update_defer form of the | |
24908 | + * interface, but care must be taken to assure that the flush happens while | |
24909 | + * still holding the same page table lock so that the shadow and primary pages | |
24910 | + * do not become out of sync on SMP. | |
24911 | + */ | |
24912 | +#define pte_update(mm, addr, ptep) do { } while (0) | |
24913 | +#define pte_update_defer(mm, addr, ptep) do { } while (0) | |
24914 | +#endif | |
24915 | + | |
24916 | +/* | |
24917 | + * We only update the dirty/accessed state if we set | |
24918 | + * the dirty bit by hand in the kernel, since the hardware | |
24919 | + * will do the accessed bit for us, and we don't want to | |
24920 | + * race with other CPU's that might be updating the dirty | |
24921 | + * bit at the same time. | |
24922 | + */ | |
24923 | +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | |
24924 | +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ | |
24925 | +({ \ | |
24926 | + int __changed = !pte_same(*(ptep), entry); \ | |
24927 | + if (__changed && (dirty)) { \ | |
24928 | + if ( likely((vma)->vm_mm == current->mm) ) { \ | |
24929 | + BUG_ON(HYPERVISOR_update_va_mapping(address, \ | |
24930 | + entry, \ | |
24931 | + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ | |
24932 | + UVMF_INVLPG|UVMF_MULTI)); \ | |
24933 | + } else { \ | |
24934 | + xen_l1_entry_update(ptep, entry); \ | |
24935 | + flush_tlb_page(vma, address); \ | |
24936 | + } \ | |
24937 | + } \ | |
24938 | + __changed; \ | |
24939 | +}) | |
24940 | + | |
24941 | +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG | |
24942 | +#define ptep_test_and_clear_young(vma, addr, ptep) ({ \ | |
24943 | + int __ret = 0; \ | |
24944 | + if (pte_young(*(ptep))) \ | |
24945 | + __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \ | |
24946 | + &(ptep)->pte); \ | |
24947 | + if (__ret) \ | |
24948 | + pte_update((vma)->vm_mm, addr, ptep); \ | |
24949 | + __ret; \ | |
24950 | +}) | |
24951 | + | |
24952 | +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH | |
24953 | +#define ptep_clear_flush_young(vma, address, ptep) \ | |
24954 | +({ \ | |
24955 | + pte_t __pte = *(ptep); \ | |
24956 | + int __young = pte_young(__pte); \ | |
24957 | + __pte = pte_mkold(__pte); \ | |
24958 | + if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \ | |
24959 | + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \ | |
24960 | + else if (__young) \ | |
24961 | + (ptep)->pte_low = __pte.pte_low; \ | |
24962 | + __young; \ | |
24963 | +}) | |
24964 | + | |
24965 | +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH | |
24966 | +#define ptep_clear_flush(vma, addr, ptep) \ | |
24967 | +({ \ | |
24968 | + pte_t *__ptep = (ptep); \ | |
24969 | + pte_t __res = *__ptep; \ | |
24970 | + if (!pte_none(__res) && \ | |
24971 | + ((vma)->vm_mm != current->mm || \ | |
24972 | + HYPERVISOR_update_va_mapping(addr, __pte(0), \ | |
24973 | + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ | |
24974 | + UVMF_INVLPG|UVMF_MULTI))) { \ | |
24975 | + __xen_pte_clear(__ptep); \ | |
24976 | + flush_tlb_page(vma, addr); \ | |
24977 | + } \ | |
24978 | + __res; \ | |
24979 | +}) | |
24980 | + | |
24981 | +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR | |
24982 | +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
24983 | +{ | |
24984 | + pte_t pte = *ptep; | |
24985 | + if (!pte_none(pte) | |
24986 | + && (mm != &init_mm | |
24987 | + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) { | |
24988 | + pte = xen_ptep_get_and_clear(ptep, pte); | |
24989 | + pte_update(mm, addr, ptep); | |
24990 | + } | |
24991 | + return pte; | |
24992 | +} | |
24993 | + | |
24994 | +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL | |
24995 | +#define ptep_get_and_clear_full(mm, addr, ptep, full) \ | |
24996 | + ((full) ? ({ \ | |
24997 | + pte_t *__ptep = (ptep); \ | |
24998 | + pte_t __res = *__ptep; \ | |
24999 | + if (!PagePinned(virt_to_page((mm)->pgd))) \ | |
25000 | + __xen_pte_clear(__ptep); \ | |
25001 | + else if (!pte_none(__res)) \ | |
25002 | + xen_l1_entry_update(__ptep, __pte(0)); \ | |
25003 | + __res; \ | |
25004 | + }) : \ | |
25005 | + ptep_get_and_clear(mm, addr, ptep)) | |
25006 | + | |
25007 | +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int); | |
25008 | + | |
25009 | +#define __HAVE_ARCH_PTEP_SET_WRPROTECT | |
25010 | +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
25011 | +{ | |
25012 | + pte_t pte = *ptep; | |
25013 | + if (pte_write(pte)) | |
25014 | + set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); | |
25015 | +} | |
25016 | + | |
25017 | +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \ | |
25018 | + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) | |
25019 | + | |
25020 | +#define arbitrary_virt_to_machine(va) \ | |
25021 | +({ \ | |
25022 | + unsigned int __lvl; \ | |
25023 | + pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \ | |
25024 | + BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\ | |
25025 | + (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \ | |
25026 | + | ((unsigned long)(va) & (PAGE_SIZE - 1))); \ | |
25027 | +}) | |
25028 | + | |
25029 | +#include <asm-generic/pgtable.h> | |
25030 | + | |
25031 | +#include <xen/features.h> | |
25032 | +void make_page_readonly(void *va, unsigned int feature); | |
25033 | +void make_page_writable(void *va, unsigned int feature); | |
25034 | +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); | |
25035 | +void make_pages_writable(void *va, unsigned int nr, unsigned int feature); | |
25036 | + | |
25037 | +struct vm_area_struct; | |
25038 | + | |
25039 | +int direct_remap_pfn_range(struct vm_area_struct *vma, | |
25040 | + unsigned long address, | |
25041 | + unsigned long mfn, | |
25042 | + unsigned long size, | |
25043 | + pgprot_t prot, | |
25044 | + domid_t domid); | |
25045 | +int direct_kernel_remap_pfn_range(unsigned long address, | |
25046 | + unsigned long mfn, | |
25047 | + unsigned long size, | |
25048 | + pgprot_t prot, | |
25049 | + domid_t domid); | |
25050 | +int create_lookup_pte_addr(struct mm_struct *mm, | |
25051 | + unsigned long address, | |
25052 | + uint64_t *ptep); | |
25053 | +int touch_pte_range(struct mm_struct *mm, | |
25054 | + unsigned long address, | |
25055 | + unsigned long size); | |
25056 | + | |
25057 | +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |
25058 | + unsigned long addr, unsigned long end, pgprot_t newprot, | |
25059 | + int dirty_accountable); | |
25060 | + | |
25061 | +#endif /* __ASSEMBLY__ */ | |
25062 | + | |
25063 | +#endif /* _ASM_X86_PGTABLE_H */ | |
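Of the declarations above, direct_remap_pfn_range() is the Xen counterpart of remap_pfn_range(): it takes machine frames rather than pseudo-physical ones, plus the id of the domain that owns them. A hedged sketch of how a driver's mmap handler might call it — example_mmap and the starting mfn are invented for illustration, and DOMID_SELF (from xen/interface/xen.h) restricts the mapping to frames owned by the calling domain:

	static int example_mmap(struct file *file, struct vm_area_struct *vma)
	{
		unsigned long mfn = 0x10000;	/* hypothetical starting machine frame */
		unsigned long size = vma->vm_end - vma->vm_start;

		/* Map 'size' bytes of machine memory starting at 'mfn' into
		 * the caller's range, keeping the vma's protections. */
		return direct_remap_pfn_range(vma, vma->vm_start, mfn, size,
					      vma->vm_page_prot, DOMID_SELF);
	}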
25064 | --- a/include/asm-x86/mach-xen/asm/processor_32.h | |
25065 | +++ /dev/null | |
25066 | @@ -1,751 +0,0 @@ | |
25067 | -/* | |
25068 | - * include/asm-i386/processor.h | |
25069 | - * | |
25070 | - * Copyright (C) 1994 Linus Torvalds | |
25071 | - */ | |
25072 | - | |
25073 | -#ifndef __ASM_I386_PROCESSOR_H | |
25074 | -#define __ASM_I386_PROCESSOR_H | |
25075 | - | |
25076 | -#include <asm/vm86.h> | |
25077 | -#include <asm/math_emu.h> | |
25078 | -#include <asm/segment.h> | |
25079 | -#include <asm/page.h> | |
25080 | -#include <asm/types.h> | |
25081 | -#include <asm/sigcontext.h> | |
25082 | -#include <asm/cpufeature.h> | |
25083 | -#include <asm/msr.h> | |
25084 | -#include <asm/system.h> | |
25085 | -#include <linux/cache.h> | |
25086 | -#include <linux/threads.h> | |
25087 | -#include <asm/percpu.h> | |
25088 | -#include <linux/cpumask.h> | |
25089 | -#include <linux/init.h> | |
25090 | -#include <asm/processor-flags.h> | |
25091 | -#include <xen/interface/physdev.h> | |
25092 | - | |
25093 | -/* flag for disabling the tsc */ | |
25094 | -#define tsc_disable 0 | |
25095 | - | |
25096 | -struct desc_struct { | |
25097 | - unsigned long a,b; | |
25098 | -}; | |
25099 | - | |
25100 | -#define desc_empty(desc) \ | |
25101 | - (!((desc)->a | (desc)->b)) | |
25102 | - | |
25103 | -#define desc_equal(desc1, desc2) \ | |
25104 | - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b)) | |
25105 | -/* | |
25106 | - * Default implementation of macro that returns current | |
25107 | - * instruction pointer ("program counter"). | |
25108 | - */ | |
25109 | -#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; }) | |
25110 | - | |
25111 | -/* | |
25112 | - * CPU type and hardware bug flags. Kept separately for each CPU. | |
25113 | - * Members of this structure are referenced in head.S, so think twice | |
25114 | - * before touching them. [mj] | |
25115 | - */ | |
25116 | - | |
25117 | -struct cpuinfo_x86 { | |
25118 | - __u8 x86; /* CPU family */ | |
25119 | - __u8 x86_vendor; /* CPU vendor */ | |
25120 | - __u8 x86_model; | |
25121 | - __u8 x86_mask; | |
25122 | - char wp_works_ok; /* It doesn't on 386's */ | |
25123 | - char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */ | |
25124 | - char hard_math; | |
25125 | - char rfu; | |
25126 | - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ | |
25127 | - unsigned long x86_capability[NCAPINTS]; | |
25128 | - char x86_vendor_id[16]; | |
25129 | - char x86_model_id[64]; | |
25130 | - int x86_cache_size; /* in KB - valid for CPUS which support this | |
25131 | - call */ | |
25132 | - int x86_cache_alignment; /* In bytes */ | |
25133 | - char fdiv_bug; | |
25134 | - char f00f_bug; | |
25135 | - char coma_bug; | |
25136 | - char pad0; | |
25137 | - int x86_power; | |
25138 | - unsigned long loops_per_jiffy; | |
25139 | -#ifdef CONFIG_SMP | |
25140 | - cpumask_t llc_shared_map; /* cpus sharing the last level cache */ | |
25141 | -#endif | |
25142 | - unsigned char x86_max_cores; /* cpuid returned max cores value */ | |
25143 | - unsigned char apicid; | |
25144 | - unsigned short x86_clflush_size; | |
25145 | -#ifdef CONFIG_SMP | |
25146 | - unsigned char booted_cores; /* number of cores as seen by OS */ | |
25147 | - __u8 phys_proc_id; /* Physical processor id. */ | |
25148 | - __u8 cpu_core_id; /* Core id */ | |
25149 | - __u8 cpu_index; /* index into per_cpu list */ | |
25150 | -#endif | |
25151 | -} __attribute__((__aligned__(SMP_CACHE_BYTES))); | |
25152 | - | |
25153 | -#define X86_VENDOR_INTEL 0 | |
25154 | -#define X86_VENDOR_CYRIX 1 | |
25155 | -#define X86_VENDOR_AMD 2 | |
25156 | -#define X86_VENDOR_UMC 3 | |
25157 | -#define X86_VENDOR_NEXGEN 4 | |
25158 | -#define X86_VENDOR_CENTAUR 5 | |
25159 | -#define X86_VENDOR_TRANSMETA 7 | |
25160 | -#define X86_VENDOR_NSC 8 | |
25161 | -#define X86_VENDOR_NUM 9 | |
25162 | -#define X86_VENDOR_UNKNOWN 0xff | |
25163 | - | |
25164 | -/* | |
25165 | - * capabilities of CPUs | |
25166 | - */ | |
25167 | - | |
25168 | -extern struct cpuinfo_x86 boot_cpu_data; | |
25169 | -extern struct cpuinfo_x86 new_cpu_data; | |
25170 | -#ifndef CONFIG_X86_NO_TSS | |
25171 | -extern struct tss_struct doublefault_tss; | |
25172 | -DECLARE_PER_CPU(struct tss_struct, init_tss); | |
25173 | -#endif | |
25174 | - | |
25175 | -#ifdef CONFIG_SMP | |
25176 | -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); | |
25177 | -#define cpu_data(cpu) per_cpu(cpu_info, cpu) | |
25178 | -#define current_cpu_data cpu_data(smp_processor_id()) | |
25179 | -#else | |
25180 | -#define cpu_data(cpu) boot_cpu_data | |
25181 | -#define current_cpu_data boot_cpu_data | |
25182 | -#endif | |
25183 | - | |
25184 | -/* | |
25185 | - * the following now lives in the per cpu area: | |
25186 | - * extern int cpu_llc_id[NR_CPUS]; | |
25187 | - */ | |
25188 | -DECLARE_PER_CPU(u8, cpu_llc_id); | |
25189 | -extern char ignore_fpu_irq; | |
25190 | - | |
25191 | -void __init cpu_detect(struct cpuinfo_x86 *c); | |
25192 | - | |
25193 | -extern void identify_boot_cpu(void); | |
25194 | -extern void identify_secondary_cpu(struct cpuinfo_x86 *); | |
25195 | -extern void print_cpu_info(struct cpuinfo_x86 *); | |
25196 | -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); | |
25197 | -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); | |
25198 | -extern unsigned short num_cache_leaves; | |
25199 | - | |
25200 | -#ifdef CONFIG_X86_HT | |
25201 | -extern void detect_ht(struct cpuinfo_x86 *c); | |
25202 | -#else | |
25203 | -static inline void detect_ht(struct cpuinfo_x86 *c) {} | |
25204 | -#endif | |
25205 | - | |
25206 | -static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx, | |
25207 | - unsigned int *ecx, unsigned int *edx) | |
25208 | -{ | |
25209 | - /* ecx is often an input as well as an output. */ | |
25210 | - __asm__(XEN_CPUID | |
25211 | - : "=a" (*eax), | |
25212 | - "=b" (*ebx), | |
25213 | - "=c" (*ecx), | |
25214 | - "=d" (*edx) | |
25215 | - : "0" (*eax), "2" (*ecx)); | |
25216 | -} | |
25217 | - | |
25218 | -#define load_cr3(pgdir) write_cr3(__pa(pgdir)) | |
25219 | - | |
25220 | -/* | |
25221 | - * Save the cr4 feature set we're using (ie | |
25222 | - * Pentium 4MB enable and PPro Global page | |
25223 | - * enable), so that any CPU's that boot up | |
25224 | - * after us can get the correct flags. | |
25225 | - */ | |
25226 | -extern unsigned long mmu_cr4_features; | |
25227 | - | |
25228 | -static inline void set_in_cr4 (unsigned long mask) | |
25229 | -{ | |
25230 | - unsigned cr4; | |
25231 | - mmu_cr4_features |= mask; | |
25232 | - cr4 = read_cr4(); | |
25233 | - cr4 |= mask; | |
25234 | - write_cr4(cr4); | |
25235 | -} | |
25236 | - | |
25237 | -static inline void clear_in_cr4 (unsigned long mask) | |
25238 | -{ | |
25239 | - unsigned cr4; | |
25240 | - mmu_cr4_features &= ~mask; | |
25241 | - cr4 = read_cr4(); | |
25242 | - cr4 &= ~mask; | |
25243 | - write_cr4(cr4); | |
25244 | -} | |
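The pair above keeps mmu_cr4_features in step with the hardware register so CPUs booted later can restore the same feature set. A hedged usage sketch; X86_CR4_PGE is the global-pages bit from <asm/processor-flags.h>:

	/* Enable global pages on this CPU and record the bit so CPUs
	 * brought up afterwards inherit it via mmu_cr4_features. */
	set_in_cr4(X86_CR4_PGE);

	/* ... and the symmetric teardown: */
	clear_in_cr4(X86_CR4_PGE);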
25245 | - | |
25246 | -/* Stop speculative execution */ | |
25247 | -static inline void sync_core(void) | |
25248 | -{ | |
25249 | - int tmp; | |
25250 | - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory"); | |
25251 | -} | |
25252 | - | |
25253 | -static inline void __monitor(const void *eax, unsigned long ecx, | |
25254 | - unsigned long edx) | |
25255 | -{ | |
25256 | - /* "monitor %eax,%ecx,%edx;" */ | |
25257 | - asm volatile( | |
25258 | - ".byte 0x0f,0x01,0xc8;" | |
25259 | - : :"a" (eax), "c" (ecx), "d"(edx)); | |
25260 | -} | |
25261 | - | |
25262 | -static inline void __mwait(unsigned long eax, unsigned long ecx) | |
25263 | -{ | |
25264 | - /* "mwait %eax,%ecx;" */ | |
25265 | - asm volatile( | |
25266 | - ".byte 0x0f,0x01,0xc9;" | |
25267 | - : :"a" (eax), "c" (ecx)); | |
25268 | -} | |
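__monitor() and __mwait() are emitted as raw opcode bytes because older assemblers did not know the mnemonics. A hedged sketch of the usual idle-loop pairing: arm the monitor on a flag word, re-check the wakeup condition to close the race, then wait. The zero eax/ecx arguments request the default C-state with no extensions; need_resched() stands in for whatever wakeup condition the caller uses:

	static void mwait_idle_step(void *flag)
	{
		__monitor(flag, 0, 0);		/* arm address monitoring on 'flag' */
		if (!need_resched())		/* re-check before sleeping */
			__mwait(0, 0);		/* sleep until 'flag' is written
						 * or an interrupt arrives */
	}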
25269 | - | |
25270 | -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); | |
25271 | - | |
25272 | -/* from system description table in BIOS. Mostly for MCA use, but | |
25273 | -others may find it useful. */ | |
25274 | -extern unsigned int machine_id; | |
25275 | -extern unsigned int machine_submodel_id; | |
25276 | -extern unsigned int BIOS_revision; | |
25277 | -extern unsigned int mca_pentium_flag; | |
25278 | - | |
25279 | -/* Boot loader type from the setup header */ | |
25280 | -extern int bootloader_type; | |
25281 | - | |
25282 | -/* | |
25283 | - * User space process size: 3GB (default). | |
25284 | - */ | |
25285 | -#define TASK_SIZE (PAGE_OFFSET) | |
25286 | - | |
25287 | -/* This decides where the kernel will search for a free chunk of vm | |
25288 | - * space during mmap's. | |
25289 | - */ | |
25290 | -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) | |
25291 | - | |
25292 | -#define HAVE_ARCH_PICK_MMAP_LAYOUT | |
25293 | - | |
25294 | -extern void hard_disable_TSC(void); | |
25295 | -extern void disable_TSC(void); | |
25296 | -extern void hard_enable_TSC(void); | |
25297 | - | |
25298 | -/* | |
25299 | - * Size of io_bitmap. | |
25300 | - */ | |
25301 | -#define IO_BITMAP_BITS 65536 | |
25302 | -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) | |
25303 | -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) | |
25304 | -#ifndef CONFIG_X86_NO_TSS | |
25305 | -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) | |
25306 | -#endif | |
25307 | -#define INVALID_IO_BITMAP_OFFSET 0x8000 | |
25308 | -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 | |
25309 | - | |
25310 | -struct i387_fsave_struct { | |
25311 | - long cwd; | |
25312 | - long swd; | |
25313 | - long twd; | |
25314 | - long fip; | |
25315 | - long fcs; | |
25316 | - long foo; | |
25317 | - long fos; | |
25318 | - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ | |
25319 | - long status; /* software status information */ | |
25320 | -}; | |
25321 | - | |
25322 | -struct i387_fxsave_struct { | |
25323 | - unsigned short cwd; | |
25324 | - unsigned short swd; | |
25325 | - unsigned short twd; | |
25326 | - unsigned short fop; | |
25327 | - long fip; | |
25328 | - long fcs; | |
25329 | - long foo; | |
25330 | - long fos; | |
25331 | - long mxcsr; | |
25332 | - long mxcsr_mask; | |
25333 | - long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ | |
25334 | - long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ | |
25335 | - long padding[56]; | |
25336 | -} __attribute__ ((aligned (16))); | |
25337 | - | |
25338 | -struct i387_soft_struct { | |
25339 | - long cwd; | |
25340 | - long swd; | |
25341 | - long twd; | |
25342 | - long fip; | |
25343 | - long fcs; | |
25344 | - long foo; | |
25345 | - long fos; | |
25346 | - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ | |
25347 | - unsigned char ftop, changed, lookahead, no_update, rm, alimit; | |
25348 | - struct info *info; | |
25349 | - unsigned long entry_eip; | |
25350 | -}; | |
25351 | - | |
25352 | -union i387_union { | |
25353 | - struct i387_fsave_struct fsave; | |
25354 | - struct i387_fxsave_struct fxsave; | |
25355 | - struct i387_soft_struct soft; | |
25356 | -}; | |
25357 | - | |
25358 | -typedef struct { | |
25359 | - unsigned long seg; | |
25360 | -} mm_segment_t; | |
25361 | - | |
25362 | -struct thread_struct; | |
25363 | - | |
25364 | -#ifndef CONFIG_X86_NO_TSS | |
25365 | -/* This is the TSS defined by the hardware. */ | |
25366 | -struct i386_hw_tss { | |
25367 | - unsigned short back_link,__blh; | |
25368 | - unsigned long esp0; | |
25369 | - unsigned short ss0,__ss0h; | |
25370 | - unsigned long esp1; | |
25371 | - unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */ | |
25372 | - unsigned long esp2; | |
25373 | - unsigned short ss2,__ss2h; | |
25374 | - unsigned long __cr3; | |
25375 | - unsigned long eip; | |
25376 | - unsigned long eflags; | |
25377 | - unsigned long eax,ecx,edx,ebx; | |
25378 | - unsigned long esp; | |
25379 | - unsigned long ebp; | |
25380 | - unsigned long esi; | |
25381 | - unsigned long edi; | |
25382 | - unsigned short es, __esh; | |
25383 | - unsigned short cs, __csh; | |
25384 | - unsigned short ss, __ssh; | |
25385 | - unsigned short ds, __dsh; | |
25386 | - unsigned short fs, __fsh; | |
25387 | - unsigned short gs, __gsh; | |
25388 | - unsigned short ldt, __ldth; | |
25389 | - unsigned short trace, io_bitmap_base; | |
25390 | -} __attribute__((packed)); | |
25391 | - | |
25392 | -struct tss_struct { | |
25393 | - struct i386_hw_tss x86_tss; | |
25394 | - | |
25395 | - /* | |
25396 | - * The extra 1 is there because the CPU will access an | |
25397 | - * additional byte beyond the end of the IO permission | |
25398 | - * bitmap. The extra byte must be all 1 bits, and must | |
25399 | - * be within the limit. | |
25400 | - */ | |
25401 | - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; | |
25402 | - /* | |
25403 | - * Cache the current maximum and the last task that used the bitmap: | |
25404 | - */ | |
25405 | - unsigned long io_bitmap_max; | |
25406 | - struct thread_struct *io_bitmap_owner; | |
25407 | - /* | |
25408 | - * pads the TSS to be cacheline-aligned (size is 0x100) | |
25409 | - */ | |
25410 | - unsigned long __cacheline_filler[35]; | |
25411 | - /* | |
25412 | - * .. and then another 0x100 bytes for emergency kernel stack | |
25413 | - */ | |
25414 | - unsigned long stack[64]; | |
25415 | -} __attribute__((packed)); | |
25416 | -#endif | |
25417 | - | |
25418 | -#define ARCH_MIN_TASKALIGN 16 | |
25419 | - | |
25420 | -struct thread_struct { | |
25421 | -/* cached TLS descriptors. */ | |
25422 | - struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; | |
25423 | - unsigned long esp0; | |
25424 | - unsigned long sysenter_cs; | |
25425 | - unsigned long eip; | |
25426 | - unsigned long esp; | |
25427 | - unsigned long fs; | |
25428 | - unsigned long gs; | |
25429 | -/* Hardware debugging registers */ | |
25430 | - unsigned long debugreg[8]; /* %%db0-7 debug registers */ | |
25431 | -/* fault info */ | |
25432 | - unsigned long cr2, trap_no, error_code; | |
25433 | -/* floating point info */ | |
25434 | - union i387_union i387; | |
25435 | -/* virtual 86 mode info */ | |
25436 | - struct vm86_struct __user * vm86_info; | |
25437 | - unsigned long screen_bitmap; | |
25438 | - unsigned long v86flags, v86mask, saved_esp0; | |
25439 | - unsigned int saved_fs, saved_gs; | |
25440 | -/* IO permissions */ | |
25441 | - unsigned long *io_bitmap_ptr; | |
25442 | - unsigned long iopl; | |
25443 | -/* max allowed port in the bitmap, in bytes: */ | |
25444 | - unsigned long io_bitmap_max; | |
25445 | -}; | |
25446 | - | |
25447 | -#define INIT_THREAD { \ | |
25448 | - .esp0 = sizeof(init_stack) + (long)&init_stack, \ | |
25449 | - .vm86_info = NULL, \ | |
25450 | - .sysenter_cs = __KERNEL_CS, \ | |
25451 | - .io_bitmap_ptr = NULL, \ | |
25452 | - .fs = __KERNEL_PERCPU, \ | |
25453 | -} | |
25454 | - | |
25455 | -/* | |
25456 | - * Note that the .io_bitmap member must be extra-big. This is because | |
25457 | - * the CPU will access an additional byte beyond the end of the IO | |
25458 | - * permission bitmap. The extra byte must be all 1 bits, and must | |
25459 | - * be within the limit. | |
25460 | - */ | |
25461 | -#define INIT_TSS { \ | |
25462 | - .x86_tss = { \ | |
25463 | - .esp0 = sizeof(init_stack) + (long)&init_stack, \ | |
25464 | - .ss0 = __KERNEL_DS, \ | |
25465 | - .ss1 = __KERNEL_CS, \ | |
25466 | - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ | |
25467 | - }, \ | |
25468 | - .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \ | |
25469 | -} | |
25470 | - | |
25471 | -#define start_thread(regs, new_eip, new_esp) do { \ | |
25472 | - __asm__("movl %0,%%gs": :"r" (0)); \ | |
25473 | - regs->xfs = 0; \ | |
25474 | - set_fs(USER_DS); \ | |
25475 | - regs->xds = __USER_DS; \ | |
25476 | - regs->xes = __USER_DS; \ | |
25477 | - regs->xss = __USER_DS; \ | |
25478 | - regs->xcs = __USER_CS; \ | |
25479 | - regs->eip = new_eip; \ | |
25480 | - regs->esp = new_esp; \ | |
25481 | -} while (0) | |
25482 | - | |
25483 | -/* Forward declaration, a strange C thing */ | |
25484 | -struct task_struct; | |
25485 | -struct mm_struct; | |
25486 | - | |
25487 | -/* Free all resources held by a thread. */ | |
25488 | -extern void release_thread(struct task_struct *); | |
25489 | - | |
25490 | -/* Prepare to copy thread state - unlazy all lazy status */ | |
25491 | -extern void prepare_to_copy(struct task_struct *tsk); | |
25492 | - | |
25493 | -/* | |
25494 | - * create a kernel thread without removing it from tasklists | |
25495 | - */ | |
25496 | -extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); | |
25497 | - | |
25498 | -extern unsigned long thread_saved_pc(struct task_struct *tsk); | |
25499 | -void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack); | |
25500 | - | |
25501 | -unsigned long get_wchan(struct task_struct *p); | |
25502 | - | |
25503 | -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) | |
25504 | -#define KSTK_TOP(info) \ | |
25505 | -({ \ | |
25506 | - unsigned long *__ptr = (unsigned long *)(info); \ | |
25507 | - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ | |
25508 | -}) | |
25509 | - | |
25510 | -/* | |
25511 | - * The below -8 is to reserve 8 bytes on top of the ring0 stack. | |
25512 | - * This is necessary to guarantee that the entire "struct pt_regs" | |
25513 | - * is accessible even if the CPU hasn't stored the SS/ESP registers | |
25514 | - * on the stack (interrupt gate does not save these registers | |
25515 | - * when switching to the same priv ring). | |
25516 | - * Therefore beware: accessing the xss/esp fields of the | |
25517 | - * "struct pt_regs" is possible, but they may contain the | |
25518 | - * completely wrong values. | |
25519 | - */ | |
25520 | -#define task_pt_regs(task) \ | |
25521 | -({ \ | |
25522 | - struct pt_regs *__regs__; \ | |
25523 | - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ | |
25524 | - __regs__ - 1; \ | |
25525 | -}) | |
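A worked example of the arithmetic above, assuming the default THREAD_SIZE of 8192 bytes and a hypothetical stack page at 0xc1000000:

	#include <stdio.h>

	int main(void)
	{
		unsigned long stack_page = 0xc1000000UL;	/* hypothetical */
		unsigned long kstk_top = stack_page + 8192;	/* KSTK_TOP() */
		unsigned long regs_end = kstk_top - 8;		/* 8 reserved bytes */

		/* task_pt_regs() is this address minus sizeof(struct pt_regs). */
		printf("pt_regs ends at %#lx\n", regs_end);	/* 0xc1001ff8 */
		return 0;
	}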
25526 | - | |
25527 | -#define KSTK_EIP(task) (task_pt_regs(task)->eip) | |
25528 | -#define KSTK_ESP(task) (task_pt_regs(task)->esp) | |
25529 | - | |
25530 | - | |
25531 | -struct microcode_header { | |
25532 | - unsigned int hdrver; | |
25533 | - unsigned int rev; | |
25534 | - unsigned int date; | |
25535 | - unsigned int sig; | |
25536 | - unsigned int cksum; | |
25537 | - unsigned int ldrver; | |
25538 | - unsigned int pf; | |
25539 | - unsigned int datasize; | |
25540 | - unsigned int totalsize; | |
25541 | - unsigned int reserved[3]; | |
25542 | -}; | |
25543 | - | |
25544 | -struct microcode { | |
25545 | - struct microcode_header hdr; | |
25546 | - unsigned int bits[0]; | |
25547 | -}; | |
25548 | - | |
25549 | -typedef struct microcode microcode_t; | |
25550 | -typedef struct microcode_header microcode_header_t; | |
25551 | - | |
25552 | -/* microcode format is extended from prescott processors */ | |
25553 | -struct extended_signature { | |
25554 | - unsigned int sig; | |
25555 | - unsigned int pf; | |
25556 | - unsigned int cksum; | |
25557 | -}; | |
25558 | - | |
25559 | -struct extended_sigtable { | |
25560 | - unsigned int count; | |
25561 | - unsigned int cksum; | |
25562 | - unsigned int reserved[3]; | |
25563 | - struct extended_signature sigs[0]; | |
25564 | -}; | |
25565 | - | |
25566 | -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ | |
25567 | -static inline void rep_nop(void) | |
25568 | -{ | |
25569 | - __asm__ __volatile__("rep;nop": : :"memory"); | |
25570 | -} | |
25571 | - | |
25572 | -#define cpu_relax() rep_nop() | |
25573 | - | |
25574 | -#ifndef CONFIG_X86_NO_TSS | |
25575 | -static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread) | |
25576 | -{ | |
25577 | - tss->x86_tss.esp0 = thread->esp0; | |
25578 | - /* This can only happen when SEP is enabled, no need to test "SEP"arately */ | |
25579 | - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { | |
25580 | - tss->x86_tss.ss1 = thread->sysenter_cs; | |
25581 | - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); | |
25582 | - } | |
25583 | -} | |
25584 | -#else | |
25585 | -#define xen_load_esp0(tss, thread) do { \ | |
25586 | - if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \ | |
25587 | - BUG(); \ | |
25588 | -} while (0) | |
25589 | -#endif | |
25590 | - | |
25591 | - | |
25592 | -static inline unsigned long xen_get_debugreg(int regno) | |
25593 | -{ | |
25594 | - return HYPERVISOR_get_debugreg(regno); | |
25595 | -} | |
25596 | - | |
25597 | -static inline void xen_set_debugreg(int regno, unsigned long value) | |
25598 | -{ | |
25599 | - WARN_ON(HYPERVISOR_set_debugreg(regno, value)); | |
25600 | -} | |
25601 | - | |
25602 | -/* | |
25603 | - * Set IOPL bits in EFLAGS from given mask | |
25604 | - */ | |
25605 | -static inline void xen_set_iopl_mask(unsigned mask) | |
25606 | -{ | |
25607 | - struct physdev_set_iopl set_iopl; | |
25608 | - | |
25609 | - /* Force the change at ring 0. */ | |
25610 | - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; | |
25611 | - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); | |
25612 | -} | |
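A worked example of the translation above: the IOPL field occupies bits 12-13 of eflags, so a mask of 0x3000 requests IOPL 3, while a mask of 0 is forced to 1 because ring 0 is reserved for the hypervisor:

	#include <assert.h>

	int main(void)
	{
		unsigned mask3 = 0x3000, mask0 = 0;

		assert(((mask3 == 0) ? 1 : (mask3 >> 12) & 3) == 3);
		assert(((mask0 == 0) ? 1 : (mask0 >> 12) & 3) == 1);
		return 0;
	}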
25613 | - | |
25614 | - | |
25615 | -#define paravirt_enabled() 0 | |
25616 | -#define __cpuid xen_cpuid | |
25617 | - | |
25618 | -#define load_esp0 xen_load_esp0 | |
25619 | - | |
25620 | -/* | |
25621 | - * These special macros can be used to get or set a debugging register | |
25622 | - */ | |
25623 | -#define get_debugreg(var, register) \ | |
25624 | - (var) = xen_get_debugreg(register) | |
25625 | -#define set_debugreg(value, register) \ | |
25626 | - xen_set_debugreg(register, value) | |
25627 | - | |
25628 | -#define set_iopl_mask xen_set_iopl_mask | |
25629 | - | |
25630 | -/* | |
25631 | - * Generic CPUID function | |
25632 | - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx | |
25633 | - * resulting in stale register contents being returned. | |
25634 | - */ | |
25635 | -static inline void cpuid(unsigned int op, | |
25636 | - unsigned int *eax, unsigned int *ebx, | |
25637 | - unsigned int *ecx, unsigned int *edx) | |
25638 | -{ | |
25639 | - *eax = op; | |
25640 | - *ecx = 0; | |
25641 | - __cpuid(eax, ebx, ecx, edx); | |
25642 | -} | |
25643 | - | |
25644 | -/* Some CPUID calls want 'count' to be placed in ecx */ | |
25645 | -static inline void cpuid_count(unsigned int op, int count, | |
25646 | - unsigned int *eax, unsigned int *ebx, | |
25647 | - unsigned int *ecx, unsigned int *edx) | |
25648 | -{ | |
25649 | - *eax = op; | |
25650 | - *ecx = count; | |
25651 | - __cpuid(eax, ebx, ecx, edx); | |
25652 | -} | |
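A hedged sketch of these helpers in use, assuming kernel context (<linux/kernel.h>, <linux/string.h>): leaf 0 returns the vendor string scattered across ebx, edx and ecx, in that order, which is also why cpuid() pre-clears ecx for CPUs that would otherwise leave it stale. The function name show_vendor is invented for illustration:

	static void show_vendor(void)
	{
		unsigned int eax, ebx, ecx, edx;
		char vendor[13];

		cpuid(0, &eax, &ebx, &ecx, &edx);
		memcpy(vendor + 0, &ebx, 4);	/* "Genu" on Intel parts */
		memcpy(vendor + 4, &edx, 4);	/* "ineI" */
		memcpy(vendor + 8, &ecx, 4);	/* "ntel" */
		vendor[12] = '\0';
		printk(KERN_INFO "CPU vendor: %s\n", vendor);
	}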
25653 | - | |
25654 | -/* | |
25655 | - * CPUID functions returning a single datum | |
25656 | - */ | |
25657 | -static inline unsigned int cpuid_eax(unsigned int op) | |
25658 | -{ | |
25659 | - unsigned int eax, ebx, ecx, edx; | |
25660 | - | |
25661 | - cpuid(op, &eax, &ebx, &ecx, &edx); | |
25662 | - return eax; | |
25663 | -} | |
25664 | -static inline unsigned int cpuid_ebx(unsigned int op) | |
25665 | -{ | |
25666 | - unsigned int eax, ebx, ecx, edx; | |
25667 | - | |
25668 | - cpuid(op, &eax, &ebx, &ecx, &edx); | |
25669 | - return ebx; | |
25670 | -} | |
25671 | -static inline unsigned int cpuid_ecx(unsigned int op) | |
25672 | -{ | |
25673 | - unsigned int eax, ebx, ecx, edx; | |
25674 | - | |
25675 | - cpuid(op, &eax, &ebx, &ecx, &edx); | |
25676 | - return ecx; | |
25677 | -} | |
25678 | -static inline unsigned int cpuid_edx(unsigned int op) | |
25679 | -{ | |
25680 | - unsigned int eax, ebx, ecx, edx; | |
25681 | - | |
25682 | - cpuid(op, &eax, &ebx, &ecx, &edx); | |
25683 | - return edx; | |
25684 | -} | |
25685 | - | |
25686 | -/* generic versions from gas */ | |
25687 | -#define GENERIC_NOP1 ".byte 0x90\n" | |
25688 | -#define GENERIC_NOP2 ".byte 0x89,0xf6\n" | |
25689 | -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" | |
25690 | -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" | |
25691 | -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 | |
25692 | -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" | |
25693 | -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" | |
25694 | -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 | |
25695 | - | |
25696 | -/* Opteron nops */ | |
25697 | -#define K8_NOP1 GENERIC_NOP1 | |
25698 | -#define K8_NOP2 ".byte 0x66,0x90\n" | |
25699 | -#define K8_NOP3 ".byte 0x66,0x66,0x90\n" | |
25700 | -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" | |
25701 | -#define K8_NOP5 K8_NOP3 K8_NOP2 | |
25702 | -#define K8_NOP6 K8_NOP3 K8_NOP3 | |
25703 | -#define K8_NOP7 K8_NOP4 K8_NOP3 | |
25704 | -#define K8_NOP8 K8_NOP4 K8_NOP4 | |
25705 | - | |
25706 | -/* K7 nops */ | |
25707 | -/* uses eax dependencies (arbitrary choice) */ | |
25708 | -#define K7_NOP1 GENERIC_NOP1 | |
25709 | -#define K7_NOP2 ".byte 0x8b,0xc0\n" | |
25710 | -#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" | |
25711 | -#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" | |
25712 | -#define K7_NOP5 K7_NOP4 ASM_NOP1 | |
25713 | -#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" | |
25714 | -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" | |
25715 | -#define K7_NOP8 K7_NOP7 ASM_NOP1 | |
25716 | - | |
25717 | -/* P6 nops */ | |
25718 | -/* uses eax dependencies (Intel-recommended choice) */ | |
25719 | -#define P6_NOP1 GENERIC_NOP1 | |
25720 | -#define P6_NOP2 ".byte 0x66,0x90\n" | |
25721 | -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" | |
25722 | -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" | |
25723 | -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" | |
25724 | -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" | |
25725 | -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" | |
25726 | -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" | |
25727 | - | |
25728 | -#ifdef CONFIG_MK8 | |
25729 | -#define ASM_NOP1 K8_NOP1 | |
25730 | -#define ASM_NOP2 K8_NOP2 | |
25731 | -#define ASM_NOP3 K8_NOP3 | |
25732 | -#define ASM_NOP4 K8_NOP4 | |
25733 | -#define ASM_NOP5 K8_NOP5 | |
25734 | -#define ASM_NOP6 K8_NOP6 | |
25735 | -#define ASM_NOP7 K8_NOP7 | |
25736 | -#define ASM_NOP8 K8_NOP8 | |
25737 | -#elif defined(CONFIG_MK7) | |
25738 | -#define ASM_NOP1 K7_NOP1 | |
25739 | -#define ASM_NOP2 K7_NOP2 | |
25740 | -#define ASM_NOP3 K7_NOP3 | |
25741 | -#define ASM_NOP4 K7_NOP4 | |
25742 | -#define ASM_NOP5 K7_NOP5 | |
25743 | -#define ASM_NOP6 K7_NOP6 | |
25744 | -#define ASM_NOP7 K7_NOP7 | |
25745 | -#define ASM_NOP8 K7_NOP8 | |
25746 | -#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \ | |
25747 | - defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \ | |
25748 | - defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4) | |
25749 | -#define ASM_NOP1 P6_NOP1 | |
25750 | -#define ASM_NOP2 P6_NOP2 | |
25751 | -#define ASM_NOP3 P6_NOP3 | |
25752 | -#define ASM_NOP4 P6_NOP4 | |
25753 | -#define ASM_NOP5 P6_NOP5 | |
25754 | -#define ASM_NOP6 P6_NOP6 | |
25755 | -#define ASM_NOP7 P6_NOP7 | |
25756 | -#define ASM_NOP8 P6_NOP8 | |
25757 | -#else | |
25758 | -#define ASM_NOP1 GENERIC_NOP1 | |
25759 | -#define ASM_NOP2 GENERIC_NOP2 | |
25760 | -#define ASM_NOP3 GENERIC_NOP3 | |
25761 | -#define ASM_NOP4 GENERIC_NOP4 | |
25762 | -#define ASM_NOP5 GENERIC_NOP5 | |
25763 | -#define ASM_NOP6 GENERIC_NOP6 | |
25764 | -#define ASM_NOP7 GENERIC_NOP7 | |
25765 | -#define ASM_NOP8 GENERIC_NOP8 | |
25766 | -#endif | |
25767 | - | |
25768 | -#define ASM_NOP_MAX 8 | |
25769 | - | |
25770 | -/* Prefetch instructions for Pentium III and AMD Athlon */ | |
25771 | -/* It's not worth caring about 3dnow! prefetches for the K6 | |
25772 | - because they are microcoded there and very slow. | |
25773 | - However, we currently don't do prefetches for pre-XP Athlons; | |
25774 | - that should be fixed. */ | |
25775 | -#define ARCH_HAS_PREFETCH | |
25776 | -static inline void prefetch(const void *x) | |
25777 | -{ | |
25778 | - alternative_input(ASM_NOP4, | |
25779 | - "prefetchnta (%1)", | |
25780 | - X86_FEATURE_XMM, | |
25781 | - "r" (x)); | |
25782 | -} | |
25783 | - | |
25784 | -#define ARCH_HAS_PREFETCH | |
25785 | -#define ARCH_HAS_PREFETCHW | |
25786 | -#define ARCH_HAS_SPINLOCK_PREFETCH | |
25787 | - | |
25788 | -/* 3dnow! prefetch to get an exclusive cache line. Useful for | |
25789 | - spinlocks to avoid one state transition in the cache coherency protocol. */ | |
25790 | -static inline void prefetchw(const void *x) | |
25791 | -{ | |
25792 | - alternative_input(ASM_NOP4, | |
25793 | - "prefetchw (%1)", | |
25794 | - X86_FEATURE_3DNOW, | |
25795 | - "r" (x)); | |
25796 | -} | |
25797 | -#define spin_lock_prefetch(x) prefetchw(x) | |
25798 | - | |
25799 | -extern void select_idle_routine(const struct cpuinfo_x86 *c); | |
25800 | - | |
25801 | -#define cache_line_size() (boot_cpu_data.x86_cache_alignment) | |
25802 | - | |
25803 | -extern unsigned long boot_option_idle_override; | |
25804 | -extern void enable_sep_cpu(void); | |
25805 | -extern int sysenter_setup(void); | |
25806 | - | |
25807 | -/* Defined in head.S */ | |
25808 | -extern struct Xgt_desc_struct early_gdt_descr; | |
25809 | - | |
25810 | -extern void cpu_set_gdt(int); | |
25811 | -extern void switch_to_new_gdt(void); | |
25812 | -extern void cpu_init(void); | |
25813 | -extern void init_gdt(int cpu); | |
25814 | - | |
25815 | -extern int force_mwait; | |
25816 | - | |
25817 | -#endif /* __ASM_I386_PROCESSOR_H */ | |
25818 | --- a/include/asm-x86/mach-xen/asm/processor_64.h | |
25819 | +++ /dev/null | |
25820 | @@ -1,461 +0,0 @@ | |
25821 | -/* | |
25822 | - * include/asm-x86_64/processor.h | |
25823 | - * | |
25824 | - * Copyright (C) 1994 Linus Torvalds | |
25825 | - */ | |
25826 | - | |
25827 | -#ifndef __ASM_X86_64_PROCESSOR_H | |
25828 | -#define __ASM_X86_64_PROCESSOR_H | |
25829 | - | |
25830 | -#include <asm/segment.h> | |
25831 | -#include <asm/page.h> | |
25832 | -#include <asm/types.h> | |
25833 | -#include <asm/sigcontext.h> | |
25834 | -#include <asm/cpufeature.h> | |
25835 | -#include <linux/threads.h> | |
25836 | -#include <asm/msr.h> | |
25837 | -#include <asm/current.h> | |
25838 | -#include <asm/system.h> | |
25839 | -#include <asm/mmsegment.h> | |
25840 | -#include <asm/percpu.h> | |
25841 | -#include <linux/personality.h> | |
25842 | -#include <linux/cpumask.h> | |
25843 | -#include <asm/processor-flags.h> | |
25844 | - | |
25845 | -#define TF_MASK 0x00000100 | |
25846 | -#define IF_MASK 0x00000200 | |
25847 | -#define IOPL_MASK 0x00003000 | |
25848 | -#define NT_MASK 0x00004000 | |
25849 | -#define VM_MASK 0x00020000 | |
25850 | -#define AC_MASK 0x00040000 | |
25851 | -#define VIF_MASK 0x00080000 /* virtual interrupt flag */ | |
25852 | -#define VIP_MASK 0x00100000 /* virtual interrupt pending */ | |
25853 | -#define ID_MASK 0x00200000 | |
25854 | - | |
25855 | -#define desc_empty(desc) \ | |
25856 | - (!((desc)->a | (desc)->b)) | |
25857 | - | |
25858 | -#define desc_equal(desc1, desc2) \ | |
25859 | - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b)) | |
25860 | - | |
25861 | -/* | |
25862 | - * Default implementation of macro that returns current | |
25863 | - * instruction pointer ("program counter"). | |
25864 | - */ | |
25865 | -#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; }) | |
25866 | - | |
25867 | -/* | |
25868 | - * CPU type and hardware bug flags. Kept separately for each CPU. | |
25869 | - */ | |
25870 | - | |
25871 | -struct cpuinfo_x86 { | |
25872 | - __u8 x86; /* CPU family */ | |
25873 | - __u8 x86_vendor; /* CPU vendor */ | |
25874 | - __u8 x86_model; | |
25875 | - __u8 x86_mask; | |
25876 | - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ | |
25877 | - __u32 x86_capability[NCAPINTS]; | |
25878 | - char x86_vendor_id[16]; | |
25879 | - char x86_model_id[64]; | |
25880 | - int x86_cache_size; /* in KB */ | |
25881 | - int x86_clflush_size; | |
25882 | - int x86_cache_alignment; | |
25883 | - int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined (in pages) */ | |
25884 | - __u8 x86_virt_bits, x86_phys_bits; | |
25885 | - __u8 x86_max_cores; /* cpuid returned max cores value */ | |
25886 | - __u32 x86_power; | |
25887 | - __u32 extended_cpuid_level; /* Max extended CPUID function supported */ | |
25888 | - unsigned long loops_per_jiffy; | |
25889 | -#ifdef CONFIG_SMP | |
25890 | - cpumask_t llc_shared_map; /* cpus sharing the last level cache */ | |
25891 | -#endif | |
25892 | - __u8 apicid; | |
25893 | -#ifdef CONFIG_SMP | |
25894 | - __u8 booted_cores; /* number of cores as seen by OS */ | |
25895 | - __u8 phys_proc_id; /* Physical Processor id. */ | |
25896 | - __u8 cpu_core_id; /* Core id. */ | |
25897 | - __u8 cpu_index; /* index into per_cpu list */ | |
25898 | -#endif | |
25899 | -} ____cacheline_aligned; | |
25900 | - | |
25901 | -#define X86_VENDOR_INTEL 0 | |
25902 | -#define X86_VENDOR_CYRIX 1 | |
25903 | -#define X86_VENDOR_AMD 2 | |
25904 | -#define X86_VENDOR_UMC 3 | |
25905 | -#define X86_VENDOR_NEXGEN 4 | |
25906 | -#define X86_VENDOR_CENTAUR 5 | |
25907 | -#define X86_VENDOR_TRANSMETA 7 | |
25908 | -#define X86_VENDOR_NUM 8 | |
25909 | -#define X86_VENDOR_UNKNOWN 0xff | |
25910 | - | |
25911 | -#ifdef CONFIG_SMP | |
25912 | -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); | |
25913 | -#define cpu_data(cpu) per_cpu(cpu_info, cpu) | |
25914 | -#define current_cpu_data cpu_data(smp_processor_id()) | |
25915 | -#else | |
25916 | -#define cpu_data(cpu) boot_cpu_data | |
25917 | -#define current_cpu_data boot_cpu_data | |
25918 | -#endif | |
25919 | - | |
25920 | -extern char ignore_irq13; | |
25921 | - | |
25922 | -extern void identify_cpu(struct cpuinfo_x86 *); | |
25923 | -extern void print_cpu_info(struct cpuinfo_x86 *); | |
25924 | -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); | |
25925 | -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); | |
25926 | -extern unsigned short num_cache_leaves; | |
25927 | - | |
25928 | -/* | |
25929 | - * Save the cr4 feature set we're using (ie | |
25930 | - * Pentium 4MB enable and PPro Global page | |
25931 | - * enable), so that any CPU's that boot up | |
25932 | - * after us can get the correct flags. | |
25933 | - */ | |
25934 | -extern unsigned long mmu_cr4_features; | |
25935 | - | |
25936 | -static inline void set_in_cr4 (unsigned long mask) | |
25937 | -{ | |
25938 | - mmu_cr4_features |= mask; | |
25939 | - __asm__("movq %%cr4,%%rax\n\t" | |
25940 | - "orq %0,%%rax\n\t" | |
25941 | - "movq %%rax,%%cr4\n" | |
25942 | - : : "irg" (mask) | |
25943 | - :"ax"); | |
25944 | -} | |
25945 | - | |
25946 | -static inline void clear_in_cr4 (unsigned long mask) | |
25947 | -{ | |
25948 | - mmu_cr4_features &= ~mask; | |
25949 | - __asm__("movq %%cr4,%%rax\n\t" | |
25950 | - "andq %0,%%rax\n\t" | |
25951 | - "movq %%rax,%%cr4\n" | |
25952 | - : : "irg" (~mask) | |
25953 | - :"ax"); | |
25954 | -} | |
25955 | - | |
25956 | - | |
25957 | -/* | |
25958 | - * User space process size: 47 bits minus one guard page. | |
25959 | - */ | |
25960 | -#define TASK_SIZE64 (0x800000000000UL - 4096) | |
25961 | - | |
25962 | -/* This decides where the kernel will search for a free chunk of vm | |
25963 | - * space during mmap's. | |
25964 | - */ | |
25965 | -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000) | |
25966 | - | |
25967 | -#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64) | |
25968 | -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64) | |
25969 | - | |
25970 | -#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3) | |
25971 | - | |
25972 | -/* | |
25973 | - * Size of io_bitmap. | |
25974 | - */ | |
25975 | -#define IO_BITMAP_BITS 65536 | |
25976 | -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) | |
25977 | -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) | |
25978 | -#ifndef CONFIG_X86_NO_TSS | |
25979 | -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) | |
25980 | -#endif | |
25981 | -#define INVALID_IO_BITMAP_OFFSET 0x8000 | |
25982 | - | |
25983 | -struct i387_fxsave_struct { | |
25984 | - u16 cwd; | |
25985 | - u16 swd; | |
25986 | - u16 twd; | |
25987 | - u16 fop; | |
25988 | - u64 rip; | |
25989 | - u64 rdp; | |
25990 | - u32 mxcsr; | |
25991 | - u32 mxcsr_mask; | |
25992 | - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ | |
25993 | - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ | |
25994 | - u32 padding[24]; | |
25995 | -} __attribute__ ((aligned (16))); | |
25996 | - | |
25997 | -union i387_union { | |
25998 | - struct i387_fxsave_struct fxsave; | |
25999 | -}; | |
26000 | - | |
26001 | -#ifndef CONFIG_X86_NO_TSS | |
26002 | -struct tss_struct { | |
26003 | - u32 reserved1; | |
26004 | - u64 rsp0; | |
26005 | - u64 rsp1; | |
26006 | - u64 rsp2; | |
26007 | - u64 reserved2; | |
26008 | - u64 ist[7]; | |
26009 | - u32 reserved3; | |
26010 | - u32 reserved4; | |
26011 | - u16 reserved5; | |
26012 | - u16 io_bitmap_base; | |
26013 | - /* | |
26014 | - * The extra 1 is there because the CPU will access an | |
26015 | - * additional byte beyond the end of the IO permission | |
26016 | - * bitmap. The extra byte must be all 1 bits, and must | |
26017 | - * be within the limit. Thus we have: | |
26018 | - * | |
26019 | - * 128 bytes, the bitmap itself, for ports 0..0x3ff | |
26020 | - * 8 bytes, for an extra "long" of ~0UL | |
26021 | - */ | |
26022 | - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; | |
26023 | -} __attribute__((packed)) ____cacheline_aligned; | |
26024 | - | |
26025 | -DECLARE_PER_CPU(struct tss_struct,init_tss); | |
26026 | -#endif | |
26027 | - | |
26028 | - | |
26029 | -extern struct cpuinfo_x86 boot_cpu_data; | |
26030 | -#ifndef CONFIG_X86_NO_TSS | |
26031 | -/* Save the original ist values for checking stack pointers during debugging */ | |
26032 | -struct orig_ist { | |
26033 | - unsigned long ist[7]; | |
26034 | -}; | |
26035 | -DECLARE_PER_CPU(struct orig_ist, orig_ist); | |
26036 | -#endif | |
26037 | - | |
26038 | -#ifdef CONFIG_X86_VSMP | |
26039 | -#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) | |
26040 | -#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) | |
26041 | -#else | |
26042 | -#define ARCH_MIN_TASKALIGN 16 | |
26043 | -#define ARCH_MIN_MMSTRUCT_ALIGN 0 | |
26044 | -#endif | |
26045 | - | |
26046 | -struct thread_struct { | |
26047 | - unsigned long rsp0; | |
26048 | - unsigned long rsp; | |
26049 | - unsigned long userrsp; /* Copy from PDA */ | |
26050 | - unsigned long fs; | |
26051 | - unsigned long gs; | |
26052 | - unsigned short es, ds, fsindex, gsindex; | |
26053 | -/* Hardware debugging registers */ | |
26054 | - unsigned long debugreg0; | |
26055 | - unsigned long debugreg1; | |
26056 | - unsigned long debugreg2; | |
26057 | - unsigned long debugreg3; | |
26058 | - unsigned long debugreg6; | |
26059 | - unsigned long debugreg7; | |
26060 | -/* fault info */ | |
26061 | - unsigned long cr2, trap_no, error_code; | |
26062 | -/* floating point info */ | |
26063 | - union i387_union i387 __attribute__((aligned(16))); | |
26064 | -/* IO permissions. The bitmap could be moved into the GDT; that would make | |
26065 | - switching faster for a limited number of tasks using ioperm. -AK */ | |
26066 | - int ioperm; | |
26067 | - unsigned long *io_bitmap_ptr; | |
26068 | - unsigned io_bitmap_max; | |
26069 | -/* cached TLS descriptors. */ | |
26070 | - u64 tls_array[GDT_ENTRY_TLS_ENTRIES]; | |
26071 | - unsigned int iopl; | |
26072 | -} __attribute__((aligned(16))); | |
26073 | - | |
26074 | -#define INIT_THREAD { \ | |
26075 | - .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \ | |
26076 | -} | |
26077 | - | |
26078 | -#ifndef CONFIG_X86_NO_TSS | |
26079 | -#define INIT_TSS { \ | |
26080 | - .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \ | |
26081 | -} | |
26082 | -#endif | |
26083 | - | |
26084 | -#define INIT_MMAP \ | |
26085 | -{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL } | |
26086 | - | |
26087 | -#define start_thread(regs,new_rip,new_rsp) do { \ | |
26088 | - asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \ | |
26089 | - load_gs_index(0); \ | |
26090 | - (regs)->rip = (new_rip); \ | |
26091 | - (regs)->rsp = (new_rsp); \ | |
26092 | - write_pda(oldrsp, (new_rsp)); \ | |
26093 | - (regs)->cs = __USER_CS; \ | |
26094 | - (regs)->ss = __USER_DS; \ | |
26095 | - (regs)->eflags = 0x200; \ | |
26096 | - set_fs(USER_DS); \ | |
26097 | -} while(0) | |
26098 | - | |
26099 | -#define get_debugreg(var, register) \ | |
26100 | - var = HYPERVISOR_get_debugreg(register) | |
26101 | -#define set_debugreg(value, register) do { \ | |
26102 | - if (HYPERVISOR_set_debugreg(register, value)) \ | |
26103 | - BUG(); \ | |
26104 | -} while (0) | |
26105 | - | |
26106 | -struct task_struct; | |
26107 | -struct mm_struct; | |
26108 | - | |
26109 | -/* Free all resources held by a thread. */ | |
26110 | -extern void release_thread(struct task_struct *); | |
26111 | - | |
26112 | -/* Prepare to copy thread state - unlazy all lazy status */ | |
26113 | -extern void prepare_to_copy(struct task_struct *tsk); | |
26114 | - | |
26115 | -/* | |
26116 | - * create a kernel thread without removing it from tasklists | |
26117 | - */ | |
26118 | -extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); | |
26119 | - | |
26120 | -/* | |
26121 | - * Return saved PC of a blocked thread. | |
26122 | - * What is this good for? it will be always the scheduler or ret_from_fork. | |
26123 | - */ | |
26124 | -#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8)) | |
26125 | - | |
26126 | -extern unsigned long get_wchan(struct task_struct *p); | |
26127 | -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1) | |
26128 | -#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip) | |
26129 | -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ | |
26130 | - | |
26131 | - | |
26132 | -struct microcode_header { | |
26133 | - unsigned int hdrver; | |
26134 | - unsigned int rev; | |
26135 | - unsigned int date; | |
26136 | - unsigned int sig; | |
26137 | - unsigned int cksum; | |
26138 | - unsigned int ldrver; | |
26139 | - unsigned int pf; | |
26140 | - unsigned int datasize; | |
26141 | - unsigned int totalsize; | |
26142 | - unsigned int reserved[3]; | |
26143 | -}; | |
26144 | - | |
26145 | -struct microcode { | |
26146 | - struct microcode_header hdr; | |
26147 | - unsigned int bits[0]; | |
26148 | -}; | |
26149 | - | |
26150 | -typedef struct microcode microcode_t; | |
26151 | -typedef struct microcode_header microcode_header_t; | |
26152 | - | |
26153 | -/* microcode format is extended from prescott processors */ | |
26154 | -struct extended_signature { | |
26155 | - unsigned int sig; | |
26156 | - unsigned int pf; | |
26157 | - unsigned int cksum; | |
26158 | -}; | |
26159 | - | |
26160 | -struct extended_sigtable { | |
26161 | - unsigned int count; | |
26162 | - unsigned int cksum; | |
26163 | - unsigned int reserved[3]; | |
26164 | - struct extended_signature sigs[0]; | |
26165 | -}; | |
26166 | - | |
26167 | - | |
26168 | -#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2) | |
26169 | -#define ASM_NOP1 P6_NOP1 | |
26170 | -#define ASM_NOP2 P6_NOP2 | |
26171 | -#define ASM_NOP3 P6_NOP3 | |
26172 | -#define ASM_NOP4 P6_NOP4 | |
26173 | -#define ASM_NOP5 P6_NOP5 | |
26174 | -#define ASM_NOP6 P6_NOP6 | |
26175 | -#define ASM_NOP7 P6_NOP7 | |
26176 | -#define ASM_NOP8 P6_NOP8 | |
26177 | -#else | |
26178 | -#define ASM_NOP1 K8_NOP1 | |
26179 | -#define ASM_NOP2 K8_NOP2 | |
26180 | -#define ASM_NOP3 K8_NOP3 | |
26181 | -#define ASM_NOP4 K8_NOP4 | |
26182 | -#define ASM_NOP5 K8_NOP5 | |
26183 | -#define ASM_NOP6 K8_NOP6 | |
26184 | -#define ASM_NOP7 K8_NOP7 | |
26185 | -#define ASM_NOP8 K8_NOP8 | |
26186 | -#endif | |
26187 | - | |
26188 | -/* Opteron nops */ | |
26189 | -#define K8_NOP1 ".byte 0x90\n" | |
26190 | -#define K8_NOP2 ".byte 0x66,0x90\n" | |
26191 | -#define K8_NOP3 ".byte 0x66,0x66,0x90\n" | |
26192 | -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" | |
26193 | -#define K8_NOP5 K8_NOP3 K8_NOP2 | |
26194 | -#define K8_NOP6 K8_NOP3 K8_NOP3 | |
26195 | -#define K8_NOP7 K8_NOP4 K8_NOP3 | |
26196 | -#define K8_NOP8 K8_NOP4 K8_NOP4 | |
26197 | - | |
26198 | -/* P6 nops */ | |
26199 | -/* uses eax dependencies (Intel-recommended choice) */ | |
26200 | -#define P6_NOP1 ".byte 0x90\n" | |
26201 | -#define P6_NOP2 ".byte 0x66,0x90\n" | |
26202 | -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" | |
26203 | -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" | |
26204 | -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" | |
26205 | -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" | |
26206 | -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" | |
26207 | -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" | |
26208 | - | |
26209 | -#define ASM_NOP_MAX 8 | |
26210 | - | |
26211 | -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ | |
26212 | -static inline void rep_nop(void) | |
26213 | -{ | |
26214 | - __asm__ __volatile__("rep;nop": : :"memory"); | |
26215 | -} | |
26216 | - | |
26217 | -/* Stop speculative execution */ | |
26218 | -static inline void sync_core(void) | |
26219 | -{ | |
26220 | - int tmp; | |
26221 | - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory"); | |
26222 | -} | |
26223 | - | |
26224 | -#define ARCH_HAS_PREFETCHW 1 | |
26225 | -static inline void prefetchw(void *x) | |
26226 | -{ | |
26227 | - alternative_input("prefetcht0 (%1)", | |
26228 | - "prefetchw (%1)", | |
26229 | - X86_FEATURE_3DNOW, | |
26230 | - "r" (x)); | |
26231 | -} | |
26232 | - | |
26233 | -#define ARCH_HAS_SPINLOCK_PREFETCH 1 | |
26234 | - | |
26235 | -#define spin_lock_prefetch(x) prefetchw(x) | |
26236 | - | |
26237 | -#define cpu_relax() rep_nop() | |
26238 | - | |
26239 | -static inline void __monitor(const void *eax, unsigned long ecx, | |
26240 | - unsigned long edx) | |
26241 | -{ | |
26242 | - /* "monitor %eax,%ecx,%edx;" */ | |
26243 | - asm volatile( | |
26244 | - ".byte 0x0f,0x01,0xc8;" | |
26245 | - : :"a" (eax), "c" (ecx), "d"(edx)); | |
26246 | -} | |
26247 | - | |
26248 | -static inline void __mwait(unsigned long eax, unsigned long ecx) | |
26249 | -{ | |
26250 | - /* "mwait %eax,%ecx;" */ | |
26251 | - asm volatile( | |
26252 | - ".byte 0x0f,0x01,0xc9;" | |
26253 | - : :"a" (eax), "c" (ecx)); | |
26254 | -} | |
26255 | - | |
26256 | -static inline void __sti_mwait(unsigned long eax, unsigned long ecx) | |
26257 | -{ | |
26258 | - /* "mwait %eax,%ecx;" */ | |
26259 | - asm volatile( | |
26260 | - "sti; .byte 0x0f,0x01,0xc9;" | |
26261 | - : :"a" (eax), "c" (ecx)); | |
26262 | -} | |
26263 | - | |
26264 | -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); | |
26265 | - | |
26266 | -#define stack_current() \ | |
26267 | -({ \ | |
26268 | - struct thread_info *ti; \ | |
26269 | - asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ | |
26270 | - ti->task; \ | |
26271 | -}) | |
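stack_current() relies on struct thread_info sitting at the base of the THREAD_SIZE-aligned kernel stack, so masking the stack pointer with CURRENT_MASK (~(THREAD_SIZE - 1)) lands on it. A worked example assuming THREAD_SIZE == 8192 and a hypothetical rsp:

	#include <assert.h>

	int main(void)
	{
		unsigned long current_mask = ~(8192UL - 1);	/* ~0x1fff */
		unsigned long rsp = 0xffff810012345e70UL;	/* hypothetical */

		assert((rsp & current_mask) == 0xffff810012344000UL);
		return 0;
	}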
26272 | - | |
26273 | -#define cache_line_size() (boot_cpu_data.x86_cache_alignment) | |
26274 | - | |
26275 | -extern unsigned long boot_option_idle_override; | |
26276 | -/* Boot loader type from the setup header */ | |
26277 | -extern int bootloader_type; | |
26278 | - | |
26279 | -#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 | |
26280 | - | |
26281 | -#endif /* __ASM_X86_64_PROCESSOR_H */ | |
26282 | --- a/include/asm-x86/mach-xen/asm/processor.h | |
26283 | +++ b/include/asm-x86/mach-xen/asm/processor.h | |
26284 | @@ -1,5 +1,793 @@ | |
26285 | +#ifndef __ASM_X86_PROCESSOR_H | |
26286 | +#define __ASM_X86_PROCESSOR_H | |
26287 | + | |
26288 | +#include <asm/processor-flags.h> | |
26289 | + | |
26290 | +/* migration helpers, for KVM - will be removed in 2.6.25: */ | |
26291 | +#include <asm/vm86.h> | |
26292 | +#define Xgt_desc_struct desc_ptr | |
26293 | + | |
26294 | +/* Forward declaration, a strange C thing */ | |
26295 | +struct task_struct; | |
26296 | +struct mm_struct; | |
26297 | + | |
26298 | +#include <asm/vm86.h> | |
26299 | +#include <asm/math_emu.h> | |
26300 | +#include <asm/segment.h> | |
26301 | +#include <asm/types.h> | |
26302 | +#include <asm/sigcontext.h> | |
26303 | +#include <asm/current.h> | |
26304 | +#include <asm/cpufeature.h> | |
26305 | +#include <asm/system.h> | |
26306 | +#include <asm/page.h> | |
26307 | +#include <asm/percpu.h> | |
26308 | +#include <asm/msr.h> | |
26309 | +#include <asm/desc_defs.h> | |
26310 | +#include <asm/nops.h> | |
26311 | +#include <linux/personality.h> | |
26312 | +#include <linux/cpumask.h> | |
26313 | +#include <linux/cache.h> | |
26314 | +#include <linux/threads.h> | |
26315 | +#include <linux/init.h> | |
26316 | +#include <xen/interface/physdev.h> | |
26317 | + | |
26318 | +/* | |
26319 | + * Default implementation of macro that returns current | |
26320 | + * instruction pointer ("program counter"). | |
26321 | + */ | |
26322 | +static inline void *current_text_addr(void) | |
26323 | +{ | |
26324 | + void *pc; | |
26325 | + asm volatile("mov $1f,%0\n1:":"=r" (pc)); | |
26326 | + return pc; | |
26327 | +} | |
26328 | + | |
26329 | +#ifdef CONFIG_X86_VSMP | |
26330 | +#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) | |
26331 | +#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) | |
26332 | +#else | |
26333 | +#define ARCH_MIN_TASKALIGN 16 | |
26334 | +#define ARCH_MIN_MMSTRUCT_ALIGN 0 | |
26335 | +#endif | |
26336 | + | |
26337 | +/* | |
26338 | + * CPU type and hardware bug flags. Kept separately for each CPU. | |
26339 | + * Members of this structure are referenced in head.S, so think twice | |
26340 | + * before touching them. [mj] | |
26341 | + */ | |
26342 | + | |
26343 | +struct cpuinfo_x86 { | |
26344 | + __u8 x86; /* CPU family */ | |
26345 | + __u8 x86_vendor; /* CPU vendor */ | |
26346 | + __u8 x86_model; | |
26347 | + __u8 x86_mask; | |
26348 | +#ifdef CONFIG_X86_32 | |
26349 | + char wp_works_ok; /* It doesn't on 386's */ | |
26350 | + char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */ | |
26351 | + char hard_math; | |
26352 | + char rfu; | |
26353 | + char fdiv_bug; | |
26354 | + char f00f_bug; | |
26355 | + char coma_bug; | |
26356 | + char pad0; | |
26357 | +#else | |
26358 | + /* number of 4K pages in DTLB/ITLB combined (in pages) */ | |
26359 | + int x86_tlbsize; | |
26360 | + __u8 x86_virt_bits, x86_phys_bits; | |
26361 | + /* cpuid returned core id bits */ | |
26362 | + __u8 x86_coreid_bits; | |
26363 | + /* Max extended CPUID function supported */ | |
26364 | + __u32 extended_cpuid_level; | |
26365 | +#endif | |
26366 | + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ | |
26367 | + __u32 x86_capability[NCAPINTS]; | |
26368 | + char x86_vendor_id[16]; | |
26369 | + char x86_model_id[64]; | |
26370 | + int x86_cache_size; /* in KB - valid for CPUS which support this | |
26371 | + call */ | |
26372 | + int x86_cache_alignment; /* In bytes */ | |
26373 | + int x86_power; | |
26374 | + unsigned long loops_per_jiffy; | |
26375 | +#ifdef CONFIG_SMP | |
26376 | + cpumask_t llc_shared_map; /* cpus sharing the last level cache */ | |
26377 | +#endif | |
26378 | + u16 x86_max_cores; /* cpuid returned max cores value */ | |
26379 | + u16 apicid; | |
26380 | + u16 x86_clflush_size; | |
26381 | +#ifdef CONFIG_SMP | |
26382 | + u16 booted_cores; /* number of cores as seen by OS */ | |
26383 | + u16 phys_proc_id; /* Physical processor id. */ | |
26384 | + u16 cpu_core_id; /* Core id */ | |
26385 | + u16 cpu_index; /* index into per_cpu list */ | |
26386 | +#endif | |
26387 | +} __attribute__((__aligned__(SMP_CACHE_BYTES))); | |
26388 | + | |
26389 | +#define X86_VENDOR_INTEL 0 | |
26390 | +#define X86_VENDOR_CYRIX 1 | |
26391 | +#define X86_VENDOR_AMD 2 | |
26392 | +#define X86_VENDOR_UMC 3 | |
26393 | +#define X86_VENDOR_NEXGEN 4 | |
26394 | +#define X86_VENDOR_CENTAUR 5 | |
26395 | +#define X86_VENDOR_TRANSMETA 7 | |
26396 | +#define X86_VENDOR_NSC 8 | |
26397 | +#define X86_VENDOR_NUM 9 | |
26398 | +#define X86_VENDOR_UNKNOWN 0xff | |
26399 | + | |
26400 | +/* | |
26401 | + * capabilities of CPUs | |
26402 | + */ | |
26403 | +extern struct cpuinfo_x86 boot_cpu_data; | |
26404 | +extern struct cpuinfo_x86 new_cpu_data; | |
26405 | +extern __u32 cleared_cpu_caps[NCAPINTS]; | |
26406 | + | |
26407 | +#ifdef CONFIG_SMP | |
26408 | +DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); | |
26409 | +#define cpu_data(cpu) per_cpu(cpu_info, cpu) | |
26410 | +#define current_cpu_data cpu_data(smp_processor_id()) | |
26411 | +#else | |
26412 | +#define cpu_data(cpu) boot_cpu_data | |
26413 | +#define current_cpu_data boot_cpu_data | |
26414 | +#endif | |
26415 | + | |
26416 | +void cpu_detect(struct cpuinfo_x86 *c); | |
26417 | + | |
26418 | +extern void identify_cpu(struct cpuinfo_x86 *); | |
26419 | +extern void identify_boot_cpu(void); | |
26420 | +extern void identify_secondary_cpu(struct cpuinfo_x86 *); | |
26421 | +extern void print_cpu_info(struct cpuinfo_x86 *); | |
26422 | +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); | |
26423 | +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); | |
26424 | +extern unsigned short num_cache_leaves; | |
26425 | + | |
26426 | +#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64) | |
26427 | +extern void detect_ht(struct cpuinfo_x86 *c); | |
26428 | +#else | |
26429 | +static inline void detect_ht(struct cpuinfo_x86 *c) {} | |
26430 | +#endif | |
26431 | + | |
26432 | +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx, | |
26433 | + unsigned int *ecx, unsigned int *edx) | |
26434 | +{ | |
26435 | + /* ecx is often an input as well as an output. */ | |
26436 | + __asm__(XEN_CPUID | |
26437 | + : "=a" (*eax), | |
26438 | + "=b" (*ebx), | |
26439 | + "=c" (*ecx), | |
26440 | + "=d" (*edx) | |
26441 | + : "0" (*eax), "2" (*ecx)); | |
26442 | +} | |
26443 | + | |
26444 | +static inline void load_cr3(pgd_t *pgdir) | |
26445 | +{ | |
26446 | + write_cr3(__pa(pgdir)); | |
26447 | +} | |
26448 | + | |
26449 | +#ifndef CONFIG_X86_NO_TSS | |
26450 | +#ifdef CONFIG_X86_32 | |
26451 | +/* This is the TSS defined by the hardware. */ | |
26452 | +struct x86_hw_tss { | |
26453 | + unsigned short back_link, __blh; | |
26454 | + unsigned long sp0; | |
26455 | + unsigned short ss0, __ss0h; | |
26456 | + unsigned long sp1; | |
26457 | + unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */ | |
26458 | + unsigned long sp2; | |
26459 | + unsigned short ss2, __ss2h; | |
26460 | + unsigned long __cr3; | |
26461 | + unsigned long ip; | |
26462 | + unsigned long flags; | |
26463 | + unsigned long ax, cx, dx, bx; | |
26464 | + unsigned long sp, bp, si, di; | |
26465 | + unsigned short es, __esh; | |
26466 | + unsigned short cs, __csh; | |
26467 | + unsigned short ss, __ssh; | |
26468 | + unsigned short ds, __dsh; | |
26469 | + unsigned short fs, __fsh; | |
26470 | + unsigned short gs, __gsh; | |
26471 | + unsigned short ldt, __ldth; | |
26472 | + unsigned short trace, io_bitmap_base; | |
26473 | +} __attribute__((packed)); | |
26474 | +extern struct tss_struct doublefault_tss; | |
26475 | +#else | |
26476 | +struct x86_hw_tss { | |
26477 | + u32 reserved1; | |
26478 | + u64 sp0; | |
26479 | + u64 sp1; | |
26480 | + u64 sp2; | |
26481 | + u64 reserved2; | |
26482 | + u64 ist[7]; | |
26483 | + u32 reserved3; | |
26484 | + u32 reserved4; | |
26485 | + u16 reserved5; | |
26486 | + u16 io_bitmap_base; | |
26487 | +} __attribute__((packed)) ____cacheline_aligned; | |
26488 | +#endif | |
26489 | +#endif /* CONFIG_X86_NO_TSS */ | |
26490 | + | |
26491 | +/* | |
26492 | + * Size of io_bitmap. | |
26493 | + */ | |
26494 | +#define IO_BITMAP_BITS 65536 | |
26495 | +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) | |
26496 | +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) | |
26497 | +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) | |
26498 | +#define INVALID_IO_BITMAP_OFFSET 0x8000 | |
26499 | +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 | |
26500 | + | |
26501 | +#ifndef CONFIG_X86_NO_TSS | |
26502 | +struct tss_struct { | |
26503 | + struct x86_hw_tss x86_tss; | |
26504 | + | |
26505 | + /* | |
26506 | + * The extra 1 is there because the CPU will access an | |
26507 | + * additional byte beyond the end of the IO permission | |
26508 | + * bitmap. The extra byte must be all 1 bits, and must | |
26509 | + * be within the limit. | |
26510 | + */ | |
26511 | + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; | |
26512 | + /* | |
26513 | + * Cache the current maximum and the last task that used the bitmap: | |
26514 | + */ | |
26515 | + unsigned long io_bitmap_max; | |
26516 | + struct thread_struct *io_bitmap_owner; | |
26517 | + /* | |
26518 | + * pads the TSS to be cacheline-aligned (size is 0x100) | |
26519 | + */ | |
26520 | + unsigned long __cacheline_filler[35]; | |
26521 | + /* | |
26522 | + * .. and then another 0x100 bytes for emergency kernel stack | |
26523 | + */ | |
26524 | + unsigned long stack[64]; | |
26525 | +} __attribute__((packed)); | |
26526 | + | |
26527 | +DECLARE_PER_CPU(struct tss_struct, init_tss); | |
26528 | + | |
26529 | +/* Save the original ist values for checking stack pointers during debugging */ | |
26530 | +struct orig_ist { | |
26531 | + unsigned long ist[7]; | |
26532 | +}; | |
26533 | +#endif /* CONFIG_X86_NO_TSS */ | |
26534 | + | |
26535 | +#define MXCSR_DEFAULT 0x1f80 | |
26536 | + | |
26537 | +struct i387_fsave_struct { | |
26538 | + u32 cwd; | |
26539 | + u32 swd; | |
26540 | + u32 twd; | |
26541 | + u32 fip; | |
26542 | + u32 fcs; | |
26543 | + u32 foo; | |
26544 | + u32 fos; | |
26545 | + u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ | |
26546 | + u32 status; /* software status information */ | |
26547 | +}; | |
26548 | + | |
26549 | +struct i387_fxsave_struct { | |
26550 | + u16 cwd; | |
26551 | + u16 swd; | |
26552 | + u16 twd; | |
26553 | + u16 fop; | |
26554 | + union { | |
26555 | + struct { | |
26556 | + u64 rip; | |
26557 | + u64 rdp; | |
26558 | + }; | |
26559 | + struct { | |
26560 | + u32 fip; | |
26561 | + u32 fcs; | |
26562 | + u32 foo; | |
26563 | + u32 fos; | |
26564 | + }; | |
26565 | + }; | |
26566 | + u32 mxcsr; | |
26567 | + u32 mxcsr_mask; | |
26568 | + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ | |
26569 | + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ | |
26570 | + u32 padding[24]; | |
26571 | +} __attribute__((aligned(16))); | |
26572 | + | |
26573 | +struct i387_soft_struct { | |
26574 | + u32 cwd; | |
26575 | + u32 swd; | |
26576 | + u32 twd; | |
26577 | + u32 fip; | |
26578 | + u32 fcs; | |
26579 | + u32 foo; | |
26580 | + u32 fos; | |
26581 | + u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ | |
26582 | + u8 ftop, changed, lookahead, no_update, rm, alimit; | |
26583 | + struct info *info; | |
26584 | + u32 entry_eip; | |
26585 | +}; | |
26586 | + | |
26587 | +union i387_union { | |
26588 | + struct i387_fsave_struct fsave; | |
26589 | + struct i387_fxsave_struct fxsave; | |
26590 | + struct i387_soft_struct soft; | |
26591 | +}; | |
26592 | + | |
26593 | +#ifdef CONFIG_X86_32 | |
26594 | +DECLARE_PER_CPU(u8, cpu_llc_id); | |
26595 | +#elif !defined(CONFIG_X86_NO_TSS) | |
26596 | +DECLARE_PER_CPU(struct orig_ist, orig_ist); | |
26597 | +#endif | |
26598 | + | |
26599 | +extern void print_cpu_info(struct cpuinfo_x86 *); | |
26600 | +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); | |
26601 | +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); | |
26602 | +extern unsigned short num_cache_leaves; | |
26603 | + | |
26604 | +struct thread_struct { | |
26605 | +/* cached TLS descriptors. */ | |
26606 | + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; | |
26607 | + unsigned long sp0; | |
26608 | + unsigned long sp; | |
26609 | +#ifdef CONFIG_X86_32 | |
26610 | + unsigned long sysenter_cs; | |
26611 | +#else | |
26612 | + unsigned long usersp; /* Copy from PDA */ | |
26613 | + unsigned short es, ds, fsindex, gsindex; | |
26614 | +#endif | |
26615 | + unsigned long ip; | |
26616 | + unsigned long fs; | |
26617 | + unsigned long gs; | |
26618 | +/* Hardware debugging registers */ | |
26619 | + unsigned long debugreg0; | |
26620 | + unsigned long debugreg1; | |
26621 | + unsigned long debugreg2; | |
26622 | + unsigned long debugreg3; | |
26623 | + unsigned long debugreg6; | |
26624 | + unsigned long debugreg7; | |
26625 | +/* fault info */ | |
26626 | + unsigned long cr2, trap_no, error_code; | |
26627 | +/* floating point info */ | |
26628 | + union i387_union i387 __attribute__((aligned(16))); | |
26629 | +#ifdef CONFIG_X86_32 | |
26630 | +/* virtual 86 mode info */ | |
26631 | + struct vm86_struct __user *vm86_info; | |
26632 | + unsigned long screen_bitmap; | |
26633 | + unsigned long v86flags, v86mask, saved_sp0; | |
26634 | + unsigned int saved_fs, saved_gs; | |
26635 | +#endif | |
26636 | +/* IO permissions */ | |
26637 | + unsigned long *io_bitmap_ptr; | |
26638 | + unsigned long iopl; | |
26639 | +/* max allowed port in the bitmap, in bytes: */ | |
26640 | + unsigned io_bitmap_max; | |
26641 | +/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ | |
26642 | + unsigned long debugctlmsr; | |
26643 | +/* Debug Store - if non-zero, points to a DS Save Area configuration; | |
26644 | + * goes into MSR_IA32_DS_AREA */ | |
26645 | + unsigned long ds_area_msr; | |
26646 | +}; | |
26647 | + | |
26648 | +static inline unsigned long xen_get_debugreg(int regno) | |
26649 | +{ | |
26650 | + return HYPERVISOR_get_debugreg(regno); | |
26651 | +} | |
26652 | + | |
26653 | +static inline void xen_set_debugreg(int regno, unsigned long value) | |
26654 | +{ | |
26655 | + WARN_ON(HYPERVISOR_set_debugreg(regno, value)); | |
26656 | +} | |
26657 | + | |
26658 | +/* | |
26659 | + * Set IOPL bits in EFLAGS from given mask | |
26660 | + */ | |
26661 | +static inline void xen_set_iopl_mask(unsigned mask) | |
26662 | +{ | |
26663 | + struct physdev_set_iopl set_iopl; | |
26664 | + | |
26665 | + /* Force the change at ring 0. */ | |
26666 | + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; | |
26667 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); | |
26668 | +} | |
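+/*
+ * Worked example for the mask conversion above (sketch): EFLAGS keeps
+ * IOPL in bits 12-13, so a mask of 0x3000 gives (0x3000 >> 12) & 3 == 3,
+ * i.e. userspace (ring 3) may use IN/OUT directly. A zero mask is mapped
+ * to IOPL 1 so the kernel, which runs in ring 1 under Xen, retains port
+ * access while userspace does not.
+ */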
26669 | + | |
26670 | +#ifndef CONFIG_X86_NO_TSS | |
26671 | +static inline void native_load_sp0(struct tss_struct *tss, | |
26672 | + struct thread_struct *thread) | |
26673 | +{ | |
26674 | + tss->x86_tss.sp0 = thread->sp0; | |
26675 | +#ifdef CONFIG_X86_32 | |
26676 | + /* Only happens when SEP is enabled, no need to test "SEP"arately */ | |
26677 | + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { | |
26678 | + tss->x86_tss.ss1 = thread->sysenter_cs; | |
26679 | + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); | |
26680 | + } | |
26681 | +#endif | |
26682 | +} | |
26683 | +#else | |
26684 | +#define xen_load_sp0(tss, thread) do { \ | |
26685 | + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \ | |
26686 | + BUG(); \ | |
26687 | +} while (0) | |
26688 | +#endif | |
26689 | + | |
26690 | +#define __cpuid xen_cpuid | |
26691 | +#define paravirt_enabled() 0 | |
26692 | + | |
26693 | +/* | |
26694 | + * These special macros can be used to get or set a debugging register | |
26695 | + */ | |
26696 | +#define get_debugreg(var, register) \ | |
26697 | + (var) = xen_get_debugreg(register) | |
26698 | +#define set_debugreg(value, register) \ | |
26699 | + xen_set_debugreg(register, value) | |
26700 | + | |
26701 | +#define load_sp0 xen_load_sp0 | |
26702 | + | |
26703 | +#define set_iopl_mask xen_set_iopl_mask | |
26704 | + | |
26705 | +/* | |
26706 | + * Save the cr4 feature set we're using (i.e. | |
26707 | + * Pentium 4MB enable and PPro Global page | |
26708 | + * enable), so that any CPUs that boot up | |
26709 | + * after us can get the correct flags. | |
26710 | + */ | |
26711 | +extern unsigned long mmu_cr4_features; | |
26712 | + | |
26713 | +static inline void set_in_cr4(unsigned long mask) | |
26714 | +{ | |
26715 | + unsigned cr4; | |
26716 | + mmu_cr4_features |= mask; | |
26717 | + cr4 = read_cr4(); | |
26718 | + cr4 |= mask; | |
26719 | + write_cr4(cr4); | |
26720 | +} | |
26721 | + | |
26722 | +static inline void clear_in_cr4(unsigned long mask) | |
26723 | +{ | |
26724 | + unsigned cr4; | |
26725 | + mmu_cr4_features &= ~mask; | |
26726 | + cr4 = read_cr4(); | |
26727 | + cr4 &= ~mask; | |
26728 | + write_cr4(cr4); | |
26729 | +} | |
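+/*
+ * Typical usage (sketch; X86_CR4_PGE is the global-pages bit, 1 << 7):
+ * recording the bit in mmu_cr4_features before writing %cr4 lets CPUs
+ * that boot later pick up the same feature set.
+ */
+static inline void example_enable_global_pages(void)
+{
+	set_in_cr4(X86_CR4_PGE);	/* hypothetical helper, for illustration */
+}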
26730 | + | |
26731 | +struct microcode_header { | |
26732 | + unsigned int hdrver; | |
26733 | + unsigned int rev; | |
26734 | + unsigned int date; | |
26735 | + unsigned int sig; | |
26736 | + unsigned int cksum; | |
26737 | + unsigned int ldrver; | |
26738 | + unsigned int pf; | |
26739 | + unsigned int datasize; | |
26740 | + unsigned int totalsize; | |
26741 | + unsigned int reserved[3]; | |
26742 | +}; | |
26743 | + | |
26744 | +struct microcode { | |
26745 | + struct microcode_header hdr; | |
26746 | + unsigned int bits[0]; | |
26747 | +}; | |
26748 | + | |
26749 | +typedef struct microcode microcode_t; | |
26750 | +typedef struct microcode_header microcode_header_t; | |
26751 | + | |
26752 | +/* microcode format is extended from Prescott processors */ | |
26753 | +struct extended_signature { | |
26754 | + unsigned int sig; | |
26755 | + unsigned int pf; | |
26756 | + unsigned int cksum; | |
26757 | +}; | |
26758 | + | |
26759 | +struct extended_sigtable { | |
26760 | + unsigned int count; | |
26761 | + unsigned int cksum; | |
26762 | + unsigned int reserved[3]; | |
26763 | + struct extended_signature sigs[0]; | |
26764 | +}; | |
26765 | + | |
26766 | +typedef struct { | |
26767 | + unsigned long seg; | |
26768 | +} mm_segment_t; | |
26769 | + | |
26770 | + | |
26771 | +/* | |
26772 | + * create a kernel thread without removing it from tasklists | |
26773 | + */ | |
26774 | +extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); | |
26775 | + | |
26776 | +/* Free all resources held by a thread. */ | |
26777 | +extern void release_thread(struct task_struct *); | |
26778 | + | |
26779 | +/* Prepare to copy thread state - unlazy all lazy status */ | |
26780 | +extern void prepare_to_copy(struct task_struct *tsk); | |
26781 | + | |
26782 | +unsigned long get_wchan(struct task_struct *p); | |
26783 | + | |
26784 | +/* | |
26785 | + * Generic CPUID function | |
26786 | + * clear %ecx since some CPUs (Cyrix MII) do not set or clear %ecx, | |
26787 | + * resulting in stale register contents being returned. | |
26788 | + */ | |
26789 | +static inline void cpuid(unsigned int op, | |
26790 | + unsigned int *eax, unsigned int *ebx, | |
26791 | + unsigned int *ecx, unsigned int *edx) | |
26792 | +{ | |
26793 | + *eax = op; | |
26794 | + *ecx = 0; | |
26795 | + __cpuid(eax, ebx, ecx, edx); | |
26796 | +} | |
26797 | + | |
26798 | +/* Some CPUID calls want 'count' to be placed in ecx */ | |
26799 | +static inline void cpuid_count(unsigned int op, int count, | |
26800 | + unsigned int *eax, unsigned int *ebx, | |
26801 | + unsigned int *ecx, unsigned int *edx) | |
26802 | +{ | |
26803 | + *eax = op; | |
26804 | + *ecx = count; | |
26805 | + __cpuid(eax, ebx, ecx, edx); | |
26806 | +} | |
26807 | + | |
26808 | +/* | |
26809 | + * CPUID functions returning a single datum | |
26810 | + */ | |
26811 | +static inline unsigned int cpuid_eax(unsigned int op) | |
26812 | +{ | |
26813 | + unsigned int eax, ebx, ecx, edx; | |
26814 | + | |
26815 | + cpuid(op, &eax, &ebx, &ecx, &edx); | |
26816 | + return eax; | |
26817 | +} | |
26818 | +static inline unsigned int cpuid_ebx(unsigned int op) | |
26819 | +{ | |
26820 | + unsigned int eax, ebx, ecx, edx; | |
26821 | + | |
26822 | + cpuid(op, &eax, &ebx, &ecx, &edx); | |
26823 | + return ebx; | |
26824 | +} | |
26825 | +static inline unsigned int cpuid_ecx(unsigned int op) | |
26826 | +{ | |
26827 | + unsigned int eax, ebx, ecx, edx; | |
26828 | + | |
26829 | + cpuid(op, &eax, &ebx, &ecx, &edx); | |
26830 | + return ecx; | |
26831 | +} | |
26832 | +static inline unsigned int cpuid_edx(unsigned int op) | |
26833 | +{ | |
26834 | + unsigned int eax, ebx, ecx, edx; | |
26835 | + | |
26836 | + cpuid(op, &eax, &ebx, &ecx, &edx); | |
26837 | + return edx; | |
26838 | +} | |
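+/*
+ * Usage sketch for the wrappers above: leaf 0 returns the highest
+ * standard CPUID leaf in EAX (EBX/EDX/ECX carry the vendor string), and
+ * cpuid_count() serves leaves such as 4 that take an index in ECX.
+ */
+static inline unsigned int example_max_std_leaf(void)
+{
+	return cpuid_eax(0);	/* illustrative helper */
+}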
26839 | + | |
26840 | +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ | |
26841 | +static inline void rep_nop(void) | |
26842 | +{ | |
26843 | + __asm__ __volatile__("rep;nop": : :"memory"); | |
26844 | +} | |
26845 | + | |
26846 | +/* Stop speculative execution */ | |
26847 | +static inline void sync_core(void) | |
26848 | +{ | |
26849 | + int tmp; | |
26850 | + asm volatile("cpuid" : "=a" (tmp) : "0" (1) | |
26851 | + : "ebx", "ecx", "edx", "memory"); | |
26852 | +} | |
26853 | + | |
26854 | +#define cpu_relax() rep_nop() | |
26855 | + | |
26856 | +static inline void __monitor(const void *eax, unsigned long ecx, | |
26857 | + unsigned long edx) | |
26858 | +{ | |
26859 | + /* "monitor %eax,%ecx,%edx;" */ | |
26860 | + asm volatile( | |
26861 | + ".byte 0x0f,0x01,0xc8;" | |
26862 | + : :"a" (eax), "c" (ecx), "d"(edx)); | |
26863 | +} | |
26864 | + | |
26865 | +static inline void __mwait(unsigned long eax, unsigned long ecx) | |
26866 | +{ | |
26867 | + /* "mwait %eax,%ecx;" */ | |
26868 | + asm volatile( | |
26869 | + ".byte 0x0f,0x01,0xc9;" | |
26870 | + : :"a" (eax), "c" (ecx)); | |
26871 | +} | |
26872 | + | |
26873 | +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) | |
26874 | +{ | |
26875 | + /* "mwait %eax,%ecx;" */ | |
26876 | + asm volatile( | |
26877 | + "sti; .byte 0x0f,0x01,0xc9;" | |
26878 | + : :"a" (eax), "c" (ecx)); | |
26879 | +} | |
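+/*
+ * Idle-loop sketch for the primitives above (assumes the usual pattern:
+ * the monitored word is the thread flags, so setting TIF_NEED_RESCHED
+ * wakes the CPU). MONITOR arms an address watch, and MWAIT then sleeps
+ * until that cache line is written or an interrupt arrives.
+ */
+static inline void example_mwait_idle(unsigned long hints)
+{
+	__monitor(&current_thread_info()->flags, 0, 0);	/* arm the watch */
+	smp_mb();					/* order vs. the re-check */
+	if (!need_resched())
+		__mwait(hints, 0);			/* sleep until written */
+}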
26880 | + | |
26881 | +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); | |
26882 | + | |
26883 | +extern int force_mwait; | |
26884 | + | |
26885 | +extern void select_idle_routine(const struct cpuinfo_x86 *c); | |
26886 | + | |
26887 | +extern unsigned long boot_option_idle_override; | |
26888 | + | |
26889 | +extern void enable_sep_cpu(void); | |
26890 | +extern int sysenter_setup(void); | |
26891 | + | |
26892 | +/* Defined in head.S */ | |
26893 | +extern struct desc_ptr early_gdt_descr; | |
26894 | + | |
26895 | +extern void cpu_set_gdt(int); | |
26896 | +extern void switch_to_new_gdt(void); | |
26897 | +extern void cpu_init(void); | |
26898 | +extern void init_gdt(int cpu); | |
26899 | + | |
26900 | +/* From the system description table in BIOS. Mostly for MCA use, but | |
26901 | + * others may find it useful. */ | |
26902 | +extern unsigned int machine_id; | |
26903 | +extern unsigned int machine_submodel_id; | |
26904 | +extern unsigned int BIOS_revision; | |
26905 | + | |
26906 | +/* Boot loader type from the setup header */ | |
26907 | +extern int bootloader_type; | |
26908 | + | |
26909 | +extern char ignore_fpu_irq; | |
26910 | +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) | |
26911 | + | |
26912 | +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 | |
26913 | +#define ARCH_HAS_PREFETCHW | |
26914 | +#define ARCH_HAS_SPINLOCK_PREFETCH | |
26915 | + | |
26916 | +#ifdef CONFIG_X86_32 | |
26917 | +#define BASE_PREFETCH ASM_NOP4 | |
26918 | +#define ARCH_HAS_PREFETCH | |
26919 | +#else | |
26920 | +#define BASE_PREFETCH "prefetcht0 (%1)" | |
26921 | +#endif | |
26922 | + | |
26923 | +/* Prefetch instructions for Pentium III and AMD Athlon */ | |
26924 | +/* It's not worth caring about 3dnow! prefetches for the K6 | |
26925 | + because they are microcoded there and very slow. | |
26926 | + However, we currently don't do prefetches for pre-XP Athlons; | |
26927 | + that should be fixed. */ | |
26928 | +static inline void prefetch(const void *x) | |
26929 | +{ | |
26930 | + alternative_input(BASE_PREFETCH, | |
26931 | + "prefetchnta (%1)", | |
26932 | + X86_FEATURE_XMM, | |
26933 | + "r" (x)); | |
26934 | +} | |
26935 | + | |
26936 | +/* 3dnow! prefetch to get an exclusive cache line. Useful for | |
26937 | + spinlocks to avoid one state transition in the cache coherency protocol. */ | |
26938 | +static inline void prefetchw(const void *x) | |
26939 | +{ | |
26940 | + alternative_input(BASE_PREFETCH, | |
26941 | + "prefetchw (%1)", | |
26942 | + X86_FEATURE_3DNOW, | |
26943 | + "r" (x)); | |
26944 | +} | |
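+/*
+ * Usage sketch for prefetch() above, with a hypothetical list type:
+ * start loading the next node while the current one is processed. The
+ * prefetch instructions never fault, so a NULL next pointer is harmless.
+ */
+struct example_node { struct example_node *next; int payload; };
+
+static inline int example_sum(struct example_node *p)
+{
+	int sum = 0;
+	while (p) {
+		prefetch(p->next);	/* overlap memory latency with work */
+		sum += p->payload;
+		p = p->next;
+	}
+	return sum;
+}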
26945 | + | |
26946 | +#define spin_lock_prefetch(x) prefetchw(x) | |
26947 | #ifdef CONFIG_X86_32 | |
26948 | -# include "processor_32.h" | |
26949 | +/* | |
26950 | + * User space process size: 3GB (default). | |
26951 | + */ | |
26952 | +#define TASK_SIZE (PAGE_OFFSET) | |
26953 | +#define STACK_TOP TASK_SIZE | |
26954 | +#define STACK_TOP_MAX STACK_TOP | |
26955 | + | |
26956 | +#define INIT_THREAD { \ | |
26957 | + .sp0 = sizeof(init_stack) + (long)&init_stack, \ | |
26958 | + .vm86_info = NULL, \ | |
26959 | + .sysenter_cs = __KERNEL_CS, \ | |
26960 | + .io_bitmap_ptr = NULL, \ | |
26961 | + .fs = __KERNEL_PERCPU, \ | |
26962 | +} | |
26963 | + | |
26964 | +/* | |
26965 | + * Note that the .io_bitmap member must be extra-big. This is because | |
26966 | + * the CPU will access an additional byte beyond the end of the IO | |
26967 | + * permission bitmap. The extra byte must be all 1 bits, and must | |
26968 | + * be within the limit. | |
26969 | + */ | |
26970 | +#define INIT_TSS { \ | |
26971 | + .x86_tss = { \ | |
26972 | + .sp0 = sizeof(init_stack) + (long)&init_stack, \ | |
26973 | + .ss0 = __KERNEL_DS, \ | |
26974 | + .ss1 = __KERNEL_CS, \ | |
26975 | + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ | |
26976 | + }, \ | |
26977 | + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ | |
26978 | +} | |
26979 | + | |
26980 | +#define start_thread(regs, new_eip, new_esp) do { \ | |
26981 | + __asm__("movl %0,%%gs": :"r" (0)); \ | |
26982 | + regs->fs = 0; \ | |
26983 | + set_fs(USER_DS); \ | |
26984 | + regs->ds = __USER_DS; \ | |
26985 | + regs->es = __USER_DS; \ | |
26986 | + regs->ss = __USER_DS; \ | |
26987 | + regs->cs = __USER_CS; \ | |
26988 | + regs->ip = new_eip; \ | |
26989 | + regs->sp = new_esp; \ | |
26990 | +} while (0) | |
26991 | + | |
26992 | + | |
26993 | +extern unsigned long thread_saved_pc(struct task_struct *tsk); | |
26994 | + | |
26995 | +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) | |
26996 | +#define KSTK_TOP(info) \ | |
26997 | +({ \ | |
26998 | + unsigned long *__ptr = (unsigned long *)(info); \ | |
26999 | + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ | |
27000 | +}) | |
27001 | + | |
27002 | +/* | |
27003 | + * The below -8 is to reserve 8 bytes on top of the ring0 stack. | |
27004 | + * This is necessary to guarantee that the entire "struct pt_regs" | |
27005 | + * is accessible even if the CPU hasn't stored the SS/ESP registers | |
27006 | + * on the stack (interrupt gate does not save these registers | |
27007 | + * when switching to the same priv ring). | |
27008 | + * Therefore beware: accessing the ss/esp fields of the | |
27009 | + * "struct pt_regs" is possible, but they may contain the | |
27010 | + * completely wrong values. | |
27011 | + */ | |
27012 | +#define task_pt_regs(task) \ | |
27013 | +({ \ | |
27014 | + struct pt_regs *__regs__; \ | |
27015 | + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ | |
27016 | + __regs__ - 1; \ | |
27017 | +}) | |
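+/*
+ * Layout sketch for the macros above (assuming an 8KB THREAD_SIZE):
+ * KSTK_TOP(stack) is stack + 8192; task_pt_regs() then points
+ * sizeof(struct pt_regs) below KSTK_TOP - 8, i.e. at the register frame
+ * pushed on kernel entry, with the 8 reserved bytes above it covering
+ * the possibly-unsaved SS/ESP slots.
+ */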
27018 | + | |
27019 | +#define KSTK_ESP(task) (task_pt_regs(task)->sp) | |
27020 | + | |
27021 | #else | |
27022 | -# include "processor_64.h" | |
27023 | +/* | |
27024 | + * User space process size. 47 bits minus one guard page. | |
27025 | + */ | |
27026 | +#define TASK_SIZE64 (0x800000000000UL - 4096) | |
27027 | + | |
27028 | +/* This decides where the kernel will search for a free chunk of vm | |
27029 | + * space during mmap's. | |
27030 | + */ | |
27031 | +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ | |
27032 | + 0xc0000000 : 0xFFFFe000) | |
27033 | + | |
27034 | +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ | |
27035 | + IA32_PAGE_OFFSET : TASK_SIZE64) | |
27036 | +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ | |
27037 | + IA32_PAGE_OFFSET : TASK_SIZE64) | |
27038 | + | |
27039 | +#define STACK_TOP TASK_SIZE | |
27040 | +#define STACK_TOP_MAX TASK_SIZE64 | |
27041 | + | |
27042 | +#define INIT_THREAD { \ | |
27043 | + .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ | |
27044 | +} | |
27045 | + | |
27046 | +#define INIT_TSS { \ | |
27047 | + .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ | |
27048 | +} | |
27049 | + | |
27050 | +#define start_thread(regs, new_rip, new_rsp) do { \ | |
27051 | + asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \ | |
27052 | + load_gs_index(0); \ | |
27053 | + (regs)->ip = (new_rip); \ | |
27054 | + (regs)->sp = (new_rsp); \ | |
27055 | + write_pda(oldrsp, (new_rsp)); \ | |
27056 | + (regs)->cs = __USER_CS; \ | |
27057 | + (regs)->ss = __USER_DS; \ | |
27058 | + (regs)->flags = 0x200; \ | |
27059 | + set_fs(USER_DS); \ | |
27060 | +} while (0) | |
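+/*
+ * Note on the constants above (sketch): 0x200 is X86_EFLAGS_IF, so a
+ * fresh user thread starts with interrupts enabled and no other flags
+ * set; write_pda(oldrsp, ...) seeds the saved user stack pointer that
+ * the syscall return path restores on the way back to user mode.
+ */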
27061 | + | |
27062 | +/* | |
27063 | + * Return saved PC of a blocked thread. | |
27064 | + * What is this good for? it will be always the scheduler or ret_from_fork. | |
27065 | + */ | |
27066 | +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) | |
27067 | + | |
27068 | +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) | |
27069 | +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ | |
27070 | +#endif /* CONFIG_X86_64 */ | |
27071 | + | |
27072 | +/* This decides where the kernel will search for a free chunk of vm | |
27073 | + * space during mmap's. | |
27074 | + */ | |
27075 | +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) | |
27076 | + | |
27077 | +#define KSTK_EIP(task) (task_pt_regs(task)->ip) | |
27078 | + | |
27079 | #endif | |
27080 | --- a/include/asm-x86/mach-xen/asm/segment_32.h | |
27081 | +++ /dev/null | |
27082 | @@ -1,150 +0,0 @@ | |
27083 | -#ifndef _ASM_SEGMENT_H | |
27084 | -#define _ASM_SEGMENT_H | |
27085 | - | |
27086 | -/* | |
27087 | - * The layout of the per-CPU GDT under Linux: | |
27088 | - * | |
27089 | - * 0 - null | |
27090 | - * 1 - reserved | |
27091 | - * 2 - reserved | |
27092 | - * 3 - reserved | |
27093 | - * | |
27094 | - * 4 - unused <==== new cacheline | |
27095 | - * 5 - unused | |
27096 | - * | |
27097 | - * ------- start of TLS (Thread-Local Storage) segments: | |
27098 | - * | |
27099 | - * 6 - TLS segment #1 [ glibc's TLS segment ] | |
27100 | - * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] | |
27101 | - * 8 - TLS segment #3 | |
27102 | - * 9 - reserved | |
27103 | - * 10 - reserved | |
27104 | - * 11 - reserved | |
27105 | - * | |
27106 | - * ------- start of kernel segments: | |
27107 | - * | |
27108 | - * 12 - kernel code segment <==== new cacheline | |
27109 | - * 13 - kernel data segment | |
27110 | - * 14 - default user CS | |
27111 | - * 15 - default user DS | |
27112 | - * 16 - TSS | |
27113 | - * 17 - LDT | |
27114 | - * 18 - PNPBIOS support (16->32 gate) | |
27115 | - * 19 - PNPBIOS support | |
27116 | - * 20 - PNPBIOS support | |
27117 | - * 21 - PNPBIOS support | |
27118 | - * 22 - PNPBIOS support | |
27119 | - * 23 - APM BIOS support | |
27120 | - * 24 - APM BIOS support | |
27121 | - * 25 - APM BIOS support | |
27122 | - * | |
27123 | - * 26 - ESPFIX small SS | |
27124 | - * 27 - per-cpu [ offset to per-cpu data area ] | |
27125 | - * 28 - unused | |
27126 | - * 29 - unused | |
27127 | - * 30 - unused | |
27128 | - * 31 - TSS for double fault handler | |
27129 | - */ | |
27130 | -#define GDT_ENTRY_TLS_ENTRIES 3 | |
27131 | -#define GDT_ENTRY_TLS_MIN 6 | |
27132 | -#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) | |
27133 | - | |
27134 | -#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) | |
27135 | - | |
27136 | -#define GDT_ENTRY_DEFAULT_USER_CS 14 | |
27137 | -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3) | |
27138 | - | |
27139 | -#define GDT_ENTRY_DEFAULT_USER_DS 15 | |
27140 | -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3) | |
27141 | - | |
27142 | -#define GDT_ENTRY_KERNEL_BASE 12 | |
27143 | - | |
27144 | -#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) | |
27145 | -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) | |
27146 | - | |
27147 | -#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) | |
27148 | -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) | |
27149 | - | |
27150 | -#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) | |
27151 | -#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5) | |
27152 | - | |
27153 | -#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) | |
27154 | -#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) | |
27155 | - | |
27156 | -#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) | |
27157 | -#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) | |
27158 | - | |
27159 | -#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) | |
27160 | -#ifdef CONFIG_SMP | |
27161 | -#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) | |
27162 | -#else | |
27163 | -#define __KERNEL_PERCPU 0 | |
27164 | -#endif | |
27165 | - | |
27166 | -#define GDT_ENTRY_DOUBLEFAULT_TSS 31 | |
27167 | - | |
27168 | -/* | |
27169 | - * The GDT has 32 entries | |
27170 | - */ | |
27171 | -#define GDT_ENTRIES 32 | |
27172 | -#define GDT_SIZE (GDT_ENTRIES * 8) | |
27173 | - | |
27174 | -/* Simple and small GDT entries for booting only */ | |
27175 | - | |
27176 | -#define GDT_ENTRY_BOOT_CS 2 | |
27177 | -#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) | |
27178 | - | |
27179 | -#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) | |
27180 | -#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) | |
27181 | - | |
27182 | -/* The PnP BIOS entries in the GDT */ | |
27183 | -#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0) | |
27184 | -#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1) | |
27185 | -#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2) | |
27186 | -#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3) | |
27187 | -#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4) | |
27188 | - | |
27189 | -/* The PnP BIOS selectors */ | |
27190 | -#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ | |
27191 | -#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ | |
27192 | -#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ | |
27193 | -#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ | |
27194 | -#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ | |
27195 | - | |
27196 | -/* | |
27197 | - * The interrupt descriptor table has room for 256 idt's, | |
27198 | - * the global descriptor table is dependent on the number | |
27199 | - * of tasks we can have.. | |
27200 | - */ | |
27201 | -#define IDT_ENTRIES 256 | |
27202 | - | |
27203 | -/* Bottom two bits of selector give the ring privilege level */ | |
27204 | -#define SEGMENT_RPL_MASK 0x3 | |
27205 | -/* Bit 2 is table indicator (LDT/GDT) */ | |
27206 | -#define SEGMENT_TI_MASK 0x4 | |
27207 | - | |
27208 | -/* User mode is privilege level 3 */ | |
27209 | -#define USER_RPL 0x3 | |
27210 | -/* LDT segment has TI set, GDT has it cleared */ | |
27211 | -#define SEGMENT_LDT 0x4 | |
27212 | -#define SEGMENT_GDT 0x0 | |
27213 | - | |
27214 | -#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) | |
27215 | - | |
27216 | -/* | |
27217 | - * Matching rules for certain types of segments. | |
27218 | - */ | |
27219 | - | |
27220 | -/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */ | |
27221 | -#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \ | |
27222 | - || ((x) & ~3) == (FLAT_KERNEL_CS & ~3)) | |
27223 | - | |
27224 | -/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */ | |
27225 | -#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \ | |
27226 | - || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \ | |
27227 | - || ((x) & ~3) == (FLAT_USER_CS & ~3)) | |
27228 | - | |
27229 | -/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ | |
27230 | -#define SEGMENT_IS_PNP_CODE(x) (((x) & ~0x0b) == GDT_ENTRY_PNPBIOS_BASE * 8) | |
27231 | - | |
27232 | -#endif | |
27233 | --- a/include/asm-x86/mach-xen/asm/segment.h | |
27234 | +++ b/include/asm-x86/mach-xen/asm/segment.h | |
27235 | @@ -1,5 +1,204 @@ | |
27236 | +#ifndef _ASM_X86_SEGMENT_H_ | |
27237 | +#define _ASM_X86_SEGMENT_H_ | |
27238 | + | |
27239 | +/* Simple and small GDT entries for booting only */ | |
27240 | + | |
27241 | +#define GDT_ENTRY_BOOT_CS 2 | |
27242 | +#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) | |
27243 | + | |
27244 | +#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) | |
27245 | +#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) | |
27246 | + | |
27247 | +#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) | |
27248 | +#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) | |
27249 | + | |
27250 | #ifdef CONFIG_X86_32 | |
27251 | -# include "segment_32.h" | |
27252 | +/* | |
27253 | + * The layout of the per-CPU GDT under Linux: | |
27254 | + * | |
27255 | + * 0 - null | |
27256 | + * 1 - reserved | |
27257 | + * 2 - reserved | |
27258 | + * 3 - reserved | |
27259 | + * | |
27260 | + * 4 - unused <==== new cacheline | |
27261 | + * 5 - unused | |
27262 | + * | |
27263 | + * ------- start of TLS (Thread-Local Storage) segments: | |
27264 | + * | |
27265 | + * 6 - TLS segment #1 [ glibc's TLS segment ] | |
27266 | + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] | |
27267 | + * 8 - TLS segment #3 | |
27268 | + * 9 - reserved | |
27269 | + * 10 - reserved | |
27270 | + * 11 - reserved | |
27271 | + * | |
27272 | + * ------- start of kernel segments: | |
27273 | + * | |
27274 | + * 12 - kernel code segment <==== new cacheline | |
27275 | + * 13 - kernel data segment | |
27276 | + * 14 - default user CS | |
27277 | + * 15 - default user DS | |
27278 | + * 16 - TSS | |
27279 | + * 17 - LDT | |
27280 | + * 18 - PNPBIOS support (16->32 gate) | |
27281 | + * 19 - PNPBIOS support | |
27282 | + * 20 - PNPBIOS support | |
27283 | + * 21 - PNPBIOS support | |
27284 | + * 22 - PNPBIOS support | |
27285 | + * 23 - APM BIOS support | |
27286 | + * 24 - APM BIOS support | |
27287 | + * 25 - APM BIOS support | |
27288 | + * | |
27289 | + * 26 - ESPFIX small SS | |
27290 | + * 27 - per-cpu [ offset to per-cpu data area ] | |
27291 | + * 28 - unused | |
27292 | + * 29 - unused | |
27293 | + * 30 - unused | |
27294 | + * 31 - TSS for double fault handler | |
27295 | + */ | |
27296 | +#define GDT_ENTRY_TLS_MIN 6 | |
27297 | +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) | |
27298 | + | |
27299 | +#define GDT_ENTRY_DEFAULT_USER_CS 14 | |
27300 | +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3) | |
27301 | + | |
27302 | +#define GDT_ENTRY_DEFAULT_USER_DS 15 | |
27303 | +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3) | |
27304 | + | |
27305 | +#define GDT_ENTRY_KERNEL_BASE 12 | |
27306 | + | |
27307 | +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) | |
27308 | +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) | |
27309 | + | |
27310 | +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) | |
27311 | +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) | |
27312 | + | |
27313 | +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) | |
27314 | +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5) | |
27315 | + | |
27316 | +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) | |
27317 | +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) | |
27318 | + | |
27319 | +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) | |
27320 | +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) | |
27321 | + | |
27322 | +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) | |
27323 | +#ifdef CONFIG_SMP | |
27324 | +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) | |
27325 | #else | |
27326 | -# include "../../segment_64.h" | |
27327 | +#define __KERNEL_PERCPU 0 | |
27328 | +#endif | |
27329 | + | |
27330 | +#define GDT_ENTRY_DOUBLEFAULT_TSS 31 | |
27331 | + | |
27332 | +/* | |
27333 | + * The GDT has 32 entries | |
27334 | + */ | |
27335 | +#define GDT_ENTRIES 32 | |
27336 | + | |
27337 | +/* The PnP BIOS entries in the GDT */ | |
27338 | +#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0) | |
27339 | +#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1) | |
27340 | +#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2) | |
27341 | +#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3) | |
27342 | +#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4) | |
27343 | + | |
27344 | +/* The PnP BIOS selectors */ | |
27345 | +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ | |
27346 | +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ | |
27347 | +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ | |
27348 | +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ | |
27349 | +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ | |
27350 | + | |
27351 | +/* Bottom two bits of selector give the ring privilege level */ | |
27352 | +#define SEGMENT_RPL_MASK 0x3 | |
27353 | +/* Bit 2 is table indicator (LDT/GDT) */ | |
27354 | +#define SEGMENT_TI_MASK 0x4 | |
27355 | + | |
27356 | +/* User mode is privilege level 3 */ | |
27357 | +#define USER_RPL 0x3 | |
27358 | +/* LDT segment has TI set, GDT has it cleared */ | |
27359 | +#define SEGMENT_LDT 0x4 | |
27360 | +#define SEGMENT_GDT 0x0 | |
27361 | + | |
27362 | +/* | |
27363 | + * Matching rules for certain types of segments. | |
27364 | + */ | |
27365 | + | |
27366 | +/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */ | |
27367 | +#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \ | |
27368 | + || ((x) & ~3) == (FLAT_KERNEL_CS & ~3)) | |
27369 | + | |
27370 | +/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */ | |
27371 | +#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \ | |
27372 | + || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \ | |
27373 | + || ((x) & ~3) == (FLAT_USER_CS & ~3)) | |
27374 | + | |
27375 | +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ | |
27376 | +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) | |
27377 | + | |
27378 | +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) | |
27379 | + | |
27380 | +#else | |
27381 | +#include <asm/cache.h> | |
27382 | + | |
27383 | +#define __KERNEL_CS 0x10 | |
27384 | +#define __KERNEL_DS 0x18 | |
27385 | + | |
27386 | +#define __KERNEL32_CS 0x08 | |
27387 | + | |
27388 | +/* | |
27389 | + * we cannot use the same code segment descriptor for user and kernel | |
27390 | + * -- not even in the long flat mode, because of different DPL /kkeil | |
27391 | + * The segment offset needs to contain a RPL. Grr. -AK | |
27392 | + * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) | |
27393 | + */ | |
27394 | + | |
27395 | +#define __USER32_CS 0x23 /* 4*8+3 */ | |
27396 | +#define __USER_DS 0x2b /* 5*8+3 */ | |
27397 | +#define __USER_CS 0x33 /* 6*8+3 */ | |
27398 | +#define __USER32_DS __USER_DS | |
27399 | + | |
27400 | +#define GDT_ENTRY_TSS 8 /* needs two entries */ | |
27401 | +#define GDT_ENTRY_LDT 10 /* needs two entries */ | |
27402 | +#define GDT_ENTRY_TLS_MIN 12 | |
27403 | +#define GDT_ENTRY_TLS_MAX 14 | |
27404 | + | |
27405 | +#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */ | |
27406 | +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3) | |
27407 | + | |
27408 | +/* TLS indexes for 64bit - hardcoded in arch_prctl */ | |
27409 | +#define FS_TLS 0 | |
27410 | +#define GS_TLS 1 | |
27411 | + | |
27412 | +#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) | |
27413 | +#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) | |
27414 | + | |
27415 | +#define GDT_ENTRIES 16 | |
27416 | + | |
27417 | +#endif | |
27418 | + | |
27419 | +/* User mode is privilege level 3 */ | |
27420 | +#define USER_RPL 0x3 | |
27421 | +/* LDT segment has TI set, GDT has it cleared */ | |
27422 | +#define SEGMENT_LDT 0x4 | |
27423 | +#define SEGMENT_GDT 0x0 | |
27424 | + | |
27425 | +/* Bottom two bits of selector give the ring privilege level */ | |
27426 | +#define SEGMENT_RPL_MASK 0x3 | |
27427 | +/* Bit 2 is table indicator (LDT/GDT) */ | |
27428 | +#define SEGMENT_TI_MASK 0x4 | |
27429 | + | |
27430 | +#define IDT_ENTRIES 256 | |
27431 | +#define GDT_SIZE (GDT_ENTRIES * 8) | |
27432 | +#define GDT_ENTRY_TLS_ENTRIES 3 | |
27433 | +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) | |
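+/*
+ * Selector decoding example using the masks above (sketch): the 64-bit
+ * __USER_CS == 0x33 is (6 << 3) | SEGMENT_GDT | USER_RPL, i.e. GDT
+ * index 6 with requested privilege level 3; (0x33 & SEGMENT_TI_MASK)
+ * == 0 (GDT) and (0x33 & SEGMENT_RPL_MASK) == 3.
+ */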
27434 | + | |
27435 | +#ifdef __KERNEL__ | |
27436 | +#ifndef __ASSEMBLY__ | |
27437 | +extern const char early_idt_handlers[IDT_ENTRIES][10]; | |
27438 | +#endif | |
27439 | +#endif | |
27440 | + | |
27441 | #endif | |
27442 | --- a/include/asm-x86/mach-xen/asm/smp_32.h | |
27443 | +++ b/include/asm-x86/mach-xen/asm/smp_32.h | |
27444 | @@ -1,56 +1,51 @@ | |
27445 | #ifndef __ASM_SMP_H | |
27446 | #define __ASM_SMP_H | |
27447 | ||
27448 | +#ifndef __ASSEMBLY__ | |
27449 | +#include <linux/cpumask.h> | |
27450 | +#include <linux/init.h> | |
27451 | + | |
27452 | /* | |
27453 | * We need the APIC definitions automatically as part of 'smp.h' | |
27454 | */ | |
27455 | -#ifndef __ASSEMBLY__ | |
27456 | -#include <linux/kernel.h> | |
27457 | -#include <linux/threads.h> | |
27458 | -#include <linux/cpumask.h> | |
27459 | +#ifdef CONFIG_X86_LOCAL_APIC | |
27460 | +# include <asm/mpspec.h> | |
27461 | +# include <asm/apic.h> | |
27462 | +# ifdef CONFIG_X86_IO_APIC | |
27463 | +# include <asm/io_apic.h> | |
27464 | +# endif | |
27465 | #endif | |
27466 | ||
27467 | -#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__) | |
27468 | -#include <linux/bitops.h> | |
27469 | -#include <asm/mpspec.h> | |
27470 | -#include <asm/apic.h> | |
27471 | -#ifdef CONFIG_X86_IO_APIC | |
27472 | -#include <asm/io_apic.h> | |
27473 | -#endif | |
27474 | -#endif | |
27475 | +#define cpu_callout_map cpu_possible_map | |
27476 | +#define cpu_callin_map cpu_possible_map | |
27477 | ||
27478 | -#define BAD_APICID 0xFFu | |
27479 | -#ifdef CONFIG_SMP | |
27480 | -#ifndef __ASSEMBLY__ | |
27481 | +extern int smp_num_siblings; | |
27482 | +extern unsigned int num_processors; | |
27483 | ||
27484 | -/* | |
27485 | - * Private routines/data | |
27486 | - */ | |
27487 | - | |
27488 | extern void smp_alloc_memory(void); | |
27489 | -extern int pic_mode; | |
27490 | -extern int smp_num_siblings; | |
27491 | -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); | |
27492 | -DECLARE_PER_CPU(cpumask_t, cpu_core_map); | |
27493 | +extern void lock_ipi_call_lock(void); | |
27494 | +extern void unlock_ipi_call_lock(void); | |
27495 | ||
27496 | extern void (*mtrr_hook) (void); | |
27497 | extern void zap_low_mappings (void); | |
27498 | -extern void lock_ipi_call_lock(void); | |
27499 | -extern void unlock_ipi_call_lock(void); | |
27500 | ||
27501 | -#define MAX_APICID 256 | |
27502 | -extern u8 __initdata x86_cpu_to_apicid_init[]; | |
27503 | -extern void *x86_cpu_to_apicid_ptr; | |
27504 | +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); | |
27505 | +DECLARE_PER_CPU(cpumask_t, cpu_core_map); | |
27506 | +DECLARE_PER_CPU(u8, cpu_llc_id); | |
27507 | DECLARE_PER_CPU(u8, x86_cpu_to_apicid); | |
27508 | ||
27509 | -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) | |
27510 | - | |
27511 | #ifdef CONFIG_HOTPLUG_CPU | |
27512 | extern void cpu_exit_clear(void); | |
27513 | extern void cpu_uninit(void); | |
27514 | #endif | |
27515 | ||
27516 | +#ifdef CONFIG_SMP | |
27517 | + | |
27518 | #ifndef CONFIG_XEN | |
27519 | + | |
27520 | +/* Globals due to paravirt */ | |
27521 | +extern void set_cpu_sibling_map(int cpu); | |
27522 | + | |
27523 | struct smp_ops | |
27524 | { | |
27525 | void (*smp_prepare_boot_cpu)(void); | |
27526 | @@ -104,11 +99,11 @@ void native_smp_prepare_cpus(unsigned in | |
27527 | int native_cpu_up(unsigned int cpunum); | |
27528 | void native_smp_cpus_done(unsigned int max_cpus); | |
27529 | ||
27530 | -#define startup_ipi_hook(phys_apicid, start_eip, start_esp) \ | |
27531 | -do { } while (0) | |
27532 | - | |
27533 | -#else | |
27534 | +#ifndef CONFIG_PARAVIRT | |
27535 | +#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0) | |
27536 | +#endif | |
27537 | ||
27538 | +#else /* CONFIG_XEN */ | |
27539 | ||
27540 | void xen_smp_send_stop(void); | |
27541 | void xen_smp_send_reschedule(int cpu); | |
27542 | @@ -120,7 +115,12 @@ int xen_smp_call_function_mask(cpumask_t | |
27543 | #define smp_send_reschedule xen_smp_send_reschedule | |
27544 | #define smp_call_function_mask xen_smp_call_function_mask | |
27545 | ||
27546 | -#endif | |
27547 | +extern void prefill_possible_map(void); | |
27548 | + | |
27549 | +#endif /* CONFIG_XEN */ | |
27550 | + | |
27551 | +extern int __cpu_disable(void); | |
27552 | +extern void __cpu_die(unsigned int cpu); | |
27553 | ||
27554 | /* | |
27555 | * This function is needed by all SMP systems. It must _always_ be valid | |
27556 | @@ -130,64 +130,49 @@ int xen_smp_call_function_mask(cpumask_t | |
27557 | DECLARE_PER_CPU(int, cpu_number); | |
27558 | #define raw_smp_processor_id() (x86_read_percpu(cpu_number)) | |
27559 | ||
27560 | -extern cpumask_t cpu_possible_map; | |
27561 | -#define cpu_callin_map cpu_possible_map | |
27562 | +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) | |
27563 | + | |
27564 | +#define safe_smp_processor_id() smp_processor_id() | |
27565 | ||
27566 | /* We don't mark CPUs online until __cpu_up(), so we need another measure */ | |
27567 | static inline int num_booting_cpus(void) | |
27568 | { | |
27569 | - return cpus_weight(cpu_possible_map); | |
27570 | + return cpus_weight(cpu_callout_map); | |
27571 | } | |
27572 | ||
27573 | -#define safe_smp_processor_id() smp_processor_id() | |
27574 | -extern int __cpu_disable(void); | |
27575 | -extern void __cpu_die(unsigned int cpu); | |
27576 | -extern void prefill_possible_map(void); | |
27577 | -extern unsigned int num_processors; | |
27578 | - | |
27579 | -#endif /* !__ASSEMBLY__ */ | |
27580 | - | |
27581 | #else /* CONFIG_SMP */ | |
27582 | ||
27583 | #define safe_smp_processor_id() 0 | |
27584 | #define cpu_physical_id(cpu) boot_cpu_physical_apicid | |
27585 | ||
27586 | -#define NO_PROC_ID 0xFF /* No processor magic marker */ | |
27587 | - | |
27588 | -#endif /* CONFIG_SMP */ | |
27589 | - | |
27590 | -#ifndef __ASSEMBLY__ | |
27591 | +#endif /* !CONFIG_SMP */ | |
27592 | ||
27593 | #ifdef CONFIG_X86_LOCAL_APIC | |
27594 | ||
27595 | -#ifdef APIC_DEFINITION | |
27596 | +static __inline int logical_smp_processor_id(void) | |
27597 | +{ | |
27598 | + /* we don't want to mark this access volatile - bad code generation */ | |
27599 | + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); | |
27600 | +} | |
27601 | + | |
27602 | +# ifdef APIC_DEFINITION | |
27603 | extern int hard_smp_processor_id(void); | |
27604 | -#else | |
27605 | -#include <mach_apicdef.h> | |
27606 | +# else | |
27607 | +# include <mach_apicdef.h> | |
27608 | static inline int hard_smp_processor_id(void) | |
27609 | { | |
27610 | /* we don't want to mark this access volatile - bad code generation */ | |
27611 | - return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); | |
27612 | + return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID)); | |
27613 | } | |
27614 | -#endif /* APIC_DEFINITION */ | |
27615 | +# endif /* APIC_DEFINITION */ | |
27616 | ||
27617 | #else /* CONFIG_X86_LOCAL_APIC */ | |
27618 | ||
27619 | -#ifndef CONFIG_SMP | |
27620 | -#define hard_smp_processor_id() 0 | |
27621 | -#endif | |
27622 | +# ifndef CONFIG_SMP | |
27623 | +# define hard_smp_processor_id() 0 | |
27624 | +# endif | |
27625 | ||
27626 | #endif /* CONFIG_X86_LOCAL_APIC */ | |
27627 | ||
27628 | -extern u8 apicid_2_node[]; | |
27629 | - | |
27630 | -#ifdef CONFIG_X86_LOCAL_APIC | |
27631 | -static __inline int logical_smp_processor_id(void) | |
27632 | -{ | |
27633 | - /* we don't want to mark this access volatile - bad code generation */ | |
27634 | - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); | |
27635 | -} | |
27636 | -#endif | |
27637 | -#endif | |
27638 | - | |
27639 | +#endif /* !__ASSEMBLY__ */ | |
27640 | #endif | |
27641 | --- a/include/asm-x86/mach-xen/asm/smp_64.h | |
27642 | +++ b/include/asm-x86/mach-xen/asm/smp_64.h | |
27643 | @@ -1,139 +1,103 @@ | |
27644 | #ifndef __ASM_SMP_H | |
27645 | #define __ASM_SMP_H | |
27646 | ||
27647 | -/* | |
27648 | - * We need the APIC definitions automatically as part of 'smp.h' | |
27649 | - */ | |
27650 | -#include <linux/threads.h> | |
27651 | #include <linux/cpumask.h> | |
27652 | -#include <linux/bitops.h> | |
27653 | #include <linux/init.h> | |
27654 | -extern int disable_apic; | |
27655 | ||
27656 | #ifdef CONFIG_X86_LOCAL_APIC | |
27657 | -#include <asm/mpspec.h> | |
27658 | +/* | |
27659 | + * We need the APIC definitions automatically as part of 'smp.h' | |
27660 | + */ | |
27661 | #include <asm/apic.h> | |
27662 | #ifdef CONFIG_X86_IO_APIC | |
27663 | #include <asm/io_apic.h> | |
27664 | #endif | |
27665 | -#include <asm/thread_info.h> | |
27666 | +#include <asm/mpspec.h> | |
27667 | #endif | |
27668 | - | |
27669 | -#ifdef CONFIG_SMP | |
27670 | - | |
27671 | #include <asm/pda.h> | |
27672 | +#include <asm/thread_info.h> | |
27673 | ||
27674 | -struct pt_regs; | |
27675 | - | |
27676 | -extern cpumask_t cpu_present_mask; | |
27677 | -extern cpumask_t cpu_possible_map; | |
27678 | -extern cpumask_t cpu_online_map; | |
27679 | extern cpumask_t cpu_initialized; | |
27680 | ||
27681 | -/* | |
27682 | - * Private routines/data | |
27683 | - */ | |
27684 | - | |
27685 | +extern int smp_num_siblings; | |
27686 | +extern unsigned int num_processors; | |
27687 | + | |
27688 | extern void smp_alloc_memory(void); | |
27689 | -extern volatile unsigned long smp_invalidate_needed; | |
27690 | extern void lock_ipi_call_lock(void); | |
27691 | extern void unlock_ipi_call_lock(void); | |
27692 | -extern int smp_num_siblings; | |
27693 | -extern void smp_send_reschedule(int cpu); | |
27694 | + | |
27695 | extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *), | |
27696 | void *info, int wait); | |
27697 | ||
27698 | -/* | |
27699 | - * cpu_sibling_map and cpu_core_map now live | |
27700 | - * in the per cpu area | |
27701 | - * | |
27702 | - * extern cpumask_t cpu_sibling_map[NR_CPUS]; | |
27703 | - * extern cpumask_t cpu_core_map[NR_CPUS]; | |
27704 | - */ | |
27705 | DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); | |
27706 | DECLARE_PER_CPU(cpumask_t, cpu_core_map); | |
27707 | -DECLARE_PER_CPU(u8, cpu_llc_id); | |
27708 | - | |
27709 | -#define SMP_TRAMPOLINE_BASE 0x6000 | |
27710 | +DECLARE_PER_CPU(u16, cpu_llc_id); | |
27711 | +DECLARE_PER_CPU(u16, x86_cpu_to_apicid); | |
27712 | +DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); | |
27713 | ||
27714 | -/* | |
27715 | - * On x86 all CPUs are mapped 1:1 to the APIC space. | |
27716 | - * This simplifies scheduling and IPI sending and | |
27717 | - * compresses data structures. | |
27718 | - */ | |
27719 | - | |
27720 | -static inline int num_booting_cpus(void) | |
27721 | +#ifdef CONFIG_X86_LOCAL_APIC | |
27722 | +static inline int cpu_present_to_apicid(int mps_cpu) | |
27723 | { | |
27724 | - return cpus_weight(cpu_possible_map); | |
27725 | + if (cpu_present(mps_cpu)) | |
27726 | + return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); | |
27727 | + else | |
27728 | + return BAD_APICID; | |
27729 | } | |
27730 | +#endif | |
27731 | ||
27732 | -#define raw_smp_processor_id() read_pda(cpunumber) | |
27733 | +#ifdef CONFIG_SMP | |
27734 | + | |
27735 | +#define SMP_TRAMPOLINE_BASE 0x6000 | |
27736 | ||
27737 | extern int __cpu_disable(void); | |
27738 | extern void __cpu_die(unsigned int cpu); | |
27739 | extern void prefill_possible_map(void); | |
27740 | -extern unsigned num_processors; | |
27741 | extern unsigned __cpuinitdata disabled_cpus; | |
27742 | ||
27743 | -#define NO_PROC_ID 0xFF /* No processor magic marker */ | |
27744 | - | |
27745 | -#endif /* CONFIG_SMP */ | |
27746 | +#define raw_smp_processor_id() read_pda(cpunumber) | |
27747 | +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) | |
27748 | ||
27749 | -#define safe_smp_processor_id() smp_processor_id() | |
27750 | - | |
27751 | -#ifdef CONFIG_X86_LOCAL_APIC | |
27752 | -static inline int hard_smp_processor_id(void) | |
27753 | -{ | |
27754 | - /* we don't want to mark this access volatile - bad code generation */ | |
27755 | - return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); | |
27756 | -} | |
27757 | -#endif | |
27758 | +#define stack_smp_processor_id() \ | |
27759 | + ({ \ | |
27760 | + struct thread_info *ti; \ | |
27761 | + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ | |
27762 | + ti->cpu; \ | |
27763 | +}) | |
27764 | ||
27765 | /* | |
27766 | - * Some lowlevel functions might want to know about | |
27767 | - * the real APIC ID <-> CPU # mapping. | |
27768 | + * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies | |
27769 | + * scheduling and IPI sending and compresses data structures. | |
27770 | */ | |
27771 | -extern u8 __initdata x86_cpu_to_apicid_init[]; | |
27772 | -extern void *x86_cpu_to_apicid_ptr; | |
27773 | -DECLARE_PER_CPU(u8, x86_cpu_to_apicid); /* physical ID */ | |
27774 | -extern u8 bios_cpu_apicid[]; | |
27775 | - | |
27776 | -#ifdef CONFIG_X86_LOCAL_APIC | |
27777 | -static inline int cpu_present_to_apicid(int mps_cpu) | |
27778 | +static inline int num_booting_cpus(void) | |
27779 | { | |
27780 | - if (mps_cpu < NR_CPUS) | |
27781 | - return (int)bios_cpu_apicid[mps_cpu]; | |
27782 | - else | |
27783 | - return BAD_APICID; | |
27784 | + return cpus_weight(cpu_possible_map); | |
27785 | } | |
27786 | -#endif | |
27787 | ||
27788 | -#ifndef CONFIG_SMP | |
27789 | +extern void smp_send_reschedule(int cpu); | |
27790 | + | |
27791 | +#else /* CONFIG_SMP */ | |
27792 | + | |
27793 | +extern unsigned int boot_cpu_id; | |
27794 | +#define cpu_physical_id(cpu) boot_cpu_id | |
27795 | #define stack_smp_processor_id() 0 | |
27796 | -#define cpu_logical_map(x) (x) | |
27797 | -#else | |
27798 | -#include <asm/thread_info.h> | |
27799 | -#define stack_smp_processor_id() \ | |
27800 | -({ \ | |
27801 | - struct thread_info *ti; \ | |
27802 | - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ | |
27803 | - ti->cpu; \ | |
27804 | -}) | |
27805 | -#endif | |
27806 | + | |
27807 | +#endif /* !CONFIG_SMP */ | |
27808 | + | |
27809 | +#define safe_smp_processor_id() smp_processor_id() | |
27810 | ||
27811 | #ifdef CONFIG_X86_LOCAL_APIC | |
27812 | static __inline int logical_smp_processor_id(void) | |
27813 | { | |
27814 | /* we don't want to mark this access volatile - bad code generation */ | |
27815 | - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); | |
27816 | + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); | |
27817 | +} | |
27818 | + | |
27819 | +static inline int hard_smp_processor_id(void) | |
27820 | +{ | |
27821 | + /* we don't want to mark this access volatile - bad code generation */ | |
27822 | + return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID)); | |
27823 | } | |
27824 | #endif | |
27825 | ||
27826 | -#ifdef CONFIG_SMP | |
27827 | -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) | |
27828 | -#else | |
27829 | -extern unsigned int boot_cpu_id; | |
27830 | -#define cpu_physical_id(cpu) boot_cpu_id | |
27831 | -#endif /* !CONFIG_SMP */ | |
27832 | #endif | |
27833 | ||
27834 | --- /dev/null | |
27835 | +++ b/include/asm-x86/mach-xen/asm/spinlock.h | |
27836 | @@ -0,0 +1,333 @@ | |
27837 | +#ifndef _X86_SPINLOCK_H_ | |
27838 | +#define _X86_SPINLOCK_H_ | |
27839 | + | |
27840 | +#include <asm/atomic.h> | |
27841 | +#include <asm/rwlock.h> | |
27842 | +#include <asm/page.h> | |
27843 | +#include <asm/processor.h> | |
27844 | +#include <linux/compiler.h> | |
27845 | + | |
27846 | +/* | |
27847 | + * Your basic SMP spinlocks, allowing only a single CPU anywhere | |
27848 | + * | |
27849 | + * Simple spin lock operations. There are two variants, one clears IRQs | |
27850 | + * on the local processor, one does not. | |
27851 | + * | |
27852 | + * These are fair FIFO ticket locks, which are currently limited to 256 | |
27853 | + * CPUs. | |
27854 | + * | |
27855 | + * (the type definitions are in asm/spinlock_types.h) | |
27856 | + */ | |
27857 | + | |
27858 | +#ifdef CONFIG_X86_32 | |
27859 | +# define LOCK_PTR_REG "a" | |
27860 | +# define REG_PTR_MODE "k" | |
27861 | +#else | |
27862 | +# define LOCK_PTR_REG "D" | |
27863 | +# define REG_PTR_MODE "q" | |
27864 | +#endif | |
27865 | + | |
27866 | +#if defined(CONFIG_X86_32) && \ | |
27867 | + (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)) | |
27868 | +/* | |
27869 | + * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock | |
27870 | + * (PPro errata 66, 92) | |
27871 | + */ | |
27872 | +# define UNLOCK_LOCK_PREFIX LOCK_PREFIX | |
27873 | +#else | |
27874 | +# define UNLOCK_LOCK_PREFIX | |
27875 | +#endif | |
27876 | + | |
27877 | +int xen_spinlock_init(unsigned int cpu); | |
27878 | +void xen_spinlock_cleanup(unsigned int cpu); | |
27879 | +extern int xen_spin_wait(raw_spinlock_t *, unsigned int token); | |
27880 | +extern int xen_spin_wait_flags(raw_spinlock_t *, unsigned int *token, | |
27881 | + unsigned int flags); | |
27882 | +extern unsigned int xen_spin_adjust(raw_spinlock_t *, unsigned int token); | |
27883 | +extern void xen_spin_kick(raw_spinlock_t *, unsigned int token); | |
27884 | + | |
27885 | +/* | |
27886 | + * Ticket locks are conceptually two parts, one indicating the current head of | |
27887 | + * the queue, and the other indicating the current tail. The lock is acquired | |
27888 | + * by atomically noting the tail and incrementing it by one (thus adding | |
27889 | + * ourselves to the queue and noting our position), then waiting until the head | |
27890 | + * becomes equal to the initial value of the tail. | |
27891 | + * | |
27892 | + * We use an xadd covering *both* parts of the lock, to increment the tail and | |
27893 | + * also load the position of the head, which takes care of memory ordering | |
27894 | + * issues and should be optimal for the uncontended case. Note the tail must be | |
27895 | + * in the high part, because a wide xadd increment of the low part would carry | |
27896 | + * up and contaminate the high part. | |
27897 | + * | |
27898 | + * With fewer than 2^8 possible CPUs, we can use x86's partial registers to | |
27899 | + * save some instructions and make the code more elegant. There really isn't | |
27900 | + * much between them in performance though, especially as locks are out of line. | |
27901 | + */ | |
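+/*
+ * Encoding example for the NR_CPUS < 256 case below (sketch): slock
+ * keeps the head ticket in its low byte and the tail in its high byte.
+ * The xaddw of 0x0100 atomically takes ticket N (the old tail) and
+ * advances the tail to N+1; the taker owns the lock once head == N.
+ * E.g. slock == 0x0403 means head 3, tail 4: one holder, no waiters.
+ */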
27902 | +#if (NR_CPUS < 256) | |
27903 | +#define TICKET_SHIFT 8 | |
27904 | +#define __raw_spin_lock_preamble \ | |
27905 | + asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \ | |
27906 | + "cmpb %h0, %b0\n\t" \ | |
27907 | + "sete %1" \ | |
27908 | + : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \ | |
27909 | + : "0" (0x0100) \ | |
27910 | + : "memory", "cc") | |
27911 | +#define __raw_spin_lock_body \ | |
27912 | + asm("1:\t" \ | |
27913 | + "cmpb %h0, %b0\n\t" \ | |
27914 | + "je 2f\n\t" \ | |
27915 | + "decl %1\n\t" \ | |
27916 | + "jz 2f\n\t" \ | |
27917 | + "rep ; nop\n\t" \ | |
27918 | + "movb %2, %b0\n\t" \ | |
27919 | + /* don't need lfence here, because loads are in-order */ \ | |
27920 | + "jmp 1b\n" \ | |
27921 | + "2:" \ | |
27922 | + : "+Q" (token), "+g" (count) \ | |
27923 | + : "m" (lock->slock) \ | |
27924 | + : "memory", "cc") | |
27925 | + | |
27926 | + | |
27927 | +static inline int __raw_spin_trylock(raw_spinlock_t *lock) | |
27928 | +{ | |
27929 | + int tmp, new; | |
27930 | + | |
27931 | + asm("movzwl %2, %0\n\t" | |
27932 | + "cmpb %h0, %b0\n\t" | |
27933 | + "leal 0x100(%" REG_PTR_MODE "0), %1\n\t" | |
27934 | + "jne 1f\n\t" | |
27935 | + LOCK_PREFIX "cmpxchgw %w1, %2\n\t" | |
27936 | + "1:\t" | |
27937 | + "sete %b1\n\t" | |
27938 | + "movzbl %b1, %0\n\t" | |
27939 | + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) | |
27940 | + : | |
27941 | + : "memory", "cc"); | |
27942 | + | |
27943 | + return tmp; | |
27944 | +} | |
27945 | + | |
27946 | +static inline void __raw_spin_unlock(raw_spinlock_t *lock) | |
27947 | +{ | |
27948 | + unsigned int token; | |
27949 | + unsigned char kick; | |
27950 | + | |
27951 | + asm(UNLOCK_LOCK_PREFIX "incb %2\n\t" | |
27952 | + "movzwl %2, %0\n\t" | |
27953 | + "cmpb %h0, %b0\n\t" | |
27954 | + "setne %1" | |
27955 | + : "=&Q" (token), "=qm" (kick), "+m" (lock->slock) | |
27956 | + : | |
27957 | + : "memory", "cc"); | |
27958 | + if (kick) | |
27959 | + xen_spin_kick(lock, token); | |
27960 | +} | |
27961 | +#else | |
27962 | +#define TICKET_SHIFT 16 | |
27963 | +#define __raw_spin_lock_preamble \ | |
27964 | + do { \ | |
27965 | + unsigned int tmp; \ | |
27966 | + asm(LOCK_PREFIX "xaddl %0, %2\n\t" \ | |
27967 | + "shldl $16, %0, %3\n\t" \ | |
27968 | + "cmpw %w3, %w0\n\t" \ | |
27969 | + "sete %1" | |
27970 | + : "=&r" (token), "=qm" (free), "+m" (lock->slock), \ | |
27971 | + "=&g" (tmp) \ | |
27972 | + : "0" (0x00010000) \ | |
27973 | + : "memory", "cc"); \ | |
27974 | + } while (0) | |
27975 | +#define __raw_spin_lock_body \ | |
27976 | + do { \ | |
27977 | + unsigned int tmp; \ | |
27978 | + asm("shldl $16, %0, %2\n" \ | |
27979 | + "1:\t" \ | |
27980 | + "cmpw %w2, %w0\n\t" \ | |
27981 | + "je 2f\n\t" \ | |
27982 | + "decl %1\n\t" \ | |
27983 | + "jz 2f\n\t" \ | |
27984 | + "rep ; nop\n\t" \ | |
27985 | + "movw %3, %w0\n\t" \ | |
27986 | + /* don't need lfence here, because loads are in-order */ \ | |
27987 | + "jmp 1b\n" \ | |
27988 | + "2:" \ | |
27989 | + : "+r" (token), "+g" (count), "=&g" (tmp) \ | |
27990 | + : "m" (lock->slock) \ | |
27991 | + : "memory", "cc"); \ | |
27992 | + } while (0) | |
27993 | + | |
27994 | +static inline int __raw_spin_trylock(raw_spinlock_t *lock) | |
27995 | +{ | |
27996 | + int tmp; | |
27997 | + int new; | |
27998 | + | |
27999 | + asm("movl %2, %0\n\t" | |
28000 | + "movl %0, %1\n\t" | |
28001 | + "roll $16, %0\n\t" | |
28002 | + "cmpl %0, %1\n\t" | |
28003 | + "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t" | |
28004 | + "jne 1f\n\t" | |
28005 | + LOCK_PREFIX "cmpxchgl %1, %2\n" | |
28006 | + "1:\t" | |
28007 | + "sete %b1\n\t" | |
28008 | + "movzbl %b1, %0\n\t" | |
28009 | + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) | |
28010 | + : | |
28011 | + : "memory", "cc"); | |
28012 | + | |
28013 | + return tmp; | |
28014 | +} | |
28015 | + | |
28016 | +static inline void __raw_spin_unlock(raw_spinlock_t *lock) | |
28017 | +{ | |
28018 | + unsigned int token, tmp; | |
28019 | + bool kick; | |
28020 | + | |
28021 | + asm(UNLOCK_LOCK_PREFIX "incw %2\n\t" | |
28022 | + "movl %2, %0\n\t" | |
28023 | + "shldl $16, %0, %3\n\t" | |
28024 | + "cmpw %w3, %w0\n\t" | |
28025 | + "setne %1" | |
28026 | + : "=&r" (token), "=qm" (kick), "+m" (lock->slock), "=&r" (tmp) | |
28027 | + : | |
28028 | + : "memory", "cc"); | |
28029 | + if (kick) | |
28030 | + xen_spin_kick(lock, token); | |
28031 | +} | |
28032 | +#endif | |
28033 | + | |
28034 | +static inline int __raw_spin_is_locked(raw_spinlock_t *lock) | |
28035 | +{ | |
28036 | + int tmp = *(volatile signed int *)(&(lock)->slock); | |
28037 | + | |
28038 | + return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); | |
28039 | +} | |
28040 | + | |
28041 | +static inline int __raw_spin_is_contended(raw_spinlock_t *lock) | |
28042 | +{ | |
28043 | + int tmp = *(volatile signed int *)(&(lock)->slock); | |
28044 | + | |
28045 | + return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; | |
28046 | +} | |
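
A worked value for the two predicates above, as a sketch (the lock word is invented for illustration):

/* Sketch only: worked example with TICKET_SHIFT == 8 and an invented
 * lock word of 0x0503, i.e. tail == 5, head == 3. */
static void demo_predicates(void)
{
	int tmp = 0x0503;
	/* (5 ^ 3) & 0xff == 6, nonzero: the lock is held */
	int locked = !!(((tmp >> 8) ^ tmp) & ((1 << 8) - 1));
	/* (5 - 3) & 0xff == 2 > 1: an owner plus at least one waiter */
	int contended = (((tmp >> 8) - tmp) & ((1 << 8) - 1)) > 1;

	(void)locked;
	(void)contended;
}
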
28047 | + | |
28048 | +static inline void __raw_spin_lock(raw_spinlock_t *lock) | |
28049 | +{ | |
28050 | + unsigned int token, count; | |
28051 | + bool free; | |
28052 | + | |
28053 | + __raw_spin_lock_preamble; | |
28054 | + if (unlikely(!free)) | |
28055 | + token = xen_spin_adjust(lock, token); | |
28056 | + do { | |
28057 | + count = 1 << 10; | |
28058 | + __raw_spin_lock_body; | |
28059 | + } while (unlikely(!count) && !xen_spin_wait(lock, token)); | |
28060 | +} | |
28061 | + | |
28062 | +static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, | |
28063 | + unsigned long flags) | |
28064 | +{ | |
28065 | + unsigned int token, count; | |
28066 | + bool free; | |
28067 | + | |
28068 | + __raw_spin_lock_preamble; | |
28069 | + if (unlikely(!free)) | |
28070 | + token = xen_spin_adjust(lock, token); | |
28071 | + do { | |
28072 | + count = 1 << 10; | |
28073 | + __raw_spin_lock_body; | |
28074 | + } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags)); | |
28075 | +} | |
28076 | + | |
28077 | +static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) | |
28078 | +{ | |
28079 | + while (__raw_spin_is_locked(lock)) | |
28080 | + cpu_relax(); | |
28081 | +} | |
28082 | + | |
28083 | +/* | |
28084 | + * Read-write spinlocks, allowing multiple readers | |
28085 | + * but only one writer. | |
28086 | + * | |
28087 | + * NOTE! it is quite common to have readers in interrupts | |
28088 | + * but no interrupt writers. For those circumstances we | |
28089 | + * can "mix" irq-safe locks - any writer needs to get a | |
28090 | + * irq-safe write-lock, but readers can get non-irqsafe | |
28091 | + * read-locks. | |
28092 | + * | |
28093 | + * On x86, we implement read-write locks as a 32-bit counter | |
28094 | + * with the high bit (sign) being the "contended" bit. | |
28095 | + */ | |
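
In sketch form, assuming the conventional bias value 0x01000000 for RW_LOCK_BIAS (illustrative, not part of the patch; the real subtractions are LOCK-prefixed):

/* Sketch only: the rwlock word starts at RW_LOCK_BIAS (assumed to be
 * 0x01000000).  Readers subtract 1, a writer subtracts the whole bias;
 * the real code does this with LOCK-prefixed subl, not plain C. */
#define DEMO_RW_LOCK_BIAS 0x01000000

static int demo_read_lock(int *counter)
{
	*counter -= 1;			/* LOCK subl $1,(%0) */
	return *counter >= 0;		/* "jns": no writer holds the lock */
}

static int demo_write_lock(int *counter)
{
	*counter -= DEMO_RW_LOCK_BIAS;	/* LOCK subl RW_LOCK_BIAS,(%0) */
	return *counter == 0;		/* "jz": no readers or writer */
}
/* On failure the real lock paths branch to __read_lock_failed /
 * __write_lock_failed, which undo the subtraction and spin. */
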
28096 | + | |
28097 | +/** | |
28098 | + * read_can_lock - would read_trylock() succeed? | |
28099 | + * @lock: the rwlock in question. | |
28100 | + */ | |
28101 | +static inline int __raw_read_can_lock(raw_rwlock_t *lock) | |
28102 | +{ | |
28103 | + return (int)(lock)->lock > 0; | |
28104 | +} | |
28105 | + | |
28106 | +/** | |
28107 | + * write_can_lock - would write_trylock() succeed? | |
28108 | + * @lock: the rwlock in question. | |
28109 | + */ | |
28110 | +static inline int __raw_write_can_lock(raw_rwlock_t *lock) | |
28111 | +{ | |
28112 | + return (lock)->lock == RW_LOCK_BIAS; | |
28113 | +} | |
28114 | + | |
28115 | +static inline void __raw_read_lock(raw_rwlock_t *rw) | |
28116 | +{ | |
28117 | + asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" | |
28118 | + "jns 1f\n" | |
28119 | + "call __read_lock_failed\n\t" | |
28120 | + "1:\n" | |
28121 | + ::LOCK_PTR_REG (rw) : "memory"); | |
28122 | +} | |
28123 | + | |
28124 | +static inline void __raw_write_lock(raw_rwlock_t *rw) | |
28125 | +{ | |
28126 | + asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" | |
28127 | + "jz 1f\n" | |
28128 | + "call __write_lock_failed\n\t" | |
28129 | + "1:\n" | |
28130 | + ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); | |
28131 | +} | |
28132 | + | |
28133 | +static inline int __raw_read_trylock(raw_rwlock_t *lock) | |
28134 | +{ | |
28135 | + atomic_t *count = (atomic_t *)lock; | |
28136 | + | |
28137 | + atomic_dec(count); | |
28138 | + if (atomic_read(count) >= 0) | |
28139 | + return 1; | |
28140 | + atomic_inc(count); | |
28141 | + return 0; | |
28142 | +} | |
28143 | + | |
28144 | +static inline int __raw_write_trylock(raw_rwlock_t *lock) | |
28145 | +{ | |
28146 | + atomic_t *count = (atomic_t *)lock; | |
28147 | + | |
28148 | + if (atomic_sub_and_test(RW_LOCK_BIAS, count)) | |
28149 | + return 1; | |
28150 | + atomic_add(RW_LOCK_BIAS, count); | |
28151 | + return 0; | |
28152 | +} | |
28153 | + | |
28154 | +static inline void __raw_read_unlock(raw_rwlock_t *rw) | |
28155 | +{ | |
28156 | + asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); | |
28157 | +} | |
28158 | + | |
28159 | +static inline void __raw_write_unlock(raw_rwlock_t *rw) | |
28160 | +{ | |
28161 | + asm volatile(LOCK_PREFIX "addl %1, %0" | |
28162 | + : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); | |
28163 | +} | |
28164 | + | |
28165 | +#define _raw_spin_relax(lock) cpu_relax() | |
28166 | +#define _raw_read_relax(lock) cpu_relax() | |
28167 | +#define _raw_write_relax(lock) cpu_relax() | |
28168 | + | |
28169 | +#endif | |
28170 | --- a/include/asm-x86/mach-xen/asm/system_32.h | |
28171 | +++ /dev/null | |
28172 | @@ -1,312 +0,0 @@ | |
28173 | -#ifndef __ASM_SYSTEM_H | |
28174 | -#define __ASM_SYSTEM_H | |
28175 | - | |
28176 | -#include <linux/kernel.h> | |
28177 | -#include <asm/segment.h> | |
28178 | -#include <asm/cpufeature.h> | |
28179 | -#include <asm/cmpxchg.h> | |
28180 | -#include <asm/synch_bitops.h> | |
28181 | -#include <asm/hypervisor.h> | |
28182 | - | |
28183 | -#ifdef __KERNEL__ | |
28184 | -#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */ | |
28185 | - | |
28186 | -struct task_struct; /* one of the stranger aspects of C forward declarations.. */ | |
28187 | -extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); | |
28188 | - | |
28189 | -/* | |
28190 | - * Saving eflags is important. It switches not only IOPL between tasks, | |
28191 | - * it also protects other tasks from NT leaking through sysenter etc. | |
28192 | - */ | |
28193 | -#define switch_to(prev,next,last) do { \ | |
28194 | - unsigned long esi,edi; \ | |
28195 | - asm volatile("pushfl\n\t" /* Save flags */ \ | |
28196 | - "pushl %%ebp\n\t" \ | |
28197 | - "movl %%esp,%0\n\t" /* save ESP */ \ | |
28198 | - "movl %5,%%esp\n\t" /* restore ESP */ \ | |
28199 | - "movl $1f,%1\n\t" /* save EIP */ \ | |
28200 | - "pushl %6\n\t" /* restore EIP */ \ | |
28201 | - "jmp __switch_to\n" \ | |
28202 | - "1:\t" \ | |
28203 | - "popl %%ebp\n\t" \ | |
28204 | - "popfl" \ | |
28205 | - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ | |
28206 | - "=a" (last),"=S" (esi),"=D" (edi) \ | |
28207 | - :"m" (next->thread.esp),"m" (next->thread.eip), \ | |
28208 | - "2" (prev), "d" (next)); \ | |
28209 | -} while (0) | |
28210 | - | |
28211 | -#define _set_base(addr,base) do { unsigned long __pr; \ | |
28212 | -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ | |
28213 | - "rorl $16,%%edx\n\t" \ | |
28214 | - "movb %%dl,%2\n\t" \ | |
28215 | - "movb %%dh,%3" \ | |
28216 | - :"=&d" (__pr) \ | |
28217 | - :"m" (*((addr)+2)), \ | |
28218 | - "m" (*((addr)+4)), \ | |
28219 | - "m" (*((addr)+7)), \ | |
28220 | - "0" (base) \ | |
28221 | - ); } while(0) | |
28222 | - | |
28223 | -#define _set_limit(addr,limit) do { unsigned long __lr; \ | |
28224 | -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ | |
28225 | - "rorl $16,%%edx\n\t" \ | |
28226 | - "movb %2,%%dh\n\t" \ | |
28227 | - "andb $0xf0,%%dh\n\t" \ | |
28228 | - "orb %%dh,%%dl\n\t" \ | |
28229 | - "movb %%dl,%2" \ | |
28230 | - :"=&d" (__lr) \ | |
28231 | - :"m" (*(addr)), \ | |
28232 | - "m" (*((addr)+6)), \ | |
28233 | - "0" (limit) \ | |
28234 | - ); } while(0) | |
28235 | - | |
28236 | -#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) ) | |
28237 | -#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) ) | |
28238 | - | |
28239 | -/* | |
28240 | - * Load a segment. Fall back on loading the zero | |
28241 | - * segment if something goes wrong.. | |
28242 | - */ | |
28243 | -#define loadsegment(seg,value) \ | |
28244 | - asm volatile("\n" \ | |
28245 | - "1:\t" \ | |
28246 | - "mov %0,%%" #seg "\n" \ | |
28247 | - "2:\n" \ | |
28248 | - ".section .fixup,\"ax\"\n" \ | |
28249 | - "3:\t" \ | |
28250 | - "pushl $0\n\t" \ | |
28251 | - "popl %%" #seg "\n\t" \ | |
28252 | - "jmp 2b\n" \ | |
28253 | - ".previous\n" \ | |
28254 | - ".section __ex_table,\"a\"\n\t" \ | |
28255 | - ".align 4\n\t" \ | |
28256 | - ".long 1b,3b\n" \ | |
28257 | - ".previous" \ | |
28258 | - : :"rm" (value)) | |
28259 | - | |
28260 | -/* | |
28261 | - * Save a segment register away | |
28262 | - */ | |
28263 | -#define savesegment(seg, value) \ | |
28264 | - asm volatile("mov %%" #seg ",%0":"=rm" (value)) | |
28265 | - | |
28266 | -static inline void xen_clts(void) | |
28267 | -{ | |
28268 | - HYPERVISOR_fpu_taskswitch(0); | |
28269 | -} | |
28270 | - | |
28271 | -static inline unsigned long xen_read_cr0(void) | |
28272 | -{ | |
28273 | - unsigned long val; | |
28274 | - asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); | |
28275 | - return val; | |
28276 | -} | |
28277 | - | |
28278 | -static inline void xen_write_cr0(unsigned long val) | |
28279 | -{ | |
28280 | - asm volatile("movl %0,%%cr0": :"r" (val)); | |
28281 | -} | |
28282 | - | |
28283 | -#define xen_read_cr2() (current_vcpu_info()->arch.cr2) | |
28284 | - | |
28285 | -static inline void xen_write_cr2(unsigned long val) | |
28286 | -{ | |
28287 | - asm volatile("movl %0,%%cr2": :"r" (val)); | |
28288 | -} | |
28289 | - | |
28290 | -static inline unsigned long xen_read_cr3(void) | |
28291 | -{ | |
28292 | - unsigned long val; | |
28293 | - asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); | |
28294 | - return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT; | |
28295 | -} | |
28296 | - | |
28297 | -static inline void xen_write_cr3(unsigned long val) | |
28298 | -{ | |
28299 | - val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT)); | |
28300 | - asm volatile("movl %0,%%cr3": :"r" (val)); | |
28301 | -} | |
28302 | - | |
28303 | -static inline unsigned long xen_read_cr4(void) | |
28304 | -{ | |
28305 | - unsigned long val; | |
28306 | - asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); | |
28307 | - return val; | |
28308 | -} | |
28309 | - | |
28310 | -static inline unsigned long xen_read_cr4_safe(void) | |
28311 | -{ | |
28312 | - unsigned long val; | |
28313 | - /* This could fault if %cr4 does not exist */ | |
28314 | - asm volatile("1: movl %%cr4, %0 \n" | |
28315 | - "2: \n" | |
28316 | - ".section __ex_table,\"a\" \n" | |
28317 | - ".long 1b,2b \n" | |
28318 | - ".previous \n" | |
28319 | - : "=r" (val): "0" (0)); | |
28320 | - return val; | |
28321 | -} | |
28322 | - | |
28323 | -static inline void xen_write_cr4(unsigned long val) | |
28324 | -{ | |
28325 | - asm volatile("movl %0,%%cr4": :"r" (val)); | |
28326 | -} | |
28327 | - | |
28328 | -static inline void xen_wbinvd(void) | |
28329 | -{ | |
28330 | - asm volatile("wbinvd": : :"memory"); | |
28331 | -} | |
28332 | - | |
28333 | -static inline void clflush(volatile void *__p) | |
28334 | -{ | |
28335 | - asm volatile("clflush %0" : "+m" (*(char __force *)__p)); | |
28336 | -} | |
28337 | - | |
28338 | -#define read_cr0() (xen_read_cr0()) | |
28339 | -#define write_cr0(x) (xen_write_cr0(x)) | |
28340 | -#define read_cr2() (xen_read_cr2()) | |
28341 | -#define write_cr2(x) (xen_write_cr2(x)) | |
28342 | -#define read_cr3() (xen_read_cr3()) | |
28343 | -#define write_cr3(x) (xen_write_cr3(x)) | |
28344 | -#define read_cr4() (xen_read_cr4()) | |
28345 | -#define read_cr4_safe() (xen_read_cr4_safe()) | |
28346 | -#define write_cr4(x) (xen_write_cr4(x)) | |
28347 | -#define wbinvd() (xen_wbinvd()) | |
28348 | - | |
28349 | -/* Clear the 'TS' bit */ | |
28350 | -#define clts() (xen_clts()) | |
28351 | - | |
28352 | -/* Set the 'TS' bit */ | |
28353 | -#define stts() (HYPERVISOR_fpu_taskswitch(1)) | |
28354 | - | |
28355 | -#endif /* __KERNEL__ */ | |
28356 | - | |
28357 | -static inline unsigned long get_limit(unsigned long segment) | |
28358 | -{ | |
28359 | - unsigned long __limit; | |
28360 | - __asm__("lsll %1,%0" | |
28361 | - :"=r" (__limit):"r" (segment)); | |
28362 | - return __limit+1; | |
28363 | -} | |
28364 | - | |
28365 | -#define nop() __asm__ __volatile__ ("nop") | |
28366 | - | |
28367 | -/* | |
28368 | - * Force strict CPU ordering. | |
28369 | - * And yes, this is required on UP too when we're talking | |
28370 | - * to devices. | |
28371 | - * | |
28372 | - * For now, "wmb()" doesn't actually do anything, as all | |
28373 | - * Intel CPUs follow what Intel calls a *Processor Order*, | |
28374 | - * in which all writes are seen in the program order even | |
28375 | - * outside the CPU. | |
28376 | - * | |
28377 | - * I expect future Intel CPUs to have a weaker ordering, | |
28378 | - * but I'd also expect them to finally get their act together | |
28379 | - * and add some real memory barriers if so. | |
28380 | - * | |
28381 | - * Some non-Intel clones support out-of-order stores. wmb() ceases to be a | |
28382 | - * no-op for these. | |
28383 | - */ | |
28384 | - | |
28385 | - | |
28386 | -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) | |
28387 | -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) | |
28388 | -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) | |
28389 | - | |
28390 | -/** | |
28391 | - * read_barrier_depends - Flush all pending reads that subsequent reads | |
28392 | - * depend on. | |
28393 | - * | |
28394 | - * No data-dependent reads from memory-like regions are ever reordered | |
28395 | - * over this barrier. All reads preceding this primitive are guaranteed | |
28396 | - * to access memory (but not necessarily other CPUs' caches) before any | |
28397 | - * reads following this primitive that depend on the data returned by | |
28398 | - * any of the preceding reads. This primitive is much lighter weight than | |
28399 | - * rmb() on most CPUs, and is never heavier weight than is | |
28400 | - * rmb(). | |
28401 | - * | |
28402 | - * These ordering constraints are respected by both the local CPU | |
28403 | - * and the compiler. | |
28404 | - * | |
28405 | - * Ordering is not guaranteed by anything other than these primitives, | |
28406 | - * not even by data dependencies. See the documentation for | |
28407 | - * memory_barrier() for examples and URLs to more information. | |
28408 | - * | |
28409 | - * For example, the following code would force ordering (the initial | |
28410 | - * value of "a" is zero, "b" is one, and "p" is "&a"): | |
28411 | - * | |
28412 | - * <programlisting> | |
28413 | - * CPU 0 CPU 1 | |
28414 | - * | |
28415 | - * b = 2; | |
28416 | - * memory_barrier(); | |
28417 | - * p = &b; q = p; | |
28418 | - * read_barrier_depends(); | |
28419 | - * d = *q; | |
28420 | - * </programlisting> | |
28421 | - * | |
28422 | - * because the read of "*q" depends on the read of "p" and these | |
28423 | - * two reads are separated by a read_barrier_depends(). However, | |
28424 | - * the following code, with the same initial values for "a" and "b": | |
28425 | - * | |
28426 | - * <programlisting> | |
28427 | - * CPU 0 CPU 1 | |
28428 | - * | |
28429 | - * a = 2; | |
28430 | - * memory_barrier(); | |
28431 | - * b = 3; y = b; | |
28432 | - * read_barrier_depends(); | |
28433 | - * x = a; | |
28434 | - * </programlisting> | |
28435 | - * | |
28436 | - * does not enforce ordering, since there is no data dependency between | |
28437 | - * the read of "a" and the read of "b". Therefore, on some CPUs, such | |
28438 | - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() | |
28439 | - * in cases like this where there are no data dependencies. | |
28440 | - **/ | |
28441 | - | |
28442 | -#define read_barrier_depends() do { } while(0) | |
28443 | - | |
28444 | -#ifdef CONFIG_SMP | |
28445 | -#define smp_mb() mb() | |
28446 | -#ifdef CONFIG_X86_PPRO_FENCE | |
28447 | -# define smp_rmb() rmb() | |
28448 | -#else | |
28449 | -# define smp_rmb() barrier() | |
28450 | -#endif | |
28451 | -#ifdef CONFIG_X86_OOSTORE | |
28452 | -# define smp_wmb() wmb() | |
28453 | -#else | |
28454 | -# define smp_wmb() barrier() | |
28455 | -#endif | |
28456 | -#define smp_read_barrier_depends() read_barrier_depends() | |
28457 | -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) | |
28458 | -#else | |
28459 | -#define smp_mb() barrier() | |
28460 | -#define smp_rmb() barrier() | |
28461 | -#define smp_wmb() barrier() | |
28462 | -#define smp_read_barrier_depends() do { } while(0) | |
28463 | -#define set_mb(var, value) do { var = value; barrier(); } while (0) | |
28464 | -#endif | |
28465 | - | |
28466 | -#include <linux/irqflags.h> | |
28467 | - | |
28468 | -/* | |
28469 | - * disable hlt during certain critical i/o operations | |
28470 | - */ | |
28471 | -#define HAVE_DISABLE_HLT | |
28472 | -void disable_hlt(void); | |
28473 | -void enable_hlt(void); | |
28474 | - | |
28475 | -extern int es7000_plat; | |
28476 | -void cpu_idle_wait(void); | |
28477 | - | |
28478 | -extern unsigned long arch_align_stack(unsigned long sp); | |
28479 | -extern void free_init_pages(char *what, unsigned long begin, unsigned long end); | |
28480 | - | |
28481 | -void default_idle(void); | |
28482 | -void __show_registers(struct pt_regs *, int all); | |
28483 | - | |
28484 | -#endif | |
28485 | --- a/include/asm-x86/mach-xen/asm/system_64.h | |
28486 | +++ b/include/asm-x86/mach-xen/asm/system_64.h | |
28487 | @@ -1,122 +1,9 @@ | |
28488 | #ifndef __ASM_SYSTEM_H | |
28489 | #define __ASM_SYSTEM_H | |
28490 | ||
28491 | -#include <linux/kernel.h> | |
28492 | #include <asm/segment.h> | |
28493 | #include <asm/cmpxchg.h> | |
28494 | ||
28495 | -#include <asm/synch_bitops.h> | |
28496 | -#include <asm/hypervisor.h> | |
28497 | -#include <xen/interface/arch-x86_64.h> | |
28498 | - | |
28499 | -#ifdef __KERNEL__ | |
28500 | - | |
28501 | -/* entries in ARCH_DLINFO: */ | |
28502 | -#ifdef CONFIG_IA32_EMULATION | |
28503 | -# define AT_VECTOR_SIZE_ARCH 2 | |
28504 | -#else | |
28505 | -# define AT_VECTOR_SIZE_ARCH 1 | |
28506 | -#endif | |
28507 | - | |
28508 | -#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" | |
28509 | -#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" | |
28510 | - | |
28511 | -/* frame pointer must be last for get_wchan */ | |
28512 | -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" | |
28513 | -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t" | |
28514 | - | |
28515 | -#define __EXTRA_CLOBBER \ | |
28516 | - ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15" | |
28517 | - | |
28518 | -/* Save and restore flags to clear and handle a leaking NT flag */ | |
28519 | -#define switch_to(prev,next,last) \ | |
28520 | - asm volatile(SAVE_CONTEXT \ | |
28521 | - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ | |
28522 | - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ | |
28523 | - "call __switch_to\n\t" \ | |
28524 | - ".globl thread_return\n" \ | |
28525 | - "thread_return:\n\t" \ | |
28526 | - "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ | |
28527 | - "movq %P[thread_info](%%rsi),%%r8\n\t" \ | |
28528 | - LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ | |
28529 | - "movq %%rax,%%rdi\n\t" \ | |
28530 | - "jc ret_from_fork\n\t" \ | |
28531 | - RESTORE_CONTEXT \ | |
28532 | - : "=a" (last) \ | |
28533 | - : [next] "S" (next), [prev] "D" (prev), \ | |
28534 | - [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \ | |
28535 | - [ti_flags] "i" (offsetof(struct thread_info, flags)),\ | |
28536 | - [tif_fork] "i" (TIF_FORK), \ | |
28537 | - [thread_info] "i" (offsetof(struct task_struct, stack)), \ | |
28538 | - [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ | |
28539 | - : "memory", "cc" __EXTRA_CLOBBER) | |
28540 | - | |
28541 | -extern void load_gs_index(unsigned); | |
28542 | - | |
28543 | -/* | |
28544 | - * Load a segment. Fall back on loading the zero | |
28545 | - * segment if something goes wrong.. | |
28546 | - */ | |
28547 | -#define loadsegment(seg,value) \ | |
28548 | - asm volatile("\n" \ | |
28549 | - "1:\t" \ | |
28550 | - "movl %k0,%%" #seg "\n" \ | |
28551 | - "2:\n" \ | |
28552 | - ".section .fixup,\"ax\"\n" \ | |
28553 | - "3:\t" \ | |
28554 | - "movl %1,%%" #seg "\n\t" \ | |
28555 | - "jmp 2b\n" \ | |
28556 | - ".previous\n" \ | |
28557 | - ".section __ex_table,\"a\"\n\t" \ | |
28558 | - ".align 8\n\t" \ | |
28559 | - ".quad 1b,3b\n" \ | |
28560 | - ".previous" \ | |
28561 | - : :"r" (value), "r" (0)) | |
28562 | - | |
28563 | -/* | |
28564 | - * Clear and set 'TS' bit respectively | |
28565 | - */ | |
28566 | -#define clts() (HYPERVISOR_fpu_taskswitch(0)) | |
28567 | - | |
28568 | -static inline unsigned long read_cr0(void) | |
28569 | -{ | |
28570 | - unsigned long cr0; | |
28571 | - asm volatile("movq %%cr0,%0" : "=r" (cr0)); | |
28572 | - return cr0; | |
28573 | -} | |
28574 | - | |
28575 | -static inline void write_cr0(unsigned long val) | |
28576 | -{ | |
28577 | - asm volatile("movq %0,%%cr0" :: "r" (val)); | |
28578 | -} | |
28579 | - | |
28580 | -#define read_cr2() current_vcpu_info()->arch.cr2 | |
28581 | - | |
28582 | -#define write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val))) | |
28583 | - | |
28584 | -#define read_cr3() ({ \ | |
28585 | - unsigned long __dummy; \ | |
28586 | - asm volatile("movq %%cr3,%0" : "=r" (__dummy)); \ | |
28587 | - machine_to_phys(__dummy); \ | |
28588 | -}) | |
28589 | - | |
28590 | -static inline void write_cr3(unsigned long val) | |
28591 | -{ | |
28592 | - val = phys_to_machine(val); | |
28593 | - asm volatile("movq %0,%%cr3" :: "r" (val) : "memory"); | |
28594 | -} | |
28595 | - | |
28596 | -static inline unsigned long read_cr4(void) | |
28597 | -{ | |
28598 | - unsigned long cr4; | |
28599 | - asm volatile("movq %%cr4,%0" : "=r" (cr4)); | |
28600 | - return cr4; | |
28601 | -} | |
28602 | - | |
28603 | -static inline void write_cr4(unsigned long val) | |
28604 | -{ | |
28605 | - asm volatile("movq %0,%%cr4" :: "r" (val) : "memory"); | |
28606 | -} | |
28607 | ||
28608 | static inline unsigned long read_cr8(void) | |
28609 | { | |
28610 | @@ -128,52 +15,6 @@ static inline void write_cr8(unsigned lo | |
28611 | BUG_ON(val); | |
28612 | } | |
28613 | ||
28614 | -#define stts() (HYPERVISOR_fpu_taskswitch(1)) | |
28615 | - | |
28616 | -#define wbinvd() \ | |
28617 | - __asm__ __volatile__ ("wbinvd": : :"memory") | |
28618 | - | |
28619 | -#endif /* __KERNEL__ */ | |
28620 | - | |
28621 | -static inline void clflush(volatile void *__p) | |
28622 | -{ | |
28623 | - asm volatile("clflush %0" : "+m" (*(char __force *)__p)); | |
28624 | -} | |
28625 | - | |
28626 | -#define nop() __asm__ __volatile__ ("nop") | |
28627 | - | |
28628 | -#ifdef CONFIG_SMP | |
28629 | -#define smp_mb() mb() | |
28630 | -#define smp_rmb() barrier() | |
28631 | -#define smp_wmb() barrier() | |
28632 | -#define smp_read_barrier_depends() do {} while(0) | |
28633 | -#else | |
28634 | -#define smp_mb() barrier() | |
28635 | -#define smp_rmb() barrier() | |
28636 | -#define smp_wmb() barrier() | |
28637 | -#define smp_read_barrier_depends() do {} while(0) | |
28638 | -#endif | |
28639 | - | |
28640 | - | |
28641 | -/* | |
28642 | - * Force strict CPU ordering. | |
28643 | - * And yes, this is required on UP too when we're talking | |
28644 | - * to devices. | |
28645 | - */ | |
28646 | -#define mb() asm volatile("mfence":::"memory") | |
28647 | -#define rmb() asm volatile("lfence":::"memory") | |
28648 | -#define wmb() asm volatile("sfence" ::: "memory") | |
28649 | - | |
28650 | -#define read_barrier_depends() do {} while(0) | |
28651 | -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) | |
28652 | - | |
28653 | -#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0) | |
28654 | - | |
28655 | #include <linux/irqflags.h> | |
28656 | ||
28657 | -void cpu_idle_wait(void); | |
28658 | - | |
28659 | -extern unsigned long arch_align_stack(unsigned long sp); | |
28660 | -extern void free_init_pages(char *what, unsigned long begin, unsigned long end); | |
28661 | - | |
28662 | #endif | |
28663 | --- a/include/asm-x86/mach-xen/asm/system.h | |
28664 | +++ b/include/asm-x86/mach-xen/asm/system.h | |
28665 | @@ -1,5 +1,393 @@ | |
28666 | +#ifndef _ASM_X86_SYSTEM_H_ | |
28667 | +#define _ASM_X86_SYSTEM_H_ | |
28668 | + | |
28669 | +#include <asm/asm.h> | |
28670 | +#include <asm/segment.h> | |
28671 | +#include <asm/cpufeature.h> | |
28672 | +#include <asm/cmpxchg.h> | |
28673 | +#include <asm/nops.h> | |
28674 | +#include <asm/hypervisor.h> | |
28675 | + | |
28676 | +#include <linux/kernel.h> | |
28677 | +#include <linux/irqflags.h> | |
28678 | + | |
28679 | +/* entries in ARCH_DLINFO: */ | |
28680 | +#ifdef CONFIG_IA32_EMULATION | |
28681 | +# define AT_VECTOR_SIZE_ARCH 2 | |
28682 | +#else | |
28683 | +# define AT_VECTOR_SIZE_ARCH 1 | |
28684 | +#endif | |
28685 | + | |
28686 | +#ifdef CONFIG_X86_32 | |
28687 | + | |
28688 | +struct task_struct; /* one of the stranger aspects of C forward declarations */ | |
28689 | +struct task_struct *__switch_to(struct task_struct *prev, | |
28690 | + struct task_struct *next); | |
28691 | + | |
28692 | +/* | |
28693 | + * Saving eflags is important. It switches not only IOPL between tasks, | |
28694 | + * it also protects other tasks from NT leaking through sysenter etc. | |
28695 | + */ | |
28696 | +#define switch_to(prev, next, last) do { \ | |
28697 | + unsigned long esi, edi; \ | |
28698 | + asm volatile("pushfl\n\t" /* Save flags */ \ | |
28699 | + "pushl %%ebp\n\t" \ | |
28700 | + "movl %%esp,%0\n\t" /* save ESP */ \ | |
28701 | + "movl %5,%%esp\n\t" /* restore ESP */ \ | |
28702 | + "movl $1f,%1\n\t" /* save EIP */ \ | |
28703 | + "pushl %6\n\t" /* restore EIP */ \ | |
28704 | + "jmp __switch_to\n" \ | |
28705 | + "1:\t" \ | |
28706 | + "popl %%ebp\n\t" \ | |
28707 | + "popfl" \ | |
28708 | + :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \ | |
28709 | + "=a" (last), "=S" (esi), "=D" (edi) \ | |
28710 | + :"m" (next->thread.sp), "m" (next->thread.ip), \ | |
28711 | + "2" (prev), "d" (next)); \ | |
28712 | +} while (0) | |
28713 | + | |
28714 | +/* | |
28715 | + * disable hlt during certain critical i/o operations | |
28716 | + */ | |
28717 | +#define HAVE_DISABLE_HLT | |
28718 | +#else | |
28719 | +#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" | |
28720 | +#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" | |
28721 | + | |
28722 | +/* frame pointer must be last for get_wchan */ | |
28723 | +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" | |
28724 | +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" | |
28725 | + | |
28726 | +#define __EXTRA_CLOBBER \ | |
28727 | + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ | |
28728 | + "r12", "r13", "r14", "r15" | |
28729 | + | |
28730 | +/* Save and restore flags to clear and handle a leaking NT flag */ | |
28731 | +#define switch_to(prev, next, last) \ | |
28732 | + asm volatile(SAVE_CONTEXT \ | |
28733 | + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ | |
28734 | + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ | |
28735 | + "call __switch_to\n\t" \ | |
28736 | + ".globl thread_return\n" \ | |
28737 | + "thread_return:\n\t" \ | |
28738 | + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ | |
28739 | + "movq %P[thread_info](%%rsi),%%r8\n\t" \ | |
28740 | + LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ | |
28741 | + "movq %%rax,%%rdi\n\t" \ | |
28742 | + "jc ret_from_fork\n\t" \ | |
28743 | + RESTORE_CONTEXT \ | |
28744 | + : "=a" (last) \ | |
28745 | + : [next] "S" (next), [prev] "D" (prev), \ | |
28746 | + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ | |
28747 | + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ | |
28748 | + [tif_fork] "i" (TIF_FORK), \ | |
28749 | + [thread_info] "i" (offsetof(struct task_struct, stack)), \ | |
28750 | + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ | |
28751 | + : "memory", "cc" __EXTRA_CLOBBER) | |
28752 | +#endif | |
28753 | + | |
28754 | +#ifdef __KERNEL__ | |
28755 | +#define _set_base(addr, base) do { unsigned long __pr; \ | |
28756 | +__asm__ __volatile__ ("movw %%dx,%1\n\t" \ | |
28757 | + "rorl $16,%%edx\n\t" \ | |
28758 | + "movb %%dl,%2\n\t" \ | |
28759 | + "movb %%dh,%3" \ | |
28760 | + :"=&d" (__pr) \ | |
28761 | + :"m" (*((addr)+2)), \ | |
28762 | + "m" (*((addr)+4)), \ | |
28763 | + "m" (*((addr)+7)), \ | |
28764 | + "0" (base) \ | |
28765 | + ); } while (0) | |
28766 | + | |
28767 | +#define _set_limit(addr, limit) do { unsigned long __lr; \ | |
28768 | +__asm__ __volatile__ ("movw %%dx,%1\n\t" \ | |
28769 | + "rorl $16,%%edx\n\t" \ | |
28770 | + "movb %2,%%dh\n\t" \ | |
28771 | + "andb $0xf0,%%dh\n\t" \ | |
28772 | + "orb %%dh,%%dl\n\t" \ | |
28773 | + "movb %%dl,%2" \ | |
28774 | + :"=&d" (__lr) \ | |
28775 | + :"m" (*(addr)), \ | |
28776 | + "m" (*((addr)+6)), \ | |
28777 | + "0" (limit) \ | |
28778 | + ); } while (0) | |
28779 | + | |
28780 | +#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base)) | |
28781 | +#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1)) | |
28782 | + | |
28783 | +extern void load_gs_index(unsigned); | |
28784 | + | |
28785 | +/* | |
28786 | + * Load a segment. Fall back on loading the zero | |
28787 | + * segment if something goes wrong.. | |
28788 | + */ | |
28789 | +#define loadsegment(seg, value) \ | |
28790 | + asm volatile("\n" \ | |
28791 | + "1:\t" \ | |
28792 | + "movl %k0,%%" #seg "\n" \ | |
28793 | + "2:\n" \ | |
28794 | + ".section .fixup,\"ax\"\n" \ | |
28795 | + "3:\t" \ | |
28796 | + "movl %k1, %%" #seg "\n\t" \ | |
28797 | + "jmp 2b\n" \ | |
28798 | + ".previous\n" \ | |
28799 | + _ASM_EXTABLE(1b,3b) \ | |
28800 | + : :"r" (value), "r" (0)) | |
28801 | + | |
28802 | + | |
28803 | +/* | |
28804 | + * Save a segment register away | |
28805 | + */ | |
28806 | +#define savesegment(seg, value) \ | |
28807 | + asm volatile("mov %%" #seg ",%0":"=rm" (value)) | |
28808 | + | |
28809 | +static inline unsigned long get_limit(unsigned long segment) | |
28810 | +{ | |
28811 | + unsigned long __limit; | |
28812 | + __asm__("lsll %1,%0" | |
28813 | + :"=r" (__limit):"r" (segment)); | |
28814 | + return __limit+1; | |
28815 | +} | |
28816 | + | |
28817 | +static inline void xen_clts(void) | |
28818 | +{ | |
28819 | + HYPERVISOR_fpu_taskswitch(0); | |
28820 | +} | |
28821 | + | |
28822 | +static inline void xen_stts(void) | |
28823 | +{ | |
28824 | + HYPERVISOR_fpu_taskswitch(1); | |
28825 | +} | |
28826 | + | |
28827 | +/* | |
28828 | + * Volatile isn't enough to prevent the compiler from reordering the | |
28829 | + * read/write functions for the control registers and messing everything up. | |
28830 | + * A memory clobber would solve the problem, but would prevent reordering of | |
28831 | + * all loads/stores around it, which can hurt performance. The solution is | |
28832 | + * to use a variable and mimic reads and writes to it to enforce serialization. | |
28833 | + */ | |
28834 | +static unsigned long __force_order; | |
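
The constraint pairing the comment describes can be shown in isolation; a stand-alone sketch with a hypothetical dummy variable, mirroring but not replacing the xen_* accessors below:

/* Sketch only: a dummy variable named in "=m"/"m" constraints chains
 * the asm statements together without the cost of a full "memory"
 * clobber. */
static unsigned long demo_force_order;

static inline unsigned long demo_read_reg(void)
{
	unsigned long val;
	/* "=m": this asm appears to write the dummy... */
	asm volatile("mov %%cr0,%0" : "=r" (val), "=m" (demo_force_order));
	return val;
}

static inline void demo_write_reg(unsigned long val)
{
	/* ..."m": and this one appears to read it, so the compiler
	 * cannot reorder the two accessors against each other. */
	asm volatile("mov %0,%%cr0" : : "r" (val), "m" (demo_force_order));
}
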
28835 | + | |
28836 | +static inline unsigned long xen_read_cr0(void) | |
28837 | +{ | |
28838 | + unsigned long val; | |
28839 | + asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order)); | |
28840 | + return val; | |
28841 | +} | |
28842 | + | |
28843 | +static inline void xen_write_cr0(unsigned long val) | |
28844 | +{ | |
28845 | + asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order)); | |
28846 | +} | |
28847 | + | |
28848 | +#define xen_read_cr2() (current_vcpu_info()->arch.cr2) | |
28849 | +#define xen_write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val))) | |
28850 | + | |
28851 | +static inline unsigned long xen_read_cr3(void) | |
28852 | +{ | |
28853 | + unsigned long val; | |
28854 | + asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order)); | |
28855 | +#ifdef CONFIG_X86_32 | |
28856 | + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT; | |
28857 | +#else | |
28858 | + return machine_to_phys(val); | |
28859 | +#endif | |
28860 | +} | |
28861 | + | |
28862 | +static inline void xen_write_cr3(unsigned long val) | |
28863 | +{ | |
28864 | +#ifdef CONFIG_X86_32 | |
28865 | + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT)); | |
28866 | +#else | |
28867 | + val = phys_to_machine(val); | |
28868 | +#endif | |
28869 | + asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order)); | |
28870 | +} | |
28871 | + | |
28872 | +static inline unsigned long xen_read_cr4(void) | |
28873 | +{ | |
28874 | + unsigned long val; | |
28875 | + asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order)); | |
28876 | + return val; | |
28877 | +} | |
28878 | + | |
28879 | +#define xen_read_cr4_safe() xen_read_cr4() | |
28880 | + | |
28881 | +static inline void xen_write_cr4(unsigned long val) | |
28882 | +{ | |
28883 | + asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order)); | |
28884 | +} | |
28885 | + | |
28886 | +#ifdef CONFIG_X86_64 | |
28887 | +static inline unsigned long xen_read_cr8(void) | |
28888 | +{ | |
28889 | + return 0; | |
28890 | +} | |
28891 | + | |
28892 | +static inline void xen_write_cr8(unsigned long val) | |
28893 | +{ | |
28894 | + BUG_ON(val); | |
28895 | +} | |
28896 | +#endif | |
28897 | + | |
28898 | +static inline void xen_wbinvd(void) | |
28899 | +{ | |
28900 | + asm volatile("wbinvd": : :"memory"); | |
28901 | +} | |
28902 | +#define read_cr0() (xen_read_cr0()) | |
28903 | +#define write_cr0(x) (xen_write_cr0(x)) | |
28904 | +#define read_cr2() (xen_read_cr2()) | |
28905 | +#define write_cr2(x) (xen_write_cr2(x)) | |
28906 | +#define read_cr3() (xen_read_cr3()) | |
28907 | +#define write_cr3(x) (xen_write_cr3(x)) | |
28908 | +#define read_cr4() (xen_read_cr4()) | |
28909 | +#define read_cr4_safe() (xen_read_cr4_safe()) | |
28910 | +#define write_cr4(x) (xen_write_cr4(x)) | |
28911 | +#define wbinvd() (xen_wbinvd()) | |
28912 | +#ifdef CONFIG_X86_64 | |
28913 | +#define read_cr8() (xen_read_cr8()) | |
28914 | +#define write_cr8(x) (xen_write_cr8(x)) | |
28915 | +#endif | |
28916 | + | |
28917 | +/* Clear the 'TS' bit */ | |
28918 | +#define clts() (xen_clts()) | |
28919 | +#define stts() (xen_stts()) | |
28920 | + | |
28921 | +#endif /* __KERNEL__ */ | |
28922 | + | |
28923 | +static inline void clflush(volatile void *__p) | |
28924 | +{ | |
28925 | + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); | |
28926 | +} | |
28927 | + | |
28928 | +#define nop() __asm__ __volatile__ ("nop") | |
28929 | + | |
28930 | +void disable_hlt(void); | |
28931 | +void enable_hlt(void); | |
28932 | + | |
28933 | +extern int es7000_plat; | |
28934 | +void cpu_idle_wait(void); | |
28935 | + | |
28936 | +extern unsigned long arch_align_stack(unsigned long sp); | |
28937 | +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); | |
28938 | + | |
28939 | +void default_idle(void); | |
28940 | + | |
28941 | +/* | |
28942 | + * Force strict CPU ordering. | |
28943 | + * And yes, this is required on UP too when we're talking | |
28944 | + * to devices. | |
28945 | + */ | |
28946 | #ifdef CONFIG_X86_32 | |
28947 | -# include "system_32.h" | |
28948 | +/* | |
28949 | + * For now, "wmb()" doesn't actually do anything, as all | |
28950 | + * Intel CPUs follow what Intel calls a *Processor Order*, | |
28951 | + * in which all writes are seen in the program order even | |
28952 | + * outside the CPU. | |
28953 | + * | |
28954 | + * I expect future Intel CPUs to have a weaker ordering, | |
28955 | + * but I'd also expect them to finally get their act together | |
28956 | + * and add some real memory barriers if so. | |
28957 | + * | |
28958 | + * Some non-Intel clones support out-of-order stores. wmb() ceases to be a | |
28959 | + * no-op for these. | |
28960 | + */ | |
28961 | +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) | |
28962 | +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) | |
28963 | +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) | |
28964 | +#else | |
28965 | +#define mb() asm volatile("mfence":::"memory") | |
28966 | +#define rmb() asm volatile("lfence":::"memory") | |
28967 | +#define wmb() asm volatile("sfence" ::: "memory") | |
28968 | +#endif | |
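
A usage sketch of the store/load pairing these barriers exist for (illustrative; the variable names are made up):

/* Sketch only: publish data with smp_wmb(), consume it with smp_rmb(). */
static int demo_data;
static int demo_ready;

static void demo_producer(void)
{
	demo_data = 42;
	smp_wmb();		/* order the data store before the flag store */
	demo_ready = 1;
}

static int demo_consumer(void)
{
	if (!demo_ready)
		return -1;	/* nothing published yet */
	smp_rmb();		/* order the flag load before the data load */
	return demo_data;
}
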
28969 | + | |
28970 | +/** | |
28971 | + * read_barrier_depends - Flush all pending reads that subsequent reads | |
28972 | + * depend on. | |
28973 | + * | |
28974 | + * No data-dependent reads from memory-like regions are ever reordered | |
28975 | + * over this barrier. All reads preceding this primitive are guaranteed | |
28976 | + * to access memory (but not necessarily other CPUs' caches) before any | |
28977 | + * reads following this primitive that depend on the data returned by | |
28978 | + * any of the preceding reads. This primitive is much lighter weight than | |
28979 | + * rmb() on most CPUs, and is never heavier weight than is | |
28980 | + * rmb(). | |
28981 | + * | |
28982 | + * These ordering constraints are respected by both the local CPU | |
28983 | + * and the compiler. | |
28984 | + * | |
28985 | + * Ordering is not guaranteed by anything other than these primitives, | |
28986 | + * not even by data dependencies. See the documentation for | |
28987 | + * memory_barrier() for examples and URLs to more information. | |
28988 | + * | |
28989 | + * For example, the following code would force ordering (the initial | |
28990 | + * value of "a" is zero, "b" is one, and "p" is "&a"): | |
28991 | + * | |
28992 | + * <programlisting> | |
28993 | + * CPU 0 CPU 1 | |
28994 | + * | |
28995 | + * b = 2; | |
28996 | + * memory_barrier(); | |
28997 | + * p = &b; q = p; | |
28998 | + * read_barrier_depends(); | |
28999 | + * d = *q; | |
29000 | + * </programlisting> | |
29001 | + * | |
29002 | + * because the read of "*q" depends on the read of "p" and these | |
29003 | + * two reads are separated by a read_barrier_depends(). However, | |
29004 | + * the following code, with the same initial values for "a" and "b": | |
29005 | + * | |
29006 | + * <programlisting> | |
29007 | + * CPU 0 CPU 1 | |
29008 | + * | |
29009 | + * a = 2; | |
29010 | + * memory_barrier(); | |
29011 | + * b = 3; y = b; | |
29012 | + * read_barrier_depends(); | |
29013 | + * x = a; | |
29014 | + * </programlisting> | |
29015 | + * | |
29016 | + * does not enforce ordering, since there is no data dependency between | |
29017 | + * the read of "a" and the read of "b". Therefore, on some CPUs, such | |
29018 | + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() | |
29019 | + * in cases like this where there are no data dependencies. | |
29020 | + **/ | |
29021 | + | |
29022 | +#define read_barrier_depends() do { } while (0) | |
29023 | + | |
29024 | +#ifdef CONFIG_SMP | |
29025 | +#define smp_mb() mb() | |
29026 | +#ifdef CONFIG_X86_PPRO_FENCE | |
29027 | +# define smp_rmb() rmb() | |
29028 | #else | |
29029 | -# include "system_64.h" | |
29030 | +# define smp_rmb() barrier() | |
29031 | +#endif | |
29032 | +#ifdef CONFIG_X86_OOSTORE | |
29033 | +# define smp_wmb() wmb() | |
29034 | +#else | |
29035 | +# define smp_wmb() barrier() | |
29036 | +#endif | |
29037 | +#define smp_read_barrier_depends() read_barrier_depends() | |
29038 | +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) | |
29039 | +#else | |
29040 | +#define smp_mb() barrier() | |
29041 | +#define smp_rmb() barrier() | |
29042 | +#define smp_wmb() barrier() | |
29043 | +#define smp_read_barrier_depends() do { } while (0) | |
29044 | +#define set_mb(var, value) do { var = value; barrier(); } while (0) | |
29045 | +#endif | |
29046 | + | |
29047 | +/* | |
29048 | + * Stop RDTSC speculation. This is needed when you need to use RDTSC | |
29049 | + * (or get_cycles or vread that possibly accesses the TSC) in a defined | |
29050 | + * code region. | |
29051 | + * | |
29052 | + * (Could use an alternative three way for this if there was one.) | |
29053 | + */ | |
29054 | +static inline void rdtsc_barrier(void) | |
29055 | +{ | |
29056 | + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); | |
29057 | + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); | |
29058 | +} | |
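
A hedged usage sketch; demo_rdtsc() is a stand-in for the kernel's get_cycles():

/* Sketch only: fence a TSC read on both sides so neither the measured
 * region nor the reads themselves are speculated across each other. */
static inline unsigned long long demo_rdtsc(void)
{
	unsigned int lo, hi;
	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((unsigned long long)hi << 32) | lo;
}

static unsigned long long demo_time(void (*work)(void))
{
	unsigned long long t0, t1;

	rdtsc_barrier();
	t0 = demo_rdtsc();
	rdtsc_barrier();	/* keep t0's read out of work() */
	work();
	rdtsc_barrier();	/* keep t1's read out of work() */
	t1 = demo_rdtsc();
	rdtsc_barrier();
	return t1 - t0;
}
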
29059 | + | |
29060 | #endif | |
29061 | --- a/include/asm-x86/mach-xen/asm/tlbflush_32.h | |
29062 | +++ /dev/null | |
29063 | @@ -1,99 +0,0 @@ | |
29064 | -#ifndef _I386_TLBFLUSH_H | |
29065 | -#define _I386_TLBFLUSH_H | |
29066 | - | |
29067 | -#include <linux/mm.h> | |
29068 | -#include <asm/processor.h> | |
29069 | - | |
29070 | -#define __flush_tlb() xen_tlb_flush() | |
29071 | -#define __flush_tlb_global() xen_tlb_flush() | |
29072 | -#define __flush_tlb_all() xen_tlb_flush() | |
29073 | - | |
29074 | -#define cpu_has_invlpg (boot_cpu_data.x86 > 3) | |
29075 | - | |
29076 | -#define __flush_tlb_single(addr) xen_invlpg(addr) | |
29077 | - | |
29078 | -#define __flush_tlb_one(addr) __flush_tlb_single(addr) | |
29079 | - | |
29080 | -/* | |
29081 | - * TLB flushing: | |
29082 | - * | |
29083 | - * - flush_tlb() flushes the current mm struct TLBs | |
29084 | - * - flush_tlb_all() flushes all processes TLBs | |
29085 | - * - flush_tlb_mm(mm) flushes the specified mm context TLB's | |
29086 | - * - flush_tlb_page(vma, vmaddr) flushes one page | |
29087 | - * - flush_tlb_range(vma, start, end) flushes a range of pages | |
29088 | - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages | |
29089 | - * | |
29090 | - * ..but the i386 has somewhat limited tlb flushing capabilities, | |
29091 | - * and page-granular flushes are available only on i486 and up. | |
29092 | - */ | |
29093 | - | |
29094 | -#define TLB_FLUSH_ALL 0xffffffff | |
29095 | - | |
29096 | - | |
29097 | -#ifndef CONFIG_SMP | |
29098 | - | |
29099 | -#include <linux/sched.h> | |
29100 | - | |
29101 | -#define flush_tlb() __flush_tlb() | |
29102 | -#define flush_tlb_all() __flush_tlb_all() | |
29103 | -#define local_flush_tlb() __flush_tlb() | |
29104 | - | |
29105 | -static inline void flush_tlb_mm(struct mm_struct *mm) | |
29106 | -{ | |
29107 | - if (mm == current->active_mm) | |
29108 | - __flush_tlb(); | |
29109 | -} | |
29110 | - | |
29111 | -static inline void flush_tlb_page(struct vm_area_struct *vma, | |
29112 | - unsigned long addr) | |
29113 | -{ | |
29114 | - if (vma->vm_mm == current->active_mm) | |
29115 | - __flush_tlb_one(addr); | |
29116 | -} | |
29117 | - | |
29118 | -static inline void flush_tlb_range(struct vm_area_struct *vma, | |
29119 | - unsigned long start, unsigned long end) | |
29120 | -{ | |
29121 | - if (vma->vm_mm == current->active_mm) | |
29122 | - __flush_tlb(); | |
29123 | -} | |
29124 | - | |
29125 | -#else /* SMP */ | |
29126 | - | |
29127 | -#include <asm/smp.h> | |
29128 | - | |
29129 | -#define local_flush_tlb() \ | |
29130 | - __flush_tlb() | |
29131 | - | |
29132 | -#define flush_tlb_all xen_tlb_flush_all | |
29133 | -#define flush_tlb_current_task() xen_tlb_flush_mask(¤t->mm->cpu_vm_mask) | |
29134 | -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask) | |
29135 | -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va) | |
29136 | - | |
29137 | -#define flush_tlb() flush_tlb_current_task() | |
29138 | - | |
29139 | -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) | |
29140 | -{ | |
29141 | - flush_tlb_mm(vma->vm_mm); | |
29142 | -} | |
29143 | - | |
29144 | -#define TLBSTATE_OK 1 | |
29145 | -#define TLBSTATE_LAZY 2 | |
29146 | - | |
29147 | -struct tlb_state | |
29148 | -{ | |
29149 | - struct mm_struct *active_mm; | |
29150 | - int state; | |
29151 | - char __cacheline_padding[L1_CACHE_BYTES-8]; | |
29152 | -}; | |
29153 | -DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); | |
29154 | -#endif /* SMP */ | |
29155 | - | |
29156 | -static inline void flush_tlb_kernel_range(unsigned long start, | |
29157 | - unsigned long end) | |
29158 | -{ | |
29159 | - flush_tlb_all(); | |
29160 | -} | |
29161 | - | |
29162 | -#endif /* _I386_TLBFLUSH_H */ | |
29163 | --- a/include/asm-x86/mach-xen/asm/tlbflush_64.h | |
29164 | +++ /dev/null | |
29165 | @@ -1,97 +0,0 @@ | |
29166 | -#ifndef _X8664_TLBFLUSH_H | |
29167 | -#define _X8664_TLBFLUSH_H | |
29168 | - | |
29169 | -#include <linux/mm.h> | |
29170 | -#include <linux/sched.h> | |
29171 | -#include <asm/processor.h> | |
29172 | -#include <asm/system.h> | |
29173 | - | |
29174 | -#define __flush_tlb() xen_tlb_flush() | |
29175 | - | |
29176 | -/* | |
29177 | - * Global pages have to be flushed a bit differently. Not a real | |
29178 | - * performance problem because this does not happen often. | |
29179 | - */ | |
29180 | -#define __flush_tlb_global() xen_tlb_flush() | |
29181 | - | |
29182 | -#define __flush_tlb_all() __flush_tlb_global() | |
29183 | - | |
29184 | -#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr) | |
29185 | - | |
29186 | - | |
29187 | -/* | |
29188 | - * TLB flushing: | |
29189 | - * | |
29190 | - * - flush_tlb() flushes the current mm struct TLBs | |
29191 | - * - flush_tlb_all() flushes all processes TLBs | |
29192 | - * - flush_tlb_mm(mm) flushes the specified mm context TLB's | |
29193 | - * - flush_tlb_page(vma, vmaddr) flushes one page | |
29194 | - * - flush_tlb_range(vma, start, end) flushes a range of pages | |
29195 | - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages | |
29196 | - * | |
29197 | - * x86-64 can only flush individual pages or full VMs. For a range flush | |
29198 | - * we always do the full VM. It might be worth testing whether, for a | |
29199 | - * small range, a few INVLPGs in a row are a win. | |
29200 | - */ | |
29201 | - | |
29202 | -#ifndef CONFIG_SMP | |
29203 | - | |
29204 | -#define flush_tlb() __flush_tlb() | |
29205 | -#define flush_tlb_all() __flush_tlb_all() | |
29206 | -#define local_flush_tlb() __flush_tlb() | |
29207 | - | |
29208 | -static inline void flush_tlb_mm(struct mm_struct *mm) | |
29209 | -{ | |
29210 | - if (mm == current->active_mm) | |
29211 | - __flush_tlb(); | |
29212 | -} | |
29213 | - | |
29214 | -static inline void flush_tlb_page(struct vm_area_struct *vma, | |
29215 | - unsigned long addr) | |
29216 | -{ | |
29217 | - if (vma->vm_mm == current->active_mm) | |
29218 | - __flush_tlb_one(addr); | |
29219 | -} | |
29220 | - | |
29221 | -static inline void flush_tlb_range(struct vm_area_struct *vma, | |
29222 | - unsigned long start, unsigned long end) | |
29223 | -{ | |
29224 | - if (vma->vm_mm == current->active_mm) | |
29225 | - __flush_tlb(); | |
29226 | -} | |
29227 | - | |
29228 | -#else | |
29229 | - | |
29230 | -#include <asm/smp.h> | |
29231 | - | |
29232 | -#define local_flush_tlb() \ | |
29233 | - __flush_tlb() | |
29234 | - | |
29235 | -#define flush_tlb_all xen_tlb_flush_all | |
29236 | -#define flush_tlb_current_task() xen_tlb_flush_mask(¤t->mm->cpu_vm_mask) | |
29237 | -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask) | |
29238 | -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va) | |
29239 | - | |
29240 | -#define flush_tlb() flush_tlb_current_task() | |
29241 | - | |
29242 | -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) | |
29243 | -{ | |
29244 | - flush_tlb_mm(vma->vm_mm); | |
29245 | -} | |
29246 | - | |
29247 | -#define TLBSTATE_OK 1 | |
29248 | -#define TLBSTATE_LAZY 2 | |
29249 | - | |
29250 | -/* Roughly an IPI every 20MB with 4k pages for freeing page table | |
29251 | - ranges. Cost is about 42k of memory for each CPU. */ | |
29252 | -#define ARCH_FREE_PTE_NR 5350 | |
29253 | - | |
29254 | -#endif | |
29255 | - | |
29256 | -static inline void flush_tlb_kernel_range(unsigned long start, | |
29257 | - unsigned long end) | |
29258 | -{ | |
29259 | - flush_tlb_all(); | |
29260 | -} | |
29261 | - | |
29262 | -#endif /* _X8664_TLBFLUSH_H */ | |
29263 | --- a/include/asm-x86/mach-xen/asm/tlbflush.h | |
29264 | +++ b/include/asm-x86/mach-xen/asm/tlbflush.h | |
29265 | @@ -1,5 +1,106 @@ | |
29266 | +#ifndef _ASM_X86_TLBFLUSH_H | |
29267 | +#define _ASM_X86_TLBFLUSH_H | |
29268 | + | |
29269 | +#include <linux/mm.h> | |
29270 | +#include <linux/sched.h> | |
29271 | + | |
29272 | +#include <asm/processor.h> | |
29273 | +#include <asm/system.h> | |
29274 | + | |
29275 | +#define __flush_tlb() xen_tlb_flush() | |
29276 | +#define __flush_tlb_global() xen_tlb_flush() | |
29277 | +#define __flush_tlb_single(addr) xen_invlpg(addr) | |
29278 | +#define __flush_tlb_all() xen_tlb_flush() | |
29279 | +#define __flush_tlb_one(addr) xen_invlpg(addr) | |
29280 | + | |
29281 | #ifdef CONFIG_X86_32 | |
29282 | -# include "tlbflush_32.h" | |
29283 | +# define TLB_FLUSH_ALL 0xffffffff | |
29284 | #else | |
29285 | -# include "tlbflush_64.h" | |
29286 | +# define TLB_FLUSH_ALL -1ULL | |
29287 | #endif | |
29288 | + | |
29289 | +/* | |
29290 | + * TLB flushing: | |
29291 | + * | |
29292 | + * - flush_tlb() flushes the current mm struct TLBs | |
29293 | + * - flush_tlb_all() flushes all processes TLBs | |
29294 | + * - flush_tlb_mm(mm) flushes the specified mm context TLB's | |
29295 | + * - flush_tlb_page(vma, vmaddr) flushes one page | |
29296 | + * - flush_tlb_range(vma, start, end) flushes a range of pages | |
29297 | + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages | |
29298 | + * | |
29299 | + * ..but the i386 has somewhat limited tlb flushing capabilities, | |
29300 | + * and page-granular flushes are available only on i486 and up. | |
29301 | + * | |
29302 | + * x86-64 can only flush individual pages or full VMs. For a range flush | |
29303 | + * we always do the full VM. It might be worth testing whether, for a | |
29304 | + * small range, a few INVLPGs in a row are a win. | |
29305 | + */ | |
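
For instance, a sketch of the per-VMA case (the PTE update itself is elided):

/* Sketch only: after rewriting the PTEs for [start, end) in a VMA, the
 * stale translations must go away on every CPU that may cache them;
 * flush_tlb_range() picks the right scope (in this Xen header it falls
 * back to a whole-mm flush). */
static void demo_change_range(struct vm_area_struct *vma,
			      unsigned long start, unsigned long end)
{
	/* ... modify the page table entries for [start, end) here ... */
	flush_tlb_range(vma, start, end);
}
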
29306 | + | |
29307 | +#ifndef CONFIG_SMP | |
29308 | + | |
29309 | +#define flush_tlb() __flush_tlb() | |
29310 | +#define flush_tlb_all() __flush_tlb_all() | |
29311 | +#define local_flush_tlb() __flush_tlb() | |
29312 | + | |
29313 | +static inline void flush_tlb_mm(struct mm_struct *mm) | |
29314 | +{ | |
29315 | + if (mm == current->active_mm) | |
29316 | + __flush_tlb(); | |
29317 | +} | |
29318 | + | |
29319 | +static inline void flush_tlb_page(struct vm_area_struct *vma, | |
29320 | + unsigned long addr) | |
29321 | +{ | |
29322 | + if (vma->vm_mm == current->active_mm) | |
29323 | + __flush_tlb_one(addr); | |
29324 | +} | |
29325 | + | |
29326 | +static inline void flush_tlb_range(struct vm_area_struct *vma, | |
29327 | + unsigned long start, unsigned long end) | |
29328 | +{ | |
29329 | + if (vma->vm_mm == current->active_mm) | |
29330 | + __flush_tlb(); | |
29331 | +} | |
29332 | + | |
29333 | +#else /* SMP */ | |
29334 | + | |
29335 | +#include <asm/smp.h> | |
29336 | + | |
29337 | +#define local_flush_tlb() __flush_tlb() | |
29338 | + | |
29339 | +#define flush_tlb_all xen_tlb_flush_all | |
29340 | +#define flush_tlb_current_task() xen_tlb_flush_mask(¤t->mm->cpu_vm_mask) | |
29341 | +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask) | |
29342 | +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va) | |
29343 | + | |
29344 | +#define flush_tlb() flush_tlb_current_task() | |
29345 | + | |
29346 | +static inline void flush_tlb_range(struct vm_area_struct *vma, | |
29347 | + unsigned long start, unsigned long end) | |
29348 | +{ | |
29349 | + flush_tlb_mm(vma->vm_mm); | |
29350 | +} | |
29351 | + | |
29352 | +#define TLBSTATE_OK 1 | |
29353 | +#define TLBSTATE_LAZY 2 | |
29354 | + | |
29355 | +#ifdef CONFIG_X86_32 | |
29356 | +struct tlb_state | |
29357 | +{ | |
29358 | + struct mm_struct *active_mm; | |
29359 | + int state; | |
29360 | + char __cacheline_padding[L1_CACHE_BYTES-8]; | |
29361 | +}; | |
29362 | +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); | |
29363 | +#endif | |
29364 | + | |
29365 | +#endif /* SMP */ | |
29366 | + | |
29367 | +static inline void flush_tlb_kernel_range(unsigned long start, | |
29368 | + unsigned long end) | |
29369 | +{ | |
29370 | + flush_tlb_all(); | |
29371 | +} | |
29372 | + | |
29373 | +#endif /* _ASM_X86_TLBFLUSH_H */ | |
29374 | --- a/include/asm-x86/mach-xen/irq_vectors.h | |
29375 | +++ b/include/asm-x86/mach-xen/irq_vectors.h | |
29376 | @@ -82,7 +82,8 @@ | |
29377 | ||
29378 | #define RESCHEDULE_VECTOR 0 | |
29379 | #define CALL_FUNCTION_VECTOR 1 | |
29380 | -#define NR_IPIS 2 | |
29381 | +#define SPIN_UNLOCK_VECTOR 2 | |
29382 | +#define NR_IPIS 3 | |
29383 | ||
29384 | /* | |
29385 | * The maximum number of vectors supported by i386 processors | |
29386 | --- a/include/asm-x86/mmu.h | |
29387 | +++ b/include/asm-x86/mmu.h | |
29388 | @@ -23,7 +23,7 @@ typedef struct { | |
29389 | void *vdso; | |
29390 | } mm_context_t; | |
29391 | ||
29392 | -#ifdef CONFIG_SMP | |
29393 | +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) | |
29394 | void leave_mm(int cpu); | |
29395 | #else | |
29396 | static inline void leave_mm(int cpu) | |
29397 | --- a/include/asm-x86/ptrace.h | |
29398 | +++ b/include/asm-x86/ptrace.h | |
29399 | @@ -249,7 +249,9 @@ extern void user_enable_single_step(stru | |
29400 | extern void user_disable_single_step(struct task_struct *); | |
29401 | ||
29402 | extern void user_enable_block_step(struct task_struct *); | |
29403 | -#ifdef CONFIG_X86_DEBUGCTLMSR | |
29404 | +#if defined(CONFIG_XEN) | |
29405 | +#define arch_has_block_step() (0) | |
29406 | +#elif defined(CONFIG_X86_DEBUGCTLMSR) | |
29407 | #define arch_has_block_step() (1) | |
29408 | #else | |
29409 | #define arch_has_block_step() (boot_cpu_data.x86 >= 6) | |
29410 | --- a/include/asm-x86/thread_info.h | |
29411 | +++ b/include/asm-x86/thread_info.h | |
29412 | @@ -94,6 +94,9 @@ struct thread_info { | |
29413 | #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ | |
29414 | #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ | |
29415 | #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ | |
29416 | +#ifdef CONFIG_X86_XEN | |
29417 | +#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */ | |
29418 | +#endif | |
29419 | ||
29420 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | |
29421 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) | |
29422 | @@ -118,6 +121,7 @@ struct thread_info { | |
29423 | #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) | |
29424 | #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) | |
29425 | #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) | |
29426 | +#define _TIF_CSTAR (1 << TIF_CSTAR) | |
29427 | ||
29428 | /* work to do in syscall_trace_enter() */ | |
29429 | #define _TIF_WORK_SYSCALL_ENTRY \ | |
29430 | @@ -147,12 +151,12 @@ struct thread_info { | |
29431 | (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ | |
29432 | _TIF_NOTSC|_TIF_PERFMON_CTXSW) | |
29433 | ||
29434 | -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW | |
29435 | -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) | |
29436 | #else | |
29437 | -#define _TIF_WORK_CTXSW_NEXT (_TIF_NOTSC | _TIF_DEBUG) | |
29438 | -#define _TIF_WORK_CTXSW_PREV (_TIF_NOTSC) | |
29439 | +#define _TIF_WORK_CTXSW (_TIF_NOTSC \ | |
29440 | + /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/) | |
29441 | #endif | |
29442 | +#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW | |
29443 | +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) | |
29444 | ||
29445 | #define PREEMPT_ACTIVE 0x10000000 | |
29446 | ||
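
The thread_info.h hunk defines _TIF_WORK_CTXSW once per configuration and derives the _PREV/_NEXT masks from it after the #endif, instead of duplicating them in both branches. These masks gate the context-switch slow path; roughly, inside __switch_to() (simplified from the 2.6.25 process_32/64 code, where tss is the per-CPU TSS pointer):

	/* Sketch: take the extra-work path only when either task has
	 * bits set in its respective context-switch work mask. */
	if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
		__switch_to_xtra(prev_p, next_p, tss);
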
29447 | --- a/include/asm-x86/time.h | |
29448 | +++ b/include/asm-x86/time.h | |
29449 | @@ -58,4 +58,10 @@ static inline int native_set_wallclock(u | |
29450 | ||
29451 | extern unsigned long __init calibrate_cpu(void); | |
29452 | ||
29453 | +#ifdef CONFIG_XEN | |
29454 | +extern int xen_independent_wallclock(void); | |
29455 | +extern unsigned long xen_read_persistent_clock(void); | |
29456 | +extern int xen_update_persistent_clock(void); | |
29457 | +#endif | |
29458 | + | |
29459 | #endif | |
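
The three new declarations let generic x86 time code ask Xen whether this domain keeps an independent wallclock and, if not, route persistent-clock reads and writes through the hypervisor. A hedged sketch of a caller; only the three declared functions come from the patch, the wrapper itself is illustrative:

	/* Illustrative: defer the RTC write to Xen unless this domain
	 * manages its wallclock independently. */
	static int set_wallclock_sketch(struct timespec now)
	{
	#ifdef CONFIG_XEN
		if (!xen_independent_wallclock())
			return xen_update_persistent_clock();
	#endif
		return native_set_wallclock(now.tv_sec);
	}
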
29460 | --- a/include/linux/page-flags.h | |
29461 | +++ b/include/linux/page-flags.h | |
29462 | @@ -101,8 +101,8 @@ enum pageflags { | |
29463 | PG_foreign, /* Page is owned by foreign allocator. */ | |
29464 | PG_pinned, /* Cannot alias with PG_owner_priv_1 since | |
29465 | * bad_page() checks include this bit. | |
29466 | - * Also cannot use PG_arch_1 since that now | |
29467 | - * has a different purpose on x86. */ | |
29468 | + * Should not use PG_arch_1 as that may have | |
29469 | + * a different purpose elsewhere. */ | |
29470 | #endif | |
29471 | __NR_PAGEFLAGS, | |
29472 | ||
29473 | --- a/include/linux/pci.h | |
29474 | +++ b/include/linux/pci.h | |
29475 | @@ -644,6 +644,9 @@ int pcie_set_readrq(struct pci_dev *dev, | |
29476 | void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno); | |
29477 | int __must_check pci_assign_resource(struct pci_dev *dev, int i); | |
29478 | int pci_select_bars(struct pci_dev *dev, unsigned long flags); | |
29479 | +#ifdef CONFIG_XEN | |
29480 | +void pci_restore_bars(struct pci_dev *); | |
29481 | +#endif | |
29482 | ||
29483 | /* ROM control related routines */ | |
29484 | void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size); | |
29485 | --- a/include/xen/evtchn.h | |
29486 | +++ b/include/xen/evtchn.h | |
29487 | @@ -130,12 +130,37 @@ static inline void clear_evtchn(int port | |
29488 | synch_clear_bit(port, s->evtchn_pending); | |
29489 | } | |
29490 | ||
29491 | +static inline void set_evtchn(int port) | |
29492 | +{ | |
29493 | + shared_info_t *s = HYPERVISOR_shared_info; | |
29494 | + synch_set_bit(port, s->evtchn_pending); | |
29495 | +} | |
29496 | + | |
29497 | +static inline int test_evtchn(int port) | |
29498 | +{ | |
29499 | + shared_info_t *s = HYPERVISOR_shared_info; | |
29500 | + return synch_test_bit(port, s->evtchn_pending); | |
29501 | +} | |
29502 | + | |
29503 | static inline void notify_remote_via_evtchn(int port) | |
29504 | { | |
29505 | struct evtchn_send send = { .port = port }; | |
29506 | VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send)); | |
29507 | } | |
29508 | ||
29509 | +/* Clear an irq's pending state, in preparation for polling on it. */ | |
29510 | +void xen_clear_irq_pending(int irq); | |
29511 | + | |
29512 | +/* Set an irq's pending state, to avoid blocking on it. */ | |
29513 | +void xen_set_irq_pending(int irq); | |
29514 | + | |
29515 | +/* Test an irq's pending state. */ | |
29516 | +int xen_test_irq_pending(int irq); | |
29517 | + | |
29518 | +/* Poll waiting for an irq to become pending. In the usual case, the | |
29519 | + irq will be disabled so it won't deliver an interrupt. */ | |
29520 | +void xen_poll_irq(int irq); | |
29521 | + | |
29522 | /* | |
29523 | * Use these to access the event channel underlying the IRQ handle returned | |
29524 | * by bind_*_to_irqhandler(). | |
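
The xen_*_irq_pending()/xen_poll_irq() helpers added above give callers a race-free way to sleep in the hypervisor until an irq's underlying event fires, even while the irq itself stays disabled. A minimal sketch of the intended sequence; the waiter function is illustrative, and the re-check matters because the poll may return spuriously:

	/* Sketch: block until another CPU sets this irq pending. */
	static void wait_for_kick(int irq)
	{
		xen_clear_irq_pending(irq);	/* arm: drop any stale event */
		do {
			xen_poll_irq(irq);	/* sleep in the hypervisor */
		} while (!xen_test_irq_pending(irq));
	}

On the waking side, xen_set_irq_pending() (or a real event delivery) makes a concurrent or later poll return immediately, which is what rules out lost wakeups.
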
29525 | --- a/kernel/sysctl_check.c | |
29526 | +++ b/kernel/sysctl_check.c | |
29527 | @@ -899,7 +899,7 @@ static const struct trans_ctl_table tran | |
29528 | }; | |
29529 | ||
29530 | #ifdef CONFIG_XEN | |
29531 | -static struct trans_ctl_table trans_xen_table[] = { | |
29532 | +static const struct trans_ctl_table trans_xen_table[] = { | |
29533 | { CTL_XEN_INDEPENDENT_WALLCLOCK, "independent_wallclock" }, | |
29534 | { CTL_XEN_PERMITTED_CLOCK_JITTER, "permitted_clock_jitter" }, | |
29535 | {} | |
29536 | --- a/lib/swiotlb-xen.c | |
29537 | +++ b/lib/swiotlb-xen.c | |
29538 | @@ -30,7 +30,6 @@ | |
29539 | #include <asm/gnttab_dma.h> | |
29540 | ||
29541 | int swiotlb; | |
29542 | -EXPORT_SYMBOL(swiotlb); | |
29543 | ||
29544 | #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1))) | |
29545 | ||
29546 | @@ -289,6 +288,15 @@ __sync_single(struct phys_addr buffer, c | |
29547 | } | |
29548 | } | |
29549 | ||
29550 | +static inline unsigned int is_span_boundary(unsigned int index, | |
29551 | + unsigned int nslots, | |
29552 | + unsigned long offset_slots, | |
29553 | + unsigned long max_slots) | |
29554 | +{ | |
29555 | + unsigned long offset = (offset_slots + index) & (max_slots - 1); | |
29556 | + return offset + nslots > max_slots; | |
29557 | +} | |
29558 | + | |
29559 | /* | |
29560 | * Allocates bounce buffer and returns its kernel virtual address. | |
29561 | */ | |
29562 | @@ -300,6 +308,15 @@ map_single(struct device *hwdev, struct | |
29563 | unsigned int nslots, stride, index, wrap; | |
29564 | struct phys_addr slot_buf; | |
29565 | int i; | |
29566 | + unsigned long mask; | |
29567 | + unsigned long offset_slots; | |
29568 | + unsigned long max_slots; | |
29569 | + | |
29570 | + mask = dma_get_seg_boundary(hwdev); | |
29571 | + offset_slots = -IO_TLB_SEGSIZE; | |
29572 | + max_slots = mask + 1 | |
29573 | + ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT | |
29574 | + : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); | |
29575 | ||
29576 | /* | |
29577 | * For mappings greater than a page, we limit the stride (and | |
29578 | @@ -319,12 +336,21 @@ map_single(struct device *hwdev, struct | |
29579 | */ | |
29580 | spin_lock_irqsave(&io_tlb_lock, flags); | |
29581 | { | |
29582 | - wrap = index = ALIGN(io_tlb_index, stride); | |
29583 | - | |
29584 | + index = ALIGN(io_tlb_index, stride); | |
29585 | if (index >= iotlb_nslabs) | |
29586 | - wrap = index = 0; | |
29587 | + index = 0; | |
29588 | + wrap = index; | |
29589 | ||
29590 | do { | |
29591 | + while (is_span_boundary(index, nslots, offset_slots, | |
29592 | + max_slots)) { | |
29593 | + index += stride; | |
29594 | + if (index >= iotlb_nslabs) | |
29595 | + index = 0; | |
29596 | + if (index == wrap) | |
29597 | + goto not_found; | |
29598 | + } | |
29599 | + | |
29600 | /* | |
29601 | * If we find a slot that indicates we have 'nslots' | |
29602 | * number of contiguous buffers, we allocate the | |
29603 | @@ -359,6 +385,7 @@ map_single(struct device *hwdev, struct | |
29604 | index = 0; | |
29605 | } while (index != wrap); | |
29606 | ||
29607 | + not_found: | |
29608 | spin_unlock_irqrestore(&io_tlb_lock, flags); | |
29609 | return NULL; | |
29610 | } |
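
The swiotlb changes above keep a bounce allocation from straddling the device's DMA segment boundary: max_slots converts the byte mask from dma_get_seg_boundary() into slot units (the `mask + 1 ?` test guards the mask == ~0UL case, where mask + 1 overflows to zero), and is_span_boundary() rejects any candidate run of nslots that would cross a max_slots-aligned window. A worked example under stated assumptions; the EX_* names and values are for illustration only:

	#define EX_IO_TLB_SHIFT	11	/* assume 2 KiB per slot */

	/* Mirror of the max_slots computation in map_single() above. */
	static unsigned long ex_max_slots(unsigned long mask)
	{
		return mask + 1
			? ALIGN(mask + 1, 1 << EX_IO_TLB_SHIFT)
				>> EX_IO_TLB_SHIFT
			: 1UL << (BITS_PER_LONG - EX_IO_TLB_SHIFT);
	}

	/* ex_max_slots(0xffff) == 32: with 2 KiB slots and a 64 KiB
	 * segment boundary, a run may occupy at most one 32-slot
	 * window.  is_span_boundary() flags exactly the runs that
	 * would cross one, and the search loop advances index by
	 * stride (wrapping as needed) until a window fits, or gives
	 * up at not_found. */
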