5 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
7 Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches.py
11 arch/x86/Kconfig.debug | 1
12 arch/x86/ia32/ia32entry-xen.S | 12
13 arch/x86/kernel/Makefile | 3
14 arch/x86/kernel/acpi/boot.c | 3
15 arch/x86/kernel/acpi/sleep-xen.c | 95 +
16 arch/x86/kernel/acpi/sleep_32-xen.c | 117 --
17 arch/x86/kernel/acpi/sleep_64-xen.c | 125 --
18 arch/x86/kernel/apic_32-xen.c | 2
19 arch/x86/kernel/apic_64-xen.c | 73 -
20 arch/x86/kernel/asm-offsets_32.c | 2
21 arch/x86/kernel/cpu/common-xen.c | 214 +--
22 arch/x86/kernel/cpu/mtrr/main-xen.c | 19
23 arch/x86/kernel/e820_32-xen.c | 275 -----
24 arch/x86/kernel/e820_64-xen.c | 485 +++++---
25 arch/x86/kernel/early_printk-xen.c | 2
26 arch/x86/kernel/entry_32-xen.S | 195 +++
27 arch/x86/kernel/entry_64-xen.S | 91 -
28 arch/x86/kernel/fixup.c | 2
29 arch/x86/kernel/genapic_64-xen.c | 15
30 arch/x86/kernel/head64-xen.c | 63 +
31 arch/x86/kernel/head_32-xen.S | 3
32 arch/x86/kernel/init_task-xen.c | 2
33 arch/x86/kernel/io_apic_32-xen.c | 15
34 arch/x86/kernel/io_apic_64-xen.c | 110 +-
35 arch/x86/kernel/ioport-xen.c | 112 ++
36 arch/x86/kernel/ioport_32-xen.c | 121 --
37 arch/x86/kernel/ioport_64-xen.c | 99 -
38 arch/x86/kernel/irq_32-xen.c | 22
39 arch/x86/kernel/irq_64-xen.c | 43
40 arch/x86/kernel/ldt-xen.c | 272 +++++
41 arch/x86/kernel/ldt_32-xen.c | 265 ----
42 arch/x86/kernel/ldt_64-xen.c | 271 ----
43 arch/x86/kernel/machine_kexec_64.c | 2
44 arch/x86/kernel/microcode-xen.c | 2
45 arch/x86/kernel/mpparse_32-xen.c | 49
46 arch/x86/kernel/mpparse_64-xen.c | 30
47 arch/x86/kernel/pci-dma-xen.c | 20
48 arch/x86/kernel/process_32-xen.c | 438 ++------
49 arch/x86/kernel/process_64-xen.c | 303 ++---
50 arch/x86/kernel/quirks-xen.c | 82 -
51 arch/x86/kernel/rtc.c | 8
52 arch/x86/kernel/setup64-xen.c | 70 +
53 arch/x86/kernel/setup_32-xen.c | 311 ++++-
54 arch/x86/kernel/setup_64-xen.c | 686 ++++++------
55 arch/x86/kernel/smp_32-xen.c | 5
56 arch/x86/kernel/smp_64-xen.c | 91 -
57 arch/x86/kernel/time_32-xen.c | 136 --
58 arch/x86/kernel/traps_32-xen.c | 320 +++--
59 arch/x86/kernel/traps_64-xen.c | 371 +++---
60 arch/x86/kernel/vsyscall_64-xen.c | 60 -
61 arch/x86/kernel/xen_entry_64.S | 36
62 arch/x86/mach-xen/setup.c | 11
63 arch/x86/mm/fault-xen.c | 1026 ++++++++++++++++++
64 arch/x86/mm/fault_32-xen.c | 757 -------------
65 arch/x86/mm/fault_64-xen.c | 686 ------------
66 arch/x86/mm/highmem_32-xen.c | 45
67 arch/x86/mm/hypervisor.c | 10
68 arch/x86/mm/init_32-xen.c | 464 +++-----
69 arch/x86/mm/init_64-xen.c | 517 ++++-----
70 arch/x86/mm/ioremap-xen.c | 685 ++++++++++++
71 arch/x86/mm/ioremap_32-xen.c | 445 --------
72 arch/x86/mm/pageattr-xen.c | 1412 ++++++++++++++++++++++++++
73 arch/x86/mm/pageattr_64-xen.c | 542 ---------
74 arch/x86/mm/pgtable_32-xen.c | 672 ++----------
75 arch/x86/pci/irq-xen.c | 24
76 arch/x86/vdso/Makefile | 1
77 arch/x86/vdso/vdso32-setup-xen.c | 506 +++++++++
78 arch/x86/vdso/vdso32-setup.c | 34
79 arch/x86/vdso/vdso32.S | 12
80 arch/x86/vdso/vdso32/syscall.S | 2
81 drivers/pci/msi-xen.c | 98 -
83 drivers/xen/balloon/sysfs.c | 2
84 drivers/xen/blkback/blkback.c | 5
85 drivers/xen/blkfront/blkfront.c | 9
86 drivers/xen/blktap/blktap.c | 8
87 drivers/xen/core/Makefile | 1
88 drivers/xen/core/evtchn.c | 46
89 drivers/xen/core/hypervisor_sysfs.c | 2
90 drivers/xen/core/smpboot.c | 29
91 drivers/xen/core/spinlock.c | 161 ++
92 drivers/xen/core/xen_sysfs.c | 30
93 drivers/xen/gntdev/gntdev.c | 4
94 drivers/xen/scsifront/scsifront.c | 49
95 drivers/xen/xenoprof/xenoprofile.c | 2
96 include/asm-x86/mach-xen/asm/agp.h | 9
97 include/asm-x86/mach-xen/asm/desc.h | 403 +++++++
98 include/asm-x86/mach-xen/asm/desc_32.h | 262 ----
99 include/asm-x86/mach-xen/asm/desc_64.h | 228 ----
100 include/asm-x86/mach-xen/asm/dma-mapping_32.h | 18
101 include/asm-x86/mach-xen/asm/fixmap_32.h | 24
102 include/asm-x86/mach-xen/asm/fixmap_64.h | 25
103 include/asm-x86/mach-xen/asm/highmem.h | 10
104 include/asm-x86/mach-xen/asm/hypervisor.h | 19
105 include/asm-x86/mach-xen/asm/io_32.h | 69 -
106 include/asm-x86/mach-xen/asm/io_64.h | 62 -
107 include/asm-x86/mach-xen/asm/irqflags.h | 248 ++++
108 include/asm-x86/mach-xen/asm/irqflags_32.h | 212 ---
109 include/asm-x86/mach-xen/asm/irqflags_64.h | 178 ---
110 include/asm-x86/mach-xen/asm/maddr_32.h | 21
111 include/asm-x86/mach-xen/asm/maddr_64.h | 19
112 include/asm-x86/mach-xen/asm/mmu_context_32.h | 2
113 include/asm-x86/mach-xen/asm/mmu_context_64.h | 12
114 include/asm-x86/mach-xen/asm/page.h | 238 ++++
115 include/asm-x86/mach-xen/asm/page_64.h | 196 ---
116 include/asm-x86/mach-xen/asm/pci.h | 17
117 include/asm-x86/mach-xen/asm/pci_64.h | 1
118 include/asm-x86/mach-xen/asm/pgalloc_32.h | 116 +-
119 include/asm-x86/mach-xen/asm/pgalloc_64.h | 87 -
120 include/asm-x86/mach-xen/asm/pgtable-3level.h | 107 -
121 include/asm-x86/mach-xen/asm/pgtable.h | 449 ++++++++
122 include/asm-x86/mach-xen/asm/pgtable_32.h | 361 ------
123 include/asm-x86/mach-xen/asm/pgtable_64.h | 400 +------
124 include/asm-x86/mach-xen/asm/processor.h | 792 ++++++++++++++
125 include/asm-x86/mach-xen/asm/processor_32.h | 751 -------------
126 include/asm-x86/mach-xen/asm/processor_64.h | 461 --------
127 include/asm-x86/mach-xen/asm/segment.h | 203 +++
128 include/asm-x86/mach-xen/asm/segment_32.h | 150 --
129 include/asm-x86/mach-xen/asm/smp_32.h | 125 +-
130 include/asm-x86/mach-xen/asm/smp_64.h | 138 --
131 include/asm-x86/mach-xen/asm/spinlock.h | 333 ++++++
132 include/asm-x86/mach-xen/asm/system.h | 392 +++++++
133 include/asm-x86/mach-xen/asm/system_32.h | 312 -----
134 include/asm-x86/mach-xen/asm/system_64.h | 159 --
135 include/asm-x86/mach-xen/asm/tlbflush.h | 105 +
136 include/asm-x86/mach-xen/asm/tlbflush_32.h | 99 -
137 include/asm-x86/mach-xen/asm/tlbflush_64.h | 97 -
138 include/asm-x86/mach-xen/irq_vectors.h | 3
139 include/asm-x86/mmu.h | 2
140 include/asm-x86/ptrace.h | 4
141 include/asm-x86/thread_info.h | 12
142 include/asm-x86/time.h | 6
143 include/linux/page-flags.h | 4
144 include/linux/pci.h | 3
145 include/xen/evtchn.h | 25
146 kernel/sysctl_check.c | 2
147 lib/swiotlb-xen.c | 35
148 138 files changed, 11322 insertions(+), 11153 deletions(-)
150 --- a/arch/x86/ia32/ia32entry-xen.S
151 +++ b/arch/x86/ia32/ia32entry-xen.S
153 #include <asm/ia32_unistd.h>
154 #include <asm/thread_info.h>
155 #include <asm/segment.h>
156 -#include <asm/vsyscall32.h>
157 #include <asm/irqflags.h>
158 #include <linux/linkage.h>
160 @@ -99,10 +98,11 @@ ENTRY(ia32_sysenter_target)
162 movl %ebp,%ebp /* zero extension */
164 + movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
165 movl $__USER32_DS,40(%rsp)
167 movl $__USER32_CS,16(%rsp)
168 - movl $VSYSCALL32_SYSEXIT,8(%rsp)
173 @@ -582,8 +582,8 @@ ia32_sys_call_table:
174 .quad compat_sys_futex /* 240 */
175 .quad compat_sys_sched_setaffinity
176 .quad compat_sys_sched_getaffinity
177 - .quad sys32_set_thread_area
178 - .quad sys32_get_thread_area
179 + .quad sys_set_thread_area
180 + .quad sys_get_thread_area
181 .quad compat_sys_io_setup /* 245 */
183 .quad compat_sys_io_getevents
184 @@ -661,7 +661,9 @@ ia32_sys_call_table:
185 .quad sys_epoll_pwait
186 .quad compat_sys_utimensat /* 320 */
187 .quad compat_sys_signalfd
188 - .quad compat_sys_timerfd
189 + .quad sys_timerfd_create
191 .quad sys32_fallocate
192 + .quad compat_sys_timerfd_settime /* 325 */
193 + .quad compat_sys_timerfd_gettime
195 --- a/arch/x86/Kconfig
196 +++ b/arch/x86/Kconfig
197 @@ -27,7 +27,7 @@ config X86
198 select HAVE_KRETPROBES
199 select HAVE_DYNAMIC_FTRACE
201 - select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
202 + select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
203 select HAVE_ARCH_KGDB if !X86_VOYAGER
204 select HAVE_ARCH_TRACEHOOK
205 select HAVE_GENERIC_DMA_COHERENT if X86_32
206 @@ -208,14 +208,12 @@ config X86_TRAMPOLINE
223 @@ -724,9 +722,8 @@ config X86_VISWS_APIC
224 depends on X86_32 && X86_VISWS
226 config X86_XEN_GENAPIC
229 depends on X86_64_XEN
233 bool "Machine Check Exception"
234 @@ -1113,7 +1110,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
236 config ARCH_SPARSEMEM_DEFAULT
239 + depends on X86_64 && !X86_64_XEN
241 config ARCH_SPARSEMEM_ENABLE
243 @@ -1743,10 +1740,10 @@ config PCI_MMCONFIG
244 depends on X86_64 && PCI && ACPI
246 config XEN_PCIDEV_FRONTEND
247 - bool "Xen PCI Frontend" if X86_64
249 + prompt "Xen PCI Frontend" if X86_64
250 depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
254 The PCI device frontend driver allows the kernel to import arbitrary
255 PCI devices from a PCI backend to support PCI driver domains.
256 @@ -1754,7 +1751,6 @@ config XEN_PCIDEV_FRONTEND
257 config XEN_PCIDEV_FE_DEBUG
258 bool "Xen PCI Frontend Debugging"
259 depends on XEN_PCIDEV_FRONTEND
262 Enables some debug statements within the PCI Frontend.
264 --- a/arch/x86/Kconfig.debug
265 +++ b/arch/x86/Kconfig.debug
266 @@ -266,6 +266,7 @@ config DEBUG_BOOT_PARAMS
267 bool "Debug boot parameters"
268 depends on DEBUG_KERNEL
272 This option will cause struct boot_params to be exported via debugfs.
274 --- a/arch/x86/kernel/acpi/boot.c
275 +++ b/arch/x86/kernel/acpi/boot.c
276 @@ -133,6 +133,9 @@ char *__init __acpi_map_table(unsigned l
278 if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
281 + if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
282 + return isa_bus_to_virt(phys);
285 offset = phys & (PAGE_SIZE - 1);
286 --- a/arch/x86/kernel/acpi/sleep_32-xen.c
290 - * sleep.c - x86-specific ACPI sleep support.
292 - * Copyright (C) 2001-2003 Patrick Mochel
293 - * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
296 -#include <linux/acpi.h>
297 -#include <linux/bootmem.h>
298 -#include <linux/dmi.h>
299 -#include <linux/cpumask.h>
301 -#include <asm/smp.h>
303 -#ifndef CONFIG_ACPI_PV_SLEEP
304 -/* address in low memory of the wakeup routine. */
305 -unsigned long acpi_wakeup_address = 0;
306 -unsigned long acpi_realmode_flags;
307 -extern char wakeup_start, wakeup_end;
309 -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
313 - * acpi_save_state_mem - save kernel state
315 - * Create an identity mapped page table and copy the wakeup routine to
318 -int acpi_save_state_mem(void)
320 -#ifndef CONFIG_ACPI_PV_SLEEP
321 - if (!acpi_wakeup_address)
323 - memcpy((void *)acpi_wakeup_address, &wakeup_start,
324 - &wakeup_end - &wakeup_start);
325 - acpi_copy_wakeup_routine(acpi_wakeup_address);
331 - * acpi_restore_state - undo effects of acpi_save_state_mem
333 -void acpi_restore_state_mem(void)
338 - * acpi_reserve_bootmem - do _very_ early ACPI initialisation
340 - * We allocate a page from the first 1MB of memory for the wakeup
341 - * routine for when we come back from a sleep state. The
342 - * runtime allocator allows specification of <16MB pages, but not
345 -void __init acpi_reserve_bootmem(void)
347 -#ifndef CONFIG_ACPI_PV_SLEEP
348 - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
350 - "ACPI: Wakeup code way too big, S3 disabled.\n");
354 - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
355 - if (!acpi_wakeup_address)
356 - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
360 -#ifndef CONFIG_ACPI_PV_SLEEP
361 -static int __init acpi_sleep_setup(char *str)
363 - while ((str != NULL) && (*str != '\0')) {
364 - if (strncmp(str, "s3_bios", 7) == 0)
365 - acpi_realmode_flags |= 1;
366 - if (strncmp(str, "s3_mode", 7) == 0)
367 - acpi_realmode_flags |= 2;
368 - if (strncmp(str, "s3_beep", 7) == 0)
369 - acpi_realmode_flags |= 4;
370 - str = strchr(str, ',');
372 - str += strspn(str, ", \t");
377 -__setup("acpi_sleep=", acpi_sleep_setup);
379 -/* Ouch, we want to delete this. We already have better version in userspace, in
380 - s2ram from suspend.sf.net project */
381 -static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
383 - acpi_realmode_flags |= 2;
387 -static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
388 - { /* Reset video mode after returning from ACPI S3 sleep */
389 - .callback = reset_videomode_after_s3,
390 - .ident = "Toshiba Satellite 4030cdt",
392 - DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
398 -static int __init acpisleep_dmi_init(void)
400 - dmi_check_system(acpisleep_dmi_table);
404 -core_initcall(acpisleep_dmi_init);
405 -#endif /* CONFIG_ACPI_PV_SLEEP */
406 --- a/arch/x86/kernel/acpi/sleep_64-xen.c
410 - * acpi.c - Architecture-Specific Low-Level ACPI Support
412 - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
413 - * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
414 - * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
415 - * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
416 - * Copyright (C) 2003 Pavel Machek, SuSE Labs
418 - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
420 - * This program is free software; you can redistribute it and/or modify
421 - * it under the terms of the GNU General Public License as published by
422 - * the Free Software Foundation; either version 2 of the License, or
423 - * (at your option) any later version.
425 - * This program is distributed in the hope that it will be useful,
426 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
427 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
428 - * GNU General Public License for more details.
430 - * You should have received a copy of the GNU General Public License
431 - * along with this program; if not, write to the Free Software
432 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
434 - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
437 -#include <linux/kernel.h>
438 -#include <linux/init.h>
439 -#include <linux/types.h>
440 -#include <linux/stddef.h>
441 -#include <linux/slab.h>
442 -#include <linux/pci.h>
443 -#include <linux/bootmem.h>
444 -#include <linux/acpi.h>
445 -#include <linux/cpumask.h>
447 -#include <asm/mpspec.h>
449 -#include <asm/apic.h>
450 -#include <asm/apicdef.h>
451 -#include <asm/page.h>
452 -#include <asm/pgtable.h>
453 -#include <asm/pgalloc.h>
454 -#include <asm/io_apic.h>
455 -#include <asm/proto.h>
456 -#include <asm/tlbflush.h>
458 -/* --------------------------------------------------------------------------
459 - Low-Level Sleep Support
460 - -------------------------------------------------------------------------- */
462 -#ifndef CONFIG_ACPI_PV_SLEEP
463 -/* address in low memory of the wakeup routine. */
464 -unsigned long acpi_wakeup_address = 0;
465 -unsigned long acpi_realmode_flags;
466 -extern char wakeup_start, wakeup_end;
468 -extern unsigned long acpi_copy_wakeup_routine(unsigned long);
472 - * acpi_save_state_mem - save kernel state
474 - * Create an identity mapped page table and copy the wakeup routine to
477 -int acpi_save_state_mem(void)
479 -#ifndef CONFIG_ACPI_PV_SLEEP
480 - memcpy((void *)acpi_wakeup_address, &wakeup_start,
481 - &wakeup_end - &wakeup_start);
482 - acpi_copy_wakeup_routine(acpi_wakeup_address);
488 - * acpi_restore_state
490 -void acpi_restore_state_mem(void)
495 - * acpi_reserve_bootmem - do _very_ early ACPI initialisation
497 - * We allocate a page in low memory for the wakeup
498 - * routine for when we come back from a sleep state. The
499 - * runtime allocator allows specification of <16M pages, but not
502 -void __init acpi_reserve_bootmem(void)
504 -#ifndef CONFIG_ACPI_PV_SLEEP
505 - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
506 - if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
508 - "ACPI: Wakeup code way too big, will crash on attempt"
513 -#ifndef CONFIG_ACPI_PV_SLEEP
514 -static int __init acpi_sleep_setup(char *str)
516 - while ((str != NULL) && (*str != '\0')) {
517 - if (strncmp(str, "s3_bios", 7) == 0)
518 - acpi_realmode_flags |= 1;
519 - if (strncmp(str, "s3_mode", 7) == 0)
520 - acpi_realmode_flags |= 2;
521 - if (strncmp(str, "s3_beep", 7) == 0)
522 - acpi_realmode_flags |= 4;
523 - str = strchr(str, ',');
525 - str += strspn(str, ", \t");
531 -__setup("acpi_sleep=", acpi_sleep_setup);
532 -#endif /* CONFIG_ACPI_PV_SLEEP */
535 +++ b/arch/x86/kernel/acpi/sleep-xen.c
538 + * sleep.c - x86-specific ACPI sleep support.
540 + * Copyright (C) 2001-2003 Patrick Mochel
541 + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
544 +#include <linux/acpi.h>
545 +#include <linux/bootmem.h>
546 +#include <linux/dmi.h>
547 +#include <linux/cpumask.h>
549 +#include <asm/smp.h>
551 +#ifndef CONFIG_ACPI_PV_SLEEP
552 +/* address in low memory of the wakeup routine. */
553 +unsigned long acpi_wakeup_address = 0;
554 +unsigned long acpi_realmode_flags;
555 +extern char wakeup_start, wakeup_end;
557 +extern unsigned long acpi_copy_wakeup_routine(unsigned long);
561 + * acpi_save_state_mem - save kernel state
563 + * Create an identity mapped page table and copy the wakeup routine to
566 +int acpi_save_state_mem(void)
568 +#ifndef CONFIG_ACPI_PV_SLEEP
569 + if (!acpi_wakeup_address) {
570 + printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
573 + memcpy((void *)acpi_wakeup_address, &wakeup_start,
574 + &wakeup_end - &wakeup_start);
575 + acpi_copy_wakeup_routine(acpi_wakeup_address);
582 + * acpi_restore_state - undo effects of acpi_save_state_mem
584 +void acpi_restore_state_mem(void)
590 + * acpi_reserve_bootmem - do _very_ early ACPI initialisation
592 + * We allocate a page from the first 1MB of memory for the wakeup
593 + * routine for when we come back from a sleep state. The
594 + * runtime allocator allows specification of <16MB pages, but not
597 +void __init acpi_reserve_bootmem(void)
599 +#ifndef CONFIG_ACPI_PV_SLEEP
600 + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
602 + "ACPI: Wakeup code way too big, S3 disabled.\n");
606 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
607 + if (!acpi_wakeup_address)
608 + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
613 +#ifndef CONFIG_ACPI_PV_SLEEP
614 +static int __init acpi_sleep_setup(char *str)
616 + while ((str != NULL) && (*str != '\0')) {
617 + if (strncmp(str, "s3_bios", 7) == 0)
618 + acpi_realmode_flags |= 1;
619 + if (strncmp(str, "s3_mode", 7) == 0)
620 + acpi_realmode_flags |= 2;
621 + if (strncmp(str, "s3_beep", 7) == 0)
622 + acpi_realmode_flags |= 4;
623 + str = strchr(str, ',');
625 + str += strspn(str, ", \t");
630 +__setup("acpi_sleep=", acpi_sleep_setup);
631 +#endif /* CONFIG_ACPI_PV_SLEEP */
632 --- a/arch/x86/kernel/apic_32-xen.c
633 +++ b/arch/x86/kernel/apic_32-xen.c
634 @@ -86,7 +86,7 @@ int setup_profiling_timer(unsigned int m
635 * This initializes the IO-APIC and APIC hardware if this is
638 -int __init APIC_init_uniprocessor (void)
639 +int __init APIC_init_uniprocessor(void)
641 #ifdef CONFIG_X86_IO_APIC
642 if (smp_found_config)
643 --- a/arch/x86/kernel/apic_64-xen.c
644 +++ b/arch/x86/kernel/apic_64-xen.c
646 #include <asm/hpet.h>
647 #include <asm/idle.h>
653 - * 'what should we do if we get a hw irq event on an illegal vector'.
654 - * each architecture has to answer this themselves.
655 + * Debug level, exported for io_apic.c
657 -void ack_bad_irq(unsigned int irq)
659 - printk("unexpected IRQ trap at irq %02x\n", irq);
661 - * Currently unexpected vectors happen only on SMP and APIC.
662 - * We _must_ ack these because every local APIC has only N
663 - * irq slots per priority level, and a 'hanging, unacked' IRQ
664 - * holds up an irq slot - in excessive cases (when multiple
665 - * unexpected vectors occur) that might lock up the APIC
667 - * But don't ack when the APIC is disabled. -AK
673 -int setup_profiling_timer(unsigned int multiplier)
679 -void smp_local_timer_interrupt(void)
681 + * The guts of the apic timer interrupt
683 +static void local_apic_timer_interrupt(void)
686 int cpu = smp_processor_id();
687 @@ -121,11 +104,34 @@ void smp_apic_timer_interrupt(struct pt_
691 - smp_local_timer_interrupt();
692 + local_apic_timer_interrupt();
694 set_irq_regs(old_regs);
697 +int setup_profiling_timer(unsigned int multiplier)
703 + * This initializes the IO-APIC and APIC hardware if this is
706 +int __init APIC_init_uniprocessor(void)
708 +#ifdef CONFIG_X86_IO_APIC
709 + if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
717 + * Local APIC interrupts
721 * This interrupt should _never_ happen with our APIC/SMP architecture
723 @@ -150,7 +156,6 @@ asmlinkage void smp_spurious_interrupt(v
725 * This interrupt should never happen with our APIC/SMP architecture
728 asmlinkage void smp_error_interrupt(void)
731 @@ -178,19 +183,3 @@ asmlinkage void smp_error_interrupt(void
732 smp_processor_id(), v , v1);
739 - * This initializes the IO-APIC and APIC hardware if this is
742 -int __init APIC_init_uniprocessor (void)
744 -#ifdef CONFIG_X86_IO_APIC
745 - if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
751 --- a/arch/x86/kernel/asm-offsets_32.c
752 +++ b/arch/x86/kernel/asm-offsets_32.c
754 #include <xen/interface/xen.h>
757 +#ifdef CONFIG_LGUEST_GUEST
758 #include <linux/lguest.h>
759 #include "../../../drivers/lguest/lg.h"
762 /* workaround for a warning with -Wmissing-prototypes */
764 --- a/arch/x86/kernel/cpu/common-xen.c
765 +++ b/arch/x86/kernel/cpu/common-xen.c
769 DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
770 - [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
771 - [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
772 - [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
773 - [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
774 + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
775 + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
776 + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
777 + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
780 * Segments used for calling PnP BIOS have byte granularity.
781 * They code segments and data segments have fixed 64k limits,
782 * the transfer segment sizes are set at run time.
784 - [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
785 - [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
786 - [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
787 - [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
788 - [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
790 + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
792 + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
794 + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
796 + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
798 + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
800 * The APM segments have byte granularity and their bases
801 * are set at run time. All have 64k limits.
803 - [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
805 + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
807 - [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
808 - [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
809 + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
811 + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
813 - [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
814 + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
816 - [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
817 + [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
819 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
821 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
823 static int cachesize_override __cpuinitdata = -1;
824 -static int disable_x86_fxsr __cpuinitdata;
825 static int disable_x86_serial_nr __cpuinitdata = 1;
826 -static int disable_x86_sep __cpuinitdata;
828 struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
830 -extern int disable_pse;
832 static void __cpuinit default_init(struct cpuinfo_x86 * c)
834 /* Not much we can do here... */
835 @@ -214,16 +219,8 @@ static void __cpuinit get_cpu_vendor(str
837 static int __init x86_fxsr_setup(char * s)
839 - /* Tell all the other CPUs to not use it... */
840 - disable_x86_fxsr = 1;
843 - * ... and clear the bits early in the boot_cpu_data
844 - * so that the bootup process doesn't try to do this
847 - clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
848 - clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
849 + setup_clear_cpu_cap(X86_FEATURE_FXSR);
850 + setup_clear_cpu_cap(X86_FEATURE_XMM);
853 __setup("nofxsr", x86_fxsr_setup);
854 @@ -231,7 +228,7 @@ __setup("nofxsr", x86_fxsr_setup);
856 static int __init x86_sep_setup(char * s)
858 - disable_x86_sep = 1;
859 + setup_clear_cpu_cap(X86_FEATURE_SEP);
862 __setup("nosep", x86_sep_setup);
863 @@ -268,10 +265,10 @@ static int __cpuinit have_cpuid_p(void)
864 void __init cpu_detect(struct cpuinfo_x86 *c)
866 /* Get vendor name */
867 - cpuid(0x00000000, &c->cpuid_level,
868 - (int *)&c->x86_vendor_id[0],
869 - (int *)&c->x86_vendor_id[8],
870 - (int *)&c->x86_vendor_id[4]);
871 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
872 + (unsigned int *)&c->x86_vendor_id[0],
873 + (unsigned int *)&c->x86_vendor_id[8],
874 + (unsigned int *)&c->x86_vendor_id[4]);
877 if (c->cpuid_level >= 0x00000001) {
878 @@ -284,9 +281,38 @@ void __init cpu_detect(struct cpuinfo_x8
880 c->x86_model += ((tfms >> 16) & 0xF) << 4;
881 c->x86_mask = tfms & 15;
882 - if (cap0 & (1<<19))
883 + if (cap0 & (1<<19)) {
884 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
885 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
889 +static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
894 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
895 + if (have_cpuid_p()) {
896 + /* Intel-defined flags: level 0x00000001 */
897 + if (c->cpuid_level >= 0x00000001) {
898 + u32 capability, excap;
899 + cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
900 + c->x86_capability[0] = capability;
901 + c->x86_capability[4] = excap;
904 + /* AMD-defined flags: level 0x80000001 */
905 + xlvl = cpuid_eax(0x80000000);
906 + if ((xlvl & 0xffff0000) == 0x80000000) {
907 + if (xlvl >= 0x80000001) {
908 + c->x86_capability[1] = cpuid_edx(0x80000001);
909 + c->x86_capability[6] = cpuid_ecx(0x80000001);
917 /* Do minimum CPU detection early.
918 @@ -300,6 +326,7 @@ static void __init early_cpu_detect(void
919 struct cpuinfo_x86 *c = &boot_cpu_data;
921 c->x86_cache_alignment = 32;
922 + c->x86_clflush_size = 32;
926 @@ -307,19 +334,30 @@ static void __init early_cpu_detect(void
929 get_cpu_vendor(c, 1);
931 + switch (c->x86_vendor) {
932 + case X86_VENDOR_AMD:
935 + case X86_VENDOR_INTEL:
936 + early_init_intel(c);
943 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
949 if (have_cpuid_p()) {
950 /* Get vendor name */
951 - cpuid(0x00000000, &c->cpuid_level,
952 - (int *)&c->x86_vendor_id[0],
953 - (int *)&c->x86_vendor_id[8],
954 - (int *)&c->x86_vendor_id[4]);
955 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
956 + (unsigned int *)&c->x86_vendor_id[0],
957 + (unsigned int *)&c->x86_vendor_id[8],
958 + (unsigned int *)&c->x86_vendor_id[4]);
960 get_cpu_vendor(c, 0);
961 /* Initialize the standard set of capabilities */
962 @@ -364,8 +402,6 @@ static void __cpuinit generic_identify(s
963 init_scattered_cpuid_features(c);
966 - early_intel_workaround(c);
969 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
971 @@ -399,7 +435,7 @@ __setup("serialnumber", x86_serial_nr_se
973 * This does the hard work of actually picking apart the CPU stuff...
975 -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
976 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
980 @@ -425,20 +461,9 @@ static void __cpuinit identify_cpu(struc
984 - printk(KERN_DEBUG "CPU: After generic identify, caps:");
985 - for (i = 0; i < NCAPINTS; i++)
986 - printk(" %08lx", c->x86_capability[i]);
989 - if (this_cpu->c_identify) {
990 + if (this_cpu->c_identify)
991 this_cpu->c_identify(c);
993 - printk(KERN_DEBUG "CPU: After vendor identify, caps:");
994 - for (i = 0; i < NCAPINTS; i++)
995 - printk(" %08lx", c->x86_capability[i]);
1000 * Vendor-specific initialization. In this section we
1001 * canonicalize the feature flags, meaning if there are
1002 @@ -460,23 +485,6 @@ static void __cpuinit identify_cpu(struc
1003 * we do "generic changes."
1006 - /* TSC disabled? */
1007 - if ( tsc_disable )
1008 - clear_bit(X86_FEATURE_TSC, c->x86_capability);
1010 - /* FXSR disabled? */
1011 - if (disable_x86_fxsr) {
1012 - clear_bit(X86_FEATURE_FXSR, c->x86_capability);
1013 - clear_bit(X86_FEATURE_XMM, c->x86_capability);
1016 - /* SEP disabled? */
1017 - if (disable_x86_sep)
1018 - clear_bit(X86_FEATURE_SEP, c->x86_capability);
1021 - clear_bit(X86_FEATURE_PSE, c->x86_capability);
1023 /* If the model name is still unset, do table lookup. */
1024 if ( !c->x86_model_id[0] ) {
1026 @@ -489,13 +497,6 @@ static void __cpuinit identify_cpu(struc
1027 c->x86, c->x86_model);
1030 - /* Now the feature flags better reflect actual CPU features! */
1032 - printk(KERN_DEBUG "CPU: After all inits, caps:");
1033 - for (i = 0; i < NCAPINTS; i++)
1034 - printk(" %08lx", c->x86_capability[i]);
1038 * On SMP, boot_cpu_data holds the common feature set between
1039 * all CPUs; so make sure that we indicate which features are
1040 @@ -508,8 +509,14 @@ static void __cpuinit identify_cpu(struc
1041 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1044 + /* Clear all flags overridden by options */
1045 + for (i = 0; i < NCAPINTS; i++)
1046 + c->x86_capability[i] &= ~cleared_cpu_caps[i];
1048 /* Init Machine Check Exception if available. */
1051 + select_idle_routine(c);
1054 void __init identify_boot_cpu(void)
1055 @@ -517,7 +524,6 @@ void __init identify_boot_cpu(void)
1056 identify_cpu(&boot_cpu_data);
1062 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1063 @@ -574,6 +580,13 @@ void __cpuinit detect_ht(struct cpuinfo_
1067 +static __init int setup_noclflush(char *arg)
1069 + setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1072 +__setup("noclflush", setup_noclflush);
1074 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1076 char *vendor = NULL;
1077 @@ -597,6 +610,17 @@ void __cpuinit print_cpu_info(struct cpu
1081 +static __init int setup_disablecpuid(char *arg)
1084 + if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1085 + setup_clear_cpu_cap(bit);
1090 +__setup("clearcpuid=", setup_disablecpuid);
1092 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1094 /* This is hacky. :)
1095 @@ -606,16 +630,6 @@ cpumask_t cpu_initialized __cpuinitdata
1096 * They will insert themselves into the cpu_devs structure.
1097 * Then, when cpu_init() is called, we can just iterate over that array.
1100 -extern int intel_cpu_init(void);
1101 -extern int cyrix_init_cpu(void);
1102 -extern int nsc_init_cpu(void);
1103 -extern int amd_init_cpu(void);
1104 -extern int centaur_init_cpu(void);
1105 -extern int transmeta_init_cpu(void);
1106 -extern int nexgen_init_cpu(void);
1107 -extern int umc_init_cpu(void);
1109 void __init early_cpu_init(void)
1112 @@ -627,21 +641,13 @@ void __init early_cpu_init(void)
1117 -#ifdef CONFIG_DEBUG_PAGEALLOC
1118 - /* pse is not compatible with on-the-fly unmapping,
1119 - * disable it even if the cpus claim to support it.
1121 - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
1126 /* Make sure %fs is initialized properly in idle threads */
1127 -struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
1128 +struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
1130 memset(regs, 0, sizeof(struct pt_regs));
1131 - regs->xfs = __KERNEL_PERCPU;
1132 + regs->fs = __KERNEL_PERCPU;
1136 @@ -649,7 +655,7 @@ struct pt_regs * __devinit idle_regs(str
1137 * it's on the real one. */
1138 void switch_to_new_gdt(void)
1140 - struct Xgt_desc_struct gdt_descr;
1141 + struct desc_ptr gdt_descr;
1142 unsigned long va, frames[16];
1145 @@ -692,12 +698,6 @@ void __cpuinit cpu_init(void)
1147 if (cpu_has_vme || cpu_has_de)
1148 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1149 - if (tsc_disable && cpu_has_tsc) {
1150 - printk(KERN_NOTICE "Disabling TSC...\n");
1151 - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
1152 - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
1153 - set_in_cr4(X86_CR4_TSD);
1156 switch_to_new_gdt();
1158 @@ -710,7 +710,7 @@ void __cpuinit cpu_init(void)
1160 enter_lazy_tlb(&init_mm, curr);
1162 - load_esp0(t, thread);
1163 + load_sp0(t, thread);
1165 load_LDT(&init_mm.context);
1167 --- a/arch/x86/kernel/cpu/mtrr/main-xen.c
1168 +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
1169 @@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = {
1171 struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
1172 unsigned int num_var_ranges;
1173 -unsigned int *usage_table;
1174 +unsigned int mtrr_usage_table[MAX_VAR_RANGES];
1176 static void __init set_num_var_ranges(void)
1178 @@ -52,17 +52,12 @@ static void __init init_table(void)
1181 max = num_var_ranges;
1182 - if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
1184 - printk(KERN_ERR "mtrr: could not allocate\n");
1187 for (i = 0; i < max; i++)
1188 - usage_table[i] = 0;
1189 + mtrr_usage_table[i] = 0;
1192 int mtrr_add_page(unsigned long base, unsigned long size,
1193 - unsigned int type, char increment)
1194 + unsigned int type, bool increment)
1197 struct xen_platform_op op;
1198 @@ -81,7 +76,7 @@ int mtrr_add_page(unsigned long base, un
1202 - ++usage_table[op.u.add_memtype.reg];
1203 + ++mtrr_usage_table[op.u.add_memtype.reg];
1205 mutex_unlock(&mtrr_mutex);
1207 @@ -103,7 +98,7 @@ static int mtrr_check(unsigned long base
1210 mtrr_add(unsigned long base, unsigned long size, unsigned int type,
1214 if (mtrr_check(base, size))
1216 @@ -136,11 +131,11 @@ int mtrr_del_page(int reg, unsigned long
1220 - if (usage_table[reg] < 1) {
1221 + if (mtrr_usage_table[reg] < 1) {
1222 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
1225 - if (--usage_table[reg] < 1) {
1226 + if (--mtrr_usage_table[reg] < 1) {
1227 op.cmd = XENPF_del_memtype;
1228 op.u.del_memtype.handle = 0;
1229 op.u.del_memtype.reg = reg;
1230 --- a/arch/x86/kernel/e820_32-xen.c
1231 +++ b/arch/x86/kernel/e820_32-xen.c
1233 #include <linux/kexec.h>
1234 #include <linux/module.h>
1235 #include <linux/mm.h>
1236 -#include <linux/efi.h>
1237 #include <linux/pfn.h>
1238 #include <linux/uaccess.h>
1239 #include <linux/suspend.h>
1241 #include <asm/setup.h>
1242 #include <xen/interface/memory.h>
1245 -int efi_enabled = 0;
1246 -EXPORT_SYMBOL(efi_enabled);
1249 struct e820map e820;
1250 struct change_member {
1251 struct e820entry *pbios; /* pointer to original bios entry */
1252 @@ -38,26 +32,6 @@ unsigned long pci_mem_start = 0x10000000
1253 EXPORT_SYMBOL(pci_mem_start);
1255 extern int user_defined_memmap;
1256 -struct resource data_resource = {
1257 - .name = "Kernel data",
1260 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1263 -struct resource code_resource = {
1264 - .name = "Kernel code",
1267 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1270 -struct resource bss_resource = {
1271 - .name = "Kernel bss",
1274 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1277 static struct resource system_rom_resource = {
1278 .name = "System ROM",
1279 @@ -112,60 +86,6 @@ static struct resource video_rom_resourc
1280 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
1283 -static struct resource video_ram_resource = {
1284 - .name = "Video RAM area",
1287 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1290 -static struct resource standard_io_resources[] = { {
1294 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1299 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1304 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1309 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1311 - .name = "keyboard",
1314 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1316 - .name = "dma page reg",
1319 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1324 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1329 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1334 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1337 #define ROMSIGNATURE 0xaa55
1339 static int __init romsignature(const unsigned char *rom)
1340 @@ -272,10 +192,9 @@ static struct e820map machine_e820;
1341 * Request address space for all standard RAM and ROM resources
1342 * and also for regions reported as reserved by the e820.
1345 -legacy_init_iomem_resources(struct resource *code_resource,
1346 - struct resource *data_resource,
1347 - struct resource *bss_resource)
1348 +void __init init_iomem_resources(struct resource *code_resource,
1349 + struct resource *data_resource,
1350 + struct resource *bss_resource)
1354 @@ -324,39 +243,6 @@ legacy_init_iomem_resources(struct resou
1359 - * Request address space for all standard resources
1361 - * This is called just before pcibios_init(), which is also a
1362 - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
1364 -static int __init request_standard_resources(void)
1368 - /* Nothing to do if not running in dom0. */
1369 - if (!is_initial_xendomain())
1372 - printk("Setting up standard PCI resources\n");
1374 - efi_initialize_iomem_resources(&code_resource,
1375 - &data_resource, &bss_resource);
1377 - legacy_init_iomem_resources(&code_resource,
1378 - &data_resource, &bss_resource);
1380 - /* EFI systems may still have VGA */
1381 - request_resource(&iomem_resource, &video_ram_resource);
1383 - /* request I/O space for devices used on all i[345]86 PCs */
1384 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
1385 - request_resource(&ioport_resource, &standard_io_resources[i]);
1389 -subsys_initcall(request_standard_resources);
1391 #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
1393 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
1394 @@ -393,19 +279,17 @@ void __init add_memory_region(unsigned l
1398 - if (!efi_enabled) {
1401 - if (x == E820MAX) {
1402 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1407 - e820.map[x].addr = start;
1408 - e820.map[x].size = size;
1409 - e820.map[x].type = type;
1411 + if (x == E820MAX) {
1412 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1416 + e820.map[x].addr = start;
1417 + e820.map[x].size = size;
1418 + e820.map[x].type = type;
1420 } /* add_memory_region */
1423 @@ -642,29 +526,6 @@ int __init copy_e820_map(struct e820entr
1427 - * Callback for efi_memory_walk.
1430 -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
1432 - unsigned long *max_pfn = arg, pfn;
1434 - if (start < end) {
1435 - pfn = PFN_UP(end -1);
1436 - if (pfn > *max_pfn)
1443 -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
1445 - memory_present(0, PFN_UP(start), PFN_DOWN(end));
1450 * Find the highest page frame number we have available
1452 void __init find_max_pfn(void)
1453 @@ -672,11 +533,6 @@ void __init find_max_pfn(void)
1457 - if (efi_enabled) {
1458 - efi_memmap_walk(efi_find_max_pfn, &max_pfn);
1459 - efi_memmap_walk(efi_memory_present_wrapper, NULL);
1463 for (i = 0; i < e820.nr_map; i++) {
1464 unsigned long start, end;
1465 @@ -694,34 +550,12 @@ void __init find_max_pfn(void)
1469 - * Free all available memory for boot time allocation. Used
1470 - * as a callback function by efi_memory_walk()
1474 -free_available_memory(unsigned long start, unsigned long end, void *arg)
1476 - /* check max_low_pfn */
1477 - if (start >= (max_low_pfn << PAGE_SHIFT))
1479 - if (end >= (max_low_pfn << PAGE_SHIFT))
1480 - end = max_low_pfn << PAGE_SHIFT;
1482 - free_bootmem(start, end - start);
1487 * Register fully available low RAM pages with the bootmem allocator.
1489 void __init register_bootmem_low_pages(unsigned long max_low_pfn)
1493 - if (efi_enabled) {
1494 - efi_memmap_walk(free_available_memory, NULL);
1497 for (i = 0; i < e820.nr_map; i++) {
1498 unsigned long curr_pfn, last_pfn, size;
1500 @@ -855,56 +689,12 @@ void __init print_memory_map(char *who)
1504 -static __init __always_inline void efi_limit_regions(unsigned long long size)
1506 - unsigned long long current_addr = 0;
1507 - efi_memory_desc_t *md, *next_md;
1513 - for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
1516 - current_addr = md->phys_addr +
1517 - PFN_PHYS(md->num_pages);
1518 - if (is_available_memory(md)) {
1519 - if (md->phys_addr >= size) continue;
1520 - memcpy(next_md, md, memmap.desc_size);
1521 - if (current_addr >= size) {
1522 - next_md->num_pages -=
1523 - PFN_UP(current_addr-size);
1525 - p1 += memmap.desc_size;
1528 - } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
1529 - EFI_MEMORY_RUNTIME) {
1530 - /* In order to make runtime services
1531 - * available we have to include runtime
1532 - * memory regions in memory map */
1533 - memcpy(next_md, md, memmap.desc_size);
1534 - p1 += memmap.desc_size;
1539 - memmap.nr_map = j;
1540 - memmap.map_end = memmap.map +
1541 - (memmap.nr_map * memmap.desc_size);
1544 void __init limit_regions(unsigned long long size)
1546 unsigned long long current_addr = 0;
1549 print_memory_map("limit_regions start");
1550 - if (efi_enabled) {
1551 - efi_limit_regions(size);
1554 for (i = 0; i < e820.nr_map; i++) {
1555 current_addr = e820.map[i].addr + e820.map[i].size;
1556 if (current_addr < size)
1557 @@ -1056,3 +846,44 @@ static int __init parse_memmap(char *arg
1560 early_param("memmap", parse_memmap);
1563 +void __init update_memory_range(u64 start, u64 size, unsigned old_type,
1564 + unsigned new_type)
1568 + BUG_ON(old_type == new_type);
1570 + for (i = 0; i < e820.nr_map; i++) {
1571 + struct e820entry *ei = &e820.map[i];
1572 + u64 final_start, final_end;
1573 + if (ei->type != old_type)
1575 + /* totally covered? */
1576 + if (ei->addr >= start && ei->size <= size) {
1577 + ei->type = new_type;
1580 + /* partially covered */
1581 + final_start = max(start, ei->addr);
1582 + final_end = min(start + size, ei->addr + ei->size);
1583 + if (final_start >= final_end)
1585 + add_memory_region(final_start, final_end - final_start,
1590 +void __init update_e820(void)
1594 + nr_map = e820.nr_map;
1595 + if (sanitize_e820_map(e820.map, &nr_map))
1597 + e820.nr_map = nr_map;
1598 + printk(KERN_INFO "modified physical RAM map:\n");
1599 + print_memory_map("modified");
1602 --- a/arch/x86/kernel/e820_64-xen.c
1603 +++ b/arch/x86/kernel/e820_64-xen.c
1607 * Handle the memory map.
1608 * The functions here do the job until bootmem takes over.
1611 #include <asm/proto.h>
1612 #include <asm/setup.h>
1613 #include <asm/sections.h>
1614 +#include <asm/kdebug.h>
1615 #include <xen/interface/memory.h>
1617 struct e820map e820 __initdata;
1618 @@ -33,96 +34,103 @@ struct e820map e820 __initdata;
1619 struct e820map machine_e820;
1624 * PFN of last memory page.
1626 -unsigned long end_pfn;
1627 -EXPORT_SYMBOL(end_pfn);
1628 +unsigned long end_pfn;
1632 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
1633 * The direct mapping extends to end_pfn_map, so that we can directly access
1634 * apertures, ACPI and other tables without having to play with fixmaps.
1636 -unsigned long end_pfn_map;
1638 +unsigned long end_pfn_map;
1642 * Last pfn which the user wants to use.
1644 static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
1646 -extern struct resource code_resource, data_resource, bss_resource;
1648 -/* Check for some hardcoded bad areas that early boot is not allowed to touch */
1649 -static inline int bad_addr(unsigned long *addrp, unsigned long size)
1651 - unsigned long addr = *addrp, last = addr + size;
1653 + * Early reserved memory areas.
1655 +#define MAX_EARLY_RES 20
1658 + unsigned long start, end;
1661 +static struct early_res early_res[MAX_EARLY_RES] __initdata = {
1663 - /* various gunk below that needed for SMP startup */
1664 - if (addr < 0x8000) {
1665 - *addrp = PAGE_ALIGN(0x8000);
1669 - /* direct mapping tables of the kernel */
1670 - if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
1671 - *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
1676 -#ifdef CONFIG_BLK_DEV_INITRD
1677 - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
1678 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
1679 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
1680 - unsigned long ramdisk_end = ramdisk_image+ramdisk_size;
1682 - if (last >= ramdisk_image && addr < ramdisk_end) {
1683 - *addrp = PAGE_ALIGN(ramdisk_end);
1687 + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
1689 + { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
1692 - if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
1693 - *addrp = PAGE_ALIGN(__pa_symbol(&_end));
1700 - if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
1701 - *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
1703 +void __init reserve_early(unsigned long start, unsigned long end, char *name)
1706 + struct early_res *r;
1707 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1708 + r = &early_res[i];
1709 + if (end > r->start && start < r->end)
1710 + panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
1711 + start, end - 1, name?name:"", r->start, r->end - 1, r->name);
1713 + if (i >= MAX_EARLY_RES)
1714 + panic("Too many early reservations");
1715 + r = &early_res[i];
1719 + strncpy(r->name, name, sizeof(r->name) - 1);
1723 - /* NUMA memory to node map */
1724 - if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
1725 - *addrp = nodemap_addr + nodemap_size;
1727 +void __init early_res_to_bootmem(void)
1730 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1731 + struct early_res *r = &early_res[i];
1732 + printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
1733 + r->start, r->end - 1, r->name);
1734 + reserve_bootmem_generic(r->start, r->end - r->start);
1737 - /* XXX ramdisk image here? */
1739 - if (last < (table_end<<PAGE_SHIFT)) {
1740 - *addrp = table_end << PAGE_SHIFT;
1744 +/* Check for already reserved areas */
1745 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
1748 + unsigned long addr = *addrp, last;
1751 + last = addr + size;
1752 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1753 + struct early_res *r = &early_res[i];
1754 + if (last >= r->start && addr < r->end) {
1755 + *addrp = addr = r->end;
1767 * This function checks if any part of the range <start,end> is mapped
1770 -int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1773 +e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1778 - for (i = 0; i < e820.nr_map; i++) {
1779 - struct e820entry *ei = &e820.map[i];
1780 + for (i = 0; i < e820.nr_map; i++) {
1781 + struct e820entry *ei = &e820.map[i];
1783 if (!is_initial_xendomain())
1785 @@ -130,12 +138,12 @@ int e820_any_mapped(unsigned long start,
1786 const struct e820entry *ei = &machine_e820.map[i];
1789 - if (type && ei->type != type)
1790 + if (type && ei->type != type)
1792 if (ei->addr >= end || ei->addr + ei->size <= start)
1801 EXPORT_SYMBOL_GPL(e820_any_mapped);
1802 @@ -146,7 +154,8 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
1803 * Note: this function only works correct if the e820 table is sorted and
1804 * not-overlapping, which is the case
1806 -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
1807 +int __init e820_all_mapped(unsigned long start, unsigned long end,
1812 @@ -171,65 +180,77 @@ int __init e820_all_mapped(unsigned long
1814 if (ei->addr <= start)
1815 start = ei->addr + ei->size;
1816 - /* if start is now at or beyond end, we're done, full coverage */
1818 + * if start is now at or beyond end, we're done, full
1822 - return 1; /* we're done */
1829 - * Find a free area in a specific range.
1831 -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
1834 - for (i = 0; i < e820.nr_map; i++) {
1835 - struct e820entry *ei = &e820.map[i];
1836 - unsigned long addr = ei->addr, last;
1837 - if (ei->type != E820_RAM)
1841 + * Find a free area with specified alignment in a specific range.
1843 +unsigned long __init find_e820_area(unsigned long start, unsigned long end,
1844 + unsigned size, unsigned long align)
1847 + unsigned long mask = ~(align - 1);
1849 + for (i = 0; i < e820.nr_map; i++) {
1850 + struct e820entry *ei = &e820.map[i];
1851 + unsigned long addr = ei->addr, last;
1853 + if (ei->type != E820_RAM)
1857 - if (addr > ei->addr + ei->size)
1859 + if (addr > ei->addr + ei->size)
1861 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
1863 - last = PAGE_ALIGN(addr) + size;
1864 + addr = (addr + align - 1) & mask;
1865 + last = addr + size;
1866 if (last > ei->addr + ei->size)
1881 * Find the highest page frame number we have available
1883 unsigned long __init e820_end_of_ram(void)
1885 - unsigned long end_pfn = 0;
1886 + unsigned long end_pfn;
1888 end_pfn = find_max_pfn_with_active_regions();
1890 - if (end_pfn > end_pfn_map)
1892 + if (end_pfn > end_pfn_map)
1893 end_pfn_map = end_pfn;
1894 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
1895 end_pfn_map = MAXMEM>>PAGE_SHIFT;
1896 if (end_pfn > end_user_pfn)
1897 end_pfn = end_user_pfn;
1898 - if (end_pfn > end_pfn_map)
1899 - end_pfn = end_pfn_map;
1900 + if (end_pfn > end_pfn_map)
1901 + end_pfn = end_pfn_map;
1903 - printk("end_pfn_map = %lu\n", end_pfn_map);
1905 + printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
1910 * Mark e820 reserved areas as busy for the resource manager.
1912 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
1913 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
1914 + struct resource *code_resource,
1915 + struct resource *data_resource,
1916 + struct resource *bss_resource)
1919 for (i = 0; i < nr_map; i++) {
1920 @@ -247,14 +268,14 @@ void __init e820_reserve_resources(struc
1921 request_resource(&iomem_resource, res);
1922 if (e820[i].type == E820_RAM) {
1924 - * We don't know which RAM region contains kernel data,
1925 - * so we try it repeatedly and let the resource manager
1927 + * We don't know which RAM region contains kernel data,
1928 + * so we try it repeatedly and let the resource manager
1932 - request_resource(res, &code_resource);
1933 - request_resource(res, &data_resource);
1934 - request_resource(res, &bss_resource);
1935 + request_resource(res, code_resource);
1936 + request_resource(res, data_resource);
1937 + request_resource(res, bss_resource);
1940 if (crashk_res.start != crashk_res.end)
1941 @@ -357,9 +378,9 @@ e820_register_active_regions(int nid, un
1942 add_active_range(nid, ei_startpfn, ei_endpfn);
1947 * Add a memory region to the kernel e820 map.
1950 void __init add_memory_region(unsigned long start, unsigned long size, int type)
1952 int x = e820.nr_map;
1953 @@ -384,9 +405,7 @@ unsigned long __init e820_hole_size(unsi
1955 unsigned long start_pfn = start >> PAGE_SHIFT;
1956 unsigned long end_pfn = end >> PAGE_SHIFT;
1957 - unsigned long ei_startpfn;
1958 - unsigned long ei_endpfn;
1959 - unsigned long ram = 0;
1960 + unsigned long ei_startpfn, ei_endpfn, ram = 0;
1963 for (i = 0; i < e820.nr_map; i++) {
1964 @@ -398,28 +417,31 @@ unsigned long __init e820_hole_size(unsi
1965 return end - start - (ram << PAGE_SHIFT);
1968 -void __init e820_print_map(char *who)
1969 +static void __init e820_print_map(char *who)
1973 for (i = 0; i < e820.nr_map; i++) {
1974 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1975 - (unsigned long long) e820.map[i].addr,
1976 - (unsigned long long) (e820.map[i].addr + e820.map[i].size));
1977 + (unsigned long long) e820.map[i].addr,
1978 + (unsigned long long)
1979 + (e820.map[i].addr + e820.map[i].size));
1980 switch (e820.map[i].type) {
1981 - case E820_RAM: printk("(usable)\n");
1984 + printk(KERN_CONT "(usable)\n");
1987 - printk("(reserved)\n");
1989 + printk(KERN_CONT "(reserved)\n");
1992 - printk("(ACPI data)\n");
1994 + printk(KERN_CONT "(ACPI data)\n");
1997 - printk("(ACPI NVS)\n");
1999 - default: printk("type %u\n", e820.map[i].type);
2001 + printk(KERN_CONT "(ACPI NVS)\n");
2004 + printk(KERN_CONT "type %u\n", e820.map[i].type);
2009 @@ -427,11 +449,11 @@ void __init e820_print_map(char *who)
2011 * Sanitize the BIOS e820 map.
2013 - * Some e820 responses include overlapping entries. The following
2014 + * Some e820 responses include overlapping entries. The following
2015 * replaces the original e820 map with a new one, removing overlaps.
2018 -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
2019 +static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
2021 struct change_member {
2022 struct e820entry *pbios; /* pointer to original bios entry */
2023 @@ -451,7 +473,8 @@ static int __init sanitize_e820_map(stru
2027 - Visually we're performing the following (1,2,3,4 = memory types)...
2028 + Visually we're performing the following
2029 + (1,2,3,4 = memory types)...
2031 Sample memory map (w/overlaps):
2032 ____22__________________
2033 @@ -493,22 +516,23 @@ static int __init sanitize_e820_map(stru
2036 /* bail out if we find any unreasonable addresses in bios map */
2037 - for (i=0; i<old_nr; i++)
2038 + for (i = 0; i < old_nr; i++)
2039 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
2042 /* create pointers for initial change-point information (for sorting) */
2043 - for (i=0; i < 2*old_nr; i++)
2044 + for (i = 0; i < 2 * old_nr; i++)
2045 change_point[i] = &change_point_list[i];
2047 /* record all known change-points (starting and ending addresses),
2048 omitting those that are for empty memory regions */
2050 - for (i=0; i < old_nr; i++) {
2051 + for (i = 0; i < old_nr; i++) {
2052 if (biosmap[i].size != 0) {
2053 change_point[chgidx]->addr = biosmap[i].addr;
2054 change_point[chgidx++]->pbios = &biosmap[i];
2055 - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
2056 + change_point[chgidx]->addr = biosmap[i].addr +
2058 change_point[chgidx++]->pbios = &biosmap[i];
2061 @@ -518,75 +542,106 @@ static int __init sanitize_e820_map(stru
2063 while (still_changing) {
2065 - for (i=1; i < chg_nr; i++) {
2066 - /* if <current_addr> > <last_addr>, swap */
2067 - /* or, if current=<start_addr> & last=<end_addr>, swap */
2068 - if ((change_point[i]->addr < change_point[i-1]->addr) ||
2069 - ((change_point[i]->addr == change_point[i-1]->addr) &&
2070 - (change_point[i]->addr == change_point[i]->pbios->addr) &&
2071 - (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
2074 + for (i = 1; i < chg_nr; i++) {
2075 + unsigned long long curaddr, lastaddr;
2076 + unsigned long long curpbaddr, lastpbaddr;
2078 + curaddr = change_point[i]->addr;
2079 + lastaddr = change_point[i - 1]->addr;
2080 + curpbaddr = change_point[i]->pbios->addr;
2081 + lastpbaddr = change_point[i - 1]->pbios->addr;
2084 + * swap entries, when:
2086 + * curaddr < lastaddr or
2087 + * curaddr == lastaddr and curaddr == curpbaddr and
2088 + * lastaddr != lastpbaddr
2090 + if (curaddr < lastaddr ||
2091 + (curaddr == lastaddr && curaddr == curpbaddr &&
2092 + lastaddr != lastpbaddr)) {
2093 change_tmp = change_point[i];
2094 change_point[i] = change_point[i-1];
2095 change_point[i-1] = change_tmp;
2097 + still_changing = 1;
2102 /* create a new bios memory map, removing overlaps */
2103 - overlap_entries=0; /* number of entries in the overlap table */
2104 - new_bios_entry=0; /* index for creating new bios map entries */
2105 + overlap_entries = 0; /* number of entries in the overlap table */
2106 + new_bios_entry = 0; /* index for creating new bios map entries */
2107 last_type = 0; /* start with undefined memory type */
2108 last_addr = 0; /* start with 0 as last starting address */
2110 /* loop through change-points, determining affect on the new bios map */
2111 - for (chgidx=0; chgidx < chg_nr; chgidx++)
2113 + for (chgidx = 0; chgidx < chg_nr; chgidx++) {
2114 /* keep track of all overlapping bios entries */
2115 - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
2117 - /* add map entry to overlap list (> 1 entry implies an overlap) */
2118 - overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
2122 - /* remove entry from list (order independent, so swap with last) */
2123 - for (i=0; i<overlap_entries; i++)
2125 - if (overlap_list[i] == change_point[chgidx]->pbios)
2126 - overlap_list[i] = overlap_list[overlap_entries-1];
2127 + if (change_point[chgidx]->addr ==
2128 + change_point[chgidx]->pbios->addr) {
2130 + * add map entry to overlap list (> 1 entry
2131 + * implies an overlap)
2133 + overlap_list[overlap_entries++] =
2134 + change_point[chgidx]->pbios;
2137 + * remove entry from list (order independent,
2138 + * so swap with last)
2140 + for (i = 0; i < overlap_entries; i++) {
2141 + if (overlap_list[i] ==
2142 + change_point[chgidx]->pbios)
2144 + overlap_list[overlap_entries-1];
2148 - /* if there are overlapping entries, decide which "type" to use */
2149 - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
2151 + * if there are overlapping entries, decide which
2152 + * "type" to use (larger value takes precedence --
2153 + * 1=usable, 2,3,4,4+=unusable)
2156 - for (i=0; i<overlap_entries; i++)
2157 + for (i = 0; i < overlap_entries; i++)
2158 if (overlap_list[i]->type > current_type)
2159 current_type = overlap_list[i]->type;
2160 - /* continue building up new bios map based on this information */
2162 + * continue building up new bios map based on this
2165 if (current_type != last_type) {
2166 if (last_type != 0) {
2167 new_bios[new_bios_entry].size =
2168 change_point[chgidx]->addr - last_addr;
2169 - /* move forward only if the new size was non-zero */
2171 + * move forward only if the new size
2174 if (new_bios[new_bios_entry].size != 0)
2176 + * no more space left for new
2179 if (++new_bios_entry >= E820MAX)
2180 - break; /* no more space left for new bios entries */
2183 if (current_type != 0) {
2184 - new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
2185 + new_bios[new_bios_entry].addr =
2186 + change_point[chgidx]->addr;
2187 new_bios[new_bios_entry].type = current_type;
2188 - last_addr=change_point[chgidx]->addr;
2189 + last_addr = change_point[chgidx]->addr;
2191 last_type = current_type;
2194 - new_nr = new_bios_entry; /* retain count for new bios entries */
2195 + /* retain count for new bios entries */
2196 + new_nr = new_bios_entry;
2198 /* copy new bios mapping into original location */
2199 - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
2200 + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
2204 @@ -601,7 +656,7 @@ static int __init sanitize_e820_map(stru
2205 * will have given us a memory map that we can use to properly
2206 * set up memory. If we aren't, we'll fake a memory map.
2208 -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
2209 +static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
2212 /* Only one memory region (or negative)? Ignore it */
2213 @@ -622,7 +677,7 @@ static int __init copy_e820_map(struct e
2216 add_memory_region(start, size, type);
2217 - } while (biosmap++,--nr_map);
2218 + } while (biosmap++, --nr_map);
2221 if (is_initial_xendomain()) {
2222 @@ -641,15 +696,17 @@ static int __init copy_e820_map(struct e
2226 -void early_panic(char *msg)
2227 +static void early_panic(char *msg)
2234 -void __init setup_memory_region(void)
2235 +/* We're not void only for x86 32-bit compat */
2236 +char * __init machine_specific_memory_setup(void)
2239 + char *who = "BIOS-e820";
2241 * Try to copy the BIOS-supplied E820-map.
2243 @@ -659,14 +716,8 @@ void __init setup_memory_region(void)
2244 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
2245 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
2246 early_panic("Cannot find a valid memory map");
2247 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2248 - e820_print_map("BIOS-e820");
2251 #else /* CONFIG_XEN */
2253 -void __init setup_memory_region(void)
2255 + char *who = "Xen";
2257 struct xen_memory_map memmap;
2259 @@ -694,11 +745,13 @@ void __init setup_memory_region(void)
2261 if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
2262 early_panic("Cannot find a valid memory map");
2265 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2266 - e820_print_map("Xen");
2267 + e820_print_map(who);
2269 + /* In case someone cares... */
2274 static int __init parse_memopt(char *p)
2276 @@ -709,7 +762,7 @@ static int __init parse_memopt(char *p)
2279 end_user_pfn = memparse(p, &p);
2280 - end_user_pfn >>= PAGE_SHIFT;
2281 + end_user_pfn >>= PAGE_SHIFT;
2283 end = end_user_pfn<<PAGE_SHIFT;
2285 @@ -727,7 +780,7 @@ static int __init parse_memopt(char *p)
2291 early_param("mem", parse_memopt);
2293 static int userdef __initdata;
2294 @@ -739,9 +792,9 @@ static int __init parse_memmap_opt(char
2296 if (!strcmp(p, "exactmap")) {
2297 #ifdef CONFIG_CRASH_DUMP
2298 - /* If we are doing a crash dump, we
2299 - * still need to know the real mem
2300 - * size before original memory map is
2302 + * If we are doing a crash dump, we still need to know
2303 + * the real mem size before original memory map is
2306 e820_register_active_regions(0, 0, -1UL);
2307 @@ -758,6 +811,8 @@ static int __init parse_memmap_opt(char
2308 mem_size = memparse(p, &p);
2314 start_at = memparse(p+1, &p);
2315 add_memory_region(start_at, mem_size, E820_RAM);
2316 @@ -777,11 +832,58 @@ early_param("memmap", parse_memmap_opt);
2317 void __init finish_e820_parsing(void)
2320 + char nr = e820.nr_map;
2322 + if (sanitize_e820_map(e820.map, &nr) < 0)
2323 + early_panic("Invalid user supplied memory map");
2326 printk(KERN_INFO "user-defined physical RAM map:\n");
2327 e820_print_map("user");
2332 +void __init update_memory_range(u64 start, u64 size, unsigned old_type,
2333 + unsigned new_type)
2337 + BUG_ON(old_type == new_type);
2339 + for (i = 0; i < e820.nr_map; i++) {
2340 + struct e820entry *ei = &e820.map[i];
2341 + u64 final_start, final_end;
2342 + if (ei->type != old_type)
2344 + /* totally covered? */
2345 + if (ei->addr >= start && ei->size <= size) {
2346 + ei->type = new_type;
2349 + /* partially covered */
2350 + final_start = max(start, ei->addr);
2351 + final_end = min(start + size, ei->addr + ei->size);
2352 + if (final_start >= final_end)
2354 + add_memory_region(final_start, final_end - final_start,
2359 +void __init update_e820(void)
2363 + nr_map = e820.nr_map;
2364 + if (sanitize_e820_map(e820.map, &nr_map))
2366 + e820.nr_map = nr_map;
2367 + printk(KERN_INFO "modified physical RAM map:\n");
2368 + e820_print_map("modified");
2372 unsigned long pci_mem_start = 0xaeedbabe;
2373 EXPORT_SYMBOL(pci_mem_start);
2375 @@ -825,8 +927,10 @@ __init void e820_setup_gap(struct e820en
2378 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
2379 - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
2380 - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
2381 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2383 + KERN_ERR "PCI: Unassigned devices with 32bit resource "
2384 + "registers may break!\n");
2388 @@ -839,8 +943,9 @@ __init void e820_setup_gap(struct e820en
2389 /* Fun with two's complement */
2390 pci_mem_start = (gapstart + round) & -round;
2392 - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2393 - pci_mem_start, gapstart, gapsize);
2395 + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2396 + pci_mem_start, gapstart, gapsize);
2399 int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
2400 --- a/arch/x86/kernel/early_printk-xen.c
2401 +++ b/arch/x86/kernel/early_printk-xen.c
2402 @@ -222,7 +222,7 @@ static struct console simnow_console = {
2405 /* Direct interface for emergencies */
2406 -struct console *early_console = &early_vga_console;
2407 +static struct console *early_console = &early_vga_console;
2408 static int early_console_initialized = 0;
2410 void early_printk(const char *fmt, ...)
2411 --- a/arch/x86/kernel/entry_32-xen.S
2412 +++ b/arch/x86/kernel/entry_32-xen.S
2414 * for paravirtualization. The following will never clobber any registers:
2415 * INTERRUPT_RETURN (aka. "iret")
2416 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
2417 - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
2418 + * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
2420 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
2421 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
2422 @@ -282,16 +282,21 @@ END(resume_kernel)
2426 + .macro test_tif ti_reg # system call tracing in operation / emulation
2427 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2428 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
2431 /* SYSENTER_RETURN points to after the "sysenter" instruction in
2432 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
2434 # sysenter call handler stub
2435 -ENTRY(sysenter_entry)
2436 +ENTRY(ia32_sysenter_target)
2437 CFI_STARTPROC simple
2440 CFI_REGISTER esp, ebp
2441 - movl SYSENTER_stack_esp0(%esp),%esp
2442 + movl SYSENTER_stack_sp0(%esp),%esp
2445 * No need to follow this irqs on/off section: the syscall
2446 @@ -334,9 +339,7 @@ sysenter_past_esp:
2447 CFI_ADJUST_CFA_OFFSET 4
2449 GET_THREAD_INFO(%ebp)
2451 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2452 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2454 jnz syscall_trace_entry
2455 cmpl $(nr_syscalls), %eax
2457 @@ -354,7 +357,7 @@ sysenter_past_esp:
2460 1: mov PT_FS(%esp), %fs
2461 - ENABLE_INTERRUPTS_SYSEXIT
2462 + ENABLE_INTERRUPTS_SYSCALL_RET
2464 .pushsection .fixup,"ax"
2465 2: movl $0,PT_FS(%esp)
2466 @@ -363,10 +366,10 @@ sysenter_past_esp:
2470 -ENDPROC(sysenter_entry)
2471 +ENDPROC(ia32_sysenter_target)
2473 # pv sysenter call handler stub
2474 -ENTRY(sysenter_entry_pv)
2475 +ENTRY(ia32pv_sysenter_target)
2477 movl $__USER_DS,16(%esp)
2479 @@ -389,7 +392,7 @@ ENTRY(sysenter_entry_pv)
2483 -ENDPROC(sysenter_entry_pv)
2484 +ENDPROC(ia32pv_sysenter_target)
2486 # system call handler stub
2488 @@ -398,9 +401,7 @@ ENTRY(system_call)
2489 CFI_ADJUST_CFA_OFFSET 4
2491 GET_THREAD_INFO(%ebp)
2492 - # system call tracing in operation / emulation
2493 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2494 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2496 jnz syscall_trace_entry
2497 cmpl $(nr_syscalls), %eax
2499 @@ -452,7 +453,8 @@ restore_nocheck_notrace:
2501 addl $4, %esp # skip orig_eax/error_code
2502 CFI_ADJUST_CFA_OFFSET -4
2503 -1: INTERRUPT_RETURN
2506 .section .fixup,"ax"
2508 pushl $0 # no error code
2509 @@ -461,7 +463,7 @@ iret_exc:
2511 .section __ex_table,"a"
2514 + .long irq_return,iret_exc
2518 @@ -657,7 +659,7 @@ END(syscall_badsys)
2519 * Build the entry stubs and pointer table with
2520 * some assembler magic.
2523 +.section .rodata,"a"
2527 @@ -959,7 +961,7 @@ END(device_not_available)
2528 * that sets up the real kernel stack. Check here, since we can't
2529 * allow the wrong stack to be used.
2531 - * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
2532 + * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
2533 * already pushed 3 words if it hits on the sysenter instruction:
2534 * eflags, cs and eip.
2536 @@ -971,7 +973,7 @@ END(device_not_available)
2537 cmpw $__KERNEL_CS,4(%esp); \
2540 - movl SYSENTER_stack_esp0+offset(%esp),%esp; \
2541 + movl SYSENTER_stack_sp0+offset(%esp),%esp; \
2542 CFI_DEF_CFA esp, 0; \
2543 CFI_UNDEFINED eip; \
2545 @@ -986,7 +988,7 @@ label: \
2549 - cmpl $sysenter_entry,(%esp)
2550 + cmpl $ia32_sysenter_target,(%esp)
2551 jne debug_stack_correct
2552 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
2553 debug_stack_correct:
2554 @@ -1019,7 +1021,7 @@ KPROBE_ENTRY(nmi)
2556 CFI_ADJUST_CFA_OFFSET -4
2558 - cmpl $sysenter_entry,(%esp)
2559 + cmpl $ia32_sysenter_target,(%esp)
2562 CFI_ADJUST_CFA_OFFSET 4
2563 @@ -1032,7 +1034,7 @@ KPROBE_ENTRY(nmi)
2565 CFI_ADJUST_CFA_OFFSET -4
2566 jae nmi_stack_correct
2567 - cmpl $sysenter_entry,12(%esp)
2568 + cmpl $ia32_sysenter_target,12(%esp)
2569 je nmi_debug_stack_check
2571 /* We have a RING0_INT_FRAME here */
2572 @@ -1085,12 +1087,8 @@ nmi_espfix_stack:
2574 lss 12+4(%esp), %esp # back to espfix stack
2575 CFI_ADJUST_CFA_OFFSET -24
2576 -1: INTERRUPT_RETURN
2579 -.section __ex_table,"a"
2586 @@ -1108,17 +1106,17 @@ KPROBE_END(nmi)
2588 #ifdef CONFIG_PARAVIRT
2592 .section __ex_table,"a"
2595 + .long native_iret, iret_exc
2599 -ENTRY(native_irq_enable_sysexit)
2600 +ENTRY(native_irq_enable_syscall_ret)
2603 -END(native_irq_enable_sysexit)
2604 +END(native_irq_enable_syscall_ret)
2608 @@ -1267,7 +1265,144 @@ ENTRY(kernel_thread_helper)
2610 ENDPROC(kernel_thread_helper)
2612 +#include <asm/alternative-asm.h>
2614 + # pv syscall call handler stub
2615 +ENTRY(ia32pv_cstar_target)
2617 + movl $__USER_DS,16(%esp)
2619 + movl $__USER_CS,4(%esp)
2620 + movl 12(%esp),%ebp
2621 + pushl %eax # save orig_eax
2622 + CFI_ADJUST_CFA_OFFSET 4
2624 + * Load the potential sixth argument from user stack.
2625 + * Careful about security.
2627 + cmpl $__PAGE_OFFSET-4,%ebp
2628 + CFI_REMEMBER_STATE
2630 +1: movl (%ebp),%ebp
2631 +.section __ex_table,"a"
2633 + .long 1b,cstar_fault
2636 + GET_THREAD_INFO(%ebp)
2638 + jnz cstar_trace_entry
2639 + cmpl $nr_syscalls,%eax
2642 + btl %eax,cstar_special
2643 + jc .Lcstar_special
2644 + call *cstar_call_table(,%eax,4)
2645 + movl %eax,PT_EAX(%esp) # store the return value
2647 + movl PT_ECX(%esp),%ecx
2648 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2651 + movl PT_ECX(%esp),%ecx
2652 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2655 + movl $cstar_clear_tif,(%esp) # replace return address
2657 + orl $_TIF_CSTAR,TI_flags(%ebp)
2658 + jmp *sys_call_table(,%eax,4)
2660 + movl %eax,PT_EAX(%esp) # store the return value
2662 + andl $~_TIF_CSTAR,TI_flags(%ebp)
2665 + movl $-ENOSYS,PT_EAX(%esp)
2666 + cmpl $nr_syscalls,%eax
2668 + btl %eax,cstar_special
2669 + jc .Lcstar_trace_special
2673 + orl $_TIF_CSTAR,TI_flags(%ebp)
2674 + call do_syscall_trace
2676 + andl $~_TIF_CSTAR,TI_flags(%ebp)
2678 + jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
2679 + # so must skip actual syscall
2680 + movl PT_ORIG_EAX(%esp),%eax
2681 + cmpl $nr_syscalls,%eax
2684 +.Lcstar_trace_special:
2685 + movl PT_ECX(%esp),%ecx
2688 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2689 + call do_syscall_trace
2691 + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2692 + # so must skip actual syscall
2693 + movl PT_ORIG_EAX(%esp),%eax
2694 + cmpl $nr_syscalls,%eax
2698 + movl $-ENOSYS,PT_EAX(%esp)
2700 + movl PT_ECX(%esp),%ecx
2701 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2702 + jmp resume_userspace
2705 + movl $-EFAULT,%eax
2707 + GET_THREAD_INFO(%ebp)
2708 + jmp .Lcstar_resume
2710 +ENDPROC(ia32pv_cstar_target)
2712 +ENTRY(cstar_ret_from_fork)
2714 + movl PT_ECX(%esp),%ecx
2715 + GET_THREAD_INFO(%ebp)
2716 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2718 + andl $~_TIF_CSTAR,TI_flags(%ebp)
2723 .section .rodata,"a"
2724 #include "syscall_table_32.S"
2726 syscall_table_size=(.-sys_call_table)
2728 +#include <asm/unistd.h>
2732 +.rept nr_syscalls+31
2733 + .irp n, __NR_sigreturn, __NR_rt_sigreturn
2735 + mask = mask | (1 << (\n & 31))
2739 + .if (nr & 31) == 0
2744 +#define sys_call_table cstar_call_table
2745 +#define sys_fork cstar_set_tif
2746 +#define sys_clone cstar_set_tif
2747 +#define sys_vfork cstar_set_tif
2748 +#include "syscall_table_32.S"
2749 +#undef sys_call_table
2753 --- a/arch/x86/kernel/entry_64-xen.S
2754 +++ b/arch/x86/kernel/entry_64-xen.S
2756 #include <asm/page.h>
2757 #include <asm/irqflags.h>
2758 #include <asm/errno.h>
2759 -#include <xen/interface/arch-x86_64.h>
2760 +#include <xen/interface/xen.h>
2761 #include <xen/interface/features.h>
2763 -#include "xen_entry_64.S"
2767 #ifndef CONFIG_PREEMPT
2768 #define retint_kernel retint_restore_args
2771 +#ifdef CONFIG_PARAVIRT
2772 +ENTRY(native_irq_enable_syscall_ret)
2773 + movq %gs:pda_oldrsp,%rsp
2776 +#endif /* CONFIG_PARAVIRT */
2779 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
2780 #ifdef CONFIG_TRACE_IRQFLAGS
2781 @@ -277,7 +282,7 @@ ret_from_sys_call:
2784 GET_THREAD_INFO(%rcx)
2785 - XEN_BLOCK_EVENTS(%rsi)
2786 + DISABLE_INTERRUPTS(CLBR_NONE)
2788 movl threadinfo_flags(%rcx),%edx
2790 @@ -287,7 +292,7 @@ sysret_check:
2791 * sysretq will re-enable interrupts:
2794 - XEN_UNBLOCK_EVENTS(%rsi)
2795 + ENABLE_INTERRUPTS(CLBR_NONE)
2797 HYPERVISOR_IRET VGCF_IN_SYSCALL
2799 @@ -298,7 +303,7 @@ sysret_careful:
2800 bt $TIF_NEED_RESCHED,%edx
2803 - XEN_UNBLOCK_EVENTS(%rsi)
2804 + ENABLE_INTERRUPTS(CLBR_NONE)
2806 CFI_ADJUST_CFA_OFFSET 8
2808 @@ -309,9 +314,8 @@ sysret_careful:
2809 /* Handle a signal */
2813 - XEN_UNBLOCK_EVENTS(%rsi)
2814 - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2815 + ENABLE_INTERRUPTS(CLBR_NONE)
2816 + testl $_TIF_DO_NOTIFY_MASK,%edx
2819 /* Really a signal */
2820 @@ -323,7 +327,7 @@ sysret_signal:
2821 1: movl $_TIF_NEED_RESCHED,%edi
2822 /* Use IRET because user could have changed frame. This
2823 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
2824 - XEN_BLOCK_EVENTS(%rsi)
2825 + DISABLE_INTERRUPTS(CLBR_NONE)
2829 @@ -355,7 +359,7 @@ tracesys:
2831 .globl int_ret_from_sys_call
2832 int_ret_from_sys_call:
2833 - XEN_BLOCK_EVENTS(%rsi)
2834 + DISABLE_INTERRUPTS(CLBR_NONE)
2836 testb $3,CS-ARGOFFSET(%rsp)
2838 @@ -381,22 +385,20 @@ int_careful:
2839 bt $TIF_NEED_RESCHED,%edx
2840 jnc int_very_careful
2843 - XEN_UNBLOCK_EVENTS(%rsi)
2844 + ENABLE_INTERRUPTS(CLBR_NONE)
2846 CFI_ADJUST_CFA_OFFSET 8
2849 CFI_ADJUST_CFA_OFFSET -8
2850 - XEN_BLOCK_EVENTS(%rsi)
2851 + DISABLE_INTERRUPTS(CLBR_NONE)
2855 /* handle signals and tracing -- both require a full stack frame */
2859 - XEN_UNBLOCK_EVENTS(%rsi)
2860 + ENABLE_INTERRUPTS(CLBR_NONE)
2862 /* Check for syscall exit trace */
2863 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
2864 @@ -411,7 +413,7 @@ int_very_careful:
2865 jmp int_restore_rest
2868 - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2869 + testl $_TIF_DO_NOTIFY_MASK,%edx
2871 movq %rsp,%rdi # &ptregs -> arg1
2872 xorl %esi,%esi # oldset -> arg2
2873 @@ -419,7 +421,7 @@ int_signal:
2874 1: movl $_TIF_NEED_RESCHED,%edi
2877 - XEN_BLOCK_EVENTS(%rsi)
2878 + DISABLE_INTERRUPTS(CLBR_NONE)
2882 @@ -474,6 +476,7 @@ ENTRY(stub_execve)
2883 CFI_REGISTER rip, r11
2885 FIXUP_TOP_OF_STACK %r11
2888 RESTORE_TOP_OF_STACK %r11
2890 @@ -526,11 +529,10 @@ retint_check:
2891 retint_restore_args: /* return to kernel space */
2892 movl EFLAGS-REST_SKIP(%rsp), %eax
2893 shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
2894 - XEN_GET_VCPU_INFO(%rsi)
2896 andb evtchn_upcall_mask(%rsi),%al
2897 andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
2898 jnz restore_all_enable_events # != 0 => enable event delivery
2899 - XEN_PUT_VCPU_INFO(%rsi)
2903 @@ -541,31 +543,29 @@ retint_careful:
2904 bt $TIF_NEED_RESCHED,%edx
2907 - XEN_UNBLOCK_EVENTS(%rsi)
2909 + ENABLE_INTERRUPTS(CLBR_NONE)
2911 CFI_ADJUST_CFA_OFFSET 8
2914 CFI_ADJUST_CFA_OFFSET -8
2915 GET_THREAD_INFO(%rcx)
2916 - XEN_BLOCK_EVENTS(%rsi)
2918 + DISABLE_INTERRUPTS(CLBR_NONE)
2923 - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2924 + testl $_TIF_DO_NOTIFY_MASK,%edx
2925 jz retint_restore_args
2927 - XEN_UNBLOCK_EVENTS(%rsi)
2928 + ENABLE_INTERRUPTS(CLBR_NONE)
2930 movq $-1,ORIG_RAX(%rsp)
2931 xorl %esi,%esi # oldset
2932 movq %rsp,%rdi # &pt_regs
2933 call do_notify_resume
2935 - XEN_BLOCK_EVENTS(%rsi)
2936 + DISABLE_INTERRUPTS(CLBR_NONE)
2938 movl $_TIF_NEED_RESCHED,%edi
2939 GET_THREAD_INFO(%rcx)
2940 @@ -702,7 +702,7 @@ END(spurious_interrupt)
2949 @@ -719,8 +719,7 @@ END(spurious_interrupt)
2951 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
2954 - XEN_BLOCK_EVENTS(%rsi)
2955 + DISABLE_INTERRUPTS(CLBR_NONE)
2959 @@ -749,10 +748,10 @@ paranoid_swapgs\trace:
2964 + SWAPGS_UNSAFE_STACK
2965 paranoid_restore\trace:
2969 paranoid_userspace\trace:
2970 GET_THREAD_INFO(%rcx)
2971 movl threadinfo_flags(%rcx),%ebx
2972 @@ -767,11 +766,11 @@ paranoid_userspace\trace:
2977 + ENABLE_INTERRUPTS(CLBR_NONE)
2978 xorl %esi,%esi /* arg2: oldset */
2979 movq %rsp,%rdi /* arg1: &pt_regs */
2980 call do_notify_resume
2982 + DISABLE_INTERRUPTS(CLBR_NONE)
2986 @@ -780,9 +779,9 @@ paranoid_schedule\trace:
2991 + ENABLE_INTERRUPTS(CLBR_ANY)
2994 + DISABLE_INTERRUPTS(CLBR_ANY)
2998 @@ -846,8 +845,7 @@ error_call_handler:
3003 - XEN_BLOCK_EVENTS(%rsi)
3004 + DISABLE_INTERRUPTS(CLBR_NONE)
3006 GET_THREAD_INFO(%rcx)
3007 testb $3,CS-ARGOFFSET(%rsp)
3008 @@ -875,7 +873,7 @@ error_kernelspace:
3009 iret run with kernel gs again, so don't set the user space flag.
3010 B stepping K8s sometimes report an truncated RIP for IRET
3011 exceptions returning to compat mode. Check for these here too. */
3012 - leaq iret_label(%rip),%rbp
3013 + leaq irq_return(%rip),%rbp
3016 movl %ebp,%ebp /* zero extend */
3017 @@ -930,19 +928,17 @@ END(do_hypervisor_callback)
3018 restore_all_enable_events:
3019 CFI_DEFAULT_STACK adj=1
3021 - XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
3022 + __ENABLE_INTERRUPTS
3024 scrit: /**** START OF CRITICAL REGION ****/
3025 - XEN_TEST_PENDING(%rsi)
3028 jnz 14f # process more events if necessary...
3029 - XEN_PUT_VCPU_INFO(%rsi)
3034 -14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
3035 - XEN_PUT_VCPU_INFO(%rsi)
3036 +14: __DISABLE_INTERRUPTS
3038 movq %rsp,%rdi # set the argument again
3040 @@ -1086,15 +1082,16 @@ ENDPROC(child_rip)
3041 * rdi: name, rsi: argv, rdx: envp
3043 * We want to fallback into:
3044 - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
3045 + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
3047 * do_sys_execve asm fallback arguments:
3048 - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
3049 + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
3051 ENTRY(kernel_execve)
3057 movq %rax, RAX(%rsp)
3059 @@ -1144,7 +1141,7 @@ do_nmi_callback:
3061 orl $NMI_MASK,EFLAGS(%rsp)
3063 - XEN_BLOCK_EVENTS(%rsi)
3064 + DISABLE_INTERRUPTS(CLBR_NONE)
3066 GET_THREAD_INFO(%rcx)
3067 jmp retint_restore_args
3068 --- a/arch/x86/kernel/fixup.c
3069 +++ b/arch/x86/kernel/fixup.c
3072 #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
3074 -fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
3075 +void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
3077 static unsigned long printed = 0;
3079 --- a/arch/x86/kernel/genapic_64-xen.c
3080 +++ b/arch/x86/kernel/genapic_64-xen.c
3082 #include <acpi/acpi_bus.h>
3086 - * which logical CPU number maps to which CPU (physical APIC ID)
3088 - * The following static array is used during kernel startup
3089 - * and the x86_cpu_to_apicid_ptr contains the address of the
3090 - * array during this time. Is it zeroed when the per_cpu
3091 - * data area is removed.
3093 +/* which logical CPU number maps to which CPU (physical APIC ID) */
3095 -u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
3096 +u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
3097 = { [0 ... NR_CPUS-1] = BAD_APICID };
3098 -void *x86_cpu_to_apicid_ptr;
3099 +void *x86_cpu_to_apicid_early_ptr;
3101 -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
3102 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
3103 EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
3106 --- a/arch/x86/kernel/head_32-xen.S
3107 +++ b/arch/x86/kernel/head_32-xen.S
3110 #include <linux/elfnote.h>
3111 #include <linux/threads.h>
3112 +#include <linux/init.h>
3113 #include <linux/linkage.h>
3114 #include <asm/segment.h>
3115 #include <asm/page.h>
3116 @@ -88,7 +89,7 @@ ENTRY(_stext)
3118 .section ".bss.page_aligned","wa"
3119 .align PAGE_SIZE_asm
3120 -ENTRY(swapper_pg_pmd)
3121 +ENTRY(swapper_pg_fixmap)
3123 ENTRY(empty_zero_page)
3125 --- a/arch/x86/kernel/head64-xen.c
3126 +++ b/arch/x86/kernel/head64-xen.c
3128 #include <linux/kernel.h>
3129 #include <linux/string.h>
3130 #include <linux/percpu.h>
3131 +#include <linux/start_kernel.h>
3132 #include <linux/module.h>
3134 #include <asm/processor.h>
3136 #include <asm/pgtable.h>
3137 #include <asm/tlbflush.h>
3138 #include <asm/sections.h>
3139 +#include <asm/kdebug.h>
3140 +#include <asm/e820.h>
3142 unsigned long start_pfn;
3144 @@ -34,7 +37,7 @@ static void __init zap_identity_mappings
3146 pgd_t *pgd = pgd_offset_k(0UL);
3149 + __flush_tlb_all();
3152 /* Don't add a printk in there. printk relies on the PDA which is not initialized
3153 @@ -72,6 +75,37 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
3154 unsigned int machine_to_phys_order;
3155 EXPORT_SYMBOL(machine_to_phys_order);
3157 +#define EBDA_ADDR_POINTER 0x40E
3159 +static __init void reserve_ebda(void)
3162 + unsigned ebda_addr, ebda_size;
3165 + * there is a real-mode segmented pointer pointing to the
3166 + * 4K EBDA area at 0x40E
3168 + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
3174 + ebda_size = *(unsigned short *)__va(ebda_addr);
3176 + /* Round EBDA up to pages */
3177 + if (ebda_size == 0)
3180 + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
3181 + if (ebda_size > 64*1024)
3182 + ebda_size = 64*1024;
3184 + reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
3188 void __init x86_64_start_kernel(char * real_mode_data)
3190 struct xen_machphys_mapping mapping;
3191 @@ -103,8 +137,16 @@ void __init x86_64_start_kernel(char * r
3192 /* Make NULL pointers segfault */
3193 zap_identity_mappings();
3195 - for (i = 0; i < IDT_ENTRIES; i++)
3196 + /* Cleanup the over mapped high alias */
3197 + cleanup_highmap();
3199 + for (i = 0; i < IDT_ENTRIES; i++) {
3200 +#ifdef CONFIG_EARLY_PRINTK
3201 + set_intr_gate(i, &early_idt_handlers[i]);
3203 set_intr_gate(i, early_idt_handler);
3206 load_idt((const struct desc_ptr *)&idt_descr);
3209 @@ -115,8 +157,19 @@ void __init x86_64_start_kernel(char * r
3212 copy_bootdata(__va(real_mode_data));
3214 - cpu_set(0, cpu_online_map);
3217 + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
3219 + reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
3220 + start_pfn << PAGE_SHIFT, "Xen provided");
3225 + * At this point everything still needed from the boot loader
3226 + * or BIOS or kernel text should be early reserved or marked not
3227 + * RAM in e820. All other memory is free game.
3232 --- a/arch/x86/kernel/init_task-xen.c
3233 +++ b/arch/x86/kernel/init_task-xen.c
3234 @@ -19,7 +19,7 @@ static struct sighand_struct init_sighan
3236 struct mm_struct init_mm = INIT_MM(init_mm);
3237 #undef swapper_pg_dir
3238 -EXPORT_SYMBOL(init_mm);
3239 +EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
3242 * Initial thread structure.
3243 --- a/arch/x86/kernel/io_apic_32-xen.c
3244 +++ b/arch/x86/kernel/io_apic_32-xen.c
3246 #include <linux/htirq.h>
3247 #include <linux/freezer.h>
3248 #include <linux/kthread.h>
3249 +#include <linux/jiffies.h> /* time_after() */
3252 #include <asm/smp.h>
3254 #include <mach_apic.h>
3255 #include <mach_apicdef.h>
3257 -#include "io_ports.h"
3260 #include <xen/interface/xen.h>
3261 #include <xen/interface/physdev.h>
3262 @@ -400,7 +399,7 @@ static void set_ioapic_affinity_irq(unsi
3263 # include <asm/processor.h> /* kernel_thread() */
3264 # include <linux/kernel_stat.h> /* kstat */
3265 # include <linux/slab.h> /* kmalloc() */
3266 -# include <linux/timer.h> /* time_after() */
3267 +# include <linux/timer.h>
3269 #define IRQBALANCE_CHECK_ARCH -999
3270 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
3271 @@ -777,7 +776,7 @@ late_initcall(balanced_irq_init);
3275 -void fastcall send_IPI_self(int vector)
3276 +void send_IPI_self(int vector)
3280 @@ -1959,7 +1958,7 @@ static int __init timer_irq_works(void)
3281 * might have cached one ExtINT interrupt. Finally, at
3282 * least one tick may be lost due to delays.
3284 - if (jiffies - t1 > 4)
3285 + if (time_after(jiffies, t1 + 4))
3289 @@ -2142,7 +2141,7 @@ static struct irq_chip lapic_chip __read
3293 -static void setup_nmi (void)
3294 +static void __init setup_nmi(void)
3297 * Dirty trick to enable the NMI watchdog ...
3298 @@ -2155,7 +2154,7 @@ static void setup_nmi (void)
3300 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
3302 - on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
3303 + enable_NMI_through_LVT0();
3305 apic_printk(APIC_VERBOSE, " done.\n");
3307 @@ -2479,7 +2478,7 @@ static int ioapic_resume(struct sys_devi
3310 static struct sysdev_class ioapic_sysdev_class = {
3311 - set_kset_name("ioapic"),
3313 .suspend = ioapic_suspend,
3314 .resume = ioapic_resume,
3316 --- a/arch/x86/kernel/io_apic_64-xen.c
3317 +++ b/arch/x86/kernel/io_apic_64-xen.c
3319 #include <linux/msi.h>
3320 #include <linux/htirq.h>
3321 #include <linux/dmar.h>
3322 +#include <linux/jiffies.h>
3324 #include <acpi/acpi_bus.h>
3326 +#include <linux/bootmem.h>
3328 #include <asm/idle.h>
3330 @@ -1064,7 +1066,7 @@ void __apicdebuginit print_local_APIC(vo
3331 v = apic_read(APIC_LVR);
3332 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
3333 ver = GET_APIC_VERSION(v);
3334 - maxlvt = get_maxlvt();
3335 + maxlvt = lapic_get_maxlvt();
3337 v = apic_read(APIC_TASKPRI);
3338 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
3339 @@ -1165,7 +1167,7 @@ void __apicdebuginit print_PIC(void)
3341 #endif /* !CONFIG_XEN */
3343 -static void __init enable_IO_APIC(void)
3344 +void __init enable_IO_APIC(void)
3346 union IO_APIC_reg_01 reg_01;
3348 @@ -1299,7 +1301,7 @@ static int __init timer_irq_works(void)
3352 - if (jiffies - t1 > 4)
3353 + if (time_after(jiffies, t1 + 4))
3357 @@ -1412,7 +1414,7 @@ static void irq_complete_move(unsigned i
3358 if (likely(!cfg->move_in_progress))
3361 - vector = ~get_irq_regs()->orig_rax;
3362 + vector = ~get_irq_regs()->orig_ax;
3363 me = smp_processor_id();
3364 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
3365 cpumask_t cleanup_mask;
3366 @@ -1439,7 +1441,7 @@ static void ack_apic_level(unsigned int
3367 int do_unmask_irq = 0;
3369 irq_complete_move(irq);
3370 -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
3371 +#ifdef CONFIG_GENERIC_PENDING_IRQ
3372 /* If we are moving the irq we need to mask it */
3373 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
3375 @@ -1570,7 +1572,7 @@ static struct hw_interrupt_type lapic_ir
3376 .end = end_lapic_irq,
3379 -static void setup_nmi (void)
3380 +static void __init setup_nmi(void)
3383 * Dirty trick to enable the NMI watchdog ...
3384 @@ -1583,7 +1585,7 @@ static void setup_nmi (void)
3386 printk(KERN_INFO "activating NMI Watchdog ...");
3388 - enable_NMI_through_LVT0(NULL);
3389 + enable_NMI_through_LVT0();
3393 @@ -1659,7 +1661,7 @@ static inline void unlock_ExtINT_logic(v
3395 * FIXME: really need to revamp this for modern platforms only.
3397 -static inline void check_timer(void)
3398 +static inline void __init check_timer(void)
3400 struct irq_cfg *cfg = irq_cfg + 0;
3401 int apic1, pin1, apic2, pin2;
3402 @@ -1863,7 +1865,7 @@ static int ioapic_resume(struct sys_devi
3405 static struct sysdev_class ioapic_sysdev_class = {
3406 - set_kset_name("ioapic"),
3408 .suspend = ioapic_suspend,
3409 .resume = ioapic_resume,
3411 @@ -2303,5 +2305,93 @@ void __init setup_ioapic_dest(void)
3415 -#endif /* !CONFIG_XEN */
3417 +#define IOAPIC_RESOURCE_NAME_SIZE 11
3419 +static struct resource *ioapic_resources;
3421 +static struct resource * __init ioapic_setup_resources(void)
3424 + struct resource *res;
3428 + if (nr_ioapics <= 0)
3431 + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
3434 + mem = alloc_bootmem(n);
3435 + res = (void *)mem;
3437 + if (mem != NULL) {
3438 + memset(mem, 0, n);
3439 + mem += sizeof(struct resource) * nr_ioapics;
3441 + for (i = 0; i < nr_ioapics; i++) {
3442 + res[i].name = mem;
3443 + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3444 + sprintf(mem, "IOAPIC %u", i);
3445 + mem += IOAPIC_RESOURCE_NAME_SIZE;
3449 + ioapic_resources = res;
3454 +void __init ioapic_init_mappings(void)
3456 + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
3457 + struct resource *ioapic_res;
3460 + ioapic_res = ioapic_setup_resources();
3461 + for (i = 0; i < nr_ioapics; i++) {
3462 + if (smp_found_config) {
3463 + ioapic_phys = mp_ioapics[i].mpc_apicaddr;
3465 + ioapic_phys = (unsigned long)
3466 + alloc_bootmem_pages(PAGE_SIZE);
3467 + ioapic_phys = __pa(ioapic_phys);
3469 + set_fixmap_nocache(idx, ioapic_phys);
3470 + apic_printk(APIC_VERBOSE,
3471 + "mapped IOAPIC to %016lx (%016lx)\n",
3472 + __fix_to_virt(idx), ioapic_phys);
3475 + if (ioapic_res != NULL) {
3476 + ioapic_res->start = ioapic_phys;
3477 + ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
3483 +static int __init ioapic_insert_resources(void)
3486 + struct resource *r = ioapic_resources;
3490 + "IO APIC resources could be not be allocated.\n");
3494 + for (i = 0; i < nr_ioapics; i++) {
3495 + insert_resource(&iomem_resource, r);
3502 +/* Insert the IO APIC resources after PCI initialization has occurred to handle
3503 + * IO APICS that are mapped in on a BAR in PCI space. */
3504 +late_initcall(ioapic_insert_resources);
3505 +#endif /* !CONFIG_XEN */
3506 --- a/arch/x86/kernel/ioport_32-xen.c
3510 - * This contains the io-permission bitmap code - written by obz, with changes
3514 -#include <linux/sched.h>
3515 -#include <linux/kernel.h>
3516 -#include <linux/capability.h>
3517 -#include <linux/errno.h>
3518 -#include <linux/types.h>
3519 -#include <linux/ioport.h>
3520 -#include <linux/smp.h>
3521 -#include <linux/stddef.h>
3522 -#include <linux/slab.h>
3523 -#include <linux/thread_info.h>
3524 -#include <linux/syscalls.h>
3525 -#include <xen/interface/physdev.h>
3527 -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3528 -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3530 - unsigned long mask;
3531 - unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
3532 - unsigned int low_index = base & (BITS_PER_LONG-1);
3533 - int length = low_index + extent;
3535 - if (low_index != 0) {
3536 - mask = (~0UL << low_index);
3537 - if (length < BITS_PER_LONG)
3538 - mask &= ~(~0UL << length);
3540 - *bitmap_base++ |= mask;
3542 - *bitmap_base++ &= ~mask;
3543 - length -= BITS_PER_LONG;
3546 - mask = (new_value ? ~0UL : 0UL);
3547 - while (length >= BITS_PER_LONG) {
3548 - *bitmap_base++ = mask;
3549 - length -= BITS_PER_LONG;
3553 - mask = ~(~0UL << length);
3555 - *bitmap_base++ |= mask;
3557 - *bitmap_base++ &= ~mask;
3563 - * this changes the io permissions bitmap in the current task.
3565 -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3567 - struct thread_struct * t = &current->thread;
3568 - unsigned long *bitmap;
3569 - struct physdev_set_iobitmap set_iobitmap;
3571 - if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3573 - if (turn_on && !capable(CAP_SYS_RAWIO))
3577 - * If it's the first ioperm() call in this thread's lifetime, set the
3578 - * IO bitmap up. ioperm() is much less timing critical than clone(),
3579 - * this is why we delay this operation until now:
3581 - if (!t->io_bitmap_ptr) {
3582 - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3586 - memset(bitmap, 0xff, IO_BITMAP_BYTES);
3587 - t->io_bitmap_ptr = bitmap;
3588 - set_thread_flag(TIF_IO_BITMAP);
3590 - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3591 - set_iobitmap.nr_ports = IO_BITMAP_BITS;
3592 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3596 - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3602 - * sys_iopl has to be used when you want to access the IO ports
3603 - * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3604 - * you'd need 8kB of bitmaps/process, which is a bit excessive.
3606 - * Here we just change the eflags value on the stack: we allow
3607 - * only the super-user to do it. This depends on the stack-layout
3608 - * on system-call entry - see also fork() and the signal handling
3612 -asmlinkage long sys_iopl(unsigned long unused)
3614 - volatile struct pt_regs * regs = (struct pt_regs *) &unused;
3615 - unsigned int level = regs->ebx;
3616 - struct thread_struct *t = &current->thread;
3617 - unsigned int old = (t->iopl >> 12) & 3;
3621 - /* Trying to gain more privileges? */
3622 - if (level > old) {
3623 - if (!capable(CAP_SYS_RAWIO))
3626 - t->iopl = level << 12;
3627 - set_iopl_mask(t->iopl);
3630 --- a/arch/x86/kernel/ioport_64-xen.c
3634 - * This contains the io-permission bitmap code - written by obz, with changes
3638 -#include <linux/sched.h>
3639 -#include <linux/kernel.h>
3640 -#include <linux/capability.h>
3641 -#include <linux/errno.h>
3642 -#include <linux/types.h>
3643 -#include <linux/ioport.h>
3644 -#include <linux/mm.h>
3645 -#include <linux/smp.h>
3646 -#include <linux/stddef.h>
3647 -#include <linux/slab.h>
3648 -#include <linux/thread_info.h>
3649 -#include <linux/syscalls.h>
3650 -#include <xen/interface/physdev.h>
3652 -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3653 -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3658 - for (i = base; i < base + extent; i++)
3659 - __set_bit(i, bitmap);
3661 - for (i = base; i < base + extent; i++)
3662 - clear_bit(i, bitmap);
3666 - * this changes the io permissions bitmap in the current task.
3668 -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3670 - struct thread_struct * t = &current->thread;
3671 - unsigned long *bitmap;
3672 - struct physdev_set_iobitmap set_iobitmap;
3674 - if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3676 - if (turn_on && !capable(CAP_SYS_RAWIO))
3680 - * If it's the first ioperm() call in this thread's lifetime, set the
3681 - * IO bitmap up. ioperm() is much less timing critical than clone(),
3682 - * this is why we delay this operation until now:
3684 - if (!t->io_bitmap_ptr) {
3685 - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3689 - memset(bitmap, 0xff, IO_BITMAP_BYTES);
3690 - t->io_bitmap_ptr = bitmap;
3691 - set_thread_flag(TIF_IO_BITMAP);
3693 - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3694 - set_iobitmap.nr_ports = IO_BITMAP_BITS;
3695 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3699 - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3705 - * sys_iopl has to be used when you want to access the IO ports
3706 - * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3707 - * you'd need 8kB of bitmaps/process, which is a bit excessive.
3711 -asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
3713 - unsigned int old_iopl = current->thread.iopl;
3714 - struct physdev_set_iopl set_iopl;
3719 - /* Need "raw I/O" privileges for direct port access. */
3720 - if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
3723 - /* Change our version of the privilege levels. */
3724 - current->thread.iopl = new_iopl;
3726 - /* Force the change at ring 0. */
3727 - set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
3728 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
3733 +++ b/arch/x86/kernel/ioport-xen.c
3736 + * This contains the io-permission bitmap code - written by obz, with changes
3737 + * by Linus. 32/64 bits code unification by Miguel Botón.
3740 +#include <linux/sched.h>
3741 +#include <linux/kernel.h>
3742 +#include <linux/capability.h>
3743 +#include <linux/errno.h>
3744 +#include <linux/types.h>
3745 +#include <linux/ioport.h>
3746 +#include <linux/smp.h>
3747 +#include <linux/stddef.h>
3748 +#include <linux/slab.h>
3749 +#include <linux/thread_info.h>
3750 +#include <linux/syscalls.h>
3751 +#include <xen/interface/physdev.h>
3753 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3754 +static void set_bitmap(unsigned long *bitmap, unsigned int base,
3755 + unsigned int extent, int new_value)
3759 + for (i = base; i < base + extent; i++) {
3761 + __set_bit(i, bitmap);
3763 + __clear_bit(i, bitmap);
3768 + * this changes the io permissions bitmap in the current task.
3770 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3772 + struct thread_struct * t = ¤t->thread;
3773 + struct physdev_set_iobitmap set_iobitmap;
3775 + if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3777 + if (turn_on && !capable(CAP_SYS_RAWIO))
3781 + * If it's the first ioperm() call in this thread's lifetime, set the
3782 + * IO bitmap up. ioperm() is much less timing critical than clone(),
3783 + * this is why we delay this operation until now:
3785 + if (!t->io_bitmap_ptr) {
3786 + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3791 + memset(bitmap, 0xff, IO_BITMAP_BYTES);
3792 + t->io_bitmap_ptr = bitmap;
3793 + set_thread_flag(TIF_IO_BITMAP);
3795 + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3796 + set_iobitmap.nr_ports = IO_BITMAP_BITS;
3797 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3801 + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3807 + * sys_iopl has to be used when you want to access the IO ports
3808 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3809 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
3811 +static int do_iopl(unsigned int level, struct thread_struct *t)
3813 + unsigned int old = t->iopl >> 12;
3817 + /* Trying to gain more privileges? */
3818 + if (level > old) {
3819 + if (!capable(CAP_SYS_RAWIO))
3826 +#ifdef CONFIG_X86_32
3827 +asmlinkage long sys_iopl(unsigned long regsp)
3829 + struct pt_regs *regs = (struct pt_regs *)®sp;
3830 + unsigned int level = regs->bx;
3832 +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
3835 + struct thread_struct *t = ¤t->thread;
3838 + rc = do_iopl(level, t);
3842 + t->iopl = level << 12;
3843 + set_iopl_mask(t->iopl);
3847 --- a/arch/x86/kernel/irq_32-xen.c
3848 +++ b/arch/x86/kernel/irq_32-xen.c
3849 @@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPU
3850 * SMP cross-CPU interrupts have their own specific
3853 -fastcall unsigned int do_IRQ(struct pt_regs *regs)
3854 +unsigned int do_IRQ(struct pt_regs *regs)
3856 struct pt_regs *old_regs;
3857 /* high bit used in ret_from_ code */
3858 - int irq = ~regs->orig_eax;
3859 + int irq = ~regs->orig_ax;
3860 struct irq_desc *desc = irq_desc + irq;
3861 #ifdef CONFIG_4KSTACKS
3862 union irq_ctx *curctx, *irqctx;
3863 @@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_r
3864 #ifdef CONFIG_DEBUG_STACKOVERFLOW
3865 /* Debugging check for stack overflow: is there less than 1KB free? */
3870 __asm__ __volatile__("andl %%esp,%0" :
3871 - "=r" (esp) : "0" (THREAD_SIZE - 1));
3872 - if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
3873 + "=r" (sp) : "0" (THREAD_SIZE - 1));
3874 + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
3875 printk("do_IRQ: stack overflow: %ld\n",
3876 - esp - sizeof(struct thread_info));
3877 + sp - sizeof(struct thread_info));
3881 @@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_r
3882 * current stack (which is the irq stack already after all)
3884 if (curctx != irqctx) {
3885 - int arg1, arg2, ebx;
3886 + int arg1, arg2, bx;
3888 /* build the stack frame on the IRQ stack */
3889 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
3890 @@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_r
3891 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
3894 - " xchgl %%ebx,%%esp \n"
3896 - " movl %%ebx,%%esp \n"
3897 - : "=a" (arg1), "=d" (arg2), "=b" (ebx)
3898 + " xchgl %%ebx,%%esp \n"
3900 + " movl %%ebx,%%esp \n"
3901 + : "=a" (arg1), "=d" (arg2), "=b" (bx)
3902 : "0" (irq), "1" (desc), "2" (isp),
3903 "D" (desc->handle_irq)
3905 --- a/arch/x86/kernel/irq_64-xen.c
3906 +++ b/arch/x86/kernel/irq_64-xen.c
3909 atomic_t irq_err_count;
3912 + * 'what should we do if we get a hw irq event on an illegal vector'.
3913 + * each architecture has to answer this themselves.
3915 +void ack_bad_irq(unsigned int irq)
3917 + printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq);
3918 +#ifdef CONFIG_X86_LOCAL_APIC
3920 + * Currently unexpected vectors happen only on SMP and APIC.
3921 + * We _must_ ack these because every local APIC has only N
3922 + * irq slots per priority level, and a 'hanging, unacked' IRQ
3923 + * holds up an irq slot - in excessive cases (when multiple
3924 + * unexpected vectors occur) that might lock up the APIC
3926 + * But don't ack when the APIC is disabled. -AK
3928 + if (!disable_apic)
3933 #ifdef CONFIG_DEBUG_STACKOVERFLOW
3935 * Probabilistic stack overflow check:
3936 @@ -33,11 +55,11 @@ static inline void stack_overflow_check(
3937 u64 curbase = (u64)task_stack_page(current);
3938 static unsigned long warned = -60*HZ;
3940 - if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
3941 - regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
3942 + if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
3943 + regs->sp < curbase + sizeof(struct thread_info) + 128 &&
3944 time_after(jiffies, warned + 60*HZ)) {
3945 - printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
3946 - current->comm, curbase, regs->rsp);
3947 + printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
3948 + current->comm, curbase, regs->sp);
3949 show_stack(NULL,NULL);
3952 @@ -150,7 +172,7 @@ asmlinkage unsigned int do_IRQ(struct pt
3953 struct pt_regs *old_regs = set_irq_regs(regs);
3955 /* high bit used in ret_from_ code */
3956 - unsigned irq = ~regs->orig_rax;
3957 + unsigned irq = ~regs->orig_ax;
3961 @@ -251,14 +273,3 @@ asmlinkage void do_softirq(void)
3963 local_irq_restore(flags);
3966 -#ifndef CONFIG_X86_LOCAL_APIC
3968 - * 'what should we do if we get a hw irq event on an illegal vector'.
3969 - * each architecture has to answer this themselves.
3971 -void ack_bad_irq(unsigned int irq)
3973 - printk("unexpected IRQ trap at irq %02x\n", irq);
3976 --- a/arch/x86/kernel/ldt_32-xen.c
3980 - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3981 - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
3984 -#include <linux/errno.h>
3985 -#include <linux/sched.h>
3986 -#include <linux/string.h>
3987 -#include <linux/mm.h>
3988 -#include <linux/smp.h>
3989 -#include <linux/vmalloc.h>
3990 -#include <linux/slab.h>
3992 -#include <asm/uaccess.h>
3993 -#include <asm/system.h>
3994 -#include <asm/ldt.h>
3995 -#include <asm/desc.h>
3996 -#include <asm/mmu_context.h>
3998 -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
3999 -static void flush_ldt(void *null)
4001 - if (current->active_mm)
4002 - load_LDT(¤t->active_mm->context);
4006 -static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
4012 - if (mincount <= pc->size)
4014 - oldsize = pc->size;
4015 - mincount = (mincount+511)&(~511);
4016 - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4017 - newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4019 - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4025 - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4027 - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4030 - pc->size = mincount;
4036 - preempt_disable();
4038 - make_pages_readonly(
4040 - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4041 - XENFEAT_writable_descriptor_tables);
4044 - mask = cpumask_of_cpu(smp_processor_id());
4045 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4046 - smp_call_function(flush_ldt, NULL, 1, 1);
4051 - make_pages_writable(
4053 - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4054 - XENFEAT_writable_descriptor_tables);
4055 - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4063 -static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4065 - int err = alloc_ldt(new, old->size, 0);
4068 - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4069 - make_pages_readonly(
4071 - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4072 - XENFEAT_writable_descriptor_tables);
4077 - * we do not have to muck with descriptors here, that is
4078 - * done in switch_mm() as needed.
4080 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4082 - struct mm_struct * old_mm;
4085 - mutex_init(&mm->context.lock);
4086 - mm->context.size = 0;
4087 - mm->context.has_foreign_mappings = 0;
4088 - old_mm = current->mm;
4089 - if (old_mm && old_mm->context.size > 0) {
4090 - mutex_lock(&old_mm->context.lock);
4091 - retval = copy_ldt(&mm->context, &old_mm->context);
4092 - mutex_unlock(&old_mm->context.lock);
4098 - * No need to lock the MM as we are the last user
4100 -void destroy_context(struct mm_struct *mm)
4102 - if (mm->context.size) {
4103 - if (mm == current->active_mm)
4105 - make_pages_writable(
4107 - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4108 - XENFEAT_writable_descriptor_tables);
4109 - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4110 - vfree(mm->context.ldt);
4112 - kfree(mm->context.ldt);
4113 - mm->context.size = 0;
4117 -static int read_ldt(void __user * ptr, unsigned long bytecount)
4120 - unsigned long size;
4121 - struct mm_struct * mm = current->mm;
4123 - if (!mm->context.size)
4125 - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4126 - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4128 - mutex_lock(&mm->context.lock);
4129 - size = mm->context.size*LDT_ENTRY_SIZE;
4130 - if (size > bytecount)
4134 - if (copy_to_user(ptr, mm->context.ldt, size))
4136 - mutex_unlock(&mm->context.lock);
4138 - goto error_return;
4139 - if (size != bytecount) {
4140 - /* zero-fill the rest */
4141 - if (clear_user(ptr+size, bytecount-size) != 0) {
4143 - goto error_return;
4151 -static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4154 - unsigned long size;
4157 - size = 5*sizeof(struct desc_struct);
4158 - if (size > bytecount)
4162 - if (clear_user(ptr, size))
4168 -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4170 - struct mm_struct * mm = current->mm;
4171 - __u32 entry_1, entry_2;
4173 - struct user_desc ldt_info;
4176 - if (bytecount != sizeof(ldt_info))
4179 - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4183 - if (ldt_info.entry_number >= LDT_ENTRIES)
4185 - if (ldt_info.contents == 3) {
4188 - if (ldt_info.seg_not_present == 0)
4192 - mutex_lock(&mm->context.lock);
4193 - if (ldt_info.entry_number >= mm->context.size) {
4194 - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1);
4199 - /* Allow LDTs to be cleared by the user. */
4200 - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4201 - if (oldmode || LDT_empty(&ldt_info)) {
4208 - entry_1 = LDT_entry_a(&ldt_info);
4209 - entry_2 = LDT_entry_b(&ldt_info);
4211 - entry_2 &= ~(1 << 20);
4213 - /* Install the new entry ... */
4215 - error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
4216 - entry_1, entry_2);
4219 - mutex_unlock(&mm->context.lock);
4224 -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4226 - int ret = -ENOSYS;
4230 - ret = read_ldt(ptr, bytecount);
4233 - ret = write_ldt(ptr, bytecount, 1);
4236 - ret = read_default_ldt(ptr, bytecount);
4239 - ret = write_ldt(ptr, bytecount, 0);
4244 --- a/arch/x86/kernel/ldt_64-xen.c
4248 - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4249 - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4250 - * Copyright (C) 2002 Andi Kleen
4252 - * This handles calls from both 32bit and 64bit mode.
4255 -#include <linux/errno.h>
4256 -#include <linux/sched.h>
4257 -#include <linux/string.h>
4258 -#include <linux/mm.h>
4259 -#include <linux/smp.h>
4260 -#include <linux/vmalloc.h>
4261 -#include <linux/slab.h>
4263 -#include <asm/uaccess.h>
4264 -#include <asm/system.h>
4265 -#include <asm/ldt.h>
4266 -#include <asm/desc.h>
4267 -#include <asm/proto.h>
4268 -#include <asm/pgalloc.h>
4270 -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
4271 -static void flush_ldt(void *null)
4273 - if (current->active_mm)
4274 - load_LDT(¤t->active_mm->context);
4278 -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
4284 - if (mincount <= (unsigned)pc->size)
4286 - oldsize = pc->size;
4287 - mincount = (mincount+511)&(~511);
4288 - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4289 - newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4291 - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4297 - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4299 - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4303 - pc->size = mincount;
4309 - preempt_disable();
4311 - make_pages_readonly(
4313 - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4314 - XENFEAT_writable_descriptor_tables);
4317 - mask = cpumask_of_cpu(smp_processor_id());
4318 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4319 - smp_call_function(flush_ldt, NULL, 1, 1);
4324 - make_pages_writable(
4326 - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4327 - XENFEAT_writable_descriptor_tables);
4328 - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4336 -static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4338 - int err = alloc_ldt(new, old->size, 0);
4341 - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4342 - make_pages_readonly(
4344 - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4345 - XENFEAT_writable_descriptor_tables);
4350 - * we do not have to muck with descriptors here, that is
4351 - * done in switch_mm() as needed.
4353 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4355 - struct mm_struct * old_mm;
4358 - memset(&mm->context, 0, sizeof(mm->context));
4359 - mutex_init(&mm->context.lock);
4360 - old_mm = current->mm;
4362 - mm->context.vdso = old_mm->context.vdso;
4363 - if (old_mm && old_mm->context.size > 0) {
4364 - mutex_lock(&old_mm->context.lock);
4365 - retval = copy_ldt(&mm->context, &old_mm->context);
4366 - mutex_unlock(&old_mm->context.lock);
4373 - * Don't touch the LDT register - we're already in the next thread.
4375 -void destroy_context(struct mm_struct *mm)
4377 - if (mm->context.size) {
4378 - if (mm == current->active_mm)
4380 - make_pages_writable(
4382 - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4383 - XENFEAT_writable_descriptor_tables);
4384 - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4385 - vfree(mm->context.ldt);
4387 - kfree(mm->context.ldt);
4388 - mm->context.size = 0;
4392 -static int read_ldt(void __user * ptr, unsigned long bytecount)
4395 - unsigned long size;
4396 - struct mm_struct * mm = current->mm;
4398 - if (!mm->context.size)
4400 - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4401 - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4403 - mutex_lock(&mm->context.lock);
4404 - size = mm->context.size*LDT_ENTRY_SIZE;
4405 - if (size > bytecount)
4409 - if (copy_to_user(ptr, mm->context.ldt, size))
4411 - mutex_unlock(&mm->context.lock);
4413 - goto error_return;
4414 - if (size != bytecount) {
4415 - /* zero-fill the rest */
4416 - if (clear_user(ptr+size, bytecount-size) != 0) {
4418 - goto error_return;
4426 -static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4428 - /* Arbitrary number */
4429 - /* x86-64 default LDT is all zeros */
4430 - if (bytecount > 128)
4432 - if (clear_user(ptr, bytecount))
4437 -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4439 - struct task_struct *me = current;
4440 - struct mm_struct * mm = me->mm;
4441 - __u32 entry_1, entry_2, *lp;
4442 - unsigned long mach_lp;
4444 - struct user_desc ldt_info;
4448 - if (bytecount != sizeof(ldt_info))
4451 - if (copy_from_user(&ldt_info, ptr, bytecount))
4455 - if (ldt_info.entry_number >= LDT_ENTRIES)
4457 - if (ldt_info.contents == 3) {
4460 - if (ldt_info.seg_not_present == 0)
4464 - mutex_lock(&mm->context.lock);
4465 - if (ldt_info.entry_number >= (unsigned)mm->context.size) {
4466 - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1);
4471 - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
4472 - mach_lp = arbitrary_virt_to_machine(lp);
4474 - /* Allow LDTs to be cleared by the user. */
4475 - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4476 - if (oldmode || LDT_empty(&ldt_info)) {
4483 - entry_1 = LDT_entry_a(&ldt_info);
4484 - entry_2 = LDT_entry_b(&ldt_info);
4486 - entry_2 &= ~(1 << 20);
4488 - /* Install the new entry ... */
4490 - error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
4493 - mutex_unlock(&mm->context.lock);
4498 -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4500 - int ret = -ENOSYS;
4504 - ret = read_ldt(ptr, bytecount);
4507 - ret = write_ldt(ptr, bytecount, 1);
4510 - ret = read_default_ldt(ptr, bytecount);
4513 - ret = write_ldt(ptr, bytecount, 0);
4519 +++ b/arch/x86/kernel/ldt-xen.c
4522 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4523 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4524 + * Copyright (C) 2002 Andi Kleen
4526 + * This handles calls from both 32bit and 64bit mode.
4529 +#include <linux/errno.h>
4530 +#include <linux/sched.h>
4531 +#include <linux/string.h>
4532 +#include <linux/mm.h>
4533 +#include <linux/smp.h>
4534 +#include <linux/vmalloc.h>
4536 +#include <asm/uaccess.h>
4537 +#include <asm/system.h>
4538 +#include <asm/ldt.h>
4539 +#include <asm/desc.h>
4540 +#include <asm/mmu_context.h>
4543 +static void flush_ldt(void *null)
4545 + if (current->active_mm)
4546 + load_LDT(¤t->active_mm->context);
4550 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
4552 + void *oldldt, *newldt;
4555 + if (mincount <= pc->size)
4557 + oldsize = pc->size;
4558 + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
4559 + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
4560 + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
4561 + newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
4563 + newldt = (void *)__get_free_page(GFP_KERNEL);
4569 + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
4571 + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
4572 + (mincount - oldsize) * LDT_ENTRY_SIZE);
4574 +#ifdef CONFIG_X86_64
4575 + /* CHECKME: Do we really need this ? */
4580 + pc->size = mincount;
4587 + preempt_disable();
4589 + make_pages_readonly(newldt,
4590 + (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
4591 + XENFEAT_writable_descriptor_tables);
4594 + mask = cpumask_of_cpu(smp_processor_id());
4595 + if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4596 + smp_call_function(flush_ldt, NULL, 1, 1);
4601 + make_pages_writable(oldldt,
4602 + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4603 + XENFEAT_writable_descriptor_tables);
4604 + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
4607 + put_page(virt_to_page(oldldt));
4612 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4614 + int err = alloc_ldt(new, old->size, 0);
4618 + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
4619 + make_pages_readonly(new->ldt,
4620 + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4621 + XENFEAT_writable_descriptor_tables);
4626 + * we do not have to muck with descriptors here, that is
4627 + * done in switch_mm() as needed.
4629 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4631 + struct mm_struct *old_mm;
4634 + memset(&mm->context, 0, sizeof(mm->context));
4635 + mutex_init(&mm->context.lock);
4636 + old_mm = current->mm;
4638 + mm->context.vdso = old_mm->context.vdso;
4639 + if (old_mm && old_mm->context.size > 0) {
4640 + mutex_lock(&old_mm->context.lock);
4641 + retval = copy_ldt(&mm->context, &old_mm->context);
4642 + mutex_unlock(&old_mm->context.lock);
4648 + * No need to lock the MM as we are the last user
4650 + * 64bit: Don't touch the LDT register - we're already in the next thread.
4652 +void destroy_context(struct mm_struct *mm)
4654 + if (mm->context.size) {
4655 + /* CHECKME: Can this ever happen ? */
4656 + if (mm == current->active_mm)
4658 + make_pages_writable(mm->context.ldt,
4659 + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4660 + XENFEAT_writable_descriptor_tables);
4661 + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
4662 + vfree(mm->context.ldt);
4664 + put_page(virt_to_page(mm->context.ldt));
4665 + mm->context.size = 0;
4669 +static int read_ldt(void __user *ptr, unsigned long bytecount)
4672 + unsigned long size;
4673 + struct mm_struct *mm = current->mm;
4675 + if (!mm->context.size)
4677 + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
4678 + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
4680 + mutex_lock(&mm->context.lock);
4681 + size = mm->context.size * LDT_ENTRY_SIZE;
4682 + if (size > bytecount)
4686 + if (copy_to_user(ptr, mm->context.ldt, size))
4688 + mutex_unlock(&mm->context.lock);
4690 + goto error_return;
4691 + if (size != bytecount) {
4692 + /* zero-fill the rest */
4693 + if (clear_user(ptr + size, bytecount - size) != 0) {
4695 + goto error_return;
4703 +static int read_default_ldt(void __user *ptr, unsigned long bytecount)
4705 + /* CHECKME: Can we use _one_ random number ? */
4706 +#ifdef CONFIG_X86_32
4707 + unsigned long size = 5 * sizeof(struct desc_struct);
4709 + unsigned long size = 128;
4711 + if (bytecount > size)
4713 + if (clear_user(ptr, bytecount))
4718 +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
4720 + struct mm_struct *mm = current->mm;
4721 + struct desc_struct ldt;
4723 + struct user_desc ldt_info;
4726 + if (bytecount != sizeof(ldt_info))
4729 + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4733 + if (ldt_info.entry_number >= LDT_ENTRIES)
4735 + if (ldt_info.contents == 3) {
4738 + if (ldt_info.seg_not_present == 0)
4742 + mutex_lock(&mm->context.lock);
4743 + if (ldt_info.entry_number >= mm->context.size) {
4744 + error = alloc_ldt(¤t->mm->context,
4745 + ldt_info.entry_number + 1, 1);
4750 + /* Allow LDTs to be cleared by the user. */
4751 + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4752 + if (oldmode || LDT_empty(&ldt_info)) {
4753 + memset(&ldt, 0, sizeof(ldt));
4758 + fill_ldt(&ldt, &ldt_info);
4762 + /* Install the new entry ... */
4764 + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
4767 + mutex_unlock(&mm->context.lock);
4772 +asmlinkage int sys_modify_ldt(int func, void __user *ptr,
4773 + unsigned long bytecount)
4775 + int ret = -ENOSYS;
4779 + ret = read_ldt(ptr, bytecount);
4782 + ret = write_ldt(ptr, bytecount, 1);
4785 + ret = read_default_ldt(ptr, bytecount);
4788 + ret = write_ldt(ptr, bytecount, 0);
4793 --- a/arch/x86/kernel/machine_kexec_64.c
4794 +++ b/arch/x86/kernel/machine_kexec_64.c
4795 @@ -300,7 +300,9 @@ void machine_kexec(struct kimage *image)
4797 void arch_crash_save_vmcoreinfo(void)
4799 +#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
4800 VMCOREINFO_SYMBOL(phys_base);
4802 VMCOREINFO_SYMBOL(init_level4_pgt);
4805 --- a/arch/x86/kernel/Makefile
4806 +++ b/arch/x86/kernel/Makefile
4807 @@ -120,11 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
4809 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
4811 + obj-$(CONFIG_XEN) += nmi_64.o
4812 time_64-$(CONFIG_XEN) += time_32.o
4813 pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
4816 disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
4817 smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
4818 -disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o
4819 -%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) :=
4820 --- a/arch/x86/kernel/microcode-xen.c
4821 +++ b/arch/x86/kernel/microcode-xen.c
4822 @@ -167,7 +167,7 @@ static int request_microcode(void)
4825 op.cmd = XENPF_microcode_update;
4826 - set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
4827 + set_xen_guest_handle(op.u.microcode.data, firmware->data);
4828 op.u.microcode.length = firmware->size;
4829 error = HYPERVISOR_platform_op(&op);
4831 --- a/arch/x86/kernel/mpparse_32-xen.c
4832 +++ b/arch/x86/kernel/mpparse_32-xen.c
4833 @@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
4834 /* Processor that is doing the boot up */
4835 unsigned int boot_cpu_physical_apicid = -1U;
4836 /* Internal processor count */
4837 -unsigned int __cpuinitdata num_processors;
4838 +unsigned int num_processors;
4840 /* Bitmask of physically existing CPUs */
4841 physid_mask_t phys_cpu_present_map;
4842 @@ -265,7 +265,7 @@ static void __init MP_ioapic_info (struc
4843 if (!(m->mpc_flags & MPC_APIC_USABLE))
4846 - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
4847 + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
4848 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
4849 if (nr_ioapics >= MAX_IO_APICS) {
4850 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
4851 @@ -412,9 +412,9 @@ static int __init smp_read_mpc(struct mp
4853 mps_oem_check(mpc, oem, str);
4855 - printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
4856 + printk("APIC at: 0x%X\n", mpc->mpc_lapic);
4860 * Save the local APIC address (it might be non-default) -- but only
4861 * if we're not using ACPI.
4863 @@ -728,7 +728,7 @@ static int __init smp_scan_config (unsig
4864 unsigned long *bp = isa_bus_to_virt(base);
4865 struct intel_mp_floating *mpf;
4867 - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
4868 + printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
4869 if (sizeof(*mpf) != 16)
4870 printk("Error: MPF size\n");
4872 @@ -742,9 +742,10 @@ static int __init smp_scan_config (unsig
4874 smp_found_config = 1;
4876 - printk(KERN_INFO "found SMP MP-table at %08lx\n",
4877 - virt_to_phys(mpf));
4878 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
4879 + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4880 + mpf, virt_to_phys(mpf));
4881 + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
4883 if (mpf->mpf_physptr) {
4885 * We cannot access to MPC table to compute
4886 @@ -759,11 +760,12 @@ static int __init smp_scan_config (unsig
4887 unsigned long end = max_low_pfn * PAGE_SIZE;
4888 if (mpf->mpf_physptr + size > end)
4889 size = end - mpf->mpf_physptr;
4890 - reserve_bootmem(mpf->mpf_physptr, size);
4891 + reserve_bootmem(mpf->mpf_physptr, size,
4895 - printk(KERN_INFO "found SMP MP-table at %08lx\n",
4896 - ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
4897 + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4898 + mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
4902 @@ -940,14 +942,14 @@ void __init mp_register_ioapic(u8 id, u3
4904 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
4905 mp_ioapic_routing[idx].gsi_base = gsi_base;
4906 - mp_ioapic_routing[idx].gsi_end = gsi_base +
4907 + mp_ioapic_routing[idx].gsi_end = gsi_base +
4908 io_apic_get_redir_entries(idx);
4910 - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
4911 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4912 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4913 - mp_ioapic_routing[idx].gsi_base,
4914 - mp_ioapic_routing[idx].gsi_end);
4915 + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4916 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4917 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4918 + mp_ioapic_routing[idx].gsi_base,
4919 + mp_ioapic_routing[idx].gsi_end);
4923 @@ -1063,15 +1065,16 @@ void __init mp_config_acpi_legacy_irqs (
4926 #define MAX_GSI_NUM 4096
4927 +#define IRQ_COMPRESSION_START 64
4929 int mp_register_gsi(u32 gsi, int triggering, int polarity)
4934 - static int pci_irq = 16;
4935 + static int pci_irq = IRQ_COMPRESSION_START;
4937 - * Mapping between Global System Interrups, which
4938 + * Mapping between Global System Interrupts, which
4939 * represent all possible interrupts, and IRQs
4940 * assigned to actual devices.
4942 @@ -1108,12 +1111,16 @@ int mp_register_gsi(u32 gsi, int trigger
4943 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
4944 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
4945 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
4946 - return gsi_to_irq[gsi];
4947 + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
4950 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
4952 - if (triggering == ACPI_LEVEL_SENSITIVE) {
4954 + * For GSI >= 64, use IRQ compression
4956 + if ((gsi >= IRQ_COMPRESSION_START)
4957 + && (triggering == ACPI_LEVEL_SENSITIVE)) {
4959 * For PCI devices assign IRQs in order, avoiding gaps
4960 * due to unused I/O APIC pins.
4961 --- a/arch/x86/kernel/mpparse_64-xen.c
4962 +++ b/arch/x86/kernel/mpparse_64-xen.c
4963 @@ -60,14 +60,20 @@ unsigned int boot_cpu_id = -1U;
4964 EXPORT_SYMBOL(boot_cpu_id);
4966 /* Internal processor count */
4967 -unsigned int num_processors __cpuinitdata = 0;
4968 +unsigned int num_processors;
4970 unsigned disabled_cpus __cpuinitdata;
4972 /* Bitmask of physically existing CPUs */
4973 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
4975 -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
4977 +u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
4978 + = { [0 ... NR_CPUS-1] = BAD_APICID };
4979 +void *x86_bios_cpu_apicid_early_ptr;
4981 +DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
4982 +EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
4986 @@ -119,24 +125,22 @@ static void __cpuinit MP_processor_info(
4987 physid_set(m->mpc_apicid, phys_cpu_present_map);
4988 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4990 - * bios_cpu_apicid is required to have processors listed
4991 + * x86_bios_cpu_apicid is required to have processors listed
4992 * in same order as logical cpu numbers. Hence the first
4993 * entry is BSP, and so on.
4997 - bios_cpu_apicid[cpu] = m->mpc_apicid;
4999 - * We get called early in the the start_kernel initialization
5000 - * process when the per_cpu data area is not yet setup, so we
5001 - * use a static array that is removed after the per_cpu data
5002 - * area is created.
5004 - if (x86_cpu_to_apicid_ptr) {
5005 - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
5006 - x86_cpu_to_apicid[cpu] = m->mpc_apicid;
5007 + /* are we being called early in kernel startup? */
5008 + if (x86_cpu_to_apicid_early_ptr) {
5009 + u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
5010 + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
5012 + cpu_to_apicid[cpu] = m->mpc_apicid;
5013 + bios_cpu_apicid[cpu] = m->mpc_apicid;
5015 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
5016 + per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
5019 cpu_set(cpu, cpu_possible_map);
5020 --- a/arch/x86/kernel/pci-dma-xen.c
5021 +++ b/arch/x86/kernel/pci-dma-xen.c
5022 @@ -434,3 +434,23 @@ dma_sync_single_for_device(struct device
5023 swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
5025 EXPORT_SYMBOL(dma_sync_single_for_device);
5028 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
5029 + enum dma_data_direction direction)
5032 + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
5033 + flush_write_buffers();
5035 +EXPORT_SYMBOL(dma_sync_sg_for_cpu);
5038 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
5039 + enum dma_data_direction direction)
5042 + swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
5043 + flush_write_buffers();
5045 +EXPORT_SYMBOL(dma_sync_sg_for_device);
5046 --- a/arch/x86/kernel/process_32-xen.c
5047 +++ b/arch/x86/kernel/process_32-xen.c
5049 #include <linux/slab.h>
5050 #include <linux/vmalloc.h>
5051 #include <linux/user.h>
5052 -#include <linux/a.out.h>
5053 #include <linux/interrupt.h>
5054 #include <linux/utsname.h>
5055 #include <linux/delay.h>
5058 #include <asm/tlbflush.h>
5059 #include <asm/cpu.h>
5060 +#include <asm/kdebug.h>
5062 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
5063 +asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
5065 static int hlt_counter;
5067 @@ -78,7 +79,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
5069 unsigned long thread_saved_pc(struct task_struct *tsk)
5071 - return ((unsigned long *)tsk->thread.esp)[3];
5072 + return ((unsigned long *)tsk->thread.sp)[3];
5076 @@ -86,7 +87,6 @@ unsigned long thread_saved_pc(struct tas
5078 void (*pm_idle)(void);
5079 EXPORT_SYMBOL(pm_idle);
5080 -static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
5082 void disable_hlt(void)
5084 @@ -107,7 +107,7 @@ EXPORT_SYMBOL(enable_hlt);
5085 * to poll the ->work.need_resched flag instead of waiting for the
5086 * cross-CPU IPI to arrive. Use this option with caution.
5088 -static void poll_idle (void)
5089 +static void poll_idle(void)
5093 @@ -122,10 +122,19 @@ static void xen_idle(void)
5096 local_irq_disable();
5097 - if (!need_resched())
5098 + if (!need_resched()) {
5103 + t0n = ktime_to_ns(t0);
5104 safe_halt(); /* enables interrupts racelessly */
5106 - local_irq_enable();
5107 + local_irq_disable();
5109 + t1n = ktime_to_ns(t1);
5110 + sched_clock_idle_wakeup_event(t1n - t0n);
5112 + local_irq_enable();
5113 current_thread_info()->status |= TS_POLLING;
5115 #ifdef CONFIG_APM_MODULE
5116 @@ -168,13 +177,13 @@ void cpu_idle(void)
5117 while (!need_resched()) {
5120 - if (__get_cpu_var(cpu_idle_state))
5121 - __get_cpu_var(cpu_idle_state) = 0;
5125 idle = xen_idle; /* no alternatives */
5127 + if (rcu_pending(cpu))
5128 + rcu_check_callbacks(cpu, 0);
5130 if (cpu_is_offline(cpu))
5133 @@ -192,40 +201,19 @@ static void do_nothing(void *unused)
5138 + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
5139 + * pm_idle and update to new pm_idle value. Required while changing pm_idle
5140 + * handler on SMP systems.
5142 + * Caller must have changed pm_idle to the new value before the call. Old
5143 + * pm_idle value will not be used by any CPU after the return of this function.
5145 void cpu_idle_wait(void)
5147 - unsigned int cpu, this_cpu = get_cpu();
5148 - cpumask_t map, tmp = current->cpus_allowed;
5150 - set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5154 - for_each_online_cpu(cpu) {
5155 - per_cpu(cpu_idle_state, cpu) = 1;
5156 - cpu_set(cpu, map);
5159 - __get_cpu_var(cpu_idle_state) = 0;
5164 - for_each_online_cpu(cpu) {
5165 - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
5166 - cpu_clear(cpu, map);
5168 - cpus_and(map, map, cpu_online_map);
5170 - * We waited 1 sec, if a CPU still did not call idle
5171 - * it may be because it is in idle and not waking up
5172 - * because it has nothing to do.
5173 - * Give all the remaining CPUS a kick.
5175 - smp_call_function_mask(map, do_nothing, 0, 0);
5176 - } while (!cpus_empty(map));
5178 - set_cpus_allowed(current, tmp);
5180 + /* kick all the CPUs so that they exit out of pm_idle */
5181 + smp_call_function(do_nothing, NULL, 0, 1);
5183 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5185 @@ -251,15 +239,15 @@ void __show_registers(struct pt_regs *re
5187 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
5188 unsigned long d0, d1, d2, d3, d6, d7;
5189 - unsigned long esp;
5191 unsigned short ss, gs;
5193 if (user_mode_vm(regs)) {
5195 - ss = regs->xss & 0xffff;
5197 + ss = regs->ss & 0xffff;
5198 savesegment(gs, gs);
5200 - esp = (unsigned long) (&regs->esp);
5201 + sp = (unsigned long) (&regs->sp);
5202 savesegment(ss, ss);
5203 savesegment(gs, gs);
5205 @@ -272,17 +260,17 @@ void __show_registers(struct pt_regs *re
5206 init_utsname()->version);
5208 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
5209 - 0xffff & regs->xcs, regs->eip, regs->eflags,
5210 + 0xffff & regs->cs, regs->ip, regs->flags,
5211 smp_processor_id());
5212 - print_symbol("EIP is at %s\n", regs->eip);
5213 + print_symbol("EIP is at %s\n", regs->ip);
5215 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
5216 - regs->eax, regs->ebx, regs->ecx, regs->edx);
5217 + regs->ax, regs->bx, regs->cx, regs->dx);
5218 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
5219 - regs->esi, regs->edi, regs->ebp, esp);
5220 + regs->si, regs->di, regs->bp, sp);
5221 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
5222 - regs->xds & 0xffff, regs->xes & 0xffff,
5223 - regs->xfs & 0xffff, gs, ss);
5224 + regs->ds & 0xffff, regs->es & 0xffff,
5225 + regs->fs & 0xffff, gs, ss);
5229 @@ -310,12 +298,12 @@ void __show_registers(struct pt_regs *re
5230 void show_regs(struct pt_regs *regs)
5232 __show_registers(regs, 1);
5233 - show_trace(NULL, regs, &regs->esp);
5234 + show_trace(NULL, regs, &regs->sp, regs->bp);
5238 - * This gets run with %ebx containing the
5239 - * function to call, and %edx containing
5240 + * This gets run with %bx containing the
5241 + * function to call, and %dx containing
5244 extern void kernel_thread_helper(void);
5245 @@ -329,16 +317,16 @@ int kernel_thread(int (*fn)(void *), voi
5247 memset(&regs, 0, sizeof(regs));
5249 - regs.ebx = (unsigned long) fn;
5250 - regs.edx = (unsigned long) arg;
5251 + regs.bx = (unsigned long) fn;
5252 + regs.dx = (unsigned long) arg;
5254 - regs.xds = __USER_DS;
5255 - regs.xes = __USER_DS;
5256 - regs.xfs = __KERNEL_PERCPU;
5257 - regs.orig_eax = -1;
5258 - regs.eip = (unsigned long) kernel_thread_helper;
5259 - regs.xcs = __KERNEL_CS | get_kernel_rpl();
5260 - regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5261 + regs.ds = __USER_DS;
5262 + regs.es = __USER_DS;
5263 + regs.fs = __KERNEL_PERCPU;
5264 + regs.orig_ax = -1;
5265 + regs.ip = (unsigned long) kernel_thread_helper;
5266 + regs.cs = __KERNEL_CS | get_kernel_rpl();
5267 + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5269 /* Ok, create the new process.. */
5270 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
5271 @@ -368,7 +356,12 @@ void flush_thread(void)
5273 struct task_struct *tsk = current;
5275 - memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
5276 + tsk->thread.debugreg0 = 0;
5277 + tsk->thread.debugreg1 = 0;
5278 + tsk->thread.debugreg2 = 0;
5279 + tsk->thread.debugreg3 = 0;
5280 + tsk->thread.debugreg6 = 0;
5281 + tsk->thread.debugreg7 = 0;
5282 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5283 clear_tsk_thread_flag(tsk, TIF_DEBUG);
5285 @@ -393,7 +386,7 @@ void prepare_to_copy(struct task_struct
5289 -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
5290 +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
5291 unsigned long unused,
5292 struct task_struct * p, struct pt_regs * regs)
5294 @@ -403,17 +396,19 @@ int copy_thread(int nr, unsigned long cl
5296 childregs = task_pt_regs(p);
5298 - childregs->eax = 0;
5299 - childregs->esp = esp;
5300 + childregs->ax = 0;
5301 + childregs->sp = sp;
5303 - p->thread.esp = (unsigned long) childregs;
5304 - p->thread.esp0 = (unsigned long) (childregs+1);
5305 + p->thread.sp = (unsigned long) childregs;
5306 + p->thread.sp0 = (unsigned long) (childregs+1);
5308 - p->thread.eip = (unsigned long) ret_from_fork;
5309 + p->thread.ip = (unsigned long) ret_from_fork;
5311 - savesegment(gs,p->thread.gs);
5312 + savesegment(gs, p->thread.gs);
5315 + if (test_tsk_thread_flag(tsk, TIF_CSTAR))
5316 + p->thread.ip = (unsigned long) cstar_ret_from_fork;
5317 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
5318 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
5319 IO_BITMAP_BYTES, GFP_KERNEL);
5320 @@ -424,34 +419,17 @@ int copy_thread(int nr, unsigned long cl
5321 set_tsk_thread_flag(p, TIF_IO_BITMAP);
5327 * Set a new TLS for the child thread?
5329 - if (clone_flags & CLONE_SETTLS) {
5330 - struct desc_struct *desc;
5331 - struct user_desc info;
5335 - if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
5338 - if (LDT_empty(&info))
5341 - idx = info.entry_number;
5342 - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5345 - desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5346 - desc->a = LDT_entry_a(&info);
5347 - desc->b = LDT_entry_b(&info);
5349 + if (clone_flags & CLONE_SETTLS)
5350 + err = do_set_thread_area(p, -1,
5351 + (struct user_desc __user *)childregs->si, 0);
5353 p->thread.iopl = current->thread.iopl;
5357 if (err && p->thread.io_bitmap_ptr) {
5358 kfree(p->thread.io_bitmap_ptr);
5359 p->thread.io_bitmap_max = 0;
5360 @@ -459,67 +437,8 @@ int copy_thread(int nr, unsigned long cl
5365 - * fill in the user structure for a core dump..
5367 -void dump_thread(struct pt_regs * regs, struct user * dump)
5371 -/* changed the size calculations - should hopefully work better. lbt */
5372 - dump->magic = CMAGIC;
5373 - dump->start_code = 0;
5374 - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
5375 - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
5376 - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
5377 - dump->u_dsize -= dump->u_tsize;
5378 - dump->u_ssize = 0;
5379 - for (i = 0; i < 8; i++)
5380 - dump->u_debugreg[i] = current->thread.debugreg[i];
5382 - if (dump->start_stack < TASK_SIZE)
5383 - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
5385 - dump->regs.ebx = regs->ebx;
5386 - dump->regs.ecx = regs->ecx;
5387 - dump->regs.edx = regs->edx;
5388 - dump->regs.esi = regs->esi;
5389 - dump->regs.edi = regs->edi;
5390 - dump->regs.ebp = regs->ebp;
5391 - dump->regs.eax = regs->eax;
5392 - dump->regs.ds = regs->xds;
5393 - dump->regs.es = regs->xes;
5394 - dump->regs.fs = regs->xfs;
5395 - savesegment(gs,dump->regs.gs);
5396 - dump->regs.orig_eax = regs->orig_eax;
5397 - dump->regs.eip = regs->eip;
5398 - dump->regs.cs = regs->xcs;
5399 - dump->regs.eflags = regs->eflags;
5400 - dump->regs.esp = regs->esp;
5401 - dump->regs.ss = regs->xss;
5403 - dump->u_fpvalid = dump_fpu (regs, &dump->i387);
5405 -EXPORT_SYMBOL(dump_thread);
5408 - * Capture the user space registers if the task is not running (in user space)
5410 -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
5412 - struct pt_regs ptregs = *task_pt_regs(tsk);
5413 - ptregs.xcs &= 0xffff;
5414 - ptregs.xds &= 0xffff;
5415 - ptregs.xes &= 0xffff;
5416 - ptregs.xss &= 0xffff;
5418 - elf_core_copy_regs(regs, &ptregs);
5423 #ifdef CONFIG_SECCOMP
5424 -void hard_disable_TSC(void)
5425 +static void hard_disable_TSC(void)
5427 write_cr4(read_cr4() | X86_CR4_TSD);
5429 @@ -534,7 +453,7 @@ void disable_TSC(void)
5433 -void hard_enable_TSC(void)
5434 +static void hard_enable_TSC(void)
5436 write_cr4(read_cr4() & ~X86_CR4_TSD);
5438 @@ -543,18 +462,32 @@ void hard_enable_TSC(void)
5439 static noinline void
5440 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
5442 - struct thread_struct *next;
5443 + struct thread_struct *prev, *next;
5444 + unsigned long debugctl;
5446 + prev = &prev_p->thread;
5447 next = &next_p->thread;
5449 + debugctl = prev->debugctlmsr;
5450 + if (next->ds_area_msr != prev->ds_area_msr) {
5451 + /* we clear debugctl to make sure DS
5452 + * is not in use when we change it */
5454 + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
5455 + wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
5458 + if (next->debugctlmsr != debugctl)
5459 + wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
5461 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5462 - set_debugreg(next->debugreg[0], 0);
5463 - set_debugreg(next->debugreg[1], 1);
5464 - set_debugreg(next->debugreg[2], 2);
5465 - set_debugreg(next->debugreg[3], 3);
5466 + set_debugreg(next->debugreg0, 0);
5467 + set_debugreg(next->debugreg1, 1);
5468 + set_debugreg(next->debugreg2, 2);
5469 + set_debugreg(next->debugreg3, 3);
5471 - set_debugreg(next->debugreg[6], 6);
5472 - set_debugreg(next->debugreg[7], 7);
5473 + set_debugreg(next->debugreg6, 6);
5474 + set_debugreg(next->debugreg7, 7);
5477 #ifdef CONFIG_SECCOMP
5478 @@ -567,6 +500,14 @@ __switch_to_xtra(struct task_struct *pre
5484 + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
5485 + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
5487 + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
5488 + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
5493 @@ -592,11 +533,11 @@ __switch_to_xtra(struct task_struct *pre
5494 * More important, however, is the fact that this allows us much
5497 - * The return value (in %eax) will be the "prev" task after
5498 + * The return value (in %ax) will be the "prev" task after
5499 * the task-switch, and shows up in ret_from_fork in entry.S,
5502 -struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5503 +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5505 struct thread_struct *prev = &prev_p->thread,
5506 *next = &next_p->thread;
5507 @@ -632,12 +573,12 @@ struct task_struct fastcall * __switch_t
5512 - * This is load_esp0(tss, next) with a multicall.
5514 + * This is load_sp0(tss, next) with a multicall.
5516 mcl->op = __HYPERVISOR_stack_switch;
5517 mcl->args[0] = __KERNEL_DS;
5518 - mcl->args[1] = next->esp0;
5519 + mcl->args[1] = next->sp0;
5523 @@ -734,7 +675,7 @@ struct task_struct fastcall * __switch_t
5525 asmlinkage int sys_fork(struct pt_regs regs)
5527 - return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5528 + return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5531 asmlinkage int sys_clone(struct pt_regs regs)
5532 @@ -743,12 +684,12 @@ asmlinkage int sys_clone(struct pt_regs
5533 unsigned long newsp;
5534 int __user *parent_tidptr, *child_tidptr;
5536 - clone_flags = regs.ebx;
5538 - parent_tidptr = (int __user *)regs.edx;
5539 - child_tidptr = (int __user *)regs.edi;
5540 + clone_flags = regs.bx;
5542 + parent_tidptr = (int __user *)regs.dx;
5543 + child_tidptr = (int __user *)regs.di;
5547 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
5550 @@ -764,7 +705,7 @@ asmlinkage int sys_clone(struct pt_regs
5552 asmlinkage int sys_vfork(struct pt_regs regs)
5554 - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5555 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5559 @@ -775,18 +716,15 @@ asmlinkage int sys_execve(struct pt_regs
5563 - filename = getname((char __user *) regs.ebx);
5564 + filename = getname((char __user *) regs.bx);
5565 error = PTR_ERR(filename);
5566 if (IS_ERR(filename))
5568 error = do_execve(filename,
5569 - (char __user * __user *) regs.ecx,
5570 - (char __user * __user *) regs.edx,
5571 + (char __user * __user *) regs.cx,
5572 + (char __user * __user *) regs.dx,
5575 - task_lock(current);
5576 - current->ptrace &= ~PT_DTRACE;
5577 - task_unlock(current);
5578 /* Make sure we don't return using sysenter.. */
5579 set_thread_flag(TIF_IRET);
5581 @@ -800,145 +738,37 @@ out:
5583 unsigned long get_wchan(struct task_struct *p)
5585 - unsigned long ebp, esp, eip;
5586 + unsigned long bp, sp, ip;
5587 unsigned long stack_page;
5589 if (!p || p == current || p->state == TASK_RUNNING)
5591 stack_page = (unsigned long)task_stack_page(p);
5592 - esp = p->thread.esp;
5593 - if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
5594 + sp = p->thread.sp;
5595 + if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
5597 - /* include/asm-i386/system.h:switch_to() pushes ebp last. */
5598 - ebp = *(unsigned long *) esp;
5599 + /* include/asm-i386/system.h:switch_to() pushes bp last. */
5600 + bp = *(unsigned long *) sp;
5602 - if (ebp < stack_page || ebp > top_ebp+stack_page)
5603 + if (bp < stack_page || bp > top_ebp+stack_page)
5605 - eip = *(unsigned long *) (ebp+4);
5606 - if (!in_sched_functions(eip))
5608 - ebp = *(unsigned long *) ebp;
5609 + ip = *(unsigned long *) (bp+4);
5610 + if (!in_sched_functions(ip))
5612 + bp = *(unsigned long *) bp;
5613 } while (count++ < 16);
5618 - * sys_alloc_thread_area: get a yet unused TLS descriptor index.
5620 -static int get_free_idx(void)
5622 - struct thread_struct *t = &current->thread;
5625 - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
5626 - if (desc_empty(t->tls_array + idx))
5627 - return idx + GDT_ENTRY_TLS_MIN;
5632 - * Set a given TLS descriptor:
5634 -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
5636 - struct thread_struct *t = &current->thread;
5637 - struct user_desc info;
5638 - struct desc_struct *desc;
5641 - if (copy_from_user(&info, u_info, sizeof(info)))
5643 - idx = info.entry_number;
5646 - * index -1 means the kernel should try to find and
5647 - * allocate an empty descriptor:
5650 - idx = get_free_idx();
5653 - if (put_user(idx, &u_info->entry_number))
5657 - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5660 - desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
5663 - * We must not get preempted while modifying the TLS.
5667 - if (LDT_empty(&info)) {
5671 - desc->a = LDT_entry_a(&info);
5672 - desc->b = LDT_entry_b(&info);
5682 - * Get the current Thread-Local Storage area:
5685 -#define GET_BASE(desc) ( \
5686 - (((desc)->a >> 16) & 0x0000ffff) | \
5687 - (((desc)->b << 16) & 0x00ff0000) | \
5688 - ( (desc)->b & 0xff000000) )
5690 -#define GET_LIMIT(desc) ( \
5691 - ((desc)->a & 0x0ffff) | \
5692 - ((desc)->b & 0xf0000) )
5694 -#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
5695 -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
5696 -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
5697 -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
5698 -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
5699 -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
5701 -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
5703 - struct user_desc info;
5704 - struct desc_struct *desc;
5707 - if (get_user(idx, &u_info->entry_number))
5709 - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5712 - memset(&info, 0, sizeof(info));
5714 - desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5716 - info.entry_number = idx;
5717 - info.base_addr = GET_BASE(desc);
5718 - info.limit = GET_LIMIT(desc);
5719 - info.seg_32bit = GET_32BIT(desc);
5720 - info.contents = GET_CONTENTS(desc);
5721 - info.read_exec_only = !GET_WRITABLE(desc);
5722 - info.limit_in_pages = GET_LIMIT_PAGES(desc);
5723 - info.seg_not_present = !GET_PRESENT(desc);
5724 - info.useable = GET_USEABLE(desc);
5726 - if (copy_to_user(u_info, &info, sizeof(info)))
5731 unsigned long arch_align_stack(unsigned long sp)
5733 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5734 sp -= get_random_int() % 8192;
5738 +unsigned long arch_randomize_brk(struct mm_struct *mm)
5740 + unsigned long range_end = mm->brk + 0x02000000;
5741 + return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
5743 --- a/arch/x86/kernel/process_64-xen.c
5744 +++ b/arch/x86/kernel/process_64-xen.c
5747 * Pentium III FXSR, SSE support
5748 * Gareth Hughes <gareth@valinux.com>, May 2000
5755 #include <linux/cpu.h>
5756 #include <linux/errno.h>
5757 #include <linux/sched.h>
5758 +#include <linux/fs.h>
5759 #include <linux/kernel.h>
5760 #include <linux/mm.h>
5761 -#include <linux/fs.h>
5762 #include <linux/elfcore.h>
5763 #include <linux/smp.h>
5764 #include <linux/slab.h>
5765 #include <linux/user.h>
5766 -#include <linux/module.h>
5767 -#include <linux/a.out.h>
5768 #include <linux/interrupt.h>
5769 +#include <linux/utsname.h>
5770 #include <linux/delay.h>
5771 +#include <linux/module.h>
5772 #include <linux/ptrace.h>
5773 -#include <linux/utsname.h>
5774 #include <linux/random.h>
5775 #include <linux/notifier.h>
5776 #include <linux/kprobes.h>
5777 @@ -73,7 +72,6 @@ EXPORT_SYMBOL(boot_option_idle_override)
5779 void (*pm_idle)(void);
5780 EXPORT_SYMBOL(pm_idle);
5781 -static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
5783 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
5785 @@ -81,13 +79,6 @@ void idle_notifier_register(struct notif
5787 atomic_notifier_chain_register(&idle_notifier, n);
5789 -EXPORT_SYMBOL_GPL(idle_notifier_register);
5791 -void idle_notifier_unregister(struct notifier_block *n)
5793 - atomic_notifier_chain_unregister(&idle_notifier, n);
5795 -EXPORT_SYMBOL(idle_notifier_unregister);
5797 void enter_idle(void)
5799 @@ -116,7 +107,7 @@ void exit_idle(void)
5800 * to poll the ->need_resched flag instead of waiting for the
5801 * cross-CPU IPI to arrive. Use this option with caution.
5803 -static void poll_idle (void)
5804 +static void poll_idle(void)
5808 @@ -131,10 +122,19 @@ static void xen_idle(void)
5811 local_irq_disable();
5812 - if (!need_resched())
5815 - local_irq_enable();
5816 + if (!need_resched()) {
5821 + t0n = ktime_to_ns(t0);
5822 + safe_halt(); /* enables interrupts racelessly */
5823 + local_irq_disable();
5825 + t1n = ktime_to_ns(t1);
5826 + sched_clock_idle_wakeup_event(t1n - t0n);
5828 + local_irq_enable();
5829 current_thread_info()->status |= TS_POLLING;
5832 @@ -161,19 +161,15 @@ static inline void play_dead(void)
5833 * low exit latency (ie sit in a loop waiting for
5834 * somebody to say that they'd like to reschedule)
5836 -void cpu_idle (void)
5837 +void cpu_idle(void)
5839 current_thread_info()->status |= TS_POLLING;
5840 /* endless idle loop with no priority at all */
5842 + tick_nohz_stop_sched_tick();
5843 while (!need_resched()) {
5846 - if (__get_cpu_var(cpu_idle_state))
5847 - __get_cpu_var(cpu_idle_state) = 0;
5849 - tick_nohz_stop_sched_tick();
5852 idle = xen_idle; /* no alternatives */
5853 if (cpu_is_offline(smp_processor_id()))
5854 @@ -203,49 +199,27 @@ static void do_nothing(void *unused)
5859 + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
5860 + * pm_idle and update to new pm_idle value. Required while changing pm_idle
5861 + * handler on SMP systems.
5863 + * Caller must have changed pm_idle to the new value before the call. Old
5864 + * pm_idle value will not be used by any CPU after the return of this function.
5866 void cpu_idle_wait(void)
5868 - unsigned int cpu, this_cpu = get_cpu();
5869 - cpumask_t map, tmp = current->cpus_allowed;
5871 - set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5875 - for_each_online_cpu(cpu) {
5876 - per_cpu(cpu_idle_state, cpu) = 1;
5877 - cpu_set(cpu, map);
5880 - __get_cpu_var(cpu_idle_state) = 0;
5885 - for_each_online_cpu(cpu) {
5886 - if (cpu_isset(cpu, map) &&
5887 - !per_cpu(cpu_idle_state, cpu))
5888 - cpu_clear(cpu, map);
5890 - cpus_and(map, map, cpu_online_map);
5892 - * We waited 1 sec, if a CPU still did not call idle
5893 - * it may be because it is in idle and not waking up
5894 - * because it has nothing to do.
5895 - * Give all the remaining CPUS a kick.
5897 - smp_call_function_mask(map, do_nothing, 0, 0);
5898 - } while (!cpus_empty(map));
5900 - set_cpus_allowed(current, tmp);
5902 + /* kick all the CPUs so that they exit out of pm_idle */
5903 + smp_call_function(do_nothing, NULL, 0, 1);
5905 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5907 -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5908 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5912 -static int __init idle_setup (char *str)
5913 +static int __init idle_setup(char *str)
5915 if (!strcmp(str, "poll")) {
5916 printk("using polling idle threads.\n");
5917 @@ -260,13 +234,13 @@ static int __init idle_setup (char *str)
5919 early_param("idle", idle_setup);
5921 -/* Prints also some state that isn't saved in the pt_regs */
5922 +/* Prints also some state that isn't saved in the pt_regs */
5923 void __show_regs(struct pt_regs * regs)
5925 unsigned long fs, gs, shadowgs;
5926 unsigned long d0, d1, d2, d3, d6, d7;
5927 - unsigned int fsindex,gsindex;
5928 - unsigned int ds,cs,es;
5929 + unsigned int fsindex, gsindex;
5930 + unsigned int ds, cs, es;
5934 @@ -275,16 +249,16 @@ void __show_regs(struct pt_regs * regs)
5935 init_utsname()->release,
5936 (int)strcspn(init_utsname()->version, " "),
5937 init_utsname()->version);
5938 - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
5939 - printk_address(regs->rip);
5940 - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
5942 + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
5943 + printk_address(regs->ip, 1);
5944 + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
5946 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
5947 - regs->rax, regs->rbx, regs->rcx);
5948 + regs->ax, regs->bx, regs->cx);
5949 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
5950 - regs->rdx, regs->rsi, regs->rdi);
5951 + regs->dx, regs->si, regs->di);
5952 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
5953 - regs->rbp, regs->r8, regs->r9);
5954 + regs->bp, regs->r8, regs->r9);
5955 printk("R10: %016lx R11: %016lx R12: %016lx\n",
5956 regs->r10, regs->r11, regs->r12);
5957 printk("R13: %016lx R14: %016lx R15: %016lx\n",
5958 @@ -318,7 +292,7 @@ void show_regs(struct pt_regs *regs)
5960 printk("CPU %d:", smp_processor_id());
5962 - show_trace(NULL, regs, (void *)(regs + 1));
5963 + show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
5967 @@ -329,7 +303,7 @@ void exit_thread(void)
5968 struct task_struct *me = current;
5969 struct thread_struct *t = &me->thread;
5971 - if (me->thread.io_bitmap_ptr) {
5972 + if (me->thread.io_bitmap_ptr) {
5973 #ifndef CONFIG_X86_NO_TSS
5974 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
5976 @@ -382,7 +356,7 @@ void flush_thread(void)
5977 tsk->thread.debugreg3 = 0;
5978 tsk->thread.debugreg6 = 0;
5979 tsk->thread.debugreg7 = 0;
5980 - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5981 + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5983 * Forget coprocessor state..
5985 @@ -405,26 +379,21 @@ void release_thread(struct task_struct *
5987 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
5989 - struct user_desc ud = {
5990 + struct user_desc ud = {
5994 .limit_in_pages = 1,
5997 - struct n_desc_struct *desc = (void *)t->thread.tls_array;
5998 + struct desc_struct *desc = t->thread.tls_array;
6000 - desc->a = LDT_entry_a(&ud);
6001 - desc->b = LDT_entry_b(&ud);
6002 + fill_ldt(desc, &ud);
6005 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
6007 - struct desc_struct *desc = (void *)t->thread.tls_array;
6009 - return desc->base0 |
6010 - (((u32)desc->base1) << 16) |
6011 - (((u32)desc->base2) << 24);
6012 + return get_desc_base(&t->thread.tls_array[tls]);
6016 @@ -436,7 +405,7 @@ void prepare_to_copy(struct task_struct
6020 -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
6021 +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
6022 unsigned long unused,
6023 struct task_struct * p, struct pt_regs * regs)
6025 @@ -448,14 +417,14 @@ int copy_thread(int nr, unsigned long cl
6026 (THREAD_SIZE + task_stack_page(p))) - 1;
6029 - childregs->rax = 0;
6030 - childregs->rsp = rsp;
6032 - childregs->rsp = (unsigned long)childregs;
6034 - p->thread.rsp = (unsigned long) childregs;
6035 - p->thread.rsp0 = (unsigned long) (childregs+1);
6036 - p->thread.userrsp = me->thread.userrsp;
6037 + childregs->ax = 0;
6038 + childregs->sp = sp;
6040 + childregs->sp = (unsigned long)childregs;
6042 + p->thread.sp = (unsigned long) childregs;
6043 + p->thread.sp0 = (unsigned long) (childregs+1);
6044 + p->thread.usersp = me->thread.usersp;
6046 set_tsk_thread_flag(p, TIF_FORK);
6048 @@ -476,7 +445,7 @@ int copy_thread(int nr, unsigned long cl
6049 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
6051 set_tsk_thread_flag(p, TIF_IO_BITMAP);
6056 * Set a new TLS for the child thread?
6057 @@ -484,7 +453,8 @@ int copy_thread(int nr, unsigned long cl
6058 if (clone_flags & CLONE_SETTLS) {
6059 #ifdef CONFIG_IA32_EMULATION
6060 if (test_thread_flag(TIF_IA32))
6061 - err = ia32_child_tls(p, childregs);
6062 + err = do_set_thread_area(p, -1,
6063 + (struct user_desc __user *)childregs->si, 0);
6066 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
6067 @@ -502,26 +472,32 @@ out:
6071 -static inline void __save_init_fpu( struct task_struct *tsk )
6073 - asm volatile( "rex64 ; fxsave %0 ; fnclex"
6074 - : "=m" (tsk->thread.i387.fxsave));
6075 - tsk->thread_info->status &= ~TS_USEDFPU;
6079 * This special macro can be used to load a debugging register
6081 -#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
6082 +#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
6084 static inline void __switch_to_xtra(struct task_struct *prev_p,
6085 - struct task_struct *next_p)
6086 + struct task_struct *next_p)
6088 struct thread_struct *prev, *next;
6089 + unsigned long debugctl;
6091 prev = &prev_p->thread,
6092 next = &next_p->thread;
6094 + debugctl = prev->debugctlmsr;
6095 + if (next->ds_area_msr != prev->ds_area_msr) {
6096 + /* we clear debugctl to make sure DS
6097 + * is not in use when we change it */
6099 + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
6100 + wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
6103 + if (next->debugctlmsr != debugctl)
6104 + wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
6106 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
6109 @@ -531,12 +507,20 @@ static inline void __switch_to_xtra(stru
6115 + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
6116 + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
6118 + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
6119 + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
6124 * switch_to(x,y) should switch tasks from x to y.
6126 - * This could still be optimized:
6127 + * This could still be optimized:
6128 * - fold all the options into a flag word and test it with a single test.
6129 * - could test fs/gs bitsliced
6131 @@ -547,7 +531,7 @@ __switch_to(struct task_struct *prev_p,
6133 struct thread_struct *prev = &prev_p->thread,
6134 *next = &next_p->thread;
6135 - int cpu = smp_processor_id();
6136 + int cpu = smp_processor_id();
6137 #ifndef CONFIG_X86_NO_TSS
6138 struct tss_struct *tss = &per_cpu(init_tss, cpu);
6140 @@ -581,11 +565,12 @@ __switch_to(struct task_struct *prev_p,
6141 prev_p->fpu_counter = 0;
6144 - * Reload esp0, LDT and the page table pointer:
6146 + * This is load_sp0(tss, next) with a multicall.
6148 mcl->op = __HYPERVISOR_stack_switch;
6149 mcl->args[0] = __KERNEL_DS;
6150 - mcl->args[1] = next->rsp0;
6151 + mcl->args[1] = next->sp0;
6155 @@ -593,11 +578,12 @@ __switch_to(struct task_struct *prev_p,
6156 * This is load_TLS(next, cpu) with multicalls.
6159 - if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
6160 + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
6161 + next->tls_array[i].b != prev->tls_array[i].b)) { \
6162 mcl->op = __HYPERVISOR_update_descriptor; \
6163 mcl->args[0] = virt_to_machine( \
6164 - &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
6165 - mcl->args[1] = next->tls_array[i]; \
6166 + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
6167 + mcl->args[1] = *(u64 *)&next->tls_array[i]; \
6171 @@ -605,7 +591,7 @@ __switch_to(struct task_struct *prev_p,
6174 if (unlikely(prev->iopl != next->iopl)) {
6175 - iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
6176 + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
6177 #if CONFIG_XEN_COMPAT > 0x030002
6178 mcl->op = __HYPERVISOR_physdev_op;
6179 mcl->args[0] = PHYSDEVOP_set_iopl;
6180 @@ -669,8 +655,8 @@ __switch_to(struct task_struct *prev_p,
6182 * Switch the PDA context.
6184 - prev->userrsp = read_pda(oldrsp);
6185 - write_pda(oldrsp, next->userrsp);
6186 + prev->usersp = read_pda(oldrsp);
6187 + write_pda(oldrsp, next->usersp);
6188 write_pda(pcurrent, next_p);
6189 write_pda(kernelstack,
6190 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
6191 @@ -687,7 +673,8 @@ __switch_to(struct task_struct *prev_p,
6193 * Now maybe reload the debug registers
6195 - if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
6196 + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
6197 + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
6198 __switch_to_xtra(prev_p, next_p);
6200 /* If the task has used fpu the last 5 timeslices, just do a full
6201 @@ -702,23 +689,18 @@ __switch_to(struct task_struct *prev_p,
6203 * sys_execve() executes a new program.
6207 long sys_execve(char __user *name, char __user * __user *argv,
6208 - char __user * __user *envp, struct pt_regs regs)
6209 + char __user * __user *envp, struct pt_regs *regs)
6214 filename = getname(name);
6215 error = PTR_ERR(filename);
6216 - if (IS_ERR(filename))
6217 + if (IS_ERR(filename))
6219 - error = do_execve(filename, argv, envp, &regs);
6221 - task_lock(current);
6222 - current->ptrace &= ~PT_DTRACE;
6223 - task_unlock(current);
6225 + error = do_execve(filename, argv, envp, regs);
6229 @@ -728,18 +710,18 @@ void set_personality_64bit(void)
6230 /* inherit personality from parent */
6232 /* Make sure to be in 64bit mode */
6233 - clear_thread_flag(TIF_IA32);
6234 + clear_thread_flag(TIF_IA32);
6236 /* TBD: overwrites user setup. Should have two bits.
6237 But 64bit processes have always behaved this way,
6238 so it's not too bad. The main problem is just that
6239 - 32bit childs are affected again. */
6240 + 32bit childs are affected again. */
6241 current->personality &= ~READ_IMPLIES_EXEC;
6244 asmlinkage long sys_fork(struct pt_regs *regs)
6246 - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
6247 + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
6251 @@ -747,7 +729,7 @@ sys_clone(unsigned long clone_flags, uns
6252 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
6255 - newsp = regs->rsp;
6257 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
6260 @@ -763,29 +745,29 @@ sys_clone(unsigned long clone_flags, uns
6262 asmlinkage long sys_vfork(struct pt_regs *regs)
6264 - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
6265 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
6269 unsigned long get_wchan(struct task_struct *p)
6271 unsigned long stack;
6276 if (!p || p == current || p->state==TASK_RUNNING)
6278 stack = (unsigned long)task_stack_page(p);
6279 - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
6280 + if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
6282 - fp = *(u64 *)(p->thread.rsp);
6283 + fp = *(u64 *)(p->thread.sp);
6285 if (fp < (unsigned long)stack ||
6286 fp > (unsigned long)stack+THREAD_SIZE)
6288 - rip = *(u64 *)(fp+8);
6289 - if (!in_sched_functions(rip))
6291 + ip = *(u64 *)(fp+8);
6292 + if (!in_sched_functions(ip))
6295 } while (count++ < 16);
6297 @@ -827,19 +809,19 @@ long do_arch_prctl(struct task_struct *t
6298 /* Not strictly needed for fs, but do it for symmetry
6300 if (addr >= TASK_SIZE_OF(task))
6304 - /* handle small bases via the GDT because that's faster to
6305 + /* handle small bases via the GDT because that's faster to
6307 - if (addr <= 0xffffffff) {
6308 + if (addr <= 0xffffffff) {
6309 set_32bit_tls(task, FS_TLS, addr);
6311 - load_TLS(&task->thread, cpu);
6313 + load_TLS(&task->thread, cpu);
6314 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
6316 task->thread.fsindex = FS_TLS_SEL;
6317 task->thread.fs = 0;
6320 task->thread.fsindex = 0;
6321 task->thread.fs = addr;
6323 @@ -852,24 +834,24 @@ long do_arch_prctl(struct task_struct *t
6327 - case ARCH_GET_FS: {
6328 - unsigned long base;
6329 + case ARCH_GET_FS: {
6330 + unsigned long base;
6331 if (task->thread.fsindex == FS_TLS_SEL)
6332 base = read_32bit_tls(task, FS_TLS);
6334 rdmsrl(MSR_FS_BASE, base);
6336 base = task->thread.fs;
6337 - ret = put_user(base, (unsigned long __user *)addr);
6339 + ret = put_user(base, (unsigned long __user *)addr);
6342 - case ARCH_GET_GS: {
6343 + case ARCH_GET_GS: {
6346 if (task->thread.gsindex == GS_TLS_SEL)
6347 base = read_32bit_tls(task, GS_TLS);
6349 - asm("movl %%gs,%0" : "=r" (gsindex));
6350 + asm("movl %%gs,%0" : "=r" (gsindex));
6352 rdmsrl(MSR_KERNEL_GS_BASE, base);
6354 @@ -877,40 +859,21 @@ long do_arch_prctl(struct task_struct *t
6357 base = task->thread.gs;
6358 - ret = put_user(base, (unsigned long __user *)addr);
6359 + ret = put_user(base, (unsigned long __user *)addr);
6374 long sys_arch_prctl(int code, unsigned long addr)
6376 return do_arch_prctl(current, code, addr);
6380 - * Capture the user space registers if the task is not running (in user space)
6382 -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
6384 - struct pt_regs *pp, ptregs;
6386 - pp = task_pt_regs(tsk);
6389 - ptregs.cs &= 0xffff;
6390 - ptregs.ss &= 0xffff;
6392 - elf_core_copy_regs(regs, &ptregs);
6394 - boot_option_idle_override = 1;
6398 unsigned long arch_align_stack(unsigned long sp)
6399 @@ -919,3 +882,9 @@ unsigned long arch_align_stack(unsigned
6400 sp -= get_random_int() % 8192;
6404 +unsigned long arch_randomize_brk(struct mm_struct *mm)
6406 + unsigned long range_end = mm->brk + 0x02000000;
6407 + return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
6409 --- a/arch/x86/kernel/quirks-xen.c
6410 +++ b/arch/x86/kernel/quirks-xen.c
6412 static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
6418 /* BIOS may enable hardware IRQ balancing for
6419 * E7520/E7320/E7525(revision ID 0x9 and below)
6420 @@ -24,14 +24,17 @@ static void __devinit quirk_intel_irqbal
6421 pci_read_config_byte(dev, 0xf4, &config);
6422 pci_write_config_byte(dev, 0xf4, config|0x2);
6424 - /* read xTPR register */
6425 - raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
6427 + * read xTPR register. We may not have a pci_dev for device 8
6428 + * because it might be hidden until the above write.
6430 + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);
6432 if (!(word & (1 << 13))) {
6433 struct xen_platform_op op;
6435 - printk(KERN_INFO "Intel E7520/7320/7525 detected. "
6436 - "Disabling irq balancing and affinity\n");
6437 + dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
6438 + "disabling irq balancing and affinity\n");
6439 op.cmd = XENPF_platform_quirk;
6440 op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
6441 WARN_ON(HYPERVISOR_platform_op(&op));
6442 @@ -102,14 +105,16 @@ static void ich_force_enable_hpet(struct
6443 pci_read_config_dword(dev, 0xF0, &rcba);
6446 - printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n");
6447 + dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
6448 + "cannot force enable HPET\n");
6452 /* use bits 31:14, 16 kB aligned */
6453 rcba_base = ioremap_nocache(rcba, 0x4000);
6454 if (rcba_base == NULL) {
6455 - printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n");
6456 + dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
6457 + "cannot force enable HPET\n");
6461 @@ -120,8 +125,8 @@ static void ich_force_enable_hpet(struct
6462 /* HPET is enabled in HPTC. Just not reported by BIOS */
6464 force_hpet_address = 0xFED00000 | (val << 12);
6465 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6466 - force_hpet_address);
6467 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6468 + "0x%lx\n", force_hpet_address);
6472 @@ -140,11 +145,12 @@ static void ich_force_enable_hpet(struct
6474 force_hpet_address = 0;
6476 - printk(KERN_DEBUG "Failed to force enable HPET\n");
6477 + dev_printk(KERN_DEBUG, &dev->dev,
6478 + "Failed to force enable HPET\n");
6480 force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
6481 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6482 - force_hpet_address);
6483 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6484 + "0x%lx\n", force_hpet_address);
6488 @@ -160,6 +166,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
6489 ich_force_enable_hpet);
6490 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
6491 ich_force_enable_hpet);
6492 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
6493 + ich_force_enable_hpet);
6496 static struct pci_dev *cached_dev;
6497 @@ -204,8 +212,8 @@ static void old_ich_force_enable_hpet(st
6500 force_hpet_address = 0xFED00000 | (val << 12);
6501 - printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6502 - force_hpet_address);
6503 + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6504 + force_hpet_address);
6508 @@ -225,14 +233,14 @@ static void old_ich_force_enable_hpet(st
6509 /* HPET is enabled in HPTC. Just not reported by BIOS */
6511 force_hpet_address = 0xFED00000 | (val << 12);
6512 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6513 - force_hpet_address);
6514 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6515 + "0x%lx\n", force_hpet_address);
6517 force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
6521 - printk(KERN_DEBUG "Failed to force enable HPET\n");
6522 + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6526 @@ -290,8 +298,8 @@ static void vt8237_force_enable_hpet(str
6529 force_hpet_address = (val & ~0x3ff);
6530 - printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6531 - force_hpet_address);
6532 + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6533 + force_hpet_address);
6537 @@ -305,14 +313,14 @@ static void vt8237_force_enable_hpet(str
6538 pci_read_config_dword(dev, 0x68, &val);
6540 force_hpet_address = (val & ~0x3ff);
6541 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6542 - force_hpet_address);
6543 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6544 + "0x%lx\n", force_hpet_address);
6546 force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
6550 - printk(KERN_DEBUG "Failed to force enable HPET\n");
6551 + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6554 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
6555 @@ -340,7 +348,7 @@ static void nvidia_force_enable_hpet(str
6556 pci_read_config_dword(dev, 0x44, &val);
6557 force_hpet_address = val & 0xfffffffe;
6558 force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
6559 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6560 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
6561 force_hpet_address);
6564 @@ -353,6 +361,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6565 nvidia_force_enable_hpet);
6568 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
6569 + nvidia_force_enable_hpet);
6570 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
6571 nvidia_force_enable_hpet);
6572 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
6573 @@ -373,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6574 void force_hpet_resume(void)
6576 switch (force_hpet_resume_type) {
6577 - case ICH_FORCE_HPET_RESUME:
6578 - return ich_force_hpet_resume();
6580 - case OLD_ICH_FORCE_HPET_RESUME:
6581 - return old_ich_force_hpet_resume();
6583 - case VT8237_FORCE_HPET_RESUME:
6584 - return vt8237_force_hpet_resume();
6586 - case NVIDIA_FORCE_HPET_RESUME:
6587 - return nvidia_force_hpet_resume();
6590 + case ICH_FORCE_HPET_RESUME:
6591 + ich_force_hpet_resume();
6593 + case OLD_ICH_FORCE_HPET_RESUME:
6594 + old_ich_force_hpet_resume();
6596 + case VT8237_FORCE_HPET_RESUME:
6597 + vt8237_force_hpet_resume();
6599 + case NVIDIA_FORCE_HPET_RESUME:
6600 + nvidia_force_hpet_resume();
6606 --- a/arch/x86/kernel/rtc.c
6607 +++ b/arch/x86/kernel/rtc.c
6608 @@ -181,6 +181,10 @@ unsigned long read_persistent_clock(void
6610 unsigned long retval, flags;
6613 + if (!is_initial_xendomain())
6614 + return xen_read_persistent_clock();
6616 spin_lock_irqsave(&rtc_lock, flags);
6617 retval = get_wallclock();
6618 spin_unlock_irqrestore(&rtc_lock, flags);
6619 @@ -190,6 +194,10 @@ unsigned long read_persistent_clock(void
6621 int update_persistent_clock(struct timespec now)
6624 + if (xen_update_persistent_clock() < 0 || xen_independent_wallclock())
6627 return set_rtc_mmss(now.tv_sec);
6630 --- a/arch/x86/kernel/setup_32-xen.c
6631 +++ b/arch/x86/kernel/setup_32-xen.c
6633 #include <linux/crash_dump.h>
6634 #include <linux/dmi.h>
6635 #include <linux/pfn.h>
6636 +#include <linux/pci.h>
6637 +#include <linux/init_ohci1394_dma.h>
6639 #include <video/edid.h>
6641 +#include <asm/mtrr.h>
6642 #include <asm/apic.h>
6643 #include <asm/e820.h>
6644 #include <asm/mpspec.h>
6645 @@ -79,14 +82,83 @@ static struct notifier_block xen_panic_b
6646 xen_panic_event, NULL, 0 /* try to go last */
6649 -int disable_pse __cpuinitdata = 0;
6654 -extern struct resource code_resource;
6655 -extern struct resource data_resource;
6656 -extern struct resource bss_resource;
6657 +static struct resource data_resource = {
6658 + .name = "Kernel data",
6661 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6664 +static struct resource code_resource = {
6665 + .name = "Kernel code",
6668 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6671 +static struct resource bss_resource = {
6672 + .name = "Kernel bss",
6675 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6678 +static struct resource video_ram_resource = {
6679 + .name = "Video RAM area",
6682 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6685 +static struct resource standard_io_resources[] = { {
6689 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6694 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6699 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6704 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6706 + .name = "keyboard",
6709 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6711 + .name = "dma page reg",
6714 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6719 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6724 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6729 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6732 /* cpu data as detected by the assembly code in head.S */
6733 struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6734 @@ -94,13 +166,16 @@ struct cpuinfo_x86 new_cpu_data __cpuini
6735 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6736 EXPORT_SYMBOL(boot_cpu_data);
6738 +#ifndef CONFIG_X86_PAE
6739 unsigned long mmu_cr4_features;
6741 +unsigned long mmu_cr4_features = X86_CR4_PAE;
6744 /* for MCA, but anyone else can use it if they want */
6745 unsigned int machine_id;
6746 unsigned int machine_submodel_id;
6747 unsigned int BIOS_revision;
6748 -unsigned int mca_pentium_flag;
6750 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
6751 int bootloader_type;
6752 @@ -131,13 +206,17 @@ extern int root_mountflags;
6754 unsigned long saved_videomode;
6756 -#define RAMDISK_IMAGE_START_MASK 0x07FF
6757 +#define RAMDISK_IMAGE_START_MASK 0x07FF
6758 #define RAMDISK_PROMPT_FLAG 0x8000
6759 -#define RAMDISK_LOAD_FLAG 0x4000
6760 +#define RAMDISK_LOAD_FLAG 0x4000
6762 static char __initdata command_line[COMMAND_LINE_SIZE];
6764 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
6765 struct boot_params __initdata boot_params;
6767 +struct boot_params boot_params;
6771 * Point at the empty zero page to start with. We map the real shared_info
6772 @@ -198,8 +277,7 @@ static int __init parse_mem(char *arg)
6775 if (strcmp(arg, "nopentium") == 0) {
6776 - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6778 + setup_clear_cpu_cap(X86_FEATURE_PSE);
6780 /* If the user specifies memory size, we
6781 * limit the BIOS-provided memory map to
6782 @@ -208,7 +286,7 @@ static int __init parse_mem(char *arg)
6783 * trim the existing memory map.
6785 unsigned long long mem_size;
6788 mem_size = memparse(arg, &arg);
6789 limit_regions(mem_size);
6790 user_defined_memmap = 1;
6791 @@ -350,7 +428,7 @@ static void __init reserve_ebda_region(v
6793 addr = get_bios_ebda();
6795 - reserve_bootmem(addr, PAGE_SIZE);
6796 + reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
6800 @@ -365,8 +443,6 @@ static unsigned long __init setup_memory
6801 min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
6802 xen_start_info->nr_pt_frames;
6806 max_low_pfn = find_max_low_pfn();
6808 #ifdef CONFIG_HIGHMEM
6809 @@ -447,7 +523,8 @@ static void __init reserve_crashkernel(v
6810 (unsigned long)(total_mem >> 20));
6811 crashk_res.start = crash_base;
6812 crashk_res.end = crash_base + crash_size - 1;
6813 - reserve_bootmem(crash_base, crash_size);
6814 + reserve_bootmem(crash_base, crash_size,
6817 printk(KERN_INFO "crashkernel reservation failed - "
6818 "you have to specify a base address\n");
6819 @@ -461,6 +538,99 @@ static inline void __init reserve_crashk
6823 +#ifdef CONFIG_BLK_DEV_INITRD
6825 +static bool do_relocate_initrd = false;
6827 +static void __init reserve_initrd(void)
6829 + unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6830 + unsigned long ramdisk_size = xen_start_info->mod_len;
6831 + unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
6832 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6833 + unsigned long ramdisk_here;
6837 + if (!xen_start_info->mod_start || !ramdisk_size)
6838 + return; /* No initrd provided by bootloader */
6840 + if (ramdisk_end < ramdisk_image) {
6841 + printk(KERN_ERR "initrd wraps around end of memory, "
6842 + "disabling initrd\n");
6845 + if (ramdisk_size >= end_of_lowmem/2) {
6846 + printk(KERN_ERR "initrd too large to handle, "
6847 + "disabling initrd\n");
6850 + if (ramdisk_end <= end_of_lowmem) {
6851 + /* All in lowmem, easy case */
6852 + reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
6853 + initrd_start = ramdisk_image + PAGE_OFFSET;
6854 + initrd_end = initrd_start+ramdisk_size;
6858 + /* We need to move the initrd down into lowmem */
6859 + ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
6861 + /* Note: this includes all the lowmem currently occupied by
6862 + the initrd, we rely on that fact to keep the data intact. */
6863 + reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
6864 + initrd_start = ramdisk_here + PAGE_OFFSET;
6865 + initrd_end = initrd_start + ramdisk_size;
6867 + do_relocate_initrd = true;
6870 +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
6872 +static void __init relocate_initrd(void)
6874 + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
6875 + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
6876 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6877 + unsigned long ramdisk_here;
6878 + unsigned long slop, clen, mapaddr;
6881 + if (!do_relocate_initrd)
6884 + ramdisk_here = initrd_start - PAGE_OFFSET;
6886 + q = (char *)initrd_start;
6888 + /* Copy any lowmem portion of the initrd */
6889 + if (ramdisk_image < end_of_lowmem) {
6890 + clen = end_of_lowmem - ramdisk_image;
6891 + p = (char *)__va(ramdisk_image);
6892 + memcpy(q, p, clen);
6894 + ramdisk_image += clen;
6895 + ramdisk_size -= clen;
6898 + /* Copy the highmem portion of the initrd */
6899 + while (ramdisk_size) {
6900 + slop = ramdisk_image & ~PAGE_MASK;
6901 + clen = ramdisk_size;
6902 + if (clen > MAX_MAP_CHUNK-slop)
6903 + clen = MAX_MAP_CHUNK-slop;
6904 + mapaddr = ramdisk_image & PAGE_MASK;
6905 + p = early_ioremap(mapaddr, clen+slop);
6906 + memcpy(q, p+slop, clen);
6907 + early_iounmap(p, clen+slop);
6909 + ramdisk_image += clen;
6910 + ramdisk_size -= clen;
6914 +#endif /* CONFIG_BLK_DEV_INITRD */
6916 void __init setup_bootmem_allocator(void)
6918 unsigned long bootmap_size;
6919 @@ -478,14 +648,15 @@ void __init setup_bootmem_allocator(void
6920 * bootmem allocator with an invalid RAM area.
6922 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
6923 - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
6924 + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
6929 * reserve physical page 0 - it's a special BIOS page on many boxes,
6930 * enabling clean reboots, SMP operation, laptop functions.
6932 - reserve_bootmem(0, PAGE_SIZE);
6933 + reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
6935 /* reserve EBDA region, it's a 4K region */
6936 reserve_ebda_region();
6937 @@ -495,7 +666,7 @@ void __init setup_bootmem_allocator(void
6938 unless you have no PS/2 mouse plugged in. */
6939 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
6940 boot_cpu_data.x86 == 6)
6941 - reserve_bootmem(0xa0000 - 4096, 4096);
6942 + reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
6946 @@ -503,7 +674,7 @@ void __init setup_bootmem_allocator(void
6947 * FIXME: Don't need the extra page at 4K, but need to fix
6948 * trampoline before removing it. (see the GDT stuff)
6950 - reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
6951 + reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
6953 #ifdef CONFIG_ACPI_SLEEP
6955 @@ -511,29 +682,12 @@ void __init setup_bootmem_allocator(void
6957 acpi_reserve_bootmem();
6959 - numa_kva_reserve();
6960 #endif /* !CONFIG_XEN */
6962 #ifdef CONFIG_BLK_DEV_INITRD
6963 - if (xen_start_info->mod_start) {
6964 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6965 - unsigned long ramdisk_size = xen_start_info->mod_len;
6966 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
6967 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6969 - if (ramdisk_end <= end_of_lowmem) {
6970 - /*reserve_bootmem(ramdisk_image, ramdisk_size);*/
6971 - initrd_start = ramdisk_image + PAGE_OFFSET;
6972 - initrd_end = initrd_start+ramdisk_size;
6973 - initrd_below_start_ok = 1;
6975 - printk(KERN_ERR "initrd extends beyond end of memory "
6976 - "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
6977 - ramdisk_end, end_of_lowmem);
6983 + numa_kva_reserve();
6984 reserve_crashkernel();
6987 @@ -600,20 +754,14 @@ void __init setup_arch(char **cmdline_p)
6988 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
6989 pre_setup_arch_hook();
6991 + early_ioremap_init();
6993 prefill_possible_map();
6997 - * FIXME: This isn't an official loader_type right
6998 - * now but does currently work with elilo.
6999 - * If we were configured as an EFI kernel, check to make
7000 - * sure that we were loaded correctly from elilo and that
7001 - * the system table is valid. If not, then initialize normally.
7004 - if ((boot_params.hdr.type_of_loader == 0x50) &&
7005 - boot_params.efi_info.efi_systab)
7006 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
7011 @@ -653,12 +801,9 @@ void __init setup_arch(char **cmdline_p)
7018 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7019 - print_memory_map(memory_setup());
7022 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7023 + print_memory_map(memory_setup());
7027 @@ -691,6 +836,17 @@ void __init setup_arch(char **cmdline_p)
7028 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
7029 *cmdline_p = command_line;
7034 + /* update e820 for memory not covered by WB MTRRs */
7038 + if (mtrr_trim_uncached_memory(max_pfn))
7042 max_low_pfn = setup_memory();
7045 @@ -715,6 +871,16 @@ void __init setup_arch(char **cmdline_p)
7046 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
7051 + * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
7054 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7055 + if (init_ohci1394_dma_early)
7056 + init_ohci1394_dma_on_all_controllers();
7059 remapped_pgdat_init();
7062 @@ -800,16 +966,20 @@ void __init setup_arch(char **cmdline_p)
7063 * NOTE: at this point the bootmem allocator is fully available.
7066 +#ifdef CONFIG_BLK_DEV_INITRD
7067 + relocate_initrd();
7070 paravirt_post_allocator_init();
7072 if (is_initial_xendomain())
7077 #ifdef CONFIG_X86_GENERICARCH
7078 generic_apic_probe();
7085 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
7086 @@ -827,7 +997,7 @@ void __init setup_arch(char **cmdline_p)
7087 acpi_boot_table_init();
7090 -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7095 @@ -873,3 +1043,30 @@ xen_panic_event(struct notifier_block *t
7096 /* we're never actually going to get here... */
7101 + * Request address space for all standard resources
7103 + * This is called just before pcibios_init(), which is also a
7104 + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
7106 +static int __init request_standard_resources(void)
7110 + /* Nothing to do if not running in dom0. */
7111 + if (!is_initial_xendomain())
7114 + printk(KERN_INFO "Setting up standard PCI resources\n");
7115 + init_iomem_resources(&code_resource, &data_resource, &bss_resource);
7117 + request_resource(&iomem_resource, &video_ram_resource);
7119 + /* request I/O space for devices used on all i[345]86 PCs */
7120 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7121 + request_resource(&ioport_resource, &standard_io_resources[i]);
7125 +subsys_initcall(request_standard_resources);
7126 --- a/arch/x86/kernel/setup_64-xen.c
7127 +++ b/arch/x86/kernel/setup_64-xen.c
7129 #include <linux/ptrace.h>
7130 #include <linux/slab.h>
7131 #include <linux/user.h>
7132 -#include <linux/a.out.h>
7133 #include <linux/screen_info.h>
7134 #include <linux/ioport.h>
7135 #include <linux/delay.h>
7137 #include <linux/crash_dump.h>
7138 #include <linux/root_dev.h>
7139 #include <linux/pci.h>
7140 +#include <linux/efi.h>
7141 #include <linux/acpi.h>
7142 #include <linux/kallsyms.h>
7143 #include <linux/edd.h>
7145 #include <linux/dmi.h>
7146 #include <linux/dma-mapping.h>
7147 #include <linux/ctype.h>
7148 +#include <linux/uaccess.h>
7149 +#include <linux/init_ohci1394_dma.h>
7151 #include <asm/mtrr.h>
7152 #include <asm/uaccess.h>
7153 #include <asm/system.h>
7154 +#include <asm/vsyscall.h>
7156 #include <asm/smp.h>
7157 #include <asm/msr.h>
7159 #include <video/edid.h>
7160 #include <asm/e820.h>
7161 #include <asm/dma.h>
7162 +#include <asm/gart.h>
7163 #include <asm/mpspec.h>
7164 #include <asm/mmu_context.h>
7165 #include <asm/proto.h>
7167 #include <asm/sections.h>
7168 #include <asm/dmi.h>
7169 #include <asm/cacheflush.h>
7170 +#include <asm/mce.h>
7171 +#include <asm/ds.h>
7172 +#include <asm/topology.h>
7174 #include <linux/percpu.h>
7175 #include <xen/interface/physdev.h>
7176 @@ -108,6 +115,8 @@ EXPORT_SYMBOL(xen_start_info);
7177 struct cpuinfo_x86 boot_cpu_data __read_mostly;
7178 EXPORT_SYMBOL(boot_cpu_data);
7180 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
7182 unsigned long mmu_cr4_features;
7184 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
7185 @@ -117,7 +126,7 @@ unsigned long saved_video_mode;
7187 int force_mwait __cpuinitdata;
7193 int dmi_alloc_index;
7194 @@ -163,25 +172,27 @@ struct resource standard_io_resources[]
7196 #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
7198 -struct resource data_resource = {
7199 +static struct resource data_resource = {
7200 .name = "Kernel data",
7203 .flags = IORESOURCE_RAM,
7205 -struct resource code_resource = {
7206 +static struct resource code_resource = {
7207 .name = "Kernel code",
7210 .flags = IORESOURCE_RAM,
7212 -struct resource bss_resource = {
7213 +static struct resource bss_resource = {
7214 .name = "Kernel bss",
7217 .flags = IORESOURCE_RAM,
7220 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
7222 #ifdef CONFIG_PROC_VMCORE
7223 /* elfcorehdr= specifies the location of elf core header
7224 * stored by the crashed kernel. This option will be passed
7225 @@ -205,9 +216,10 @@ contig_initmem_init(unsigned long start_
7226 unsigned long bootmap_size, bootmap;
7228 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
7229 - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
7230 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
7233 - panic("Cannot find bootmem map of size %ld\n",bootmap_size);
7234 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
7235 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
7236 e820_register_active_regions(0, start_pfn, end_pfn);
7238 @@ -215,8 +227,8 @@ contig_initmem_init(unsigned long start_
7240 free_bootmem_with_active_regions(0, end_pfn);
7242 - reserve_bootmem(bootmap, bootmap_size);
7244 + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
7248 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
7249 @@ -249,27 +261,35 @@ static inline void copy_edd(void)
7251 static void __init reserve_crashkernel(void)
7253 - unsigned long long free_mem;
7254 + unsigned long long total_mem;
7255 unsigned long long crash_size, crash_base;
7258 - free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7259 + total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7261 - ret = parse_crashkernel(boot_command_line, free_mem,
7262 + ret = parse_crashkernel(boot_command_line, total_mem,
7263 &crash_size, &crash_base);
7264 if (ret == 0 && crash_size) {
7265 - if (crash_base > 0) {
7266 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7267 - "for crashkernel (System RAM: %ldMB)\n",
7268 - (unsigned long)(crash_size >> 20),
7269 - (unsigned long)(crash_base >> 20),
7270 - (unsigned long)(free_mem >> 20));
7271 - crashk_res.start = crash_base;
7272 - crashk_res.end = crash_base + crash_size - 1;
7273 - reserve_bootmem(crash_base, crash_size);
7275 + if (crash_base <= 0) {
7276 printk(KERN_INFO "crashkernel reservation failed - "
7277 "you have to specify a base address\n");
7281 + if (reserve_bootmem(crash_base, crash_size,
7282 + BOOTMEM_EXCLUSIVE) < 0) {
7283 + printk(KERN_INFO "crashkernel reservation failed - "
7284 + "memory is in use\n");
7288 + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7289 + "for crashkernel (System RAM: %ldMB)\n",
7290 + (unsigned long)(crash_size >> 20),
7291 + (unsigned long)(crash_base >> 20),
7292 + (unsigned long)(total_mem >> 20));
7293 + crashk_res.start = crash_base;
7294 + crashk_res.end = crash_base + crash_size - 1;
7298 @@ -280,37 +300,21 @@ static inline void __init reserve_crashk
7303 -#define EBDA_ADDR_POINTER 0x40E
7305 -unsigned __initdata ebda_addr;
7306 -unsigned __initdata ebda_size;
7308 -static void discover_ebda(void)
7309 +/* Overridden in paravirt.c if CONFIG_PARAVIRT */
7310 +void __attribute__((weak)) __init memory_setup(void)
7313 - * there is a real-mode segmented pointer pointing to the
7314 - * 4K EBDA area at 0x40E
7316 - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
7319 - ebda_size = *(unsigned short *)__va(ebda_addr);
7321 - /* Round EBDA up to pages */
7322 - if (ebda_size == 0)
7325 - ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
7326 - if (ebda_size > 64*1024)
7327 - ebda_size = 64*1024;
7328 + machine_specific_memory_setup();
7331 -#define discover_ebda() ((void)0)
7335 + * setup_arch - architecture-specific boot-time initializations
7337 + * Note: On x86_64, fixmaps are ready for use even before this is called.
7339 void __init setup_arch(char **cmdline_p)
7344 extern struct e820map machine_e820;
7346 @@ -319,6 +323,11 @@ void __init setup_arch(char **cmdline_p)
7347 /* Register a call for panic conditions. */
7348 atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
7350 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7351 + VMASST_TYPE_writable_pagetables));
7353 + early_ioremap_init();
7355 ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
7356 screen_info = boot_params.screen_info;
7358 @@ -335,11 +344,6 @@ void __init setup_arch(char **cmdline_p)
7359 screen_info.orig_video_isVGA = 0;
7363 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7364 - VMASST_TYPE_writable_pagetables));
7368 printk(KERN_INFO "Command line: %s\n", boot_command_line);
7370 @@ -355,7 +359,15 @@ void __init setup_arch(char **cmdline_p)
7371 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
7372 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
7374 - setup_memory_region();
7376 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
7386 if (!boot_params.hdr.root_flags)
7387 @@ -379,28 +391,51 @@ void __init setup_arch(char **cmdline_p)
7389 parse_early_param();
7391 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7392 + if (init_ohci1394_dma_early)
7393 + init_ohci1394_dma_on_all_controllers();
7396 finish_e820_parsing();
7398 + early_gart_iommu_check();
7400 e820_register_active_regions(0, 0, -1UL);
7402 * partially used pages are not usable - thus
7403 * we are rounding upwards:
7405 end_pfn = e820_end_of_ram();
7406 + /* update e820 for memory not covered by WB MTRRs */
7409 + if (mtrr_trim_uncached_memory(end_pfn)) {
7410 + e820_register_active_regions(0, 0, -1UL);
7411 + end_pfn = e820_end_of_ram();
7415 num_physpages = end_pfn;
7416 + max_mapnr = end_pfn;
7422 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
7426 if (is_initial_xendomain())
7431 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
7432 - /* setup to use the static apicid table during kernel startup */
7433 - x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
7434 + /* setup to use the early static init tables during kernel startup */
7435 + x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
7436 + x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
7438 + x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
7442 /* How many end-of-memory variables you have, grandma! */
7443 @@ -419,54 +454,25 @@ void __init setup_arch(char **cmdline_p)
7447 - numa_initmem_init(0, end_pfn);
7448 + numa_initmem_init(0, end_pfn);
7450 contig_initmem_init(0, end_pfn);
7455 - * Reserve kernel, physmap, start info, initial page tables, and
7458 - reserve_bootmem_generic(__pa_symbol(&_text),
7459 - (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
7461 - /* Reserve direct mapping */
7462 - reserve_bootmem_generic(table_start << PAGE_SHIFT,
7463 - (table_end - table_start) << PAGE_SHIFT);
7465 - /* reserve kernel */
7466 - reserve_bootmem_generic(__pa_symbol(&_text),
7467 - __pa_symbol(&_end) - __pa_symbol(&_text));
7468 + early_res_to_bootmem();
7471 +#ifdef CONFIG_ACPI_SLEEP
7473 - * reserve physical page 0 - it's a special BIOS page on many boxes,
7474 - * enabling clean reboots, SMP operation, laptop functions.
7475 + * Reserve low memory region for sleep support.
7477 - reserve_bootmem_generic(0, PAGE_SIZE);
7479 - /* reserve ebda region */
7481 - reserve_bootmem_generic(ebda_addr, ebda_size);
7483 - /* reserve nodemap region */
7485 - reserve_bootmem_generic(nodemap_addr, nodemap_size);
7486 + acpi_reserve_bootmem();
7490 - /* Reserve SMP trampoline */
7491 - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
7494 + efi_reserve_bootmem();
7497 -#ifdef CONFIG_ACPI_SLEEP
7499 - * Reserve low memory region for sleep support.
7501 - acpi_reserve_bootmem();
7503 #ifdef CONFIG_BLK_DEV_INITRD
7505 if (xen_start_info->mod_start) {
7506 @@ -490,6 +496,8 @@ void __init setup_arch(char **cmdline_p)
7507 initrd_below_start_ok = 1;
7510 + /* Assumes everything on node 0 */
7511 + free_bootmem(ramdisk_image, ramdisk_size);
7512 printk(KERN_ERR "initrd extends beyond end of memory "
7513 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
7514 ramdisk_end, end_of_mem);
7515 @@ -499,10 +507,11 @@ void __init setup_arch(char **cmdline_p)
7517 reserve_crashkernel();
7520 #ifdef CONFIG_X86_LOCAL_APIC
7522 - * Find and reserve possible boot-time SMP configuration:
7524 + * Find and reserve possible boot-time SMP configuration:
7529 @@ -590,16 +599,10 @@ void __init setup_arch(char **cmdline_p)
7533 -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7539 - * set this early, so we dont allocate cpu0
7540 - * if MADT list doesnt list BSP first
7541 - * mpparse.c/MP_processor_info() allocates logical cpu numbers.
7543 - cpu_set(0, cpu_present_map);
7546 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
7547 @@ -623,6 +626,7 @@ void __init setup_arch(char **cmdline_p)
7550 init_apic_mappings();
7551 + ioapic_init_mappings();
7554 #if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
7555 @@ -634,18 +638,17 @@ void __init setup_arch(char **cmdline_p)
7558 if (is_initial_xendomain())
7559 - e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
7560 + e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
7561 + &code_resource, &data_resource, &bss_resource);
7563 - e820_reserve_resources(e820.map, e820.nr_map);
7564 + e820_reserve_resources(e820.map, e820.nr_map,
7565 + &code_resource, &data_resource, &bss_resource);
7566 e820_mark_nosave_regions();
7571 /* request I/O space for devices used on all i[345]86 PCs */
7572 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7573 request_resource(&ioport_resource, &standard_io_resources[i]);
7577 if (is_initial_xendomain())
7578 @@ -679,7 +682,8 @@ void __init setup_arch(char **cmdline_p)
7581 #if defined(CONFIG_VGA_CONSOLE)
7582 - conswitchp = &vga_con;
7583 + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
7584 + conswitchp = &vga_con;
7585 #elif defined(CONFIG_DUMMY_CONSOLE)
7586 conswitchp = &dummy_con;
7588 @@ -723,9 +727,10 @@ static void __cpuinit display_cacheinfo(
7590 if (n >= 0x80000005) {
7591 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
7592 - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
7593 - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7594 - c->x86_cache_size=(ecx>>24)+(edx>>24);
7595 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
7596 + "D cache %dK (%d bytes/line)\n",
7597 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7598 + c->x86_cache_size = (ecx>>24) + (edx>>24);
7599 /* On K8 L1 TLB is inclusive, so don't count it */
7602 @@ -739,27 +744,25 @@ static void __cpuinit display_cacheinfo(
7603 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
7604 c->x86_cache_size, ecx & 0xFF);
7607 - if (n >= 0x80000007)
7608 - cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
7609 if (n >= 0x80000008) {
7610 - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7611 + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7612 c->x86_virt_bits = (eax >> 8) & 0xff;
7613 c->x86_phys_bits = eax & 0xff;
7618 -static int nearby_node(int apicid)
7619 +static int __cpuinit nearby_node(int apicid)
7624 for (i = apicid - 1; i >= 0; i--) {
7625 - int node = apicid_to_node[i];
7626 + node = apicid_to_node[i];
7627 if (node != NUMA_NO_NODE && node_online(node))
7630 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
7631 - int node = apicid_to_node[i];
7632 + node = apicid_to_node[i];
7633 if (node != NUMA_NO_NODE && node_online(node))
7636 @@ -771,7 +774,7 @@ static int nearby_node(int apicid)
7637 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
7638 * Assumes number of cores is a power of two.
7640 -static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
7641 +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
7645 @@ -780,7 +783,54 @@ static void __init amd_detect_cmp(struct
7647 unsigned apicid = hard_smp_processor_id();
7649 - unsigned ecx = cpuid_ecx(0x80000008);
7650 + bits = c->x86_coreid_bits;
7652 + /* Low order bits define the core id (index of core in socket) */
7653 + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7654 + /* Convert the APIC ID into the socket ID */
7655 + c->phys_proc_id = phys_pkg_id(bits);
7658 + node = c->phys_proc_id;
7659 + if (apicid_to_node[apicid] != NUMA_NO_NODE)
7660 + node = apicid_to_node[apicid];
7661 + if (!node_online(node)) {
7662 + /* Two possibilities here:
7663 + - The CPU is missing memory and no node was created.
7664 + In that case try picking one from a nearby CPU
7665 + - The APIC IDs differ from the HyperTransport node IDs
7666 + which the K8 northbridge parsing fills in.
7667 + Assume they are all increased by a constant offset,
7668 + but in the same order as the HT nodeids.
7669 + If that doesn't result in a usable node fall back to the
7670 + path for the previous case. */
7672 + int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7674 + if (ht_nodeid >= 0 &&
7675 + apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7676 + node = apicid_to_node[ht_nodeid];
7677 + /* Pick a nearby node */
7678 + if (!node_online(node))
7679 + node = nearby_node(apicid);
7681 + numa_set_node(cpu, node);
7683 + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7688 +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
7691 + unsigned bits, ecx;
7693 + /* Multi core CPU? */
7694 + if (c->extended_cpuid_level < 0x80000008)
7697 + ecx = cpuid_ecx(0x80000008);
7699 c->x86_max_cores = (ecx & 0xff) + 1;
7701 @@ -793,37 +843,8 @@ static void __init amd_detect_cmp(struct
7705 - /* Low order bits define the core id (index of core in socket) */
7706 - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7707 - /* Convert the APIC ID into the socket ID */
7708 - c->phys_proc_id = phys_pkg_id(bits);
7711 - node = c->phys_proc_id;
7712 - if (apicid_to_node[apicid] != NUMA_NO_NODE)
7713 - node = apicid_to_node[apicid];
7714 - if (!node_online(node)) {
7715 - /* Two possibilities here:
7716 - - The CPU is missing memory and no node was created.
7717 - In that case try picking one from a nearby CPU
7718 - - The APIC IDs differ from the HyperTransport node IDs
7719 - which the K8 northbridge parsing fills in.
7720 - Assume they are all increased by a constant offset,
7721 - but in the same order as the HT nodeids.
7722 - If that doesn't result in a usable node fall back to the
7723 - path for the previous case. */
7724 - int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7725 - if (ht_nodeid >= 0 &&
7726 - apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7727 - node = apicid_to_node[ht_nodeid];
7728 - /* Pick a nearby node */
7729 - if (!node_online(node))
7730 - node = nearby_node(apicid);
7732 - numa_set_node(cpu, node);
7733 + c->x86_coreid_bits = bits;
7735 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7740 @@ -840,8 +861,8 @@ static void __init amd_detect_cmp(struct
7741 /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
7742 static __cpuinit int amd_apic_timer_broken(void)
7745 - u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7746 + u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7748 switch (eax & CPUID_XFAM) {
7750 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
7751 @@ -860,6 +881,15 @@ static __cpuinit int amd_apic_timer_brok
7755 +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
7757 + early_init_amd_mc(c);
7759 + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7760 + if (c->x86_power & (1<<8))
7761 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7764 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
7767 @@ -870,7 +900,7 @@ static void __cpuinit init_amd(struct cp
7769 * Disable TLB flush filter by setting HWCR.FFDIS on K8
7770 * bit 6 of msr C001_0015
7773 * Errata 63 for SH-B3 steppings
7774 * Errata 122 for all steppings (F+ have it disabled by default)
7776 @@ -883,35 +913,32 @@ static void __cpuinit init_amd(struct cp
7778 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
7779 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
7780 - clear_bit(0*32+31, &c->x86_capability);
7782 + clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
7784 /* On C+ stepping K8 rep microcode works well for copy/memset */
7785 level = cpuid_eax(1);
7786 - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
7787 - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7788 + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
7790 + set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7791 if (c->x86 == 0x10 || c->x86 == 0x11)
7792 - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7793 + set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7795 /* Enable workaround for FXSAVE leak */
7797 - set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
7798 + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
7800 level = get_model_name(c);
7805 /* Should distinguish Models here, but this is only
7806 a fallback anyways. */
7807 strcpy(c->x86_model_id, "Hammer");
7814 display_cacheinfo(c);
7816 - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7817 - if (c->x86_power & (1<<8))
7818 - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7820 /* Multi core CPU? */
7821 if (c->extended_cpuid_level >= 0x80000008)
7823 @@ -923,14 +950,10 @@ static void __cpuinit init_amd(struct cp
7824 num_cache_leaves = 3;
7826 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
7827 - set_bit(X86_FEATURE_K8, &c->x86_capability);
7829 - /* RDTSC can be speculated around */
7830 - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7831 + set_cpu_cap(c, X86_FEATURE_K8);
7833 - /* Family 10 doesn't support C states in MWAIT so don't use it */
7834 - if (c->x86 == 0x10 && !force_mwait)
7835 - clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
7836 + /* MFENCE stops RDTSC speculation */
7837 + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
7840 if (amd_apic_timer_broken())
7841 @@ -938,28 +961,29 @@ static void __cpuinit init_amd(struct cp
7845 -static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7846 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7849 - u32 eax, ebx, ecx, edx;
7850 - int index_msb, core_bits;
7851 + u32 eax, ebx, ecx, edx;
7852 + int index_msb, core_bits;
7854 cpuid(1, &eax, &ebx, &ecx, &edx);
7857 if (!cpu_has(c, X86_FEATURE_HT))
7859 - if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7860 + if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7863 smp_num_siblings = (ebx & 0xff0000) >> 16;
7865 if (smp_num_siblings == 1) {
7866 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
7867 - } else if (smp_num_siblings > 1 ) {
7868 + } else if (smp_num_siblings > 1) {
7870 if (smp_num_siblings > NR_CPUS) {
7871 - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
7872 + printk(KERN_WARNING "CPU: Unsupported number of "
7873 + "siblings %d", smp_num_siblings);
7874 smp_num_siblings = 1;
7877 @@ -969,7 +993,7 @@ static void __cpuinit detect_ht(struct c
7879 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
7881 - index_msb = get_count_order(smp_num_siblings) ;
7882 + index_msb = get_count_order(smp_num_siblings);
7884 core_bits = get_count_order(c->x86_max_cores);
7886 @@ -978,8 +1002,10 @@ static void __cpuinit detect_ht(struct c
7889 if ((c->x86_max_cores * smp_num_siblings) > 1) {
7890 - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
7891 - printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
7892 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
7894 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
7899 @@ -1003,7 +1029,7 @@ static int __cpuinit intel_num_cpu_cores
7903 -static void srat_detect_node(void)
7904 +static void __cpuinit srat_detect_node(void)
7908 @@ -1013,7 +1039,7 @@ static void srat_detect_node(void)
7909 /* Don't do the funky fallback heuristics the AMD version employs
7911 node = apicid_to_node[apicid];
7912 - if (node == NUMA_NO_NODE)
7913 + if (node == NUMA_NO_NODE || !node_online(node))
7914 node = first_node(node_online_map);
7915 numa_set_node(cpu, node);
7917 @@ -1021,28 +1047,39 @@ static void srat_detect_node(void)
7921 +static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
7923 + if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7924 + (c->x86 == 0x6 && c->x86_model >= 0x0e))
7925 + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7928 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
7933 init_intel_cacheinfo(c);
7934 - if (c->cpuid_level > 9 ) {
7935 + if (c->cpuid_level > 9) {
7936 unsigned eax = cpuid_eax(10);
7937 /* Check for version and the number of counters */
7938 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
7939 - set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
7940 + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
7944 unsigned int l1, l2;
7945 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
7946 if (!(l1 & (1<<11)))
7947 - set_bit(X86_FEATURE_BTS, c->x86_capability);
7948 + set_cpu_cap(c, X86_FEATURE_BTS);
7949 if (!(l1 & (1<<12)))
7950 - set_bit(X86_FEATURE_PEBS, c->x86_capability);
7951 + set_cpu_cap(c, X86_FEATURE_PEBS);
7958 n = c->extended_cpuid_level;
7959 if (n >= 0x80000008) {
7960 unsigned eax = cpuid_eax(0x80000008);
7961 @@ -1059,14 +1096,11 @@ static void __cpuinit init_intel(struct
7962 c->x86_cache_alignment = c->x86_clflush_size * 2;
7963 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7964 (c->x86 == 0x6 && c->x86_model >= 0x0e))
7965 - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7966 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7968 - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7970 - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7972 - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7973 - c->x86_max_cores = intel_num_cpu_cores(c);
7974 + set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7975 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
7976 + c->x86_max_cores = intel_num_cpu_cores(c);
7980 @@ -1083,18 +1117,12 @@ static void __cpuinit get_cpu_vendor(str
7981 c->x86_vendor = X86_VENDOR_UNKNOWN;
7984 -struct cpu_model_info {
7987 - char *model_names[16];
7990 /* Do some early cpuid on the boot CPU to get some parameter that are
7991 needed before check_bugs. Everything advanced is in identify_cpu
7993 -void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7994 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7999 c->loops_per_jiffy = loops_per_jiffy;
8000 c->x86_cache_size = -1;
8001 @@ -1105,6 +1133,7 @@ void __cpuinit early_identify_cpu(struct
8002 c->x86_clflush_size = 64;
8003 c->x86_cache_alignment = c->x86_clflush_size;
8004 c->x86_max_cores = 1;
8005 + c->x86_coreid_bits = 0;
8006 c->extended_cpuid_level = 0;
8007 memset(&c->x86_capability, 0, sizeof c->x86_capability);
8009 @@ -1113,7 +1142,7 @@ void __cpuinit early_identify_cpu(struct
8010 (unsigned int *)&c->x86_vendor_id[0],
8011 (unsigned int *)&c->x86_vendor_id[8],
8012 (unsigned int *)&c->x86_vendor_id[4]);
8017 /* Initialize the standard set of capabilities */
8018 @@ -1131,7 +1160,7 @@ void __cpuinit early_identify_cpu(struct
8019 c->x86 += (tfms >> 20) & 0xff;
8021 c->x86_model += ((tfms >> 16) & 0xF) << 4;
8022 - if (c->x86_capability[0] & (1<<19))
8023 + if (c->x86_capability[0] & (1<<19))
8024 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
8026 /* Have CPUID level 0 only - unheard of */
8027 @@ -1141,18 +1170,6 @@ void __cpuinit early_identify_cpu(struct
8029 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
8034 - * This does the hard work of actually picking apart the CPU stuff...
8036 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8041 - early_identify_cpu(c);
8043 /* AMD-defined flags: level 0x80000001 */
8044 xlvl = cpuid_eax(0x80000000);
8045 c->extended_cpuid_level = xlvl;
8046 @@ -1173,6 +1190,30 @@ void __cpuinit identify_cpu(struct cpuin
8047 c->x86_capability[2] = cpuid_edx(0x80860001);
8050 + c->extended_cpuid_level = cpuid_eax(0x80000000);
8051 + if (c->extended_cpuid_level >= 0x80000007)
8052 + c->x86_power = cpuid_edx(0x80000007);
8054 + switch (c->x86_vendor) {
8055 + case X86_VENDOR_AMD:
8056 + early_init_amd(c);
8058 + case X86_VENDOR_INTEL:
8059 + early_init_intel(c);
8066 + * This does the hard work of actually picking apart the CPU stuff...
8068 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8072 + early_identify_cpu(c);
8074 init_scattered_cpuid_features(c);
8076 c->apicid = phys_pkg_id(0);
8077 @@ -1202,8 +1243,7 @@ void __cpuinit identify_cpu(struct cpuin
8081 - select_idle_routine(c);
8086 * On SMP, boot_cpu_data holds the common feature set between
8087 @@ -1213,31 +1253,55 @@ void __cpuinit identify_cpu(struct cpuin
8089 if (c != &boot_cpu_data) {
8090 /* AND the already accumulated flags with these */
8091 - for (i = 0 ; i < NCAPINTS ; i++)
8092 + for (i = 0; i < NCAPINTS; i++)
8093 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
8096 + /* Clear all flags overriden by options */
8097 + for (i = 0; i < NCAPINTS; i++)
8098 + c->x86_capability[i] &= ~cleared_cpu_caps[i];
8100 #ifdef CONFIG_X86_MCE
8103 + select_idle_routine(c);
8105 if (c != &boot_cpu_data)
8108 numa_add_cpu(smp_processor_id());
8114 +static __init int setup_noclflush(char *arg)
8116 + setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
8119 +__setup("noclflush", setup_noclflush);
8121 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
8123 if (c->x86_model_id[0])
8124 - printk("%s", c->x86_model_id);
8125 + printk(KERN_CONT "%s", c->x86_model_id);
8127 + if (c->x86_mask || c->cpuid_level >= 0)
8128 + printk(KERN_CONT " stepping %02x\n", c->x86_mask);
8130 + printk(KERN_CONT "\n");
8133 - if (c->x86_mask || c->cpuid_level >= 0)
8134 - printk(" stepping %02x\n", c->x86_mask);
8135 +static __init int setup_disablecpuid(char *arg)
8138 + if (get_option(&arg, &bit) && bit < NCAPINTS*32)
8139 + setup_clear_cpu_cap(bit);
8145 +__setup("clearcpuid=", setup_disablecpuid);
8148 * Get CPU information for use by the procfs.
8149 @@ -1246,116 +1310,41 @@ void __cpuinit print_cpu_info(struct cpu
8150 static int show_cpuinfo(struct seq_file *m, void *v)
8152 struct cpuinfo_x86 *c = v;
8156 - * These flag bits must match the definitions in <asm/cpufeature.h>.
8157 - * NULL means this bit is undefined or reserved; either way it doesn't
8158 - * have meaning as far as Linux is concerned. Note that it's important
8159 - * to realize there is a difference between this table and CPUID -- if
8160 - * applications want to get the raw CPUID data, they should access
8161 - * /dev/cpu/<cpu_nr>/cpuid instead.
8163 - static const char *const x86_cap_flags[] = {
8164 - /* Intel-defined */
8165 - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
8166 - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
8167 - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
8168 - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
8171 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8172 - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
8173 - NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
8174 - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
8175 - "3dnowext", "3dnow",
8177 - /* Transmeta-defined */
8178 - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
8179 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8180 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8181 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8183 - /* Other (Linux-defined) */
8184 - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
8185 - NULL, NULL, NULL, NULL,
8186 - "constant_tsc", "up", NULL, "arch_perfmon",
8187 - "pebs", "bts", NULL, "sync_rdtsc",
8188 - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8189 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8191 - /* Intel-defined (#2) */
8192 - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
8193 - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
8194 - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
8195 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8197 - /* VIA/Cyrix/Centaur-defined */
8198 - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
8199 - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
8200 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8201 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8203 - /* AMD-defined (#2) */
8204 - "lahf_lm", "cmp_legacy", "svm", "extapic",
8205 - "cr8_legacy", "abm", "sse4a", "misalignsse",
8206 - "3dnowprefetch", "osvw", "ibs", "sse5",
8207 - "skinit", "wdt", NULL, NULL,
8208 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8209 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8211 - /* Auxiliary (Linux-defined) */
8212 - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8213 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8214 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8215 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8217 - static const char *const x86_power_flags[] = {
8218 - "ts", /* temperature sensor */
8219 - "fid", /* frequency id control */
8220 - "vid", /* voltage id control */
8221 - "ttp", /* thermal trip */
8226 - "", /* tsc invariant mapped to constant_tsc */
8236 - seq_printf(m,"processor\t: %u\n"
8237 - "vendor_id\t: %s\n"
8238 - "cpu family\t: %d\n"
8240 - "model name\t: %s\n",
8242 - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8244 - (int)c->x86_model,
8245 - c->x86_model_id[0] ? c->x86_model_id : "unknown");
8247 + seq_printf(m, "processor\t: %u\n"
8248 + "vendor_id\t: %s\n"
8249 + "cpu family\t: %d\n"
8251 + "model name\t: %s\n",
8253 + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8255 + (int)c->x86_model,
8256 + c->x86_model_id[0] ? c->x86_model_id : "unknown");
8258 if (c->x86_mask || c->cpuid_level >= 0)
8259 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
8261 seq_printf(m, "stepping\t: unknown\n");
8263 - if (cpu_has(c,X86_FEATURE_TSC)) {
8265 + if (cpu_has(c, X86_FEATURE_TSC)) {
8266 unsigned int freq = cpufreq_quick_get((unsigned)cpu);
8270 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
8271 - freq / 1000, (freq % 1000));
8272 + freq / 1000, (freq % 1000));
8276 - if (c->x86_cache_size >= 0)
8277 + if (c->x86_cache_size >= 0)
8278 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
8282 if (smp_num_siblings * c->x86_max_cores > 1) {
8283 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
8284 @@ -1364,48 +1353,43 @@ static int show_cpuinfo(struct seq_file
8285 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
8286 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
8293 - "fpu_exception\t: yes\n"
8294 - "cpuid level\t: %d\n"
8298 + "fpu_exception\t: yes\n"
8299 + "cpuid level\t: %d\n"
8306 - for ( i = 0 ; i < 32*NCAPINTS ; i++ )
8307 - if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8308 - seq_printf(m, " %s", x86_cap_flags[i]);
8311 + for (i = 0; i < 32*NCAPINTS; i++)
8312 + if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8313 + seq_printf(m, " %s", x86_cap_flags[i]);
8315 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
8316 c->loops_per_jiffy/(500000/HZ),
8317 (c->loops_per_jiffy/(5000/HZ)) % 100);
8319 - if (c->x86_tlbsize > 0)
8320 + if (c->x86_tlbsize > 0)
8321 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
8322 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
8323 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
8325 - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8326 + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8327 c->x86_phys_bits, c->x86_virt_bits);
8329 seq_printf(m, "power management:");
8332 - for (i = 0; i < 32; i++)
8333 - if (c->x86_power & (1 << i)) {
8334 - if (i < ARRAY_SIZE(x86_power_flags) &&
8335 - x86_power_flags[i])
8336 - seq_printf(m, "%s%s",
8337 - x86_power_flags[i][0]?" ":"",
8338 - x86_power_flags[i]);
8340 - seq_printf(m, " [%d]", i);
8342 + for (i = 0; i < 32; i++) {
8343 + if (c->x86_power & (1 << i)) {
8344 + if (i < ARRAY_SIZE(x86_power_flags) &&
8345 + x86_power_flags[i])
8346 + seq_printf(m, "%s%s",
8347 + x86_power_flags[i][0]?" ":"",
8348 + x86_power_flags[i]);
8350 + seq_printf(m, " [%d]", i);
8354 seq_printf(m, "\n\n");
8355 @@ -1432,8 +1416,8 @@ static void c_stop(struct seq_file *m, v
8359 -struct seq_operations cpuinfo_op = {
8361 +const struct seq_operations cpuinfo_op = {
8365 .show = show_cpuinfo,
8366 --- a/arch/x86/kernel/setup64-xen.c
8367 +++ b/arch/x86/kernel/setup64-xen.c
8369 #include <asm/hypervisor.h>
8372 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
8373 struct boot_params __initdata boot_params;
8375 +struct boot_params boot_params;
8378 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
8380 @@ -47,6 +51,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __attr
8382 unsigned long __supported_pte_mask __read_mostly = ~0UL;
8383 EXPORT_SYMBOL(__supported_pte_mask);
8385 static int do_not_nx __cpuinitdata = 0;
8388 @@ -90,6 +95,45 @@ static int __init nonx32_setup(char *str
8389 __setup("noexec32=", nonx32_setup);
8392 + * Copy data used in early init routines from the initial arrays to the
8393 + * per cpu data areas. These arrays then become expendable and the
8394 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
8396 +static void __init setup_per_cpu_maps(void)
8401 + for_each_possible_cpu(cpu) {
8403 + if (per_cpu_offset(cpu)) {
8405 + per_cpu(x86_cpu_to_apicid, cpu) =
8406 + x86_cpu_to_apicid_init[cpu];
8407 + per_cpu(x86_bios_cpu_apicid, cpu) =
8408 + x86_bios_cpu_apicid_init[cpu];
8410 + per_cpu(x86_cpu_to_node_map, cpu) =
8411 + x86_cpu_to_node_map_init[cpu];
8416 + printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
8421 + /* indicate the early static arrays will soon be gone */
8422 + x86_cpu_to_apicid_early_ptr = NULL;
8423 + x86_bios_cpu_apicid_early_ptr = NULL;
8425 + x86_cpu_to_node_map_early_ptr = NULL;
8431 * Great future plan:
8432 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
8433 * Always point %gs to its beginning
8434 @@ -109,19 +153,24 @@ void __init setup_per_cpu_areas(void)
8435 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
8436 for_each_cpu_mask (i, cpu_possible_map) {
8438 +#ifndef CONFIG_NEED_MULTIPLE_NODES
8439 + ptr = alloc_bootmem_pages(size);
8441 + int node = early_cpu_to_node(i);
8443 - if (!NODE_DATA(cpu_to_node(i))) {
8444 - printk("cpu with no node %d, num_online_nodes %d\n",
8445 - i, num_online_nodes());
8446 + if (!node_online(node) || !NODE_DATA(node))
8447 ptr = alloc_bootmem_pages(size);
8449 - ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
8452 + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
8455 panic("Cannot allocate cpu data for CPU %d\n", i);
8456 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
8457 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
8460 + /* setup percpu data maps early */
8461 + setup_per_cpu_maps();
8465 @@ -224,7 +273,8 @@ void syscall_init(void)
8466 wrmsrl(MSR_CSTAR, ignore_sysret);
8468 /* Flags to clear on syscall */
8469 - wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
8470 + wrmsrl(MSR_SYSCALL_MASK,
8471 + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
8473 #ifdef CONFIG_IA32_EMULATION
8474 syscall32_cpu_init ();
8475 @@ -303,7 +353,7 @@ void __cpuinit cpu_init (void)
8479 - memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
8480 + memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
8483 cpu_gdt_descr[cpu].size = GDT_SIZE;
8484 @@ -334,10 +384,10 @@ void __cpuinit cpu_init (void)
8487 estacks += PAGE_SIZE << order[v];
8488 - orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
8489 + orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
8492 - t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
8493 + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
8495 * <= is required because the CPU will access up to
8496 * 8 bits beyond the end of the IO permission bitmap.
8497 --- a/arch/x86/kernel/smp_32-xen.c
8498 +++ b/arch/x86/kernel/smp_32-xen.c
8499 @@ -168,7 +168,7 @@ void __send_IPI_shortcut(unsigned int sh
8503 -void fastcall send_IPI_self(int vector)
8504 +void send_IPI_self(int vector)
8506 __send_IPI_shortcut(APIC_DEST_SELF, vector);
8508 @@ -224,13 +224,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
8509 * We need to reload %cr3 since the page tables may be going
8510 * away from under us..
8512 -void leave_mm(unsigned long cpu)
8513 +void leave_mm(int cpu)
8515 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
8517 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
8518 load_cr3(swapper_pg_dir);
8520 +EXPORT_SYMBOL_GPL(leave_mm);
8524 --- a/arch/x86/kernel/smp_64-xen.c
8525 +++ b/arch/x86/kernel/smp_64-xen.c
8530 - * Smarter SMP flushing macros.
8531 + * Smarter SMP flushing macros.
8532 * c/o Linus Torvalds.
8534 * These mean you can really definitely utterly forget about
8537 * Optimizations Manfred Spraul <manfred@colorfullife.com>
8539 - * More scalable flush, from Andi Kleen
8540 + * More scalable flush, from Andi Kleen
8542 - * To avoid global state use 8 different call vectors.
8543 - * Each CPU uses a specific vector to trigger flushes on other
8544 - * CPUs. Depending on the received vector the target CPUs look into
8545 + * To avoid global state use 8 different call vectors.
8546 + * Each CPU uses a specific vector to trigger flushes on other
8547 + * CPUs. Depending on the received vector the target CPUs look into
8548 * the right per cpu variable for the flush data.
8550 - * With more than 8 CPUs they are hashed to the 8 available
8551 - * vectors. The limited global vector space forces us to this right now.
8552 + * With more than 8 CPUs they are hashed to the 8 available
8553 + * vectors. The limited global vector space forces us to this right now.
8554 * In future when interrupts are split into per CPU domains this could be
8555 * fixed, at the cost of triggering multiple IPIs in some cases.
8557 @@ -59,7 +59,6 @@ union smp_flush_state {
8558 cpumask_t flush_cpumask;
8559 struct mm_struct *flush_mm;
8560 unsigned long flush_va;
8561 -#define FLUSH_ALL -1ULL
8562 spinlock_t tlbstate_lock;
8564 char pad[SMP_CACHE_BYTES];
8565 @@ -71,16 +70,17 @@ union smp_flush_state {
8566 static DEFINE_PER_CPU(union smp_flush_state, flush_state);
8569 - * We cannot call mmdrop() because we are in interrupt context,
8570 + * We cannot call mmdrop() because we are in interrupt context,
8571 * instead update mm->cpu_vm_mask.
8573 -static inline void leave_mm(unsigned long cpu)
8574 +void leave_mm(int cpu)
8576 if (read_pda(mmu_state) == TLBSTATE_OK)
8578 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
8579 load_cr3(swapper_pg_dir);
8581 +EXPORT_SYMBOL_GPL(leave_mm);
8585 @@ -89,25 +89,25 @@ static inline void leave_mm(unsigned lon
8586 * 1) switch_mm() either 1a) or 1b)
8587 * 1a) thread switch to a different mm
8588 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
8589 - * Stop ipi delivery for the old mm. This is not synchronized with
8590 - * the other cpus, but smp_invalidate_interrupt ignore flush ipis
8591 - * for the wrong mm, and in the worst case we perform a superfluous
8593 + * Stop ipi delivery for the old mm. This is not synchronized with
8594 + * the other cpus, but smp_invalidate_interrupt ignores flush ipis
8595 + * for the wrong mm, and in the worst case we perform a superfluous
8597 * 1a2) set cpu mmu_state to TLBSTATE_OK
8598 - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8599 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8600 * was in lazy tlb mode.
8601 * 1a3) update cpu active_mm
8602 - * Now cpu0 accepts tlb flushes for the new mm.
8603 + * Now cpu0 accepts tlb flushes for the new mm.
8604 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
8605 - * Now the other cpus will send tlb flush ipis.
8606 + * Now the other cpus will send tlb flush ipis.
8608 * 1b) thread switch without mm change
8609 * cpu active_mm is correct, cpu0 already handles
8611 * 1b1) set cpu mmu_state to TLBSTATE_OK
8612 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
8613 - * Atomically set the bit [other cpus will start sending flush ipis],
8614 - * and test the bit.
8615 + * Atomically set the bit [other cpus will start sending flush ipis],
8616 + * and test the bit.
8617 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
8618 * 2) switch %%esp, ie current
8620 @@ -141,12 +141,12 @@ asmlinkage void smp_invalidate_interrupt
8621 * orig_rax contains the negated interrupt vector.
8622 * Use that to determine where the sender put the data.
8624 - sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
8625 + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
8626 f = &per_cpu(flush_state, sender);
8628 if (!cpu_isset(cpu, f->flush_cpumask))
8632 * This was a BUG() but until someone can quote me the
8633 * line from the intel manual that guarantees an IPI to
8634 * multiple CPUs is retried _only_ on the erroring CPUs
8635 @@ -154,10 +154,10 @@ asmlinkage void smp_invalidate_interrupt
8641 if (f->flush_mm == read_pda(active_mm)) {
8642 if (read_pda(mmu_state) == TLBSTATE_OK) {
8643 - if (f->flush_va == FLUSH_ALL)
8644 + if (f->flush_va == TLB_FLUSH_ALL)
8647 __flush_tlb_one(f->flush_va);
8648 @@ -170,19 +170,22 @@ out:
8649 add_pda(irq_tlb_count, 1);
8652 -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
8654 +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
8658 union smp_flush_state *f;
8659 + cpumask_t cpumask = *cpumaskp;
8661 /* Caller has disabled preemption */
8662 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
8663 f = &per_cpu(flush_state, sender);
8665 - /* Could avoid this lock when
8666 - num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8667 - probably not worth checking this for a cache-hot lock. */
8669 + * Could avoid this lock when
8670 + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8671 + * probably not worth checking this for a cache-hot lock.
8673 spin_lock(&f->tlbstate_lock);
8676 @@ -206,14 +209,14 @@ static void flush_tlb_others(cpumask_t c
8677 int __cpuinit init_smp_flush(void)
8681 for_each_cpu_mask(i, cpu_possible_map) {
8682 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
8687 core_initcall(init_smp_flush);
8690 void flush_tlb_current_task(void)
8692 struct mm_struct *mm = current->mm;
8693 @@ -225,10 +228,9 @@ void flush_tlb_current_task(void)
8696 if (!cpus_empty(cpu_mask))
8697 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8698 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8701 -EXPORT_SYMBOL(flush_tlb_current_task);
8703 void flush_tlb_mm (struct mm_struct * mm)
8705 @@ -245,11 +247,10 @@ void flush_tlb_mm (struct mm_struct * mm
8706 leave_mm(smp_processor_id());
8708 if (!cpus_empty(cpu_mask))
8709 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8710 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8714 -EXPORT_SYMBOL(flush_tlb_mm);
8716 void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
8718 @@ -263,8 +264,8 @@ void flush_tlb_page(struct vm_area_struc
8719 if (current->active_mm == mm) {
8721 __flush_tlb_one(va);
8723 - leave_mm(smp_processor_id());
8725 + leave_mm(smp_processor_id());
8728 if (!cpus_empty(cpu_mask))
8729 @@ -272,7 +273,6 @@ void flush_tlb_page(struct vm_area_struc
8733 -EXPORT_SYMBOL(flush_tlb_page);
8735 static void do_flush_tlb_all(void* info)
8737 @@ -330,11 +330,9 @@ void unlock_ipi_call_lock(void)
8738 * this function sends a 'generic call function' IPI to all other CPU
8739 * of the system defined in the mask.
8743 -__smp_call_function_mask(cpumask_t mask,
8744 - void (*func)(void *), void *info,
8746 +static int __smp_call_function_mask(cpumask_t mask,
8747 + void (*func)(void *), void *info,
8750 struct call_data_struct data;
8751 cpumask_t allbutself;
8752 @@ -422,11 +420,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
8755 int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
8756 - int nonatomic, int wait)
8757 + int nonatomic, int wait)
8759 /* prevent preemption and reschedule on another processor */
8761 - int me = get_cpu();
8762 + int ret, me = get_cpu();
8764 /* Can deadlock when called with interrupts disabled */
8765 WARN_ON(irqs_disabled());
8766 @@ -476,9 +473,9 @@ static void stop_this_cpu(void *dummy)
8768 cpu_clear(smp_processor_id(), cpu_online_map);
8769 disable_all_local_evtchn();
8776 void smp_send_stop(void)
8778 --- a/arch/x86/kernel/time_32-xen.c
8779 +++ b/arch/x86/kernel/time_32-xen.c
8781 * serialize accesses to xtime/lost_ticks).
8784 -#include <linux/errno.h>
8785 -#include <linux/sched.h>
8786 -#include <linux/kernel.h>
8787 -#include <linux/param.h>
8788 -#include <linux/string.h>
8789 -#include <linux/mm.h>
8790 +#include <linux/init.h>
8791 #include <linux/interrupt.h>
8792 #include <linux/time.h>
8793 -#include <linux/delay.h>
8794 -#include <linux/init.h>
8795 -#include <linux/smp.h>
8796 -#include <linux/module.h>
8797 -#include <linux/sysdev.h>
8798 -#include <linux/bcd.h>
8799 -#include <linux/efi.h>
8800 #include <linux/mca.h>
8801 #include <linux/sysctl.h>
8802 #include <linux/percpu.h>
8804 #include <linux/posix-timers.h>
8805 #include <linux/cpufreq.h>
8806 #include <linux/clocksource.h>
8807 +#include <linux/sysdev.h>
8809 -#include <asm/io.h>
8810 -#include <asm/smp.h>
8811 -#include <asm/irq.h>
8812 -#include <asm/msr.h>
8813 #include <asm/delay.h>
8814 -#include <asm/mpspec.h>
8815 -#include <asm/uaccess.h>
8816 -#include <asm/processor.h>
8817 -#include <asm/timer.h>
8818 #include <asm/time.h>
8819 -#include <asm/sections.h>
8821 -#include "mach_time.h"
8823 -#include <linux/timex.h>
8825 -#include <asm/hpet.h>
8827 -#include <asm/arch_hooks.h>
8829 #include <xen/evtchn.h>
8830 #include <xen/sysctl.h>
8831 @@ -89,9 +61,6 @@ volatile unsigned long __jiffies __secti
8832 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
8833 EXPORT_SYMBOL(cpu_khz);
8835 -DEFINE_SPINLOCK(rtc_lock);
8836 -EXPORT_SYMBOL(rtc_lock);
8838 /* These are peridically updated in shared_info, and then copied here. */
8839 struct shadow_time_info {
8840 u64 tsc_timestamp; /* TSC at last update of time vals. */
8841 @@ -154,6 +123,11 @@ static int __init __independent_wallcloc
8843 __setup("independent_wallclock", __independent_wallclock);
8845 +int xen_independent_wallclock(void)
8847 + return independent_wallclock;
8850 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
8851 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
8852 static int __init __permitted_clock_jitter(char *str)
8853 @@ -223,7 +197,6 @@ static inline u64 get64(volatile u64 *pt
8854 return cmpxchg64(ptr, 0, 0);
8857 -#define cmpxchg64 cmpxchg
8861 @@ -233,7 +206,6 @@ static inline u64 get64_local(volatile u
8862 return cmpxchg64_local(ptr, 0, 0);
8865 -#define cmpxchg64_local cmpxchg_local
8869 @@ -341,35 +313,6 @@ static inline int time_values_up_to_date
8870 return (dst->version == src->version);
8874 - * This is a special lock that is owned by the CPU and holds the index
8875 - * register we are working with. It is required for NMI access to the
8876 - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
8878 -volatile unsigned long cmos_lock = 0;
8879 -EXPORT_SYMBOL(cmos_lock);
8881 -/* Routines for accessing the CMOS RAM/RTC. */
8882 -unsigned char rtc_cmos_read(unsigned char addr)
8884 - unsigned char val;
8885 - lock_cmos_prefix(addr);
8886 - outb_p(addr, RTC_PORT(0));
8887 - val = inb_p(RTC_PORT(1));
8888 - lock_cmos_suffix(addr);
8891 -EXPORT_SYMBOL(rtc_cmos_read);
8893 -void rtc_cmos_write(unsigned char val, unsigned char addr)
8895 - lock_cmos_prefix(addr);
8896 - outb_p(addr, RTC_PORT(0));
8897 - outb_p(val, RTC_PORT(1));
8898 - lock_cmos_suffix(addr);
8900 -EXPORT_SYMBOL(rtc_cmos_write);
8902 static void sync_xen_wallclock(unsigned long dummy);
8903 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
8904 static void sync_xen_wallclock(unsigned long dummy)
8905 @@ -378,7 +321,8 @@ static void sync_xen_wallclock(unsigned
8907 struct xen_platform_op op;
8909 - if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
8910 + BUG_ON(!is_initial_xendomain());
8911 + if (!ntp_synced() || independent_wallclock)
8914 write_seqlock_irq(&xtime_lock);
8915 @@ -401,23 +345,6 @@ static void sync_xen_wallclock(unsigned
8916 mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
8919 -static int set_rtc_mmss(unsigned long nowtime)
8922 - unsigned long flags;
8924 - if (independent_wallclock || !is_initial_xendomain())
8927 - /* gets recalled with irq locally disabled */
8928 - /* XXX - does irqsave resolve this? -johnstul */
8929 - spin_lock_irqsave(&rtc_lock, flags);
8930 - retval = set_wallclock(nowtime);
8931 - spin_unlock_irqrestore(&rtc_lock, flags);
8936 static unsigned long long local_clock(void)
8938 unsigned int cpu = get_cpu();
8939 @@ -500,28 +427,24 @@ unsigned long profile_pc(struct pt_regs
8941 #if defined(CONFIG_SMP) || defined(__x86_64__)
8943 - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs)
8944 + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs)
8946 if (!user_mode(regs)
8948 && in_lock_functions(pc)) {
8949 # ifdef CONFIG_FRAME_POINTER
8951 - return ((unsigned long *)regs->ebp)[1];
8953 - return ((unsigned long *)regs->rbp)[1];
8955 + return ((unsigned long *)regs->bp)[1];
8958 - unsigned long *sp = (unsigned long *)®s->esp;
8959 + unsigned long *sp = (unsigned long *)®s->sp;
8961 - unsigned long *sp = (unsigned long *)regs->rsp;
8962 + unsigned long *sp = (unsigned long *)regs->sp;
8965 /* Return address is either directly at stack pointer
8966 - or above a saved eflags. Eflags has bits 22-31 zero,
8967 + or above a saved flags word. EFLAGS has bits 22-31 zero,
8968 kernel addresses don't. */
8974 @@ -750,25 +673,32 @@ static void init_missing_ticks_accountin
8975 runstate->time[RUNSTATE_offline];
8978 -/* not static: needed by APM */
8979 -unsigned long read_persistent_clock(void)
8980 +unsigned long xen_read_persistent_clock(void)
8982 - unsigned long retval;
8983 - unsigned long flags;
8985 - spin_lock_irqsave(&rtc_lock, flags);
8986 + const shared_info_t *s = HYPERVISOR_shared_info;
8987 + u32 version, sec, nsec;
8990 - retval = get_wallclock();
8992 + version = s->wc_version;
8995 + nsec = s->wc_nsec;
8997 + } while ((s->wc_version & 1) | (version ^ s->wc_version));
8999 - spin_unlock_irqrestore(&rtc_lock, flags);
9000 + delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
9001 + do_div(delta, NSEC_PER_SEC);
9007 -int update_persistent_clock(struct timespec now)
9008 +int xen_update_persistent_clock(void)
9010 + if (!is_initial_xendomain())
9012 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
9013 - return set_rtc_mmss(now.tv_sec);
9017 extern void (*late_time_init)(void);
9018 --- a/arch/x86/kernel/traps_32-xen.c
9019 +++ b/arch/x86/kernel/traps_32-xen.c
9020 @@ -79,7 +79,8 @@ char ignore_fpu_irq = 0;
9021 * F0 0F bug workaround.. We have a special link segment
9024 -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
9025 +gate_desc idt_table[256]
9026 + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
9029 asmlinkage void divide_error(void);
9030 @@ -109,6 +110,34 @@ asmlinkage void machine_check(void);
9031 int kstack_depth_to_print = 24;
9032 static unsigned int code_bytes = 64;
9034 +void printk_address(unsigned long address, int reliable)
9036 +#ifdef CONFIG_KALLSYMS
9037 + unsigned long offset = 0, symsize;
9038 + const char *symname;
9040 + char *delim = ":";
9041 + char namebuf[128];
9042 + char reliab[4] = "";
9044 + symname = kallsyms_lookup(address, &symsize, &offset,
9045 + &modname, namebuf);
9047 + printk(" [<%08lx>]\n", address);
9051 + strcpy(reliab, "? ");
9054 + modname = delim = "";
9055 + printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
9056 + address, reliab, delim, modname, delim, symname, offset, symsize);
9058 + printk(" [<%08lx>]\n", address);
9062 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
9064 return p > (void *)tinfo &&
9065 @@ -122,48 +151,35 @@ struct stack_frame {
9068 static inline unsigned long print_context_stack(struct thread_info *tinfo,
9069 - unsigned long *stack, unsigned long ebp,
9070 + unsigned long *stack, unsigned long bp,
9071 const struct stacktrace_ops *ops, void *data)
9073 -#ifdef CONFIG_FRAME_POINTER
9074 - struct stack_frame *frame = (struct stack_frame *)ebp;
9075 - while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
9076 - struct stack_frame *next;
9077 - unsigned long addr;
9078 + struct stack_frame *frame = (struct stack_frame *)bp;
9080 - addr = frame->return_address;
9081 - ops->address(data, addr);
9083 - * break out of recursive entries (such as
9084 - * end_of_stack_stop_unwind_function). Also,
9085 - * we can never allow a frame pointer to
9088 - next = frame->next_frame;
9089 - if (next <= frame)
9094 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
9098 - if (__kernel_text_address(addr))
9099 - ops->address(data, addr);
9101 + if (__kernel_text_address(addr)) {
9102 + if ((unsigned long) stack == bp + 4) {
9103 + ops->address(data, addr, 1);
9104 + frame = frame->next_frame;
9105 + bp = (unsigned long) frame;
9107 + ops->address(data, addr, bp == 0);
9117 #define MSG(msg) ops->warning(data, msg)
9119 void dump_trace(struct task_struct *task, struct pt_regs *regs,
9120 - unsigned long *stack,
9121 + unsigned long *stack, unsigned long bp,
9122 const struct stacktrace_ops *ops, void *data)
9124 - unsigned long ebp = 0;
9129 @@ -171,17 +187,17 @@ void dump_trace(struct task_struct *task
9130 unsigned long dummy;
9132 if (task != current)
9133 - stack = (unsigned long *)task->thread.esp;
9134 + stack = (unsigned long *)task->thread.sp;
9137 #ifdef CONFIG_FRAME_POINTER
9140 if (task == current) {
9141 - /* Grab ebp right from our regs */
9142 - asm ("movl %%ebp, %0" : "=r" (ebp) : );
9143 + /* Grab bp right from our regs */
9144 + asm ("movl %%ebp, %0" : "=r" (bp) : );
9146 - /* ebp is the last reg pushed by switch_to */
9147 - ebp = *(unsigned long *) task->thread.esp;
9148 + /* bp is the last reg pushed by switch_to */
9149 + bp = *(unsigned long *) task->thread.sp;
9153 @@ -190,7 +206,7 @@ void dump_trace(struct task_struct *task
9154 struct thread_info *context;
9155 context = (struct thread_info *)
9156 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
9157 - ebp = print_context_stack(context, stack, ebp, ops, data);
9158 + bp = print_context_stack(context, stack, bp, ops, data);
9159 /* Should be after the line below, but somewhere
9160 in early boot context comes out corrupted and we
9161 can't reference it -AK */
9162 @@ -225,9 +241,11 @@ static int print_trace_stack(void *data,
9164 * Print one address/symbol entries per line.
9166 -static void print_trace_address(void *data, unsigned long addr)
9167 +static void print_trace_address(void *data, unsigned long addr, int reliable)
9169 printk("%s [<%08lx>] ", (char *)data, addr);
9172 print_symbol("%s\n", addr);
9173 touch_nmi_watchdog();
9175 @@ -241,32 +259,32 @@ static const struct stacktrace_ops print
9178 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
9179 - unsigned long * stack, char *log_lvl)
9180 + unsigned long *stack, unsigned long bp, char *log_lvl)
9182 - dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
9183 + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
9184 printk("%s =======================\n", log_lvl);
9187 void show_trace(struct task_struct *task, struct pt_regs *regs,
9188 - unsigned long * stack)
9189 + unsigned long *stack, unsigned long bp)
9191 - show_trace_log_lvl(task, regs, stack, "");
9192 + show_trace_log_lvl(task, regs, stack, bp, "");
9195 static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
9196 - unsigned long *esp, char *log_lvl)
9197 + unsigned long *sp, unsigned long bp, char *log_lvl)
9199 unsigned long *stack;
9202 - if (esp == NULL) {
9205 - esp = (unsigned long*)task->thread.esp;
9206 + sp = (unsigned long*)task->thread.sp;
9208 - esp = (unsigned long *)&esp;
9209 + sp = (unsigned long *)&sp;
9214 for(i = 0; i < kstack_depth_to_print; i++) {
9215 if (kstack_end(stack))
9217 @@ -275,13 +293,13 @@ static void show_stack_log_lvl(struct ta
9218 printk("%08lx ", *stack++);
9220 printk("\n%sCall Trace:\n", log_lvl);
9221 - show_trace_log_lvl(task, regs, esp, log_lvl);
9222 + show_trace_log_lvl(task, regs, sp, bp, log_lvl);
9225 -void show_stack(struct task_struct *task, unsigned long *esp)
9226 +void show_stack(struct task_struct *task, unsigned long *sp)
9229 - show_stack_log_lvl(task, NULL, esp, "");
9230 + show_stack_log_lvl(task, NULL, sp, 0, "");
9234 @@ -290,13 +308,19 @@ void show_stack(struct task_struct *task
9235 void dump_stack(void)
9237 unsigned long stack;
9238 + unsigned long bp = 0;
9240 +#ifdef CONFIG_FRAME_POINTER
9242 + asm("movl %%ebp, %0" : "=r" (bp):);
9245 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
9246 current->pid, current->comm, print_tainted(),
9247 init_utsname()->release,
9248 (int)strcspn(init_utsname()->version, " "),
9249 init_utsname()->version);
9250 - show_trace(current, NULL, &stack);
9251 + show_trace(current, NULL, &stack, bp);
9254 EXPORT_SYMBOL(dump_stack);
9255 @@ -315,30 +339,30 @@ void show_registers(struct pt_regs *regs
9256 * time of the fault..
9258 if (!user_mode_vm(regs)) {
9261 unsigned int code_prologue = code_bytes * 43 / 64;
9262 unsigned int code_len = code_bytes;
9265 printk("\n" KERN_EMERG "Stack: ");
9266 - show_stack_log_lvl(NULL, regs, ®s->esp, KERN_EMERG);
9267 + show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG);
9269 printk(KERN_EMERG "Code: ");
9271 - eip = (u8 *)regs->eip - code_prologue;
9272 - if (eip < (u8 *)PAGE_OFFSET ||
9273 - probe_kernel_address(eip, c)) {
9274 + ip = (u8 *)regs->ip - code_prologue;
9275 + if (ip < (u8 *)PAGE_OFFSET ||
9276 + probe_kernel_address(ip, c)) {
9277 /* try starting at EIP */
9278 - eip = (u8 *)regs->eip;
9279 + ip = (u8 *)regs->ip;
9280 code_len = code_len - code_prologue + 1;
9282 - for (i = 0; i < code_len; i++, eip++) {
9283 - if (eip < (u8 *)PAGE_OFFSET ||
9284 - probe_kernel_address(eip, c)) {
9285 + for (i = 0; i < code_len; i++, ip++) {
9286 + if (ip < (u8 *)PAGE_OFFSET ||
9287 + probe_kernel_address(ip, c)) {
9288 printk(" Bad EIP value.");
9291 - if (eip == (u8 *)regs->eip)
9292 + if (ip == (u8 *)regs->ip)
9293 printk("<%02x> ", c);
9296 @@ -347,18 +371,57 @@ void show_registers(struct pt_regs *regs
9300 -int is_valid_bugaddr(unsigned long eip)
9301 +int is_valid_bugaddr(unsigned long ip)
9305 - if (eip < PAGE_OFFSET)
9306 + if (ip < PAGE_OFFSET)
9308 - if (probe_kernel_address((unsigned short *)eip, ud2))
9309 + if (probe_kernel_address((unsigned short *)ip, ud2))
9312 return ud2 == 0x0b0f;
9315 +static int die_counter;
9317 +int __kprobes __die(const char * str, struct pt_regs * regs, long err)
9320 + unsigned short ss;
9322 + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
9323 +#ifdef CONFIG_PREEMPT
9324 + printk("PREEMPT ");
9329 +#ifdef CONFIG_DEBUG_PAGEALLOC
9330 + printk("DEBUG_PAGEALLOC");
9334 + if (notify_die(DIE_OOPS, str, regs, err,
9335 + current->thread.trap_no, SIGSEGV) !=
9337 + show_registers(regs);
9338 + /* Executive summary in case the oops scrolled away */
9339 + sp = (unsigned long) (®s->sp);
9340 + savesegment(ss, ss);
9341 + if (user_mode(regs)) {
9343 + ss = regs->ss & 0xffff;
9345 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
9346 + print_symbol("%s", regs->ip);
9347 + printk(" SS:ESP %04x:%08lx\n", ss, sp);
9355 * This is gone through when something in the kernel has done something bad and
9356 * is about to be terminated.
9357 @@ -374,7 +437,6 @@ void die(const char * str, struct pt_reg
9359 .lock_owner_depth = 0
9361 - static int die_counter;
9362 unsigned long flags;
9365 @@ -390,43 +452,13 @@ void die(const char * str, struct pt_reg
9366 raw_local_irq_save(flags);
9368 if (++die.lock_owner_depth < 3) {
9369 - unsigned long esp;
9370 - unsigned short ss;
9372 - report_bug(regs->eip, regs);
9374 - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff,
9376 -#ifdef CONFIG_PREEMPT
9377 - printk("PREEMPT ");
9382 -#ifdef CONFIG_DEBUG_PAGEALLOC
9383 - printk("DEBUG_PAGEALLOC");
9386 + report_bug(regs->ip, regs);
9388 - if (notify_die(DIE_OOPS, str, regs, err,
9389 - current->thread.trap_no, SIGSEGV) !=
9391 - show_registers(regs);
9392 - /* Executive summary in case the oops scrolled away */
9393 - esp = (unsigned long) (®s->esp);
9394 - savesegment(ss, ss);
9395 - if (user_mode(regs)) {
9397 - ss = regs->xss & 0xffff;
9399 - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
9400 - print_symbol("%s", regs->eip);
9401 - printk(" SS:ESP %04x:%08lx\n", ss, esp);
9404 + if (__die(str, regs, err))
9408 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
9412 die.lock_owner = -1;
9413 @@ -462,7 +494,7 @@ static void __kprobes do_trap(int trapnr
9415 struct task_struct *tsk = current;
9417 - if (regs->eflags & VM_MASK) {
9418 + if (regs->flags & VM_MASK) {
9422 @@ -508,7 +540,7 @@ static void __kprobes do_trap(int trapnr
9425 #define DO_ERROR(trapnr, signr, str, name) \
9426 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9427 +void do_##name(struct pt_regs * regs, long error_code) \
9429 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9431 @@ -517,7 +549,7 @@ fastcall void do_##name(struct pt_regs *
9434 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
9435 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9436 +void do_##name(struct pt_regs * regs, long error_code) \
9440 @@ -533,7 +565,7 @@ fastcall void do_##name(struct pt_regs *
9443 #define DO_VM86_ERROR(trapnr, signr, str, name) \
9444 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9445 +void do_##name(struct pt_regs * regs, long error_code) \
9447 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9449 @@ -542,7 +574,7 @@ fastcall void do_##name(struct pt_regs *
9452 #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
9453 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9454 +void do_##name(struct pt_regs * regs, long error_code) \
9457 info.si_signo = signr; \
9458 @@ -556,13 +588,13 @@ fastcall void do_##name(struct pt_regs *
9459 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
9462 -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
9463 +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
9464 #ifndef CONFIG_KPROBES
9465 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
9467 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
9468 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
9469 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
9470 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
9471 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
9472 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
9473 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
9474 @@ -570,10 +602,10 @@ DO_ERROR(12, SIGBUS, "stack segment", s
9475 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
9476 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
9478 -fastcall void __kprobes do_general_protection(struct pt_regs * regs,
9479 +void __kprobes do_general_protection(struct pt_regs * regs,
9482 - if (regs->eflags & VM_MASK)
9483 + if (regs->flags & VM_MASK)
9486 if (!user_mode(regs))
9487 @@ -582,11 +614,14 @@ fastcall void __kprobes do_general_prote
9488 current->thread.error_code = error_code;
9489 current->thread.trap_no = 13;
9490 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
9491 - printk_ratelimit())
9492 + printk_ratelimit()) {
9494 - "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
9495 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
9496 current->comm, task_pid_nr(current),
9497 - regs->eip, regs->esp, error_code);
9498 + regs->ip, regs->sp, error_code);
9499 + print_vma_addr(" in ", regs->ip);
9503 force_sig(SIGSEGV, current);
9505 @@ -675,8 +710,8 @@ void __kprobes die_nmi(struct pt_regs *r
9508 printk(KERN_EMERG "%s", msg);
9509 - printk(" on CPU%d, eip %08lx, registers:\n",
9510 - smp_processor_id(), regs->eip);
9511 + printk(" on CPU%d, ip %08lx, registers:\n",
9512 + smp_processor_id(), regs->ip);
9513 show_registers(regs);
9515 spin_unlock(&nmi_print_lock);
9516 @@ -733,7 +768,7 @@ static __kprobes void default_do_nmi(str
9518 static int ignore_nmis;
9520 -fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
9521 +__kprobes void do_nmi(struct pt_regs * regs, long error_code)
9525 @@ -762,7 +797,7 @@ void restart_nmi(void)
9528 #ifdef CONFIG_KPROBES
9529 -fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
9530 +void __kprobes do_int3(struct pt_regs *regs, long error_code)
9532 trace_hardirqs_fixup();
9534 @@ -798,7 +833,7 @@ fastcall void __kprobes do_int3(struct p
9535 * find every occurrence of the TF bit that could be saved away even
9538 -fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
9539 +void __kprobes do_debug(struct pt_regs * regs, long error_code)
9541 unsigned int condition;
9542 struct task_struct *tsk = current;
9543 @@ -807,24 +842,30 @@ fastcall void __kprobes do_debug(struct
9545 get_debugreg(condition, 6);
9548 + * The processor cleared BTF, so don't mark that we need it set.
9550 + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
9551 + tsk->thread.debugctlmsr = 0;
9553 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
9554 SIGTRAP) == NOTIFY_STOP)
9556 /* It's safe to allow irq's after DR6 has been saved */
9557 - if (regs->eflags & X86_EFLAGS_IF)
9558 + if (regs->flags & X86_EFLAGS_IF)
9561 /* Mask out spurious debug traps due to lazy DR7 setting */
9562 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
9563 - if (!tsk->thread.debugreg[7])
9564 + if (!tsk->thread.debugreg7)
9568 - if (regs->eflags & VM_MASK)
9569 + if (regs->flags & VM_MASK)
9572 /* Save debug status register where ptrace can see it */
9573 - tsk->thread.debugreg[6] = condition;
9574 + tsk->thread.debugreg6 = condition;
9577 * Single-stepping through TF: make sure we ignore any events in
9578 @@ -856,7 +897,7 @@ debug_vm86:
9581 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
9582 - regs->eflags &= ~TF_MASK;
9583 + regs->flags &= ~TF_MASK;
9587 @@ -865,7 +906,7 @@ clear_TF_reenable:
9588 * the correct behaviour even in the presence of the asynchronous
9591 -void math_error(void __user *eip)
9592 +void math_error(void __user *ip)
9594 struct task_struct * task;
9596 @@ -881,7 +922,7 @@ void math_error(void __user *eip)
9597 info.si_signo = SIGFPE;
9599 info.si_code = __SI_FAULT;
9600 - info.si_addr = eip;
9601 + info.si_addr = ip;
9603 * (~cwd & swd) will mask out exceptions that are not set to unmasked
9604 * status. 0x3f is the exception bits in these regs, 0x200 is the
9605 @@ -924,13 +965,13 @@ void math_error(void __user *eip)
9606 force_sig_info(SIGFPE, &info, task);
9609 -fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
9610 +void do_coprocessor_error(struct pt_regs * regs, long error_code)
9613 - math_error((void __user *)regs->eip);
9614 + math_error((void __user *)regs->ip);
9617 -static void simd_math_error(void __user *eip)
9618 +static void simd_math_error(void __user *ip)
9620 struct task_struct * task;
9622 @@ -946,7 +987,7 @@ static void simd_math_error(void __user
9623 info.si_signo = SIGFPE;
9625 info.si_code = __SI_FAULT;
9626 - info.si_addr = eip;
9627 + info.si_addr = ip;
9629 * The SIMD FPU exceptions are handled a little differently, as there
9630 * is only a single status/control register. Thus, to determine which
9631 @@ -978,19 +1019,19 @@ static void simd_math_error(void __user
9632 force_sig_info(SIGFPE, &info, task);
9635 -fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
9636 +void do_simd_coprocessor_error(struct pt_regs * regs,
9640 /* Handle SIMD FPU exceptions on PIII+ processors. */
9642 - simd_math_error((void __user *)regs->eip);
9643 + simd_math_error((void __user *)regs->ip);
9646 * Handle strange cache flush from user space exception
9647 * in all other cases. This is undocumented behaviour.
9649 - if (regs->eflags & VM_MASK) {
9650 + if (regs->flags & VM_MASK) {
9651 handle_vm86_fault((struct kernel_vm86_regs *)regs,
9654 @@ -1003,7 +1044,7 @@ fastcall void do_simd_coprocessor_error(
9658 -fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
9659 +void do_spurious_interrupt_bug(struct pt_regs * regs,
9663 @@ -1012,7 +1053,7 @@ fastcall void do_spurious_interrupt_bug(
9667 -fastcall unsigned long patch_espfix_desc(unsigned long uesp,
9668 +unsigned long patch_espfix_desc(unsigned long uesp,
9671 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
9672 @@ -1072,7 +1113,7 @@ asmlinkage void math_emulate(long arg)
9673 * NB. All these are "trap gates" (i.e. events_mask isn't set) except
9674 * for those that specify <dpl>|4 in the second field.
9676 -static trap_info_t __cpuinitdata trap_table[] = {
9677 +static const trap_info_t __cpuinitconst trap_table[] = {
9678 { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
9679 { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
9680 { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
9681 @@ -1105,17 +1146,12 @@ void __init trap_init(void)
9683 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
9686 + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9687 + * Generate a build-time error if the alignment is wrong.
9689 + BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
9692 - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9693 - * Generates a compile-time "error: zero width for bit-field" if
9694 - * the alignment is wrong.
9696 - struct fxsrAlignAssert {
9697 - int _:!(offsetof(struct task_struct,
9698 - thread.i387.fxsave) & 15);
9701 printk(KERN_INFO "Enabling fast FPU save and restore... ");
9702 set_in_cr4(X86_CR4_OSFXSR);
9704 --- a/arch/x86/kernel/traps_64-xen.c
9705 +++ b/arch/x86/kernel/traps_64-xen.c
9706 @@ -74,38 +74,41 @@ asmlinkage void alignment_check(void);
9707 asmlinkage void machine_check(void);
9708 asmlinkage void spurious_interrupt_bug(void);
9710 +static unsigned int code_bytes = 64;
9712 static inline void conditional_sti(struct pt_regs *regs)
9714 - if (regs->eflags & X86_EFLAGS_IF)
9715 + if (regs->flags & X86_EFLAGS_IF)
9719 static inline void preempt_conditional_sti(struct pt_regs *regs)
9721 - preempt_disable();
9722 - if (regs->eflags & X86_EFLAGS_IF)
9723 + inc_preempt_count();
9724 + if (regs->flags & X86_EFLAGS_IF)
9728 static inline void preempt_conditional_cli(struct pt_regs *regs)
9730 - if (regs->eflags & X86_EFLAGS_IF)
9731 + if (regs->flags & X86_EFLAGS_IF)
9732 local_irq_disable();
9733 /* Make sure to not schedule here because we could be running
9734 on an exception stack. */
9735 - preempt_enable_no_resched();
9736 + dec_preempt_count();
9739 int kstack_depth_to_print = 12;
9741 -#ifdef CONFIG_KALLSYMS
9742 -void printk_address(unsigned long address)
9743 +void printk_address(unsigned long address, int reliable)
9745 +#ifdef CONFIG_KALLSYMS
9746 unsigned long offset = 0, symsize;
9747 const char *symname;
9750 - char namebuf[128];
9751 + char namebuf[KSYM_NAME_LEN];
9752 + char reliab[4] = "";
9754 symname = kallsyms_lookup(address, &symsize, &offset,
9756 @@ -113,17 +116,17 @@ void printk_address(unsigned long addres
9757 printk(" [<%016lx>]\n", address);
9761 + strcpy(reliab, "? ");
9764 - modname = delim = "";
9765 - printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
9766 - address, delim, modname, delim, symname, offset, symsize);
9768 + modname = delim = "";
9769 + printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
9770 + address, reliab, delim, modname, delim, symname, offset, symsize);
9772 -void printk_address(unsigned long address)
9774 printk(" [<%016lx>]\n", address);
9779 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
9780 unsigned *usedp, char **idp)
9781 @@ -210,14 +213,53 @@ static unsigned long *in_exception_stack
9782 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
9785 -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
9786 +static inline int valid_stack_ptr(struct thread_info *tinfo,
9787 + void *p, unsigned int size, void *end)
9789 - void *t = (void *)tinfo;
9790 - return p > t && p < t + THREAD_SIZE - 3;
9793 + if (p < end && p >= (end-THREAD_SIZE))
9798 + return p > t && p < t + THREAD_SIZE - size;
9801 +/* The form of the top of the frame on the stack */
9802 +struct stack_frame {
9803 + struct stack_frame *next_frame;
9804 + unsigned long return_address;
9808 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
9809 + unsigned long *stack, unsigned long bp,
9810 + const struct stacktrace_ops *ops, void *data,
9811 + unsigned long *end)
9813 + struct stack_frame *frame = (struct stack_frame *)bp;
9815 + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
9816 + unsigned long addr;
9819 + if (__kernel_text_address(addr)) {
9820 + if ((unsigned long) stack == bp + 8) {
9821 + ops->address(data, addr, 1);
9822 + frame = frame->next_frame;
9823 + bp = (unsigned long) frame;
9825 + ops->address(data, addr, bp == 0);
9833 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
9834 - unsigned long *stack,
9835 + unsigned long *stack, unsigned long bp,
9836 const struct stacktrace_ops *ops, void *data)
9838 const unsigned cpu = get_cpu();
9839 @@ -227,36 +269,28 @@ void dump_trace(struct task_struct *tsk,
9843 + tinfo = task_thread_info(tsk);
9846 unsigned long dummy;
9848 if (tsk && tsk != current)
9849 - stack = (unsigned long *)tsk->thread.rsp;
9850 + stack = (unsigned long *)tsk->thread.sp;
9854 - * Print function call entries within a stack. 'cond' is the
9855 - * "end of stackframe" condition, that the 'stack++'
9856 - * iteration will eventually trigger.
9858 -#define HANDLE_STACK(cond) \
9859 - do while (cond) { \
9860 - unsigned long addr = *stack++; \
9861 - /* Use unlocked access here because except for NMIs \
9862 - we should be already protected against module unloads */ \
9863 - if (__kernel_text_address(addr)) { \
9865 - * If the address is either in the text segment of the \
9866 - * kernel, or in the region which contains vmalloc'ed \
9867 - * memory, it *may* be the address of a calling \
9868 - * routine; if so, print it so that someone tracing \
9869 - * down the cause of the crash will be able to figure \
9870 - * out the call path that was taken. \
9872 - ops->address(data, addr); \
9875 +#ifdef CONFIG_FRAME_POINTER
9877 + if (tsk == current) {
9878 + /* Grab bp right from our regs */
9879 + asm("movq %%rbp, %0" : "=r" (bp):);
9881 + /* bp is the last reg pushed by switch_to */
9882 + bp = *(unsigned long *) tsk->thread.sp;
9890 * Print function call entries in all stacks, starting at the
9891 @@ -272,7 +306,9 @@ void dump_trace(struct task_struct *tsk,
9893 if (ops->stack(data, id) < 0)
9895 - HANDLE_STACK (stack < estack_end);
9897 + bp = print_context_stack(tinfo, stack, bp, ops,
9898 + data, estack_end);
9899 ops->stack(data, "<EOE>");
9901 * We link to the next stack via the
9902 @@ -290,7 +326,8 @@ void dump_trace(struct task_struct *tsk,
9903 if (stack >= irqstack && stack < irqstack_end) {
9904 if (ops->stack(data, "IRQ") < 0)
9906 - HANDLE_STACK (stack < irqstack_end);
9907 + bp = print_context_stack(tinfo, stack, bp,
9908 + ops, data, irqstack_end);
9910 * We link to the next stack (which would be
9911 * the process stack normally) the last
9912 @@ -308,9 +345,7 @@ void dump_trace(struct task_struct *tsk,
9914 * This handles the process stack:
9916 - tinfo = task_thread_info(tsk);
9917 - HANDLE_STACK (valid_stack_ptr(tinfo, stack));
9918 -#undef HANDLE_STACK
9919 + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
9922 EXPORT_SYMBOL(dump_trace);
9923 @@ -333,10 +368,10 @@ static int print_trace_stack(void *data,
9927 -static void print_trace_address(void *data, unsigned long addr)
9928 +static void print_trace_address(void *data, unsigned long addr, int reliable)
9930 touch_nmi_watchdog();
9931 - printk_address(addr);
9932 + printk_address(addr, reliable);
9935 static const struct stacktrace_ops print_trace_ops = {
9936 @@ -347,15 +382,17 @@ static const struct stacktrace_ops print
9940 -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
9941 +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
9944 printk("\nCall Trace:\n");
9945 - dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
9946 + dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
9951 -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
9952 +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
9955 unsigned long *stack;
9957 @@ -366,14 +403,14 @@ _show_stack(struct task_struct *tsk, str
9958 // debugging aid: "show_stack(NULL, NULL);" prints the
9959 // back trace for this cpu.
9961 - if (rsp == NULL) {
9964 - rsp = (unsigned long *)tsk->thread.rsp;
9965 + sp = (unsigned long *)tsk->thread.sp;
9967 - rsp = (unsigned long *)&rsp;
9968 + sp = (unsigned long *)&sp;
9973 for(i=0; i < kstack_depth_to_print; i++) {
9974 if (stack >= irqstack && stack <= irqstack_end) {
9975 if (stack == irqstack_end) {
9976 @@ -389,12 +426,12 @@ _show_stack(struct task_struct *tsk, str
9977 printk(" %016lx", *stack++);
9978 touch_nmi_watchdog();
9980 - show_trace(tsk, regs, rsp);
9981 + show_trace(tsk, regs, sp, bp);
9984 -void show_stack(struct task_struct *tsk, unsigned long * rsp)
9985 +void show_stack(struct task_struct *tsk, unsigned long * sp)
9987 - _show_stack(tsk, NULL, rsp);
9988 + _show_stack(tsk, NULL, sp, 0);
9992 @@ -403,13 +440,19 @@ void show_stack(struct task_struct *tsk,
9993 void dump_stack(void)
9995 unsigned long dummy;
9996 + unsigned long bp = 0;
9998 +#ifdef CONFIG_FRAME_POINTER
10000 + asm("movq %%rbp, %0" : "=r" (bp):);
10003 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
10004 current->pid, current->comm, print_tainted(),
10005 init_utsname()->release,
10006 (int)strcspn(init_utsname()->version, " "),
10007 init_utsname()->version);
10008 - show_trace(NULL, NULL, &dummy);
10009 + show_trace(NULL, NULL, &dummy, bp);
10012 EXPORT_SYMBOL(dump_stack);
10013 @@ -417,12 +460,15 @@ EXPORT_SYMBOL(dump_stack);
10014 void show_registers(struct pt_regs *regs)
10017 - int in_kernel = !user_mode(regs);
10018 - unsigned long rsp;
10019 + unsigned long sp;
10020 const int cpu = smp_processor_id();
10021 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
10023 + unsigned int code_prologue = code_bytes * 43 / 64;
10024 + unsigned int code_len = code_bytes;
10028 + ip = (u8 *) regs->ip - code_prologue;
10029 printk("CPU %d ", cpu);
10031 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
10032 @@ -432,45 +478,43 @@ void show_registers(struct pt_regs *regs
10033 * When in-kernel, we also print out the stack and code at the
10034 * time of the fault..
10037 + if (!user_mode(regs)) {
10040 - _show_stack(NULL, regs, (unsigned long*)rsp);
10041 + _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
10044 - printk("\nCode: ");
10045 - if (regs->rip < PAGE_OFFSET)
10048 - for (i=0; i<20; i++) {
10050 - if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
10052 + printk(KERN_EMERG "Code: ");
10053 + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
10054 + /* try starting at RIP */
10055 + ip = (u8 *) regs->ip;
10056 + code_len = code_len - code_prologue + 1;
10058 + for (i = 0; i < code_len; i++, ip++) {
10059 + if (ip < (u8 *)PAGE_OFFSET ||
10060 + probe_kernel_address(ip, c)) {
10061 printk(" Bad RIP value.");
10064 - printk("%02x ", c);
10065 + if (ip == (u8 *)regs->ip)
10066 + printk("<%02x> ", c);
10068 + printk("%02x ", c);
10074 -int is_valid_bugaddr(unsigned long rip)
10075 +int is_valid_bugaddr(unsigned long ip)
10077 unsigned short ud2;
10079 - if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
10080 + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
10083 return ud2 == 0x0b0f;
10087 -void out_of_line_bug(void)
10091 -EXPORT_SYMBOL(out_of_line_bug);
10094 static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
10095 static int die_owner = -1;
10096 static unsigned int die_nest_count;
10097 @@ -498,7 +542,7 @@ unsigned __kprobes long oops_begin(void)
10101 -void __kprobes oops_end(unsigned long flags)
10102 +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
10106 @@ -507,12 +551,17 @@ void __kprobes oops_end(unsigned long fl
10107 /* Nest count reaches zero, release the lock. */
10108 __raw_spin_unlock(&die_lock);
10109 raw_local_irq_restore(flags);
10115 panic("Fatal exception");
10120 -void __kprobes __die(const char * str, struct pt_regs * regs, long err)
10121 +int __kprobes __die(const char * str, struct pt_regs * regs, long err)
10123 static int die_counter;
10124 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
10125 @@ -526,15 +575,17 @@ void __kprobes __die(const char * str, s
10126 printk("DEBUG_PAGEALLOC");
10129 - notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
10130 + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
10132 show_registers(regs);
10133 add_taint(TAINT_DIE);
10134 /* Executive summary in case the oops scrolled away */
10135 printk(KERN_ALERT "RIP ");
10136 - printk_address(regs->rip);
10137 - printk(" RSP <%016lx>\n", regs->rsp);
10138 + printk_address(regs->ip, 1);
10139 + printk(" RSP <%016lx>\n", regs->sp);
10140 if (kexec_should_crash(current))
10145 void die(const char * str, struct pt_regs * regs, long err)
10146 @@ -542,11 +593,11 @@ void die(const char * str, struct pt_reg
10147 unsigned long flags = oops_begin();
10149 if (!user_mode(regs))
10150 - report_bug(regs->rip, regs);
10151 + report_bug(regs->ip, regs);
10153 - __die(str, regs, err);
10155 - do_exit(SIGSEGV);
10156 + if (__die(str, regs, err))
10158 + oops_end(flags, regs, SIGSEGV);
10161 #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
10162 @@ -564,10 +615,10 @@ void __kprobes die_nmi(char *str, struct
10164 if (do_panic || panic_on_oops)
10165 panic("Non maskable interrupt");
10167 + oops_end(flags, NULL, SIGBUS);
10169 local_irq_enable();
10170 - do_exit(SIGSEGV);
10175 @@ -592,11 +643,14 @@ static void __kprobes do_trap(int trapnr
10176 tsk->thread.trap_no = trapnr;
10178 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
10179 - printk_ratelimit())
10180 + printk_ratelimit()) {
10182 - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
10183 + "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
10184 tsk->comm, tsk->pid, str,
10185 - regs->rip, regs->rsp, error_code);
10186 + regs->ip, regs->sp, error_code);
10187 + print_vma_addr(" in ", regs->ip);
10192 force_sig_info(signr, info, tsk);
10193 @@ -606,19 +660,12 @@ static void __kprobes do_trap(int trapnr
10197 - /* kernel trap */
10199 - const struct exception_table_entry *fixup;
10200 - fixup = search_exception_tables(regs->rip);
10202 - regs->rip = fixup->fixup;
10204 - tsk->thread.error_code = error_code;
10205 - tsk->thread.trap_no = trapnr;
10206 - die(str, regs, error_code);
10209 + if (!fixup_exception(regs)) {
10210 + tsk->thread.error_code = error_code;
10211 + tsk->thread.trap_no = trapnr;
10212 + die(str, regs, error_code);
10217 #define DO_ERROR(trapnr, signr, str, name) \
10218 @@ -647,10 +694,10 @@ asmlinkage void do_##name(struct pt_regs
10219 do_trap(trapnr, signr, str, regs, error_code, &info); \
10222 -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
10223 +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
10224 DO_ERROR( 4, SIGSEGV, "overflow", overflow)
10225 DO_ERROR( 5, SIGSEGV, "bounds", bounds)
10226 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
10227 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
10228 DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
10229 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
10230 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
10231 @@ -698,32 +745,28 @@ asmlinkage void __kprobes do_general_pro
10232 tsk->thread.trap_no = 13;
10234 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
10235 - printk_ratelimit())
10236 + printk_ratelimit()) {
10238 - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
10239 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
10240 tsk->comm, tsk->pid,
10241 - regs->rip, regs->rsp, error_code);
10242 + regs->ip, regs->sp, error_code);
10243 + print_vma_addr(" in ", regs->ip);
10247 force_sig(SIGSEGV, tsk);
10253 - const struct exception_table_entry *fixup;
10254 - fixup = search_exception_tables(regs->rip);
10256 - regs->rip = fixup->fixup;
10259 + if (fixup_exception(regs))
10262 - tsk->thread.error_code = error_code;
10263 - tsk->thread.trap_no = 13;
10264 - if (notify_die(DIE_GPF, "general protection fault", regs,
10265 - error_code, 13, SIGSEGV) == NOTIFY_STOP)
10267 - die("general protection fault", regs, error_code);
10269 + tsk->thread.error_code = error_code;
10270 + tsk->thread.trap_no = 13;
10271 + if (notify_die(DIE_GPF, "general protection fault", regs,
10272 + error_code, 13, SIGSEGV) == NOTIFY_STOP)
10274 + die("general protection fault", regs, error_code);
10277 static __kprobes void
10278 @@ -833,15 +876,15 @@ asmlinkage __kprobes struct pt_regs *syn
10280 struct pt_regs *regs = eregs;
10281 /* Did already sync */
10282 - if (eregs == (struct pt_regs *)eregs->rsp)
10283 + if (eregs == (struct pt_regs *)eregs->sp)
10285 /* Exception from user space */
10286 else if (user_mode(eregs))
10287 regs = task_pt_regs(current);
10288 /* Exception from kernel and interrupts are enabled. Move to
10289 kernel process stack. */
10290 - else if (eregs->eflags & X86_EFLAGS_IF)
10291 - regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
10292 + else if (eregs->flags & X86_EFLAGS_IF)
10293 + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
10297 @@ -859,6 +902,12 @@ asmlinkage void __kprobes do_debug(struc
10299 get_debugreg(condition, 6);
10302 + * The processor cleared BTF, so don't mark that we need it set.
10304 + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
10305 + tsk->thread.debugctlmsr = 0;
10307 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
10308 SIGTRAP) == NOTIFY_STOP)
10310 @@ -874,27 +923,14 @@ asmlinkage void __kprobes do_debug(struc
10312 tsk->thread.debugreg6 = condition;
10314 - /* Mask out spurious TF errors due to lazy TF clearing */
10317 + * Single-stepping through TF: make sure we ignore any events in
10318 + * kernel space (but re-enable TF when returning to user mode).
10320 if (condition & DR_STEP) {
10322 - * The TF error should be masked out only if the current
10323 - * process is not traced and if the TRAP flag has been set
10324 - * previously by a tracing process (condition detected by
10325 - * the PT_DTRACE flag); remember that the i386 TRAP flag
10326 - * can be modified by the process itself in user mode,
10327 - * allowing programs to debug themselves without the ptrace()
10330 if (!user_mode(regs))
10331 goto clear_TF_reenable;
10333 - * Was the TF flag set by a debugger? If so, clear it now,
10334 - * so that register information is correct.
10336 - if (tsk->ptrace & PT_DTRACE) {
10337 - regs->eflags &= ~TF_MASK;
10338 - tsk->ptrace &= ~PT_DTRACE;
10342 /* Ok, finally something we can handle */
10343 @@ -903,7 +939,7 @@ asmlinkage void __kprobes do_debug(struc
10344 info.si_signo = SIGTRAP;
10346 info.si_code = TRAP_BRKPT;
10347 - info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
10348 + info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
10349 force_sig_info(SIGTRAP, &info, tsk);
10352 @@ -913,18 +949,15 @@ clear_dr7:
10355 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
10356 - regs->eflags &= ~TF_MASK;
10357 + regs->flags &= ~X86_EFLAGS_TF;
10358 preempt_conditional_cli(regs);
10361 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
10363 - const struct exception_table_entry *fixup;
10364 - fixup = search_exception_tables(regs->rip);
10366 - regs->rip = fixup->fixup;
10367 + if (fixup_exception(regs))
10371 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
10372 /* Illegal floating point operation in the kernel */
10373 current->thread.trap_no = trapnr;
10374 @@ -939,7 +972,7 @@ static int kernel_math_error(struct pt_r
10376 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
10378 - void __user *rip = (void __user *)(regs->rip);
10379 + void __user *ip = (void __user *)(regs->ip);
10380 struct task_struct * task;
10382 unsigned short cwd, swd;
10383 @@ -959,7 +992,7 @@ asmlinkage void do_coprocessor_error(str
10384 info.si_signo = SIGFPE;
10386 info.si_code = __SI_FAULT;
10387 - info.si_addr = rip;
10388 + info.si_addr = ip;
10390 * (~cwd & swd) will mask out exceptions that are not set to unmasked
10391 * status. 0x3f is the exception bits in these regs, 0x200 is the
10392 @@ -1008,7 +1041,7 @@ asmlinkage void bad_intr(void)
10394 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
10396 - void __user *rip = (void __user *)(regs->rip);
10397 + void __user *ip = (void __user *)(regs->ip);
10398 struct task_struct * task;
10400 unsigned short mxcsr;
10401 @@ -1028,7 +1061,7 @@ asmlinkage void do_simd_coprocessor_erro
10402 info.si_signo = SIGFPE;
10404 info.si_code = __SI_FAULT;
10405 - info.si_addr = rip;
10406 + info.si_addr = ip;
10408 * The SIMD FPU exceptions are handled a little differently, as there
10409 * is only a single status/control register. Thus, to determine which
10410 @@ -1092,13 +1125,14 @@ asmlinkage void math_state_restore(void)
10411 task_thread_info(me)->status |= TS_USEDFPU;
10414 +EXPORT_SYMBOL_GPL(math_state_restore);
10418 * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
10419 * specify <dpl>|4 in the second field.
10421 -static trap_info_t __cpuinitdata trap_table[] = {
10422 +static const trap_info_t __cpuinitconst trap_table[] = {
10423 { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
10424 { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
10425 { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
10426 @@ -1169,3 +1203,14 @@ static int __init kstack_setup(char *s)
10429 early_param("kstack", kstack_setup);
10432 +static int __init code_bytes_setup(char *s)
10434 + code_bytes = simple_strtoul(s, NULL, 0);
10435 + if (code_bytes > 8192)
10436 + code_bytes = 8192;
10440 +__setup("code_bytes=", code_bytes_setup);
10441 --- a/arch/x86/kernel/vsyscall_64-xen.c
10442 +++ b/arch/x86/kernel/vsyscall_64-xen.c
10444 #include <asm/vgtod.h>
10446 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
10447 -#define __syscall_clobber "r11","rcx","memory"
10448 -#define __pa_vsymbol(x) \
10449 - ({unsigned long v; \
10450 - extern char __vsyscall_0; \
10451 - asm("" : "=r" (v) : "0" (x)); \
10452 - ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
10453 +#define __syscall_clobber "r11","cx","memory"
10456 * vsyscall_gtod_data contains data that is :
10457 @@ -102,7 +97,7 @@ static __always_inline void do_get_tz(st
10458 static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
10461 - asm volatile("vsysc2: syscall"
10462 + asm volatile("syscall"
10464 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
10465 : __syscall_clobber );
10466 @@ -112,7 +107,7 @@ static __always_inline int gettimeofday(
10467 static __always_inline long time_syscall(long *t)
10470 - asm volatile("vsysc1: syscall"
10471 + asm volatile("syscall"
10473 : "0" (__NR_time),"D" (t) : __syscall_clobber);
10475 @@ -190,7 +185,7 @@ time_t __vsyscall(1) vtime(time_t *t)
10477 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
10479 - unsigned int dummy, p;
10481 unsigned long j = 0;
10483 /* Fast cache - only recompute value once per jiffies and avoid
10484 @@ -205,7 +200,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
10485 p = tcache->blob[1];
10486 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
10487 /* Load per CPU data from RDTSCP */
10488 - rdtscp(dummy, dummy, p);
10489 + native_read_tscp(&p);
10491 /* Load per CPU data from GDT */
10492 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
10493 @@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void)
10495 #ifdef CONFIG_SYSCTL
10497 -#define SYSCALL 0x050f
10498 -#define NOP2 0x9090
10501 - * NOP out syscall in vsyscall page when not needed.
10503 -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10504 - void __user *buffer, size_t *lenp, loff_t *ppos)
10506 +vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10507 + void __user *buffer, size_t *lenp, loff_t *ppos)
10509 - extern u16 vsysc1, vsysc2;
10510 - u16 __iomem *map1;
10511 - u16 __iomem *map2;
10512 - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10515 - /* gcc has some trouble with __va(__pa()), so just do it this
10517 - map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
10520 - map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
10525 - if (!vsyscall_gtod_data.sysctl_enabled) {
10526 - writew(SYSCALL, map1);
10527 - writew(SYSCALL, map2);
10529 - writew(NOP2, map1);
10530 - writew(NOP2, map2);
10536 + return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10539 static ctl_table kernel_table2[] = {
10540 @@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] =
10541 .child = kernel_table2 },
10547 /* Assume __initcall executes before all user space. Hopefully kmod
10548 @@ -301,7 +264,7 @@ static void __cpuinit vsyscall_set_cpu(i
10550 d |= (node & 0xf) << 12;
10551 d |= (node >> 4) << 48;
10552 - if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
10553 + if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
10554 + GDT_ENTRY_PER_CPU),
10557 @@ -322,7 +285,7 @@ cpu_vsyscall_notifier(struct notifier_bl
10558 return NOTIFY_DONE;
10561 -static void __init map_vsyscall(void)
10562 +void __init map_vsyscall(void)
10564 extern char __vsyscall_0;
10565 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
10566 @@ -338,7 +301,6 @@ static int __init vsyscall_init(void)
10567 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
10568 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
10569 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
10572 vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofay() */
10573 if (boot_cpu_has(X86_FEATURE_RDTSCP))
10574 --- a/arch/x86/kernel/xen_entry_64.S
10578 - * Copied from arch/xen/i386/kernel/entry.S
10580 -/* Offsets into shared_info_t. */
10581 -#define evtchn_upcall_pending /* 0 */
10582 -#define evtchn_upcall_mask 1
10584 -#define sizeof_vcpu_shift 6
10587 -//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
10588 -//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
10589 -#define preempt_disable(reg)
10590 -#define preempt_enable(reg)
10591 -#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
10592 - movq %gs:pda_cpunumber,reg ; \
10594 - shr $32-sizeof_vcpu_shift,reg ; \
10595 - addq HYPERVISOR_shared_info,reg
10596 -#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
10597 -#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
10599 -#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
10600 -#define XEN_PUT_VCPU_INFO(reg)
10601 -#define XEN_PUT_VCPU_INFO_fixup
10604 -#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
10605 -#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
10606 -#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
10607 - XEN_LOCKED_BLOCK_EVENTS(reg) ; \
10608 - XEN_PUT_VCPU_INFO(reg)
10609 -#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
10610 - XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
10611 - XEN_PUT_VCPU_INFO(reg)
10612 -#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
10613 --- a/arch/x86/mach-xen/setup.c
10614 +++ b/arch/x86/mach-xen/setup.c
10615 @@ -161,15 +161,12 @@ void __init machine_specific_arch_setup(
10617 /* Do an early initialization of the fixmap area */
10619 - extern pte_t swapper_pg_pmd[PTRS_PER_PTE];
10620 + extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
10621 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
10622 - pgd_t *pgd = (pgd_t *)xen_start_info->pt_base;
10623 - pud_t *pud = pud_offset(pgd + pgd_index(addr), addr);
10624 + pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
10625 pmd_t *pmd = pmd_offset(pud, addr);
10627 - swapper_pg_dir = pgd;
10628 - init_mm.pgd = pgd;
10629 - make_lowmem_page_readonly(swapper_pg_pmd, XENFEAT_writable_page_tables);
10630 - set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_pmd) | _PAGE_TABLE));
10631 + make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
10632 + set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
10635 --- a/arch/x86/mm/fault_32-xen.c
10639 - * linux/arch/i386/mm/fault.c
10641 - * Copyright (C) 1995 Linus Torvalds
10644 -#include <linux/signal.h>
10645 -#include <linux/sched.h>
10646 -#include <linux/kernel.h>
10647 -#include <linux/errno.h>
10648 -#include <linux/string.h>
10649 -#include <linux/types.h>
10650 -#include <linux/ptrace.h>
10651 -#include <linux/mman.h>
10652 -#include <linux/mm.h>
10653 -#include <linux/smp.h>
10654 -#include <linux/interrupt.h>
10655 -#include <linux/init.h>
10656 -#include <linux/tty.h>
10657 -#include <linux/vt_kern.h> /* For unblank_screen() */
10658 -#include <linux/highmem.h>
10659 -#include <linux/bootmem.h> /* for max_low_pfn */
10660 -#include <linux/vmalloc.h>
10661 -#include <linux/module.h>
10662 -#include <linux/kprobes.h>
10663 -#include <linux/uaccess.h>
10664 -#include <linux/kdebug.h>
10665 -#include <linux/kprobes.h>
10667 -#include <asm/system.h>
10668 -#include <asm/desc.h>
10669 -#include <asm/segment.h>
10671 -extern void die(const char *,struct pt_regs *,long);
10673 -#ifdef CONFIG_KPROBES
10674 -static inline int notify_page_fault(struct pt_regs *regs)
10678 - /* kprobe_running() needs smp_processor_id() */
10679 - if (!user_mode_vm(regs)) {
10680 - preempt_disable();
10681 - if (kprobe_running() && kprobe_fault_handler(regs, 14))
10683 - preempt_enable();
10689 -static inline int notify_page_fault(struct pt_regs *regs)
10696 - * Return EIP plus the CS segment base. The segment limit is also
10697 - * adjusted, clamped to the kernel/user address space (whichever is
10698 - * appropriate), and returned in *eip_limit.
10700 - * The segment is checked, because it might have been changed by another
10701 - * task between the original faulting instruction and here.
10703 - * If CS is no longer a valid code segment, or if EIP is beyond the
10704 - * limit, or if it is a kernel address when CS is not a kernel segment,
10705 - * then the returned value will be greater than *eip_limit.
10707 - * This is slow, but is very rarely executed.
10709 -static inline unsigned long get_segment_eip(struct pt_regs *regs,
10710 - unsigned long *eip_limit)
10712 - unsigned long eip = regs->eip;
10713 - unsigned seg = regs->xcs & 0xffff;
10714 - u32 seg_ar, seg_limit, base, *desc;
10716 - /* Unlikely, but must come before segment checks. */
10717 - if (unlikely(regs->eflags & VM_MASK)) {
10719 - *eip_limit = base + 0xffff;
10720 - return base + (eip & 0xffff);
10723 - /* The standard kernel/user address space limit. */
10724 - *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
10726 - /* By far the most common cases. */
10727 - if (likely(SEGMENT_IS_FLAT_CODE(seg)))
10730 - /* Check the segment exists, is within the current LDT/GDT size,
10731 - that kernel/user (ring 0..3) has the appropriate privilege,
10732 - that it's a code segment, and get the limit. */
10733 - __asm__ ("larl %3,%0; lsll %3,%1"
10734 - : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
10735 - if ((~seg_ar & 0x9800) || eip > seg_limit) {
10737 - return 1; /* So that returned eip > *eip_limit. */
10740 - /* Get the GDT/LDT descriptor base.
10741 - When you look for races in this code remember that
10742 - LDT and other horrors are only used in user space. */
10743 - if (seg & (1<<2)) {
10744 - /* Must lock the LDT while reading it. */
10745 - mutex_lock(&current->mm->context.lock);
10746 - desc = current->mm->context.ldt;
10747 - desc = (void *)desc + (seg & ~7);
10749 - /* Must disable preemption while reading the GDT. */
10750 - desc = (u32 *)get_cpu_gdt_table(get_cpu());
10751 - desc = (void *)desc + (seg & ~7);
10754 - /* Decode the code segment base from the descriptor */
10755 - base = get_desc_base((unsigned long *)desc);
10757 - if (seg & (1<<2)) {
10758 - mutex_unlock(&current->mm->context.lock);
10762 - /* Adjust EIP and segment limit, and clamp at the kernel limit.
10763 - It's legitimate for segments to wrap at 0xffffffff. */
10764 - seg_limit += base;
10765 - if (seg_limit < *eip_limit && seg_limit >= base)
10766 - *eip_limit = seg_limit;
10767 - return eip + base;
10771 - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
10772 - * Check that here and ignore it.
10774 -static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
10776 - unsigned long limit;
10777 - unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
10778 - int scan_more = 1;
10779 - int prefetch = 0;
10782 - for (i = 0; scan_more && i < 15; i++) {
10783 - unsigned char opcode;
10784 - unsigned char instr_hi;
10785 - unsigned char instr_lo;
10787 - if (instr > (unsigned char *)limit)
10789 - if (probe_kernel_address(instr, opcode))
10792 - instr_hi = opcode & 0xf0;
10793 - instr_lo = opcode & 0x0f;
10796 - switch (instr_hi) {
10799 - /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
10800 - scan_more = ((instr_lo & 7) == 0x6);
10804 - /* 0x64 thru 0x67 are valid prefixes in all modes. */
10805 - scan_more = (instr_lo & 0xC) == 0x4;
10808 - /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
10809 - scan_more = !instr_lo || (instr_lo>>1) == 1;
10812 - /* Prefetch instruction is 0x0F0D or 0x0F18 */
10814 - if (instr > (unsigned char *)limit)
10816 - if (probe_kernel_address(instr, opcode))
10818 - prefetch = (instr_lo == 0xF) &&
10819 - (opcode == 0x0D || opcode == 0x18);
10829 -static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
10830 - unsigned long error_code)
10832 - if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
10833 - boot_cpu_data.x86 >= 6)) {
10834 - /* Catch an obscure case of prefetch inside an NX page. */
10835 - if (nx_enabled && (error_code & 16))
10837 - return __is_prefetch(regs, addr);
10842 -static noinline void force_sig_info_fault(int si_signo, int si_code,
10843 - unsigned long address, struct task_struct *tsk)
10847 - info.si_signo = si_signo;
10848 - info.si_errno = 0;
10849 - info.si_code = si_code;
10850 - info.si_addr = (void __user *)address;
10851 - force_sig_info(si_signo, &info, tsk);
10854 -fastcall void do_invalid_op(struct pt_regs *, unsigned long);
10856 -#ifdef CONFIG_X86_PAE
10857 -static void dump_fault_path(unsigned long address)
10859 - unsigned long *p, page;
10860 - unsigned long mfn;
10862 - page = read_cr3();
10863 - p = (unsigned long *)__va(page);
10864 - p += (address >> 30) * 2;
10865 - printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
10866 - if (p[0] & _PAGE_PRESENT) {
10867 - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
10868 - page = mfn_to_pfn(mfn) << PAGE_SHIFT;
10869 - p = (unsigned long *)__va(page);
10870 - address &= 0x3fffffff;
10871 - p += (address >> 21) * 2;
10872 - printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
10873 - page, p[1], p[0]);
10874 - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
10875 -#ifdef CONFIG_HIGHPTE
10876 - if (mfn_to_pfn(mfn) >= highstart_pfn)
10879 - if ((p[0] & _PAGE_PRESENT) && !(p[0] & _PAGE_PSE)) {
10880 - page = mfn_to_pfn(mfn) << PAGE_SHIFT;
10881 - p = (unsigned long *) __va(page);
10882 - address &= 0x001fffff;
10883 - p += (address >> 12) * 2;
10884 - printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
10885 - page, p[1], p[0]);
10890 -static void dump_fault_path(unsigned long address)
10892 - unsigned long page;
10894 - page = read_cr3();
10895 - page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
10896 - printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
10897 - machine_to_phys(page));
10899 - * We must not directly access the pte in the highpte
10900 - * case if the page table is located in highmem.
10901 - * And lets rather not kmap-atomic the pte, just in case
10902 - * it's allocated already.
10904 - if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
10905 - && (page & _PAGE_PRESENT)
10906 - && !(page & _PAGE_PSE)) {
10907 - page = machine_to_phys(page & PAGE_MASK);
10908 - page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
10909 - & (PTRS_PER_PTE - 1)];
10910 - printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
10911 - machine_to_phys(page));
10916 -static int spurious_fault(struct pt_regs *regs,
10917 - unsigned long address,
10918 - unsigned long error_code)
10925 - /* Reserved-bit violation or user access to kernel space? */
10926 - if (error_code & 0x0c)
10929 - pgd = init_mm.pgd + pgd_index(address);
10930 - if (!pgd_present(*pgd))
10933 - pud = pud_offset(pgd, address);
10934 - if (!pud_present(*pud))
10937 - pmd = pmd_offset(pud, address);
10938 - if (!pmd_present(*pmd))
10941 - pte = pte_offset_kernel(pmd, address);
10942 - if (!pte_present(*pte))
10944 - if ((error_code & 0x02) && !pte_write(*pte))
10946 -#ifdef CONFIG_X86_PAE
10947 - if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
10954 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
10956 - unsigned index = pgd_index(address);
10958 - pud_t *pud, *pud_k;
10959 - pmd_t *pmd, *pmd_k;
10962 - pgd_k = init_mm.pgd + index;
10964 - if (!pgd_present(*pgd_k))
10968 - * set_pgd(pgd, *pgd_k); here would be useless on PAE
10969 - * and redundant with the set_pmd() on non-PAE. As would
10973 - pud = pud_offset(pgd, address);
10974 - pud_k = pud_offset(pgd_k, address);
10975 - if (!pud_present(*pud_k))
10978 - pmd = pmd_offset(pud, address);
10979 - pmd_k = pmd_offset(pud_k, address);
10980 - if (!pmd_present(*pmd_k))
10982 - if (!pmd_present(*pmd)) {
10983 - bool lazy = x86_read_percpu(xen_lazy_mmu);
10985 - x86_write_percpu(xen_lazy_mmu, false);
10986 -#if CONFIG_XEN_COMPAT > 0x030002
10987 - set_pmd(pmd, *pmd_k);
10990 - * When running on older Xen we must launder *pmd_k through
10991 - * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
10993 - set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
10995 - x86_write_percpu(xen_lazy_mmu, lazy);
10997 - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
11002 - * Handle a fault on the vmalloc or module mapping area
11004 - * This assumes no large pages in there.
11006 -static inline int vmalloc_fault(unsigned long address)
11008 - unsigned long pgd_paddr;
11012 - * Synchronize this task's top level page-table
11013 - * with the 'reference' page table.
11015 - * Do _not_ use "current" here. We might be inside
11016 - * an interrupt in the middle of a task switch..
11018 - pgd_paddr = read_cr3();
11019 - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
11022 - pte_k = pte_offset_kernel(pmd_k, address);
11023 - if (!pte_present(*pte_k))
11028 -int show_unhandled_signals = 1;
11031 - * This routine handles page faults. It determines the address,
11032 - * and the problem, and then passes it off to one of the appropriate
11036 - * bit 0 == 0 means no page found, 1 means protection fault
11037 - * bit 1 == 0 means read, 1 means write
11038 - * bit 2 == 0 means kernel, 1 means user-mode
11039 - * bit 3 == 1 means use of reserved bit detected
11040 - * bit 4 == 1 means fault was an instruction fetch
11042 -fastcall void __kprobes do_page_fault(struct pt_regs *regs,
11043 - unsigned long error_code)
11045 - struct task_struct *tsk;
11046 - struct mm_struct *mm;
11047 - struct vm_area_struct * vma;
11048 - unsigned long address;
11049 - int write, si_code;
11053 - * We can fault from pretty much anywhere, with unknown IRQ state.
11055 - trace_hardirqs_fixup();
11057 - /* get the address */
11058 - address = read_cr2();
11060 - /* Set the "privileged fault" bit to something sane. */
11061 - error_code &= ~4;
11062 - error_code |= (regs->xcs & 2) << 1;
11063 - if (regs->eflags & X86_EFLAGS_VM)
11068 - si_code = SEGV_MAPERR;
11071 - * We fault-in kernel-space virtual memory on-demand. The
11072 - * 'reference' page table is init_mm.pgd.
11074 - * NOTE! We MUST NOT take any locks for this case. We may
11075 - * be in an interrupt or a critical region, and should
11076 - * only copy the information from the master page table,
11079 - * This verifies that the fault happens in kernel space
11080 - * (error_code & 4) == 0, and that the fault was not a
11081 - * protection error (error_code & 9) == 0.
11083 - if (unlikely(address >= TASK_SIZE)) {
11085 - /* Faults in hypervisor area can never be patched up. */
11086 - if (address >= hypervisor_virt_start)
11087 - goto bad_area_nosemaphore;
11089 - if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
11091 - /* Can take a spurious fault if mapping changes R/O -> R/W. */
11092 - if (spurious_fault(regs, address, error_code))
11094 - if (notify_page_fault(regs))
11097 - * Don't take the mm semaphore here. If we fixup a prefetch
11098 - * fault we could otherwise deadlock.
11100 - goto bad_area_nosemaphore;
11103 - if (notify_page_fault(regs))
11106 - /* It's safe to allow irq's after cr2 has been saved and the vmalloc
11107 - fault has been handled. */
11108 - if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
11109 - local_irq_enable();
11114 - * If we're in an interrupt, have no user context or are running in an
11115 - * atomic region then we must not take the fault..
11117 - if (in_atomic() || !mm)
11118 - goto bad_area_nosemaphore;
11120 - /* When running in the kernel we expect faults to occur only to
11121 - * addresses in user space. All other faults represent errors in the
11122 - * kernel and should generate an OOPS. Unfortunately, in the case of an
11123 - * erroneous fault occurring in a code path which already holds mmap_sem
11124 - * we will deadlock attempting to validate the fault against the
11125 - * address space. Luckily the kernel only validly references user
11126 - * space from well defined areas of code, which are listed in the
11127 - * exceptions table.
11129 - * As the vast majority of faults will be valid we will only perform
11130 - * the source reference check when there is a possibility of a deadlock.
11131 - * Attempt to lock the address space, if we cannot we then validate the
11132 - * source. If this is invalid we can skip the address space check,
11133 - * thus avoiding the deadlock.
11135 - if (!down_read_trylock(&mm->mmap_sem)) {
11136 - if ((error_code & 4) == 0 &&
11137 - !search_exception_tables(regs->eip))
11138 - goto bad_area_nosemaphore;
11139 - down_read(&mm->mmap_sem);
11142 - vma = find_vma(mm, address);
11145 - if (vma->vm_start <= address)
11147 - if (!(vma->vm_flags & VM_GROWSDOWN))
11149 - if (error_code & 4) {
11151 - * Accessing the stack below %esp is always a bug.
11152 - * The large cushion allows instructions like enter
11153 - * and pusha to work. ("enter $65535,$31" pushes
11154 - * 32 pointers and then decrements %esp by 65535.)
11156 - if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
11159 - if (expand_stack(vma, address))
11162 - * Ok, we have a good vm_area for this memory access, so
11163 - * we can handle it..
11166 - si_code = SEGV_ACCERR;
11168 - switch (error_code & 3) {
11169 - default: /* 3: write, present */
11170 - /* fall through */
11171 - case 2: /* write, not present */
11172 - if (!(vma->vm_flags & VM_WRITE))
11176 - case 1: /* read, present */
11178 - case 0: /* read, not present */
11179 - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
11185 - * If for any reason at all we couldn't handle the fault,
11186 - * make sure we exit gracefully rather than endlessly redo
11189 - fault = handle_mm_fault(mm, vma, address, write);
11190 - if (unlikely(fault & VM_FAULT_ERROR)) {
11191 - if (fault & VM_FAULT_OOM)
11192 - goto out_of_memory;
11193 - else if (fault & VM_FAULT_SIGBUS)
11197 - if (fault & VM_FAULT_MAJOR)
11203 - * Did it hit the DOS screen memory VA from vm86 mode?
11205 - if (regs->eflags & VM_MASK) {
11206 - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
11208 - tsk->thread.screen_bitmap |= 1 << bit;
11210 - up_read(&mm->mmap_sem);
11214 - * Something tried to access memory that isn't in our memory map..
11215 - * Fix it, but check if it's kernel or user first..
11218 - up_read(&mm->mmap_sem);
11220 -bad_area_nosemaphore:
11221 - /* User mode accesses just cause a SIGSEGV */
11222 - if (error_code & 4) {
11224 - * It's possible to have interrupts off here.
11226 - local_irq_enable();
11229 - * Valid to do another page fault here because this one came
11230 - * from user space.
11232 - if (is_prefetch(regs, address, error_code))
11235 - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
11236 - printk_ratelimit()) {
11237 - printk("%s%s[%d]: segfault at %08lx eip %08lx "
11238 - "esp %08lx error %lx\n",
11239 - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
11240 - tsk->comm, task_pid_nr(tsk), address, regs->eip,
11241 - regs->esp, error_code);
11243 - tsk->thread.cr2 = address;
11244 - /* Kernel addresses are always protection faults */
11245 - tsk->thread.error_code = error_code | (address >= TASK_SIZE);
11246 - tsk->thread.trap_no = 14;
11247 - force_sig_info_fault(SIGSEGV, si_code, address, tsk);
11251 -#ifdef CONFIG_X86_F00F_BUG
11253 - * Pentium F0 0F C7 C8 bug workaround.
11255 - if (boot_cpu_data.f00f_bug) {
11256 - unsigned long nr;
11258 - nr = (address - idt_descr.address) >> 3;
11261 - do_invalid_op(regs, 0);
11268 - /* Are we prepared to handle this kernel fault? */
11269 - if (fixup_exception(regs))
11273 - * Valid to do another page fault here, because if this fault
11274 - * had been triggered by is_prefetch fixup_exception would have
11277 - if (is_prefetch(regs, address, error_code))
11281 - * Oops. The kernel tried to access some bad page. We'll have to
11282 - * terminate things with extreme prejudice.
11285 - bust_spinlocks(1);
11287 - if (oops_may_print()) {
11288 -#ifdef CONFIG_X86_PAE
11289 - if (error_code & 16) {
11290 - pte_t *pte = lookup_address(address);
11292 - if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
11293 - printk(KERN_CRIT "kernel tried to execute "
11294 - "NX-protected page - exploit attempt? "
11295 - "(uid: %d)\n", current->uid);
11298 - if (address < PAGE_SIZE)
11299 - printk(KERN_ALERT "BUG: unable to handle kernel NULL "
11300 - "pointer dereference");
11302 - printk(KERN_ALERT "BUG: unable to handle kernel paging"
11304 - printk(" at virtual address %08lx\n",address);
11305 - printk(KERN_ALERT "printing eip: %08lx\n", regs->eip);
11306 - dump_fault_path(address);
11308 - tsk->thread.cr2 = address;
11309 - tsk->thread.trap_no = 14;
11310 - tsk->thread.error_code = error_code;
11311 - die("Oops", regs, error_code);
11312 - bust_spinlocks(0);
11313 - do_exit(SIGKILL);
11316 - * We ran out of memory, or some other thing happened to us that made
11317 - * us unable to handle the page fault gracefully.
11320 - up_read(&mm->mmap_sem);
11321 - if (is_global_init(tsk)) {
11323 - down_read(&mm->mmap_sem);
11326 - printk("VM: killing process %s\n", tsk->comm);
11327 - if (error_code & 4)
11328 - do_group_exit(SIGKILL);
11332 - up_read(&mm->mmap_sem);
11334 - /* Kernel mode? Handle exceptions or die */
11335 - if (!(error_code & 4))
11338 - /* User space => ok to do another page fault */
11339 - if (is_prefetch(regs, address, error_code))
11342 - tsk->thread.cr2 = address;
11343 - tsk->thread.error_code = error_code;
11344 - tsk->thread.trap_no = 14;
11345 - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
11348 -void vmalloc_sync_all(void)
11351 - * Note that races in the updates of insync and start aren't
11352 - * problematic: insync can only get set bits added, and updates to
11353 - * start are only improving performance (without affecting correctness
11355 - * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
11356 - * This change works just fine with 2-level paging too.
11358 -#define sync_index(a) ((a) >> PMD_SHIFT)
11359 - static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
11360 - static unsigned long start = TASK_SIZE;
11361 - unsigned long address;
11363 - if (SHARED_KERNEL_PMD)
11366 - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
11367 - for (address = start;
11368 - address >= TASK_SIZE && address < hypervisor_virt_start;
11369 - address += 1UL << PMD_SHIFT) {
11370 - if (!test_bit(sync_index(address), insync)) {
11371 - unsigned long flags;
11372 - struct page *page;
11374 - spin_lock_irqsave(&pgd_lock, flags);
11375 - /* XEN: failure path assumes non-empty pgd_list. */
11376 - if (unlikely(!pgd_list)) {
11377 - spin_unlock_irqrestore(&pgd_lock, flags);
11380 - for (page = pgd_list; page; page =
11381 - (struct page *)page->index)
11382 - if (!vmalloc_sync_one(page_address(page),
11384 - BUG_ON(page != pgd_list);
11387 - spin_unlock_irqrestore(&pgd_lock, flags);
11389 - set_bit(sync_index(address), insync);
11391 - if (address == start && test_bit(sync_index(address), insync))
11392 - start = address + (1UL << PMD_SHIFT);
11395 --- a/arch/x86/mm/fault_64-xen.c
11399 - * linux/arch/x86-64/mm/fault.c
11401 - * Copyright (C) 1995 Linus Torvalds
11402 - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
11405 -#include <linux/signal.h>
11406 -#include <linux/sched.h>
11407 -#include <linux/kernel.h>
11408 -#include <linux/errno.h>
11409 -#include <linux/string.h>
11410 -#include <linux/types.h>
11411 -#include <linux/ptrace.h>
11412 -#include <linux/mman.h>
11413 -#include <linux/mm.h>
11414 -#include <linux/smp.h>
11415 -#include <linux/interrupt.h>
11416 -#include <linux/init.h>
11417 -#include <linux/tty.h>
11418 -#include <linux/vt_kern.h> /* For unblank_screen() */
11419 -#include <linux/compiler.h>
11420 -#include <linux/vmalloc.h>
11421 -#include <linux/module.h>
11422 -#include <linux/kprobes.h>
11423 -#include <linux/uaccess.h>
11424 -#include <linux/kdebug.h>
11425 -#include <linux/kprobes.h>
11427 -#include <asm/system.h>
11428 -#include <asm/pgalloc.h>
11429 -#include <asm/smp.h>
11430 -#include <asm/tlbflush.h>
11431 -#include <asm/proto.h>
11432 -#include <asm-generic/sections.h>
11434 -/* Page fault error code bits */
11435 -#define PF_PROT (1<<0) /* or no page found */
11436 -#define PF_WRITE (1<<1)
11437 -#define PF_USER (1<<2)
11438 -#define PF_RSVD (1<<3)
11439 -#define PF_INSTR (1<<4)
11441 -#ifdef CONFIG_KPROBES
11442 -static inline int notify_page_fault(struct pt_regs *regs)
11446 - /* kprobe_running() needs smp_processor_id() */
11447 - if (!user_mode(regs)) {
11448 - preempt_disable();
11449 - if (kprobe_running() && kprobe_fault_handler(regs, 14))
11451 - preempt_enable();
11457 -static inline int notify_page_fault(struct pt_regs *regs)
11463 -/* Sometimes the CPU reports invalid exceptions on prefetch.
11464 - Check that here and ignore.
11465 - Opcode checker based on code by Richard Brunner */
11466 -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
11467 - unsigned long error_code)
11469 - unsigned char *instr;
11470 - int scan_more = 1;
11471 - int prefetch = 0;
11472 - unsigned char *max_instr;
11474 - /* If it was a exec fault ignore */
11475 - if (error_code & PF_INSTR)
11478 - instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
11479 - max_instr = instr + 15;
11481 - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
11484 - while (scan_more && instr < max_instr) {
11485 - unsigned char opcode;
11486 - unsigned char instr_hi;
11487 - unsigned char instr_lo;
11489 - if (probe_kernel_address(instr, opcode))
11492 - instr_hi = opcode & 0xf0;
11493 - instr_lo = opcode & 0x0f;
11496 - switch (instr_hi) {
11499 - /* Values 0x26,0x2E,0x36,0x3E are valid x86
11500 - prefixes. In long mode, the CPU will signal
11501 - invalid opcode if some of these prefixes are
11502 - present so we will never get here anyway */
11503 - scan_more = ((instr_lo & 7) == 0x6);
11507 - /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
11508 - Need to figure out under what instruction mode the
11509 - instruction was issued ... */
11510 - /* Could check the LDT for lm, but for now it's good
11511 - enough to assume that long mode only uses well known
11512 - segments or kernel. */
11513 - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
11517 - /* 0x64 thru 0x67 are valid prefixes in all modes. */
11518 - scan_more = (instr_lo & 0xC) == 0x4;
11521 - /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
11522 - scan_more = !instr_lo || (instr_lo>>1) == 1;
11525 - /* Prefetch instruction is 0x0F0D or 0x0F18 */
11527 - if (probe_kernel_address(instr, opcode))
11529 - prefetch = (instr_lo == 0xF) &&
11530 - (opcode == 0x0D || opcode == 0x18);
11540 -static int bad_address(void *p)
11542 - unsigned long dummy;
11543 - return probe_kernel_address((unsigned long *)p, dummy);
11546 -void dump_pagetable(unsigned long address)
11553 - pgd = (pgd_t *)read_cr3();
11555 - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
11556 - pgd += pgd_index(address);
11557 - if (bad_address(pgd)) goto bad;
11558 - printk("PGD %lx ", pgd_val(*pgd));
11559 - if (!pgd_present(*pgd)) goto ret;
11561 - pud = pud_offset(pgd, address);
11562 - if (bad_address(pud)) goto bad;
11563 - printk("PUD %lx ", pud_val(*pud));
11564 - if (!pud_present(*pud)) goto ret;
11566 - pmd = pmd_offset(pud, address);
11567 - if (bad_address(pmd)) goto bad;
11568 - printk("PMD %lx ", pmd_val(*pmd));
11569 - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
11571 - pte = pte_offset_kernel(pmd, address);
11572 - if (bad_address(pte)) goto bad;
11573 - printk("PTE %lx", pte_val(*pte));
11581 -static const char errata93_warning[] =
11582 -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
11583 -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
11584 -KERN_ERR "******* Please consider a BIOS update.\n"
11585 -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
11587 -/* Workaround for K8 erratum #93 & buggy BIOS.
11588 - BIOS SMM functions are required to use a specific workaround
11589 - to avoid corruption of the 64bit RIP register on C stepping K8.
11590 - A lot of BIOS that didn't get tested properly miss this.
11591 - The OS sees this as a page fault with the upper 32bits of RIP cleared.
11592 - Try to work around it here.
11593 - Note we only handle faults in kernel here. */
11595 -static int is_errata93(struct pt_regs *regs, unsigned long address)
11597 - static int warned;
11598 - if (address != regs->rip)
11600 - if ((address >> 32) != 0)
11602 - address |= 0xffffffffUL << 32;
11603 - if ((address >= (u64)_stext && address <= (u64)_etext) ||
11604 - (address >= MODULES_VADDR && address <= MODULES_END)) {
11606 - printk(errata93_warning);
11609 - regs->rip = address;
11615 -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
11616 - unsigned long error_code)
11618 - unsigned long flags = oops_begin();
11619 - struct task_struct *tsk;
11621 - printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
11622 - current->comm, address);
11623 - dump_pagetable(address);
11625 - tsk->thread.cr2 = address;
11626 - tsk->thread.trap_no = 14;
11627 - tsk->thread.error_code = error_code;
11628 - __die("Bad pagetable", regs, error_code);
11630 - do_exit(SIGKILL);
11634 - * Handle a fault on the vmalloc area
11636 - * This assumes no large pages in there.
11638 -static int vmalloc_fault(unsigned long address)
11640 - pgd_t *pgd, *pgd_ref;
11641 - pud_t *pud, *pud_ref;
11642 - pmd_t *pmd, *pmd_ref;
11643 - pte_t *pte, *pte_ref;
11645 - /* Copy kernel mappings over when needed. This can also
11646 - happen within a race in page table update. In the later
11647 - case just flush. */
11649 - /* On Xen the line below does not always work. Needs investigating! */
11650 - /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
11651 - pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
11652 - pgd += pgd_index(address);
11653 - pgd_ref = pgd_offset_k(address);
11654 - if (pgd_none(*pgd_ref))
11656 - if (pgd_none(*pgd))
11657 - set_pgd(pgd, *pgd_ref);
11659 - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
11661 - /* Below here mismatches are bugs because these lower tables
11664 - pud = pud_offset(pgd, address);
11665 - pud_ref = pud_offset(pgd_ref, address);
11666 - if (pud_none(*pud_ref))
11668 - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
11670 - pmd = pmd_offset(pud, address);
11671 - pmd_ref = pmd_offset(pud_ref, address);
11672 - if (pmd_none(*pmd_ref))
11674 - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
11676 - pte_ref = pte_offset_kernel(pmd_ref, address);
11677 - if (!pte_present(*pte_ref))
11679 - pte = pte_offset_kernel(pmd, address);
11680 - /* Don't use pte_page here, because the mappings can point
11681 - outside mem_map, and the NUMA hash lookup cannot handle
11683 - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
11688 -int show_unhandled_signals = 1;
11691 -#define MEM_VERBOSE 1
11693 -#ifdef MEM_VERBOSE
11694 -#define MEM_LOG(_f, _a...) \
11695 - printk("fault.c:[%d]-> " _f "\n", \
11696 - __LINE__ , ## _a )
11698 -#define MEM_LOG(_f, _a...) ((void)0)
11701 -static int spurious_fault(struct pt_regs *regs,
11702 - unsigned long address,
11703 - unsigned long error_code)
11711 - /* Faults in hypervisor area are never spurious. */
11712 - if ((address >= HYPERVISOR_VIRT_START) &&
11713 - (address < HYPERVISOR_VIRT_END))
11717 - /* Reserved-bit violation or user access to kernel space? */
11718 - if (error_code & (PF_RSVD|PF_USER))
11721 - pgd = init_mm.pgd + pgd_index(address);
11722 - if (!pgd_present(*pgd))
11725 - pud = pud_offset(pgd, address);
11726 - if (!pud_present(*pud))
11729 - pmd = pmd_offset(pud, address);
11730 - if (!pmd_present(*pmd))
11733 - pte = pte_offset_kernel(pmd, address);
11734 - if (!pte_present(*pte))
11736 - if ((error_code & PF_WRITE) && !pte_write(*pte))
11738 - if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
11745 - * This routine handles page faults. It determines the address,
11746 - * and the problem, and then passes it off to one of the appropriate
11749 -asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
11750 - unsigned long error_code)
11752 - struct task_struct *tsk;
11753 - struct mm_struct *mm;
11754 - struct vm_area_struct * vma;
11755 - unsigned long address;
11756 - const struct exception_table_entry *fixup;
11757 - int write, fault;
11758 - unsigned long flags;
11761 - if (!user_mode(regs))
11762 - error_code &= ~PF_USER; /* means kernel */
11765 - * We can fault from pretty much anywhere, with unknown IRQ state.
11767 - trace_hardirqs_fixup();
11771 - prefetchw(&mm->mmap_sem);
11773 - /* get the address */
11774 - address = read_cr2();
11776 - info.si_code = SEGV_MAPERR;
11780 - * We fault-in kernel-space virtual memory on-demand. The
11781 - * 'reference' page table is init_mm.pgd.
11783 - * NOTE! We MUST NOT take any locks for this case. We may
11784 - * be in an interrupt or a critical region, and should
11785 - * only copy the information from the master page table,
11788 - * This verifies that the fault happens in kernel space
11789 - * (error_code & 4) == 0, and that the fault was not a
11790 - * protection error (error_code & 9) == 0.
11792 - if (unlikely(address >= TASK_SIZE64)) {
11794 - * Don't check for the module range here: its PML4
11795 - * is always initialized because it's shared with the main
11796 - * kernel text. Only vmalloc may need PML4 syncups.
11798 - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
11799 - ((address >= VMALLOC_START && address < VMALLOC_END))) {
11800 - if (vmalloc_fault(address) >= 0)
11803 - /* Can take a spurious fault if mapping changes R/O -> R/W. */
11804 - if (spurious_fault(regs, address, error_code))
11806 - if (notify_page_fault(regs))
11809 - * Don't take the mm semaphore here. If we fixup a prefetch
11810 - * fault we could otherwise deadlock.
11812 - goto bad_area_nosemaphore;
11815 - if (notify_page_fault(regs))
11818 - if (likely(regs->eflags & X86_EFLAGS_IF))
11819 - local_irq_enable();
11821 - if (unlikely(error_code & PF_RSVD))
11822 - pgtable_bad(address, regs, error_code);
11825 - * If we're in an interrupt or have no user
11826 - * context, we must not take the fault..
11828 - if (unlikely(in_atomic() || !mm))
11829 - goto bad_area_nosemaphore;
11832 - * User-mode registers count as a user access even for any
11833 - * potential system fault or CPU buglet.
11835 - if (user_mode_vm(regs))
11836 - error_code |= PF_USER;
11839 - /* When running in the kernel we expect faults to occur only to
11840 - * addresses in user space. All other faults represent errors in the
11841 - * kernel and should generate an OOPS. Unfortunately, in the case of an
11842 - * erroneous fault occurring in a code path which already holds mmap_sem
11843 - * we will deadlock attempting to validate the fault against the
11844 - * address space. Luckily the kernel only validly references user
11845 - * space from well defined areas of code, which are listed in the
11846 - * exceptions table.
11848 - * As the vast majority of faults will be valid we will only perform
11849 - * the source reference check when there is a possibility of a deadlock.
11850 - * Attempt to lock the address space, if we cannot we then validate the
11851 - * source. If this is invalid we can skip the address space check,
11852 - * thus avoiding the deadlock.
11854 - if (!down_read_trylock(&mm->mmap_sem)) {
11855 - if ((error_code & PF_USER) == 0 &&
11856 - !search_exception_tables(regs->rip))
11857 - goto bad_area_nosemaphore;
11858 - down_read(&mm->mmap_sem);
11861 - vma = find_vma(mm, address);
11864 - if (likely(vma->vm_start <= address))
11866 - if (!(vma->vm_flags & VM_GROWSDOWN))
11868 - if (error_code & 4) {
11869 - /* Allow userspace just enough access below the stack pointer
11870 - * to let the 'enter' instruction work.
11872 - if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
11875 - if (expand_stack(vma, address))
11878 - * Ok, we have a good vm_area for this memory access, so
11879 - * we can handle it..
11882 - info.si_code = SEGV_ACCERR;
11884 - switch (error_code & (PF_PROT|PF_WRITE)) {
11885 - default: /* 3: write, present */
11886 - /* fall through */
11887 - case PF_WRITE: /* write, not present */
11888 - if (!(vma->vm_flags & VM_WRITE))
11892 - case PF_PROT: /* read, present */
11894 - case 0: /* read, not present */
11895 - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
11900 - * If for any reason at all we couldn't handle the fault,
11901 - * make sure we exit gracefully rather than endlessly redo
11904 - fault = handle_mm_fault(mm, vma, address, write);
11905 - if (unlikely(fault & VM_FAULT_ERROR)) {
11906 - if (fault & VM_FAULT_OOM)
11907 - goto out_of_memory;
11908 - else if (fault & VM_FAULT_SIGBUS)
11912 - if (fault & VM_FAULT_MAJOR)
11916 - up_read(&mm->mmap_sem);
11920 - * Something tried to access memory that isn't in our memory map..
11921 - * Fix it, but check if it's kernel or user first..
11924 - up_read(&mm->mmap_sem);
11926 -bad_area_nosemaphore:
11927 - /* User mode accesses just cause a SIGSEGV */
11928 - if (error_code & PF_USER) {
11931 - * It's possible to have interrupts off here.
11933 - local_irq_enable();
11935 - if (is_prefetch(regs, address, error_code))
11938 - /* Work around K8 erratum #100 K8 in compat mode
11939 - occasionally jumps to illegal addresses >4GB. We
11940 - catch this here in the page fault handler because
11941 - these addresses are not reachable. Just detect this
11942 - case and return. Any code segment in LDT is
11943 - compatibility mode. */
11944 - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
11948 - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
11949 - printk_ratelimit()) {
11951 - "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
11952 - tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
11953 - tsk->comm, tsk->pid, address, regs->rip,
11954 - regs->rsp, error_code);
11957 - tsk->thread.cr2 = address;
11958 - /* Kernel addresses are always protection faults */
11959 - tsk->thread.error_code = error_code | (address >= TASK_SIZE);
11960 - tsk->thread.trap_no = 14;
11961 - info.si_signo = SIGSEGV;
11962 - info.si_errno = 0;
11963 - /* info.si_code has been set above */
11964 - info.si_addr = (void __user *)address;
11965 - force_sig_info(SIGSEGV, &info, tsk);
11971 - /* Are we prepared to handle this kernel fault? */
11972 - fixup = search_exception_tables(regs->rip);
11974 - regs->rip = fixup->fixup;
11979 - * Hall of shame of CPU/BIOS bugs.
11982 - if (is_prefetch(regs, address, error_code))
11985 - if (is_errata93(regs, address))
11989 - * Oops. The kernel tried to access some bad page. We'll have to
11990 - * terminate things with extreme prejudice.
11993 - flags = oops_begin();
11995 - if (address < PAGE_SIZE)
11996 - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
11998 - printk(KERN_ALERT "Unable to handle kernel paging request");
11999 - printk(" at %016lx RIP: \n" KERN_ALERT,address);
12000 - printk_address(regs->rip);
12001 - dump_pagetable(address);
12002 - tsk->thread.cr2 = address;
12003 - tsk->thread.trap_no = 14;
12004 - tsk->thread.error_code = error_code;
12005 - __die("Oops", regs, error_code);
12006 - /* Executive summary in case the body of the oops scrolled away */
12007 - printk(KERN_EMERG "CR2: %016lx\n", address);
12009 - do_exit(SIGKILL);
12012 - * We ran out of memory, or some other thing happened to us that made
12013 - * us unable to handle the page fault gracefully.
12016 - up_read(&mm->mmap_sem);
12017 - if (is_global_init(current)) {
12021 - printk("VM: killing process %s\n", tsk->comm);
12022 - if (error_code & 4)
12023 - do_group_exit(SIGKILL);
12027 - up_read(&mm->mmap_sem);
12029 - /* Kernel mode? Handle exceptions or die */
12030 - if (!(error_code & PF_USER))
12033 - tsk->thread.cr2 = address;
12034 - tsk->thread.error_code = error_code;
12035 - tsk->thread.trap_no = 14;
12036 - info.si_signo = SIGBUS;
12037 - info.si_errno = 0;
12038 - info.si_code = BUS_ADRERR;
12039 - info.si_addr = (void __user *)address;
12040 - force_sig_info(SIGBUS, &info, tsk);
12044 -DEFINE_SPINLOCK(pgd_lock);
12045 -LIST_HEAD(pgd_list);
12047 -void vmalloc_sync_all(void)
12049 - /* Note that races in the updates of insync and start aren't
12051 - insync can only get set bits added, and updates to start are only
12052 - improving performance (without affecting correctness if undone). */
12053 - static DECLARE_BITMAP(insync, PTRS_PER_PGD);
12054 - static unsigned long start = VMALLOC_START & PGDIR_MASK;
12055 - unsigned long address;
12057 - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
12058 - if (!test_bit(pgd_index(address), insync)) {
12059 - const pgd_t *pgd_ref = pgd_offset_k(address);
12060 - struct page *page;
12062 - if (pgd_none(*pgd_ref))
12064 - spin_lock(&pgd_lock);
12065 - list_for_each_entry(page, &pgd_list, lru) {
12067 - pgd = (pgd_t *)page_address(page) + pgd_index(address);
12068 - if (pgd_none(*pgd))
12069 - set_pgd(pgd, *pgd_ref);
12071 - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12073 - spin_unlock(&pgd_lock);
12074 - set_bit(pgd_index(address), insync);
12076 - if (address == start)
12077 - start = address + PGDIR_SIZE;
12079 - /* Check that there is no need to do the same for the modules area. */
12080 - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
12081 - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
12082 - (__START_KERNEL & PGDIR_MASK)));
12085 +++ b/arch/x86/mm/fault-xen.c
12088 + * Copyright (C) 1995 Linus Torvalds
12089 + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
12092 +#include <linux/signal.h>
12093 +#include <linux/sched.h>
12094 +#include <linux/kernel.h>
12095 +#include <linux/errno.h>
12096 +#include <linux/string.h>
12097 +#include <linux/types.h>
12098 +#include <linux/ptrace.h>
12099 +#include <linux/mman.h>
12100 +#include <linux/mm.h>
12101 +#include <linux/smp.h>
12102 +#include <linux/interrupt.h>
12103 +#include <linux/init.h>
12104 +#include <linux/tty.h>
12105 +#include <linux/vt_kern.h> /* For unblank_screen() */
12106 +#include <linux/compiler.h>
12107 +#include <linux/highmem.h>
12108 +#include <linux/bootmem.h> /* for max_low_pfn */
12109 +#include <linux/vmalloc.h>
12110 +#include <linux/module.h>
12111 +#include <linux/kprobes.h>
12112 +#include <linux/uaccess.h>
12113 +#include <linux/kdebug.h>
12115 +#include <asm/system.h>
12116 +#include <asm/desc.h>
12117 +#include <asm/segment.h>
12118 +#include <asm/pgalloc.h>
12119 +#include <asm/smp.h>
12120 +#include <asm/tlbflush.h>
12121 +#include <asm/proto.h>
12122 +#include <asm-generic/sections.h>
12125 + * Page fault error code bits
12126 + * bit 0 == 0 means no page found, 1 means protection fault
12127 + * bit 1 == 0 means read, 1 means write
12128 + * bit 2 == 0 means kernel, 1 means user-mode
12129 + * bit 3 == 1 means use of reserved bit detected
12130 + * bit 4 == 1 means fault was an instruction fetch
12132 +#define PF_PROT (1<<0)
12133 +#define PF_WRITE (1<<1)
12134 +#define PF_USER (1<<2)
12135 +#define PF_RSVD (1<<3)
12136 +#define PF_INSTR (1<<4)
12138 +static inline int notify_page_fault(struct pt_regs *regs)
12140 +#ifdef CONFIG_KPROBES
12143 + /* kprobe_running() needs smp_processor_id() */
12144 +#ifdef CONFIG_X86_32
12145 + if (!user_mode_vm(regs)) {
12147 + if (!user_mode(regs)) {
12149 + preempt_disable();
12150 + if (kprobe_running() && kprobe_fault_handler(regs, 14))
12152 + preempt_enable();
12163 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
12164 + * Check that here and ignore it.
12167 + * Sometimes the CPU reports invalid exceptions on prefetch.
12168 + * Check that here and ignore it.
12170 + * Opcode checker based on code by Richard Brunner
12172 +static int is_prefetch(struct pt_regs *regs, unsigned long addr,
12173 + unsigned long error_code)
12175 + unsigned char *instr;
12176 + int scan_more = 1;
12177 + int prefetch = 0;
12178 + unsigned char *max_instr;
12181 + * If it was an exec (instruction fetch) fault on NX page, then
12182 + * do not ignore the fault:
12184 + if (error_code & PF_INSTR)
12187 + instr = (unsigned char *)convert_ip_to_linear(current, regs);
12188 + max_instr = instr + 15;
12190 + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
12193 + while (scan_more && instr < max_instr) {
12194 + unsigned char opcode;
12195 + unsigned char instr_hi;
12196 + unsigned char instr_lo;
12198 + if (probe_kernel_address(instr, opcode))
12201 + instr_hi = opcode & 0xf0;
12202 + instr_lo = opcode & 0x0f;
12205 + switch (instr_hi) {
12209 + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
12210 + * In X86_64 long mode, the CPU will signal invalid
12211 + * opcode if some of these prefixes are present so
12212 + * X86_64 will never get here anyway
12214 + scan_more = ((instr_lo & 7) == 0x6);
12216 +#ifdef CONFIG_X86_64
12219 + * In AMD64 long mode 0x40..0x4F are valid REX prefixes
12220 + * Need to figure out under what instruction mode the
12221 + * instruction was issued. Could check the LDT for lm,
12222 + * but for now it's good enough to assume that long
12223 + * mode only uses well known segments or kernel.
12225 + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
12229 + /* 0x64 thru 0x67 are valid prefixes in all modes. */
12230 + scan_more = (instr_lo & 0xC) == 0x4;
12233 + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
12234 + scan_more = !instr_lo || (instr_lo>>1) == 1;
12237 + /* Prefetch instruction is 0x0F0D or 0x0F18 */
12240 + if (probe_kernel_address(instr, opcode))
12242 + prefetch = (instr_lo == 0xF) &&
12243 + (opcode == 0x0D || opcode == 0x18);
12253 +static void force_sig_info_fault(int si_signo, int si_code,
12254 + unsigned long address, struct task_struct *tsk)
12258 + info.si_signo = si_signo;
12259 + info.si_errno = 0;
12260 + info.si_code = si_code;
12261 + info.si_addr = (void __user *)address;
12262 + force_sig_info(si_signo, &info, tsk);
12265 +#ifdef CONFIG_X86_64
12266 +static int bad_address(void *p)
12268 + unsigned long dummy;
12269 + return probe_kernel_address((unsigned long *)p, dummy);
12273 +static void dump_pagetable(unsigned long address)
12275 +#ifdef CONFIG_X86_32
12276 + __typeof__(pte_val(__pte(0))) page;
12278 + page = read_cr3();
12279 + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
12280 +#ifdef CONFIG_X86_PAE
12281 + printk("*pdpt = %016Lx ", page);
12282 + if ((page & _PAGE_PRESENT)
12283 + && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) {
12284 + page = mfn_to_pfn(page >> PAGE_SHIFT);
12285 + page <<= PAGE_SHIFT;
12286 + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
12287 + & (PTRS_PER_PMD - 1)];
12288 + printk(KERN_CONT "*pde = %016Lx ", page);
12289 + page &= ~_PAGE_NX;
12292 + printk("*pde = %08lx ", page);
12296 + * We must not directly access the pte in the highpte
12297 + * case if the page table is located in highmem.
12298 + * And let's rather not kmap-atomic the pte, just in case
12299 + * it's allocated already.
12301 + if ((page & _PAGE_PRESENT)
12302 + && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
12303 + && !(page & _PAGE_PSE)) {
12304 + page = mfn_to_pfn(page >> PAGE_SHIFT);
12305 + page <<= PAGE_SHIFT;
12306 + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
12307 + & (PTRS_PER_PTE - 1)];
12308 + printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
12311 + printk(KERN_CONT "\n");
12312 +#else /* CONFIG_X86_64 */
12318 + pgd = (pgd_t *)read_cr3();
12320 + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
12321 + pgd += pgd_index(address);
12322 + if (bad_address(pgd)) goto bad;
12323 + printk("PGD %lx ", pgd_val(*pgd));
12324 + if (!pgd_present(*pgd)) goto ret;
12326 + pud = pud_offset(pgd, address);
12327 + if (bad_address(pud)) goto bad;
12328 + printk(KERN_CONT "PUD %lx ", pud_val(*pud));
12329 + if (!pud_present(*pud) || pud_large(*pud))
12332 + pmd = pmd_offset(pud, address);
12333 + if (bad_address(pmd)) goto bad;
12334 + printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
12335 + if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
12337 + pte = pte_offset_kernel(pmd, address);
12338 + if (bad_address(pte)) goto bad;
12339 + printk(KERN_CONT "PTE %lx", pte_val(*pte));
12341 + printk(KERN_CONT "\n");
12348 +#ifdef CONFIG_X86_32
12349 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
12351 + unsigned index = pgd_index(address);
12353 + pud_t *pud, *pud_k;
12354 + pmd_t *pmd, *pmd_k;
12357 + pgd_k = init_mm.pgd + index;
12359 + if (!pgd_present(*pgd_k))
12363 + * set_pgd(pgd, *pgd_k); here would be useless on PAE
12364 + * and redundant with the set_pmd() on non-PAE. As would
12368 + pud = pud_offset(pgd, address);
12369 + pud_k = pud_offset(pgd_k, address);
12370 + if (!pud_present(*pud_k))
12373 + pmd = pmd_offset(pud, address);
12374 + pmd_k = pmd_offset(pud_k, address);
12375 + if (!pmd_present(*pmd_k))
12377 + if (!pmd_present(*pmd)) {
12378 + bool lazy = x86_read_percpu(xen_lazy_mmu);
12380 + x86_write_percpu(xen_lazy_mmu, false);
12381 +#if CONFIG_XEN_COMPAT > 0x030002
12382 + set_pmd(pmd, *pmd_k);
12385 + * When running on older Xen we must launder *pmd_k through
12386 + * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
12388 + set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
12390 + x86_write_percpu(xen_lazy_mmu, lazy);
12392 + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
12397 +#ifdef CONFIG_X86_64
12398 +static const char errata93_warning[] =
12399 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
12400 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
12401 +KERN_ERR "******* Please consider a BIOS update.\n"
12402 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
12405 +/* Workaround for K8 erratum #93 & buggy BIOS.
12406 + BIOS SMM functions are required to use a specific workaround
12407 + to avoid corruption of the 64bit RIP register on C stepping K8.
12408 + A lot of BIOS that didn't get tested properly miss this.
12409 + The OS sees this as a page fault with the upper 32bits of RIP cleared.
12410 + Try to work around it here.
12411 + Note we only handle faults in kernel here.
12412 + Does nothing for X86_32
12414 +static int is_errata93(struct pt_regs *regs, unsigned long address)
12416 +#ifdef CONFIG_X86_64
12417 + static int warned;
12418 + if (address != regs->ip)
12420 + if ((address >> 32) != 0)
12422 + address |= 0xffffffffUL << 32;
12423 + if ((address >= (u64)_stext && address <= (u64)_etext) ||
12424 + (address >= MODULES_VADDR && address <= MODULES_END)) {
12426 + printk(errata93_warning);
12429 + regs->ip = address;
12437 + * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
12438 + * addresses >4GB. We catch this in the page fault handler because these
12439 + * addresses are not reachable. Just detect this case and return. Any code
12440 + * segment in LDT is compatibility mode.
12442 +static int is_errata100(struct pt_regs *regs, unsigned long address)
12444 +#ifdef CONFIG_X86_64
12445 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
12452 +void do_invalid_op(struct pt_regs *, unsigned long);
12454 +static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
12456 +#ifdef CONFIG_X86_F00F_BUG
12457 + unsigned long nr;
12459 + * Pentium F0 0F C7 C8 bug workaround.
12461 + if (boot_cpu_data.f00f_bug) {
12462 + nr = (address - idt_descr.address) >> 3;
12465 + do_invalid_op(regs, 0);
12473 +static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
12474 + unsigned long address)
12476 +#ifdef CONFIG_X86_32
12477 + if (!oops_may_print())
12481 +#ifdef CONFIG_X86_PAE
12482 + if (error_code & PF_INSTR) {
12483 + unsigned int level;
12484 + pte_t *pte = lookup_address(address, &level);
12486 + if (pte && pte_present(*pte) && !pte_exec(*pte))
12487 + printk(KERN_CRIT "kernel tried to execute "
12488 + "NX-protected page - exploit attempt? "
12489 + "(uid: %d)\n", current->uid);
12493 + printk(KERN_ALERT "BUG: unable to handle kernel ");
12494 + if (address < PAGE_SIZE)
12495 + printk(KERN_CONT "NULL pointer dereference");
12497 + printk(KERN_CONT "paging request");
12498 +#ifdef CONFIG_X86_32
12499 + printk(KERN_CONT " at %08lx\n", address);
12501 + printk(KERN_CONT " at %016lx\n", address);
12503 + printk(KERN_ALERT "IP:");
12504 + printk_address(regs->ip, 1);
12505 + dump_pagetable(address);
12508 +#ifdef CONFIG_X86_64
12509 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
12510 + unsigned long error_code)
12512 + unsigned long flags = oops_begin();
12513 + struct task_struct *tsk;
12515 + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
12516 + current->comm, address);
12517 + dump_pagetable(address);
12519 + tsk->thread.cr2 = address;
12520 + tsk->thread.trap_no = 14;
12521 + tsk->thread.error_code = error_code;
12522 + if (__die("Bad pagetable", regs, error_code))
12524 + oops_end(flags, regs, SIGKILL);
12528 +static int spurious_fault_check(unsigned long error_code, pte_t *pte)
12530 + if ((error_code & PF_WRITE) && !pte_write(*pte))
12532 + if ((error_code & PF_INSTR) && !pte_exec(*pte))
12539 + * Handle a spurious fault caused by a stale TLB entry. This allows
12540 + * us to lazily refresh the TLB when increasing the permissions of a
12541 + * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
12542 + * expensive since that implies doing a full cross-processor TLB
12543 + * flush, even if no stale TLB entries exist on other processors.
12544 + * There are no security implications to leaving a stale TLB when
12545 + * increasing the permissions on a page.
12547 +static int spurious_fault(unsigned long address,
12548 + unsigned long error_code)
12555 + /* Reserved-bit violation or user access to kernel space? */
12556 + if (error_code & (PF_USER | PF_RSVD))
12559 + pgd = init_mm.pgd + pgd_index(address);
12560 + if (!pgd_present(*pgd))
12563 + pud = pud_offset(pgd, address);
12564 + if (!pud_present(*pud))
12567 + if (pud_large(*pud))
12568 + return spurious_fault_check(error_code, (pte_t *) pud);
12570 + pmd = pmd_offset(pud, address);
12571 + if (!pmd_present(*pmd))
12574 + if (pmd_large(*pmd))
12575 + return spurious_fault_check(error_code, (pte_t *) pmd);
12577 + pte = pte_offset_kernel(pmd, address);
12578 + if (!pte_present(*pte))
12581 + return spurious_fault_check(error_code, pte);
12586 + * Handle a fault on the vmalloc or module mapping area
12589 + * Handle a fault on the vmalloc area
12591 + * This assumes no large pages in there.
12593 +static int vmalloc_fault(unsigned long address)
12595 +#ifdef CONFIG_X86_32
12596 + unsigned long pgd_paddr;
12600 + * Synchronize this task's top level page-table
12601 + * with the 'reference' page table.
12603 + * Do _not_ use "current" here. We might be inside
12604 + * an interrupt in the middle of a task switch..
12606 + pgd_paddr = read_cr3();
12607 + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
12610 + pte_k = pte_offset_kernel(pmd_k, address);
12611 + if (!pte_present(*pte_k))
12615 + pgd_t *pgd, *pgd_ref;
12616 + pud_t *pud, *pud_ref;
12617 + pmd_t *pmd, *pmd_ref;
12618 + pte_t *pte, *pte_ref;
12620 + /* Make sure we are in vmalloc area */
12621 + if (!(address >= VMALLOC_START && address < VMALLOC_END))
12624 + /* Copy kernel mappings over when needed. This can also
12625 + happen within a race in page table update. In the latter
12626 + case just flush. */
12628 + /* On Xen the line below does not always work. Needs investigating! */
12629 + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
12630 + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
12631 + pgd += pgd_index(address);
12632 + pgd_ref = pgd_offset_k(address);
12633 + if (pgd_none(*pgd_ref))
12635 + if (pgd_none(*pgd))
12636 + set_pgd(pgd, *pgd_ref);
12638 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12640 + /* Below here mismatches are bugs because these lower tables
12643 + pud = pud_offset(pgd, address);
12644 + pud_ref = pud_offset(pgd_ref, address);
12645 + if (pud_none(*pud_ref))
12647 + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
12649 + pmd = pmd_offset(pud, address);
12650 + pmd_ref = pmd_offset(pud_ref, address);
12651 + if (pmd_none(*pmd_ref))
12653 + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
12655 + pte_ref = pte_offset_kernel(pmd_ref, address);
12656 + if (!pte_present(*pte_ref))
12658 + pte = pte_offset_kernel(pmd, address);
12659 + /* Don't use pte_page here, because the mappings can point
12660 + outside mem_map, and the NUMA hash lookup cannot handle
12662 + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
12668 +int show_unhandled_signals = 1;
12671 + * This routine handles page faults. It determines the address,
12672 + * and the problem, and then passes it off to one of the appropriate
12675 +#ifdef CONFIG_X86_64
12678 +void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
12680 + struct task_struct *tsk;
12681 + struct mm_struct *mm;
12682 + struct vm_area_struct *vma;
12683 + unsigned long address;
12684 + int write, si_code;
12686 +#ifdef CONFIG_X86_64
12687 + unsigned long flags;
12691 + * We can fault from pretty much anywhere, with unknown IRQ state.
12693 + trace_hardirqs_fixup();
12695 + /* Set the "privileged fault" bit to something sane. */
12696 + if (user_mode_vm(regs))
12697 + error_code |= PF_USER;
12699 + error_code &= ~PF_USER;
12703 + prefetchw(&mm->mmap_sem);
12705 + /* get the address */
12706 + address = read_cr2();
12708 + si_code = SEGV_MAPERR;
12710 + if (notify_page_fault(regs))
12714 + * We fault-in kernel-space virtual memory on-demand. The
12715 + * 'reference' page table is init_mm.pgd.
12717 + * NOTE! We MUST NOT take any locks for this case. We may
12718 + * be in an interrupt or a critical region, and should
12719 + * only copy the information from the master page table,
12722 + * This verifies that the fault happens in kernel space
12723 + * (error_code & 4) == 0, and that the fault was not a
12724 + * protection error (error_code & 9) == 0.
12726 +#ifdef CONFIG_X86_32
12727 + if (unlikely(address >= TASK_SIZE)) {
12729 + if (unlikely(address >= TASK_SIZE64)) {
12731 + /* Faults in hypervisor area can never be patched up. */
12732 +#if defined(CONFIG_X86_XEN)
12733 + if (address >= hypervisor_virt_start)
12734 + goto bad_area_nosemaphore;
12735 +#elif defined(CONFIG_X86_64_XEN)
12736 + /* Faults in hypervisor area are never spurious. */
12737 + if (address >= HYPERVISOR_VIRT_START
12738 + && address < HYPERVISOR_VIRT_END)
12739 + goto bad_area_nosemaphore;
12741 + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
12742 + vmalloc_fault(address) >= 0)
12745 + /* Can handle a stale RO->RW TLB */
12746 + if (spurious_fault(address, error_code))
12750 + * Don't take the mm semaphore here. If we fixup a prefetch
12751 + * fault we could otherwise deadlock.
12753 + goto bad_area_nosemaphore;
12757 +#ifdef CONFIG_X86_32
12758 + /* It's safe to allow irq's after cr2 has been saved and the vmalloc
12759 + fault has been handled. */
12760 + if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
12761 + local_irq_enable();
12764 + * If we're in an interrupt, have no user context or are running in an
12765 + * atomic region then we must not take the fault.
12767 + if (in_atomic() || !mm)
12768 + goto bad_area_nosemaphore;
12769 +#else /* CONFIG_X86_64 */
12770 + if (likely(regs->flags & X86_EFLAGS_IF))
12771 + local_irq_enable();
12773 + if (unlikely(error_code & PF_RSVD))
12774 + pgtable_bad(address, regs, error_code);
12777 + * If we're in an interrupt, have no user context or are running in an
12778 + * atomic region then we must not take the fault.
12780 + if (unlikely(in_atomic() || !mm))
12781 + goto bad_area_nosemaphore;
12784 + * User-mode registers count as a user access even for any
12785 + * potential system fault or CPU buglet.
12787 + if (user_mode_vm(regs))
12788 + error_code |= PF_USER;
12791 + /* When running in the kernel we expect faults to occur only to
12792 + * addresses in user space. All other faults represent errors in the
12793 + * kernel and should generate an OOPS. Unfortunately, in the case of an
12794 + * erroneous fault occurring in a code path which already holds mmap_sem
12795 + * we will deadlock attempting to validate the fault against the
12796 + * address space. Luckily the kernel only validly references user
12797 + * space from well defined areas of code, which are listed in the
12798 + * exceptions table.
12800 + * As the vast majority of faults will be valid we will only perform
12801 + * the source reference check when there is a possibility of a deadlock.
12802 + * Attempt to lock the address space, if we cannot we then validate the
12803 + * source. If this is invalid we can skip the address space check,
12804 + * thus avoiding the deadlock.
12806 + if (!down_read_trylock(&mm->mmap_sem)) {
12807 + if ((error_code & PF_USER) == 0 &&
12808 + !search_exception_tables(regs->ip))
12809 + goto bad_area_nosemaphore;
12810 + down_read(&mm->mmap_sem);
12813 + vma = find_vma(mm, address);
12816 + if (vma->vm_start <= address)
12818 + if (!(vma->vm_flags & VM_GROWSDOWN))
12820 + if (error_code & PF_USER) {
12822 + * Accessing the stack below %sp is always a bug.
12823 + * The large cushion allows instructions like enter
12824 + * and pusha to work. ("enter $65535,$31" pushes
12825 + * 32 pointers and then decrements %sp by 65535.)
12827 + if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
12830 + if (expand_stack(vma, address))
12833 + * Ok, we have a good vm_area for this memory access, so
12834 + * we can handle it..
12837 + si_code = SEGV_ACCERR;
12839 + switch (error_code & (PF_PROT|PF_WRITE)) {
12840 + default: /* 3: write, present */
12841 + /* fall through */
12842 + case PF_WRITE: /* write, not present */
12843 + if (!(vma->vm_flags & VM_WRITE))
12847 + case PF_PROT: /* read, present */
12849 + case 0: /* read, not present */
12850 + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
12854 +#ifdef CONFIG_X86_32
12858 + * If for any reason at all we couldn't handle the fault,
12859 + * make sure we exit gracefully rather than endlessly redo
12862 + fault = handle_mm_fault(mm, vma, address, write);
12863 + if (unlikely(fault & VM_FAULT_ERROR)) {
12864 + if (fault & VM_FAULT_OOM)
12865 + goto out_of_memory;
12866 + else if (fault & VM_FAULT_SIGBUS)
12870 + if (fault & VM_FAULT_MAJOR)
12875 +#ifdef CONFIG_X86_32
12877 + * Did it hit the DOS screen memory VA from vm86 mode?
12879 + if (v8086_mode(regs)) {
12880 + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
12882 + tsk->thread.screen_bitmap |= 1 << bit;
12885 + up_read(&mm->mmap_sem);
12889 + * Something tried to access memory that isn't in our memory map..
12890 + * Fix it, but check if it's kernel or user first..
12893 + up_read(&mm->mmap_sem);
12895 +bad_area_nosemaphore:
12896 + /* User mode accesses just cause a SIGSEGV */
12897 + if (error_code & PF_USER) {
12899 + * It's possible to have interrupts off here.
12901 + local_irq_enable();
12904 + * Valid to do another page fault here because this one came
12905 + * from user space.
12907 + if (is_prefetch(regs, address, error_code))
12910 + if (is_errata100(regs, address))
12913 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
12914 + printk_ratelimit()) {
12916 +#ifdef CONFIG_X86_32
12917 + "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
12919 + "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
12921 + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
12922 + tsk->comm, task_pid_nr(tsk), address, regs->ip,
12923 + regs->sp, error_code);
12924 + print_vma_addr(" in ", regs->ip);
12928 + tsk->thread.cr2 = address;
12929 + /* Kernel addresses are always protection faults */
12930 + tsk->thread.error_code = error_code | (address >= TASK_SIZE);
12931 + tsk->thread.trap_no = 14;
12932 + force_sig_info_fault(SIGSEGV, si_code, address, tsk);
12936 + if (is_f00f_bug(regs, address))
12940 + /* Are we prepared to handle this kernel fault? */
12941 + if (fixup_exception(regs))
12946 + * Valid to do another page fault here, because if this fault
12947 + * had been triggered by is_prefetch fixup_exception would have
12951 + * Hall of shame of CPU/BIOS bugs.
12953 + if (is_prefetch(regs, address, error_code))
12956 + if (is_errata93(regs, address))
12960 + * Oops. The kernel tried to access some bad page. We'll have to
12961 + * terminate things with extreme prejudice.
12963 +#ifdef CONFIG_X86_32
12964 + bust_spinlocks(1);
12966 + flags = oops_begin();
12969 + show_fault_oops(regs, error_code, address);
12971 + tsk->thread.cr2 = address;
12972 + tsk->thread.trap_no = 14;
12973 + tsk->thread.error_code = error_code;
12975 +#ifdef CONFIG_X86_32
12976 + die("Oops", regs, error_code);
12977 + bust_spinlocks(0);
12978 + do_exit(SIGKILL);
12980 + if (__die("Oops", regs, error_code))
12982 + /* Executive summary in case the body of the oops scrolled away */
12983 + printk(KERN_EMERG "CR2: %016lx\n", address);
12984 + oops_end(flags, regs, SIGKILL);
12988 + * We ran out of memory, or some other thing happened to us that made
12989 + * us unable to handle the page fault gracefully.
12992 + up_read(&mm->mmap_sem);
12993 + if (is_global_init(tsk)) {
12995 +#ifdef CONFIG_X86_32
12996 + down_read(&mm->mmap_sem);
13003 + printk("VM: killing process %s\n", tsk->comm);
13004 + if (error_code & PF_USER)
13005 + do_group_exit(SIGKILL);
13009 + up_read(&mm->mmap_sem);
13011 + /* Kernel mode? Handle exceptions or die */
13012 + if (!(error_code & PF_USER))
13014 +#ifdef CONFIG_X86_32
13015 + /* User space => ok to do another page fault */
13016 + if (is_prefetch(regs, address, error_code))
13019 + tsk->thread.cr2 = address;
13020 + tsk->thread.error_code = error_code;
13021 + tsk->thread.trap_no = 14;
13022 + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
13025 +DEFINE_SPINLOCK(pgd_lock);
13026 +LIST_HEAD(pgd_list);
13028 +void vmalloc_sync_all(void)
13030 +#ifdef CONFIG_X86_32
13032 + * Note that races in the updates of insync and start aren't
13033 + * problematic: insync can only get set bits added, and updates to
13034 + * start are only improving performance (without affecting correctness
13036 + * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
13037 + * This change works just fine with 2-level paging too.
13039 +#define sync_index(a) ((a) >> PMD_SHIFT)
13040 + static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
13041 + static unsigned long start = TASK_SIZE;
13042 + unsigned long address;
13044 + if (SHARED_KERNEL_PMD)
13047 + BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
13048 + for (address = start;
13049 + address < hypervisor_virt_start;
13050 + address += PMD_SIZE) {
13051 + if (!test_bit(sync_index(address), insync)) {
13052 + unsigned long flags;
13053 + struct page *page;
13055 + spin_lock_irqsave(&pgd_lock, flags);
13056 + /* XEN: failure path assumes non-empty pgd_list. */
13057 + if (unlikely(list_empty(&pgd_list))) {
13058 + spin_unlock_irqrestore(&pgd_lock, flags);
13061 + list_for_each_entry(page, &pgd_list, lru) {
13062 + if (!vmalloc_sync_one(page_address(page),
13066 + spin_unlock_irqrestore(&pgd_lock, flags);
13068 + set_bit(sync_index(address), insync);
13070 + if (address == start && test_bit(sync_index(address), insync))
13071 + start = address + PMD_SIZE;
13073 +#else /* CONFIG_X86_64 */
13075 + * Note that races in the updates of insync and start aren't
13076 + * problematic: insync can only get set bits added, and updates to
13077 + * start are only improving performance (without affecting correctness
13080 + static DECLARE_BITMAP(insync, PTRS_PER_PGD);
13081 + static unsigned long start = VMALLOC_START & PGDIR_MASK;
13082 + unsigned long address;
13084 + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
13085 + if (!test_bit(pgd_index(address), insync)) {
13086 + const pgd_t *pgd_ref = pgd_offset_k(address);
13087 + unsigned long flags;
13088 + struct page *page;
13090 + if (pgd_none(*pgd_ref))
13092 + spin_lock_irqsave(&pgd_lock, flags);
13093 + list_for_each_entry(page, &pgd_list, lru) {
13095 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
13096 + if (pgd_none(*pgd))
13097 + set_pgd(pgd, *pgd_ref);
13099 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
13101 + spin_unlock_irqrestore(&pgd_lock, flags);
13102 + set_bit(pgd_index(address), insync);
13104 + if (address == start)
13105 + start = address + PGDIR_SIZE;
13107 + /* Check that there is no need to do the same for the modules area. */
13108 + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
13109 + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
13110 + (__START_KERNEL & PGDIR_MASK)));
13113 --- a/arch/x86/mm/highmem_32-xen.c
13114 +++ b/arch/x86/mm/highmem_32-xen.c
13115 @@ -18,6 +18,49 @@ void kunmap(struct page *page)
13119 +static void debug_kmap_atomic_prot(enum km_type type)
13121 +#ifdef CONFIG_DEBUG_HIGHMEM
13122 + static unsigned warn_count = 10;
13124 + if (unlikely(warn_count == 0))
13127 + if (unlikely(in_interrupt())) {
13129 + if (type != KM_IRQ0 && type != KM_IRQ1 &&
13130 + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
13131 + type != KM_BOUNCE_READ) {
13135 + } else if (!irqs_disabled()) { /* softirq */
13136 + if (type != KM_IRQ0 && type != KM_IRQ1 &&
13137 + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
13138 + type != KM_SKB_SUNRPC_DATA &&
13139 + type != KM_SKB_DATA_SOFTIRQ &&
13140 + type != KM_BOUNCE_READ) {
13147 + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
13148 + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
13149 + if (!irqs_disabled()) {
13153 + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
13154 + if (irq_count() == 0 && !irqs_disabled()) {
13163 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
13164 * no global lock is needed and because the kmap code must perform a global TLB
13165 @@ -37,6 +80,8 @@ void *kmap_atomic_prot(struct page *page
13166 if (!PageHighMem(page))
13167 return page_address(page);
13169 + debug_kmap_atomic_prot(type);
13171 idx = type + KM_TYPE_NR*smp_processor_id();
13172 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
13173 BUG_ON(!pte_none(*(kmap_pte-idx)));
13174 --- a/arch/x86/mm/hypervisor.c
13175 +++ b/arch/x86/mm/hypervisor.c
13176 @@ -831,15 +831,11 @@ int xen_limit_pages_to_max_mfn(
13178 EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
13181 -int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
13182 +int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
13184 - __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
13185 - maddr_t mach_lp = arbitrary_virt_to_machine(lp);
13186 - return HYPERVISOR_update_descriptor(
13187 - mach_lp, (u64)entry_a | ((u64)entry_b<<32));
13188 + maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry);
13189 + return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
13193 #define MAX_BATCHED_FULL_PTES 32
13195 --- a/arch/x86/mm/init_32-xen.c
13196 +++ b/arch/x86/mm/init_32-xen.c
13197 @@ -27,13 +27,13 @@
13198 #include <linux/bootmem.h>
13199 #include <linux/slab.h>
13200 #include <linux/proc_fs.h>
13201 -#include <linux/efi.h>
13202 #include <linux/memory_hotplug.h>
13203 #include <linux/initrd.h>
13204 #include <linux/cpumask.h>
13205 #include <linux/dma-mapping.h>
13206 #include <linux/scatterlist.h>
13208 +#include <asm/asm.h>
13209 #include <asm/processor.h>
13210 #include <asm/system.h>
13211 #include <asm/uaccess.h>
13212 @@ -42,18 +42,22 @@
13213 #include <asm/fixmap.h>
13214 #include <asm/e820.h>
13215 #include <asm/apic.h>
13216 +#include <asm/bugs.h>
13217 #include <asm/tlb.h>
13218 #include <asm/tlbflush.h>
13219 +#include <asm/pgalloc.h>
13220 #include <asm/sections.h>
13221 #include <asm/hypervisor.h>
13222 #include <asm/swiotlb.h>
13223 +#include <asm/setup.h>
13224 +#include <asm/cacheflush.h>
13226 unsigned int __VMALLOC_RESERVE = 128 << 20;
13228 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
13229 unsigned long highstart_pfn, highend_pfn;
13231 -static int noinline do_test_wp_bit(void);
13232 +static noinline int do_test_wp_bit(void);
13235 * Creates a middle page table and puts a pointer to it in the
13236 @@ -64,17 +68,16 @@ static pmd_t * __init one_md_table_init(
13242 #ifdef CONFIG_X86_PAE
13243 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
13244 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
13246 - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
13247 + paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
13248 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
13249 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
13250 pud = pud_offset(pgd, 0);
13251 - if (pmd_table != pmd_offset(pud, 0))
13253 + BUG_ON(pmd_table != pmd_offset(pud, 0));
13256 pud = pud_offset(pgd, 0);
13257 @@ -85,7 +88,7 @@ static pmd_t * __init one_md_table_init(
13260 * Create a page table and place a pointer to it in a middle page
13261 - * directory entry.
13262 + * directory entry:
13264 static pte_t * __init one_page_table_init(pmd_t *pmd)
13266 @@ -99,9 +102,10 @@ static pte_t * __init one_page_table_ini
13267 #ifdef CONFIG_DEBUG_PAGEALLOC
13268 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
13271 + if (!page_table) {
13273 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
13276 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
13277 make_lowmem_page_readonly(page_table,
13278 @@ -114,22 +118,21 @@ static pte_t * __init one_page_table_ini
13282 - * This function initializes a certain range of kernel virtual memory
13283 + * This function initializes a certain range of kernel virtual memory
13284 * with new bootmem page tables, everywhere page tables are missing in
13289 - * NOTE: The pagetables are allocated contiguous on the physical space
13290 - * so we can cache the place of the first one and move around without
13292 + * NOTE: The pagetables are allocated contiguous on the physical space
13293 + * so we can cache the place of the first one and move around without
13294 * checking the pgd every time.
13296 -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
13297 +static void __init
13298 +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
13302 int pgd_idx, pmd_idx;
13303 unsigned long vaddr;
13308 pgd_idx = pgd_index(vaddr);
13309 @@ -139,7 +142,8 @@ static void __init page_table_range_init
13310 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
13311 pmd = one_md_table_init(pgd);
13312 pmd = pmd + pmd_index(vaddr);
13313 - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
13314 + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
13315 + pmd++, pmd_idx++) {
13316 if (vaddr < hypervisor_virt_start)
13317 one_page_table_init(pmd);
13319 @@ -157,17 +161,17 @@ static inline int is_kernel_text(unsigne
13323 - * This maps the physical memory to kernel virtual address space, a total
13324 - * of max_low_pfn pages, by creating page tables starting from address
13326 + * This maps the physical memory to kernel virtual address space, a total
13327 + * of max_low_pfn pages, by creating page tables starting from address
13330 static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
13332 + int pgd_idx, pmd_idx, pte_ofs;
13337 - int pgd_idx, pmd_idx, pte_ofs;
13339 unsigned long max_ram_pfn = xen_start_info->nr_pages;
13340 if (max_ram_pfn > max_low_pfn)
13341 @@ -195,36 +199,49 @@ static void __init kernel_physical_mappi
13342 if (pfn >= max_low_pfn)
13345 - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
13346 - unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
13347 - if (address >= hypervisor_virt_start)
13348 + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
13349 + pmd++, pmd_idx++) {
13350 + unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
13352 + if (addr >= hypervisor_virt_start)
13355 - /* Map with big pages if possible, otherwise create normal page tables. */
13357 + * Map with big pages if possible, otherwise
13358 + * create normal page tables:
13361 - unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
13362 - if (is_kernel_text(address) || is_kernel_text(address2))
13363 - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
13365 - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
13366 + unsigned int addr2;
13367 + pgprot_t prot = PAGE_KERNEL_LARGE;
13369 + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
13370 + PAGE_OFFSET + PAGE_SIZE-1;
13372 + if (is_kernel_text(addr) ||
13373 + is_kernel_text(addr2))
13374 + prot = PAGE_KERNEL_LARGE_EXEC;
13376 + set_pmd(pmd, pfn_pmd(pfn, prot));
13378 pfn += PTRS_PER_PTE;
13380 - pte = one_page_table_init(pmd);
13383 + pte = one_page_table_init(pmd);
13385 + for (pte += pte_ofs;
13386 + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13387 + pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
13388 + pgprot_t prot = PAGE_KERNEL;
13390 + /* XEN: Only map initial RAM allocation. */
13391 + if ((pfn >= max_ram_pfn) || pte_present(*pte))
13393 + if (is_kernel_text(addr))
13394 + prot = PAGE_KERNEL_EXEC;
13396 - for (pte += pte_ofs;
13397 - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13398 - pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
13399 - /* XEN: Only map initial RAM allocation. */
13400 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
13402 - if (is_kernel_text(address))
13403 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
13405 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
13408 + set_pte(pte, pfn_pte(pfn, prot));
13414 @@ -245,57 +262,23 @@ static inline int page_kills_ppro(unsign
13418 -int page_is_ram(unsigned long pagenr)
13421 - unsigned long addr, end;
13423 - if (efi_enabled) {
13424 - efi_memory_desc_t *md;
13427 - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
13429 - if (!is_available_memory(md))
13431 - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13432 - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
13434 - if ((pagenr >= addr) && (pagenr < end))
13440 - for (i = 0; i < e820.nr_map; i++) {
13442 - if (e820.map[i].type != E820_RAM) /* not usable memory */
13445 - * !!!FIXME!!! Some BIOSen report areas as RAM that
13446 - * are not. Notably the 640->1Mb area. We need a sanity
13449 - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13450 - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
13451 - if ((pagenr >= addr) && (pagenr < end))
13457 #ifdef CONFIG_HIGHMEM
13459 pgprot_t kmap_prot;
13461 -#define kmap_get_fixmap_pte(vaddr) \
13462 - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
13463 +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
13465 + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
13466 + vaddr), vaddr), vaddr);
13469 static void __init kmap_init(void)
13471 unsigned long kmap_vstart;
13473 - /* cache the first kmap pte */
13475 + * Cache the first kmap pte:
13477 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
13478 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
13480 @@ -304,11 +287,11 @@ static void __init kmap_init(void)
13482 static void __init permanent_kmaps_init(pgd_t *pgd_base)
13484 + unsigned long vaddr;
13489 - unsigned long vaddr;
13491 vaddr = PKMAP_BASE;
13492 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
13493 @@ -317,7 +300,7 @@ static void __init permanent_kmaps_init(
13494 pud = pud_offset(pgd, vaddr);
13495 pmd = pmd_offset(pud, vaddr);
13496 pte = pte_offset_kernel(pmd, vaddr);
13497 - pkmap_page_table = pte;
13498 + pkmap_page_table = pte;
13501 static void __meminit free_new_highpage(struct page *page, int pfn)
13502 @@ -337,7 +320,8 @@ void __init add_one_highpage_init(struct
13503 SetPageReserved(page);
13506 -static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13507 +static int __meminit
13508 +add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13510 free_new_highpage(page, pfn);
13512 @@ -345,6 +329,7 @@ static int __meminit add_one_highpage_ho
13513 max_mapnr = max(pfn, max_mapnr);
13520 @@ -352,7 +337,7 @@ static int __meminit add_one_highpage_ho
13521 * Not currently handling the NUMA case.
13522 * Assuming single node and all memory that
13523 * has been added dynamically that would be
13524 - * onlined here is in HIGHMEM
13525 + * onlined here is in HIGHMEM.
13527 void __meminit online_page(struct page *page)
13529 @@ -360,13 +345,11 @@ void __meminit online_page(struct page *
13530 add_one_highpage_hotplug(page, page_to_pfn(page));
13534 -#ifdef CONFIG_NUMA
13535 -extern void set_highmem_pages_init(int);
13537 +#ifndef CONFIG_NUMA
13538 static void __init set_highmem_pages_init(int bad_ppro)
13542 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
13544 * Holes under sparsemem might not have no mem_map[]:
13545 @@ -376,23 +359,18 @@ static void __init set_highmem_pages_ini
13547 totalram_pages += totalhigh_pages;
13549 -#endif /* CONFIG_FLATMEM */
13550 +#endif /* !CONFIG_NUMA */
13553 -#define kmap_init() do { } while (0)
13554 -#define permanent_kmaps_init(pgd_base) do { } while (0)
13555 -#define set_highmem_pages_init(bad_ppro) do { } while (0)
13556 +# define kmap_init() do { } while (0)
13557 +# define permanent_kmaps_init(pgd_base) do { } while (0)
13558 +# define set_highmem_pages_init(bad_ppro) do { } while (0)
13559 #endif /* CONFIG_HIGHMEM */
13561 -unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
13562 +pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
13563 EXPORT_SYMBOL(__PAGE_KERNEL);
13564 -unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13566 -#ifdef CONFIG_NUMA
13567 -extern void __init remap_numa_kva(void);
13569 -#define remap_numa_kva() do {} while (0)
13571 +pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13573 pgd_t *swapper_pg_dir;
13575 @@ -410,9 +388,8 @@ static void __init xen_pagetable_setup_d
13576 * the boot process.
13578 * If we're booting on native hardware, this will be a pagetable
13579 - * constructed in arch/i386/kernel/head.S, and not running in PAE mode
13580 - * (even if we'll end up running in PAE). The root of the pagetable
13581 - * will be swapper_pg_dir.
13582 + * constructed in arch/x86/kernel/head_32.S. The root of the
13583 + * pagetable will be swapper_pg_dir.
13585 * If we're booting paravirtualized under a hypervisor, then there are
13586 * more options: we may already be running PAE, and the pagetable may
13587 @@ -424,10 +401,10 @@ static void __init xen_pagetable_setup_d
13588 * be partially populated, and so it avoids stomping on any existing
13591 -static void __init pagetable_init (void)
13592 +static void __init pagetable_init(void)
13594 - unsigned long vaddr, end;
13595 pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
13596 + unsigned long vaddr, end;
13598 xen_pagetable_setup_start(pgd_base);
13600 @@ -449,34 +426,36 @@ static void __init pagetable_init (void)
13601 * Fixed mappings, only the page table structure has to be
13602 * created - mappings will be set by set_fixmap():
13604 + early_ioremap_clear();
13605 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
13606 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
13607 page_table_range_init(vaddr, end, pgd_base);
13608 + early_ioremap_reset();
13610 permanent_kmaps_init(pgd_base);
13612 xen_pagetable_setup_done(pgd_base);
13615 -#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
13616 +#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
13618 - * Swap suspend & friends need this for resume because things like the intel-agp
13619 + * ACPI suspend needs this for resume, because things like the intel-agp
13620 * driver might have split up a kernel 4MB mapping.
13622 -char __nosavedata swsusp_pg_dir[PAGE_SIZE]
13623 - __attribute__ ((aligned (PAGE_SIZE)));
13624 +char swsusp_pg_dir[PAGE_SIZE]
13625 + __attribute__ ((aligned(PAGE_SIZE)));
13627 static inline void save_pg_dir(void)
13629 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
13632 +#else /* !CONFIG_ACPI_SLEEP */
13633 static inline void save_pg_dir(void)
13637 +#endif /* !CONFIG_ACPI_SLEEP */
13639 -void zap_low_mappings (void)
13640 +void zap_low_mappings(void)
13644 @@ -488,22 +467,24 @@ void zap_low_mappings (void)
13645 * Note that "pgd_clear()" doesn't do it for
13646 * us, because pgd_clear() is a no-op on i386.
13648 - for (i = 0; i < USER_PTRS_PER_PGD; i++)
13649 + for (i = 0; i < USER_PTRS_PER_PGD; i++) {
13650 #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13651 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
13653 set_pgd(swapper_pg_dir+i, __pgd(0));
13659 -int nx_enabled = 0;
13662 +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
13663 +EXPORT_SYMBOL_GPL(__supported_pte_mask);
13665 #ifdef CONFIG_X86_PAE
13667 -static int disable_nx __initdata = 0;
13668 -u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
13669 -EXPORT_SYMBOL_GPL(__supported_pte_mask);
13670 +static int disable_nx __initdata;
13674 @@ -520,11 +501,14 @@ static int __init noexec_setup(char *str
13675 __supported_pte_mask |= _PAGE_NX;
13678 - } else if (!strcmp(str,"off")) {
13680 - __supported_pte_mask &= ~_PAGE_NX;
13684 + if (!strcmp(str, "off")) {
13686 + __supported_pte_mask &= ~_PAGE_NX;
13694 @@ -536,6 +520,7 @@ static void __init set_nx(void)
13696 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
13697 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
13699 if ((v[3] & (1 << 20)) && !disable_nx) {
13700 rdmsr(MSR_EFER, l, h);
13702 @@ -545,35 +530,6 @@ static void __init set_nx(void)
13708 - * Enables/disables executability of a given kernel page and
13709 - * returns the previous setting.
13711 -int __init set_kernel_exec(unsigned long vaddr, int enable)
13719 - pte = lookup_address(vaddr);
13722 - if (!pte_exec_kernel(*pte))
13726 - pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
13728 - pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
13729 - pte_update_defer(&init_mm, vaddr, pte);
13730 - __flush_tlb_all();
13738 @@ -590,21 +546,10 @@ void __init paging_init(void)
13739 #ifdef CONFIG_X86_PAE
13742 - printk("NX (Execute Disable) protection: active\n");
13743 + printk(KERN_INFO "NX (Execute Disable) protection: active\n");
13748 -#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13750 - * We will bail out later - printk doesn't work right now so
13751 - * the user would just see a hanging kernel.
13752 - * when running as xen domain we are already in PAE mode at
13756 - set_in_cr4(X86_CR4_PAE);
13761 @@ -631,10 +576,10 @@ void __init paging_init(void)
13762 * used to involve black magic jumps to work around some nasty CPU bugs,
13763 * but fortunately the switch to using exceptions got rid of all that.
13766 static void __init test_wp_bit(void)
13768 - printk("Checking if this processor honours the WP bit even in supervisor mode... ");
13770 + "Checking if this processor honours the WP bit even in supervisor mode...");
13772 /* Any page-aligned address will do, the test is non-destructive */
13773 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
13774 @@ -642,23 +587,22 @@ static void __init test_wp_bit(void)
13775 clear_fixmap(FIX_WP_TEST);
13777 if (!boot_cpu_data.wp_works_ok) {
13779 + printk(KERN_CONT "No.\n");
13780 #ifdef CONFIG_X86_WP_WORKS_OK
13781 - panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13783 + "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13787 + printk(KERN_CONT "Ok.\n");
13791 -static struct kcore_list kcore_mem, kcore_vmalloc;
13792 +static struct kcore_list kcore_mem, kcore_vmalloc;
13794 void __init mem_init(void)
13796 - extern int ppro_with_ram_bug(void);
13797 int codesize, reservedpages, datasize, initsize;
13800 + int tmp, bad_ppro;
13803 #if defined(CONFIG_SWIOTLB)
13804 @@ -668,19 +612,19 @@ void __init mem_init(void)
13805 #ifdef CONFIG_FLATMEM
13809 bad_ppro = ppro_with_ram_bug();
13811 #ifdef CONFIG_HIGHMEM
13812 /* check that fixmap and pkmap do not overlap */
13813 - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13814 - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
13815 + if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13817 + "fixmap and kmap areas overlap - this will crash\n");
13818 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
13819 - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
13820 + PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
13826 /* this will put all low memory onto the freelists */
13827 totalram_pages += free_all_bootmem();
13828 /* XEN: init and count low-mem pages outside initial allocation. */
13829 @@ -693,7 +637,7 @@ void __init mem_init(void)
13831 for (tmp = 0; tmp < max_low_pfn; tmp++)
13833 - * Only count reserved RAM pages
13834 + * Only count reserved RAM pages:
13836 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
13838 @@ -704,11 +648,12 @@ void __init mem_init(void)
13839 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
13840 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
13842 - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13843 - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13844 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13845 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13846 VMALLOC_END-VMALLOC_START);
13848 - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
13849 + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
13850 + "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
13851 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
13852 num_physpages << (PAGE_SHIFT-10),
13854 @@ -719,54 +664,53 @@ void __init mem_init(void)
13857 #if 1 /* double-sanity-check paranoia */
13858 - printk("virtual kernel memory layout:\n"
13859 - " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13860 + printk(KERN_INFO "virtual kernel memory layout:\n"
13861 + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13862 #ifdef CONFIG_HIGHMEM
13863 - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13864 + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13866 - " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
13867 - " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
13868 - " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
13869 - " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
13870 - " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
13871 - FIXADDR_START, FIXADDR_TOP,
13872 - (FIXADDR_TOP - FIXADDR_START) >> 10,
13873 + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
13874 + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
13875 + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
13876 + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
13877 + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
13878 + FIXADDR_START, FIXADDR_TOP,
13879 + (FIXADDR_TOP - FIXADDR_START) >> 10,
13881 #ifdef CONFIG_HIGHMEM
13882 - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13883 - (LAST_PKMAP*PAGE_SIZE) >> 10,
13884 + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13885 + (LAST_PKMAP*PAGE_SIZE) >> 10,
13888 - VMALLOC_START, VMALLOC_END,
13889 - (VMALLOC_END - VMALLOC_START) >> 20,
13890 + VMALLOC_START, VMALLOC_END,
13891 + (VMALLOC_END - VMALLOC_START) >> 20,
13893 - (unsigned long)__va(0), (unsigned long)high_memory,
13894 - ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13895 + (unsigned long)__va(0), (unsigned long)high_memory,
13896 + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13898 - (unsigned long)&__init_begin, (unsigned long)&__init_end,
13899 - ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
13900 + (unsigned long)&__init_begin, (unsigned long)&__init_end,
13901 + ((unsigned long)&__init_end -
13902 + (unsigned long)&__init_begin) >> 10,
13904 - (unsigned long)&_etext, (unsigned long)&_edata,
13905 - ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13906 + (unsigned long)&_etext, (unsigned long)&_edata,
13907 + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13909 - (unsigned long)&_text, (unsigned long)&_etext,
13910 - ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13911 + (unsigned long)&_text, (unsigned long)&_etext,
13912 + ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13914 #ifdef CONFIG_HIGHMEM
13915 - BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
13916 - BUG_ON(VMALLOC_END > PKMAP_BASE);
13917 + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
13918 + BUG_ON(VMALLOC_END > PKMAP_BASE);
13920 - BUG_ON(VMALLOC_START > VMALLOC_END);
13921 - BUG_ON((unsigned long)high_memory > VMALLOC_START);
13922 + BUG_ON(VMALLOC_START > VMALLOC_END);
13923 + BUG_ON((unsigned long)high_memory > VMALLOC_START);
13924 #endif /* double-sanity-check paranoia */
13926 -#ifdef CONFIG_X86_PAE
13927 - if (!cpu_has_pae)
13928 - panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
13930 if (boot_cpu_data.wp_works_ok < 0)
13936 * Subtle. SMP is doing it's boot stuff late (because it has to
13937 * fork idle threads) - but it also needs low mappings for the
13938 @@ -790,49 +734,35 @@ int arch_add_memory(int nid, u64 start,
13940 return __add_pages(zone, start_pfn, nr_pages);
13945 -struct kmem_cache *pmd_cache;
13947 -void __init pgtable_cache_init(void)
13949 - if (PTRS_PER_PMD > 1)
13950 - pmd_cache = kmem_cache_create("pmd",
13951 - PTRS_PER_PMD*sizeof(pmd_t),
13952 - PTRS_PER_PMD*sizeof(pmd_t),
13958 * This function cannot be __init, since exceptions don't work in that
13959 * section. Put this after the callers, so that it cannot be inlined.
13961 -static int noinline do_test_wp_bit(void)
13962 +static noinline int do_test_wp_bit(void)
13967 __asm__ __volatile__(
13969 - "1: movb %1,%0 \n"
13971 + " movb %0, %1 \n"
13972 + "1: movb %1, %0 \n"
13973 + " xorl %2, %2 \n"
13975 - ".section __ex_table,\"a\"\n"
13977 - " .long 1b,2b \n"
13979 + _ASM_EXTABLE(1b,2b)
13980 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
13990 #ifdef CONFIG_DEBUG_RODATA
13991 +const int rodata_test_data = 0xC3;
13992 +EXPORT_SYMBOL_GPL(rodata_test_data);
13994 void mark_rodata_ro(void)
13996 @@ -845,32 +775,58 @@ void mark_rodata_ro(void)
13997 if (num_possible_cpus() <= 1)
14000 - change_page_attr(virt_to_page(start),
14001 - size >> PAGE_SHIFT, PAGE_KERNEL_RX);
14002 - printk("Write protecting the kernel text: %luk\n", size >> 10);
14003 + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
14004 + printk(KERN_INFO "Write protecting the kernel text: %luk\n",
14007 +#ifdef CONFIG_CPA_DEBUG
14008 + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
14009 + start, start+size);
14010 + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
14012 + printk(KERN_INFO "Testing CPA: write protecting again\n");
14013 + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
14018 size = (unsigned long)__end_rodata - start;
14019 - change_page_attr(virt_to_page(start),
14020 - size >> PAGE_SHIFT, PAGE_KERNEL_RO);
14021 - printk("Write protecting the kernel read-only data: %luk\n",
14023 + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
14024 + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
14028 +#ifdef CONFIG_CPA_DEBUG
14029 + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
14030 + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
14033 - * change_page_attr() requires a global_flush_tlb() call after it.
14034 - * We do this after the printk so that if something went wrong in the
14035 - * change, the printk gets out at least to give a better debug hint
14036 - * of who is the culprit.
14038 - global_flush_tlb();
14039 + printk(KERN_INFO "Testing CPA: write protecting again\n");
14040 + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
14045 void free_init_pages(char *what, unsigned long begin, unsigned long end)
14047 +#ifdef CONFIG_DEBUG_PAGEALLOC
14049 + * If debugging page accesses then do not free this memory but
14050 + * mark them not present - any buggy init-section access will
14051 + * create a kernel page fault:
14053 + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
14054 + begin, PAGE_ALIGN(end));
14055 + set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
14057 unsigned long addr;
14060 + * We just marked the kernel text read only above, now that
14061 + * we are going to free part of that, we need to make that
14062 + * writeable first.
14064 + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
14066 for (addr = begin; addr < end; addr += PAGE_SIZE) {
14067 ClearPageReserved(virt_to_page(addr));
14068 init_page_count(virt_to_page(addr));
14069 @@ -879,6 +835,7 @@ void free_init_pages(char *what, unsigne
14072 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
14076 void free_initmem(void)
14077 @@ -894,4 +851,3 @@ void free_initrd_mem(unsigned long start
14078 free_init_pages("initrd memory", start, end);
14082 --- a/arch/x86/mm/init_64-xen.c
14083 +++ b/arch/x86/mm/init_64-xen.c
14084 @@ -46,14 +46,13 @@
14085 #include <asm/proto.h>
14086 #include <asm/smp.h>
14087 #include <asm/sections.h>
14088 +#include <asm/kdebug.h>
14089 +#include <asm/numa.h>
14090 +#include <asm/cacheflush.h>
14092 #include <xen/features.h>
14095 -#define Dprintk(x...)
14098 -const struct dma_mapping_ops* dma_ops;
14099 +const struct dma_mapping_ops *dma_ops;
14100 EXPORT_SYMBOL(dma_ops);
14102 #if CONFIG_XEN_COMPAT <= 0x030002
14103 @@ -80,7 +79,21 @@ extern pte_t level1_fixmap_pgt[PTRS_PER_
14104 (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
14105 __START_KERNEL_map)))
14107 -static void __meminit early_make_page_readonly(void *va, unsigned int feature)
14108 +pmd_t *__init early_get_pmd(unsigned long va)
14110 + unsigned long addr;
14111 + unsigned long *page = (unsigned long *)init_level4_pgt;
14113 + addr = page[pgd_index(va)];
14114 + addr_to_page(addr, page);
14116 + addr = page[pud_index(va)];
14117 + addr_to_page(addr, page);
14119 + return (pmd_t *)&page[pmd_index(va)];
14122 +void __meminit early_make_page_readonly(void *va, unsigned int feature)
14124 unsigned long addr, _va = (unsigned long)va;
14126 @@ -107,76 +120,6 @@ static void __meminit early_make_page_re
14130 -static void __make_page_readonly(void *va)
14132 - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
14133 - unsigned long addr = (unsigned long) va;
14135 - pgd = pgd_offset_k(addr);
14136 - pud = pud_offset(pgd, addr);
14137 - pmd = pmd_offset(pud, addr);
14138 - ptep = pte_offset_kernel(pmd, addr);
14140 - pte.pte = ptep->pte & ~_PAGE_RW;
14141 - if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14142 - xen_l1_entry_update(ptep, pte); /* fallback */
14144 - if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14145 - __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
14148 -static void __make_page_writable(void *va)
14150 - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
14151 - unsigned long addr = (unsigned long) va;
14153 - pgd = pgd_offset_k(addr);
14154 - pud = pud_offset(pgd, addr);
14155 - pmd = pmd_offset(pud, addr);
14156 - ptep = pte_offset_kernel(pmd, addr);
14158 - pte.pte = ptep->pte | _PAGE_RW;
14159 - if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14160 - xen_l1_entry_update(ptep, pte); /* fallback */
14162 - if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14163 - __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
14166 -void make_page_readonly(void *va, unsigned int feature)
14168 - if (!xen_feature(feature))
14169 - __make_page_readonly(va);
14172 -void make_page_writable(void *va, unsigned int feature)
14174 - if (!xen_feature(feature))
14175 - __make_page_writable(va);
14178 -void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
14180 - if (xen_feature(feature))
14183 - while (nr-- != 0) {
14184 - __make_page_readonly(va);
14185 - va = (void*)((unsigned long)va + PAGE_SIZE);
14189 -void make_pages_writable(void *va, unsigned nr, unsigned int feature)
14191 - if (xen_feature(feature))
14194 - while (nr-- != 0) {
14195 - __make_page_writable(va);
14196 - va = (void*)((unsigned long)va + PAGE_SIZE);
14201 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
14202 * physical space so we can cache the place of the first one and move
14203 @@ -187,22 +130,26 @@ void show_mem(void)
14205 long i, total = 0, reserved = 0;
14206 long shared = 0, cached = 0;
14207 - pg_data_t *pgdat;
14209 + pg_data_t *pgdat;
14211 printk(KERN_INFO "Mem-info:\n");
14213 - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
14214 + printk(KERN_INFO "Free swap: %6ldkB\n",
14215 + nr_swap_pages << (PAGE_SHIFT-10));
14217 for_each_online_pgdat(pgdat) {
14218 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14219 - /* this loop can take a while with 256 GB and 4k pages
14220 - so update the NMI watchdog */
14221 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
14222 + for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14224 + * This loop can take a while with 256 GB and
14225 + * 4k pages so defer the NMI watchdog:
14227 + if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
14228 touch_nmi_watchdog();
14231 if (!pfn_valid(pgdat->node_start_pfn + i))
14234 page = pfn_to_page(pgdat->node_start_pfn + i);
14236 if (PageReserved(page))
14237 @@ -211,58 +158,67 @@ void show_mem(void)
14239 else if (page_count(page))
14240 shared += page_count(page) - 1;
14244 - printk(KERN_INFO "%lu pages of RAM\n", total);
14245 - printk(KERN_INFO "%lu reserved pages\n",reserved);
14246 - printk(KERN_INFO "%lu pages shared\n",shared);
14247 - printk(KERN_INFO "%lu pages swap cached\n",cached);
14248 + printk(KERN_INFO "%lu pages of RAM\n", total);
14249 + printk(KERN_INFO "%lu reserved pages\n", reserved);
14250 + printk(KERN_INFO "%lu pages shared\n", shared);
14251 + printk(KERN_INFO "%lu pages swap cached\n", cached);
14254 +static unsigned long __meminitdata table_start;
14255 +static unsigned long __meminitdata table_end;
14257 static __init void *spp_getpage(void)
14263 - ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14264 + ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14265 else if (start_pfn < table_end) {
14266 ptr = __va(start_pfn << PAGE_SHIFT);
14268 memset(ptr, 0, PAGE_SIZE);
14270 ptr = alloc_bootmem_pages(PAGE_SIZE);
14271 - if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
14272 - panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
14274 - Dprintk("spp_getpage %p\n", ptr);
14275 + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
14276 + panic("set_pte_phys: cannot allocate page data %s\n",
14277 + after_bootmem ? "after bootmem" : "");
14280 + pr_debug("spp_getpage %p\n", ptr);
14286 #define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
14287 #define pud_offset_u(address) (level3_user_pgt + pud_index(address))
14289 -static __init void set_pte_phys(unsigned long vaddr,
14290 - unsigned long phys, pgprot_t prot, int user_mode)
14291 +static __init void
14292 +set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
14297 pte_t *pte, new_pte;
14299 - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14300 + pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
14302 pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
14303 if (pgd_none(*pgd)) {
14304 - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14306 + "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14309 pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
14310 if (pud_none(*pud)) {
14311 - pmd = (pmd_t *) spp_getpage();
14312 + pmd = (pmd_t *) spp_getpage();
14313 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14314 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14315 if (pmd != pmd_offset(pud, 0)) {
14316 - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14317 + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14318 + pmd, pmd_offset(pud, 0));
14322 @@ -272,7 +228,7 @@ static __init void set_pte_phys(unsigned
14323 make_page_readonly(pte, XENFEAT_writable_page_tables);
14324 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14325 if (pte != pte_offset_kernel(pmd, 0)) {
14326 - printk("PAGETABLE BUG #02!\n");
14327 + printk(KERN_ERR "PAGETABLE BUG #02!\n");
14331 @@ -294,30 +250,30 @@ static __init void set_pte_phys(unsigned
14332 __flush_tlb_one(vaddr);
14335 -static __init void set_pte_phys_ma(unsigned long vaddr,
14336 - unsigned long phys, pgprot_t prot)
14337 +static __init void
14338 +set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
14343 pte_t *pte, new_pte;
14345 - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14346 + pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
14348 pgd = pgd_offset_k(vaddr);
14349 if (pgd_none(*pgd)) {
14350 - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14352 + "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14355 pud = pud_offset(pgd, vaddr);
14356 if (pud_none(*pud)) {
14358 - pmd = (pmd_t *) spp_getpage();
14359 + pmd = (pmd_t *) spp_getpage();
14360 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14361 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14362 if (pmd != pmd_offset(pud, 0)) {
14363 - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14365 + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14366 + pmd, pmd_offset(pud, 0));
14369 pmd = pmd_offset(pud, vaddr);
14370 @@ -326,7 +282,7 @@ static __init void set_pte_phys_ma(unsig
14371 make_page_readonly(pte, XENFEAT_writable_page_tables);
14372 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14373 if (pte != pte_offset_kernel(pmd, 0)) {
14374 - printk("PAGETABLE BUG #02!\n");
14375 + printk(KERN_ERR "PAGETABLE BUG #02!\n");
14379 @@ -350,14 +306,44 @@ static __init void set_pte_phys_ma(unsig
14380 __flush_tlb_one(vaddr);
14383 +#ifndef CONFIG_XEN
14385 + * The head.S code sets up the kernel high mapping:
14387 + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
14389 + * phys_base holds the negative offset to the kernel, which is added
14390 + * to the compile time generated pmds. This results in invalid pmds up
14391 + * to the point where we hit the physaddr 0 mapping.
14393 + * We limit the mappings to the region from _text to _end. _end is
14394 + * rounded up to the 2MB boundary. This catches the invalid pmds as
14395 + * well, as they are located before _text:
14397 +void __init cleanup_highmap(void)
14399 + unsigned long vaddr = __START_KERNEL_map;
14400 + unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
14401 + pmd_t *pmd = level2_kernel_pgt;
14402 + pmd_t *last_pmd = pmd + PTRS_PER_PMD;
14404 + for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
14405 + if (!pmd_present(*pmd))
14407 + if (vaddr < (unsigned long) _text || vaddr > end)
14408 + set_pmd(pmd, __pmd(0));
14413 /* NOTE: this is meant to be run only at boot */
14415 -__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14417 +__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14419 unsigned long address = __fix_to_virt(idx);
14421 if (idx >= __end_of_fixed_addresses) {
14422 - printk("Invalid __set_fixmap\n");
14423 + printk(KERN_ERR "Invalid __set_fixmap\n");
14427 @@ -375,16 +361,14 @@ __set_fixmap (enum fixed_addresses idx,
14431 -unsigned long __meminitdata table_start, table_end;
14433 static __meminit void *alloc_static_page(unsigned long *phys)
14435 unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
14437 if (after_bootmem) {
14438 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
14445 @@ -396,7 +380,7 @@ static __meminit void *alloc_static_page
14447 #define PTE_SIZE PAGE_SIZE
14449 -static inline int make_readonly(unsigned long paddr)
14450 +static inline int __meminit make_readonly(unsigned long paddr)
14452 extern char __vsyscall_0;
14454 @@ -430,33 +414,38 @@ static inline int make_readonly(unsigned
14455 /* Must run before zap_low_mappings */
14456 __meminit void *early_ioremap(unsigned long addr, unsigned long size)
14458 - unsigned long vaddr;
14459 pmd_t *pmd, *last_pmd;
14460 + unsigned long vaddr;
14463 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14464 vaddr = __START_KERNEL_map;
14465 pmd = level2_kernel_pgt;
14466 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
14468 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
14469 for (i = 0; i < pmds; i++) {
14470 if (pmd_present(pmd[i]))
14472 + goto continue_outer_loop;
14474 vaddr += addr & ~PMD_MASK;
14477 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
14478 - set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
14480 + set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
14481 + __flush_tlb_all();
14483 return (void *)vaddr;
14485 +continue_outer_loop:
14488 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
14492 -/* To avoid virtual aliases later */
14494 + * To avoid virtual aliases later:
14496 __meminit void early_iounmap(void *addr, unsigned long size)
14498 unsigned long vaddr;
14499 @@ -466,9 +455,11 @@ __meminit void early_iounmap(void *addr,
14500 vaddr = (unsigned long)addr;
14501 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14502 pmd = level2_kernel_pgt + pmd_index(vaddr);
14504 for (i = 0; i < pmds; i++)
14505 pmd_clear(pmd + i);
14508 + __flush_tlb_all();
14512 @@ -517,18 +508,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
14513 static void __meminit
14514 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
14516 - pmd_t *pmd = pmd_offset(pud,0);
14517 + pmd_t *pmd = pmd_offset(pud, 0);
14518 spin_lock(&init_mm.page_table_lock);
14519 phys_pmd_init(pmd, address, end);
14520 spin_unlock(&init_mm.page_table_lock);
14524 -static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14526 +static void __meminit
14527 +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14529 int i = pud_index(addr);
14531 - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
14532 + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
14533 unsigned long pmd_phys;
14534 pud_t *pud = pud_page + pud_index(addr);
14536 @@ -550,8 +542,8 @@ static void __meminit phys_pud_init(pud_
14538 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
14542 + __flush_tlb_all();
14545 void __init xen_init_pt(void)
14547 @@ -632,6 +624,7 @@ void __init xen_init_pt(void)
14548 static void __init extend_init_mapping(unsigned long tables_space)
14550 unsigned long va = __START_KERNEL_map;
14551 + unsigned long start = start_pfn;
14552 unsigned long phys, addr, *pte_page;
14554 pte_t *pte, new_pte;
14555 @@ -682,6 +675,10 @@ static void __init extend_init_mapping(u
14560 + if (start_pfn > start)
14561 + reserve_early(start << PAGE_SHIFT,
14562 + start_pfn << PAGE_SHIFT, "INITMAP");
14565 static void __init find_early_table_space(unsigned long end)
14566 @@ -706,7 +703,7 @@ static void __init find_early_table_spac
14567 (table_start << PAGE_SHIFT) + tables);
14570 -static void xen_finish_init_mapping(void)
14571 +static void __init xen_finish_init_mapping(void)
14573 unsigned long i, start, end;
14575 @@ -742,8 +739,10 @@ static void xen_finish_init_mapping(void
14576 * Prefetch pte's for the bt_ioremap() area. It gets used before the
14577 * boot-time allocator is online, so allocate-on-demand would fail.
14579 + early_ioremap_clear();
14580 for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
14581 __set_fixmap(i, 0, __pgprot(0));
14582 + early_ioremap_reset();
14584 /* Switch to the real shared_info page, and clear the dummy page. */
14585 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
14586 @@ -764,20 +763,23 @@ static void xen_finish_init_mapping(void
14587 table_end = start_pfn;
14590 -/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
14591 - This runs before bootmem is initialized and gets pages directly from the
14592 - physical memory. To access them they are temporarily mapped. */
14594 + * Setup the direct mapping of the physical memory at PAGE_OFFSET.
14595 + * This runs before bootmem is initialized and gets pages directly from
14596 + * the physical memory. To access them they are temporarily mapped.
14598 void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
14601 unsigned long next;
14603 - Dprintk("init_memory_mapping\n");
14604 + pr_debug("init_memory_mapping\n");
14608 * Find space for the kernel direct mapping tables.
14609 - * Later we should allocate these tables in the local node of the memory
14610 - * mapped. Unfortunately this is done currently before the nodes are
14613 + * Later we should allocate these tables in the local node of the
14614 + * memory mapped. Unfortunately this is done currently before the
14615 + * nodes are discovered.
14617 if (!after_bootmem)
14618 find_early_table_space(end);
14619 @@ -786,8 +788,8 @@ void __init_refok init_memory_mapping(un
14620 end = (unsigned long)__va(end);
14622 for (; start < end; start = next) {
14623 - unsigned long pud_phys;
14624 pgd_t *pgd = pgd_offset_k(start);
14625 + unsigned long pud_phys;
14629 @@ -795,8 +797,8 @@ void __init_refok init_memory_mapping(un
14631 pud = alloc_static_page(&pud_phys);
14632 next = start + PGDIR_SIZE;
14637 phys_pud_init(pud, __pa(start), __pa(next));
14638 if (!after_bootmem) {
14639 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
14640 @@ -810,12 +812,17 @@ void __init_refok init_memory_mapping(un
14645 + if (!after_bootmem)
14646 + reserve_early(table_start << PAGE_SHIFT,
14647 + table_end << PAGE_SHIFT, "PGTABLE");
14650 #ifndef CONFIG_NUMA
14651 void __init paging_init(void)
14653 unsigned long max_zone_pfns[MAX_NR_ZONES];
14655 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
14656 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
14657 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
14658 @@ -829,40 +836,6 @@ void __init paging_init(void)
14662 -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
14663 - from the CPU leading to inconsistent cache lines. address and size
14664 - must be aligned to 2MB boundaries.
14665 - Does nothing when the mapping doesn't exist. */
14666 -void __init clear_kernel_mapping(unsigned long address, unsigned long size)
14668 - unsigned long end = address + size;
14670 - BUG_ON(address & ~LARGE_PAGE_MASK);
14671 - BUG_ON(size & ~LARGE_PAGE_MASK);
14673 - for (; address < end; address += LARGE_PAGE_SIZE) {
14674 - pgd_t *pgd = pgd_offset_k(address);
14677 - if (pgd_none(*pgd))
14679 - pud = pud_offset(pgd, address);
14680 - if (pud_none(*pud))
14682 - pmd = pmd_offset(pud, address);
14683 - if (!pmd || pmd_none(*pmd))
14685 - if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
14686 - /* Could handle this, but it should not happen currently. */
14688 - "clear_kernel_mapping: mapping has been split. will leak memory\n");
14691 - set_pmd(pmd, __pmd(0));
14693 - __flush_tlb_all();
14697 * Memory hotplug specific functions
14699 @@ -888,16 +861,12 @@ int arch_add_memory(int nid, u64 start,
14700 unsigned long nr_pages = size >> PAGE_SHIFT;
14703 - init_memory_mapping(start, (start + size -1));
14704 + init_memory_mapping(start, start + size-1);
14706 ret = __add_pages(zone, start_pfn, nr_pages);
14713 - printk("%s: Problem encountered in __add_pages!\n", __func__);
14716 EXPORT_SYMBOL_GPL(arch_add_memory);
14718 @@ -911,36 +880,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
14720 #endif /* CONFIG_MEMORY_HOTPLUG */
14722 -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
14724 - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
14725 - * just online the pages.
14727 -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
14730 - unsigned long pfn;
14731 - unsigned long total = 0, mem = 0;
14732 - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
14733 - if (pfn_valid(pfn)) {
14734 - online_page(pfn_to_page(pfn));
14741 - z->spanned_pages += total;
14742 - z->present_pages += mem;
14743 - z->zone_pgdat->node_spanned_pages += total;
14744 - z->zone_pgdat->node_present_pages += mem;
14750 -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
14752 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
14753 + kcore_modules, kcore_vsyscall;
14755 void __init mem_init(void)
14757 @@ -949,8 +890,7 @@ void __init mem_init(void)
14761 - /* clear the zero-page */
14762 - memset(empty_zero_page, 0, PAGE_SIZE);
14763 + /* clear_bss() already cleared the empty_zero_page */
14767 @@ -968,7 +908,6 @@ void __init mem_init(void)
14769 reservedpages = end_pfn - totalram_pages -
14770 absent_pages_in_range(0, end_pfn);
14774 codesize = (unsigned long) &_etext - (unsigned long) &_text;
14775 @@ -976,46 +915,64 @@ void __init mem_init(void)
14776 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
14778 /* Register memory areas for /proc/kcore */
14779 - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14780 - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14781 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14782 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14783 VMALLOC_END-VMALLOC_START);
14784 kclist_add(&kcore_kernel, &_stext, _end - _stext);
14785 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
14786 - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14787 + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14788 VSYSCALL_END - VSYSCALL_START);
14790 - printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
14791 + printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
14792 + "%ldk reserved, %ldk data, %ldk init)\n",
14793 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
14794 end_pfn << (PAGE_SHIFT-10),
14796 reservedpages << (PAGE_SHIFT-10),
14803 void free_init_pages(char *what, unsigned long begin, unsigned long end)
14805 - unsigned long addr;
14806 + unsigned long addr = begin;
14808 - if (begin >= end)
14813 + * If debugging page accesses then do not free this memory but
14814 + * mark it not present - any buggy init-section access will
14815 + * create a kernel page fault:
14817 +#ifdef CONFIG_DEBUG_PAGEALLOC
14818 + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
14819 + begin, PAGE_ALIGN(end));
14820 + set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
14822 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
14823 - for (addr = begin; addr < end; addr += PAGE_SIZE) {
14825 + for (; addr < end; addr += PAGE_SIZE) {
14826 ClearPageReserved(virt_to_page(addr));
14827 init_page_count(virt_to_page(addr));
14828 memset((void *)(addr & ~(PAGE_SIZE-1)),
14829 POISON_FREE_INITMEM, PAGE_SIZE);
14830 if (addr >= __START_KERNEL_map) {
14831 /* make_readonly() reports all kernel addresses. */
14832 - __make_page_writable(__va(__pa(addr)));
14833 - change_page_attr_addr(addr, 1, __pgprot(0));
14834 + if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
14835 + pfn_pte(__pa(addr) >> PAGE_SHIFT,
14839 + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
14845 - if (addr > __START_KERNEL_map)
14846 - global_flush_tlb();
14850 void free_initmem(void)
14851 @@ -1026,6 +983,8 @@ void free_initmem(void)
14854 #ifdef CONFIG_DEBUG_RODATA
14855 +const int rodata_test_data = 0xC3;
14856 +EXPORT_SYMBOL_GPL(rodata_test_data);
14858 void mark_rodata_ro(void)
14860 @@ -1047,18 +1006,27 @@ void mark_rodata_ro(void)
14864 - change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
14866 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
14867 (end - start) >> 10);
14868 + set_memory_ro(start, (end - start) >> PAGE_SHIFT);
14871 - * change_page_attr_addr() requires a global_flush_tlb() call after it.
14872 - * We do this after the printk so that if something went wrong in the
14873 - * change, the printk gets out at least to give a better debug hint
14874 - * of who is the culprit.
14875 + * The rodata section (but not the kernel text!) should also be
14876 + * not-executable.
14878 - global_flush_tlb();
14879 + start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
14880 + set_memory_nx(start, (end - start) >> PAGE_SHIFT);
14884 +#ifdef CONFIG_CPA_DEBUG
14885 + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
14886 + set_memory_rw(start, (end-start) >> PAGE_SHIFT);
14888 + printk(KERN_INFO "Testing CPA: again\n");
14889 + set_memory_ro(start, (end-start) >> PAGE_SHIFT);
14894 @@ -1069,17 +1037,21 @@ void free_initrd_mem(unsigned long start
14898 -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14900 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14903 int nid = phys_to_nid(phys);
14905 unsigned long pfn = phys >> PAGE_SHIFT;
14907 if (pfn >= end_pfn) {
14908 - /* This can happen with kdump kernels when accessing firmware
14911 + * This can happen with kdump kernels when accessing
14912 + * firmware tables:
14914 if (pfn < end_pfn_map)
14917 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
14920 @@ -1087,9 +1059,9 @@ void __init reserve_bootmem_generic(unsi
14922 /* Should check here against the e820 map to avoid double free */
14924 - reserve_bootmem_node(NODE_DATA(nid), phys, len);
14926 - reserve_bootmem(phys, len);
14927 + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
14929 + reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
14932 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
14933 @@ -1101,46 +1073,49 @@ void __init reserve_bootmem_generic(unsi
14937 -int kern_addr_valid(unsigned long addr)
14939 +int kern_addr_valid(unsigned long addr)
14941 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
14951 if (above != 0 && above != -1UL)
14956 pgd = pgd_offset_k(addr);
14957 if (pgd_none(*pgd))
14960 pud = pud_offset(pgd, addr);
14961 if (pud_none(*pud))
14965 pmd = pmd_offset(pud, addr);
14966 if (pmd_none(*pmd))
14969 if (pmd_large(*pmd))
14970 return pfn_valid(pmd_pfn(*pmd));
14972 pte = pte_offset_kernel(pmd, addr);
14973 if (pte_none(*pte))
14976 return pfn_valid(pte_pfn(*pte));
14979 -/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
14980 - covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14981 - not need special handling anymore. */
14984 + * A pseudo VMA to allow ptrace access for the vsyscall page. This only
14985 + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14986 + * not need special handling anymore:
14988 static struct vm_area_struct gate_vma = {
14989 - .vm_start = VSYSCALL_START,
14990 - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
14991 - .vm_page_prot = PAGE_READONLY_EXEC,
14992 - .vm_flags = VM_READ | VM_EXEC
14993 + .vm_start = VSYSCALL_START,
14994 + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
14995 + .vm_page_prot = PAGE_READONLY_EXEC,
14996 + .vm_flags = VM_READ | VM_EXEC
14999 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
15000 @@ -1155,14 +1130,17 @@ struct vm_area_struct *get_gate_vma(stru
15001 int in_gate_area(struct task_struct *task, unsigned long addr)
15003 struct vm_area_struct *vma = get_gate_vma(task);
15008 return (addr >= vma->vm_start) && (addr < vma->vm_end);
15011 -/* Use this when you have no reliable task/vma, typically from interrupt
15012 - * context. It is less reliable than using the task's vma and may give
15013 - * false positives.
15015 + * Use this when you have no reliable task/vma, typically from interrupt
15016 + * context. It is less reliable than using the task's vma and may give
15017 + * false positives:
15019 int in_gate_area_no_task(unsigned long addr)
15021 @@ -1182,8 +1160,8 @@ const char *arch_vma_name(struct vm_area
15023 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
15025 -int __meminit vmemmap_populate(struct page *start_page,
15026 - unsigned long size, int node)
15028 +vmemmap_populate(struct page *start_page, unsigned long size, int node)
15030 unsigned long addr = (unsigned long)start_page;
15031 unsigned long end = (unsigned long)(start_page + size);
15032 @@ -1198,6 +1176,7 @@ int __meminit vmemmap_populate(struct pa
15033 pgd = vmemmap_pgd_populate(addr, node);
15037 pud = vmemmap_pud_populate(pgd, addr, node);
15040 @@ -1205,20 +1184,22 @@ int __meminit vmemmap_populate(struct pa
15041 pmd = pmd_offset(pud, addr);
15042 if (pmd_none(*pmd)) {
15044 - void *p = vmemmap_alloc_block(PMD_SIZE, node);
15047 + p = vmemmap_alloc_block(PMD_SIZE, node);
15051 - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
15052 - mk_pte_huge(entry);
15053 - set_pmd(pmd, __pmd(pte_val(entry)));
15054 + entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
15055 + PAGE_KERNEL_LARGE);
15056 + set_pmd(pmd, __pmd_ma(__pte_val(entry)));
15058 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
15059 addr, addr + PMD_SIZE - 1, p, node);
15062 vmemmap_verify((pte_t *)pmd, node, addr, next);
15069 --- a/arch/x86/mm/ioremap_32-xen.c
15073 - * arch/i386/mm/ioremap.c
15075 - * Re-map IO memory to kernel address space so that we can access it.
15076 - * This is needed for high PCI addresses that aren't mapped in the
15077 - * 640k-1MB IO memory area on PC's
15079 - * (C) Copyright 1995 1996 Linus Torvalds
15082 -#include <linux/vmalloc.h>
15083 -#include <linux/init.h>
15084 -#include <linux/slab.h>
15085 -#include <linux/module.h>
15086 -#include <linux/io.h>
15087 -#include <linux/sched.h>
15088 -#include <asm/fixmap.h>
15089 -#include <asm/cacheflush.h>
15090 -#include <asm/tlbflush.h>
15091 -#include <asm/pgtable.h>
15092 -#include <asm/pgalloc.h>
15094 -#define ISA_START_ADDRESS 0x0
15095 -#define ISA_END_ADDRESS 0x100000
15097 -static int direct_remap_area_pte_fn(pte_t *pte,
15098 - struct page *pmd_page,
15099 - unsigned long address,
15102 - mmu_update_t **v = (mmu_update_t **)data;
15104 - BUG_ON(!pte_none(*pte));
15106 - (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15107 - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15113 -static int __direct_remap_pfn_range(struct mm_struct *mm,
15114 - unsigned long address,
15115 - unsigned long mfn,
15116 - unsigned long size,
15121 - unsigned long i, start_address;
15122 - mmu_update_t *u, *v, *w;
15124 - u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15128 - start_address = address;
15130 - flush_cache_all();
15132 - for (i = 0; i < size; i += PAGE_SIZE) {
15133 - if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15134 - /* Flush a full batch after filling in the PTE ptrs. */
15135 - rc = apply_to_page_range(mm, start_address,
15136 - address - start_address,
15137 - direct_remap_area_pte_fn, &w);
15141 - if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15144 - start_address = address;
15148 - * Fill in the machine address: PTE ptr is done later by
15149 - * apply_to_page_range().
15151 - v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15154 - address += PAGE_SIZE;
15159 - /* Final batch. */
15160 - rc = apply_to_page_range(mm, start_address,
15161 - address - start_address,
15162 - direct_remap_area_pte_fn, &w);
15166 - if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15175 - free_page((unsigned long)u);
15180 -int direct_remap_pfn_range(struct vm_area_struct *vma,
15181 - unsigned long address,
15182 - unsigned long mfn,
15183 - unsigned long size,
15187 - if (xen_feature(XENFEAT_auto_translated_physmap))
15188 - return remap_pfn_range(vma, address, mfn, size, prot);
15190 - if (domid == DOMID_SELF)
15193 - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15195 - vma->vm_mm->context.has_foreign_mappings = 1;
15197 - return __direct_remap_pfn_range(
15198 - vma->vm_mm, address, mfn, size, prot, domid);
15200 -EXPORT_SYMBOL(direct_remap_pfn_range);
15202 -int direct_kernel_remap_pfn_range(unsigned long address,
15203 - unsigned long mfn,
15204 - unsigned long size,
15208 - return __direct_remap_pfn_range(
15209 - &init_mm, address, mfn, size, prot, domid);
15211 -EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15213 -static int lookup_pte_fn(
15214 - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15216 - uint64_t *ptep = (uint64_t *)data;
15218 - *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15219 - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15223 -int create_lookup_pte_addr(struct mm_struct *mm,
15224 - unsigned long address,
15227 - return apply_to_page_range(mm, address, PAGE_SIZE,
15228 - lookup_pte_fn, ptep);
15231 -EXPORT_SYMBOL(create_lookup_pte_addr);
15233 -static int noop_fn(
15234 - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15239 -int touch_pte_range(struct mm_struct *mm,
15240 - unsigned long address,
15241 - unsigned long size)
15243 - return apply_to_page_range(mm, address, size, noop_fn, NULL);
15246 -EXPORT_SYMBOL(touch_pte_range);
15249 - * Does @address reside within a non-highmem page that is local to this virtual
15250 - * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
15251 - * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
15252 - * why this works.
15254 -static inline int is_local_lowmem(unsigned long address)
15256 - extern unsigned long max_low_pfn;
15257 - return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
15261 - * Generic mapping function (not visible outside):
15265 - * Remap an arbitrary physical address space into the kernel virtual
15266 - * address space. Needed when the kernel wants to access high addresses
15269 - * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15270 - * have to convert them into an offset in a page-aligned mapping, but the
15271 - * caller shouldn't need to know that small detail.
15273 -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
15275 - void __iomem * addr;
15276 - struct vm_struct * area;
15277 - unsigned long offset, last_addr;
15279 - domid_t domid = DOMID_IO;
15281 - /* Don't allow wraparound or zero size */
15282 - last_addr = phys_addr + size - 1;
15283 - if (!size || last_addr < phys_addr)
15287 - * Don't remap the low PCI/ISA area, it's always mapped..
15289 - if (is_initial_xendomain() &&
15290 - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
15291 - return (void __iomem *) isa_bus_to_virt(phys_addr);
15294 - * Don't allow anybody to remap normal RAM that we're using..
15296 - if (is_local_lowmem(phys_addr)) {
15297 - char *t_addr, *t_end;
15298 - struct page *page;
15300 - t_addr = bus_to_virt(phys_addr);
15301 - t_end = t_addr + (size - 1);
15303 - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
15304 - if(!PageReserved(page))
15307 - domid = DOMID_SELF;
15310 - prot = __pgprot(_KERNPG_TABLE | flags);
15313 - * Mappings have to be page-aligned
15315 - offset = phys_addr & ~PAGE_MASK;
15316 - phys_addr &= PAGE_MASK;
15317 - size = PAGE_ALIGN(last_addr+1) - phys_addr;
15320 - * Ok, go for it..
15322 - area = get_vm_area(size, VM_IOREMAP | (flags << 20));
15325 - area->phys_addr = phys_addr;
15326 - addr = (void __iomem *) area->addr;
15327 - if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
15328 - phys_addr>>PAGE_SHIFT,
15329 - size, prot, domid)) {
15330 - vunmap((void __force *) addr);
15333 - return (void __iomem *) (offset + (char __iomem *)addr);
15335 -EXPORT_SYMBOL(__ioremap);
15338 - * ioremap_nocache - map bus memory into CPU space
15339 - * @offset: bus address of the memory
15340 - * @size: size of the resource to map
15342 - * ioremap_nocache performs a platform specific sequence of operations to
15343 - * make bus memory CPU accessible via the readb/readw/readl/writeb/
15344 - * writew/writel functions and the other mmio helpers. The returned
15345 - * address is not guaranteed to be usable directly as a virtual
15348 - * This version of ioremap ensures that the memory is marked uncachable
15349 - * on the CPU as well as honouring existing caching rules from things like
15350 - * the PCI bus. Note that there are other caches and buffers on many
15351 - * busses. In particular driver authors should read up on PCI writes
15353 - * It's useful if some control registers are in such an area and
15354 - * write combining or read caching is not desirable:
15356 - * Must be freed with iounmap.
15359 -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
15361 - unsigned long last_addr;
15362 - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
15366 - /* Guaranteed to be > phys_addr, as per __ioremap() */
15367 - last_addr = phys_addr + size - 1;
15369 - if (is_local_lowmem(last_addr)) {
15370 - struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
15371 - unsigned long npages;
15373 - phys_addr &= PAGE_MASK;
15375 - /* This might overflow and become zero.. */
15376 - last_addr = PAGE_ALIGN(last_addr);
15378 - /* .. but that's ok, because modulo-2**n arithmetic will make
15379 - * the page-aligned "last - first" come out right.
15381 - npages = (last_addr - phys_addr) >> PAGE_SHIFT;
15383 - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
15387 - global_flush_tlb();
15392 -EXPORT_SYMBOL(ioremap_nocache);
15395 - * iounmap - Free a IO remapping
15396 - * @addr: virtual address from ioremap_*
15398 - * Caller must ensure there is only one unmapping for the same pointer.
15400 -void iounmap(volatile void __iomem *addr)
15402 - struct vm_struct *p, *o;
15404 - if ((void __force *)addr <= high_memory)
15408 - * __ioremap special-cases the PCI/ISA range by not instantiating a
15409 - * vm_area and by simply returning an address into the kernel mapping
15410 - * of ISA space. So handle that here.
15412 - if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15415 - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
15417 - /* Use the vm area unlocked, assuming the caller
15418 - ensures there isn't another iounmap for the same address
15419 - in parallel. Reuse of the virtual address is prevented by
15420 - leaving it in the global lists until we're done with it.
15421 - cpa takes care of the direct mappings. */
15422 - read_lock(&vmlist_lock);
15423 - for (p = vmlist; p; p = p->next) {
15424 - if (p->addr == addr)
15427 - read_unlock(&vmlist_lock);
15430 - printk("iounmap: bad address %p\n", addr);
15435 - /* Reset the direct mapping. Can block */
15436 - if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
15437 - change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
15438 - get_vm_area_size(p) >> PAGE_SHIFT,
15440 - global_flush_tlb();
15443 - /* Finally remove it */
15444 - o = remove_vm_area((void *)addr);
15445 - BUG_ON(p != o || o == NULL);
15448 -EXPORT_SYMBOL(iounmap);
15450 -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
15452 - unsigned long offset, last_addr;
15453 - unsigned int nrpages;
15454 - enum fixed_addresses idx;
15456 - /* Don't allow wraparound or zero size */
15457 - last_addr = phys_addr + size - 1;
15458 - if (!size || last_addr < phys_addr)
15462 - * Don't remap the low PCI/ISA area, it's always mapped..
15464 - if (is_initial_xendomain() &&
15465 - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
15466 - return isa_bus_to_virt(phys_addr);
15469 - * Mappings have to be page-aligned
15471 - offset = phys_addr & ~PAGE_MASK;
15472 - phys_addr &= PAGE_MASK;
15473 - size = PAGE_ALIGN(last_addr) - phys_addr;
15476 - * Mappings have to fit in the FIX_BTMAP area.
15478 - nrpages = size >> PAGE_SHIFT;
15479 - if (nrpages > NR_FIX_BTMAPS)
15483 - * Ok, go for it..
15485 - idx = FIX_BTMAP_BEGIN;
15486 - while (nrpages > 0) {
15487 - set_fixmap(idx, phys_addr);
15488 - phys_addr += PAGE_SIZE;
15492 - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
15495 -void __init bt_iounmap(void *addr, unsigned long size)
15497 - unsigned long virt_addr;
15498 - unsigned long offset;
15499 - unsigned int nrpages;
15500 - enum fixed_addresses idx;
15502 - virt_addr = (unsigned long)addr;
15503 - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
15505 - if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15507 - offset = virt_addr & ~PAGE_MASK;
15508 - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
15510 - idx = FIX_BTMAP_BEGIN;
15511 - while (nrpages > 0) {
15512 - clear_fixmap(idx);
15518 +++ b/arch/x86/mm/ioremap-xen.c
15521 + * Re-map IO memory to kernel address space so that we can access it.
15522 + * This is needed for high PCI addresses that aren't mapped in the
15523 + * 640k-1MB IO memory area on PC's
15525 + * (C) Copyright 1995 1996 Linus Torvalds
15528 +#include <linux/bootmem.h>
15529 +#include <linux/init.h>
15530 +#include <linux/io.h>
15531 +#include <linux/module.h>
15532 +#include <linux/pfn.h>
15533 +#include <linux/slab.h>
15534 +#include <linux/vmalloc.h>
15536 +#include <asm/cacheflush.h>
15537 +#include <asm/e820.h>
15538 +#include <asm/fixmap.h>
15539 +#include <asm/pgtable.h>
15540 +#include <asm/tlbflush.h>
15541 +#include <asm/pgalloc.h>
15543 +enum ioremap_mode {
15544 + IOR_MODE_UNCACHED,
15548 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
15550 +unsigned long __phys_addr(unsigned long x)
15552 + if (x >= __START_KERNEL_map)
15553 + return x - __START_KERNEL_map + phys_base;
15554 + return x - PAGE_OFFSET;
15556 +EXPORT_SYMBOL(__phys_addr);
15560 +static int direct_remap_area_pte_fn(pte_t *pte,
15561 + struct page *pmd_page,
15562 + unsigned long address,
15565 + mmu_update_t **v = (mmu_update_t **)data;
15567 + BUG_ON(!pte_none(*pte));
15569 + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15570 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15576 +static int __direct_remap_pfn_range(struct mm_struct *mm,
15577 + unsigned long address,
15578 + unsigned long mfn,
15579 + unsigned long size,
15584 + unsigned long i, start_address;
15585 + mmu_update_t *u, *v, *w;
15587 + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15591 + start_address = address;
15593 + flush_cache_all();
15595 + for (i = 0; i < size; i += PAGE_SIZE) {
15596 + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15597 + /* Flush a full batch after filling in the PTE ptrs. */
15598 + rc = apply_to_page_range(mm, start_address,
15599 + address - start_address,
15600 + direct_remap_area_pte_fn, &w);
15604 + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15607 + start_address = address;
15611 + * Fill in the machine address: PTE ptr is done later by
15612 + * apply_to_page_range().
15614 + v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15617 + address += PAGE_SIZE;
15622 + /* Final batch. */
15623 + rc = apply_to_page_range(mm, start_address,
15624 + address - start_address,
15625 + direct_remap_area_pte_fn, &w);
15629 + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15638 + free_page((unsigned long)u);
15643 +int direct_remap_pfn_range(struct vm_area_struct *vma,
15644 + unsigned long address,
15645 + unsigned long mfn,
15646 + unsigned long size,
15650 + if (xen_feature(XENFEAT_auto_translated_physmap))
15651 + return remap_pfn_range(vma, address, mfn, size, prot);
15653 + if (domid == DOMID_SELF)
15656 + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15658 + vma->vm_mm->context.has_foreign_mappings = 1;
15660 + return __direct_remap_pfn_range(
15661 + vma->vm_mm, address, mfn, size, prot, domid);
15663 +EXPORT_SYMBOL(direct_remap_pfn_range);
15665 +int direct_kernel_remap_pfn_range(unsigned long address,
15666 + unsigned long mfn,
15667 + unsigned long size,
15671 + return __direct_remap_pfn_range(
15672 + &init_mm, address, mfn, size, prot, domid);
15674 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15676 +static int lookup_pte_fn(
15677 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15679 + uint64_t *ptep = (uint64_t *)data;
15681 + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15682 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15686 +int create_lookup_pte_addr(struct mm_struct *mm,
15687 + unsigned long address,
15690 + return apply_to_page_range(mm, address, PAGE_SIZE,
15691 + lookup_pte_fn, ptep);
15694 +EXPORT_SYMBOL(create_lookup_pte_addr);
15696 +static int noop_fn(
15697 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15702 +int touch_pte_range(struct mm_struct *mm,
15703 + unsigned long address,
15704 + unsigned long size)
15706 + return apply_to_page_range(mm, address, size, noop_fn, NULL);
15709 +EXPORT_SYMBOL(touch_pte_range);
15711 +#ifdef CONFIG_X86_32
15712 +int page_is_ram(unsigned long pagenr)
15714 + unsigned long addr, end;
15717 +#ifndef CONFIG_XEN
15719 + * A special case is the first 4Kb of memory;
15720 + * This is a BIOS owned area, not kernel ram, but generally
15721 + * not listed as such in the E820 table.
15727 + * Second special case: Some BIOSen report the PC BIOS
15728 + * area (640->1Mb) as ram even though it is not.
15730 + if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
15731 + pagenr < (BIOS_END >> PAGE_SHIFT))
15735 + for (i = 0; i < e820.nr_map; i++) {
15737 + * Not usable memory:
15739 + if (e820.map[i].type != E820_RAM)
15741 + addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
15742 + end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
15745 + if ((pagenr >= addr) && (pagenr < end))
15753 + * Fix up the linear direct mapping of the kernel to avoid cache attribute
15756 +static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
15757 + enum ioremap_mode mode)
15759 + unsigned long nrpages = size >> PAGE_SHIFT;
15763 + case IOR_MODE_UNCACHED:
15765 + err = set_memory_uc(vaddr, nrpages);
15767 + case IOR_MODE_CACHED:
15768 + err = set_memory_wb(vaddr, nrpages);
15776 + * Remap an arbitrary physical address space into the kernel virtual
15777 + * address space. Needed when the kernel wants to access high addresses
15780 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15781 + * have to convert them into an offset in a page-aligned mapping, but the
15782 + * caller shouldn't need to know that small detail.
15784 +static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
15785 + enum ioremap_mode mode)
15787 + unsigned long mfn, offset, last_addr, vaddr;
15788 + struct vm_struct *area;
15790 + domid_t domid = DOMID_IO;
15792 + /* Don't allow wraparound or zero size */
15793 + last_addr = phys_addr + size - 1;
15794 + if (!size || last_addr < phys_addr)
15798 + * Don't remap the low PCI/ISA area, it's always mapped..
15800 + if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
15801 + return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
15804 + * Don't allow anybody to remap normal RAM that we're using..
15806 + for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
15807 + unsigned long pfn = mfn_to_local_pfn(mfn);
15809 + if (pfn >= max_pfn)
15812 + domid = DOMID_SELF;
15814 + if (pfn >= max_pfn_mapped) /* bogus */
15817 + if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
15822 + case IOR_MODE_UNCACHED:
15825 + * FIXME: we will use UC MINUS for now, as video fb drivers
15826 + * depend on it. Upcoming ioremap_wc() will fix this behavior.
15828 + prot = PAGE_KERNEL_UC_MINUS;
15830 + case IOR_MODE_CACHED:
15831 + prot = PAGE_KERNEL;
15836 + * Mappings have to be page-aligned
15838 + offset = phys_addr & ~PAGE_MASK;
15839 + phys_addr &= PAGE_MASK;
15840 + size = PAGE_ALIGN(last_addr+1) - phys_addr;
15843 + * Ok, go for it..
15845 + area = get_vm_area(size, VM_IOREMAP | (mode << 20));
15848 + area->phys_addr = phys_addr;
15849 + vaddr = (unsigned long) area->addr;
15850 + if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
15851 + size, prot, domid)) {
15852 + free_vm_area(area);
15856 + if (ioremap_change_attr(vaddr, size, mode) < 0) {
15857 + iounmap((void __iomem *) vaddr);
15861 + return (void __iomem *) (vaddr + offset);
15865 + * ioremap_nocache - map bus memory into CPU space
15866 + * @offset: bus address of the memory
15867 + * @size: size of the resource to map
15869 + * ioremap_nocache performs a platform specific sequence of operations to
15870 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
15871 + * writew/writel functions and the other mmio helpers. The returned
15872 + * address is not guaranteed to be usable directly as a virtual
15875 + * This version of ioremap ensures that the memory is marked uncacheable
15876 + * on the CPU as well as honouring existing caching rules from things like
15877 + * the PCI bus. Note that there are other caches and buffers on many
15878 + * busses. In particular driver authors should read up on PCI writes
15880 + * It's useful if some control registers are in such an area and
15881 + * write combining or read caching is not desirable:
15883 + * Must be freed with iounmap.
15885 +void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
15887 + return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
15889 +EXPORT_SYMBOL(ioremap_nocache);
15891 +void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
15893 + return __ioremap(phys_addr, size, IOR_MODE_CACHED);
15895 +EXPORT_SYMBOL(ioremap_cache);
15898 + * iounmap - Free an IO remapping
15899 + * @addr: virtual address from ioremap_*
15901 + * Caller must ensure there is only one unmapping for the same pointer.
15903 +void iounmap(volatile void __iomem *addr)
15905 + struct vm_struct *p, *o;
15907 + if ((void __force *)addr <= high_memory)
15911 + * __ioremap special-cases the PCI/ISA range by not instantiating a
15912 + * vm_area and by simply returning an address into the kernel mapping
15913 + * of ISA space. So handle that here.
15915 + if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15918 + addr = (volatile void __iomem *)
15919 + (PAGE_MASK & (unsigned long __force)addr);
15921 + /* Use the vm area unlocked, assuming the caller
15922 + ensures there isn't another iounmap for the same address
15923 + in parallel. Reuse of the virtual address is prevented by
15924 + leaving it in the global lists until we're done with it.
15925 + cpa takes care of the direct mappings. */
15926 + read_lock(&vmlist_lock);
15927 + for (p = vmlist; p; p = p->next) {
15928 + if (p->addr == addr)
15931 + read_unlock(&vmlist_lock);
15934 + printk(KERN_ERR "iounmap: bad address %p\n", addr);
15939 + if ((p->flags >> 20) != IOR_MODE_CACHED) {
15940 + unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
15941 + unsigned long mfn = p->phys_addr;
15942 + unsigned long va = (unsigned long)addr;
15944 + for (; n > 0; n--, mfn++, va += PAGE_SIZE)
15945 + if (mfn_to_local_pfn(mfn) < max_pfn)
15946 + set_memory_wb(va, 1);
15949 + /* Finally remove it */
15950 + o = remove_vm_area((void *)addr);
15951 + BUG_ON(p != o || o == NULL);
15954 +EXPORT_SYMBOL(iounmap);
15956 +int __initdata early_ioremap_debug;
15958 +static int __init early_ioremap_debug_setup(char *str)
15960 + early_ioremap_debug = 1;
15964 +early_param("early_ioremap_debug", early_ioremap_debug_setup);
15966 +static __initdata int after_paging_init;
15967 +static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
15968 + __attribute__((aligned(PAGE_SIZE)));
15970 +#ifdef CONFIG_X86_32
15971 +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
15973 + /* Don't assume we're using swapper_pg_dir at this point */
15974 + pgd_t *base = __va(read_cr3());
15975 + pgd_t *pgd = &base[pgd_index(addr)];
15976 + pud_t *pud = pud_offset(pgd, addr);
15977 + pmd_t *pmd = pmd_offset(pud, addr);
15982 +#define early_ioremap_pmd early_get_pmd
15983 +#define make_lowmem_page_readonly early_make_page_readonly
15984 +#define make_lowmem_page_writable make_page_writable
15987 +static inline pte_t * __init early_ioremap_pte(unsigned long addr)
15989 + return &bm_pte[pte_index(addr)];
15992 +void __init early_ioremap_init(void)
15996 + if (early_ioremap_debug)
15997 + printk(KERN_INFO "early_ioremap_init()\n");
15999 + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
16000 + memset(bm_pte, 0, sizeof(bm_pte));
16001 + make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
16002 + pmd_populate_kernel(&init_mm, pmd, bm_pte);
16005 + * The boot-ioremap range spans multiple pmds, for which
16006 + * we are not prepared:
16008 + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
16010 + printk(KERN_WARNING "pmd %p != %p\n",
16011 + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
16012 + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
16013 + fix_to_virt(FIX_BTMAP_BEGIN));
16014 + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
16015 + fix_to_virt(FIX_BTMAP_END));
16017 + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
16018 + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
16019 + FIX_BTMAP_BEGIN);
16023 +void __init early_ioremap_clear(void)
16027 + if (early_ioremap_debug)
16028 + printk(KERN_INFO "early_ioremap_clear()\n");
16030 + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
16032 + make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
16033 + /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
16034 + __flush_tlb_all();
16037 +void __init early_ioremap_reset(void)
16039 + enum fixed_addresses idx;
16040 + unsigned long addr, phys;
16043 + after_paging_init = 1;
16044 + for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
16045 + addr = fix_to_virt(idx);
16046 + pte = early_ioremap_pte(addr);
16047 + if (pte_present(*pte)) {
16048 + phys = __pte_val(*pte) & PAGE_MASK;
16049 + set_fixmap(idx, phys);
16054 +static void __init __early_set_fixmap(enum fixed_addresses idx,
16055 + unsigned long phys, pgprot_t flags)
16057 + unsigned long addr = __fix_to_virt(idx);
16060 + if (idx >= __end_of_fixed_addresses) {
16064 + pte = early_ioremap_pte(addr);
16065 + if (pgprot_val(flags))
16066 + set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
16068 + pte_clear(NULL, addr, pte);
16069 + __flush_tlb_one(addr);
16072 +static inline void __init early_set_fixmap(enum fixed_addresses idx,
16073 + unsigned long phys)
16075 + if (after_paging_init)
16076 + set_fixmap(idx, phys);
16078 + __early_set_fixmap(idx, phys, PAGE_KERNEL);
16081 +static inline void __init early_clear_fixmap(enum fixed_addresses idx)
16083 + if (after_paging_init)
16084 + clear_fixmap(idx);
16086 + __early_set_fixmap(idx, 0, __pgprot(0));
16090 +int __initdata early_ioremap_nested;
16092 +static int __init check_early_ioremap_leak(void)
16094 + if (!early_ioremap_nested)
16097 + printk(KERN_WARNING
16098 + "Debug warning: early ioremap leak of %d areas detected.\n",
16099 + early_ioremap_nested);
16100 + printk(KERN_WARNING
16101 + "please boot with early_ioremap_debug and report the dmesg.\n");
16106 +late_initcall(check_early_ioremap_leak);
16108 +void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
16110 + unsigned long offset, last_addr;
16111 + unsigned int nrpages, nesting;
16112 + enum fixed_addresses idx0, idx;
16114 + WARN_ON(system_state != SYSTEM_BOOTING);
16116 + nesting = early_ioremap_nested;
16117 + if (early_ioremap_debug) {
16118 + printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
16119 + phys_addr, size, nesting);
16123 + /* Don't allow wraparound or zero size */
16124 + last_addr = phys_addr + size - 1;
16125 + if (!size || last_addr < phys_addr) {
16130 + if (nesting >= FIX_BTMAPS_NESTING) {
16134 + early_ioremap_nested++;
16136 + * Mappings have to be page-aligned
16138 + offset = phys_addr & ~PAGE_MASK;
16139 + phys_addr &= PAGE_MASK;
16140 + size = PAGE_ALIGN(last_addr) - phys_addr;
16143 + * Mappings have to fit in the FIX_BTMAP area.
16145 + nrpages = size >> PAGE_SHIFT;
16146 + if (nrpages > NR_FIX_BTMAPS) {
16152 + * Ok, go for it..
16154 + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
16156 + while (nrpages > 0) {
16157 + early_set_fixmap(idx, phys_addr);
16158 + phys_addr += PAGE_SIZE;
16162 + if (early_ioremap_debug)
16163 + printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
16165 + return (void *) (offset + fix_to_virt(idx0));
16168 +void __init early_iounmap(void *addr, unsigned long size)
16170 + unsigned long virt_addr;
16171 + unsigned long offset;
16172 + unsigned int nrpages;
16173 + enum fixed_addresses idx;
16174 + unsigned int nesting;
16176 + nesting = --early_ioremap_nested;
16177 + WARN_ON(nesting < 0);
16179 + if (early_ioremap_debug) {
16180 + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
16185 + virt_addr = (unsigned long)addr;
16186 + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
16190 + offset = virt_addr & ~PAGE_MASK;
16191 + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
16193 + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
16194 + while (nrpages > 0) {
16195 + early_clear_fixmap(idx);
16201 +void __this_fixmap_does_not_exist(void)
16205 --- a/arch/x86/mm/pageattr_64-xen.c
16209 - * Copyright 2002 Andi Kleen, SuSE Labs.
16210 - * Thanks to Ben LaHaise for precious feedback.
16213 -#include <linux/mm.h>
16214 -#include <linux/sched.h>
16215 -#include <linux/highmem.h>
16216 -#include <linux/module.h>
16217 -#include <linux/slab.h>
16218 -#include <asm/uaccess.h>
16219 -#include <asm/processor.h>
16220 -#include <asm/tlbflush.h>
16221 -#include <asm/io.h>
16224 -#include <asm/pgalloc.h>
16225 -#include <asm/mmu_context.h>
16227 -static void _pin_lock(struct mm_struct *mm, int lock) {
16229 - spin_lock(&mm->page_table_lock);
16230 -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
16231 - /* While mm->page_table_lock protects us against insertions and
16232 - * removals of higher level page table pages, it doesn't protect
16233 - * against updates of pte-s. Such updates, however, require the
16234 - * pte pages to be in consistent state (unpinned+writable or
16235 - * pinned+readonly). The pinning and attribute changes, however
16236 - * cannot be done atomically, which is why such updates must be
16237 - * prevented from happening concurrently.
16238 - * Note that no pte lock can ever elsewhere be acquired nesting
16239 - * with an already acquired one in the same mm, or with the mm's
16240 - * page_table_lock already acquired, as that would break in the
16241 - * non-split case (where all these are actually resolving to the
16242 - * one page_table_lock). Thus acquiring all of them here is not
16243 - * going to result in dead locks, and the order of acquires
16244 - * doesn't matter.
16247 - pgd_t *pgd = mm->pgd;
16250 - for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16254 - if (pgd_none(*pgd))
16256 - pud = pud_offset(pgd, 0);
16257 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16261 - if (pud_none(*pud))
16263 - pmd = pmd_offset(pud, 0);
16264 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16267 - if (pmd_none(*pmd))
16269 - ptl = pte_lockptr(0, pmd);
16273 - spin_unlock(ptl);
16280 - spin_unlock(&mm->page_table_lock);
16282 -#define pin_lock(mm) _pin_lock(mm, 1)
16283 -#define pin_unlock(mm) _pin_lock(mm, 0)
16285 -#define PIN_BATCH 8
16286 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
16288 -static inline unsigned int pgd_walk_set_prot(void *pt, pgprot_t flags,
16289 - unsigned int cpu, unsigned int seq)
16291 - struct page *page = virt_to_page(pt);
16292 - unsigned long pfn = page_to_pfn(page);
16294 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16295 - (unsigned long)__va(pfn << PAGE_SHIFT),
16296 - pfn_pte(pfn, flags), 0);
16297 - if (unlikely(++seq == PIN_BATCH)) {
16298 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16299 - PIN_BATCH, NULL)))
16307 -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
16309 - pgd_t *pgd = pgd_base;
16314 - unsigned int cpu, seq;
16315 - multicall_entry_t *mcl;
16320 - * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
16321 - * be the 'current' task's pagetables (e.g., current may be 32-bit,
16322 - * but the pagetables may be for a 64-bit task).
16323 - * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
16324 - * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
16326 - for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16327 - if (pgd_none(*pgd))
16329 - pud = pud_offset(pgd, 0);
16330 - if (PTRS_PER_PUD > 1) /* not folded */
16331 - seq = pgd_walk_set_prot(pud,flags,cpu,seq);
16332 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16333 - if (pud_none(*pud))
16335 - pmd = pmd_offset(pud, 0);
16336 - if (PTRS_PER_PMD > 1) /* not folded */
16337 - seq = pgd_walk_set_prot(pmd,flags,cpu,seq);
16338 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16339 - if (pmd_none(*pmd))
16341 - pte = pte_offset_kernel(pmd,0);
16342 - seq = pgd_walk_set_prot(pte,flags,cpu,seq);
16347 - mcl = per_cpu(pb_mcl, cpu);
16348 - if (unlikely(seq > PIN_BATCH - 2)) {
16349 - if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
16353 - MULTI_update_va_mapping(mcl + seq,
16354 - (unsigned long)__user_pgd(pgd_base),
16355 - pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
16357 - MULTI_update_va_mapping(mcl + seq + 1,
16358 - (unsigned long)pgd_base,
16359 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16361 - if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
16367 -static void __pgd_pin(pgd_t *pgd)
16369 - pgd_walk(pgd, PAGE_KERNEL_RO);
16370 - xen_pgd_pin(__pa(pgd)); /* kernel */
16371 - xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
16372 - SetPagePinned(virt_to_page(pgd));
16375 -static void __pgd_unpin(pgd_t *pgd)
16377 - xen_pgd_unpin(__pa(pgd));
16378 - xen_pgd_unpin(__pa(__user_pgd(pgd)));
16379 - pgd_walk(pgd, PAGE_KERNEL);
16380 - ClearPagePinned(virt_to_page(pgd));
16383 -void pgd_test_and_unpin(pgd_t *pgd)
16385 - if (PagePinned(virt_to_page(pgd)))
16386 - __pgd_unpin(pgd);
16389 -void mm_pin(struct mm_struct *mm)
16391 - if (xen_feature(XENFEAT_writable_page_tables))
16395 - __pgd_pin(mm->pgd);
16399 -void mm_unpin(struct mm_struct *mm)
16401 - if (xen_feature(XENFEAT_writable_page_tables))
16405 - __pgd_unpin(mm->pgd);
16409 -void mm_pin_all(void)
16411 - struct page *page;
16412 - unsigned long flags;
16414 - if (xen_feature(XENFEAT_writable_page_tables))
16418 - * Allow uninterrupted access to the pgd_list. Also protects
16419 - * __pgd_pin() by disabling preemption.
16420 - * All other CPUs must be at a safe point (e.g., in stop_machine
16421 - * or offlined entirely).
16423 - spin_lock_irqsave(&pgd_lock, flags);
16424 - list_for_each_entry(page, &pgd_list, lru) {
16425 - if (!PagePinned(page))
16426 - __pgd_pin((pgd_t *)page_address(page));
16428 - spin_unlock_irqrestore(&pgd_lock, flags);
16431 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
16433 - if (!PagePinned(virt_to_page(mm->pgd)))
16437 -void arch_exit_mmap(struct mm_struct *mm)
16439 - struct task_struct *tsk = current;
16444 - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
16445 - * *much* faster this way, as no tlb flushes means bigger wrpt batches.
16447 - if (tsk->active_mm == mm) {
16448 - tsk->active_mm = &init_mm;
16449 - atomic_inc(&init_mm.mm_count);
16451 - switch_mm(mm, &init_mm, tsk);
16453 - atomic_dec(&mm->mm_count);
16454 - BUG_ON(atomic_read(&mm->mm_count) == 0);
16457 - task_unlock(tsk);
16459 - if (PagePinned(virt_to_page(mm->pgd))
16460 - && (atomic_read(&mm->mm_count) == 1)
16461 - && !mm->context.has_foreign_mappings)
16465 -static void _pte_free(struct page *page, unsigned int order)
16471 -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
16473 - struct page *pte;
16475 - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
16477 - SetPageForeign(pte, _pte_free);
16478 - init_page_count(pte);
16483 -void pte_free(struct page *pte)
16485 - unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
16487 - if (!pte_write(*virt_to_ptep(va)))
16488 - if (HYPERVISOR_update_va_mapping(
16489 - va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
16492 - ClearPageForeign(pte);
16493 - init_page_count(pte);
16495 - __free_page(pte);
16497 -#endif /* CONFIG_XEN */
16499 -pte_t *lookup_address(unsigned long address)
16501 - pgd_t *pgd = pgd_offset_k(address);
16505 - if (pgd_none(*pgd))
16507 - pud = pud_offset(pgd, address);
16508 - if (!pud_present(*pud))
16510 - pmd = pmd_offset(pud, address);
16511 - if (!pmd_present(*pmd))
16513 - if (pmd_large(*pmd))
16514 - return (pte_t *)pmd;
16515 - pte = pte_offset_kernel(pmd, address);
16516 - if (pte && !pte_present(*pte))
16521 -static struct page *split_large_page(unsigned long address, pgprot_t prot,
16522 - pgprot_t ref_prot)
16525 - unsigned long addr;
16526 - struct page *base = alloc_pages(GFP_KERNEL, 0);
16531 - * page_private is used to track the number of entries in
16532 - * the page table page have non standard attributes.
16534 - SetPagePrivate(base);
16535 - page_private(base) = 0;
16537 - address = __pa(address);
16538 - addr = address & LARGE_PAGE_MASK;
16539 - pbase = (pte_t *)page_address(base);
16540 - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
16541 - pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
16542 - addr == address ? prot : ref_prot);
16547 -void clflush_cache_range(void *adr, int size)
16550 - for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
16554 -static void flush_kernel_map(void *arg)
16556 - struct list_head *l = (struct list_head *)arg;
16559 - /* When clflush is available always use it because it is
16560 - much cheaper than WBINVD. */
16561 - /* clflush is still broken. Disable for now. */
16562 - if (1 || !cpu_has_clflush)
16563 - asm volatile("wbinvd" ::: "memory");
16564 - else list_for_each_entry(pg, l, lru) {
16565 - void *adr = page_address(pg);
16566 - clflush_cache_range(adr, PAGE_SIZE);
16568 - __flush_tlb_all();
16571 -static inline void flush_map(struct list_head *l)
16573 - on_each_cpu(flush_kernel_map, l, 1, 1);
16576 -static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
16578 -static inline void save_page(struct page *fpage)
16580 - if (!test_and_set_bit(PG_arch_1, &fpage->flags))
16581 - list_add(&fpage->lru, &deferred_pages);
16585 - * No more special protections in this 2/4MB area - revert to a
16586 - * large page again.
16588 -static void revert_page(unsigned long address, pgprot_t ref_prot)
16594 - unsigned long pfn;
16596 - pgd = pgd_offset_k(address);
16597 - BUG_ON(pgd_none(*pgd));
16598 - pud = pud_offset(pgd,address);
16599 - BUG_ON(pud_none(*pud));
16600 - pmd = pmd_offset(pud, address);
16601 - BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
16602 - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
16603 - large_pte = pfn_pte(pfn, ref_prot);
16604 - large_pte = pte_mkhuge(large_pte);
16605 - set_pte((pte_t *)pmd, large_pte);
16609 -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
16610 - pgprot_t ref_prot)
16613 - struct page *kpte_page;
16614 - pgprot_t ref_prot2;
16616 - kpte = lookup_address(address);
16617 - if (!kpte) return 0;
16618 - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
16619 - BUG_ON(PageLRU(kpte_page));
16620 - BUG_ON(PageCompound(kpte_page));
16621 - if (pgprot_val(prot) != pgprot_val(ref_prot)) {
16622 - if (!pte_huge(*kpte)) {
16623 - set_pte(kpte, pfn_pte(pfn, prot));
16626 - * split_large_page will take the reference for this
16627 - * change_page_attr on the split page.
16629 - struct page *split;
16630 - ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
16631 - split = split_large_page(address, prot, ref_prot2);
16634 - pgprot_val(ref_prot2) &= ~_PAGE_NX;
16635 - set_pte(kpte, mk_pte(split, ref_prot2));
16636 - kpte_page = split;
16638 - page_private(kpte_page)++;
16639 - } else if (!pte_huge(*kpte)) {
16640 - set_pte(kpte, pfn_pte(pfn, ref_prot));
16641 - BUG_ON(page_private(kpte_page) == 0);
16642 - page_private(kpte_page)--;
16646 - /* on x86-64 the direct mapping set at boot is not using 4k pages */
16648 - * ..., but the XEN guest kernels (currently) do:
16649 - * If the pte was reserved, it means it was created at boot
16650 - * time (not via split_large_page) and in turn we must not
16651 - * replace it with a large page.
16653 -#ifndef CONFIG_XEN
16654 - BUG_ON(PageReserved(kpte_page));
16656 - if (PageReserved(kpte_page))
16660 - save_page(kpte_page);
16661 - if (page_private(kpte_page) == 0)
16662 - revert_page(address, ref_prot);
16667 - * Change the page attributes of an page in the linear mapping.
16669 - * This should be used when a page is mapped with a different caching policy
16670 - * than write-back somewhere - some CPUs do not like it when mappings with
16671 - * different caching policies exist. This changes the page attributes of the
16672 - * in kernel linear mapping too.
16674 - * The caller needs to ensure that there are no conflicting mappings elsewhere.
16675 - * This function only deals with the kernel linear map.
16677 - * Caller must call global_flush_tlb() after this.
16679 -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
16681 - int err = 0, kernel_map = 0;
16684 - if (address >= __START_KERNEL_map
16685 - && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
16686 - address = (unsigned long)__va(__pa(address));
16690 - down_write(&init_mm.mmap_sem);
16691 - for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
16692 - unsigned long pfn = __pa(address) >> PAGE_SHIFT;
16694 - if (!kernel_map || pte_present(pfn_pte(0, prot))) {
16695 - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
16699 - /* Handle kernel mapping too which aliases part of the
16701 - if (__pa(address) < KERNEL_TEXT_SIZE) {
16702 - unsigned long addr2;
16704 - addr2 = __START_KERNEL_map + __pa(address);
16705 - /* Make sure the kernel mappings stay executable */
16706 - prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
16707 - err = __change_page_attr(addr2, pfn, prot2,
16708 - PAGE_KERNEL_EXEC);
16711 - up_write(&init_mm.mmap_sem);
16715 -/* Don't call this for MMIO areas that may not have a mem_map entry */
16716 -int change_page_attr(struct page *page, int numpages, pgprot_t prot)
16718 - unsigned long addr = (unsigned long)page_address(page);
16719 - return change_page_attr_addr(addr, numpages, prot);
16722 -void global_flush_tlb(void)
16724 - struct page *pg, *next;
16725 - struct list_head l;
16728 - * Write-protect the semaphore, to exclude two contexts
16729 - * doing a list_replace_init() call in parallel and to
16730 - * exclude new additions to the deferred_pages list:
16732 - down_write(&init_mm.mmap_sem);
16733 - list_replace_init(&deferred_pages, &l);
16734 - up_write(&init_mm.mmap_sem);
16738 - list_for_each_entry_safe(pg, next, &l, lru) {
16739 - list_del(&pg->lru);
16740 - clear_bit(PG_arch_1, &pg->flags);
16741 - if (page_private(pg) != 0)
16743 - ClearPagePrivate(pg);
16748 -EXPORT_SYMBOL(change_page_attr);
16749 -EXPORT_SYMBOL(global_flush_tlb);
16751 +++ b/arch/x86/mm/pageattr-xen.c
16754 + * Copyright 2002 Andi Kleen, SuSE Labs.
16755 + * Thanks to Ben LaHaise for precious feedback.
16757 +#include <linux/highmem.h>
16758 +#include <linux/bootmem.h>
16759 +#include <linux/module.h>
16760 +#include <linux/sched.h>
16761 +#include <linux/slab.h>
16762 +#include <linux/mm.h>
16763 +#include <linux/interrupt.h>
16765 +#include <asm/e820.h>
16766 +#include <asm/processor.h>
16767 +#include <asm/tlbflush.h>
16768 +#include <asm/sections.h>
16769 +#include <asm/uaccess.h>
16770 +#include <asm/pgalloc.h>
16771 +#include <asm/proto.h>
16772 +#include <asm/mmu_context.h>
16774 +#ifndef CONFIG_X86_64
16775 +#define TASK_SIZE64 TASK_SIZE
16778 +static void _pin_lock(struct mm_struct *mm, int lock) {
16780 + spin_lock(&mm->page_table_lock);
16781 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
16782 + /* While mm->page_table_lock protects us against insertions and
16783 + * removals of higher level page table pages, it doesn't protect
16784 + * against updates of pte-s. Such updates, however, require the
16785 + * pte pages to be in consistent state (unpinned+writable or
16786 + * pinned+readonly). The pinning and attribute changes, however
16787 + * cannot be done atomically, which is why such updates must be
16788 + * prevented from happening concurrently.
16789 + * Note that no pte lock can ever elsewhere be acquired nesting
16790 + * with an already acquired one in the same mm, or with the mm's
16791 + * page_table_lock already acquired, as that would break in the
16792 + * non-split case (where all these are actually resolving to the
16793 + * one page_table_lock). Thus acquiring all of them here is not
16794 + * going to result in deadlocks, and the order of acquires
16795 + * doesn't matter.
16798 + pgd_t *pgd = mm->pgd;
16801 + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16805 + if (pgd_none(*pgd))
16807 + pud = pud_offset(pgd, 0);
16808 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16812 + if (pud_none(*pud))
16814 + pmd = pmd_offset(pud, 0);
16815 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16818 + if (pmd_none(*pmd))
16820 + ptl = pte_lockptr(0, pmd);
16824 + spin_unlock(ptl);
16831 + spin_unlock(&mm->page_table_lock);
16833 +#define pin_lock(mm) _pin_lock(mm, 1)
16834 +#define pin_unlock(mm) _pin_lock(mm, 0)
16836 +#define PIN_BATCH sizeof(void *)
16837 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
16839 +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
16840 + unsigned int cpu, unsigned int seq)
16842 + unsigned long pfn = page_to_pfn(page);
16844 + if (PageHighMem(page)) {
16845 + if (pgprot_val(flags) & _PAGE_RW)
16846 + ClearPagePinned(page);
16848 + SetPagePinned(page);
16850 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16851 + (unsigned long)__va(pfn << PAGE_SHIFT),
16852 + pfn_pte(pfn, flags), 0);
16853 + if (unlikely(++seq == PIN_BATCH)) {
16854 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16855 + PIN_BATCH, NULL)))
16864 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
16866 + pgd_t *pgd = pgd_base;
16870 + unsigned int cpu, seq;
16871 + multicall_entry_t *mcl;
16873 + if (xen_feature(XENFEAT_auto_translated_physmap))
16879 + * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
16880 + * may not be the 'current' task's pagetables (e.g., current may be
16881 + * 32-bit, but the pagetables may be for a 64-bit task).
16882 + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
16883 + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
16885 + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16886 + if (pgd_none(*pgd))
16888 + pud = pud_offset(pgd, 0);
16889 + if (PTRS_PER_PUD > 1) /* not folded */
16890 + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
16891 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16892 + if (pud_none(*pud))
16894 + pmd = pmd_offset(pud, 0);
16895 + if (PTRS_PER_PMD > 1) /* not folded */
16896 + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
16897 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16898 + if (pmd_none(*pmd))
16900 + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
16905 + mcl = per_cpu(pb_mcl, cpu);
16906 +#ifdef CONFIG_X86_64
16907 + if (unlikely(seq > PIN_BATCH - 2)) {
16908 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
16912 + MULTI_update_va_mapping(mcl + seq,
16913 + (unsigned long)__user_pgd(pgd_base),
16914 + pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
16916 + MULTI_update_va_mapping(mcl + seq + 1,
16917 + (unsigned long)pgd_base,
16918 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16920 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
16923 + if (likely(seq != 0)) {
16924 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16925 + (unsigned long)pgd_base,
16926 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16928 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16931 + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
16932 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16940 +static void __pgd_pin(pgd_t *pgd)
16942 + pgd_walk(pgd, PAGE_KERNEL_RO);
16943 + kmap_flush_unused();
16944 + xen_pgd_pin(__pa(pgd)); /* kernel */
16945 +#ifdef CONFIG_X86_64
16946 + xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
16948 + SetPagePinned(virt_to_page(pgd));
16951 +static void __pgd_unpin(pgd_t *pgd)
16953 + xen_pgd_unpin(__pa(pgd));
16954 +#ifdef CONFIG_X86_64
16955 + xen_pgd_unpin(__pa(__user_pgd(pgd)));
16957 + pgd_walk(pgd, PAGE_KERNEL);
16958 + ClearPagePinned(virt_to_page(pgd));
16961 +void pgd_test_and_unpin(pgd_t *pgd)
16963 + if (PagePinned(virt_to_page(pgd)))
16964 + __pgd_unpin(pgd);
16967 +void mm_pin(struct mm_struct *mm)
16969 + if (xen_feature(XENFEAT_writable_page_tables))
16973 + __pgd_pin(mm->pgd);
16977 +void mm_unpin(struct mm_struct *mm)
16979 + if (xen_feature(XENFEAT_writable_page_tables))
16983 + __pgd_unpin(mm->pgd);
16987 +void mm_pin_all(void)
16989 + struct page *page;
16990 + unsigned long flags;
16992 + if (xen_feature(XENFEAT_writable_page_tables))
16996 + * Allow uninterrupted access to the pgd_list. Also protects
16997 + * __pgd_pin() by disabling preemption.
16998 + * All other CPUs must be at a safe point (e.g., in stop_machine
16999 + * or offlined entirely).
17001 + spin_lock_irqsave(&pgd_lock, flags);
17002 + list_for_each_entry(page, &pgd_list, lru) {
17003 + if (!PagePinned(page))
17004 + __pgd_pin((pgd_t *)page_address(page));
17006 + spin_unlock_irqrestore(&pgd_lock, flags);
17009 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
17011 + if (!PagePinned(virt_to_page(mm->pgd)))
17015 +void arch_exit_mmap(struct mm_struct *mm)
17017 + struct task_struct *tsk = current;
17022 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
17023 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
17025 + if (tsk->active_mm == mm) {
17026 + tsk->active_mm = &init_mm;
17027 + atomic_inc(&init_mm.mm_count);
17029 + switch_mm(mm, &init_mm, tsk);
17031 + atomic_dec(&mm->mm_count);
17032 + BUG_ON(atomic_read(&mm->mm_count) == 0);
17035 + task_unlock(tsk);
17037 + if (PagePinned(virt_to_page(mm->pgd))
17038 + && atomic_read(&mm->mm_count) == 1
17039 + && !mm->context.has_foreign_mappings)
17043 +static void _pte_free(struct page *page, unsigned int order)
17046 + __pte_free(page);
17049 +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
17051 + struct page *pte;
17053 +#ifdef CONFIG_HIGHPTE
17054 + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
17056 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17059 + pgtable_page_ctor(pte);
17060 + SetPageForeign(pte, _pte_free);
17061 + init_page_count(pte);
17066 +void __pte_free(pgtable_t pte)
17068 + if (!PageHighMem(pte)) {
17069 + unsigned long va = (unsigned long)page_address(pte);
17070 + unsigned int level;
17071 + pte_t *ptep = lookup_address(va, &level);
17073 + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
17074 + if (!pte_write(*ptep)
17075 + && HYPERVISOR_update_va_mapping(va,
17076 + mk_pte(pte, PAGE_KERNEL),
17080 +#ifdef CONFIG_HIGHPTE
17081 + ClearPagePinned(pte);
17086 + ClearPageForeign(pte);
17087 + init_page_count(pte);
17088 + pgtable_page_dtor(pte);
17089 + __free_page(pte);
17092 +#if PAGETABLE_LEVELS >= 3
17093 +static void _pmd_free(struct page *page, unsigned int order)
17096 + __pmd_free(page);
17099 +pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
17101 + struct page *pmd;
17103 + pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17106 + SetPageForeign(pmd, _pmd_free);
17107 + init_page_count(pmd);
17108 + return page_address(pmd);
17111 +void __pmd_free(pgtable_t pmd)
17113 + unsigned long va = (unsigned long)page_address(pmd);
17114 + unsigned int level;
17115 + pte_t *ptep = lookup_address(va, &level);
17117 + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
17118 + if (!pte_write(*ptep)
17119 + && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
17122 + ClearPageForeign(pmd);
17123 + init_page_count(pmd);
17124 + __free_page(pmd);
17128 +/* blktap and gntdev need this, as otherwise they would implicitly (and
17129 + * needlessly, as they never use it) reference init_mm. */
17130 +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
17131 + unsigned long addr, pte_t *ptep, int full)
17133 + return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
17135 +EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
17138 + * The current flushing context - we pass it instead of 5 arguments:
17141 + unsigned long vaddr;
17142 + pgprot_t mask_set;
17143 + pgprot_t mask_clr;
17146 + unsigned long pfn;
17149 +#ifdef CONFIG_X86_64
17151 +static inline unsigned long highmap_start_pfn(void)
17153 + return __pa(_text) >> PAGE_SHIFT;
17156 +static inline unsigned long highmap_end_pfn(void)
17158 + return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
17163 +#ifdef CONFIG_DEBUG_PAGEALLOC
17164 +# define debug_pagealloc 1
17166 +# define debug_pagealloc 0
17170 +within(unsigned long addr, unsigned long start, unsigned long end)
17172 + return addr >= start && addr < end;
17176 + * Flushing functions
17180 + * clflush_cache_range - flush a cache range with clflush
17181 + * @vaddr: virtual start address
17182 + * @size: number of bytes to flush
17184 + * clflush is an unordered instruction which needs fencing with mfence
17185 + * to avoid ordering issues.
17187 +void clflush_cache_range(void *vaddr, unsigned int size)
17189 + void *vend = vaddr + size - 1;
17193 + for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
17196 + * Flush any possible final partial cacheline:
17203 +static void __cpa_flush_all(void *arg)
17205 + unsigned long cache = (unsigned long)arg;
17208 + * Flush all to work around Errata in early athlons regarding
17209 + * large page flushing.
17211 + __flush_tlb_all();
17213 + if (cache && boot_cpu_data.x86_model >= 4)
17217 +static void cpa_flush_all(unsigned long cache)
17219 + BUG_ON(irqs_disabled());
17221 + on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
17224 +static void __cpa_flush_range(void *arg)
17227 + * We could optimize that further and do individual per page
17228 + * tlb invalidates for a low number of pages. Caveat: we must
17229 + * flush the high aliases on 64bit as well.
17231 + __flush_tlb_all();
17234 +static void cpa_flush_range(unsigned long start, int numpages, int cache)
17236 + unsigned int i, level;
17237 + unsigned long addr;
17239 + BUG_ON(irqs_disabled());
17240 + WARN_ON(PAGE_ALIGN(start) != start);
17242 + on_each_cpu(__cpa_flush_range, NULL, 1, 1);
17248 + * We only need to flush on one CPU,
17249 + * clflush is a MESI-coherent instruction that
17250 + * will cause all other CPUs to flush the same
17253 + for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
17254 + pte_t *pte = lookup_address(addr, &level);
17257 + * Only flush present addresses:
17259 + if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
17260 + clflush_cache_range((void *) addr, PAGE_SIZE);
17265 + * Certain areas of memory on x86 require very specific protection flags,
17266 + * for example the BIOS area or kernel text. Callers don't always get this
17267 + * right (again, ioremap() on BIOS memory is not uncommon) so this function
17268 + * checks and fixes these known static required protection bits.
17270 +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
17271 + unsigned long pfn)
17273 + pgprot_t forbidden = __pgprot(0);
17275 +#ifndef CONFIG_XEN
17277 + * The BIOS area between 640k and 1Mb needs to be executable for
17278 + * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
17280 + if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
17281 + pgprot_val(forbidden) |= _PAGE_NX;
17285 + * The kernel text needs to be executable for obvious reasons
17286 + * Does not cover __inittext since that is gone later on. On
17287 + * 64bit we do not enforce !NX on the low mapping
17289 + if (within(address, (unsigned long)_text, (unsigned long)_etext))
17290 + pgprot_val(forbidden) |= _PAGE_NX;
17293 + * The .rodata section needs to be read-only. Using the pfn
17294 + * catches all aliases.
17296 + if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
17297 + __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
17298 + pgprot_val(forbidden) |= _PAGE_RW;
17300 + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
17306 + * Lookup the page table entry for a virtual address. Return a pointer
17307 + * to the entry and the level of the mapping.
17309 + * Note: We return pud and pmd either when the entry is marked large
17310 + * or when the present bit is not set. Otherwise we would return a
17311 + * pointer to a nonexisting mapping.
17313 +pte_t *lookup_address(unsigned long address, unsigned int *level)
17315 + pgd_t *pgd = pgd_offset_k(address);
17319 + *level = PG_LEVEL_NONE;
17321 + if (pgd_none(*pgd))
17324 + pud = pud_offset(pgd, address);
17325 + if (pud_none(*pud))
17328 + *level = PG_LEVEL_1G;
17329 + if (pud_large(*pud) || !pud_present(*pud))
17330 + return (pte_t *)pud;
17332 + pmd = pmd_offset(pud, address);
17333 + if (pmd_none(*pmd))
17336 + *level = PG_LEVEL_2M;
17337 + if (pmd_large(*pmd) || !pmd_present(*pmd))
17338 + return (pte_t *)pmd;
17340 + *level = PG_LEVEL_4K;
17342 + return pte_offset_kernel(pmd, address);
17346 + * Set the new pmd in all the pgds we know about:
17348 +static void __set_pmd_pte(pte_t *kpte, unsigned long address,
17349 + unsigned int level, pte_t pte)
17351 + /* change init_mm */
17353 + case PG_LEVEL_2M:
17354 + xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte)));
17356 +#ifdef CONFIG_X86_64
17357 + case PG_LEVEL_1G:
17358 + xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte)));
17364 +#ifdef CONFIG_X86_32
17365 + if (!SHARED_KERNEL_PMD) {
17366 + struct page *page;
17368 + list_for_each_entry(page, &pgd_list, lru) {
17373 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
17374 + pud = pud_offset(pgd, address);
17375 + pmd = pmd_offset(pud, address);
17376 + xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte)));
17383 +try_preserve_large_page(pte_t *kpte, unsigned long address,
17384 + struct cpa_data *cpa)
17386 + unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
17387 + pte_t new_pte, old_pte, *tmp;
17388 + pgprot_t old_prot, new_prot;
17389 + int i, do_split = 1;
17390 + unsigned int level;
17392 + spin_lock_irqsave(&pgd_lock, flags);
17394 + * Check for races, another CPU might have split this page
17397 + tmp = lookup_address(address, &level);
17402 + case PG_LEVEL_2M:
17403 + psize = PMD_PAGE_SIZE;
17404 + pmask = PMD_PAGE_MASK;
17406 +#ifdef CONFIG_X86_64
17407 + case PG_LEVEL_1G:
17408 + psize = PUD_PAGE_SIZE;
17409 + pmask = PUD_PAGE_MASK;
17413 + do_split = -EINVAL;
17418 + * Calculate the number of pages, which fit into this large
17419 + * page starting at address:
17421 + nextpage_addr = (address + psize) & pmask;
17422 + numpages = (nextpage_addr - address) >> PAGE_SHIFT;
17423 + if (numpages < cpa->numpages)
17424 + cpa->numpages = numpages;
17427 + * We are safe now. Check whether the new pgprot is the same:
17430 + old_prot = new_prot = pte_pgprot(old_pte);
17432 + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
17433 + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
17436 + * old_pte points to the large page base address. So we need
17437 + * to add the offset of the virtual address:
17439 + pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
17442 + new_prot = static_protections(new_prot, address, pfn);
17445 + * We need to check the full range, whether
17446 + * static_protections() requires a different pgprot for one of
17447 + * the pages in the range we try to preserve:
17449 + if (pfn < max_mapnr) {
17450 + addr = address + PAGE_SIZE;
17451 + for (i = 1; i < cpa->numpages && ++pfn < max_mapnr;
17452 + i++, addr += PAGE_SIZE) {
17453 + pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
17455 + if (pgprot_val(chk_prot) != pgprot_val(new_prot))
17461 + * If there are no changes, return. cpa->numpages has been updated
17464 + if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
17470 + * We need to change the attributes. Check, whether we can
17471 + * change the large page in one go. We request a split, when
17472 + * the address is not aligned or the number of pages is
17473 + * smaller than the number of pages in the large page. Note
17474 + * that we limited the number of possible pages already to
17475 + * the number of pages in the large page.
17477 + if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
17479 + * The address is aligned and the number of pages
17480 + * covers the full page.
17482 + new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot));
17483 + __set_pmd_pte(kpte, address, level, new_pte);
17484 + cpa->flushtlb = 1;
17489 + spin_unlock_irqrestore(&pgd_lock, flags);
17494 +static LIST_HEAD(page_pool);
17495 +static unsigned long pool_size, pool_pages, pool_low;
17496 +static unsigned long pool_used, pool_failed;
17498 +static void cpa_fill_pool(struct page **ret)
17500 + gfp_t gfp = GFP_KERNEL;
17501 + unsigned long flags;
17505 + * Avoid recursion (on debug-pagealloc) and also signal
17506 + * our priority to get to these pagetables:
17508 + if (current->flags & PF_MEMALLOC)
17510 + current->flags |= PF_MEMALLOC;
17513 + * Allocate atomically from atomic contexts:
17515 + if (in_atomic() || irqs_disabled() || debug_pagealloc)
17516 + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
17518 + while (pool_pages < pool_size || (ret && !*ret)) {
17519 + p = alloc_pages(gfp, 0);
17525 + * If the call site needs a page right now, provide it:
17527 + if (ret && !*ret) {
17531 + spin_lock_irqsave(&pgd_lock, flags);
17532 + list_add(&p->lru, &page_pool);
17534 + spin_unlock_irqrestore(&pgd_lock, flags);
17537 + current->flags &= ~PF_MEMALLOC;
17540 +#define SHIFT_MB (20 - PAGE_SHIFT)
17541 +#define ROUND_MB_GB ((1 << 10) - 1)
17542 +#define SHIFT_MB_GB 10
17543 +#define POOL_PAGES_PER_GB 16
17545 +void __init cpa_init(void)
17547 + struct sysinfo si;
17548 + unsigned long gb;
17552 + * Calculate the number of pool pages:
17554 + * Convert totalram (nr of pages) to MiB and round to the next
17555 + * GiB. Shift MiB to GiB and multiply the result by
17556 + * POOL_PAGES_PER_GB:
17558 + if (debug_pagealloc) {
17559 + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
17560 + pool_size = POOL_PAGES_PER_GB * gb;
17564 + pool_low = pool_size;
17566 + cpa_fill_pool(NULL);
17567 + printk(KERN_DEBUG
17568 + "CPA: page pool initialized %lu of %lu pages preallocated\n",
17569 + pool_pages, pool_size);
17572 +static int split_large_page(pte_t *kpte, unsigned long address)
17574 + unsigned long flags, mfn, mfninc = 1;
17575 + unsigned int i, level;
17576 + pte_t *pbase, *tmp;
17577 + pgprot_t ref_prot;
17578 + struct page *base;
17581 + * Get a page from the pool. The pool list is protected by the
17582 + * pgd_lock, which we have to take anyway for the split
17585 + spin_lock_irqsave(&pgd_lock, flags);
17586 + if (list_empty(&page_pool)) {
17587 + spin_unlock_irqrestore(&pgd_lock, flags);
17589 + cpa_fill_pool(&base);
17592 + spin_lock_irqsave(&pgd_lock, flags);
17594 + base = list_first_entry(&page_pool, struct page, lru);
17595 + list_del(&base->lru);
17598 + if (pool_pages < pool_low)
17599 + pool_low = pool_pages;
17603 + * Check for races, another CPU might have split this page
17604 + * up for us already:
17606 + tmp = lookup_address(address, &level);
17610 + pbase = (pte_t *)page_address(base);
17611 +#ifdef CONFIG_X86_32
17612 + paravirt_alloc_pt(&init_mm, page_to_pfn(base));
17614 + ref_prot = pte_pgprot(pte_clrhuge(*kpte));
17616 +#ifdef CONFIG_X86_64
17617 + if (level == PG_LEVEL_1G) {
17618 + mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
17619 + pgprot_val(ref_prot) |= _PAGE_PSE;
17624 + * Get the target mfn from the original entry:
17626 + mfn = __pte_mfn(*kpte);
17627 + for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc)
17628 + set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
17631 + * Install the new, split up pagetable. Important details here:
17633 + * On Intel the NX bit of all levels must be cleared to make a
17634 + * page executable (see section 4.13.2 of Intel 64 and IA-32
17635 + * Architectures Software Developer's Manual).
17637 + * Mark the entry present. The current mapping might be
17638 + * set to not present, which we preserved above.
17640 + if (HYPERVISOR_update_va_mapping((unsigned long)pbase,
17641 + mk_pte(base, PAGE_KERNEL_RO), 0))
17643 + ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
17644 + pgprot_val(ref_prot) |= _PAGE_PRESENT;
17645 + __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot));
17650 + * If we dropped out via the lookup_address check under
17651 + * pgd_lock then stick the page back into the pool:
17654 + list_add(&base->lru, &page_pool);
17658 + spin_unlock_irqrestore(&pgd_lock, flags);
17663 +static int __change_page_attr(struct cpa_data *cpa, int primary)
17665 + unsigned long address = cpa->vaddr;
17666 + int do_split, err;
17667 + unsigned int level;
17668 + pte_t *kpte, old_pte;
17671 + kpte = lookup_address(address, &level);
17673 + return primary ? -EINVAL : 0;
17676 + if (!__pte_val(old_pte)) {
17679 + printk(KERN_WARNING "CPA: called for zero pte. "
17680 + "vaddr = %lx cpa->vaddr = %lx\n", address,
17686 + if (level == PG_LEVEL_4K) {
17688 + pgprot_t new_prot = pte_pgprot(old_pte);
17689 + unsigned long mfn = __pte_mfn(old_pte);
17691 + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
17692 + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
17694 + new_prot = static_protections(new_prot, address,
17695 + mfn_to_local_pfn(mfn));
17698 + * We need to keep the mfn from the existing PTE,
17699 + * after all we're only going to change its attributes
17700 + * not the memory it points to
17702 + new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot));
17703 + cpa->pfn = mfn_to_local_pfn(mfn);
17705 + * Do we really change anything ?
17707 + if (__pte_val(old_pte) != __pte_val(new_pte)) {
17708 + set_pte_atomic(kpte, new_pte);
17709 + cpa->flushtlb = 1;
17711 + cpa->numpages = 1;
17716 + * Check, whether we can keep the large page intact
17717 + * and just change the pte:
17719 + do_split = try_preserve_large_page(kpte, address, cpa);
17721 + * When the range fits into the existing large page,
17722 + * return. cpa->numpages and cpa->flushtlb have been updated in
17723 + * try_preserve_large_page():
17725 + if (do_split <= 0)
17729 + * We have to split the large page:
17731 + err = split_large_page(kpte, address);
17733 + cpa->flushtlb = 1;
17740 +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
17742 +static int cpa_process_alias(struct cpa_data *cpa)
17744 + struct cpa_data alias_cpa;
17747 + if (cpa->pfn > max_pfn_mapped)
17751 + * No need to redo, when the primary call touched the direct
17752 + * mapping already:
17754 + if (!within(cpa->vaddr, PAGE_OFFSET,
17755 + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
17757 + alias_cpa = *cpa;
17758 + alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
17760 + ret = __change_page_attr_set_clr(&alias_cpa, 0);
17763 +#ifdef CONFIG_X86_64
17767 + * No need to redo, when the primary call touched the high
17768 + * mapping already:
17770 + if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
17774 + * If the physical address is inside the kernel map, we need
17775 + * to touch the high mapped kernel as well:
17777 + if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
17780 + alias_cpa = *cpa;
17781 + alias_cpa.vaddr =
17782 + (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map;
17785 + * The high mapping range is imprecise, so ignore the return value.
17787 + __change_page_attr_set_clr(&alias_cpa, 0);
17792 +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
17794 + int ret, numpages = cpa->numpages;
17796 + while (numpages) {
17798 + * Store the remaining nr of pages for the large page
17799 + * preservation check.
17801 + cpa->numpages = numpages;
17803 + ret = __change_page_attr(cpa, checkalias);
17807 + if (checkalias) {
17808 + ret = cpa_process_alias(cpa);
17814 + * Adjust the number of pages with the result of the
17815 + * CPA operation. Either a large page has been
17816 + * preserved or a single page update happened.
17818 + BUG_ON(cpa->numpages > numpages);
17819 + numpages -= cpa->numpages;
17820 + cpa->vaddr += cpa->numpages * PAGE_SIZE;
17825 +static inline int cache_attr(pgprot_t attr)
17827 + return pgprot_val(attr) &
17828 + (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
17831 +static int change_page_attr_set_clr(unsigned long addr, int numpages,
17832 + pgprot_t mask_set, pgprot_t mask_clr)
17834 + struct cpa_data cpa;
17835 + int ret, cache, checkalias;
17838 + * Check, if we are requested to change a not supported
17841 + mask_set = canon_pgprot(mask_set);
17842 + mask_clr = canon_pgprot(mask_clr);
17843 + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
17846 + /* Ensure we are PAGE_SIZE aligned */
17847 + if (addr & ~PAGE_MASK) {
17848 + addr &= PAGE_MASK;
17850 + * People should not be passing in unaligned addresses:
17855 + cpa.vaddr = addr;
17856 + cpa.numpages = numpages;
17857 + cpa.mask_set = mask_set;
17858 + cpa.mask_clr = mask_clr;
17859 + cpa.flushtlb = 0;
17861 + /* No alias checking for _NX bit modifications */
17862 + checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
17864 + ret = __change_page_attr_set_clr(&cpa, checkalias);
17867 + * Check whether we really changed something:
17869 + if (!cpa.flushtlb)
17873 + * No need to flush, when we did not set any of the caching
17876 + cache = cache_attr(mask_set);
17879 + * On success we use clflush, when the CPU supports it to
17880 + * avoid the wbinvd. If the CPU does not support it and in the
17881 + * error case we fall back to cpa_flush_all (which uses
17884 + if (!ret && cpu_has_clflush)
17885 + cpa_flush_range(addr, numpages, cache);
17887 + cpa_flush_all(cache);
17890 + cpa_fill_pool(NULL);
17895 +static inline int change_page_attr_set(unsigned long addr, int numpages,
17898 + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
17901 +static inline int change_page_attr_clear(unsigned long addr, int numpages,
17904 + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
17907 +int set_memory_uc(unsigned long addr, int numpages)
17909 + return change_page_attr_set(addr, numpages,
17910 + __pgprot(_PAGE_PCD));
17912 +EXPORT_SYMBOL(set_memory_uc);
17914 +int set_memory_wb(unsigned long addr, int numpages)
17916 + return change_page_attr_clear(addr, numpages,
17917 + __pgprot(_PAGE_PCD | _PAGE_PWT));
17919 +EXPORT_SYMBOL(set_memory_wb);
17921 +int set_memory_x(unsigned long addr, int numpages)
17923 + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
17925 +EXPORT_SYMBOL(set_memory_x);
17927 +int set_memory_nx(unsigned long addr, int numpages)
17929 + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
17931 +EXPORT_SYMBOL(set_memory_nx);
17933 +int set_memory_ro(unsigned long addr, int numpages)
17935 + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
17938 +int set_memory_rw(unsigned long addr, int numpages)
17940 + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
17943 +int set_memory_np(unsigned long addr, int numpages)
17945 + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
17948 +int set_pages_uc(struct page *page, int numpages)
17950 + unsigned long addr = (unsigned long)page_address(page);
17952 + return set_memory_uc(addr, numpages);
17954 +EXPORT_SYMBOL(set_pages_uc);
17956 +int set_pages_wb(struct page *page, int numpages)
17958 + unsigned long addr = (unsigned long)page_address(page);
17960 + return set_memory_wb(addr, numpages);
17962 +EXPORT_SYMBOL(set_pages_wb);
17964 +int set_pages_x(struct page *page, int numpages)
17966 + unsigned long addr = (unsigned long)page_address(page);
17968 + return set_memory_x(addr, numpages);
17970 +EXPORT_SYMBOL(set_pages_x);
17972 +int set_pages_nx(struct page *page, int numpages)
17974 + unsigned long addr = (unsigned long)page_address(page);
17976 + return set_memory_nx(addr, numpages);
17978 +EXPORT_SYMBOL(set_pages_nx);
17980 +int set_pages_ro(struct page *page, int numpages)
17982 + unsigned long addr = (unsigned long)page_address(page);
17984 + return set_memory_ro(addr, numpages);
17987 +int set_pages_rw(struct page *page, int numpages)
17989 + unsigned long addr = (unsigned long)page_address(page);
17991 + return set_memory_rw(addr, numpages);
17994 +#ifdef CONFIG_DEBUG_PAGEALLOC
17996 +static int __set_pages_p(struct page *page, int numpages)
17998 + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
17999 + .numpages = numpages,
18000 + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
18001 + .mask_clr = __pgprot(0)};
18003 + return __change_page_attr_set_clr(&cpa, 1);
18006 +static int __set_pages_np(struct page *page, int numpages)
18008 + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
18009 + .numpages = numpages,
18010 + .mask_set = __pgprot(0),
18011 + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
18013 + return __change_page_attr_set_clr(&cpa, 1);
18016 +void kernel_map_pages(struct page *page, int numpages, int enable)
18018 + if (PageHighMem(page))
18021 + debug_check_no_locks_freed(page_address(page),
18022 + numpages * PAGE_SIZE);
18026 + * If page allocator is not up yet then do not call c_p_a():
18028 + if (!debug_pagealloc_enabled)
18032 + * The return value is ignored as the calls cannot fail.
18033 + * Large pages are kept enabled at boot time, and are
18034 + * split up quickly with DEBUG_PAGEALLOC. If a splitup
18035 + * fails here (due to temporary memory shortage) no damage
18036 + * is done because we just keep the largepage intact up
18037 + * to the next attempt when it will likely be split up:
18040 + __set_pages_p(page, numpages);
18042 + __set_pages_np(page, numpages);
18045 + * We should perform an IPI and flush all tlbs,
18046 + * but that can deadlock, so flush only the current CPU:
18048 + __flush_tlb_all();
18051 + * Try to refill the page pool here. We can do this only after
18054 + cpa_fill_pool(NULL);
18057 +#ifdef CONFIG_HIBERNATION
18059 +bool kernel_page_present(struct page *page)
18061 + unsigned int level;
18064 + if (PageHighMem(page))
18067 + pte = lookup_address((unsigned long)page_address(page), &level);
18068 + return (__pte_val(*pte) & _PAGE_PRESENT);
18071 +#endif /* CONFIG_HIBERNATION */
18073 +#endif /* CONFIG_DEBUG_PAGEALLOC */
18075 +static inline int in_secondary_range(unsigned long va)
18077 +#ifdef CONFIG_X86_64
18078 + return va >= VMALLOC_START && va < VMALLOC_END;
18080 + return va >= (unsigned long)high_memory;
18084 +static void __make_page_readonly(unsigned long va)
18087 + unsigned int level;
18089 + pte = lookup_address(va, &level);
18090 + BUG_ON(!pte || level != PG_LEVEL_4K);
18091 + if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0))
18093 + if (in_secondary_range(va)) {
18094 + unsigned long pfn = pte_pfn(*pte);
18096 +#ifdef CONFIG_HIGHMEM
18097 + if (pfn >= highstart_pfn)
18098 + kmap_flush_unused(); /* flush stale writable kmaps */
18101 + __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT));
18105 +static void __make_page_writable(unsigned long va)
18108 + unsigned int level;
18110 + pte = lookup_address(va, &level);
18111 + BUG_ON(!pte || level != PG_LEVEL_4K);
18112 + if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
18114 + if (in_secondary_range(va)) {
18115 + unsigned long pfn = pte_pfn(*pte);
18117 +#ifdef CONFIG_HIGHMEM
18118 + if (pfn < highstart_pfn)
18120 + __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT));
18124 +void make_page_readonly(void *va, unsigned int feature)
18126 + if (!xen_feature(feature))
18127 + __make_page_readonly((unsigned long)va);
18130 +void make_page_writable(void *va, unsigned int feature)
18132 + if (!xen_feature(feature))
18133 + __make_page_writable((unsigned long)va);
18136 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
18138 + unsigned long addr;
18140 + if (xen_feature(feature))
18143 + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
18144 + __make_page_readonly(addr);
18147 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
18149 + unsigned long addr;
18151 + if (xen_feature(feature))
18154 + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
18155 + __make_page_writable(addr);
18159 + * The testcases use internal knowledge of the implementation that shouldn't
18160 + * be exposed to the rest of the kernel. Include these directly here.
18162 +#ifdef CONFIG_CPA_DEBUG
18163 +#include "pageattr-test.c"
18165 --- a/arch/x86/mm/pgtable_32-xen.c
18166 +++ b/arch/x86/mm/pgtable_32-xen.c
18168 #include <xen/features.h>
18169 #include <asm/hypervisor.h>
18171 -static void pgd_test_and_unpin(pgd_t *pgd);
18173 void show_mem(void)
18175 int total = 0, reserved = 0;
18176 @@ -167,53 +165,6 @@ pte_t *pte_alloc_one_kernel(struct mm_st
18180 -static void _pte_free(struct page *page, unsigned int order)
18186 -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
18188 - struct page *pte;
18190 -#ifdef CONFIG_HIGHPTE
18191 - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
18193 - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
18196 - SetPageForeign(pte, _pte_free);
18197 - init_page_count(pte);
18202 -void pte_free(struct page *pte)
18204 - unsigned long pfn = page_to_pfn(pte);
18206 - if (!PageHighMem(pte)) {
18207 - unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
18209 - if (!pte_write(*virt_to_ptep(va)))
18210 - if (HYPERVISOR_update_va_mapping(
18211 - va, pfn_pte(pfn, PAGE_KERNEL), 0))
18214 - ClearPagePinned(pte);
18216 - ClearPageForeign(pte);
18217 - init_page_count(pte);
18219 - __free_page(pte);
18222 -void pmd_ctor(struct kmem_cache *cache, void *pmd)
18224 - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18228 * List of all pgd's needed for non-PAE so it can invalidate entries
18229 * in both cached and uncached pgd's; not needed for PAE since the
18230 @@ -224,224 +175,191 @@ void pmd_ctor(struct kmem_cache *cache,
18231 * vmalloc faults work because attached pagetables are never freed.
18234 -DEFINE_SPINLOCK(pgd_lock);
18235 -struct page *pgd_list;
18237 static inline void pgd_list_add(pgd_t *pgd)
18239 struct page *page = virt_to_page(pgd);
18240 - page->index = (unsigned long)pgd_list;
18242 - set_page_private(pgd_list, (unsigned long)&page->index);
18244 - set_page_private(page, (unsigned long)&pgd_list);
18246 + list_add(&page->lru, &pgd_list);
18249 static inline void pgd_list_del(pgd_t *pgd)
18251 - struct page *next, **pprev, *page = virt_to_page(pgd);
18252 - next = (struct page *)page->index;
18253 - pprev = (struct page **)page_private(page);
18256 - set_page_private(next, (unsigned long)pprev);
18258 + struct page *page = virt_to_page(pgd);
18260 + list_del(&page->lru);
18263 +#define UNSHARED_PTRS_PER_PGD \
18264 + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18266 -#if (PTRS_PER_PMD == 1)
18267 -/* Non-PAE pgd constructor */
18268 -static void pgd_ctor(void *pgd)
18269 +static void pgd_ctor(void *p)
18272 unsigned long flags;
18274 - /* !PAE, no pagetable sharing */
18275 + pgd_test_and_unpin(pgd);
18277 + /* Clear usermode parts of PGD */
18278 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18280 spin_lock_irqsave(&pgd_lock, flags);
18282 - /* must happen under lock */
18283 - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18284 - swapper_pg_dir + USER_PTRS_PER_PGD,
18285 - KERNEL_PGD_PTRS);
18287 - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18288 - __pa(swapper_pg_dir) >> PAGE_SHIFT,
18289 - USER_PTRS_PER_PGD,
18290 - KERNEL_PGD_PTRS);
18291 - pgd_list_add(pgd);
18292 - spin_unlock_irqrestore(&pgd_lock, flags);
18294 -#else /* PTRS_PER_PMD > 1 */
18295 -/* PAE pgd constructor */
18296 -static void pgd_ctor(void *pgd)
18298 - /* PAE, kernel PMD may be shared */
18300 - if (SHARED_KERNEL_PMD) {
18301 - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18302 + /* If the pgd points to a shared pagetable level (either the
18303 + ptes in non-PAE, or shared PMD in PAE), then just copy the
18304 + references from swapper_pg_dir. */
18305 + if (PAGETABLE_LEVELS == 2 ||
18306 + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
18307 + clone_pgd_range(pgd + USER_PTRS_PER_PGD,
18308 swapper_pg_dir + USER_PTRS_PER_PGD,
18311 - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18312 + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18313 + __pa(swapper_pg_dir) >> PAGE_SHIFT,
18314 + USER_PTRS_PER_PGD,
18315 + KERNEL_PGD_PTRS);
18318 + /* list required to sync kernel mapping updates */
18319 + if (PAGETABLE_LEVELS == 2)
18320 + pgd_list_add(pgd);
18322 + spin_unlock_irqrestore(&pgd_lock, flags);
18324 -#endif /* PTRS_PER_PMD */
18326 static void pgd_dtor(void *pgd)
18328 unsigned long flags; /* can be called from interrupt context */
18330 - if (SHARED_KERNEL_PMD)
18333 - paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
18334 - spin_lock_irqsave(&pgd_lock, flags);
18335 - pgd_list_del(pgd);
18336 - spin_unlock_irqrestore(&pgd_lock, flags);
18337 + if (!SHARED_KERNEL_PMD) {
18338 + spin_lock_irqsave(&pgd_lock, flags);
18339 + pgd_list_del(pgd);
18340 + spin_unlock_irqrestore(&pgd_lock, flags);
18343 pgd_test_and_unpin(pgd);
18346 -#define UNSHARED_PTRS_PER_PGD \
18347 - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18349 -/* If we allocate a pmd for part of the kernel address space, then
18350 - make sure its initialized with the appropriate kernel mappings.
18351 - Otherwise use a cached zeroed pmd. */
18352 -static pmd_t *pmd_cache_alloc(int idx)
18353 +#ifdef CONFIG_X86_PAE
18355 + * Mop up any pmd pages which may still be attached to the pgd.
18356 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
18357 + * preallocate which never got a corresponding vma will need to be
18358 + * freed manually.
18360 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18365 - if (idx >= USER_PTRS_PER_PGD) {
18366 - pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
18367 + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
18368 + pgd_t pgd = pgdp[i];
18370 -#ifndef CONFIG_XEN
18373 - (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
18374 - sizeof(pmd_t) * PTRS_PER_PMD);
18377 - pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18378 + if (__pgd_val(pgd) != 0) {
18379 + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
18383 + pgdp[i] = xen_make_pgd(0);
18385 -static void pmd_cache_free(pmd_t *pmd, int idx)
18387 - if (idx >= USER_PTRS_PER_PGD) {
18388 - make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
18389 - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18390 - free_page((unsigned long)pmd);
18392 - kmem_cache_free(pmd_cache, pmd);
18393 + paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
18394 + pmd_free(mm, pmd);
18399 -pgd_t *pgd_alloc(struct mm_struct *mm)
18401 + * In PAE mode, we need to do a cr3 reload (=tlb flush) when
18402 + * updating the top-level pagetable entries to guarantee the
18403 + * processor notices the update. Since this is expensive, and
18404 + * all 4 top-level entries are used almost immediately in a
18405 + * new process's life, we just pre-populate them here.
18407 + * Also, if we're in a paravirt environment where the kernel pmd is
18408 + * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
18409 + * and initialize the kernel pmds here.
18411 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18414 + pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
18415 + unsigned long addr, flags;
18417 - pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
18418 - pmd_t **pmds = NULL;
18419 - unsigned long flags;
18421 - pgd_test_and_unpin(pgd);
18423 - if (PTRS_PER_PMD == 1 || !pgd)
18427 - if (!SHARED_KERNEL_PMD) {
18429 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
18430 - * allocation). We therefore store virtual addresses of pmds as they
18431 - * do not change across save/restore, and poke the machine addresses
18432 - * into the pgdir under the pgd_lock.
18434 - pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
18436 - quicklist_free(0, pgd_dtor, pgd);
18442 - /* Allocate pmds, remember virtual addresses. */
18443 - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18444 - pmd_t *pmd = pmd_cache_alloc(i);
18448 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
18449 + * allocation). We therefore store virtual addresses of pmds as they
18450 + * do not change across save/restore, and poke the machine addresses
18451 + * into the pgdir under the pgd_lock.
18453 + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
18454 + pmds[i] = pmd_alloc_one(mm, addr);
18458 - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
18462 - set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
18466 - if (SHARED_KERNEL_PMD)
18469 spin_lock_irqsave(&pgd_lock, flags);
18471 /* Protect against save/restore: move below 4GB under pgd_lock. */
18472 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
18473 - int rc = xen_create_contiguous_region(
18474 - (unsigned long)pgd, 0, 32);
18476 - spin_unlock_irqrestore(&pgd_lock, flags);
18479 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
18480 + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
18481 + spin_unlock_irqrestore(&pgd_lock, flags);
18484 + pmd_free(mm, pmds[i]);
18488 /* Copy kernel pmd contents and write-protect the new pmds. */
18489 - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18491 - (void *)pgd_page_vaddr(swapper_pg_dir[i]),
18492 - sizeof(pmd_t) * PTRS_PER_PMD);
18493 - make_lowmem_page_readonly(
18494 - pmds[i], XENFEAT_writable_page_tables);
18496 + pud = pud_offset(pgd, 0);
18497 + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
18498 + i++, pud++, addr += PUD_SIZE) {
18499 + if (i >= USER_PTRS_PER_PGD) {
18501 + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
18502 + sizeof(pmd_t) * PTRS_PER_PMD);
18503 + make_lowmem_page_readonly(
18504 + pmds[i], XENFEAT_writable_page_tables);
18507 - /* It is safe to poke machine addresses of pmds under the pmd_lock. */
18508 - for (i = 0; i < PTRS_PER_PGD; i++)
18509 - set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
18510 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
18511 + pud_populate(mm, pud, pmds[i]);
18514 - /* Ensure this pgd gets picked up and pinned on save/restore. */
18515 + /* List required to sync kernel mapping updates and
18516 + * to pin/unpin on save/restore. */
18519 spin_unlock_irqrestore(&pgd_lock, flags);
18525 +#else /* !CONFIG_X86_PAE */
18526 +/* No need to prepopulate any pagetable entries in non-PAE modes. */
18527 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18533 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18536 +#endif /* CONFIG_X86_PAE */
18540 - for (i--; i >= 0; i--) {
18541 - pgd_t pgdent = pgd[i];
18542 - void* pmd = (void *)__va(pgd_val(pgdent)-1);
18543 - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18544 - pmd_cache_free(pmd, i);
18547 - for (i--; i >= 0; i--) {
18548 - paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
18549 - pmd_cache_free(pmds[i], i);
18552 +pgd_t *pgd_alloc(struct mm_struct *mm)
18554 + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
18556 + /* so that alloc_pd can use it */
18561 + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
18562 + free_page((unsigned long)pgd);
18565 - quicklist_free(0, pgd_dtor, pgd);
18571 -void pgd_free(pgd_t *pgd)
18572 +void pgd_free(struct mm_struct *mm, pgd_t *pgd)
18577 * After this the pgd should not be pinned for the duration of this
18578 * function's execution. We should never sleep and thus never race:
18579 @@ -450,39 +368,43 @@ void pgd_free(pgd_t *pgd)
18580 * 2. The machine addresses in PGD entries will not become invalid
18581 * due to a concurrent save/restore.
18583 - pgd_test_and_unpin(pgd);
18586 - /* in the PAE case user pgd entries are overwritten before usage */
18587 - if (PTRS_PER_PMD > 1) {
18588 - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18589 - pgd_t pgdent = pgd[i];
18590 - void* pmd = (void *)__va(pgd_val(pgdent)-1);
18591 - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18592 - pmd_cache_free(pmd, i);
18594 + if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
18595 + xen_destroy_contiguous_region((unsigned long)pgd, 0);
18597 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
18598 - xen_destroy_contiguous_region((unsigned long)pgd, 0);
18600 + pgd_mop_up_pmds(mm, pgd);
18601 + free_page((unsigned long)pgd);
18604 - /* in the non-PAE case, free_pgtables() clears user pgd entries */
18605 - quicklist_free(0, pgd_dtor, pgd);
18606 +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
18608 + pgtable_page_dtor(pte);
18609 + paravirt_release_pt(page_to_pfn(pte));
18610 + tlb_remove_page(tlb, pte);
18613 -void check_pgt_cache(void)
18614 +#ifdef CONFIG_X86_PAE
18616 +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
18618 - quicklist_trim(0, pgd_dtor, 25, 16);
18619 + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18620 + tlb_remove_page(tlb, virt_to_page(pmd));
18625 void make_lowmem_page_readonly(void *va, unsigned int feature)
18628 + unsigned int level;
18631 if (xen_feature(feature))
18634 - pte = virt_to_ptep(va);
18635 + pte = lookup_address((unsigned long)va, &level);
18636 + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18637 rc = HYPERVISOR_update_va_mapping(
18638 (unsigned long)va, pte_wrprotect(*pte), 0);
18640 @@ -491,313 +413,15 @@ void make_lowmem_page_readonly(void *va,
18641 void make_lowmem_page_writable(void *va, unsigned int feature)
18644 + unsigned int level;
18647 if (xen_feature(feature))
18650 - pte = virt_to_ptep(va);
18651 + pte = lookup_address((unsigned long)va, &level);
18652 + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18653 rc = HYPERVISOR_update_va_mapping(
18654 (unsigned long)va, pte_mkwrite(*pte), 0);
18658 -void make_page_readonly(void *va, unsigned int feature)
18663 - if (xen_feature(feature))
18666 - pte = virt_to_ptep(va);
18667 - rc = HYPERVISOR_update_va_mapping(
18668 - (unsigned long)va, pte_wrprotect(*pte), 0);
18669 - if (rc) /* fallback? */
18670 - xen_l1_entry_update(pte, pte_wrprotect(*pte));
18671 - if ((unsigned long)va >= (unsigned long)high_memory) {
18672 - unsigned long pfn = pte_pfn(*pte);
18673 -#ifdef CONFIG_HIGHMEM
18674 - if (pfn >= highstart_pfn)
18675 - kmap_flush_unused(); /* flush stale writable kmaps */
18678 - make_lowmem_page_readonly(
18679 - phys_to_virt(pfn << PAGE_SHIFT), feature);
18683 -void make_page_writable(void *va, unsigned int feature)
18688 - if (xen_feature(feature))
18691 - pte = virt_to_ptep(va);
18692 - rc = HYPERVISOR_update_va_mapping(
18693 - (unsigned long)va, pte_mkwrite(*pte), 0);
18694 - if (rc) /* fallback? */
18695 - xen_l1_entry_update(pte, pte_mkwrite(*pte));
18696 - if ((unsigned long)va >= (unsigned long)high_memory) {
18697 - unsigned long pfn = pte_pfn(*pte);
18698 -#ifdef CONFIG_HIGHMEM
18699 - if (pfn < highstart_pfn)
18701 - make_lowmem_page_writable(
18702 - phys_to_virt(pfn << PAGE_SHIFT), feature);
18706 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
18708 - if (xen_feature(feature))
18711 - while (nr-- != 0) {
18712 - make_page_readonly(va, feature);
18713 - va = (void *)((unsigned long)va + PAGE_SIZE);
18717 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
18719 - if (xen_feature(feature))
18722 - while (nr-- != 0) {
18723 - make_page_writable(va, feature);
18724 - va = (void *)((unsigned long)va + PAGE_SIZE);
18728 -static void _pin_lock(struct mm_struct *mm, int lock) {
18730 - spin_lock(&mm->page_table_lock);
18731 -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
18732 - /* While mm->page_table_lock protects us against insertions and
18733 - * removals of higher level page table pages, it doesn't protect
18734 - * against updates of pte-s. Such updates, however, require the
18735 - * pte pages to be in consistent state (unpinned+writable or
18736 - * pinned+readonly). The pinning and attribute changes, however
18737 - * cannot be done atomically, which is why such updates must be
18738 - * prevented from happening concurrently.
18739 - * Note that no pte lock can ever elsewhere be acquired nesting
18740 - * with an already acquired one in the same mm, or with the mm's
18741 - * page_table_lock already acquired, as that would break in the
18742 - * non-split case (where all these are actually resolving to the
18743 - * one page_table_lock). Thus acquiring all of them here is not
18744 - * going to result in dead locks, and the order of acquires
18745 - * doesn't matter.
18748 - pgd_t *pgd = mm->pgd;
18751 - for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18755 - if (pgd_none(*pgd))
18757 - pud = pud_offset(pgd, 0);
18758 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18762 - if (pud_none(*pud))
18764 - pmd = pmd_offset(pud, 0);
18765 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18768 - if (pmd_none(*pmd))
18770 - ptl = pte_lockptr(0, pmd);
18774 - spin_unlock(ptl);
18781 - spin_unlock(&mm->page_table_lock);
18783 -#define pin_lock(mm) _pin_lock(mm, 1)
18784 -#define pin_unlock(mm) _pin_lock(mm, 0)
18786 -#define PIN_BATCH 4
18787 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
18789 -static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
18790 - unsigned int cpu, unsigned seq)
18792 - unsigned long pfn = page_to_pfn(page);
18794 - if (PageHighMem(page)) {
18795 - if (pgprot_val(flags) & _PAGE_RW)
18796 - ClearPagePinned(page);
18798 - SetPagePinned(page);
18800 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18801 - (unsigned long)__va(pfn << PAGE_SHIFT),
18802 - pfn_pte(pfn, flags), 0);
18803 - if (unlikely(++seq == PIN_BATCH)) {
18804 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18805 - PIN_BATCH, NULL)))
18814 -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
18816 - pgd_t *pgd = pgd_base;
18820 - unsigned int cpu, seq;
18822 - if (xen_feature(XENFEAT_auto_translated_physmap))
18827 - for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18828 - if (pgd_none(*pgd))
18830 - pud = pud_offset(pgd, 0);
18831 - if (PTRS_PER_PUD > 1) /* not folded */
18832 - seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
18833 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18834 - if (pud_none(*pud))
18836 - pmd = pmd_offset(pud, 0);
18837 - if (PTRS_PER_PMD > 1) /* not folded */
18838 - seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
18839 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18840 - if (pmd_none(*pmd))
18842 - seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
18847 - if (likely(seq != 0)) {
18848 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18849 - (unsigned long)pgd_base,
18850 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18852 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18855 - } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
18856 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18863 -static void __pgd_pin(pgd_t *pgd)
18865 - pgd_walk(pgd, PAGE_KERNEL_RO);
18866 - kmap_flush_unused();
18867 - xen_pgd_pin(__pa(pgd));
18868 - SetPagePinned(virt_to_page(pgd));
18871 -static void __pgd_unpin(pgd_t *pgd)
18873 - xen_pgd_unpin(__pa(pgd));
18874 - pgd_walk(pgd, PAGE_KERNEL);
18875 - ClearPagePinned(virt_to_page(pgd));
18878 -static void pgd_test_and_unpin(pgd_t *pgd)
18880 - if (PagePinned(virt_to_page(pgd)))
18881 - __pgd_unpin(pgd);
18884 -void mm_pin(struct mm_struct *mm)
18886 - if (xen_feature(XENFEAT_writable_page_tables))
18889 - __pgd_pin(mm->pgd);
18893 -void mm_unpin(struct mm_struct *mm)
18895 - if (xen_feature(XENFEAT_writable_page_tables))
18898 - __pgd_unpin(mm->pgd);
18902 -void mm_pin_all(void)
18904 - struct page *page;
18905 - unsigned long flags;
18907 - if (xen_feature(XENFEAT_writable_page_tables))
18911 - * Allow uninterrupted access to the pgd_list. Also protects
18912 - * __pgd_pin() by disabling preemption.
18913 - * All other CPUs must be at a safe point (e.g., in stop_machine
18914 - * or offlined entirely).
18916 - spin_lock_irqsave(&pgd_lock, flags);
18917 - for (page = pgd_list; page; page = (struct page *)page->index) {
18918 - if (!PagePinned(page))
18919 - __pgd_pin((pgd_t *)page_address(page));
18921 - spin_unlock_irqrestore(&pgd_lock, flags);
18924 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
18926 - if (!PagePinned(virt_to_page(mm->pgd)))
18930 -void arch_exit_mmap(struct mm_struct *mm)
18932 - struct task_struct *tsk = current;
18937 - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
18938 - * *much* faster this way, as no tlb flushes means bigger wrpt batches.
18940 - if (tsk->active_mm == mm) {
18941 - tsk->active_mm = &init_mm;
18942 - atomic_inc(&init_mm.mm_count);
18944 - switch_mm(mm, &init_mm, tsk);
18946 - atomic_dec(&mm->mm_count);
18947 - BUG_ON(atomic_read(&mm->mm_count) == 0);
18950 - task_unlock(tsk);
18952 - if (PagePinned(virt_to_page(mm->pgd)) &&
18953 - (atomic_read(&mm->mm_count) == 1) &&
18954 - !mm->context.has_foreign_mappings)
18957 --- a/arch/x86/pci/irq-xen.c
18958 +++ b/arch/x86/pci/irq-xen.c
18959 @@ -204,6 +204,7 @@ static int pirq_ali_get(struct pci_dev *
18961 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
18963 + WARN_ON_ONCE(pirq >= 16);
18964 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
18967 @@ -211,7 +212,8 @@ static int pirq_ali_set(struct pci_dev *
18969 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
18970 unsigned int val = irqmap[irq];
18973 + WARN_ON_ONCE(pirq >= 16);
18975 write_config_nybble(router, 0x48, pirq-1, val);
18977 @@ -261,12 +263,16 @@ static int pirq_via_set(struct pci_dev *
18978 static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18980 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18982 + WARN_ON_ONCE(pirq >= 5);
18983 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
18986 static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18988 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18990 + WARN_ON_ONCE(pirq >= 5);
18991 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
18994 @@ -279,12 +285,16 @@ static int pirq_via586_set(struct pci_de
18995 static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18997 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18999 + WARN_ON_ONCE(pirq >= 4);
19000 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
19003 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19005 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
19007 + WARN_ON_ONCE(pirq >= 4);
19008 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
19011 @@ -423,6 +433,7 @@ static int pirq_sis_set(struct pci_dev *
19013 static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19015 + WARN_ON_ONCE(pirq >= 9);
19017 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
19019 @@ -432,6 +443,7 @@ static int pirq_vlsi_get(struct pci_dev
19021 static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19023 + WARN_ON_ONCE(pirq >= 9);
19025 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
19027 @@ -453,14 +465,14 @@ static int pirq_vlsi_set(struct pci_dev
19029 static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19031 - outb_p(pirq, 0xc00);
19032 + outb(pirq, 0xc00);
19033 return inb(0xc01) & 0xf;
19036 static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19038 - outb_p(pirq, 0xc00);
19039 - outb_p(irq, 0xc01);
19040 + outb(pirq, 0xc00);
19041 + outb(irq, 0xc01);
19045 @@ -575,6 +587,10 @@ static __init int intel_router_probe(str
19046 case PCI_DEVICE_ID_INTEL_ICH9_4:
19047 case PCI_DEVICE_ID_INTEL_ICH9_5:
19048 case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
19049 + case PCI_DEVICE_ID_INTEL_ICH10_0:
19050 + case PCI_DEVICE_ID_INTEL_ICH10_1:
19051 + case PCI_DEVICE_ID_INTEL_ICH10_2:
19052 + case PCI_DEVICE_ID_INTEL_ICH10_3:
19053 r->name = "PIIX/ICH";
19054 r->get = pirq_piix_get;
19055 r->set = pirq_piix_set;
19056 --- a/arch/x86/vdso/Makefile
19057 +++ b/arch/x86/vdso/Makefile
19058 @@ -66,6 +66,7 @@ vdso32.so-$(VDSO32-y) += int80
19059 vdso32.so-$(CONFIG_COMPAT) += syscall
19060 vdso32.so-$(VDSO32-y) += sysenter
19061 xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
19062 +xen-vdso32-$(CONFIG_X86_32) += syscall
19063 vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
19065 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
19066 --- a/arch/x86/vdso/vdso32.S
19067 +++ b/arch/x86/vdso/vdso32.S
19068 @@ -19,4 +19,16 @@ vdso32_sysenter_start:
19069 .incbin "arch/x86/vdso/vdso32-sysenter.so"
19070 vdso32_sysenter_end:
19072 +#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
19073 + .globl vdso32_int80_start, vdso32_int80_end
19074 +vdso32_int80_start:
19075 + .incbin "arch/x86/vdso/vdso32-int80.so"
19077 +#elif defined(CONFIG_X86_XEN)
19078 + .globl vdso32_syscall_start, vdso32_syscall_end
19079 +vdso32_syscall_start:
19080 + .incbin "arch/x86/vdso/vdso32-syscall.so"
19081 +vdso32_syscall_end:
19085 --- a/arch/x86/vdso/vdso32-setup.c
19086 +++ b/arch/x86/vdso/vdso32-setup.c
19088 #include <asm/vdso.h>
19089 #include <asm/proto.h>
19092 -#include <xen/interface/callback.h>
19098 @@ -229,7 +225,6 @@ static inline void map_compat_vdso(int m
19100 void enable_sep_cpu(void)
19102 -#ifndef CONFIG_XEN
19103 int cpu = get_cpu();
19104 struct tss_struct *tss = &per_cpu(init_tss, cpu);
19106 @@ -244,35 +239,6 @@ void enable_sep_cpu(void)
19107 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
19108 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
19111 - extern asmlinkage void ia32pv_sysenter_target(void);
19112 - static struct callback_register sysenter = {
19113 - .type = CALLBACKTYPE_sysenter,
19114 - .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
19117 - if (!boot_cpu_has(X86_FEATURE_SEP))
19122 - if (xen_feature(XENFEAT_supervisor_mode_kernel))
19123 - sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19125 - switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19128 -#if CONFIG_XEN_COMPAT < 0x030200
19130 - sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19131 - if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19135 - clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
19141 static struct vm_area_struct gate_vma;
19143 +++ b/arch/x86/vdso/vdso32-setup-xen.c
19146 + * (C) Copyright 2002 Linus Torvalds
19147 + * Portions based on the vdso-randomization code from exec-shield:
19148 + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
19150 + * This file contains the needed initializations to support sysenter.
19153 +#include <linux/init.h>
19154 +#include <linux/smp.h>
19155 +#include <linux/thread_info.h>
19156 +#include <linux/sched.h>
19157 +#include <linux/gfp.h>
19158 +#include <linux/string.h>
19159 +#include <linux/elf.h>
19160 +#include <linux/mm.h>
19161 +#include <linux/err.h>
19162 +#include <linux/module.h>
19164 +#include <asm/cpufeature.h>
19165 +#include <asm/msr.h>
19166 +#include <asm/pgtable.h>
19167 +#include <asm/unistd.h>
19168 +#include <asm/elf.h>
19169 +#include <asm/tlbflush.h>
19170 +#include <asm/vdso.h>
19171 +#include <asm/proto.h>
19173 +#include <xen/interface/callback.h>
19176 + VDSO_DISABLED = 0,
19177 + VDSO_ENABLED = 1,
19181 +#ifdef CONFIG_COMPAT_VDSO
19182 +#define VDSO_DEFAULT VDSO_COMPAT
19184 +#define VDSO_DEFAULT VDSO_ENABLED
19187 +#ifdef CONFIG_X86_64
19188 +#define vdso_enabled sysctl_vsyscall32
19189 +#define arch_setup_additional_pages syscall32_setup_pages
19193 + * This is the difference between the prelinked addresses in the vDSO images
19194 + * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
19195 + * in the user address space.
19197 +#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
19200 + * Should the kernel map a VDSO page into processes and pass its
19201 + * address down to glibc upon exec()?
19203 +unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
19205 +static int __init vdso_setup(char *s)
19207 + vdso_enabled = simple_strtoul(s, NULL, 0);
19213 + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
19214 + * behavior on both 64-bit and 32-bit kernels.
19215 + * On 32-bit kernels, vdso=[012] means the same thing.
19217 +__setup("vdso32=", vdso_setup);
19219 +#ifdef CONFIG_X86_32
19220 +__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
19222 +EXPORT_SYMBOL_GPL(vdso_enabled);
19225 +static __init void reloc_symtab(Elf32_Ehdr *ehdr,
19226 + unsigned offset, unsigned size)
19228 + Elf32_Sym *sym = (void *)ehdr + offset;
19229 + unsigned nsym = size / sizeof(*sym);
19232 + for(i = 0; i < nsym; i++, sym++) {
19233 + if (sym->st_shndx == SHN_UNDEF ||
19234 + sym->st_shndx == SHN_ABS)
19235 + continue; /* skip */
19237 + if (sym->st_shndx > SHN_LORESERVE) {
19238 + printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
19243 + switch(ELF_ST_TYPE(sym->st_info)) {
19246 + case STT_SECTION:
19248 + sym->st_value += VDSO_ADDR_ADJUST;
19253 +static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
19255 + Elf32_Dyn *dyn = (void *)ehdr + offset;
19257 + for(; dyn->d_tag != DT_NULL; dyn++)
19258 + switch(dyn->d_tag) {
19272 + case DT_ADDRRNGLO ... DT_ADDRRNGHI:
19273 + /* definitely pointers needing relocation */
19274 + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19277 + case DT_ENCODING ... OLD_DT_LOOS-1:
19278 + case DT_LOOS ... DT_HIOS-1:
19279 + /* Tags above DT_ENCODING are pointers if
19281 + if (dyn->d_tag >= DT_ENCODING &&
19282 + (dyn->d_tag & 1) == 0)
19283 + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19286 + case DT_VERDEFNUM:
19287 + case DT_VERNEEDNUM:
19289 + case DT_RELACOUNT:
19290 + case DT_RELCOUNT:
19291 + case DT_VALRNGLO ... DT_VALRNGHI:
19292 + /* definitely not pointers */
19295 + case OLD_DT_LOOS ... DT_LOOS-1:
19296 + case DT_HIOS ... DT_VALRNGLO-1:
19298 + if (dyn->d_tag > DT_ENCODING)
19299 + printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
19305 +static __init void relocate_vdso(Elf32_Ehdr *ehdr)
19307 + Elf32_Phdr *phdr;
19308 + Elf32_Shdr *shdr;
19311 + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
19312 + !elf_check_arch_ia32(ehdr) ||
19313 + ehdr->e_type != ET_DYN);
19315 + ehdr->e_entry += VDSO_ADDR_ADJUST;
19317 + /* rebase phdrs */
19318 + phdr = (void *)ehdr + ehdr->e_phoff;
19319 + for (i = 0; i < ehdr->e_phnum; i++) {
19320 + phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
19322 + /* relocate dynamic stuff */
19323 + if (phdr[i].p_type == PT_DYNAMIC)
19324 + reloc_dyn(ehdr, phdr[i].p_offset);
19327 + /* rebase sections */
19328 + shdr = (void *)ehdr + ehdr->e_shoff;
19329 + for(i = 0; i < ehdr->e_shnum; i++) {
19330 + if (!(shdr[i].sh_flags & SHF_ALLOC))
19333 + shdr[i].sh_addr += VDSO_ADDR_ADJUST;
19335 + if (shdr[i].sh_type == SHT_SYMTAB ||
19336 + shdr[i].sh_type == SHT_DYNSYM)
19337 + reloc_symtab(ehdr, shdr[i].sh_offset,
19338 + shdr[i].sh_size);
19343 + * These symbols are defined by vdso32.S to mark the bounds
19344 + * of the ELF DSO images included therein.
19346 +extern const char vdso32_default_start, vdso32_default_end;
19347 +extern const char vdso32_sysenter_start, vdso32_sysenter_end;
19348 +static struct page *vdso32_pages[1];
19350 +#ifdef CONFIG_X86_64
19352 +#if CONFIG_XEN_COMPAT < 0x030200
19353 +static int use_int80 = 1;
19355 +static int use_sysenter __read_mostly = -1;
19357 +#define vdso32_sysenter() (use_sysenter > 0)
19359 +/* May not be __init: called during resume */
19360 +void syscall32_cpu_init(void)
19362 + static const struct callback_register cstar = {
19363 + .type = CALLBACKTYPE_syscall32,
19364 + .address = (unsigned long)ia32_cstar_target
19366 + static const struct callback_register sysenter = {
19367 + .type = CALLBACKTYPE_sysenter,
19368 + .address = (unsigned long)ia32_sysenter_target
19371 + if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
19372 + (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
19373 +#if CONFIG_XEN_COMPAT < 0x030200
19380 + if (use_sysenter < 0)
19381 + use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
19384 +#define compat_uses_vma 1
19386 +static inline void map_compat_vdso(int map)
19390 +#else /* CONFIG_X86_32 */
19392 +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
19394 +extern asmlinkage void ia32pv_cstar_target(void);
19395 +static /*const*/ struct callback_register __cpuinitdata cstar = {
19396 + .type = CALLBACKTYPE_syscall32,
19397 + .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target },
19400 +void __cpuinit enable_sep_cpu(void)
19402 + extern asmlinkage void ia32pv_sysenter_target(void);
19403 + static struct callback_register __cpuinitdata sysenter = {
19404 + .type = CALLBACKTYPE_sysenter,
19405 + .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
19408 + if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19409 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
19414 + if (!boot_cpu_has(X86_FEATURE_SEP))
19417 + if (xen_feature(XENFEAT_supervisor_mode_kernel))
19418 + sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19420 + switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19423 +#if CONFIG_XEN_COMPAT < 0x030200
19425 + sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19426 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19430 + setup_clear_cpu_cap(X86_FEATURE_SEP);
19435 +static struct vm_area_struct gate_vma;
19437 +static int __init gate_vma_init(void)
19439 + gate_vma.vm_mm = NULL;
19440 + gate_vma.vm_start = FIXADDR_USER_START;
19441 + gate_vma.vm_end = FIXADDR_USER_END;
19442 + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
19443 + gate_vma.vm_page_prot = __P101;
19445 + * Make sure the vDSO gets into every core dump.
19446 + * Dumping its contents makes post-mortem fully interpretable later
19447 + * without matching up the same kernel and hardware config to see
19448 + * what PC values meant.
19450 + gate_vma.vm_flags |= VM_ALWAYSDUMP;
19454 +#define compat_uses_vma 0
19456 +static void map_compat_vdso(int map)
19458 + static int vdso_mapped;
19460 + if (map == vdso_mapped)
19463 + vdso_mapped = map;
19465 + __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
19466 + map ? PAGE_READONLY_EXEC : PAGE_NONE);
19468 + /* flush stray tlbs */
19472 +#endif /* CONFIG_X86_64 */
19474 +int __init sysenter_setup(void)
19476 + void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
19477 + const void *vsyscall;
19478 + size_t vsyscall_len;
19480 + vdso32_pages[0] = virt_to_page(syscall_page);
19482 +#ifdef CONFIG_X86_32
19485 + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
19488 +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
19490 + extern const char vdso32_int80_start, vdso32_int80_end;
19492 + vsyscall = &vdso32_int80_start;
19493 + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
19495 +#elif defined(CONFIG_X86_32)
19496 + if (boot_cpu_has(X86_FEATURE_SYSCALL)
19497 + && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
19498 + || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
19499 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
19500 + barrier(); /* until clear_bit()'s constraints are correct ... */
19501 + if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19502 + extern const char vdso32_syscall_start, vdso32_syscall_end;
19504 + vsyscall = &vdso32_syscall_start;
19505 + vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
19508 + if (!vdso32_sysenter()) {
19509 + vsyscall = &vdso32_default_start;
19510 + vsyscall_len = &vdso32_default_end - &vdso32_default_start;
19512 + vsyscall = &vdso32_sysenter_start;
19513 + vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
19516 + memcpy(syscall_page, vsyscall, vsyscall_len);
19517 + relocate_vdso(syscall_page);
19522 +/* Setup a VMA at program startup for the vsyscall page */
19523 +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
19525 + struct mm_struct *mm = current->mm;
19526 + unsigned long addr;
19530 + down_write(&mm->mmap_sem);
19532 + /* Test compat mode once here, in case someone
19533 + changes it via sysctl */
19534 + compat = (vdso_enabled == VDSO_COMPAT);
19536 + map_compat_vdso(compat);
19539 + addr = VDSO_HIGH_BASE;
19541 + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
19542 + if (IS_ERR_VALUE(addr)) {
19548 + if (compat_uses_vma || !compat) {
19550 + * MAYWRITE to allow gdb to COW and set breakpoints
19552 + * Make sure the vDSO gets into every core dump.
19553 + * Dumping its contents makes post-mortem fully
19554 + * interpretable later without matching up the same
19555 + * kernel and hardware config to see what PC values
19558 + ret = install_special_mapping(mm, addr, PAGE_SIZE,
19560 + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
19568 + current->mm->context.vdso = (void *)addr;
19569 + current_thread_info()->sysenter_return =
19570 + VDSO32_SYMBOL(addr, SYSENTER_RETURN);
19573 + up_write(&mm->mmap_sem);
19578 +#ifdef CONFIG_X86_64
19581 + * This must be done early in case we have an initrd containing 32-bit
19582 + * binaries (e.g., hotplug). This could be pushed upstream.
19584 +core_initcall(sysenter_setup);
19586 +#ifdef CONFIG_SYSCTL
19587 +/* Register vsyscall32 into the ABI table */
19588 +#include <linux/sysctl.h>
19590 +static ctl_table abi_table2[] = {
19592 + .procname = "vsyscall32",
19593 + .data = &sysctl_vsyscall32,
19594 + .maxlen = sizeof(int),
19596 + .proc_handler = proc_dointvec
19601 +static ctl_table abi_root_table2[] = {
19603 + .ctl_name = CTL_ABI,
19604 + .procname = "abi",
19606 + .child = abi_table2
19611 +static __init int ia32_binfmt_init(void)
19613 + register_sysctl_table(abi_root_table2);
19616 +__initcall(ia32_binfmt_init);
19619 +#else /* CONFIG_X86_32 */
19621 +const char *arch_vma_name(struct vm_area_struct *vma)
19623 + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
19628 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
19630 + struct mm_struct *mm = tsk->mm;
19632 + /* Check to see if this task was created in compat vdso mode */
19633 + if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
19634 + return &gate_vma;
19638 +int in_gate_area(struct task_struct *task, unsigned long addr)
19640 + const struct vm_area_struct *vma = get_gate_vma(task);
19642 + return vma && addr >= vma->vm_start && addr < vma->vm_end;
19645 +int in_gate_area_no_task(unsigned long addr)
19650 +#endif /* CONFIG_X86_64 */
19651 --- a/arch/x86/vdso/vdso32/syscall.S
19652 +++ b/arch/x86/vdso/vdso32/syscall.S
19653 @@ -19,8 +19,10 @@ __kernel_vsyscall:
19657 +#ifndef CONFIG_XEN
19658 movl $__USER32_DS, %ecx
19664 --- a/drivers/pci/msi-xen.c
19665 +++ b/drivers/pci/msi-xen.c
19666 @@ -43,6 +43,53 @@ struct msi_pirq_entry {
19672 +int __attribute__ ((weak))
19673 +arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
19678 +#ifndef CONFIG_XEN
19679 +int __attribute__ ((weak))
19680 +arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19685 +int __attribute__ ((weak))
19686 +arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19688 + struct msi_desc *entry;
19691 + list_for_each_entry(entry, &dev->msi_list, list) {
19692 + ret = arch_setup_msi_irq(dev, entry);
19700 +void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19705 +void __attribute__ ((weak))
19706 +arch_teardown_msi_irqs(struct pci_dev *dev)
19708 + struct msi_desc *entry;
19710 + list_for_each_entry(entry, &dev->msi_list, list) {
19711 + if (entry->irq != 0)
19712 + arch_teardown_msi_irq(entry->irq);
19717 static void msi_set_enable(struct pci_dev *dev, int enable)
19720 @@ -270,7 +317,6 @@ static void pci_intx_for_msi(struct pci_
19721 pci_intx(dev, enable);
19725 static void __pci_restore_msi_state(struct pci_dev *dev)
19728 @@ -328,7 +374,7 @@ void pci_restore_msi_state(struct pci_de
19729 __pci_restore_msi_state(dev);
19730 __pci_restore_msix_state(dev);
19732 -#endif /* CONFIG_PM */
19733 +EXPORT_SYMBOL_GPL(pci_restore_msi_state);
19736 * msi_capability_init - configure device's MSI capability structure
19737 @@ -760,51 +806,3 @@ void pci_msi_init_pci_dev(struct pci_dev
19738 INIT_LIST_HEAD(&dev->msi_list);
19745 -int __attribute__ ((weak))
19746 -arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
19751 -#ifndef CONFIG_XEN
19752 -int __attribute__ ((weak))
19753 -arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19758 -int __attribute__ ((weak))
19759 -arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19761 - struct msi_desc *entry;
19764 - list_for_each_entry(entry, &dev->msi_list, list) {
19765 - ret = arch_setup_msi_irq(dev, entry);
19773 -void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19778 -void __attribute__ ((weak))
19779 -arch_teardown_msi_irqs(struct pci_dev *dev)
19781 - struct msi_desc *entry;
19783 - list_for_each_entry(entry, &dev->msi_list, list) {
19784 - if (entry->irq != 0)
19785 - arch_teardown_msi_irq(entry->irq);
19789 --- a/drivers/pci/pci.c
19790 +++ b/drivers/pci/pci.c
19791 @@ -353,7 +353,12 @@ pci_find_parent_resource(const struct pc
19792 * Restore the BAR values for a given device, so as to make it
19793 * accessible by its driver.
19795 +#ifndef CONFIG_XEN
19798 +EXPORT_SYMBOL_GPL(pci_restore_bars);
19801 pci_restore_bars(struct pci_dev *dev)
19804 --- a/drivers/xen/balloon/sysfs.c
19805 +++ b/drivers/xen/balloon/sysfs.c
19806 @@ -108,7 +108,7 @@ static struct attribute_group balloon_in
19809 static struct sysdev_class balloon_sysdev_class = {
19810 - set_kset_name(BALLOON_CLASS_NAME),
19811 + .name = BALLOON_CLASS_NAME,
19814 static struct sys_device balloon_sysdev;
19815 --- a/drivers/xen/blkback/blkback.c
19816 +++ b/drivers/xen/blkback/blkback.c
19817 @@ -148,7 +148,7 @@ static void unplug_queue(blkif_t *blkif)
19819 if (blkif->plug->unplug_fn)
19820 blkif->plug->unplug_fn(blkif->plug);
19821 - blk_put_queue(blkif->plug);
19822 + kobject_put(&blkif->plug->kobj);
19823 blkif->plug = NULL;
19826 @@ -159,7 +159,8 @@ static void plug_queue(blkif_t *blkif, s
19827 if (q == blkif->plug)
19829 unplug_queue(blkif);
19830 - blk_get_queue(q);
19831 + WARN_ON(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags));
19832 + kobject_get(&q->kobj);
19836 --- a/drivers/xen/blkfront/blkfront.c
19837 +++ b/drivers/xen/blkfront/blkfront.c
19838 @@ -716,7 +716,6 @@ static irqreturn_t blkif_int(int irq, vo
19840 unsigned long flags;
19841 struct blkfront_info *info = (struct blkfront_info *)dev_id;
19844 spin_lock_irqsave(&blkif_io_lock, flags);
19846 @@ -741,13 +740,13 @@ static irqreturn_t blkif_int(int irq, vo
19848 ADD_ID_TO_FREELIST(info, id);
19850 - uptodate = (bret->status == BLKIF_RSP_OKAY);
19851 + ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
19852 switch (bret->operation) {
19853 case BLKIF_OP_WRITE_BARRIER:
19854 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
19855 printk("blkfront: %s: write barrier op failed\n",
19856 info->gd->disk_name);
19857 - uptodate = -EOPNOTSUPP;
19858 + ret = -EOPNOTSUPP;
19859 info->feature_barrier = 0;
19860 xlvbd_barrier(info);
19862 @@ -758,10 +757,8 @@ static irqreturn_t blkif_int(int irq, vo
19863 DPRINTK("Bad return from blkdev data "
19864 "request: %x\n", bret->status);
19866 - ret = end_that_request_first(req, uptodate,
19867 - req->hard_nr_sectors);
19868 + ret = __blk_end_request(req, ret, blk_rq_bytes(req));
19870 - end_that_request_last(req, uptodate);
19874 --- a/drivers/xen/blktap/blktap.c
19875 +++ b/drivers/xen/blktap/blktap.c
19876 @@ -327,8 +327,8 @@ static pte_t blktap_clear_pte(struct vm_
19877 * if vm_file is NULL (meaning mmap failed and we have nothing to do)
19879 if (uvaddr < uvstart || vma->vm_file == NULL)
19880 - return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
19881 - ptep, is_fullmm);
19882 + return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19885 info = vma->vm_file->private_data;
19886 map = vma->vm_private_data;
19887 @@ -375,8 +375,8 @@ static pte_t blktap_clear_pte(struct vm_
19888 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
19890 /* USING SHADOW PAGE TABLES. */
19891 - copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
19893 + copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19898 --- a/drivers/xen/core/evtchn.c
19899 +++ b/drivers/xen/core/evtchn.c
19900 @@ -193,7 +193,7 @@ static inline unsigned int cpu_from_evtc
19902 /* Upcall to generic IRQ layer. */
19904 -extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
19905 +extern unsigned int do_IRQ(struct pt_regs *regs);
19906 void __init xen_init_IRQ(void);
19907 void __init init_IRQ(void)
19909 @@ -202,13 +202,11 @@ void __init init_IRQ(void)
19911 #if defined (__i386__)
19912 static inline void exit_idle(void) {}
19913 -#define IRQ_REG orig_eax
19914 #elif defined (__x86_64__)
19915 #include <asm/idle.h>
19916 -#define IRQ_REG orig_rax
19918 #define do_IRQ(irq, regs) do { \
19919 - (regs)->IRQ_REG = ~(irq); \
19920 + (regs)->orig_ax = ~(irq); \
19924 @@ -669,13 +667,12 @@ static void set_affinity_irq(unsigned in
19925 int resend_irq_on_evtchn(unsigned int irq)
19927 int masked, evtchn = evtchn_from_irq(irq);
19928 - shared_info_t *s = HYPERVISOR_shared_info;
19930 if (!VALID_EVTCHN(evtchn))
19933 masked = test_and_set_evtchn_mask(evtchn);
19934 - synch_set_bit(evtchn, s->evtchn_pending);
19935 + set_evtchn(evtchn);
19937 unmask_evtchn(evtchn);
19939 @@ -968,6 +965,43 @@ void disable_all_local_evtchn(void)
19940 synch_set_bit(i, &s->evtchn_mask[0]);
19943 +/* Clear an irq's pending state, in preparation for polling on it. */
19944 +void xen_clear_irq_pending(int irq)
19946 + int evtchn = evtchn_from_irq(irq);
19948 + if (VALID_EVTCHN(evtchn))
19949 + clear_evtchn(evtchn);
19952 +/* Set an irq's pending state, to avoid blocking on it. */
19953 +void xen_set_irq_pending(int irq)
19955 + int evtchn = evtchn_from_irq(irq);
19957 + if (VALID_EVTCHN(evtchn))
19958 + set_evtchn(evtchn);
19961 +/* Test an irq's pending state. */
19962 +int xen_test_irq_pending(int irq)
19964 + int evtchn = evtchn_from_irq(irq);
19966 + return VALID_EVTCHN(evtchn) && test_evtchn(evtchn);
19969 +/* Poll waiting for an irq to become pending. In the usual case, the
19970 + irq will be disabled so it won't deliver an interrupt. */
19971 +void xen_poll_irq(int irq)
19973 + evtchn_port_t evtchn = evtchn_from_irq(irq);
19975 + if (VALID_EVTCHN(evtchn)
19976 + && HYPERVISOR_poll_no_timeout(&evtchn, 1))
19980 static void restore_cpu_virqs(unsigned int cpu)
19982 struct evtchn_bind_virq bind_virq;
19983 --- a/drivers/xen/core/hypervisor_sysfs.c
19984 +++ b/drivers/xen/core/hypervisor_sysfs.c
19985 @@ -50,7 +50,7 @@ static int __init hypervisor_subsys_init
19986 if (!is_running_on_xen())
19989 - hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
19990 + hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
19994 --- a/drivers/xen/core/Makefile
19995 +++ b/drivers/xen/core/Makefile
19996 @@ -10,5 +10,6 @@ obj-$(CONFIG_SYS_HYPERVISOR) += hypervis
19997 obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
19998 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
19999 obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
20000 +obj-$(CONFIG_X86_SMP) += spinlock.o
20001 obj-$(CONFIG_KEXEC) += machine_kexec.o
20002 obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
20003 --- a/drivers/xen/core/smpboot.c
20004 +++ b/drivers/xen/core/smpboot.c
20005 @@ -139,6 +139,10 @@ static int __cpuinit xen_smp_intr_init(u
20007 per_cpu(callfunc_irq, cpu) = rc;
20009 + rc = xen_spinlock_init(cpu);
20013 if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
20016 @@ -149,6 +153,7 @@ static int __cpuinit xen_smp_intr_init(u
20017 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
20018 if (per_cpu(callfunc_irq, cpu) >= 0)
20019 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
20020 + xen_spinlock_cleanup(cpu);
20024 @@ -160,6 +165,7 @@ static void xen_smp_intr_exit(unsigned i
20026 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
20027 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
20028 + xen_spinlock_cleanup(cpu);
20032 @@ -212,36 +218,25 @@ static void __cpuinit cpu_initialize_con
20033 smp_trap_init(ctxt.trap_ctxt);
20036 - ctxt.gdt_ents = GDT_SIZE / 8;
20039 ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
20040 + ctxt.gdt_ents = GDT_SIZE / 8;
20042 ctxt.user_regs.cs = __KERNEL_CS;
20043 - ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
20044 + ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
20046 ctxt.kernel_ss = __KERNEL_DS;
20047 - ctxt.kernel_sp = idle->thread.esp0;
20048 + ctxt.kernel_sp = idle->thread.sp0;
20050 - ctxt.event_callback_cs = __KERNEL_CS;
20051 ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
20052 - ctxt.failsafe_callback_cs = __KERNEL_CS;
20053 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
20055 + ctxt.event_callback_cs = __KERNEL_CS;
20056 + ctxt.failsafe_callback_cs = __KERNEL_CS;
20058 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
20060 ctxt.user_regs.fs = __KERNEL_PERCPU;
20061 #else /* __x86_64__ */
20062 - ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
20064 - ctxt.user_regs.cs = __KERNEL_CS;
20065 - ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
20067 - ctxt.kernel_ss = __KERNEL_DS;
20068 - ctxt.kernel_sp = idle->thread.rsp0;
20070 - ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
20071 - ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
20072 ctxt.syscall_callback_eip = (unsigned long)system_call;
20074 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
20076 +++ b/drivers/xen/core/spinlock.c
20079 + * Xen spinlock functions
20081 + * See arch/x86/xen/smp.c for copyright and credits for derived
20082 + * portions of this file.
20085 +#include <linux/init.h>
20086 +#include <linux/irq.h>
20087 +#include <linux/kernel.h>
20088 +#include <linux/kernel_stat.h>
20089 +#include <linux/module.h>
20090 +#include <xen/evtchn.h>
20092 +extern irqreturn_t smp_reschedule_interrupt(int, void *);
20094 +static DEFINE_PER_CPU(int, spinlock_irq) = -1;
20095 +static char spinlock_name[NR_CPUS][15];
20098 + raw_spinlock_t *lock;
20099 + unsigned int ticket;
20100 + struct spinning *prev;
20102 +static DEFINE_PER_CPU(struct spinning *, spinning);
20104 + * Protect removal of objects: Addition can be done lockless, and even
20105 + * removal itself doesn't need protection - what needs to be prevented is
20106 + * removed objects going out of scope (as they're allocated on the stack).
20108 +static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED;
20110 +int __cpuinit xen_spinlock_init(unsigned int cpu)
20114 + sprintf(spinlock_name[cpu], "spinlock%u", cpu);
20115 + rc = bind_ipi_to_irqhandler(SPIN_UNLOCK_VECTOR,
20117 + smp_reschedule_interrupt,
20118 + IRQF_DISABLED|IRQF_NOBALANCING,
20119 + spinlock_name[cpu],
20124 + disable_irq(rc); /* make sure it's never delivered */
20125 + per_cpu(spinlock_irq, cpu) = rc;
20130 +void __cpuinit xen_spinlock_cleanup(unsigned int cpu)
20132 + if (per_cpu(spinlock_irq, cpu) >= 0)
20133 + unbind_from_irqhandler(per_cpu(spinlock_irq, cpu), NULL);
20134 + per_cpu(spinlock_irq, cpu) = -1;
20137 +int xen_spin_wait(raw_spinlock_t *lock, unsigned int token)
20139 + int rc = 0, irq = __get_cpu_var(spinlock_irq);
20140 + raw_rwlock_t *rm_lock;
20141 + unsigned long flags;
20142 + struct spinning spinning;
20144 + /* If kicker interrupt not initialized yet, just spin. */
20145 + if (unlikely(irq < 0))
20148 + token >>= TICKET_SHIFT;
20150 + /* announce we're spinning */
20151 + spinning.ticket = token;
20152 + spinning.lock = lock;
20153 + spinning.prev = __get_cpu_var(spinning);
20155 + __get_cpu_var(spinning) = &spinning;
20157 + /* clear pending */
20158 + xen_clear_irq_pending(irq);
20161 + /* Check again to make sure it didn't become free while
20162 + * we weren't looking. */
20163 + if ((lock->slock & ((1U << TICKET_SHIFT) - 1)) == token) {
20164 + /* If we interrupted another spinlock while it was
20165 + * blocking, make sure it doesn't block (again)
20166 + * without rechecking the lock. */
20167 + if (spinning.prev)
20168 + xen_set_irq_pending(irq);
20173 + /* block until irq becomes pending */
20174 + xen_poll_irq(irq);
20175 + } while (!xen_test_irq_pending(irq));
20177 + /* Leave the irq pending so that any interrupted blocker will
20179 + kstat_this_cpu.irqs[irq] += !rc;
20181 + /* announce we're done */
20182 + __get_cpu_var(spinning) = spinning.prev;
20183 + rm_lock = &__get_cpu_var(spinning_rm_lock);
20184 + raw_local_irq_save(flags);
20185 + __raw_write_lock(rm_lock);
20186 + __raw_write_unlock(rm_lock);
20187 + raw_local_irq_restore(flags);
20191 +EXPORT_SYMBOL(xen_spin_wait);
20193 +unsigned int xen_spin_adjust(raw_spinlock_t *lock, unsigned int token)
20195 + return token;//todo
20197 +EXPORT_SYMBOL(xen_spin_adjust);
20199 +int xen_spin_wait_flags(raw_spinlock_t *lock, unsigned int *token,
20200 + unsigned int flags)
20202 + return xen_spin_wait(lock, *token);//todo
20204 +EXPORT_SYMBOL(xen_spin_wait_flags);
20206 +void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
20208 + unsigned int cpu;
20210 + token &= (1U << TICKET_SHIFT) - 1;
20211 + for_each_online_cpu(cpu) {
20212 + raw_rwlock_t *rm_lock;
20213 + unsigned long flags;
20214 + struct spinning *spinning;
20216 + if (cpu == raw_smp_processor_id())
20219 + rm_lock = &per_cpu(spinning_rm_lock, cpu);
20220 + raw_local_irq_save(flags);
20221 + __raw_read_lock(rm_lock);
20223 + spinning = per_cpu(spinning, cpu);
20226 + && (spinning->lock != lock || spinning->ticket != token))
20229 + __raw_read_unlock(rm_lock);
20230 + raw_local_irq_restore(flags);
20232 + if (unlikely(spinning)) {
20233 + notify_remote_via_irq(per_cpu(spinlock_irq, cpu));
20238 +EXPORT_SYMBOL(xen_spin_kick);
20239 --- a/drivers/xen/core/xen_sysfs.c
20240 +++ b/drivers/xen/core/xen_sysfs.c
20241 @@ -29,12 +29,12 @@ HYPERVISOR_ATTR_RO(type);
20243 static int __init xen_sysfs_type_init(void)
20245 - return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
20246 + return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
20249 static void xen_sysfs_type_destroy(void)
20251 - sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
20252 + sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
20255 /* xen version attributes */
20256 @@ -90,13 +90,12 @@ static struct attribute_group version_gr
20258 static int __init xen_sysfs_version_init(void)
20260 - return sysfs_create_group(&hypervisor_subsys.kobj,
20262 + return sysfs_create_group(hypervisor_kobj, &version_group);
20265 static void xen_sysfs_version_destroy(void)
20267 - sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
20268 + sysfs_remove_group(hypervisor_kobj, &version_group);
20272 @@ -126,12 +125,12 @@ HYPERVISOR_ATTR_RO(uuid);
20274 static int __init xen_sysfs_uuid_init(void)
20276 - return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20277 + return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
20280 static void xen_sysfs_uuid_destroy(void)
20282 - sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20283 + sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
20286 /* xen compilation attributes */
20287 @@ -204,14 +203,12 @@ static struct attribute_group xen_compil
20289 int __init static xen_compilation_init(void)
20291 - return sysfs_create_group(&hypervisor_subsys.kobj,
20292 - &xen_compilation_group);
20293 + return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
20296 static void xen_compilation_destroy(void)
20298 - sysfs_remove_group(&hypervisor_subsys.kobj,
20299 - &xen_compilation_group);
20300 + sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
20303 /* xen properties info */
20304 @@ -325,14 +322,12 @@ static struct attribute_group xen_proper
20306 static int __init xen_properties_init(void)
20308 - return sysfs_create_group(&hypervisor_subsys.kobj,
20309 - &xen_properties_group);
20310 + return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
20313 static void xen_properties_destroy(void)
20315 - sysfs_remove_group(&hypervisor_subsys.kobj,
20316 - &xen_properties_group);
20317 + sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
20320 #ifdef CONFIG_KEXEC
20321 @@ -350,13 +345,12 @@ HYPERVISOR_ATTR_RO(vmcoreinfo);
20323 static int __init xen_sysfs_vmcoreinfo_init(void)
20325 - return sysfs_create_file(&hypervisor_subsys.kobj,
20326 - &vmcoreinfo_attr.attr);
20327 + return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20330 static void xen_sysfs_vmcoreinfo_destroy(void)
20332 - sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr);
20333 + sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20337 --- a/drivers/xen/gntdev/gntdev.c
20338 +++ b/drivers/xen/gntdev/gntdev.c
20339 @@ -782,7 +782,7 @@ static pte_t gntdev_clear_pte(struct vm_
20342 /* USING SHADOW PAGE TABLES. */
20343 - copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20344 + copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20347 /* Finally, we unmap the grant from kernel space. */
20348 @@ -810,7 +810,7 @@ static pte_t gntdev_clear_pte(struct vm_
20349 >> PAGE_SHIFT, INVALID_P2M_ENTRY);
20352 - copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20353 + copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20357 --- a/drivers/xen/scsifront/scsifront.c
20358 +++ b/drivers/xen/scsifront/scsifront.c
20359 @@ -260,19 +260,19 @@ static int map_data_for_request(struct v
20363 - if (sc->use_sg) {
20364 + if (scsi_bufflen(sc)) {
20365 /* quoted scsi_lib.c/scsi_req_map_sg . */
20366 - struct scatterlist *sg, *sgl = (struct scatterlist *)sc->request_buffer;
20367 - unsigned int data_len = sc->request_bufflen;
20368 + struct scatterlist *sg, *sgl = scsi_sglist(sc);
20369 + unsigned int data_len = scsi_bufflen(sc);
20371 - nr_pages = (sc->request_bufflen + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20372 + nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20373 if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20374 printk(KERN_ERR "scsifront: Unable to map request_buffer for command!\n");
20375 ref_cnt = (-E2BIG);
20379 - for_each_sg (sgl, sg, sc->use_sg, i) {
20380 + for_each_sg (sgl, sg, scsi_sg_count(sc), i) {
20381 page = sg_page(sg);
20384 @@ -306,45 +306,6 @@ static int map_data_for_request(struct v
20388 - } else if (sc->request_bufflen) {
20389 - unsigned long end = ((unsigned long)sc->request_buffer
20390 - + sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
20391 - unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT;
20393 - page = virt_to_page(sc->request_buffer);
20394 - nr_pages = end - start;
20395 - len = sc->request_bufflen;
20397 - if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20398 - ref_cnt = (-E2BIG);
20402 - buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
20404 - off = offset_in_page((unsigned long)sc->request_buffer);
20405 - for (i = 0; i < nr_pages; i++) {
20406 - bytes = PAGE_SIZE - off;
20411 - ref = gnttab_claim_grant_reference(&gref_head);
20412 - BUG_ON(ref == -ENOSPC);
20414 - gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
20415 - buffer_pfn, write);
20417 - info->shadow[id].gref[i] = ref;
20418 - ring_req->seg[i].gref = ref;
20419 - ring_req->seg[i].offset = (uint16_t)off;
20420 - ring_req->seg[i].length = (uint16_t)bytes;
20430 --- a/drivers/xen/xenoprof/xenoprofile.c
20431 +++ b/drivers/xen/xenoprof/xenoprofile.c
20432 @@ -79,7 +79,7 @@ static int xenoprof_resume(struct sys_de
20435 static struct sysdev_class oprofile_sysclass = {
20436 - set_kset_name("oprofile"),
20437 + .name = "oprofile",
20438 .resume = xenoprof_resume,
20439 .suspend = xenoprof_suspend
20441 --- a/include/asm-x86/mach-xen/asm/agp.h
20442 +++ b/include/asm-x86/mach-xen/asm/agp.h
20443 @@ -13,18 +13,13 @@
20444 * page. This avoids data corruption on some CPUs.
20448 - * Caller's responsibility to call global_flush_tlb() for performance
20451 #define map_page_into_agp(page) ( \
20452 xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
20453 - ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
20454 + ?: set_pages_uc(page, 1))
20455 #define unmap_page_from_agp(page) ( \
20456 xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
20457 /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
20458 - change_page_attr(page, 1, PAGE_KERNEL))
20459 -#define flush_agp_mappings() global_flush_tlb()
20460 + set_pages_wb(page, 1))
20463 * Could use CLFLUSH here if the cpu supports it. But then it would
20464 --- a/include/asm-x86/mach-xen/asm/desc_32.h
20467 -#ifndef __ARCH_DESC_H
20468 -#define __ARCH_DESC_H
20470 -#include <asm/ldt.h>
20471 -#include <asm/segment.h>
20473 -#ifndef __ASSEMBLY__
20475 -#include <linux/preempt.h>
20476 -#include <linux/smp.h>
20478 -#include <asm/mmu.h>
20480 -struct Xgt_desc_struct {
20481 - unsigned short size;
20482 - unsigned long address __attribute__((packed));
20483 - unsigned short pad;
20484 -} __attribute__ ((packed));
20488 - struct desc_struct gdt[GDT_ENTRIES];
20489 -} __attribute__((aligned(PAGE_SIZE)));
20490 -DECLARE_PER_CPU(struct gdt_page, gdt_page);
20492 -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
20494 - return per_cpu(gdt_page, cpu).gdt;
20497 -extern struct Xgt_desc_struct idt_descr;
20498 -extern struct desc_struct idt_table[];
20499 -extern void set_intr_gate(unsigned int irq, void * addr);
20501 -static inline void pack_descriptor(__u32 *a, __u32 *b,
20502 - unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
20504 - *a = ((base & 0xffff) << 16) | (limit & 0xffff);
20505 - *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
20506 - (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
20509 -static inline void pack_gate(__u32 *a, __u32 *b,
20510 - unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
20512 - *a = (seg << 16) | (base & 0xffff);
20513 - *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
20516 -#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
20517 -#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
20518 -#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
20519 -#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
20520 -#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
20521 -#define DESCTYPE_DPL3 0x60 /* DPL-3 */
20522 -#define DESCTYPE_S 0x10 /* !system */
20524 -#ifndef CONFIG_XEN
20525 -#define load_TR_desc() native_load_tr_desc()
20526 -#define load_gdt(dtr) native_load_gdt(dtr)
20527 -#define load_idt(dtr) native_load_idt(dtr)
20528 -#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
20529 -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
20531 -#define store_gdt(dtr) native_store_gdt(dtr)
20532 -#define store_idt(dtr) native_store_idt(dtr)
20533 -#define store_tr(tr) (tr = native_store_tr())
20534 -#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
20536 -#define load_TLS(t, cpu) native_load_tls(t, cpu)
20537 -#define set_ldt native_set_ldt
20539 -#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20540 -#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20541 -#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20543 -static inline void write_dt_entry(struct desc_struct *dt,
20544 - int entry, u32 entry_low, u32 entry_high)
20546 - dt[entry].a = entry_low;
20547 - dt[entry].b = entry_high;
20550 -static inline void native_set_ldt(const void *addr, unsigned int entries)
20552 - if (likely(entries == 0))
20553 - __asm__ __volatile__("lldt %w0"::"q" (0));
20555 - unsigned cpu = smp_processor_id();
20558 - pack_descriptor(&a, &b, (unsigned long)addr,
20559 - entries * sizeof(struct desc_struct) - 1,
20560 - DESCTYPE_LDT, 0);
20561 - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
20562 - __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
20567 -static inline void native_load_tr_desc(void)
20569 - asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
20572 -static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
20574 - asm volatile("lgdt %0"::"m" (*dtr));
20577 -static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
20579 - asm volatile("lidt %0"::"m" (*dtr));
20582 -static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
20584 - asm ("sgdt %0":"=m" (*dtr));
20587 -static inline void native_store_idt(struct Xgt_desc_struct *dtr)
20589 - asm ("sidt %0":"=m" (*dtr));
20592 -static inline unsigned long native_store_tr(void)
20594 - unsigned long tr;
20595 - asm ("str %0":"=r" (tr));
20599 -static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
20602 - struct desc_struct *gdt = get_cpu_gdt_table(cpu);
20604 - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20605 - gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
20608 -#define load_TLS(t, cpu) xen_load_tls(t, cpu)
20609 -#define set_ldt xen_set_ldt
20611 -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
20612 -extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
20614 -static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
20617 - struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
20619 - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20620 - if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20621 - *(u64 *)&t->tls_array[i]))
20626 -#ifndef CONFIG_X86_NO_IDT
20627 -static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
20630 - pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
20631 - write_idt_entry(idt_table, gate, a, b);
20635 -#ifndef CONFIG_X86_NO_TSS
20636 -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
20639 - pack_descriptor(&a, &b, (unsigned long)addr,
20640 - offsetof(struct tss_struct, __cacheline_filler) - 1,
20641 - DESCTYPE_TSS, 0);
20642 - write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
20647 -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
20649 -#define LDT_entry_a(info) \
20650 - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
20652 -#define LDT_entry_b(info) \
20653 - (((info)->base_addr & 0xff000000) | \
20654 - (((info)->base_addr & 0x00ff0000) >> 16) | \
20655 - ((info)->limit & 0xf0000) | \
20656 - (((info)->read_exec_only ^ 1) << 9) | \
20657 - ((info)->contents << 10) | \
20658 - (((info)->seg_not_present ^ 1) << 15) | \
20659 - ((info)->seg_32bit << 22) | \
20660 - ((info)->limit_in_pages << 23) | \
20661 - ((info)->useable << 20) | \
20664 -#define LDT_empty(info) (\
20665 - (info)->base_addr == 0 && \
20666 - (info)->limit == 0 && \
20667 - (info)->contents == 0 && \
20668 - (info)->read_exec_only == 1 && \
20669 - (info)->seg_32bit == 0 && \
20670 - (info)->limit_in_pages == 0 && \
20671 - (info)->seg_not_present == 1 && \
20672 - (info)->useable == 0 )
20674 -static inline void clear_LDT(void)
20676 - set_ldt(NULL, 0);
20680 - * load one particular LDT into the current CPU
20682 -static inline void load_LDT_nolock(mm_context_t *pc)
20684 - set_ldt(pc->ldt, pc->size);
20687 -static inline void load_LDT(mm_context_t *pc)
20689 - preempt_disable();
20690 - load_LDT_nolock(pc);
20691 - preempt_enable();
20694 -static inline unsigned long get_desc_base(unsigned long *desc)
20696 - unsigned long base;
20697 - base = ((desc[0] >> 16) & 0x0000ffff) |
20698 - ((desc[1] << 16) & 0x00ff0000) |
20699 - (desc[1] & 0xff000000);
20703 -#else /* __ASSEMBLY__ */
20706 - * GET_DESC_BASE reads the descriptor base of the specified segment.
20709 - * idx - descriptor index
20710 - * gdt - GDT pointer
20711 - * base - 32bit register to which the base will be written
20712 - * lo_w - lo word of the "base" register
20713 - * lo_b - lo byte of the "base" register
20714 - * hi_b - hi byte of the low word of the "base" register
20717 - * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
20718 - * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
20720 -#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
20721 - movb idx*8+4(gdt), lo_b; \
20722 - movb idx*8+7(gdt), hi_b; \
20723 - shll $16, base; \
20724 - movw idx*8+2(gdt), lo_w;
20726 -#endif /* !__ASSEMBLY__ */
20729 --- a/include/asm-x86/mach-xen/asm/desc_64.h
20732 -/* Written 2000 by Andi Kleen */
20733 -#ifndef __ARCH_DESC_H
20734 -#define __ARCH_DESC_H
20736 -#include <linux/threads.h>
20737 -#include <asm/ldt.h>
20739 -#ifndef __ASSEMBLY__
20741 -#include <linux/string.h>
20742 -#include <linux/smp.h>
20743 -#include <asm/desc_defs.h>
20745 -#include <asm/segment.h>
20746 -#include <asm/mmu.h>
20748 -extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
20750 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
20752 -#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
20753 -#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
20755 -static inline void clear_LDT(void)
20757 - int cpu = get_cpu();
20760 - * NB. We load the default_ldt for lcall7/27 handling on demand, as
20761 - * it slows down context switching. Noone uses it anyway.
20763 - cpu = cpu; /* XXX avoid compiler warning */
20764 - xen_set_ldt(NULL, 0);
20768 -#ifndef CONFIG_X86_NO_TSS
20769 -static inline unsigned long __store_tr(void)
20771 - unsigned long tr;
20773 - asm volatile ("str %w0":"=r" (tr));
20777 -#define store_tr(tr) (tr) = __store_tr()
20781 - * This is the ldt that every process will get unless we need
20782 - * something other than this.
20784 -extern struct desc_struct default_ldt[];
20785 -#ifndef CONFIG_X86_NO_IDT
20786 -extern struct gate_struct idt_table[];
20788 -extern struct desc_ptr cpu_gdt_descr[];
20790 -/* the cpu gdt accessor */
20791 -#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
20793 -#ifndef CONFIG_XEN
20794 -static inline void load_gdt(const struct desc_ptr *ptr)
20796 - asm volatile("lgdt %w0"::"m" (*ptr));
20799 -static inline void store_gdt(struct desc_ptr *ptr)
20801 - asm("sgdt %w0":"=m" (*ptr));
20805 -static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
20807 - struct gate_struct s;
20808 - s.offset_low = PTR_LOW(func);
20809 - s.segment = __KERNEL_CS;
20816 - s.offset_middle = PTR_MIDDLE(func);
20817 - s.offset_high = PTR_HIGH(func);
20818 - /* does not need to be atomic because it is only done once at setup time */
20819 - memcpy(adr, &s, 16);
20822 -#ifndef CONFIG_X86_NO_IDT
20823 -static inline void set_intr_gate(int nr, void *func)
20825 - BUG_ON((unsigned)nr > 0xFF);
20826 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
20829 -static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
20831 - BUG_ON((unsigned)nr > 0xFF);
20832 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
20835 -static inline void set_system_gate(int nr, void *func)
20837 - BUG_ON((unsigned)nr > 0xFF);
20838 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
20841 -static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
20843 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
20846 -static inline void load_idt(const struct desc_ptr *ptr)
20848 - asm volatile("lidt %w0"::"m" (*ptr));
20851 -static inline void store_idt(struct desc_ptr *dtr)
20853 - asm("sidt %w0":"=m" (*dtr));
20857 -static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
20860 - struct ldttss_desc d;
20861 - memset(&d,0,sizeof(d));
20862 - d.limit0 = size & 0xFFFF;
20863 - d.base0 = PTR_LOW(tss);
20864 - d.base1 = PTR_MIDDLE(tss) & 0xFF;
20867 - d.limit1 = (size >> 16) & 0xF;
20868 - d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
20869 - d.base3 = PTR_HIGH(tss);
20870 - memcpy(ptr, &d, 16);
20873 -#ifndef CONFIG_X86_NO_TSS
20874 -static inline void set_tss_desc(unsigned cpu, void *addr)
20877 - * sizeof(unsigned long) coming from an extra "long" at the end
20878 - * of the iobitmap. See tss_struct definition in processor.h
20880 - * -1? seg base+limit should be pointing to the address of the
20881 - * last valid byte
20883 - set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
20884 - (unsigned long)addr, DESC_TSS,
20885 - IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
20889 -static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
20891 - set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
20892 - DESC_LDT, size * 8 - 1);
20895 -#define LDT_entry_a(info) \
20896 - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
20897 -/* Don't allow setting of the lm bit. It is useless anyways because
20898 - 64bit system calls require __USER_CS. */
20899 -#define LDT_entry_b(info) \
20900 - (((info)->base_addr & 0xff000000) | \
20901 - (((info)->base_addr & 0x00ff0000) >> 16) | \
20902 - ((info)->limit & 0xf0000) | \
20903 - (((info)->read_exec_only ^ 1) << 9) | \
20904 - ((info)->contents << 10) | \
20905 - (((info)->seg_not_present ^ 1) << 15) | \
20906 - ((info)->seg_32bit << 22) | \
20907 - ((info)->limit_in_pages << 23) | \
20908 - ((info)->useable << 20) | \
20909 - /* ((info)->lm << 21) | */ \
20912 -#define LDT_empty(info) (\
20913 - (info)->base_addr == 0 && \
20914 - (info)->limit == 0 && \
20915 - (info)->contents == 0 && \
20916 - (info)->read_exec_only == 1 && \
20917 - (info)->seg_32bit == 0 && \
20918 - (info)->limit_in_pages == 0 && \
20919 - (info)->seg_not_present == 1 && \
20920 - (info)->useable == 0 && \
20923 -static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
20926 - u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
20928 - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20929 - if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20930 - t->tls_array[i]))
20935 - * load one particular LDT into the current CPU
20937 -static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
20939 - void *segments = pc->ldt;
20940 - int count = pc->size;
20942 - if (likely(!count))
20945 - xen_set_ldt(segments, count);
20948 -static inline void load_LDT(mm_context_t *pc)
20950 - int cpu = get_cpu();
20951 - load_LDT_nolock(pc, cpu);
20955 -extern struct desc_ptr idt_descr;
20957 -#endif /* !__ASSEMBLY__ */
20960 --- a/include/asm-x86/mach-xen/asm/desc.h
20961 +++ b/include/asm-x86/mach-xen/asm/desc.h
20963 +#ifndef _ASM_DESC_H_
20964 +#define _ASM_DESC_H_
20966 +#ifndef __ASSEMBLY__
20967 +#include <asm/desc_defs.h>
20968 +#include <asm/ldt.h>
20969 +#include <asm/mmu.h>
20970 +#include <linux/smp.h>
20972 +static inline void fill_ldt(struct desc_struct *desc,
20973 + const struct user_desc *info)
20975 + desc->limit0 = info->limit & 0x0ffff;
20976 + desc->base0 = info->base_addr & 0x0000ffff;
20978 + desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
20979 + desc->type = (info->read_exec_only ^ 1) << 1;
20980 + desc->type |= info->contents << 2;
20983 + desc->p = info->seg_not_present ^ 1;
20984 + desc->limit = (info->limit & 0xf0000) >> 16;
20985 + desc->avl = info->useable;
20986 + desc->d = info->seg_32bit;
20987 + desc->g = info->limit_in_pages;
20988 + desc->base2 = (info->base_addr & 0xff000000) >> 24;
20991 +#ifndef CONFIG_X86_NO_IDT
20992 +extern struct desc_ptr idt_descr;
20993 +extern gate_desc idt_table[];
20996 +#ifdef CONFIG_X86_64
20997 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
20998 +extern struct desc_ptr cpu_gdt_descr[];
20999 +/* the cpu gdt accessor */
21000 +#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
21002 +static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
21003 + unsigned dpl, unsigned ist, unsigned seg)
21005 + gate->offset_low = PTR_LOW(func);
21006 + gate->segment = __KERNEL_CS;
21012 + gate->type = type;
21013 + gate->offset_middle = PTR_MIDDLE(func);
21014 + gate->offset_high = PTR_HIGH(func);
21019 + struct desc_struct gdt[GDT_ENTRIES];
21020 +} __attribute__((aligned(PAGE_SIZE)));
21021 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
21023 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
21025 + return per_cpu(gdt_page, cpu).gdt;
21028 +static inline void pack_gate(gate_desc *gate, unsigned char type,
21029 + unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
21032 + gate->a = (seg << 16) | (base & 0xffff);
21033 + gate->b = (base & 0xffff0000) |
21034 + (((0x80 | type | (dpl << 5)) & 0xff) << 8);
21039 +static inline int desc_empty(const void *ptr)
21041 + const u32 *desc = ptr;
21042 + return !(desc[0] | desc[1]);
21045 +#ifndef CONFIG_XEN
21046 +#define load_TR_desc() native_load_tr_desc()
21047 +#define load_gdt(dtr) native_load_gdt(dtr)
21048 +#define load_idt(dtr) native_load_idt(dtr)
21049 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
21050 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
21052 +#define store_gdt(dtr) native_store_gdt(dtr)
21053 +#define store_idt(dtr) native_store_idt(dtr)
21054 +#define store_tr(tr) (tr = native_store_tr())
21055 +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
21057 +#define load_TLS(t, cpu) native_load_tls(t, cpu)
21058 +#define set_ldt native_set_ldt
21060 +#define write_ldt_entry(dt, entry, desc) \
21061 + native_write_ldt_entry(dt, entry, desc)
21062 +#define write_gdt_entry(dt, entry, desc, type) \
21063 + native_write_gdt_entry(dt, entry, desc, type)
21064 +#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
21066 +static inline void native_write_idt_entry(gate_desc *idt, int entry,
21067 + const gate_desc *gate)
21069 + memcpy(&idt[entry], gate, sizeof(*gate));
21072 +static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
21073 + const void *desc)
21075 + memcpy(&ldt[entry], desc, 8);
21078 +static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
21079 + const void *desc, int type)
21081 + unsigned int size;
21084 + size = sizeof(tss_desc);
21087 + size = sizeof(ldt_desc);
21090 + size = sizeof(struct desc_struct);
21093 + memcpy(&gdt[entry], desc, size);
21097 +static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
21098 + unsigned long limit, unsigned char type,
21099 + unsigned char flags)
21101 + desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
21102 + desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
21103 + (limit & 0x000f0000) | ((type & 0xff) << 8) |
21104 + ((flags & 0xf) << 20);
21109 +#ifndef CONFIG_XEN
21110 +static inline void set_tssldt_descriptor(void *d, unsigned long addr,
21111 + unsigned type, unsigned size)
21113 +#ifdef CONFIG_X86_64
21114 + struct ldttss_desc64 *desc = d;
21115 + memset(desc, 0, sizeof(*desc));
21116 + desc->limit0 = size & 0xFFFF;
21117 + desc->base0 = PTR_LOW(addr);
21118 + desc->base1 = PTR_MIDDLE(addr) & 0xFF;
21119 + desc->type = type;
21121 + desc->limit1 = (size >> 16) & 0xF;
21122 + desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
21123 + desc->base3 = PTR_HIGH(addr);
21126 + pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
21130 +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
21132 + struct desc_struct *d = get_cpu_gdt_table(cpu);
21136 + * sizeof(unsigned long) coming from an extra "long" at the end
21137 + * of the iobitmap. See tss_struct definition in processor.h
21139 + * -1? seg base+limit should be pointing to the address of the
21140 + * last valid byte
21142 + set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
21143 + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
21144 + write_gdt_entry(d, entry, &tss, DESC_TSS);
21147 +#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
21149 +static inline void native_set_ldt(const void *addr, unsigned int entries)
21151 + if (likely(entries == 0))
21152 + __asm__ __volatile__("lldt %w0"::"q" (0));
21154 + unsigned cpu = smp_processor_id();
21157 + set_tssldt_descriptor(&ldt, (unsigned long)addr,
21158 + DESC_LDT, entries * sizeof(ldt) - 1);
21159 + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
21161 + __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
21165 +static inline void native_load_tr_desc(void)
21167 + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
21170 +static inline void native_load_gdt(const struct desc_ptr *dtr)
21172 + asm volatile("lgdt %0"::"m" (*dtr));
21175 +static inline void native_load_idt(const struct desc_ptr *dtr)
21177 + asm volatile("lidt %0"::"m" (*dtr));
21180 +static inline void native_store_gdt(struct desc_ptr *dtr)
21182 + asm volatile("sgdt %0":"=m" (*dtr));
21185 +static inline void native_store_idt(struct desc_ptr *dtr)
21187 + asm volatile("sidt %0":"=m" (*dtr));
21190 +static inline unsigned long native_store_tr(void)
21192 + unsigned long tr;
21193 + asm volatile("str %0":"=r" (tr));
21197 +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
21200 + struct desc_struct *gdt = get_cpu_gdt_table(cpu);
21202 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
21203 + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
21206 +#define load_TLS(t, cpu) xen_load_tls(t, cpu)
21207 +#define set_ldt xen_set_ldt
21209 +extern int write_ldt_entry(struct desc_struct *ldt, int entry,
21210 + const void *desc);
21211 +extern int write_gdt_entry(struct desc_struct *gdt, int entry,
21212 + const void *desc, int type);
21214 +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
21217 + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
21219 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
21220 + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
21221 + *(u64 *)&t->tls_array[i]))
21226 +#define _LDT_empty(info) (\
21227 + (info)->base_addr == 0 && \
21228 + (info)->limit == 0 && \
21229 + (info)->contents == 0 && \
21230 + (info)->read_exec_only == 1 && \
21231 + (info)->seg_32bit == 0 && \
21232 + (info)->limit_in_pages == 0 && \
21233 + (info)->seg_not_present == 1 && \
21234 + (info)->useable == 0)
21236 +#ifdef CONFIG_X86_64
21237 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
21239 +#define LDT_empty(info) (_LDT_empty(info))
21242 +static inline void clear_LDT(void)
21244 + set_ldt(NULL, 0);
21248 + * load one particular LDT into the current CPU
21250 +static inline void load_LDT_nolock(mm_context_t *pc)
21252 + set_ldt(pc->ldt, pc->size);
21255 +static inline void load_LDT(mm_context_t *pc)
21257 + preempt_disable();
21258 + load_LDT_nolock(pc);
21259 + preempt_enable();
21262 +static inline unsigned long get_desc_base(const struct desc_struct *desc)
21264 + return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
21267 +static inline unsigned long get_desc_limit(const struct desc_struct *desc)
21269 + return desc->limit0 | (desc->limit << 16);
21272 +#ifndef CONFIG_X86_NO_IDT
21273 +static inline void _set_gate(int gate, unsigned type, void *addr,
21274 + unsigned dpl, unsigned ist, unsigned seg)
21277 + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
21279 + * does not need to be atomic because it is only done once at
21282 + write_idt_entry(idt_table, gate, &s);
21286 + * This needs to use 'idt_table' rather than 'idt', and
21287 + * thus use the _nonmapped_ version of the IDT, as the
21288 + * Pentium F0 0F bugfix can have resulted in the mapped
21289 + * IDT being write-protected.
21291 +static inline void set_intr_gate(unsigned int n, void *addr)
21293 + BUG_ON((unsigned)n > 0xFF);
21294 + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
21298 + * This routine sets up an interrupt gate at descriptor privilege level 3.
21300 +static inline void set_system_intr_gate(unsigned int n, void *addr)
21302 + BUG_ON((unsigned)n > 0xFF);
21303 + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
21306 +static inline void set_trap_gate(unsigned int n, void *addr)
21308 + BUG_ON((unsigned)n > 0xFF);
21309 + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
21312 +static inline void set_system_gate(unsigned int n, void *addr)
21314 + BUG_ON((unsigned)n > 0xFF);
21315 #ifdef CONFIG_X86_32
21316 -# include "desc_32.h"
21317 + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
21319 + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
21323 +static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
21325 + BUG_ON((unsigned)n > 0xFF);
21326 + _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
21329 +static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
21331 + BUG_ON((unsigned)n > 0xFF);
21332 + _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
21335 +static inline void set_system_gate_ist(int n, void *addr, unsigned ist)
21337 + BUG_ON((unsigned)n > 0xFF);
21338 + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
21343 -# include "desc_64.h"
21345 + * GET_DESC_BASE reads the descriptor base of the specified segment.
21348 + * idx - descriptor index
21349 + * gdt - GDT pointer
21350 + * base - 32bit register to which the base will be written
21351 + * lo_w - lo word of the "base" register
21352 + * lo_b - lo byte of the "base" register
21353 + * hi_b - hi byte of the low word of the "base" register
21356 + * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
21357 + * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
21359 +#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
21360 + movb idx*8+4(gdt), lo_b; \
21361 + movb idx*8+7(gdt), hi_b; \
21362 + shll $16, base; \
21363 + movw idx*8+2(gdt), lo_w;
21366 +#endif /* __ASSEMBLY__ */
21369 --- a/include/asm-x86/mach-xen/asm/dma-mapping_32.h
21370 +++ b/include/asm-x86/mach-xen/asm/dma-mapping_32.h
21371 @@ -84,23 +84,13 @@ dma_sync_single_range_for_device(struct
21372 dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
21375 -static inline void
21377 dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
21378 - enum dma_data_direction direction)
21381 - swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
21382 - flush_write_buffers();
21384 + enum dma_data_direction direction);
21386 -static inline void
21388 dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
21389 - enum dma_data_direction direction)
21392 - swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
21393 - flush_write_buffers();
21395 + enum dma_data_direction direction);
21398 dma_mapping_error(dma_addr_t dma_addr);
21399 --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
21400 +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
21401 @@ -64,7 +64,7 @@ enum fixed_addresses {
21403 #ifdef CONFIG_X86_VISWS_APIC
21404 FIX_CO_CPU, /* Cobalt timer */
21405 - FIX_CO_APIC, /* Cobalt APIC Redirection Table */
21406 + FIX_CO_APIC, /* Cobalt APIC Redirection Table */
21407 FIX_LI_PCIA, /* Lithium PCI Bridge A */
21408 FIX_LI_PCIB, /* Lithium PCI Bridge B */
21410 @@ -73,7 +73,7 @@ enum fixed_addresses {
21412 #ifdef CONFIG_X86_CYCLONE_TIMER
21413 FIX_CYCLONE_TIMER, /*cyclone timer register*/
21416 #ifdef CONFIG_HIGHMEM
21417 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
21418 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
21419 @@ -93,11 +93,23 @@ enum fixed_addresses {
21421 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21422 __end_of_permanent_fixed_addresses,
21423 - /* temporary boot-time mappings, used before ioremap() is functional */
21424 -#define NR_FIX_BTMAPS 16
21425 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21426 - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21428 + * 256 temporary boot-time mappings, used by early_ioremap(),
21429 + * before ioremap() is functional.
21431 + * We round it up to the next 512-page boundary so that we
21432 + * can have a single pgd entry and a single pte table:
21434 +#define NR_FIX_BTMAPS 64
21435 +#define FIX_BTMAPS_NESTING 4
21437 + __end_of_permanent_fixed_addresses + 512 -
21438 + (__end_of_permanent_fixed_addresses & 511),
21439 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21441 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21442 + FIX_OHCI1394_BASE,
21444 __end_of_fixed_addresses
21447 --- a/include/asm-x86/mach-xen/asm/fixmap_64.h
21448 +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
21450 #include <asm/apicdef.h>
21451 #include <asm/page.h>
21452 #include <asm/vsyscall.h>
21453 +#include <asm/efi.h>
21454 #include <asm/acpi.h>
21457 @@ -46,6 +47,10 @@ enum fixed_addresses {
21458 FIX_IO_APIC_BASE_0,
21459 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
21462 + FIX_EFI_IO_MAP_LAST_PAGE,
21463 + FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
21467 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
21468 @@ -55,10 +60,22 @@ enum fixed_addresses {
21470 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21471 __end_of_permanent_fixed_addresses,
21472 - /* temporary boot-time mappings, used before ioremap() is functional */
21473 -#define NR_FIX_BTMAPS 16
21474 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21475 - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21477 + * 256 temporary boot-time mappings, used by early_ioremap(),
21478 + * before ioremap() is functional.
21480 + * We round it up to the next 512-page boundary so that we
21481 + * can have a single pgd entry and a single pte table:
21483 +#define NR_FIX_BTMAPS 64
21484 +#define FIX_BTMAPS_NESTING 4
21486 + __end_of_permanent_fixed_addresses + 512 -
21487 + (__end_of_permanent_fixed_addresses & 511),
21488 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21489 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21490 + FIX_OHCI1394_BASE,
21492 __end_of_fixed_addresses
21495 --- a/include/asm-x86/mach-xen/asm/highmem.h
21496 +++ b/include/asm-x86/mach-xen/asm/highmem.h
21497 @@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table;
21498 * easily, subsequent pte tables have to be allocated in one physical
21501 -#ifdef CONFIG_X86_PAE
21502 -#define LAST_PKMAP 512
21504 -#define LAST_PKMAP 1024
21509 @@ -57,13 +52,12 @@ extern pte_t *pkmap_page_table;
21513 -#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
21514 #define LAST_PKMAP_MASK (LAST_PKMAP-1)
21515 #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
21516 #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
21518 -extern void * FASTCALL(kmap_high(struct page *page));
21519 -extern void FASTCALL(kunmap_high(struct page *page));
21520 +extern void *kmap_high(struct page *page);
21521 +extern void kunmap_high(struct page *page);
21523 void *kmap(struct page *page);
21524 void kunmap(struct page *page);
21525 --- a/include/asm-x86/mach-xen/asm/hypervisor.h
21526 +++ b/include/asm-x86/mach-xen/asm/hypervisor.h
21527 @@ -264,6 +264,25 @@ HYPERVISOR_poll(
21531 +static inline int __must_check
21532 +HYPERVISOR_poll_no_timeout(
21533 + evtchn_port_t *ports, unsigned int nr_ports)
21536 + struct sched_poll sched_poll = {
21537 + .nr_ports = nr_ports
21539 + set_xen_guest_handle(sched_poll.ports, ports);
21541 + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
21542 +#if CONFIG_XEN_COMPAT <= 0x030002
21543 + if (rc == -ENOSYS)
21544 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
21553 --- a/include/asm-x86/mach-xen/asm/io_32.h
21554 +++ b/include/asm-x86/mach-xen/asm/io_32.h
21555 @@ -113,8 +113,6 @@ static inline void * phys_to_virt(unsign
21556 ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
21557 bvec_to_pseudophys((vec2))))
21559 -extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
21562 * ioremap - map bus memory into CPU space
21563 * @offset: bus address of the memory
21564 @@ -124,32 +122,39 @@ extern void __iomem * __ioremap(unsigned
21565 * make bus memory CPU accessible via the readb/readw/readl/writeb/
21566 * writew/writel functions and the other mmio helpers. The returned
21567 * address is not guaranteed to be usable directly as a virtual
21571 * If the area you are trying to map is a PCI BAR you should have a
21572 * look at pci_iomap().
21574 +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
21575 +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
21577 -static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
21579 + * The default ioremap() behavior is non-cached:
21581 +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
21583 - return __ioremap(offset, size, 0);
21584 + return ioremap_nocache(offset, size);
21587 -extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
21588 extern void iounmap(volatile void __iomem *addr);
21591 - * bt_ioremap() and bt_iounmap() are for temporary early boot-time
21592 + * early_ioremap() and early_iounmap() are for temporary early boot-time
21593 * mappings, before the real ioremap() is functional.
21594 * A boot-time mapping is currently limited to at most 16 pages.
21596 -extern void *bt_ioremap(unsigned long offset, unsigned long size);
21597 -extern void bt_iounmap(void *addr, unsigned long size);
21598 +extern void early_ioremap_init(void);
21599 +extern void early_ioremap_clear(void);
21600 +extern void early_ioremap_reset(void);
21601 +extern void *early_ioremap(unsigned long offset, unsigned long size);
21602 +extern void early_iounmap(void *addr, unsigned long size);
21603 extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
21605 /* Use early IO mappings for DMI because it's initialized early */
21606 -#define dmi_ioremap bt_ioremap
21607 -#define dmi_iounmap bt_iounmap
21608 +#define dmi_ioremap early_ioremap
21609 +#define dmi_iounmap early_iounmap
21610 #define dmi_alloc alloc_bootmem
21613 @@ -263,43 +268,21 @@ static inline void flush_write_buffers(v
21615 #endif /* __KERNEL__ */
21617 -static inline void xen_io_delay(void)
21619 - asm volatile("outb %%al,$0x80" : : : "memory");
21621 +extern void xen_io_delay(void);
21622 +#define native_io_delay xen_io_delay
21624 +extern int io_delay_type;
21625 +extern void io_delay_init(void);
21627 static inline void slow_down_io(void) {
21629 + native_io_delay();
21630 #ifdef REALLY_SLOW_IO
21634 + native_io_delay();
21635 + native_io_delay();
21636 + native_io_delay();
21640 -#ifdef CONFIG_X86_NUMAQ
21641 -extern void *xquad_portio; /* Where the IO area was mapped */
21642 -#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
21643 -#define __BUILDIO(bwl,bw,type) \
21644 -static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
21645 - if (xquad_portio) \
21646 - write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
21648 - out##bwl##_local(value, port); \
21650 -static inline void out##bwl(unsigned type value, int port) { \
21651 - out##bwl##_quad(value, port, 0); \
21653 -static inline unsigned type in##bwl##_quad(int port, int quad) { \
21654 - if (xquad_portio) \
21655 - return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
21657 - return in##bwl##_local(port); \
21659 -static inline unsigned type in##bwl(int port) { \
21660 - return in##bwl##_quad(port, 0); \
21663 #define __BUILDIO(bwl,bw,type) \
21664 static inline void out##bwl(unsigned type value, int port) { \
21665 out##bwl##_local(value, port); \
21666 @@ -307,8 +290,6 @@ static inline void out##bwl(unsigned typ
21667 static inline unsigned type in##bwl(int port) { \
21668 return in##bwl##_local(port); \
21673 #define BUILDIO(bwl,bw,type) \
21674 static inline void out##bwl##_local(unsigned type value, int port) { \
21675 --- a/include/asm-x86/mach-xen/asm/io_64.h
21676 +++ b/include/asm-x86/mach-xen/asm/io_64.h
21677 @@ -36,13 +36,21 @@
21678 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
21681 -#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
21682 +extern void xen_io_delay(void);
21683 +#define native_io_delay xen_io_delay
21685 +extern int io_delay_type;
21686 +extern void io_delay_init(void);
21688 +static inline void slow_down_io(void)
21690 + native_io_delay();
21691 #ifdef REALLY_SLOW_IO
21692 -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
21694 -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
21695 + native_io_delay();
21696 + native_io_delay();
21697 + native_io_delay();
21702 * Talk about misusing macros..
21703 @@ -53,9 +61,15 @@ static inline void out##s(unsigned x val
21704 #define __OUT2(s,s1,s2) \
21705 __asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
21707 +#ifndef REALLY_SLOW_IO
21708 +#define REALLY_SLOW_IO
21709 +#define UNSET_REALLY_SLOW_IO
21712 #define __OUT(s,s1,x) \
21713 __OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
21714 -__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
21715 +__OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
21716 + slow_down_io(); }
21719 static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
21720 @@ -64,8 +78,13 @@ static inline RETURN_TYPE in##s(unsigned
21721 __asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
21723 #define __IN(s,s1,i...) \
21724 -__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
21725 -__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
21726 +__IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); return _v; } \
21727 +__IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
21728 + slow_down_io(); return _v; }
21730 +#ifdef UNSET_REALLY_SLOW_IO
21731 +#undef REALLY_SLOW_IO
21735 static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
21736 @@ -143,25 +162,30 @@ static inline void * phys_to_virt(unsign
21738 #include <asm-generic/iomap.h>
21740 -extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
21742 -static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
21744 - return __ioremap(offset, size, 0);
21747 -extern void *bt_ioremap(unsigned long addr, unsigned long size);
21748 -extern void bt_iounmap(void *addr, unsigned long size);
21749 -#define early_ioremap bt_ioremap
21750 -#define early_iounmap bt_iounmap
21751 +extern void early_ioremap_init(void);
21752 +extern void early_ioremap_clear(void);
21753 +extern void early_ioremap_reset(void);
21754 +extern void *early_ioremap(unsigned long addr, unsigned long size);
21755 +extern void early_iounmap(void *addr, unsigned long size);
21758 * This one maps high address device memory and turns off caching for that area.
21759 * it's useful if some control registers are in such an area and write combining
21760 * or read caching is not desirable:
21762 -extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
21763 +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
21764 +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
21767 + * The default ioremap() behavior is non-cached:
21769 +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
21771 + return ioremap_nocache(offset, size);
21774 extern void iounmap(volatile void __iomem *addr);
21776 extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
21779 --- a/include/asm-x86/mach-xen/asm/irqflags_32.h
21783 - * include/asm-i386/irqflags.h
21785 - * IRQ flags handling
21787 - * This file gets included from lowlevel asm headers too, to provide
21788 - * wrapped versions of the local_irq_*() APIs, based on the
21789 - * raw_local_irq_*() functions from the lowlevel headers.
21791 -#ifndef _ASM_IRQFLAGS_H
21792 -#define _ASM_IRQFLAGS_H
21794 -#ifndef __ASSEMBLY__
21795 -#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
21797 -#define xen_restore_fl(f) \
21799 - vcpu_info_t *_vcpu; \
21801 - _vcpu = current_vcpu_info(); \
21802 - if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
21803 - barrier(); /* unmask then check (avoid races) */\
21804 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
21805 - force_evtchn_callback(); \
21809 -#define xen_irq_disable() \
21811 - current_vcpu_info()->evtchn_upcall_mask = 1; \
21815 -#define xen_irq_enable() \
21817 - vcpu_info_t *_vcpu; \
21819 - _vcpu = current_vcpu_info(); \
21820 - _vcpu->evtchn_upcall_mask = 0; \
21821 - barrier(); /* unmask then check (avoid races) */ \
21822 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
21823 - force_evtchn_callback(); \
21826 -void xen_safe_halt(void);
21828 -void xen_halt(void);
21831 - * The use of 'barrier' in the following reflects their use as local-lock
21832 - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21833 - * critical operations are executed. All critical operations must complete
21834 - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21835 - * includes these barriers, for example.
21838 -#define __raw_local_save_flags() xen_save_fl()
21840 -#define raw_local_irq_restore(flags) xen_restore_fl(flags)
21842 -#define raw_local_irq_disable() xen_irq_disable()
21844 -#define raw_local_irq_enable() xen_irq_enable()
21847 - * Used in the idle loop; sti takes one instruction cycle
21850 -static inline void raw_safe_halt(void)
21856 - * Used when interrupts are already enabled or to
21857 - * shutdown the processor:
21859 -static inline void halt(void)
21865 - * For spinlocks, etc:
21867 -#define __raw_local_irq_save() \
21869 - unsigned long flags = __raw_local_save_flags(); \
21871 - raw_local_irq_disable(); \
21877 -/* Offsets into shared_info_t. */
21878 -#define evtchn_upcall_pending /* 0 */
21879 -#define evtchn_upcall_mask 1
21881 -#define sizeof_vcpu_shift 6
21884 -#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
21885 - shl $sizeof_vcpu_shift,%esi ; \
21886 - addl HYPERVISOR_shared_info,%esi
21888 -#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
21891 -#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
21892 -#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
21893 -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
21894 -#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21895 - __DISABLE_INTERRUPTS
21896 -#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21897 - __ENABLE_INTERRUPTS
21898 -#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
21899 -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
21900 - __TEST_PENDING ; \
21901 - jnz 14f /* process more events if necessary... */ ; \
21902 - movl PT_ESI(%esp), %esi ; \
21904 -14: __DISABLE_INTERRUPTS ; \
21905 - TRACE_IRQS_OFF ; \
21906 -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
21908 - call evtchn_do_upcall ; \
21910 - jmp ret_from_intr
21911 -#define INTERRUPT_RETURN iret
21912 -#endif /* __ASSEMBLY__ */
21914 -#ifndef __ASSEMBLY__
21915 -#define raw_local_save_flags(flags) \
21916 - do { (flags) = __raw_local_save_flags(); } while (0)
21918 -#define raw_local_irq_save(flags) \
21919 - do { (flags) = __raw_local_irq_save(); } while (0)
21921 -static inline int raw_irqs_disabled_flags(unsigned long flags)
21923 - return (flags != 0);
21926 -#define raw_irqs_disabled() \
21928 - unsigned long flags = __raw_local_save_flags(); \
21930 - raw_irqs_disabled_flags(flags); \
21934 - * makes the traced hardirq state match with the machine state
21936 - * should be a rarely used function, only in places where its
21937 - * otherwise impossible to know the irq state, like in traps.
21939 -static inline void trace_hardirqs_fixup_flags(unsigned long flags)
21941 - if (raw_irqs_disabled_flags(flags))
21942 - trace_hardirqs_off();
21944 - trace_hardirqs_on();
21947 -#define trace_hardirqs_fixup() \
21948 - trace_hardirqs_fixup_flags(__raw_local_save_flags())
21949 -#endif /* __ASSEMBLY__ */
21952 - * Do the CPU's IRQ-state tracing from assembly code. We call a
21953 - * C function, so save all the C-clobbered registers:
21955 -#ifdef CONFIG_TRACE_IRQFLAGS
21957 -# define TRACE_IRQS_ON \
21961 - call trace_hardirqs_on; \
21966 -# define TRACE_IRQS_OFF \
21970 - call trace_hardirqs_off; \
21976 -# define TRACE_IRQS_ON
21977 -# define TRACE_IRQS_OFF
21980 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
21981 -# define LOCKDEP_SYS_EXIT \
21985 - call lockdep_sys_exit; \
21990 -# define LOCKDEP_SYS_EXIT
21994 --- a/include/asm-x86/mach-xen/asm/irqflags_64.h
21998 - * include/asm-x86_64/irqflags.h
22000 - * IRQ flags handling
22002 - * This file gets included from lowlevel asm headers too, to provide
22003 - * wrapped versions of the local_irq_*() APIs, based on the
22004 - * raw_local_irq_*() functions from the lowlevel headers.
22006 -#ifndef _ASM_IRQFLAGS_H
22007 -#define _ASM_IRQFLAGS_H
22008 -#include <asm/processor-flags.h>
22010 -#ifndef __ASSEMBLY__
22012 - * Interrupt control:
22016 - * The use of 'barrier' in the following reflects their use as local-lock
22017 - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
22018 - * critical operations are executed. All critical operations must complete
22019 - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
22020 - * includes these barriers, for example.
22023 -#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
22025 -#define raw_local_save_flags(flags) \
22026 - do { (flags) = __raw_local_save_flags(); } while (0)
22028 -#define raw_local_irq_restore(x) \
22030 - vcpu_info_t *_vcpu; \
22032 - _vcpu = current_vcpu_info(); \
22033 - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
22034 - barrier(); /* unmask then check (avoid races) */ \
22035 - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
22036 - force_evtchn_callback(); \
22040 -#ifdef CONFIG_X86_VSMP
22043 - * Interrupt control for the VSMP architecture:
22046 -static inline void raw_local_irq_disable(void)
22048 - unsigned long flags = __raw_local_save_flags();
22050 - raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
22053 -static inline void raw_local_irq_enable(void)
22055 - unsigned long flags = __raw_local_save_flags();
22057 - raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
22060 -static inline int raw_irqs_disabled_flags(unsigned long flags)
22062 - return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
22065 -#else /* CONFIG_X86_VSMP */
22067 -#define raw_local_irq_disable() \
22069 - current_vcpu_info()->evtchn_upcall_mask = 1; \
22073 -#define raw_local_irq_enable() \
22075 - vcpu_info_t *_vcpu; \
22077 - _vcpu = current_vcpu_info(); \
22078 - _vcpu->evtchn_upcall_mask = 0; \
22079 - barrier(); /* unmask then check (avoid races) */ \
22080 - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
22081 - force_evtchn_callback(); \
22084 -static inline int raw_irqs_disabled_flags(unsigned long flags)
22086 - return (flags != 0);
22092 - * For spinlocks, etc.:
22095 -#define __raw_local_irq_save() \
22097 - unsigned long flags = __raw_local_save_flags(); \
22099 - raw_local_irq_disable(); \
22104 -#define raw_local_irq_save(flags) \
22105 - do { (flags) = __raw_local_irq_save(); } while (0)
22107 -#define raw_irqs_disabled() \
22109 - unsigned long flags = __raw_local_save_flags(); \
22111 - raw_irqs_disabled_flags(flags); \
22115 - * makes the traced hardirq state match with the machine state
22117 - * should be a rarely used function, only in places where its
22118 - * otherwise impossible to know the irq state, like in traps.
22120 -static inline void trace_hardirqs_fixup_flags(unsigned long flags)
22122 - if (raw_irqs_disabled_flags(flags))
22123 - trace_hardirqs_off();
22125 - trace_hardirqs_on();
22128 -#define trace_hardirqs_fixup() \
22129 - trace_hardirqs_fixup_flags(__raw_local_save_flags())
22131 - * Used in the idle loop; sti takes one instruction cycle
22134 -void xen_safe_halt(void);
22135 -static inline void raw_safe_halt(void)
22141 - * Used when interrupts are already enabled or to
22142 - * shutdown the processor:
22144 -void xen_halt(void);
22145 -static inline void halt(void)
22150 -#else /* __ASSEMBLY__: */
22151 -# ifdef CONFIG_TRACE_IRQFLAGS
22152 -# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
22153 -# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk
22155 -# define TRACE_IRQS_ON
22156 -# define TRACE_IRQS_OFF
22158 -# ifdef CONFIG_DEBUG_LOCK_ALLOC
22159 -# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
22160 -# define LOCKDEP_SYS_EXIT_IRQ \
22164 - LOCKDEP_SYS_EXIT; \
22169 -# define LOCKDEP_SYS_EXIT
22170 -# define LOCKDEP_SYS_EXIT_IRQ
22175 --- a/include/asm-x86/mach-xen/asm/irqflags.h
22176 +++ b/include/asm-x86/mach-xen/asm/irqflags.h
22178 -#ifdef CONFIG_X86_32
22179 -# include "irqflags_32.h"
22180 +#ifndef _X86_IRQFLAGS_H_
22181 +#define _X86_IRQFLAGS_H_
22183 +#include <asm/processor-flags.h>
22185 +#ifndef __ASSEMBLY__
22187 + * The use of 'barrier' in the following reflects their use as local-lock
22188 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
22189 + * critical operations are executed. All critical operations must complete
22190 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
22191 + * includes these barriers, for example.
22194 +#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
22196 +#define xen_restore_fl(f) \
22198 + vcpu_info_t *_vcpu; \
22200 + _vcpu = current_vcpu_info(); \
22201 + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
22202 + barrier(); /* unmask then check (avoid races) */\
22203 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
22204 + force_evtchn_callback(); \
22208 +#define xen_irq_disable() \
22210 + current_vcpu_info()->evtchn_upcall_mask = 1; \
22214 +#define xen_irq_enable() \
22216 + vcpu_info_t *_vcpu; \
22218 + _vcpu = current_vcpu_info(); \
22219 + _vcpu->evtchn_upcall_mask = 0; \
22220 + barrier(); /* unmask then check (avoid races) */ \
22221 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
22222 + force_evtchn_callback(); \
22225 +void xen_safe_halt(void);
22227 +void xen_halt(void);
22229 +#define __raw_local_save_flags() xen_save_fl()
22231 +#define raw_local_irq_restore(flags) xen_restore_fl(flags)
22233 +#define raw_local_irq_disable() xen_irq_disable()
22235 +#define raw_local_irq_enable() xen_irq_enable()
22238 + * Used in the idle loop; sti takes one instruction cycle
22241 +static inline void raw_safe_halt(void)
22247 + * Used when interrupts are already enabled or to
22248 + * shutdown the processor:
22250 +static inline void halt(void)
22256 + * For spinlocks, etc:
22258 +#define __raw_local_irq_save() \
22260 + unsigned long flags = __raw_local_save_flags(); \
22262 + raw_local_irq_disable(); \
22267 -# include "irqflags_64.h"
22269 +/* Offsets into shared_info_t. */
22270 +#define evtchn_upcall_pending /* 0 */
22271 +#define evtchn_upcall_mask 1
22273 +#define sizeof_vcpu_shift 6
22275 +#ifdef CONFIG_X86_64
22276 +# define __REG_si %rsi
22277 +# define __CPU_num %gs:pda_cpunumber
22279 +# define __REG_si %esi
22280 +# define __CPU_num TI_cpu(%ebp)
22284 +#define GET_VCPU_INFO movl __CPU_num,%esi ; \
22285 + shl $sizeof_vcpu_shift,%esi ; \
22286 + add HYPERVISOR_shared_info,__REG_si
22288 +#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si
22291 +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si)
22292 +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si)
22293 +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si)
22294 +#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
22295 + __DISABLE_INTERRUPTS
22296 +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
22297 + __ENABLE_INTERRUPTS
22299 +#ifndef CONFIG_X86_64
22300 +#define INTERRUPT_RETURN iret
22301 +#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
22302 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
22303 + __TEST_PENDING ; \
22304 + jnz 14f /* process more events if necessary... */ ; \
22305 + movl PT_ESI(%esp), %esi ; \
22307 +14: __DISABLE_INTERRUPTS ; \
22308 + TRACE_IRQS_OFF ; \
22309 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
22311 + call evtchn_do_upcall ; \
22313 + jmp ret_from_intr
22317 +#endif /* __ASSEMBLY__ */
22319 +#ifndef __ASSEMBLY__
22320 +#define raw_local_save_flags(flags) \
22321 + do { (flags) = __raw_local_save_flags(); } while (0)
22323 +#define raw_local_irq_save(flags) \
22324 + do { (flags) = __raw_local_irq_save(); } while (0)
22326 +static inline int raw_irqs_disabled_flags(unsigned long flags)
22328 + return (flags != 0);
22331 +#define raw_irqs_disabled() \
22333 + unsigned long flags = __raw_local_save_flags(); \
22335 + raw_irqs_disabled_flags(flags); \
22339 + * makes the traced hardirq state match with the machine state
22341 + * should be a rarely used function, only in places where it's
22342 + * otherwise impossible to know the irq state, like in traps.
22344 +static inline void trace_hardirqs_fixup_flags(unsigned long flags)
22346 + if (raw_irqs_disabled_flags(flags))
22347 + trace_hardirqs_off();
22349 + trace_hardirqs_on();
22352 +#define trace_hardirqs_fixup() \
22353 + trace_hardirqs_fixup_flags(__raw_local_save_flags())
22357 +#ifdef CONFIG_X86_64
22359 + * Currently paravirt can't handle swapgs nicely when we
22360 + * don't have a stack we can rely on (such as a user space
22361 + * stack). So we either find a way around these or just fault
22362 + * and emulate if a guest tries to call swapgs directly.
22364 + * Either way, this is a good way to document that we don't
22365 + * have a reliable stack. x86_64 only.
22367 +#define SWAPGS_UNSAFE_STACK swapgs
22368 +#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
22369 +#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
22370 +#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
22371 +#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
22373 + ENABLE_INTERRUPTS(CLBR_NONE); \
22375 + LOCKDEP_SYS_EXIT; \
22377 + __DISABLE_INTERRUPTS; \
22381 +#define ARCH_TRACE_IRQS_ON \
22385 + call trace_hardirqs_on; \
22390 +#define ARCH_TRACE_IRQS_OFF \
22394 + call trace_hardirqs_off; \
22399 +#define ARCH_LOCKDEP_SYS_EXIT \
22403 + call lockdep_sys_exit; \
22408 +#define ARCH_LOCKDEP_SYS_EXIT_IRQ
22411 +#ifdef CONFIG_TRACE_IRQFLAGS
22412 +# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
22413 +# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
22415 +# define TRACE_IRQS_ON
22416 +# define TRACE_IRQS_OFF
22418 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
22419 +# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
22420 +# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
22422 +# define LOCKDEP_SYS_EXIT
22423 +# define LOCKDEP_SYS_EXIT_IRQ
22426 +#endif /* __ASSEMBLY__ */
22428 --- a/include/asm-x86/mach-xen/asm/maddr_32.h
22429 +++ b/include/asm-x86/mach-xen/asm/maddr_32.h
22431 #ifndef _I386_MADDR_H
22432 #define _I386_MADDR_H
22434 +#include <asm/bug.h>
22435 #include <xen/features.h>
22436 #include <xen/interface/xen.h>
22438 @@ -151,25 +152,9 @@ static inline paddr_t pte_machine_to_phy
22439 phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
22444 -#ifdef CONFIG_X86_PAE
22445 -#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } )
22446 -extern unsigned long long __supported_pte_mask;
22447 -static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
22451 - pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
22452 - (pgprot_val(pgprot) >> 32);
22453 - pte.pte_high &= (__supported_pte_mask >> 32);
22454 - pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
22455 - __supported_pte_mask;
22459 -#define __pte_ma(x) ((pte_t) { (x) } )
22460 -#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
22461 +#define pte_phys_to_machine phys_to_machine
22462 +#define pte_machine_to_phys machine_to_phys
22465 #else /* !CONFIG_XEN */
22466 --- a/include/asm-x86/mach-xen/asm/maddr_64.h
22467 +++ b/include/asm-x86/mach-xen/asm/maddr_64.h
22469 #ifndef _X86_64_MADDR_H
22470 #define _X86_64_MADDR_H
22472 +#include <asm/bug.h>
22473 #include <xen/features.h>
22474 #include <xen/interface/xen.h>
22476 @@ -16,6 +17,7 @@ typedef unsigned long maddr_t;
22479 extern unsigned long *phys_to_machine_mapping;
22480 +extern unsigned long max_mapnr;
22482 #undef machine_to_phys_mapping
22483 extern unsigned long *machine_to_phys_mapping;
22484 @@ -25,7 +27,7 @@ static inline unsigned long pfn_to_mfn(u
22486 if (xen_feature(XENFEAT_auto_translated_physmap))
22488 - BUG_ON(end_pfn && pfn >= end_pfn);
22489 + BUG_ON(max_mapnr && pfn >= max_mapnr);
22490 return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
22493 @@ -33,7 +35,7 @@ static inline int phys_to_machine_mappin
22495 if (xen_feature(XENFEAT_auto_translated_physmap))
22497 - BUG_ON(end_pfn && pfn >= end_pfn);
22498 + BUG_ON(max_mapnr && pfn >= max_mapnr);
22499 return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
22502 @@ -45,7 +47,7 @@ static inline unsigned long mfn_to_pfn(u
22505 if (unlikely((mfn >> machine_to_phys_order) != 0))
22507 + return max_mapnr;
22509 /* The array access can fail (e.g., device space beyond end of RAM). */
22511 @@ -60,7 +62,7 @@ static inline unsigned long mfn_to_pfn(u
22515 - : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
22516 + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
22520 @@ -88,16 +90,16 @@ static inline unsigned long mfn_to_pfn(u
22521 static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
22523 unsigned long pfn = mfn_to_pfn(mfn);
22524 - if ((pfn < end_pfn)
22525 + if ((pfn < max_mapnr)
22526 && !xen_feature(XENFEAT_auto_translated_physmap)
22527 && (phys_to_machine_mapping[pfn] != mfn))
22528 - return end_pfn; /* force !pfn_valid() */
22529 + return max_mapnr; /* force !pfn_valid() */
22533 static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
22535 - BUG_ON(end_pfn && pfn >= end_pfn);
22536 + BUG_ON(max_mapnr && pfn >= max_mapnr);
22537 if (xen_feature(XENFEAT_auto_translated_physmap)) {
22538 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
22540 @@ -135,9 +137,6 @@ static inline paddr_t pte_machine_to_phy
22544 -#define __pte_ma(x) ((pte_t) { (x) } )
22545 -#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
22547 #else /* !CONFIG_XEN */
22549 #define pfn_to_mfn(pfn) (pfn)
22550 --- a/include/asm-x86/mach-xen/asm/mmu_context_32.h
22551 +++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h
22552 @@ -51,8 +51,6 @@ static inline void __prepare_arch_switch
22556 -void leave_mm(unsigned long cpu);
22558 static inline void switch_mm(struct mm_struct *prev,
22559 struct mm_struct *next,
22560 struct task_struct *tsk)
22561 --- a/include/asm-x86/mach-xen/asm/mmu_context_64.h
22562 +++ b/include/asm-x86/mach-xen/asm/mmu_context_64.h
22563 @@ -62,12 +62,6 @@ extern void mm_pin(struct mm_struct *mm)
22564 extern void mm_unpin(struct mm_struct *mm);
22565 void mm_pin_all(void);
22567 -static inline void load_cr3(pgd_t *pgd)
22569 - asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
22573 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
22574 struct task_struct *tsk)
22576 @@ -97,7 +91,7 @@ static inline void switch_mm(struct mm_s
22579 if (unlikely(next->context.ldt != prev->context.ldt)) {
22580 - /* load_LDT_nolock(&next->context, cpu) */
22581 + /* load_LDT_nolock(&next->context) */
22582 op->cmd = MMUEXT_SET_LDT;
22583 op->arg1.linear_addr = (unsigned long)next->context.ldt;
22584 op->arg2.nr_ents = next->context.size;
22585 @@ -110,7 +104,7 @@ static inline void switch_mm(struct mm_s
22587 write_pda(mmu_state, TLBSTATE_OK);
22588 if (read_pda(active_mm) != next)
22589 - out_of_line_bug();
22591 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
22592 /* We were in lazy tlb mode and leave_mm disabled
22593 * tlb flush IPI delivery. We must reload CR3
22594 @@ -118,7 +112,7 @@ static inline void switch_mm(struct mm_s
22596 load_cr3(next->pgd);
22597 xen_new_user_pt(__pa(__user_pgd(next->pgd)));
22598 - load_LDT_nolock(&next->context, cpu);
22599 + load_LDT_nolock(&next->context);
22603 --- a/include/asm-x86/mach-xen/asm/page_64.h
22604 +++ b/include/asm-x86/mach-xen/asm/page_64.h
22606 #ifndef _X86_64_PAGE_H
22607 #define _X86_64_PAGE_H
22609 -/* #include <linux/string.h> */
22610 -#ifndef __ASSEMBLY__
22611 -#include <linux/kernel.h>
22612 -#include <linux/types.h>
22613 -#include <asm/bug.h>
22615 -#include <linux/const.h>
22616 -#include <xen/interface/xen.h>
22619 - * Need to repeat this here in order to not include pgtable.h (which in turn
22620 - * depends on definitions made here), but to be able to use the symbolic
22621 - * below. The preprocessor will warn if the two definitions aren't identical.
22623 -#define _PAGE_PRESENT 0x001
22624 -#define _PAGE_IO 0x200
22626 -/* PAGE_SHIFT determines the page size */
22627 -#define PAGE_SHIFT 12
22628 -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
22629 -#define PAGE_MASK (~(PAGE_SIZE-1))
22631 -/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22632 -#define __PHYSICAL_MASK_SHIFT 46
22633 -#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
22634 -#define __VIRTUAL_MASK_SHIFT 48
22635 -#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22637 -#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
22638 +#define PAGETABLE_LEVELS 4
22640 -#define THREAD_ORDER 1
22641 +#define THREAD_ORDER 1
22642 #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
22643 #define CURRENT_MASK (~(THREAD_SIZE-1))
22645 @@ -51,106 +23,10 @@
22646 #define MCE_STACK 5
22647 #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
22649 -#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
22650 -#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
22652 -#define HPAGE_SHIFT PMD_SHIFT
22653 -#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
22654 -#define HPAGE_MASK (~(HPAGE_SIZE - 1))
22655 -#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
22658 -#ifndef __ASSEMBLY__
22660 -extern unsigned long end_pfn;
22662 -#include <asm/maddr.h>
22664 -void clear_page(void *);
22665 -void copy_page(void *, void *);
22667 -#define clear_user_page(page, vaddr, pg) clear_page(page)
22668 -#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
22670 -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22671 - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22672 -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22675 - * These are used to make use of C type-checking..
22677 -typedef struct { unsigned long pte; } pte_t;
22678 -typedef struct { unsigned long pmd; } pmd_t;
22679 -typedef struct { unsigned long pud; } pud_t;
22680 -typedef struct { unsigned long pgd; } pgd_t;
22681 -#define PTE_MASK PHYSICAL_PAGE_MASK
22683 -typedef struct { unsigned long pgprot; } pgprot_t;
22685 -#define __pte_val(x) ((x).pte)
22686 -#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \
22687 - == _PAGE_PRESENT ? \
22688 - pte_machine_to_phys(__pte_val(x)) : \
22691 -#define __pmd_val(x) ((x).pmd)
22692 -static inline unsigned long pmd_val(pmd_t x)
22694 - unsigned long ret = __pmd_val(x);
22695 -#if CONFIG_XEN_COMPAT <= 0x030002
22696 - if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
22698 - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22703 -#define __pud_val(x) ((x).pud)
22704 -static inline unsigned long pud_val(pud_t x)
22706 - unsigned long ret = __pud_val(x);
22707 - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22711 -#define __pgd_val(x) ((x).pgd)
22712 -static inline unsigned long pgd_val(pgd_t x)
22714 - unsigned long ret = __pgd_val(x);
22715 - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22719 -#define pgprot_val(x) ((x).pgprot)
22721 -static inline pte_t __pte(unsigned long x)
22723 - if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22724 - x = pte_phys_to_machine(x);
22725 - return ((pte_t) { (x) });
22728 -static inline pmd_t __pmd(unsigned long x)
22730 - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22731 - return ((pmd_t) { (x) });
22734 -static inline pud_t __pud(unsigned long x)
22736 - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22737 - return ((pud_t) { (x) });
22740 -static inline pgd_t __pgd(unsigned long x)
22742 - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22743 - return ((pgd_t) { (x) });
22746 -#define __pgprot(x) ((pgprot_t) { (x) } )
22747 +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
22748 +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
22750 -#endif /* !__ASSEMBLY__ */
22751 +#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
22753 #define __PHYSICAL_START CONFIG_PHYSICAL_START
22754 #define __KERNEL_ALIGN 0x200000
22755 @@ -166,52 +42,58 @@ static inline pgd_t __pgd(unsigned long
22757 #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
22758 #define __START_KERNEL_map _AC(0xffffffff80000000, UL)
22759 -#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
22761 #if CONFIG_XEN_COMPAT <= 0x030002
22763 #define LOAD_OFFSET 0
22766 -/* to align the pointer to the (next) page boundary */
22767 -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22769 -#define KERNEL_TEXT_SIZE (40*1024*1024)
22770 -#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
22771 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22772 +#define __PHYSICAL_MASK_SHIFT 46
22773 +#define __VIRTUAL_MASK_SHIFT 48
22775 -#define PAGE_OFFSET __PAGE_OFFSET
22777 + * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
22778 + * arch/x86/kernel/head_64.S), and it is mapped here:
22780 +#define KERNEL_IMAGE_SIZE (128*1024*1024)
22781 +#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
22783 #ifndef __ASSEMBLY__
22784 +void clear_page(void *page);
22785 +void copy_page(void *to, void *from);
22787 +extern unsigned long end_pfn;
22788 +extern unsigned long end_pfn_map;
22790 static inline unsigned long __phys_addr(unsigned long x)
22792 - return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
22793 + return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : __PAGE_OFFSET);
22797 -#define __pa(x) __phys_addr((unsigned long)(x))
22798 -#define __pa_symbol(x) __phys_addr((unsigned long)(x))
22799 +#define __phys_reloc_hide(x) (x)
22801 -#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
22802 -#define __boot_va(x) __va(x)
22803 -#define __boot_pa(x) __pa(x)
22804 -#ifdef CONFIG_FLATMEM
22805 -#define pfn_valid(pfn) ((pfn) < end_pfn)
22808 + * These are used to make use of C type-checking..
22810 +typedef unsigned long pteval_t;
22811 +typedef unsigned long pmdval_t;
22812 +typedef unsigned long pudval_t;
22813 +typedef unsigned long pgdval_t;
22814 +typedef unsigned long pgprotval_t;
22815 +typedef unsigned long phys_addr_t;
22817 -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
22818 -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
22819 -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
22821 -#define VM_DATA_DEFAULT_FLAGS \
22822 - (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22823 - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22824 +typedef struct page *pgtable_t;
22826 +typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
22828 -#define __HAVE_ARCH_GATE_AREA 1
22829 #define vmemmap ((struct page *)VMEMMAP_START)
22831 -#include <asm-generic/memory_model.h>
22832 -#include <asm-generic/page.h>
22833 +#endif /* !__ASSEMBLY__ */
22835 +#ifdef CONFIG_FLATMEM
22836 +#define pfn_valid(pfn) ((pfn) < max_mapnr)
22839 -#endif /* __KERNEL__ */
22841 #endif /* _X86_64_PAGE_H */
22842 --- a/include/asm-x86/mach-xen/asm/page.h
22843 +++ b/include/asm-x86/mach-xen/asm/page.h
22845 +#ifndef _ASM_X86_PAGE_H
22846 +#define _ASM_X86_PAGE_H
22848 +#include <linux/const.h>
22850 +/* PAGE_SHIFT determines the page size */
22851 +#define PAGE_SHIFT 12
22852 +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
22853 +#define PAGE_MASK (~(PAGE_SIZE-1))
22856 -# ifdef CONFIG_X86_32
22857 -# include "page_32.h"
22859 -# include "page_64.h"
22863 + * Need to repeat this here in order to not include pgtable.h (which in turn
22864 + * depends on definitions made here), but to be able to use the symbolics
22865 + * below. The preprocessor will warn if the two definitions aren't identical.
22867 +#define _PAGE_BIT_PRESENT 0
22868 +#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
22869 +#define _PAGE_BIT_IO 9
22870 +#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
22872 +#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
22873 +#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK)
22875 +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
22876 +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
22878 +#define HPAGE_SHIFT PMD_SHIFT
22879 +#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
22880 +#define HPAGE_MASK (~(HPAGE_SIZE - 1))
22881 +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
22883 +/* to align the pointer to the (next) page boundary */
22884 +#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22886 +#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
22887 +#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22889 +#ifndef __ASSEMBLY__
22890 +#include <linux/types.h>
22893 +#ifdef CONFIG_X86_64
22894 +#include <asm/page_64.h>
22895 +#define max_pfn_mapped end_pfn_map
22897 +#include <asm/page_32.h>
22898 +#define max_pfn_mapped max_low_pfn
22899 +#endif /* CONFIG_X86_64 */
22901 +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
22903 +#define VM_DATA_DEFAULT_FLAGS \
22904 + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22905 + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22908 +#ifndef __ASSEMBLY__
22910 +extern int page_is_ram(unsigned long pagenr);
22914 +static inline void clear_user_page(void *page, unsigned long vaddr,
22917 + clear_page(page);
22920 +static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
22921 + struct page *topage)
22923 + copy_page(to, from);
22926 +#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22927 + alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22928 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22930 +typedef struct { pgprotval_t pgprot; } pgprot_t;
22932 +#define pgprot_val(x) ((x).pgprot)
22933 +#define __pgprot(x) ((pgprot_t) { (x) } )
22935 +#include <asm/maddr.h>
22937 +typedef struct { pgdval_t pgd; } pgd_t;
22939 +#define __pgd_ma(x) ((pgd_t) { (x) } )
22940 +static inline pgd_t xen_make_pgd(pgdval_t val)
22942 + if (val & _PAGE_PRESENT)
22943 + val = pte_phys_to_machine(val);
22944 + return (pgd_t) { val };
22947 +#define __pgd_val(x) ((x).pgd)
22948 +static inline pgdval_t xen_pgd_val(pgd_t pgd)
22950 + pgdval_t ret = __pgd_val(pgd);
22951 +#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
22953 + ret = machine_to_phys(ret) | _PAGE_PRESENT;
22955 + if (ret & _PAGE_PRESENT)
22956 + ret = pte_machine_to_phys(ret);
22961 +#if PAGETABLE_LEVELS >= 3
22962 +#if PAGETABLE_LEVELS == 4
22963 +typedef struct { pudval_t pud; } pud_t;
22965 +#define __pud_ma(x) ((pud_t) { (x) } )
22966 +static inline pud_t xen_make_pud(pudval_t val)
22968 + if (val & _PAGE_PRESENT)
22969 + val = pte_phys_to_machine(val);
22970 + return (pud_t) { val };
22973 +#define __pud_val(x) ((x).pud)
22974 +static inline pudval_t xen_pud_val(pud_t pud)
22976 + pudval_t ret = __pud_val(pud);
22977 + if (ret & _PAGE_PRESENT)
22978 + ret = pte_machine_to_phys(ret);
22981 +#else /* PAGETABLE_LEVELS == 3 */
22982 +#include <asm-generic/pgtable-nopud.h>
22984 +#define __pud_val(x) __pgd_val((x).pgd)
22985 +static inline pudval_t xen_pud_val(pud_t pud)
22987 + return xen_pgd_val(pud.pgd);
22989 +#endif /* PAGETABLE_LEVELS == 4 */
22991 +typedef struct { pmdval_t pmd; } pmd_t;
22993 +#define __pmd_ma(x) ((pmd_t) { (x) } )
22994 +static inline pmd_t xen_make_pmd(pmdval_t val)
22996 + if (val & _PAGE_PRESENT)
22997 + val = pte_phys_to_machine(val);
22998 + return (pmd_t) { val };
23001 +#define __pmd_val(x) ((x).pmd)
23002 +static inline pmdval_t xen_pmd_val(pmd_t pmd)
23004 + pmdval_t ret = __pmd_val(pmd);
23005 +#if CONFIG_XEN_COMPAT <= 0x030002
23007 + ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
23010 -# include "page_32.h"
23012 -# include "page_64.h"
23014 + if (ret & _PAGE_PRESENT)
23015 + ret = pte_machine_to_phys(ret);
23019 +#else /* PAGETABLE_LEVELS == 2 */
23020 +#include <asm-generic/pgtable-nopmd.h>
23022 +#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
23023 +#define __pmd_val(x) __pgd_val((x).pud.pgd)
23024 +static inline pmdval_t xen_pmd_val(pmd_t pmd)
23026 + return xen_pgd_val(pmd.pud.pgd);
23028 +#endif /* PAGETABLE_LEVELS >= 3 */
23030 +#define __pte_ma(x) ((pte_t) { .pte = (x) } )
23031 +static inline pte_t xen_make_pte(pteval_t val)
23033 + if ((val & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
23034 + val = pte_phys_to_machine(val);
23035 + return (pte_t) { .pte = val };
23038 +#define __pte_val(x) ((x).pte)
23039 +static inline pteval_t xen_pte_val(pte_t pte)
23041 + pteval_t ret = __pte_val(pte);
23042 + if ((pte.pte_low & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
23043 + ret = pte_machine_to_phys(ret);
23047 +#define pgd_val(x) xen_pgd_val(x)
23048 +#define __pgd(x) xen_make_pgd(x)
23050 +#ifndef __PAGETABLE_PUD_FOLDED
23051 +#define pud_val(x) xen_pud_val(x)
23052 +#define __pud(x) xen_make_pud(x)
23055 +#ifndef __PAGETABLE_PMD_FOLDED
23056 +#define pmd_val(x) xen_pmd_val(x)
23057 +#define __pmd(x) xen_make_pmd(x)
23060 +#define pte_val(x) xen_pte_val(x)
23061 +#define __pte(x) xen_make_pte(x)
23063 +#define __pa(x) __phys_addr((unsigned long)(x))
23064 +/* __pa_symbol should be used for C visible symbols.
23065 + This seems to be the official gcc blessed way to do such arithmetic. */
23066 +#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
23068 +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
23070 +#define __boot_va(x) __va(x)
23071 +#define __boot_pa(x) __pa(x)
23073 +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
23074 +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
23075 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
23077 +#endif /* __ASSEMBLY__ */
23079 +#include <asm-generic/memory_model.h>
23080 +#include <asm-generic/page.h>
23082 +#define __HAVE_ARCH_GATE_AREA 1
23084 +#endif /* __KERNEL__ */
23085 +#endif /* _ASM_X86_PAGE_H */
23086 --- a/include/asm-x86/mach-xen/asm/pci_64.h
23087 +++ b/include/asm-x86/mach-xen/asm/pci_64.h
23088 @@ -26,7 +26,6 @@ extern int (*pci_config_write)(int seg,
23091 extern void pci_iommu_alloc(void);
23092 -extern int iommu_setup(char *opt);
23094 /* The PCI address space does equal the physical memory
23095 * address space. The networking and block device layers use
23096 --- a/include/asm-x86/mach-xen/asm/pci.h
23097 +++ b/include/asm-x86/mach-xen/asm/pci.h
23098 @@ -71,6 +71,7 @@ extern int pci_mmap_page_range(struct pc
23102 +extern void early_quirks(void);
23103 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
23104 enum pci_dma_burst_strategy *strat,
23105 unsigned long *strategy_parameter)
23106 @@ -78,9 +79,10 @@ static inline void pci_dma_burst_advice(
23107 *strat = PCI_DMA_BURST_INFINITY;
23108 *strategy_parameter = ~0UL;
23111 +static inline void early_quirks(void) { }
23115 #endif /* __KERNEL__ */
23117 #ifdef CONFIG_X86_32
23118 @@ -95,6 +97,19 @@ static inline void pci_dma_burst_advice(
23119 /* generic pci stuff */
23120 #include <asm-generic/pci.h>
23122 +#ifdef CONFIG_NUMA
23123 +/* Returns the node based on pci bus */
23124 +static inline int __pcibus_to_node(struct pci_bus *bus)
23126 + struct pci_sysdata *sd = bus->sysdata;
23131 +static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
23133 + return node_to_cpumask(__pcibus_to_node(bus));
23138 --- a/include/asm-x86/mach-xen/asm/pgalloc_32.h
23139 +++ b/include/asm-x86/mach-xen/asm/pgalloc_32.h
23142 #include <linux/threads.h>
23143 #include <linux/mm.h> /* for struct page */
23144 +#include <linux/pagemap.h>
23145 +#include <asm/tlb.h>
23146 +#include <asm-generic/tlb.h>
23147 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
23149 #define paravirt_alloc_pt(mm, pfn) do { } while (0)
23150 -#define paravirt_alloc_pd(pfn) do { } while (0)
23151 -#define paravirt_alloc_pd(pfn) do { } while (0)
23152 +#define paravirt_alloc_pd(mm, pfn) do { } while (0)
23153 #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
23154 #define paravirt_release_pt(pfn) do { } while (0)
23155 #define paravirt_release_pd(pfn) do { } while (0)
23157 -#define pmd_populate_kernel(mm, pmd, pte) \
23159 - paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); \
23160 - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
23163 -#define pmd_populate(mm, pmd, pte) \
23165 - unsigned long pfn = page_to_pfn(pte); \
23166 - paravirt_alloc_pt(mm, pfn); \
23167 - if (PagePinned(virt_to_page((mm)->pgd))) { \
23168 - if (!PageHighMem(pte)) \
23169 - BUG_ON(HYPERVISOR_update_va_mapping( \
23170 - (unsigned long)__va(pfn << PAGE_SHIFT), \
23171 - pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \
23172 - else if (!test_and_set_bit(PG_pinned, &pte->flags)) \
23173 - kmap_flush_unused(); \
23175 - __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
23177 - *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
23179 +static inline void pmd_populate_kernel(struct mm_struct *mm,
23180 + pmd_t *pmd, pte_t *pte)
23182 + paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
23183 + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
23186 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
23188 + unsigned long pfn = page_to_pfn(pte);
23190 + paravirt_alloc_pt(mm, pfn);
23191 + if (PagePinned(virt_to_page(mm->pgd))) {
23192 + if (!PageHighMem(pte))
23193 + BUG_ON(HYPERVISOR_update_va_mapping(
23194 + (unsigned long)__va(pfn << PAGE_SHIFT),
23195 + pfn_pte(pfn, PAGE_KERNEL_RO), 0));
23196 + else if (!test_and_set_bit(PG_pinned, &pte->flags))
23197 + kmap_flush_unused();
23198 + set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
23200 + *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
23202 +#define pmd_pgtable(pmd) pmd_page(pmd)
23205 * Allocate and free page tables.
23207 +extern void pgd_test_and_unpin(pgd_t *);
23208 extern pgd_t *pgd_alloc(struct mm_struct *);
23209 -extern void pgd_free(pgd_t *pgd);
23210 +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
23212 extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
23213 -extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
23214 +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
23216 -static inline void pte_free_kernel(pte_t *pte)
23217 +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
23219 make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
23220 free_page((unsigned long)pte);
23223 -extern void pte_free(struct page *pte);
23224 +extern void __pte_free(pgtable_t);
23225 +static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
23231 -#define __pte_free_tlb(tlb,pte) \
23233 - paravirt_release_pt(page_to_pfn(pte)); \
23234 - tlb_remove_page((tlb),(pte)); \
23236 +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
23238 #ifdef CONFIG_X86_PAE
23240 * In the PAE case we free the pmds as part of the pgd.
23242 -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
23243 -#define pmd_free(x) do { } while (0)
23244 -#define __pmd_free_tlb(tlb,x) do { } while (0)
23245 -#define pud_populate(mm, pmd, pte) BUG()
23247 +extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
23249 +extern void __pmd_free(pgtable_t);
23250 +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
23252 + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
23253 + __pmd_free(virt_to_page(pmd));
23256 +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
23258 +static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
23260 + struct page *page = virt_to_page(pmd);
23261 + unsigned long pfn = page_to_pfn(page);
23263 + paravirt_alloc_pd(mm, pfn);
23265 + /* Note: almost everything apart from _PAGE_PRESENT is
23266 + reserved at the pmd (PDPT) level. */
23267 + if (PagePinned(virt_to_page(mm->pgd))) {
23268 + BUG_ON(PageHighMem(page));
23269 + BUG_ON(HYPERVISOR_update_va_mapping(
23270 + (unsigned long)__va(pfn << PAGE_SHIFT),
23271 + pfn_pte(pfn, PAGE_KERNEL_RO), 0));
23272 + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
23274 + *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
23277 + * According to Intel App note "TLBs, Paging-Structure Caches,
23278 + * and Their Invalidation", April 2007, document 317080-001,
23279 + * section 8.1: in PAE mode we explicitly have to flush the
23280 + * TLB via cr3 if the top-level pgd is changed...
23282 + if (mm == current->active_mm)
23285 +#endif /* CONFIG_X86_PAE */
23287 #endif /* _I386_PGALLOC_H */
23288 --- a/include/asm-x86/mach-xen/asm/pgalloc_64.h
23289 +++ b/include/asm-x86/mach-xen/asm/pgalloc_64.h
23291 #include <linux/mm.h>
23292 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
23294 -#include <xen/features.h>
23295 -void make_page_readonly(void *va, unsigned int feature);
23296 -void make_page_writable(void *va, unsigned int feature);
23297 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
23298 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
23299 +pmd_t *early_get_pmd(unsigned long va);
23300 +void early_make_page_readonly(void *va, unsigned int feature);
23302 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
23304 -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
23306 - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
23309 -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
23311 - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
23312 - BUG_ON(HYPERVISOR_update_va_mapping(
23313 - (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
23314 - pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
23315 - set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
23317 - *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
23320 +#define pmd_populate_kernel(mm, pmd, pte) \
23321 + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
23323 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
23325 @@ -63,53 +46,58 @@ static inline void pgd_populate(struct m
23329 -extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
23330 -extern void pte_free(struct page *pte);
23331 +#define pmd_pgtable(pmd) pmd_page(pmd)
23333 -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
23334 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
23338 - pg = pte_alloc_one(mm, addr);
23339 - return pg ? page_address(pg) : NULL;
23340 + if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
23341 + BUG_ON(HYPERVISOR_update_va_mapping(
23342 + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
23343 + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
23344 + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
23346 + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
23350 -static inline void pmd_free(pmd_t *pmd)
23351 +extern void __pmd_free(pgtable_t);
23352 +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
23354 BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
23355 - pte_free(virt_to_page(pmd));
23356 + __pmd_free(virt_to_page(pmd));
23359 +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
23361 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
23365 - pg = pte_alloc_one(mm, addr);
23366 - return pg ? page_address(pg) : NULL;
23367 + return (pud_t *)pmd_alloc_one(mm, addr);
23370 -static inline void pud_free(pud_t *pud)
23371 +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
23373 BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
23374 - pte_free(virt_to_page(pud));
23375 + __pmd_free(virt_to_page(pud));
23378 static inline void pgd_list_add(pgd_t *pgd)
23380 struct page *page = virt_to_page(pgd);
23381 + unsigned long flags;
23383 - spin_lock(&pgd_lock);
23384 + spin_lock_irqsave(&pgd_lock, flags);
23385 list_add(&page->lru, &pgd_list);
23386 - spin_unlock(&pgd_lock);
23387 + spin_unlock_irqrestore(&pgd_lock, flags);
23390 static inline void pgd_list_del(pgd_t *pgd)
23392 struct page *page = virt_to_page(pgd);
23393 + unsigned long flags;
23395 - spin_lock(&pgd_lock);
23396 + spin_lock_irqsave(&pgd_lock, flags);
23397 list_del(&page->lru);
23398 - spin_unlock(&pgd_lock);
23399 + spin_unlock_irqrestore(&pgd_lock, flags);
23402 extern void pgd_test_and_unpin(pgd_t *);
23403 @@ -145,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm
23407 -static inline void pgd_free(pgd_t *pgd)
23408 +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
23410 pgd_test_and_unpin(pgd);
23412 @@ -161,17 +149,30 @@ static inline pte_t *pte_alloc_one_kerne
23416 +extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
23418 /* Should really implement gc for free page table pages. This could be
23419 done with a reference count in struct page. */
23421 -static inline void pte_free_kernel(pte_t *pte)
23422 +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
23424 BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
23425 make_page_writable(pte, XENFEAT_writable_page_tables);
23426 free_page((unsigned long)pte);
23429 -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
23430 +extern void __pte_free(pgtable_t);
23431 +static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
23436 +#define __pte_free_tlb(tlb,pte) \
23438 + pgtable_page_dtor((pte)); \
23439 + tlb_remove_page((tlb), (pte)); \
23442 #define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
23443 #define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
23445 --- a/include/asm-x86/mach-xen/asm/pgtable_32.h
23446 +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
23448 #ifndef _I386_PGTABLE_H
23449 #define _I386_PGTABLE_H
23451 -#include <asm/hypervisor.h>
23454 * The Linux memory management assumes a three-level page table setup. On
23455 * the i386, we use that, but "fold" the mid level into the top-level page
23456 @@ -25,20 +23,10 @@
23458 struct vm_area_struct;
23461 - * ZERO_PAGE is a global shared page that is always zero: used
23462 - * for zero-mapped memory areas etc..
23464 -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
23465 -extern unsigned long empty_zero_page[1024];
23466 extern pgd_t *swapper_pg_dir;
23467 -extern struct kmem_cache *pmd_cache;
23468 -extern spinlock_t pgd_lock;
23469 -extern struct page *pgd_list;
23470 -void check_pgt_cache(void);
23472 -void pmd_ctor(struct kmem_cache *, void *);
23473 -void pgtable_cache_init(void);
23474 +static inline void pgtable_cache_init(void) { }
23475 +static inline void check_pgt_cache(void) { }
23476 void paging_init(void);
23479 @@ -58,16 +46,9 @@ void paging_init(void);
23480 #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
23481 #define PGDIR_MASK (~(PGDIR_SIZE-1))
23483 -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
23484 -#define FIRST_USER_ADDRESS 0
23486 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
23487 #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
23489 -#define TWOLEVEL_PGDIR_SHIFT 22
23490 -#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
23491 -#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
23493 /* Just any arbitrary offset to the start of the vmalloc VM area: the
23494 * current 8MB value just means that there will be a 8MB "hole" after the
23495 * physical memory until the kernel virtual memory starts. That means that
23496 @@ -78,121 +59,19 @@ void paging_init(void);
23497 #define VMALLOC_OFFSET (8*1024*1024)
23498 #define VMALLOC_START (((unsigned long) high_memory + \
23499 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
23500 -#ifdef CONFIG_HIGHMEM
23501 -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
23503 -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
23507 - * _PAGE_PSE set in the page directory entry just means that
23508 - * the page directory entry points directly to a 4MB-aligned block of
23511 -#define _PAGE_BIT_PRESENT 0
23512 -#define _PAGE_BIT_RW 1
23513 -#define _PAGE_BIT_USER 2
23514 -#define _PAGE_BIT_PWT 3
23515 -#define _PAGE_BIT_PCD 4
23516 -#define _PAGE_BIT_ACCESSED 5
23517 -#define _PAGE_BIT_DIRTY 6
23518 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
23519 -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
23520 -/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */
23521 -#define _PAGE_BIT_UNUSED2 10
23522 -#define _PAGE_BIT_UNUSED3 11
23523 -#define _PAGE_BIT_NX 63
23525 -#define _PAGE_PRESENT 0x001
23526 -#define _PAGE_RW 0x002
23527 -#define _PAGE_USER 0x004
23528 -#define _PAGE_PWT 0x008
23529 -#define _PAGE_PCD 0x010
23530 -#define _PAGE_ACCESSED 0x020
23531 -#define _PAGE_DIRTY 0x040
23532 -#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
23533 -#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
23534 -/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */
23535 -#define _PAGE_UNUSED2 0x400
23536 -#define _PAGE_UNUSED3 0x800
23538 -/* If _PAGE_PRESENT is clear, we use these: */
23539 -#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
23540 -#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
23541 - pte_present gives true */
23542 #ifdef CONFIG_X86_PAE
23543 -#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
23544 +#define LAST_PKMAP 512
23546 -#define _PAGE_NX 0
23547 +#define LAST_PKMAP 1024
23550 -/* Mapped page is I/O or foreign and has no associated page struct. */
23551 -#define _PAGE_IO 0x200
23552 +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
23554 -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
23555 -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
23556 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
23558 -#define PAGE_NONE \
23559 - __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
23560 -#define PAGE_SHARED \
23561 - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23563 -#define PAGE_SHARED_EXEC \
23564 - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23565 -#define PAGE_COPY_NOEXEC \
23566 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23567 -#define PAGE_COPY_EXEC \
23568 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23569 -#define PAGE_COPY \
23571 -#define PAGE_READONLY \
23572 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23573 -#define PAGE_READONLY_EXEC \
23574 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23576 -#define _PAGE_KERNEL \
23577 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
23578 -#define _PAGE_KERNEL_EXEC \
23579 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
23581 -extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
23582 -#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
23583 -#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
23584 -#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
23585 -#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
23586 -#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
23588 -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
23589 -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
23590 -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
23591 -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
23592 -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
23593 -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
23594 -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
23597 - * The i386 can't do page protection for execute, and considers that
23598 - * the same are read. Also, write permissions imply read permissions.
23599 - * This is the closest we can get..
23601 -#define __P000 PAGE_NONE
23602 -#define __P001 PAGE_READONLY
23603 -#define __P010 PAGE_COPY
23604 -#define __P011 PAGE_COPY
23605 -#define __P100 PAGE_READONLY_EXEC
23606 -#define __P101 PAGE_READONLY_EXEC
23607 -#define __P110 PAGE_COPY_EXEC
23608 -#define __P111 PAGE_COPY_EXEC
23610 -#define __S000 PAGE_NONE
23611 -#define __S001 PAGE_READONLY
23612 -#define __S010 PAGE_SHARED
23613 -#define __S011 PAGE_SHARED
23614 -#define __S100 PAGE_READONLY_EXEC
23615 -#define __S101 PAGE_READONLY_EXEC
23616 -#define __S110 PAGE_SHARED_EXEC
23617 -#define __S111 PAGE_SHARED_EXEC
23618 +#ifdef CONFIG_HIGHMEM
23619 +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
23621 +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
23625 * Define this if things work differently on an i386 and an i486:
23626 @@ -221,28 +100,6 @@ extern unsigned long pg0[];
23628 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
23631 - * The following only work if pte_present() is true.
23632 - * Undefined behaviour if not..
23634 -static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
23635 -static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
23636 -static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
23637 -static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
23640 - * The following only works if pte_present() is not true.
23642 -static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
23644 -static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
23645 -static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
23646 -static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
23647 -static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
23648 -static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
23649 -static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
23650 -static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
23652 #ifdef CONFIG_X86_PAE
23653 # include <asm/pgtable-3level.h>
23655 @@ -250,111 +107,6 @@ static inline pte_t pte_mkhuge(pte_t pte
23659 - * Rules for using pte_update - it must be called after any PTE update which
23660 - * has not been done using the set_pte / clear_pte interfaces. It is used by
23661 - * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
23662 - * updates should either be sets, clears, or set_pte_atomic for P->P
23663 - * transitions, which means this hook should only be called for user PTEs.
23664 - * This hook implies a P->P protection or access change has taken place, which
23665 - * requires a subsequent TLB flush. The notification can optionally be delayed
23666 - * until the TLB flush event by using the pte_update_defer form of the
23667 - * interface, but care must be taken to assure that the flush happens while
23668 - * still holding the same page table lock so that the shadow and primary pages
23669 - * do not become out of sync on SMP.
23671 -#define pte_update(mm, addr, ptep) do { } while (0)
23672 -#define pte_update_defer(mm, addr, ptep) do { } while (0)
23674 -/* local pte updates need not use xchg for locking */
23675 -static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
23677 - xen_set_pte(ptep, __pte(0));
23682 - * We only update the dirty/accessed state if we set
23683 - * the dirty bit by hand in the kernel, since the hardware
23684 - * will do the accessed bit for us, and we don't want to
23685 - * race with other CPU's that might be updating the dirty
23686 - * bit at the same time.
23688 -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
23689 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
23691 - int __changed = !pte_same(*(ptep), entry); \
23692 - if (__changed && (dirty)) { \
23693 - if ( likely((vma)->vm_mm == current->mm) ) { \
23694 - BUG_ON(HYPERVISOR_update_va_mapping(address, \
23696 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23697 - UVMF_INVLPG|UVMF_MULTI)); \
23699 - xen_l1_entry_update(ptep, entry); \
23700 - flush_tlb_page(vma, address); \
23706 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
23707 -#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
23709 - if (pte_young(*(ptep))) \
23710 - __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
23711 - &(ptep)->pte_low); \
23713 - pte_update((vma)->vm_mm, addr, ptep); \
23717 -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
23718 -#define ptep_clear_flush_young(vma, address, ptep) \
23720 - pte_t __pte = *(ptep); \
23721 - int __young = pte_young(__pte); \
23722 - __pte = pte_mkold(__pte); \
23723 - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
23724 - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
23725 - else if (__young) \
23726 - (ptep)->pte_low = __pte.pte_low; \
23730 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
23731 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23733 - pte_t pte = *ptep;
23734 - if (!pte_none(pte)
23735 - && (mm != &init_mm
23736 - || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
23737 - pte = xen_ptep_get_and_clear(ptep, pte);
23738 - pte_update(mm, addr, ptep);
23743 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
23744 -#define ptep_get_and_clear_full(mm, addr, ptep, full) \
23746 - pte_t __res = *(ptep); \
23747 - if (PagePinned(virt_to_page((mm)->pgd))) \
23748 - xen_l1_entry_update(ptep, __pte(0)); \
23750 - *(ptep) = __pte(0); \
23753 - ptep_get_and_clear(mm, addr, ptep))
23755 -#define __HAVE_ARCH_PTEP_SET_WRPROTECT
23756 -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23758 - pte_t pte = *ptep;
23759 - if (pte_write(pte))
23760 - set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
23764 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
23766 * dst - pointer to pgd range anwhere on a pgd page
23767 @@ -383,26 +135,6 @@ static inline void clone_pgd_range(pgd_t
23769 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
23771 -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
23774 - * Since this might change the present bit (which controls whether
23775 - * a pte_t object has undergone p2m translation), we must use
23776 - * pte_val() on the input pte and __pte() for the return value.
23778 - paddr_t pteval = pte_val(pte);
23780 - pteval &= _PAGE_CHG_MASK;
23781 - pteval |= pgprot_val(newprot);
23782 -#ifdef CONFIG_X86_PAE
23783 - pteval &= __supported_pte_mask;
23785 - return __pte(pteval);
23788 -#define pmd_large(pmd) \
23789 -((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
23792 * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
23794 @@ -424,6 +156,8 @@ static inline pte_t pte_modify(pte_t pte
23796 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
23798 +static inline int pud_large(pud_t pud) { return 0; }
23801 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
23803 @@ -449,26 +183,6 @@ static inline pte_t pte_modify(pte_t pte
23804 #define pmd_page_vaddr(pmd) \
23805 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
23808 - * Helper function that returns the kernel pagetable entry controlling
23809 - * the virtual address 'address'. NULL means no pagetable entry present.
23810 - * NOTE: the return type is pte_t but if the pmd is PSE then we return it
23813 -extern pte_t *lookup_address(unsigned long address);
23816 - * Make a given kernel text page executable/non-executable.
23817 - * Returns the previous executability setting of that page (which
23818 - * is used to restore the previous state). Used by the SMP bootup code.
23819 - * NOTE: this is an __init function for security reasons.
23821 -#ifdef CONFIG_X86_PAE
23822 - extern int set_kernel_exec(unsigned long vaddr, int enable);
23824 - static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
23827 #if defined(CONFIG_HIGHPTE)
23828 #define pte_offset_map(dir, address) \
23829 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
23830 @@ -496,59 +210,22 @@ extern pte_t *lookup_address(unsigned lo
23832 #define update_mmu_cache(vma,address,pte) do { } while (0)
23834 -#include <xen/features.h>
23835 void make_lowmem_page_readonly(void *va, unsigned int feature);
23836 void make_lowmem_page_writable(void *va, unsigned int feature);
23837 -void make_page_readonly(void *va, unsigned int feature);
23838 -void make_page_writable(void *va, unsigned int feature);
23839 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
23840 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
23842 -#define virt_to_ptep(va) \
23844 - pte_t *__ptep = lookup_address((unsigned long)(va)); \
23845 - BUG_ON(!__ptep || !pte_present(*__ptep)); \
23849 -#define arbitrary_virt_to_machine(va) \
23850 - (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
23851 - | ((unsigned long)(va) & (PAGE_SIZE - 1)))
23853 #endif /* !__ASSEMBLY__ */
23856 + * kern_addr_valid() is (1) for FLATMEM and (0) for
23857 + * SPARSEMEM and DISCONTIGMEM
23859 #ifdef CONFIG_FLATMEM
23860 #define kern_addr_valid(addr) (1)
23861 -#endif /* CONFIG_FLATMEM */
23863 -int direct_remap_pfn_range(struct vm_area_struct *vma,
23864 - unsigned long address,
23865 - unsigned long mfn,
23866 - unsigned long size,
23869 -int direct_kernel_remap_pfn_range(unsigned long address,
23870 - unsigned long mfn,
23871 - unsigned long size,
23874 -int create_lookup_pte_addr(struct mm_struct *mm,
23875 - unsigned long address,
23877 -int touch_pte_range(struct mm_struct *mm,
23878 - unsigned long address,
23879 - unsigned long size);
23881 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
23882 - unsigned long addr, unsigned long end, pgprot_t newprot,
23883 - int dirty_accountable);
23885 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
23886 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
23888 +#define kern_addr_valid(kaddr) (0)
23891 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
23892 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
23894 -#include <asm-generic/pgtable.h>
23896 #endif /* _I386_PGTABLE_H */
23897 --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
23898 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
23899 @@ -18,16 +18,18 @@
23900 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
23901 &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
23903 -#define pud_none(pud) 0
23904 -#define pud_bad(pud) 0
23905 -#define pud_present(pud) 1
23908 - * All present pages with !NX bit are kernel-executable:
23910 -static inline int pte_exec_kernel(pte_t pte)
23911 +static inline int pud_none(pud_t pud)
23913 + return __pud_val(pud) == 0;
23915 +static inline int pud_bad(pud_t pud)
23917 - return !(__pte_val(pte) & _PAGE_NX);
23918 + return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
23920 +static inline int pud_present(pud_t pud)
23922 + return __pud_val(pud) & _PAGE_PRESENT;
23925 /* Rules for using set_pte: the pte being assigned *must* be
23926 @@ -44,14 +46,6 @@ static inline void xen_set_pte(pte_t *pt
23927 ptep->pte_low = pte.pte_low;
23930 -static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23931 - pte_t *ptep , pte_t pte)
23933 - if ((mm != current->mm && mm != &init_mm) ||
23934 - HYPERVISOR_update_va_mapping(addr, pte, 0))
23935 - xen_set_pte(ptep, pte);
23938 static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
23940 set_64bit((unsigned long long *)(ptep),__pte_val(pte));
23941 @@ -70,14 +64,11 @@ static inline void xen_set_pud(pud_t *pu
23942 * entry, so clear the bottom half first and enforce ordering with a compiler
23945 -static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23946 +static inline void __xen_pte_clear(pte_t *ptep)
23948 - if ((mm != current->mm && mm != &init_mm)
23949 - || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
23950 - ptep->pte_low = 0;
23952 - ptep->pte_high = 0;
23954 + ptep->pte_low = 0;
23956 + ptep->pte_high = 0;
23959 static inline void xen_pmd_clear(pmd_t *pmd)
23960 @@ -85,21 +76,25 @@ static inline void xen_pmd_clear(pmd_t *
23961 xen_l2_entry_update(pmd, __pmd(0));
23964 -#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
23965 -#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
23966 -#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte)
23967 -#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
23968 -#define set_pud(pudp, pud) xen_set_pud(pudp, pud)
23969 -#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
23970 -#define pmd_clear(pmd) xen_pmd_clear(pmd)
23971 +static inline void pud_clear(pud_t *pudp)
23975 + set_pud(pudp, __pud(0));
23978 - * Pentium-II erratum A13: in PAE mode we explicitly have to flush
23979 - * the TLB via cr3 if the top-level pgd is changed...
23980 - * We do not let the generic code free and clear pgd entries due to
23983 -static inline void pud_clear (pud_t * pud) { }
23985 + * According to Intel App note "TLBs, Paging-Structure Caches,
23986 + * and Their Invalidation", April 2007, document 317080-001,
23987 + * section 8.1: in PAE mode we explicitly have to flush the
23988 + * TLB via cr3 if the top-level pgd is changed...
23990 + * Make sure the pud entry we're updating is within the
23991 + * current pgd to avoid unnecessary TLB flushes.
23993 + pgd = read_cr3();
23994 + if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
23998 #define pud_page(pud) \
23999 ((struct page *) __va(pud_val(pud) & PAGE_MASK))
24000 @@ -128,24 +123,6 @@ static inline pte_t xen_ptep_get_and_cle
24001 #define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
24004 -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24005 -#define ptep_clear_flush(vma, addr, ptep) \
24007 - pte_t *__ptep = (ptep); \
24008 - pte_t __res = *__ptep; \
24009 - if (!pte_none(__res) && \
24010 - ((vma)->vm_mm != current->mm || \
24011 - HYPERVISOR_update_va_mapping(addr, __pte(0), \
24012 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24013 - UVMF_INVLPG|UVMF_MULTI))) { \
24014 - __ptep->pte_low = 0; \
24016 - __ptep->pte_high = 0; \
24017 - flush_tlb_page(vma, addr); \
24022 #define __HAVE_ARCH_PTE_SAME
24023 static inline int pte_same(pte_t a, pte_t b)
24025 @@ -168,26 +145,12 @@ static inline int pte_none(pte_t pte)
24026 mfn_to_local_pfn(__pte_mfn(_pte)) : \
24029 -extern unsigned long long __supported_pte_mask;
24031 -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24033 - return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
24034 - pgprot_val(pgprot)) & __supported_pte_mask);
24037 -static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
24039 - return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
24040 - pgprot_val(pgprot)) & __supported_pte_mask);
24044 * Bits 0, 6 and 7 are taken in the low part of the pte,
24045 * put the 32 bits of offset into the high part.
24047 #define pte_to_pgoff(pte) ((pte).pte_high)
24048 -#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
24049 +#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
24050 #define PTE_FILE_MAX_BITS 32
24052 /* Encode and de-code a swap entry */
24053 @@ -195,8 +158,6 @@ static inline pmd_t pfn_pmd(unsigned lon
24054 #define __swp_offset(x) ((x).val >> 5)
24055 #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
24056 #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
24057 -#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
24059 -#define __pmd_free_tlb(tlb, x) do { } while (0)
24060 +#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
24062 #endif /* _I386_PGTABLE_3LEVEL_H */
24063 --- a/include/asm-x86/mach-xen/asm/pgtable_64.h
24064 +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
24065 @@ -13,47 +13,26 @@
24066 #include <linux/threads.h>
24067 #include <linux/sched.h>
24068 #include <asm/pda.h>
24070 -#include <asm/hypervisor.h>
24073 extern pud_t level3_user_pgt[512];
24075 extern void xen_init_pt(void);
24077 -extern pte_t *lookup_address(unsigned long address);
24079 -#define virt_to_ptep(va) \
24081 - pte_t *__ptep = lookup_address((unsigned long)(va)); \
24082 - BUG_ON(!__ptep || !pte_present(*__ptep)); \
24086 -#define arbitrary_virt_to_machine(va) \
24087 - (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
24088 - | ((unsigned long)(va) & (PAGE_SIZE - 1)))
24091 extern pud_t level3_kernel_pgt[512];
24092 extern pud_t level3_ident_pgt[512];
24093 extern pmd_t level2_kernel_pgt[512];
24094 extern pgd_t init_level4_pgt[];
24095 -extern unsigned long __supported_pte_mask;
24097 #define swapper_pg_dir init_level4_pgt
24099 extern void paging_init(void);
24100 -extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
24103 - * ZERO_PAGE is a global shared page that is always zero: used
24104 - * for zero-mapped memory areas etc..
24106 -extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
24107 -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
24109 #endif /* !__ASSEMBLY__ */
24111 +#define SHARED_KERNEL_PMD 1
24114 * PGDIR_SHIFT determines what a top-level page table entry can map
24116 @@ -96,31 +75,63 @@ extern unsigned long empty_zero_page[PAG
24117 #define pgd_none(x) (!__pgd_val(x))
24118 #define pud_none(x) (!__pud_val(x))
24120 -static inline void set_pte(pte_t *dst, pte_t val)
24123 +#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
24125 +static inline void xen_set_pte(pte_t *ptep, pte_t pte)
24130 +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
24132 + xen_set_pte(ptep, pte);
24136 +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
24138 + return __pte_ma(xchg(&xp->pte, 0));
24141 +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
24144 +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
24147 + xen_l2_entry_update(pmdp, pmd);
24150 -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
24151 -#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
24152 -#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
24153 +static inline void xen_pmd_clear(pmd_t *pmd)
24155 + xen_set_pmd(pmd, xen_make_pmd(0));
24158 +static inline void xen_set_pud(pud_t *pudp, pud_t pud)
24160 + xen_l3_entry_update(pudp, pud);
24163 -static inline void pud_clear (pud_t * pud)
24164 +static inline void xen_pud_clear(pud_t *pud)
24166 - set_pud(pud, __pud(0));
24167 + xen_set_pud(pud, xen_make_pud(0));
24170 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
24172 -static inline void pgd_clear (pgd_t * pgd)
24173 +static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
24175 - set_pgd(pgd, __pgd(0));
24176 - set_pgd(__user_pgd(pgd), __pgd(0));
24177 + xen_l4_entry_update(pgdp, pgd);
24180 -#define pte_same(a, b) ((a).pte == (b).pte)
24181 +static inline void xen_pgd_clear(pgd_t * pgd)
24183 + xen_set_pgd(pgd, xen_make_pgd(0));
24184 + xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
24187 -#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
24188 +#define pte_same(a, b) ((a).pte == (b).pte)
24190 #endif /* !__ASSEMBLY__ */
24192 @@ -131,8 +142,6 @@ static inline void pgd_clear (pgd_t * pg
24193 #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
24194 #define PGDIR_MASK (~(PGDIR_SIZE-1))
24196 -#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
24197 -#define FIRST_USER_ADDRESS 0
24199 #define MAXMEM _AC(0x3fffffffffff, UL)
24200 #define VMALLOC_START _AC(0xffffc20000000000, UL)
24201 @@ -142,105 +151,6 @@ static inline void pgd_clear (pgd_t * pg
24202 #define MODULES_END _AC(0xfffffffffff00000, UL)
24203 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
24205 -#define _PAGE_BIT_PRESENT 0
24206 -#define _PAGE_BIT_RW 1
24207 -#define _PAGE_BIT_USER 2
24208 -#define _PAGE_BIT_PWT 3
24209 -#define _PAGE_BIT_PCD 4
24210 -#define _PAGE_BIT_ACCESSED 5
24211 -#define _PAGE_BIT_DIRTY 6
24212 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
24213 -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
24214 -#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
24216 -#define _PAGE_PRESENT 0x001
24217 -#define _PAGE_RW 0x002
24218 -#define _PAGE_USER 0x004
24219 -#define _PAGE_PWT 0x008
24220 -#define _PAGE_PCD 0x010
24221 -#define _PAGE_ACCESSED 0x020
24222 -#define _PAGE_DIRTY 0x040
24223 -#define _PAGE_PSE 0x080 /* 2MB page */
24224 -#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
24225 -#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
24227 -#define _PAGE_PROTNONE 0x080 /* If not present */
24228 -#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX)
24230 -/* Mapped page is I/O or foreign and has no associated page struct. */
24231 -#define _PAGE_IO 0x200
24233 -#ifndef __ASSEMBLY__
24234 -#if CONFIG_XEN_COMPAT <= 0x030002
24235 -extern unsigned int __kernel_page_user;
24237 -#define __kernel_page_user 0
24241 -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
24242 -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
24244 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
24246 -#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
24247 -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24248 -#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
24249 -#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24250 -#define PAGE_COPY PAGE_COPY_NOEXEC
24251 -#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24252 -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24253 -#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24254 -#define __PAGE_KERNEL \
24255 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24256 -#define __PAGE_KERNEL_EXEC \
24257 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
24258 -#define __PAGE_KERNEL_NOCACHE \
24259 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24260 -#define __PAGE_KERNEL_RO \
24261 - (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24262 -#define __PAGE_KERNEL_VSYSCALL \
24263 - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24264 -#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
24265 - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
24266 -#define __PAGE_KERNEL_LARGE \
24267 - (__PAGE_KERNEL | _PAGE_PSE)
24268 -#define __PAGE_KERNEL_LARGE_EXEC \
24269 - (__PAGE_KERNEL_EXEC | _PAGE_PSE)
24272 - * We don't support GLOBAL page in xenolinux64
24274 -#define MAKE_GLOBAL(x) __pgprot((x))
24276 -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
24277 -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
24278 -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
24279 -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
24280 -#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
24281 -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
24282 -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
24283 -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
24286 -#define __P000 PAGE_NONE
24287 -#define __P001 PAGE_READONLY
24288 -#define __P010 PAGE_COPY
24289 -#define __P011 PAGE_COPY
24290 -#define __P100 PAGE_READONLY_EXEC
24291 -#define __P101 PAGE_READONLY_EXEC
24292 -#define __P110 PAGE_COPY_EXEC
24293 -#define __P111 PAGE_COPY_EXEC
24295 -#define __S000 PAGE_NONE
24296 -#define __S001 PAGE_READONLY
24297 -#define __S010 PAGE_SHARED
24298 -#define __S011 PAGE_SHARED
24299 -#define __S100 PAGE_READONLY_EXEC
24300 -#define __S101 PAGE_READONLY_EXEC
24301 -#define __S110 PAGE_SHARED_EXEC
24302 -#define __S111 PAGE_SHARED_EXEC
24304 #ifndef __ASSEMBLY__
24306 static inline unsigned long pgd_bad(pgd_t pgd)
24307 @@ -258,119 +168,26 @@ static inline unsigned long pmd_bad(pmd_
24308 return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
24311 -#define set_pte_at(_mm,addr,ptep,pteval) do { \
24312 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
24313 - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
24314 - set_pte((ptep), (pteval)); \
24317 #define pte_none(x) (!(x).pte)
24318 #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
24319 -#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
24321 -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
24322 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
24324 #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
24325 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
24326 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
24327 -#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \
24328 +#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
24329 (_pte).pte & _PAGE_PRESENT ? \
24330 mfn_to_local_pfn(__pte_mfn(_pte)) : \
24333 #define pte_page(x) pfn_to_page(pte_pfn(x))
24335 -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24337 - unsigned long pte = page_nr << PAGE_SHIFT;
24338 - pte |= pgprot_val(pgprot);
24339 - pte &= __supported_pte_mask;
24340 - return __pte(pte);
24343 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24345 - pte_t pte = *ptep;
24346 - if (!pte_none(pte)) {
24347 - if ((mm != &init_mm) ||
24348 - HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
24349 - pte = __pte_ma(xchg(&ptep->pte, 0));
24354 -static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
24357 - pte_t pte = *ptep;
24358 - if (PagePinned(virt_to_page(mm->pgd)))
24359 - xen_l1_entry_update(ptep, __pte(0));
24361 - *ptep = __pte(0);
24364 - return ptep_get_and_clear(mm, addr, ptep);
24367 -#define ptep_clear_flush(vma, addr, ptep) \
24369 - pte_t *__ptep = (ptep); \
24370 - pte_t __res = *__ptep; \
24371 - if (!pte_none(__res) && \
24372 - ((vma)->vm_mm != current->mm || \
24373 - HYPERVISOR_update_va_mapping(addr, __pte(0), \
24374 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24375 - UVMF_INVLPG|UVMF_MULTI))) { \
24376 - __ptep->pte = 0; \
24377 - flush_tlb_page(vma, addr); \
24383 - * The following only work if pte_present() is true.
24384 - * Undefined behaviour if not..
24386 -#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
24387 -static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
24388 -static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
24389 -static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
24390 -static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
24391 -static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
24393 -static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
24394 -static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
24395 -static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
24396 -static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; }
24397 -static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
24398 -static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
24399 -static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
24400 -static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
24401 -static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
24403 -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
24405 - if (!pte_young(*ptep))
24407 - return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
24410 -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24412 - pte_t pte = *ptep;
24413 - if (pte_write(pte))
24414 - set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
24418 * Macro to mark a page protection value as "uncacheable".
24420 #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
24422 -static inline int pmd_large(pmd_t pte) {
24423 - return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
24428 * Conversion functions: convert a page and protection to a page entry,
24429 @@ -386,6 +203,7 @@ static inline int pmd_large(pmd_t pte) {
24430 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
24431 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
24432 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
24433 +static inline int pgd_large(pgd_t pgd) { return 0; }
24434 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
24436 /* PUD - Level3 access */
24437 @@ -396,6 +214,12 @@ static inline int pmd_large(pmd_t pte) {
24438 #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
24439 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
24441 +static inline int pud_large(pud_t pte)
24443 + return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
24444 + (_PAGE_PSE|_PAGE_PRESENT);
24447 /* PMD - Level 2 access */
24448 #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
24449 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
24450 @@ -411,36 +235,18 @@ static inline int pmd_large(pmd_t pte) {
24452 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
24454 -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
24455 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
24456 #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
24458 #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
24459 -#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
24460 +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
24461 #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
24463 /* PTE - Level 1 access. */
24465 /* page, protection -> pte */
24466 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
24467 -#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
24469 -/* Change flags of a PTE */
24470 -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24473 - * Since this might change the present bit (which controls whether
24474 - * a pte_t object has undergone p2m translation), we must use
24475 - * pte_val() on the input pte and __pte() for the return value.
24477 - unsigned long pteval = pte_val(pte);
24479 - pteval &= _PAGE_CHG_MASK;
24480 - pteval |= pgprot_val(newprot);
24481 - pteval &= __supported_pte_mask;
24482 - return __pte(pteval);
24485 #define pte_index(address) \
24486 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
24487 #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
24488 @@ -454,101 +260,21 @@ static inline pte_t pte_modify(pte_t pte
24490 #define update_mmu_cache(vma,address,pte) do { } while (0)
24493 - * Rules for using ptep_establish: the pte MUST be a user pte, and
24494 - * must be a present->present transition.
24496 -#define __HAVE_ARCH_PTEP_ESTABLISH
24497 -#define ptep_establish(vma, address, ptep, pteval) \
24499 - if ( likely((vma)->vm_mm == current->mm) ) { \
24500 - BUG_ON(HYPERVISOR_update_va_mapping(address, \
24502 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24503 - UVMF_INVLPG|UVMF_MULTI)); \
24505 - xen_l1_entry_update(ptep, pteval); \
24506 - flush_tlb_page(vma, address); \
24510 -/* We only update the dirty/accessed state if we set
24511 - * the dirty bit by hand in the kernel, since the hardware
24512 - * will do the accessed bit for us, and we don't want to
24513 - * race with other CPU's that might be updating the dirty
24514 - * bit at the same time. */
24515 -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
24516 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
24518 - int __changed = !pte_same(*(ptep), entry); \
24519 - if (__changed && (dirty)) \
24520 - ptep_establish(vma, address, ptep, entry); \
24524 -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24525 -#define ptep_clear_flush_young(vma, address, ptep) \
24527 - pte_t __pte = *(ptep); \
24528 - int __young = pte_young(__pte); \
24529 - __pte = pte_mkold(__pte); \
24530 - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
24531 - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24532 - else if (__young) \
24533 - set_pte(ptep, __pte); \
24537 /* Encode and de-code a swap entry */
24538 #define __swp_type(x) (((x).val >> 1) & 0x3f)
24539 #define __swp_offset(x) ((x).val >> 8)
24540 #define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
24541 #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
24542 -#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
24544 -extern spinlock_t pgd_lock;
24545 -extern struct list_head pgd_list;
24546 +#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
24548 extern int kern_addr_valid(unsigned long addr);
24550 -#define DOMID_LOCAL (0xFFFFU)
24552 -struct vm_area_struct;
24554 -int direct_remap_pfn_range(struct vm_area_struct *vma,
24555 - unsigned long address,
24556 - unsigned long mfn,
24557 - unsigned long size,
24561 -int direct_kernel_remap_pfn_range(unsigned long address,
24562 - unsigned long mfn,
24563 - unsigned long size,
24567 -int create_lookup_pte_addr(struct mm_struct *mm,
24568 - unsigned long address,
24571 -int touch_pte_range(struct mm_struct *mm,
24572 - unsigned long address,
24573 - unsigned long size);
24575 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
24576 - unsigned long addr, unsigned long end, pgprot_t newprot,
24577 - int dirty_accountable);
24579 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
24580 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
24582 -pte_t *lookup_address(unsigned long addr);
24583 +extern void cleanup_highmap(void);
24585 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
24586 direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
24588 #define HAVE_ARCH_UNMAPPED_AREA
24589 +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
24591 #define pgtable_cache_init() do { } while (0)
24592 #define check_pgt_cache() do { } while (0)
24593 @@ -561,13 +287,7 @@ pte_t *lookup_address(unsigned long addr
24594 #define kc_offset_to_vaddr(o) \
24595 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
24597 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
24598 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24599 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24600 -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24601 -#define __HAVE_ARCH_PTEP_SET_WRPROTECT
24602 #define __HAVE_ARCH_PTE_SAME
24603 -#include <asm-generic/pgtable.h>
24604 #endif /* !__ASSEMBLY__ */
24606 #endif /* _X86_64_PGTABLE_H */
24607 --- a/include/asm-x86/mach-xen/asm/pgtable.h
24608 +++ b/include/asm-x86/mach-xen/asm/pgtable.h
24610 +#ifndef _ASM_X86_PGTABLE_H
24611 +#define _ASM_X86_PGTABLE_H
24613 +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
24614 +#define FIRST_USER_ADDRESS 0
24616 +#define _PAGE_BIT_PRESENT 0
24617 +#define _PAGE_BIT_RW 1
24618 +#define _PAGE_BIT_USER 2
24619 +#define _PAGE_BIT_PWT 3
24620 +#define _PAGE_BIT_PCD 4
24621 +#define _PAGE_BIT_ACCESSED 5
24622 +#define _PAGE_BIT_DIRTY 6
24623 +#define _PAGE_BIT_FILE 6
24624 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
24625 +#define _PAGE_BIT_PAT 7 /* on 4KB pages */
24626 +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
24627 +#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
24628 + * has no associated page struct. */
24629 +#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
24630 +#define _PAGE_BIT_UNUSED3 11
24631 +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
24632 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
24635 + * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
24636 + * sign-extended value on 32-bit with all 1's in the upper word,
24637 + * which preserves the upper pte values on 64-bit ptes:
24639 +#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
24640 +#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
24641 +#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
24642 +#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
24643 +#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
24644 +#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
24645 +#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
24646 +#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
24647 +#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
24648 +#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
24649 +#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
24650 +#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
24651 +#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
24652 +#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
24654 +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
24655 +#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
24657 +#define _PAGE_NX 0
24660 +/* If _PAGE_PRESENT is clear, we use these: */
24661 +#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
24662 +#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
24663 + pte_present gives true */
24665 +#ifndef __ASSEMBLY__
24666 +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
24667 +extern unsigned int __kernel_page_user;
24669 +#define __kernel_page_user 0
24673 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
24674 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
24676 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
24678 +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
24679 +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24681 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
24682 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24683 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24684 +#define PAGE_COPY PAGE_COPY_NOEXEC
24685 +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24686 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24688 +#ifdef CONFIG_X86_32
24689 +#define _PAGE_KERNEL_EXEC \
24690 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
24691 +#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
24693 +#ifndef __ASSEMBLY__
24694 +extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
24695 +#endif /* __ASSEMBLY__ */
24697 +#define __PAGE_KERNEL_EXEC \
24698 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
24699 +#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
24702 +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
24703 +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
24704 +#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
24705 +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
24706 +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
24707 +#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
24708 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
24709 +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
24710 +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
24713 + * We don't support GLOBAL page in xenolinux64
24715 +#define MAKE_GLOBAL(x) __pgprot((x))
24717 +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
24718 +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
24719 +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
24720 +#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
24721 +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
24722 +#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
24723 +#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
24724 +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
24725 +#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
24726 +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
24727 +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
24730 +#define __P000 PAGE_NONE
24731 +#define __P001 PAGE_READONLY
24732 +#define __P010 PAGE_COPY
24733 +#define __P011 PAGE_COPY
24734 +#define __P100 PAGE_READONLY_EXEC
24735 +#define __P101 PAGE_READONLY_EXEC
24736 +#define __P110 PAGE_COPY_EXEC
24737 +#define __P111 PAGE_COPY_EXEC
24739 +#define __S000 PAGE_NONE
24740 +#define __S001 PAGE_READONLY
24741 +#define __S010 PAGE_SHARED
24742 +#define __S011 PAGE_SHARED
24743 +#define __S100 PAGE_READONLY_EXEC
24744 +#define __S101 PAGE_READONLY_EXEC
24745 +#define __S110 PAGE_SHARED_EXEC
24746 +#define __S111 PAGE_SHARED_EXEC
24748 +#ifndef __ASSEMBLY__
24751 + * ZERO_PAGE is a global shared page that is always zero: used
24752 + * for zero-mapped memory areas etc..
24754 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
24755 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
24757 +extern spinlock_t pgd_lock;
24758 +extern struct list_head pgd_list;
24761 + * The following only work if pte_present() is true.
24762 + * Undefined behaviour if not..
24764 +static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
24765 +static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
24766 +static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
24767 +static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
24768 +static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
24769 +static inline int pte_global(pte_t pte) { return 0; }
24770 +static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
24772 +static inline int pmd_large(pmd_t pte) {
24773 + return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
24774 + (_PAGE_PSE|_PAGE_PRESENT);
24777 +static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
24778 +static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
24779 +static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
24780 +static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
24781 +static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
24782 +static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
24783 +static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
24784 +static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
24785 +static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
24786 +static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
24787 +static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
24789 +extern pteval_t __supported_pte_mask;
24791 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24793 + return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
24794 + pgprot_val(pgprot)) & __supported_pte_mask);
24797 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
24799 + return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
24800 + pgprot_val(pgprot)) & __supported_pte_mask);
24803 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
24805 + return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
24806 + pgprot_val(pgprot)) & __supported_pte_mask);
24809 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24811 + pteval_t val = pte_val(pte);
24813 + val &= _PAGE_CHG_MASK;
24814 + val |= pgprot_val(newprot) & __supported_pte_mask;
24816 + return __pte(val);
24819 +#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
24821 +#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
24823 +#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
24824 +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
24826 +#define set_pte_atomic(ptep, pte) \
24827 + xen_set_pte_atomic(ptep, pte)
24829 +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
24831 +#ifndef __PAGETABLE_PUD_FOLDED
24832 +#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
24833 +#define pgd_clear(pgd) xen_pgd_clear(pgd)
24837 +# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
24840 +#ifndef __PAGETABLE_PMD_FOLDED
24841 +#define pud_clear(pud) xen_pud_clear(pud)
24844 +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
24845 +#define pmd_clear(pmd) xen_pmd_clear(pmd)
24847 +#define pte_update(mm, addr, ptep) do { } while (0)
24848 +#define pte_update_defer(mm, addr, ptep) do { } while (0)
24850 +#endif /* __ASSEMBLY__ */
24852 #ifdef CONFIG_X86_32
24853 # include "pgtable_32.h"
24855 # include "pgtable_64.h"
24858 +#ifndef __ASSEMBLY__
24868 + * Helper function that returns the kernel pagetable entry controlling
24869 + * the virtual address 'address'. NULL means no pagetable entry present.
24870 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
24873 +extern pte_t *lookup_address(unsigned long address, unsigned int *level);
24875 +/* local pte updates need not use xchg for locking */
24876 +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
24878 + xen_set_pte(ptep, __pte(0));
24882 +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
24883 + pte_t *ptep , pte_t pte)
24885 + if ((mm != current->mm && mm != &init_mm) ||
24886 + HYPERVISOR_update_va_mapping(addr, pte, 0))
24887 + xen_set_pte(ptep, pte);
24890 +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
24893 + if ((mm != current->mm && mm != &init_mm)
24894 + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
24895 + __xen_pte_clear(ptep);
24898 +#ifndef CONFIG_PARAVIRT
24900 + * Rules for using pte_update - it must be called after any PTE update which
24901 + * has not been done using the set_pte / clear_pte interfaces. It is used by
24902 + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
24903 + * updates should either be sets, clears, or set_pte_atomic for P->P
24904 + * transitions, which means this hook should only be called for user PTEs.
24905 + * This hook implies a P->P protection or access change has taken place, which
24906 + * requires a subsequent TLB flush. The notification can optionally be delayed
24907 + * until the TLB flush event by using the pte_update_defer form of the
24908 + * interface, but care must be taken to assure that the flush happens while
24909 + * still holding the same page table lock so that the shadow and primary pages
24910 + * do not become out of sync on SMP.
24912 +#define pte_update(mm, addr, ptep) do { } while (0)
24913 +#define pte_update_defer(mm, addr, ptep) do { } while (0)
24917 + * We only update the dirty/accessed state if we set
24918 + * the dirty bit by hand in the kernel, since the hardware
24919 + * will do the accessed bit for us, and we don't want to
24920 + * race with other CPU's that might be updating the dirty
24921 + * bit at the same time.
24923 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
24924 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
24926 + int __changed = !pte_same(*(ptep), entry); \
24927 + if (__changed && (dirty)) { \
24928 + if ( likely((vma)->vm_mm == current->mm) ) { \
24929 + BUG_ON(HYPERVISOR_update_va_mapping(address, \
24931 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24932 + UVMF_INVLPG|UVMF_MULTI)); \
24934 + xen_l1_entry_update(ptep, entry); \
24935 + flush_tlb_page(vma, address); \
24941 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
24942 +#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
24944 + if (pte_young(*(ptep))) \
24945 + __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
24948 + pte_update((vma)->vm_mm, addr, ptep); \
24952 +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24953 +#define ptep_clear_flush_young(vma, address, ptep) \
24955 + pte_t __pte = *(ptep); \
24956 + int __young = pte_young(__pte); \
24957 + __pte = pte_mkold(__pte); \
24958 + if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
24959 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24960 + else if (__young) \
24961 + (ptep)->pte_low = __pte.pte_low; \
24965 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24966 +#define ptep_clear_flush(vma, addr, ptep) \
24968 + pte_t *__ptep = (ptep); \
24969 + pte_t __res = *__ptep; \
24970 + if (!pte_none(__res) && \
24971 + ((vma)->vm_mm != current->mm || \
24972 + HYPERVISOR_update_va_mapping(addr, __pte(0), \
24973 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24974 + UVMF_INVLPG|UVMF_MULTI))) { \
24975 + __xen_pte_clear(__ptep); \
24976 + flush_tlb_page(vma, addr); \
24981 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24982 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24984 + pte_t pte = *ptep;
24985 + if (!pte_none(pte)
24986 + && (mm != &init_mm
24987 + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
24988 + pte = xen_ptep_get_and_clear(ptep, pte);
24989 + pte_update(mm, addr, ptep);
24994 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24995 +#define ptep_get_and_clear_full(mm, addr, ptep, full) \
24997 + pte_t *__ptep = (ptep); \
24998 + pte_t __res = *__ptep; \
24999 + if (!PagePinned(virt_to_page((mm)->pgd))) \
25000 + __xen_pte_clear(__ptep); \
25001 + else if (!pte_none(__res)) \
25002 + xen_l1_entry_update(__ptep, __pte(0)); \
25005 + ptep_get_and_clear(mm, addr, ptep))
25007 +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
25009 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
25010 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
25012 + pte_t pte = *ptep;
25013 + if (pte_write(pte))
25014 + set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
25017 +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
25018 + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
25020 +#define arbitrary_virt_to_machine(va) \
25022 + unsigned int __lvl; \
25023 + pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \
25024 + BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
25025 + (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \
25026 + | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
25029 +#include <asm-generic/pgtable.h>
25031 +#include <xen/features.h>
25032 +void make_page_readonly(void *va, unsigned int feature);
25033 +void make_page_writable(void *va, unsigned int feature);
25034 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
25035 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
25037 +struct vm_area_struct;
25039 +int direct_remap_pfn_range(struct vm_area_struct *vma,
25040 + unsigned long address,
25041 + unsigned long mfn,
25042 + unsigned long size,
25045 +int direct_kernel_remap_pfn_range(unsigned long address,
25046 + unsigned long mfn,
25047 + unsigned long size,
25050 +int create_lookup_pte_addr(struct mm_struct *mm,
25051 + unsigned long address,
25053 +int touch_pte_range(struct mm_struct *mm,
25054 + unsigned long address,
25055 + unsigned long size);
25057 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
25058 + unsigned long addr, unsigned long end, pgprot_t newprot,
25059 + int dirty_accountable);
25061 +#endif /* __ASSEMBLY__ */
25063 +#endif /* _ASM_X86_PGTABLE_H */
25064 --- a/include/asm-x86/mach-xen/asm/processor_32.h
25068 - * include/asm-i386/processor.h
25070 - * Copyright (C) 1994 Linus Torvalds
25073 -#ifndef __ASM_I386_PROCESSOR_H
25074 -#define __ASM_I386_PROCESSOR_H
25076 -#include <asm/vm86.h>
25077 -#include <asm/math_emu.h>
25078 -#include <asm/segment.h>
25079 -#include <asm/page.h>
25080 -#include <asm/types.h>
25081 -#include <asm/sigcontext.h>
25082 -#include <asm/cpufeature.h>
25083 -#include <asm/msr.h>
25084 -#include <asm/system.h>
25085 -#include <linux/cache.h>
25086 -#include <linux/threads.h>
25087 -#include <asm/percpu.h>
25088 -#include <linux/cpumask.h>
25089 -#include <linux/init.h>
25090 -#include <asm/processor-flags.h>
25091 -#include <xen/interface/physdev.h>
25093 -/* flag for disabling the tsc */
25094 -#define tsc_disable 0
25096 -struct desc_struct {
25097 - unsigned long a,b;
25100 -#define desc_empty(desc) \
25101 - (!((desc)->a | (desc)->b))
25103 -#define desc_equal(desc1, desc2) \
25104 - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
25106 - * Default implementation of macro that returns current
25107 - * instruction pointer ("program counter").
25109 -#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
25112 - * CPU type and hardware bug flags. Kept separately for each CPU.
25113 - * Members of this structure are referenced in head.S, so think twice
25114 - * before touching them. [mj]
25117 -struct cpuinfo_x86 {
25118 - __u8 x86; /* CPU family */
25119 - __u8 x86_vendor; /* CPU vendor */
25122 - char wp_works_ok; /* It doesn't on 386's */
25123 - char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
25126 - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
25127 - unsigned long x86_capability[NCAPINTS];
25128 - char x86_vendor_id[16];
25129 - char x86_model_id[64];
25130 - int x86_cache_size; /* in KB - valid for CPUS which support this
25132 - int x86_cache_alignment; /* In bytes */
25138 - unsigned long loops_per_jiffy;
25140 - cpumask_t llc_shared_map; /* cpus sharing the last level cache */
25142 - unsigned char x86_max_cores; /* cpuid returned max cores value */
25143 - unsigned char apicid;
25144 - unsigned short x86_clflush_size;
25146 - unsigned char booted_cores; /* number of cores as seen by OS */
25147 - __u8 phys_proc_id; /* Physical processor id. */
25148 - __u8 cpu_core_id; /* Core id */
25149 - __u8 cpu_index; /* index into per_cpu list */
25151 -} __attribute__((__aligned__(SMP_CACHE_BYTES)));
25153 -#define X86_VENDOR_INTEL 0
25154 -#define X86_VENDOR_CYRIX 1
25155 -#define X86_VENDOR_AMD 2
25156 -#define X86_VENDOR_UMC 3
25157 -#define X86_VENDOR_NEXGEN 4
25158 -#define X86_VENDOR_CENTAUR 5
25159 -#define X86_VENDOR_TRANSMETA 7
25160 -#define X86_VENDOR_NSC 8
25161 -#define X86_VENDOR_NUM 9
25162 -#define X86_VENDOR_UNKNOWN 0xff
25165 - * capabilities of CPUs
25168 -extern struct cpuinfo_x86 boot_cpu_data;
25169 -extern struct cpuinfo_x86 new_cpu_data;
25170 -#ifndef CONFIG_X86_NO_TSS
25171 -extern struct tss_struct doublefault_tss;
25172 -DECLARE_PER_CPU(struct tss_struct, init_tss);
25176 -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25177 -#define cpu_data(cpu) per_cpu(cpu_info, cpu)
25178 -#define current_cpu_data cpu_data(smp_processor_id())
25180 -#define cpu_data(cpu) boot_cpu_data
25181 -#define current_cpu_data boot_cpu_data
25185 - * the following now lives in the per cpu area:
25186 - * extern int cpu_llc_id[NR_CPUS];
25188 -DECLARE_PER_CPU(u8, cpu_llc_id);
25189 -extern char ignore_fpu_irq;
25191 -void __init cpu_detect(struct cpuinfo_x86 *c);
25193 -extern void identify_boot_cpu(void);
25194 -extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25195 -extern void print_cpu_info(struct cpuinfo_x86 *);
25196 -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25197 -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25198 -extern unsigned short num_cache_leaves;
25200 -#ifdef CONFIG_X86_HT
25201 -extern void detect_ht(struct cpuinfo_x86 *c);
25203 -static inline void detect_ht(struct cpuinfo_x86 *c) {}
25206 -static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
25207 - unsigned int *ecx, unsigned int *edx)
25209 - /* ecx is often an input as well as an output. */
25210 - __asm__(XEN_CPUID
25215 - : "0" (*eax), "2" (*ecx));
25218 -#define load_cr3(pgdir) write_cr3(__pa(pgdir))
25221 - * Save the cr4 feature set we're using (ie
25222 - * Pentium 4MB enable and PPro Global page
25223 - * enable), so that any CPU's that boot up
25224 - * after us can get the correct flags.
25226 -extern unsigned long mmu_cr4_features;
25228 -static inline void set_in_cr4 (unsigned long mask)
25231 - mmu_cr4_features |= mask;
25232 - cr4 = read_cr4();
25237 -static inline void clear_in_cr4 (unsigned long mask)
25240 - mmu_cr4_features &= ~mask;
25241 - cr4 = read_cr4();
25246 -/* Stop speculative execution */
25247 -static inline void sync_core(void)
25250 - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
25253 -static inline void __monitor(const void *eax, unsigned long ecx,
25254 - unsigned long edx)
25256 - /* "monitor %eax,%ecx,%edx;" */
25258 - ".byte 0x0f,0x01,0xc8;"
25259 - : :"a" (eax), "c" (ecx), "d"(edx));
25262 -static inline void __mwait(unsigned long eax, unsigned long ecx)
25264 - /* "mwait %eax,%ecx;" */
25266 - ".byte 0x0f,0x01,0xc9;"
25267 - : :"a" (eax), "c" (ecx));
25270 -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25272 -/* from system description table in BIOS. Mostly for MCA use, but
25273 -others may find it useful. */
25274 -extern unsigned int machine_id;
25275 -extern unsigned int machine_submodel_id;
25276 -extern unsigned int BIOS_revision;
25277 -extern unsigned int mca_pentium_flag;
25279 -/* Boot loader type from the setup header */
25280 -extern int bootloader_type;
25283 - * User space process size: 3GB (default).
25285 -#define TASK_SIZE (PAGE_OFFSET)
25287 -/* This decides where the kernel will search for a free chunk of vm
25288 - * space during mmap's.
25290 -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
25292 -#define HAVE_ARCH_PICK_MMAP_LAYOUT
25294 -extern void hard_disable_TSC(void);
25295 -extern void disable_TSC(void);
25296 -extern void hard_enable_TSC(void);
25299 - * Size of io_bitmap.
25301 -#define IO_BITMAP_BITS 65536
25302 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
25303 -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
25304 -#ifndef CONFIG_X86_NO_TSS
25305 -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
25307 -#define INVALID_IO_BITMAP_OFFSET 0x8000
25308 -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
25310 -struct i387_fsave_struct {
25318 - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25319 - long status; /* software status information */
25322 -struct i387_fxsave_struct {
25323 - unsigned short cwd;
25324 - unsigned short swd;
25325 - unsigned short twd;
25326 - unsigned short fop;
25333 - long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
25334 - long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
25335 - long padding[56];
25336 -} __attribute__ ((aligned (16)));
25338 -struct i387_soft_struct {
25346 - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25347 - unsigned char ftop, changed, lookahead, no_update, rm, alimit;
25348 - struct info *info;
25349 - unsigned long entry_eip;
25352 -union i387_union {
25353 - struct i387_fsave_struct fsave;
25354 - struct i387_fxsave_struct fxsave;
25355 - struct i387_soft_struct soft;
25359 - unsigned long seg;
25362 -struct thread_struct;
25364 -#ifndef CONFIG_X86_NO_TSS
25365 -/* This is the TSS defined by the hardware. */
25366 -struct i386_hw_tss {
25367 - unsigned short back_link,__blh;
25368 - unsigned long esp0;
25369 - unsigned short ss0,__ss0h;
25370 - unsigned long esp1;
25371 - unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
25372 - unsigned long esp2;
25373 - unsigned short ss2,__ss2h;
25374 - unsigned long __cr3;
25375 - unsigned long eip;
25376 - unsigned long eflags;
25377 - unsigned long eax,ecx,edx,ebx;
25378 - unsigned long esp;
25379 - unsigned long ebp;
25380 - unsigned long esi;
25381 - unsigned long edi;
25382 - unsigned short es, __esh;
25383 - unsigned short cs, __csh;
25384 - unsigned short ss, __ssh;
25385 - unsigned short ds, __dsh;
25386 - unsigned short fs, __fsh;
25387 - unsigned short gs, __gsh;
25388 - unsigned short ldt, __ldth;
25389 - unsigned short trace, io_bitmap_base;
25390 -} __attribute__((packed));
25392 -struct tss_struct {
25393 - struct i386_hw_tss x86_tss;
25396 - * The extra 1 is there because the CPU will access an
25397 - * additional byte beyond the end of the IO permission
25398 - * bitmap. The extra byte must be all 1 bits, and must
25399 - * be within the limit.
25401 - unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
25403 - * Cache the current maximum and the last task that used the bitmap:
25405 - unsigned long io_bitmap_max;
25406 - struct thread_struct *io_bitmap_owner;
25408 - * pads the TSS to be cacheline-aligned (size is 0x100)
25410 - unsigned long __cacheline_filler[35];
25412 - * .. and then another 0x100 bytes for emergency kernel stack
25414 - unsigned long stack[64];
25415 -} __attribute__((packed));
25418 -#define ARCH_MIN_TASKALIGN 16
25420 -struct thread_struct {
25421 -/* cached TLS descriptors. */
25422 - struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
25423 - unsigned long esp0;
25424 - unsigned long sysenter_cs;
25425 - unsigned long eip;
25426 - unsigned long esp;
25427 - unsigned long fs;
25428 - unsigned long gs;
25429 -/* Hardware debugging registers */
25430 - unsigned long debugreg[8]; /* %%db0-7 debug registers */
25432 - unsigned long cr2, trap_no, error_code;
25433 -/* floating point info */
25434 - union i387_union i387;
25435 -/* virtual 86 mode info */
25436 - struct vm86_struct __user * vm86_info;
25437 - unsigned long screen_bitmap;
25438 - unsigned long v86flags, v86mask, saved_esp0;
25439 - unsigned int saved_fs, saved_gs;
25440 -/* IO permissions */
25441 - unsigned long *io_bitmap_ptr;
25442 - unsigned long iopl;
25443 -/* max allowed port in the bitmap, in bytes: */
25444 - unsigned long io_bitmap_max;
25447 -#define INIT_THREAD { \
25448 - .esp0 = sizeof(init_stack) + (long)&init_stack, \
25449 - .vm86_info = NULL, \
25450 - .sysenter_cs = __KERNEL_CS, \
25451 - .io_bitmap_ptr = NULL, \
25452 - .fs = __KERNEL_PERCPU, \
25456 - * Note that the .io_bitmap member must be extra-big. This is because
25457 - * the CPU will access an additional byte beyond the end of the IO
25458 - * permission bitmap. The extra byte must be all 1 bits, and must
25459 - * be within the limit.
25461 -#define INIT_TSS { \
25463 - .esp0 = sizeof(init_stack) + (long)&init_stack, \
25464 - .ss0 = __KERNEL_DS, \
25465 - .ss1 = __KERNEL_CS, \
25466 - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
25468 - .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
25471 -#define start_thread(regs, new_eip, new_esp) do { \
25472 - __asm__("movl %0,%%gs": :"r" (0)); \
25474 - set_fs(USER_DS); \
25475 - regs->xds = __USER_DS; \
25476 - regs->xes = __USER_DS; \
25477 - regs->xss = __USER_DS; \
25478 - regs->xcs = __USER_CS; \
25479 - regs->eip = new_eip; \
25480 - regs->esp = new_esp; \
25483 -/* Forward declaration, a strange C thing */
25484 -struct task_struct;
25487 -/* Free all resources held by a thread. */
25488 -extern void release_thread(struct task_struct *);
25490 -/* Prepare to copy thread state - unlazy all lazy status */
25491 -extern void prepare_to_copy(struct task_struct *tsk);
25494 - * create a kernel thread without removing it from tasklists
25496 -extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
25498 -extern unsigned long thread_saved_pc(struct task_struct *tsk);
25499 -void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
25501 -unsigned long get_wchan(struct task_struct *p);
25503 -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
25504 -#define KSTK_TOP(info) \
25506 - unsigned long *__ptr = (unsigned long *)(info); \
25507 - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
25511 - * The below -8 is to reserve 8 bytes on top of the ring0 stack.
25512 - * This is necessary to guarantee that the entire "struct pt_regs"
25513 - * is accessable even if the CPU haven't stored the SS/ESP registers
25514 - * on the stack (interrupt gate does not save these registers
25515 - * when switching to the same priv ring).
25516 - * Therefore beware: accessing the xss/esp fields of the
25517 - * "struct pt_regs" is possible, but they may contain the
25518 - * completely wrong values.
25520 -#define task_pt_regs(task) \
25522 - struct pt_regs *__regs__; \
25523 - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
25527 -#define KSTK_EIP(task) (task_pt_regs(task)->eip)
25528 -#define KSTK_ESP(task) (task_pt_regs(task)->esp)
25531 -struct microcode_header {
25532 - unsigned int hdrver;
25533 - unsigned int rev;
25534 - unsigned int date;
25535 - unsigned int sig;
25536 - unsigned int cksum;
25537 - unsigned int ldrver;
25539 - unsigned int datasize;
25540 - unsigned int totalsize;
25541 - unsigned int reserved[3];
25544 -struct microcode {
25545 - struct microcode_header hdr;
25546 - unsigned int bits[0];
25549 -typedef struct microcode microcode_t;
25550 -typedef struct microcode_header microcode_header_t;
25552 -/* microcode format is extended from prescott processors */
25553 -struct extended_signature {
25554 - unsigned int sig;
25556 - unsigned int cksum;
25559 -struct extended_sigtable {
25560 - unsigned int count;
25561 - unsigned int cksum;
25562 - unsigned int reserved[3];
25563 - struct extended_signature sigs[0];
25566 -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
25567 -static inline void rep_nop(void)
25569 - __asm__ __volatile__("rep;nop": : :"memory");
25572 -#define cpu_relax() rep_nop()
25574 -#ifndef CONFIG_X86_NO_TSS
25575 -static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
25577 - tss->x86_tss.esp0 = thread->esp0;
25578 - /* This can only happen when SEP is enabled, no need to test "SEP"arately */
25579 - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
25580 - tss->x86_tss.ss1 = thread->sysenter_cs;
25581 - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
25585 -#define xen_load_esp0(tss, thread) do { \
25586 - if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
25592 -static inline unsigned long xen_get_debugreg(int regno)
25594 - return HYPERVISOR_get_debugreg(regno);
25597 -static inline void xen_set_debugreg(int regno, unsigned long value)
25599 - WARN_ON(HYPERVISOR_set_debugreg(regno, value));
25603 - * Set IOPL bits in EFLAGS from given mask
25605 -static inline void xen_set_iopl_mask(unsigned mask)
25607 - struct physdev_set_iopl set_iopl;
25609 - /* Force the change at ring 0. */
25610 - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
25611 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
25615 -#define paravirt_enabled() 0
25616 -#define __cpuid xen_cpuid
25618 -#define load_esp0 xen_load_esp0
25621 - * These special macros can be used to get or set a debugging register
25623 -#define get_debugreg(var, register) \
25624 - (var) = xen_get_debugreg(register)
25625 -#define set_debugreg(value, register) \
25626 - xen_set_debugreg(register, value)
25628 -#define set_iopl_mask xen_set_iopl_mask
25631 - * Generic CPUID function
25632 - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
25633 - * resulting in stale register contents being returned.
25635 -static inline void cpuid(unsigned int op,
25636 - unsigned int *eax, unsigned int *ebx,
25637 - unsigned int *ecx, unsigned int *edx)
25641 - __cpuid(eax, ebx, ecx, edx);
25644 -/* Some CPUID calls want 'count' to be placed in ecx */
25645 -static inline void cpuid_count(unsigned int op, int count,
25646 - unsigned int *eax, unsigned int *ebx,
25647 - unsigned int *ecx, unsigned int *edx)
25651 - __cpuid(eax, ebx, ecx, edx);
25655 - * CPUID functions returning a single datum
25657 -static inline unsigned int cpuid_eax(unsigned int op)
25659 - unsigned int eax, ebx, ecx, edx;
25661 - cpuid(op, &eax, &ebx, &ecx, &edx);
25664 -static inline unsigned int cpuid_ebx(unsigned int op)
25666 - unsigned int eax, ebx, ecx, edx;
25668 - cpuid(op, &eax, &ebx, &ecx, &edx);
25671 -static inline unsigned int cpuid_ecx(unsigned int op)
25673 - unsigned int eax, ebx, ecx, edx;
25675 - cpuid(op, &eax, &ebx, &ecx, &edx);
25678 -static inline unsigned int cpuid_edx(unsigned int op)
25680 - unsigned int eax, ebx, ecx, edx;
25682 - cpuid(op, &eax, &ebx, &ecx, &edx);
25686 -/* generic versions from gas */
25687 -#define GENERIC_NOP1 ".byte 0x90\n"
25688 -#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
25689 -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
25690 -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
25691 -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
25692 -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
25693 -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
25694 -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
25696 -/* Opteron nops */
25697 -#define K8_NOP1 GENERIC_NOP1
25698 -#define K8_NOP2 ".byte 0x66,0x90\n"
25699 -#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
25700 -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
25701 -#define K8_NOP5 K8_NOP3 K8_NOP2
25702 -#define K8_NOP6 K8_NOP3 K8_NOP3
25703 -#define K8_NOP7 K8_NOP4 K8_NOP3
25704 -#define K8_NOP8 K8_NOP4 K8_NOP4
25707 -/* uses eax dependencies (arbitary choice) */
25708 -#define K7_NOP1 GENERIC_NOP1
25709 -#define K7_NOP2 ".byte 0x8b,0xc0\n"
25710 -#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
25711 -#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
25712 -#define K7_NOP5 K7_NOP4 ASM_NOP1
25713 -#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
25714 -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
25715 -#define K7_NOP8 K7_NOP7 ASM_NOP1
25718 -/* uses eax dependencies (Intel-recommended choice) */
25719 -#define P6_NOP1 GENERIC_NOP1
25720 -#define P6_NOP2 ".byte 0x66,0x90\n"
25721 -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
25722 -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
25723 -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
25724 -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
25725 -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
25726 -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
25729 -#define ASM_NOP1 K8_NOP1
25730 -#define ASM_NOP2 K8_NOP2
25731 -#define ASM_NOP3 K8_NOP3
25732 -#define ASM_NOP4 K8_NOP4
25733 -#define ASM_NOP5 K8_NOP5
25734 -#define ASM_NOP6 K8_NOP6
25735 -#define ASM_NOP7 K8_NOP7
25736 -#define ASM_NOP8 K8_NOP8
25737 -#elif defined(CONFIG_MK7)
25738 -#define ASM_NOP1 K7_NOP1
25739 -#define ASM_NOP2 K7_NOP2
25740 -#define ASM_NOP3 K7_NOP3
25741 -#define ASM_NOP4 K7_NOP4
25742 -#define ASM_NOP5 K7_NOP5
25743 -#define ASM_NOP6 K7_NOP6
25744 -#define ASM_NOP7 K7_NOP7
25745 -#define ASM_NOP8 K7_NOP8
25746 -#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
25747 - defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
25748 - defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
25749 -#define ASM_NOP1 P6_NOP1
25750 -#define ASM_NOP2 P6_NOP2
25751 -#define ASM_NOP3 P6_NOP3
25752 -#define ASM_NOP4 P6_NOP4
25753 -#define ASM_NOP5 P6_NOP5
25754 -#define ASM_NOP6 P6_NOP6
25755 -#define ASM_NOP7 P6_NOP7
25756 -#define ASM_NOP8 P6_NOP8
25758 -#define ASM_NOP1 GENERIC_NOP1
25759 -#define ASM_NOP2 GENERIC_NOP2
25760 -#define ASM_NOP3 GENERIC_NOP3
25761 -#define ASM_NOP4 GENERIC_NOP4
25762 -#define ASM_NOP5 GENERIC_NOP5
25763 -#define ASM_NOP6 GENERIC_NOP6
25764 -#define ASM_NOP7 GENERIC_NOP7
25765 -#define ASM_NOP8 GENERIC_NOP8
25768 -#define ASM_NOP_MAX 8
25770 -/* Prefetch instructions for Pentium III and AMD Athlon */
25771 -/* It's not worth to care about 3dnow! prefetches for the K6
25772 - because they are microcoded there and very slow.
25773 - However we don't do prefetches for pre XP Athlons currently
25774 - That should be fixed. */
25775 -#define ARCH_HAS_PREFETCH
25776 -static inline void prefetch(const void *x)
25778 - alternative_input(ASM_NOP4,
25779 - "prefetchnta (%1)",
25784 -#define ARCH_HAS_PREFETCH
25785 -#define ARCH_HAS_PREFETCHW
25786 -#define ARCH_HAS_SPINLOCK_PREFETCH
25788 -/* 3dnow! prefetch to get an exclusive cache line. Useful for
25789 - spinlocks to avoid one state transition in the cache coherency protocol. */
25790 -static inline void prefetchw(const void *x)
25792 - alternative_input(ASM_NOP4,
25793 - "prefetchw (%1)",
25794 - X86_FEATURE_3DNOW,
25797 -#define spin_lock_prefetch(x) prefetchw(x)
25799 -extern void select_idle_routine(const struct cpuinfo_x86 *c);
25801 -#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
25803 -extern unsigned long boot_option_idle_override;
25804 -extern void enable_sep_cpu(void);
25805 -extern int sysenter_setup(void);
25807 -/* Defined in head.S */
25808 -extern struct Xgt_desc_struct early_gdt_descr;
25810 -extern void cpu_set_gdt(int);
25811 -extern void switch_to_new_gdt(void);
25812 -extern void cpu_init(void);
25813 -extern void init_gdt(int cpu);
25815 -extern int force_mwait;
25817 -#endif /* __ASM_I386_PROCESSOR_H */
25818 --- a/include/asm-x86/mach-xen/asm/processor_64.h
25822 - * include/asm-x86_64/processor.h
25824 - * Copyright (C) 1994 Linus Torvalds
25827 -#ifndef __ASM_X86_64_PROCESSOR_H
25828 -#define __ASM_X86_64_PROCESSOR_H
25830 -#include <asm/segment.h>
25831 -#include <asm/page.h>
25832 -#include <asm/types.h>
25833 -#include <asm/sigcontext.h>
25834 -#include <asm/cpufeature.h>
25835 -#include <linux/threads.h>
25836 -#include <asm/msr.h>
25837 -#include <asm/current.h>
25838 -#include <asm/system.h>
25839 -#include <asm/mmsegment.h>
25840 -#include <asm/percpu.h>
25841 -#include <linux/personality.h>
25842 -#include <linux/cpumask.h>
25843 -#include <asm/processor-flags.h>
25845 -#define TF_MASK 0x00000100
25846 -#define IF_MASK 0x00000200
25847 -#define IOPL_MASK 0x00003000
25848 -#define NT_MASK 0x00004000
25849 -#define VM_MASK 0x00020000
25850 -#define AC_MASK 0x00040000
25851 -#define VIF_MASK 0x00080000 /* virtual interrupt flag */
25852 -#define VIP_MASK 0x00100000 /* virtual interrupt pending */
25853 -#define ID_MASK 0x00200000
25855 -#define desc_empty(desc) \
25856 - (!((desc)->a | (desc)->b))
25858 -#define desc_equal(desc1, desc2) \
25859 - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
25862 - * Default implementation of macro that returns current
25863 - * instruction pointer ("program counter").
25865 -#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
25868 - * CPU type and hardware bug flags. Kept separately for each CPU.
25871 -struct cpuinfo_x86 {
25872 - __u8 x86; /* CPU family */
25873 - __u8 x86_vendor; /* CPU vendor */
25876 - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
25877 - __u32 x86_capability[NCAPINTS];
25878 - char x86_vendor_id[16];
25879 - char x86_model_id[64];
25880 - int x86_cache_size; /* in KB */
25881 - int x86_clflush_size;
25882 - int x86_cache_alignment;
25883 - int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/
25884 - __u8 x86_virt_bits, x86_phys_bits;
25885 - __u8 x86_max_cores; /* cpuid returned max cores value */
25887 - __u32 extended_cpuid_level; /* Max extended CPUID function supported */
25888 - unsigned long loops_per_jiffy;
25890 - cpumask_t llc_shared_map; /* cpus sharing the last level cache */
25894 - __u8 booted_cores; /* number of cores as seen by OS */
25895 - __u8 phys_proc_id; /* Physical Processor id. */
25896 - __u8 cpu_core_id; /* Core id. */
25897 - __u8 cpu_index; /* index into per_cpu list */
25899 -} ____cacheline_aligned;
25901 -#define X86_VENDOR_INTEL 0
25902 -#define X86_VENDOR_CYRIX 1
25903 -#define X86_VENDOR_AMD 2
25904 -#define X86_VENDOR_UMC 3
25905 -#define X86_VENDOR_NEXGEN 4
25906 -#define X86_VENDOR_CENTAUR 5
25907 -#define X86_VENDOR_TRANSMETA 7
25908 -#define X86_VENDOR_NUM 8
25909 -#define X86_VENDOR_UNKNOWN 0xff
25912 -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25913 -#define cpu_data(cpu) per_cpu(cpu_info, cpu)
25914 -#define current_cpu_data cpu_data(smp_processor_id())
25916 -#define cpu_data(cpu) boot_cpu_data
25917 -#define current_cpu_data boot_cpu_data
25920 -extern char ignore_irq13;
25922 -extern void identify_cpu(struct cpuinfo_x86 *);
25923 -extern void print_cpu_info(struct cpuinfo_x86 *);
25924 -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25925 -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25926 -extern unsigned short num_cache_leaves;
25929 - * Save the cr4 feature set we're using (ie
25930 - * Pentium 4MB enable and PPro Global page
25931 - * enable), so that any CPU's that boot up
25932 - * after us can get the correct flags.
25934 -extern unsigned long mmu_cr4_features;
25936 -static inline void set_in_cr4 (unsigned long mask)
25938 - mmu_cr4_features |= mask;
25939 - __asm__("movq %%cr4,%%rax\n\t"
25940 - "orq %0,%%rax\n\t"
25941 - "movq %%rax,%%cr4\n"
25946 -static inline void clear_in_cr4 (unsigned long mask)
25948 - mmu_cr4_features &= ~mask;
25949 - __asm__("movq %%cr4,%%rax\n\t"
25950 - "andq %0,%%rax\n\t"
25951 - "movq %%rax,%%cr4\n"
25952 - : : "irg" (~mask)
25958 - * User space process size. 47bits minus one guard page.
25960 -#define TASK_SIZE64 (0x800000000000UL - 4096)
25962 -/* This decides where the kernel will search for a free chunk of vm
25963 - * space during mmap's.
25965 -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
25967 -#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
25968 -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
25970 -#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
25973 - * Size of io_bitmap.
25975 -#define IO_BITMAP_BITS 65536
25976 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
25977 -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
25978 -#ifndef CONFIG_X86_NO_TSS
25979 -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
25981 -#define INVALID_IO_BITMAP_OFFSET 0x8000
25983 -struct i387_fxsave_struct {
25992 - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
25993 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
25995 -} __attribute__ ((aligned (16)));
25997 -union i387_union {
25998 - struct i387_fxsave_struct fxsave;
26001 -#ifndef CONFIG_X86_NO_TSS
26002 -struct tss_struct {
26012 - u16 io_bitmap_base;
26014 - * The extra 1 is there because the CPU will access an
26015 - * additional byte beyond the end of the IO permission
26016 - * bitmap. The extra byte must be all 1 bits, and must
26017 - * be within the limit. Thus we have:
26019 - * 128 bytes, the bitmap itself, for ports 0..0x3ff
26020 - * 8 bytes, for an extra "long" of ~0UL
26022 - unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
26023 -} __attribute__((packed)) ____cacheline_aligned;
26025 -DECLARE_PER_CPU(struct tss_struct,init_tss);
26029 -extern struct cpuinfo_x86 boot_cpu_data;
26030 -#ifndef CONFIG_X86_NO_TSS
26031 -/* Save the original ist values for checking stack pointers during debugging */
26033 - unsigned long ist[7];
26035 -DECLARE_PER_CPU(struct orig_ist, orig_ist);
26038 -#ifdef CONFIG_X86_VSMP
26039 -#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
26040 -#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
26042 -#define ARCH_MIN_TASKALIGN 16
26043 -#define ARCH_MIN_MMSTRUCT_ALIGN 0
26046 -struct thread_struct {
26047 - unsigned long rsp0;
26048 - unsigned long rsp;
26049 - unsigned long userrsp; /* Copy from PDA */
26050 - unsigned long fs;
26051 - unsigned long gs;
26052 - unsigned short es, ds, fsindex, gsindex;
26053 -/* Hardware debugging registers */
26054 - unsigned long debugreg0;
26055 - unsigned long debugreg1;
26056 - unsigned long debugreg2;
26057 - unsigned long debugreg3;
26058 - unsigned long debugreg6;
26059 - unsigned long debugreg7;
26061 - unsigned long cr2, trap_no, error_code;
26062 -/* floating point info */
26063 - union i387_union i387 __attribute__((aligned(16)));
26064 -/* IO permissions. the bitmap could be moved into the GDT, that would make
26065 - switch faster for a limited number of ioperm using tasks. -AK */
26067 - unsigned long *io_bitmap_ptr;
26068 - unsigned io_bitmap_max;
26069 -/* cached TLS descriptors. */
26070 - u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
26071 - unsigned int iopl;
26072 -} __attribute__((aligned(16)));
26074 -#define INIT_THREAD { \
26075 - .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26078 -#ifndef CONFIG_X86_NO_TSS
26079 -#define INIT_TSS { \
26080 - .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26084 -#define INIT_MMAP \
26085 -{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
26087 -#define start_thread(regs,new_rip,new_rsp) do { \
26088 - asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
26089 - load_gs_index(0); \
26090 - (regs)->rip = (new_rip); \
26091 - (regs)->rsp = (new_rsp); \
26092 - write_pda(oldrsp, (new_rsp)); \
26093 - (regs)->cs = __USER_CS; \
26094 - (regs)->ss = __USER_DS; \
26095 - (regs)->eflags = 0x200; \
26096 - set_fs(USER_DS); \
26099 -#define get_debugreg(var, register) \
26100 - var = HYPERVISOR_get_debugreg(register)
26101 -#define set_debugreg(value, register) do { \
26102 - if (HYPERVISOR_set_debugreg(register, value)) \
26106 -struct task_struct;
26109 -/* Free all resources held by a thread. */
26110 -extern void release_thread(struct task_struct *);
26112 -/* Prepare to copy thread state - unlazy all lazy status */
26113 -extern void prepare_to_copy(struct task_struct *tsk);
26116 - * create a kernel thread without removing it from tasklists
26118 -extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
26121 - * Return saved PC of a blocked thread.
26122 - * What is this good for? it will be always the scheduler or ret_from_fork.
26124 -#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
26126 -extern unsigned long get_wchan(struct task_struct *p);
26127 -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
26128 -#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
26129 -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
26132 -struct microcode_header {
26133 - unsigned int hdrver;
26134 - unsigned int rev;
26135 - unsigned int date;
26136 - unsigned int sig;
26137 - unsigned int cksum;
26138 - unsigned int ldrver;
26140 - unsigned int datasize;
26141 - unsigned int totalsize;
26142 - unsigned int reserved[3];
26145 -struct microcode {
26146 - struct microcode_header hdr;
26147 - unsigned int bits[0];
26150 -typedef struct microcode microcode_t;
26151 -typedef struct microcode_header microcode_header_t;
26153 -/* microcode format is extended from prescott processors */
26154 -struct extended_signature {
26155 - unsigned int sig;
26157 - unsigned int cksum;
26160 -struct extended_sigtable {
26161 - unsigned int count;
26162 - unsigned int cksum;
26163 - unsigned int reserved[3];
26164 - struct extended_signature sigs[0];
26168 -#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2)
26169 -#define ASM_NOP1 P6_NOP1
26170 -#define ASM_NOP2 P6_NOP2
26171 -#define ASM_NOP3 P6_NOP3
26172 -#define ASM_NOP4 P6_NOP4
26173 -#define ASM_NOP5 P6_NOP5
26174 -#define ASM_NOP6 P6_NOP6
26175 -#define ASM_NOP7 P6_NOP7
26176 -#define ASM_NOP8 P6_NOP8
26178 -#define ASM_NOP1 K8_NOP1
26179 -#define ASM_NOP2 K8_NOP2
26180 -#define ASM_NOP3 K8_NOP3
26181 -#define ASM_NOP4 K8_NOP4
26182 -#define ASM_NOP5 K8_NOP5
26183 -#define ASM_NOP6 K8_NOP6
26184 -#define ASM_NOP7 K8_NOP7
26185 -#define ASM_NOP8 K8_NOP8
26188 -/* Opteron nops */
26189 -#define K8_NOP1 ".byte 0x90\n"
26190 -#define K8_NOP2 ".byte 0x66,0x90\n"
26191 -#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
26192 -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
26193 -#define K8_NOP5 K8_NOP3 K8_NOP2
26194 -#define K8_NOP6 K8_NOP3 K8_NOP3
26195 -#define K8_NOP7 K8_NOP4 K8_NOP3
26196 -#define K8_NOP8 K8_NOP4 K8_NOP4
26199 -/* uses eax dependencies (Intel-recommended choice) */
26200 -#define P6_NOP1 ".byte 0x90\n"
26201 -#define P6_NOP2 ".byte 0x66,0x90\n"
26202 -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
26203 -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
26204 -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
26205 -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
26206 -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
26207 -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
26209 -#define ASM_NOP_MAX 8
26211 -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26212 -static inline void rep_nop(void)
26214 - __asm__ __volatile__("rep;nop": : :"memory");
26217 -/* Stop speculative execution */
26218 -static inline void sync_core(void)
26221 - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
26224 -#define ARCH_HAS_PREFETCHW 1
26225 -static inline void prefetchw(void *x)
26227 - alternative_input("prefetcht0 (%1)",
26228 - "prefetchw (%1)",
26229 - X86_FEATURE_3DNOW,
26233 -#define ARCH_HAS_SPINLOCK_PREFETCH 1
26235 -#define spin_lock_prefetch(x) prefetchw(x)
26237 -#define cpu_relax() rep_nop()
26239 -static inline void __monitor(const void *eax, unsigned long ecx,
26240 - unsigned long edx)
26242 - /* "monitor %eax,%ecx,%edx;" */
26244 - ".byte 0x0f,0x01,0xc8;"
26245 - : :"a" (eax), "c" (ecx), "d"(edx));
26248 -static inline void __mwait(unsigned long eax, unsigned long ecx)
26250 - /* "mwait %eax,%ecx;" */
26252 - ".byte 0x0f,0x01,0xc9;"
26253 - : :"a" (eax), "c" (ecx));
26256 -static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
26258 - /* "mwait %eax,%ecx;" */
26260 - "sti; .byte 0x0f,0x01,0xc9;"
26261 - : :"a" (eax), "c" (ecx));
26264 -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
26266 -#define stack_current() \
26268 - struct thread_info *ti; \
26269 - asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
26273 -#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26275 -extern unsigned long boot_option_idle_override;
26276 -/* Boot loader type from the setup header */
26277 -extern int bootloader_type;
26279 -#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
26281 -#endif /* __ASM_X86_64_PROCESSOR_H */
26282 --- a/include/asm-x86/mach-xen/asm/processor.h
26283 +++ b/include/asm-x86/mach-xen/asm/processor.h
26285 +#ifndef __ASM_X86_PROCESSOR_H
26286 +#define __ASM_X86_PROCESSOR_H
26288 +#include <asm/processor-flags.h>
26290 +/* migration helpers, for KVM - will be removed in 2.6.25: */
26291 +#include <asm/vm86.h>
26292 +#define Xgt_desc_struct desc_ptr
26294 +/* Forward declaration, a strange C thing */
26295 +struct task_struct;
26298 +#include <asm/vm86.h>
26299 +#include <asm/math_emu.h>
26300 +#include <asm/segment.h>
26301 +#include <asm/types.h>
26302 +#include <asm/sigcontext.h>
26303 +#include <asm/current.h>
26304 +#include <asm/cpufeature.h>
26305 +#include <asm/system.h>
26306 +#include <asm/page.h>
26307 +#include <asm/percpu.h>
26308 +#include <asm/msr.h>
26309 +#include <asm/desc_defs.h>
26310 +#include <asm/nops.h>
26311 +#include <linux/personality.h>
26312 +#include <linux/cpumask.h>
26313 +#include <linux/cache.h>
26314 +#include <linux/threads.h>
26315 +#include <linux/init.h>
26316 +#include <xen/interface/physdev.h>
26319 + * Default implementation of macro that returns current
26320 + * instruction pointer ("program counter").
26322 +static inline void *current_text_addr(void)
26325 + asm volatile("mov $1f,%0\n1:":"=r" (pc));
26329 +#ifdef CONFIG_X86_VSMP
26330 +#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
26331 +#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
26333 +#define ARCH_MIN_TASKALIGN 16
26334 +#define ARCH_MIN_MMSTRUCT_ALIGN 0
26338 + * CPU type and hardware bug flags. Kept separately for each CPU.
26339 + * Members of this structure are referenced in head.S, so think twice
26340 + * before touching them. [mj]
26343 +struct cpuinfo_x86 {
26344 + __u8 x86; /* CPU family */
26345 + __u8 x86_vendor; /* CPU vendor */
26348 +#ifdef CONFIG_X86_32
26349 + char wp_works_ok; /* It doesn't on 386's */
26350 + char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
26358 + /* number of 4K pages in DTLB/ITLB combined(in pages)*/
26360 + __u8 x86_virt_bits, x86_phys_bits;
26361 + /* cpuid returned core id bits */
26362 + __u8 x86_coreid_bits;
26363 + /* Max extended CPUID function supported */
26364 + __u32 extended_cpuid_level;
26366 + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
26367 + __u32 x86_capability[NCAPINTS];
26368 + char x86_vendor_id[16];
26369 + char x86_model_id[64];
26370 + int x86_cache_size; /* in KB - valid for CPUS which support this
26372 + int x86_cache_alignment; /* In bytes */
26374 + unsigned long loops_per_jiffy;
26376 + cpumask_t llc_shared_map; /* cpus sharing the last level cache */
26378 + u16 x86_max_cores; /* cpuid returned max cores value */
26380 + u16 x86_clflush_size;
26382 + u16 booted_cores; /* number of cores as seen by OS */
26383 + u16 phys_proc_id; /* Physical processor id. */
26384 + u16 cpu_core_id; /* Core id */
26385 + u16 cpu_index; /* index into per_cpu list */
26387 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
26389 +#define X86_VENDOR_INTEL 0
26390 +#define X86_VENDOR_CYRIX 1
26391 +#define X86_VENDOR_AMD 2
26392 +#define X86_VENDOR_UMC 3
26393 +#define X86_VENDOR_NEXGEN 4
26394 +#define X86_VENDOR_CENTAUR 5
26395 +#define X86_VENDOR_TRANSMETA 7
26396 +#define X86_VENDOR_NSC 8
26397 +#define X86_VENDOR_NUM 9
26398 +#define X86_VENDOR_UNKNOWN 0xff
26401 + * capabilities of CPUs
26403 +extern struct cpuinfo_x86 boot_cpu_data;
26404 +extern struct cpuinfo_x86 new_cpu_data;
26405 +extern __u32 cleared_cpu_caps[NCAPINTS];
26408 +DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
26409 +#define cpu_data(cpu) per_cpu(cpu_info, cpu)
26410 +#define current_cpu_data cpu_data(smp_processor_id())
26412 +#define cpu_data(cpu) boot_cpu_data
26413 +#define current_cpu_data boot_cpu_data
26416 +void cpu_detect(struct cpuinfo_x86 *c);
26418 +extern void identify_cpu(struct cpuinfo_x86 *);
26419 +extern void identify_boot_cpu(void);
26420 +extern void identify_secondary_cpu(struct cpuinfo_x86 *);
26421 +extern void print_cpu_info(struct cpuinfo_x86 *);
26422 +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
26423 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
26424 +extern unsigned short num_cache_leaves;
26426 +#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
26427 +extern void detect_ht(struct cpuinfo_x86 *c);
26429 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
26432 +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
26433 + unsigned int *ecx, unsigned int *edx)
26435 + /* ecx is often an input as well as an output. */
26436 + __asm__(XEN_CPUID
26441 + : "0" (*eax), "2" (*ecx));
26444 +static inline void load_cr3(pgd_t *pgdir)
26446 + write_cr3(__pa(pgdir));
26449 +#ifndef CONFIG_X86_NO_TSS
26450 +#ifdef CONFIG_X86_32
26451 +/* This is the TSS defined by the hardware. */
26452 +struct x86_hw_tss {
26453 + unsigned short back_link, __blh;
26454 + unsigned long sp0;
26455 + unsigned short ss0, __ss0h;
26456 + unsigned long sp1;
26457 + unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
26458 + unsigned long sp2;
26459 + unsigned short ss2, __ss2h;
26460 + unsigned long __cr3;
26461 + unsigned long ip;
26462 + unsigned long flags;
26463 + unsigned long ax, cx, dx, bx;
26464 + unsigned long sp, bp, si, di;
26465 + unsigned short es, __esh;
26466 + unsigned short cs, __csh;
26467 + unsigned short ss, __ssh;
26468 + unsigned short ds, __dsh;
26469 + unsigned short fs, __fsh;
26470 + unsigned short gs, __gsh;
26471 + unsigned short ldt, __ldth;
26472 + unsigned short trace, io_bitmap_base;
26473 +} __attribute__((packed));
26474 +extern struct tss_struct doublefault_tss;
26476 +struct x86_hw_tss {
26486 + u16 io_bitmap_base;
26487 +} __attribute__((packed)) ____cacheline_aligned;
26489 +#endif /* CONFIG_X86_NO_TSS */
26492 + * Size of io_bitmap.
26494 +#define IO_BITMAP_BITS 65536
26495 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
26496 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
26497 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
26498 +#define INVALID_IO_BITMAP_OFFSET 0x8000
26499 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
26501 +#ifndef CONFIG_X86_NO_TSS
26502 +struct tss_struct {
26503 + struct x86_hw_tss x86_tss;
26506 + * The extra 1 is there because the CPU will access an
26507 + * additional byte beyond the end of the IO permission
26508 + * bitmap. The extra byte must be all 1 bits, and must
26509 + * be within the limit.
26511 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
26513 + * Cache the current maximum and the last task that used the bitmap:
26515 + unsigned long io_bitmap_max;
26516 + struct thread_struct *io_bitmap_owner;
26518 + * pads the TSS to be cacheline-aligned (size is 0x100)
26520 + unsigned long __cacheline_filler[35];
26522 + * .. and then another 0x100 bytes for emergency kernel stack
26524 + unsigned long stack[64];
26525 +} __attribute__((packed));
26527 +DECLARE_PER_CPU(struct tss_struct, init_tss);
26529 +/* Save the original ist values for checking stack pointers during debugging */
26531 + unsigned long ist[7];
26533 +#endif /* CONFIG_X86_NO_TSS */
26535 +#define MXCSR_DEFAULT 0x1f80
26537 +struct i387_fsave_struct {
26545 + u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
26546 + u32 status; /* software status information */
26549 +struct i387_fxsave_struct {
26568 + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
26569 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
26571 +} __attribute__((aligned(16)));
26573 +struct i387_soft_struct {
26581 + u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
26582 + u8 ftop, changed, lookahead, no_update, rm, alimit;
26583 + struct info *info;
26587 +union i387_union {
26588 + struct i387_fsave_struct fsave;
26589 + struct i387_fxsave_struct fxsave;
26590 + struct i387_soft_struct soft;
26593 +#ifdef CONFIG_X86_32
26594 +DECLARE_PER_CPU(u8, cpu_llc_id);
26595 +#elif !defined(CONFIG_X86_NO_TSS)
26596 +DECLARE_PER_CPU(struct orig_ist, orig_ist);
26599 +extern void print_cpu_info(struct cpuinfo_x86 *);
26600 +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
26601 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
26602 +extern unsigned short num_cache_leaves;
26604 +struct thread_struct {
26605 +/* cached TLS descriptors. */
26606 + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
26607 + unsigned long sp0;
26608 + unsigned long sp;
26609 +#ifdef CONFIG_X86_32
26610 + unsigned long sysenter_cs;
26612 + unsigned long usersp; /* Copy from PDA */
26613 + unsigned short es, ds, fsindex, gsindex;
26615 + unsigned long ip;
26616 + unsigned long fs;
26617 + unsigned long gs;
26618 +/* Hardware debugging registers */
26619 + unsigned long debugreg0;
26620 + unsigned long debugreg1;
26621 + unsigned long debugreg2;
26622 + unsigned long debugreg3;
26623 + unsigned long debugreg6;
26624 + unsigned long debugreg7;
26626 + unsigned long cr2, trap_no, error_code;
26627 +/* floating point info */
26628 + union i387_union i387 __attribute__((aligned(16)));;
26629 +#ifdef CONFIG_X86_32
26630 +/* virtual 86 mode info */
26631 + struct vm86_struct __user *vm86_info;
26632 + unsigned long screen_bitmap;
26633 + unsigned long v86flags, v86mask, saved_sp0;
26634 + unsigned int saved_fs, saved_gs;
26636 +/* IO permissions */
26637 + unsigned long *io_bitmap_ptr;
26638 + unsigned long iopl;
26639 +/* max allowed port in the bitmap, in bytes: */
26640 + unsigned io_bitmap_max;
26641 +/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
26642 + unsigned long debugctlmsr;
26643 +/* Debug Store - if not 0 points to a DS Save Area configuration;
26644 + * goes into MSR_IA32_DS_AREA */
26645 + unsigned long ds_area_msr;
26648 +static inline unsigned long xen_get_debugreg(int regno)
26650 + return HYPERVISOR_get_debugreg(regno);
26653 +static inline void xen_set_debugreg(int regno, unsigned long value)
26655 + WARN_ON(HYPERVISOR_set_debugreg(regno, value));
26659 + * Set IOPL bits in EFLAGS from given mask
26661 +static inline void xen_set_iopl_mask(unsigned mask)
26663 + struct physdev_set_iopl set_iopl;
26665 + /* Force the change at ring 0. */
26666 + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
26667 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
26670 +#ifndef CONFIG_X86_NO_TSS
26671 +static inline void native_load_sp0(struct tss_struct *tss,
26672 + struct thread_struct *thread)
26674 + tss->x86_tss.sp0 = thread->sp0;
26675 +#ifdef CONFIG_X86_32
26676 + /* Only happens when SEP is enabled, no need to test "SEP"arately */
26677 + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
26678 + tss->x86_tss.ss1 = thread->sysenter_cs;
26679 + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
26684 +#define xen_load_sp0(tss, thread) do { \
26685 + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
26690 +#define __cpuid xen_cpuid
26691 +#define paravirt_enabled() 0
26694 + * These special macros can be used to get or set a debugging register
26696 +#define get_debugreg(var, register) \
26697 + (var) = xen_get_debugreg(register)
26698 +#define set_debugreg(value, register) \
26699 + xen_set_debugreg(register, value)
26701 +#define load_sp0 xen_load_sp0
26703 +#define set_iopl_mask xen_set_iopl_mask
26706 + * Save the cr4 feature set we're using (ie
26707 + * Pentium 4MB enable and PPro Global page
26708 + * enable), so that any CPU's that boot up
26709 + * after us can get the correct flags.
26711 +extern unsigned long mmu_cr4_features;
26713 +static inline void set_in_cr4(unsigned long mask)
26716 + mmu_cr4_features |= mask;
26717 + cr4 = read_cr4();
26722 +static inline void clear_in_cr4(unsigned long mask)
26725 + mmu_cr4_features &= ~mask;
26726 + cr4 = read_cr4();
26731 +struct microcode_header {
26732 + unsigned int hdrver;
26733 + unsigned int rev;
26734 + unsigned int date;
26735 + unsigned int sig;
26736 + unsigned int cksum;
26737 + unsigned int ldrver;
26739 + unsigned int datasize;
26740 + unsigned int totalsize;
26741 + unsigned int reserved[3];
26744 +struct microcode {
26745 + struct microcode_header hdr;
26746 + unsigned int bits[0];
26749 +typedef struct microcode microcode_t;
26750 +typedef struct microcode_header microcode_header_t;
26752 +/* microcode format is extended from prescott processors */
26753 +struct extended_signature {
26754 + unsigned int sig;
26756 + unsigned int cksum;
26759 +struct extended_sigtable {
26760 + unsigned int count;
26761 + unsigned int cksum;
26762 + unsigned int reserved[3];
26763 + struct extended_signature sigs[0];
26767 + unsigned long seg;
26772 + * create a kernel thread without removing it from tasklists
26774 +extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
26776 +/* Free all resources held by a thread. */
26777 +extern void release_thread(struct task_struct *);
26779 +/* Prepare to copy thread state - unlazy all lazy status */
26780 +extern void prepare_to_copy(struct task_struct *tsk);
26782 +unsigned long get_wchan(struct task_struct *p);
26785 + * Generic CPUID function
26786 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
26787 + * resulting in stale register contents being returned.
26789 +static inline void cpuid(unsigned int op,
26790 + unsigned int *eax, unsigned int *ebx,
26791 + unsigned int *ecx, unsigned int *edx)
26795 + __cpuid(eax, ebx, ecx, edx);
26798 +/* Some CPUID calls want 'count' to be placed in ecx */
26799 +static inline void cpuid_count(unsigned int op, int count,
26800 + unsigned int *eax, unsigned int *ebx,
26801 + unsigned int *ecx, unsigned int *edx)
26805 + __cpuid(eax, ebx, ecx, edx);
26809 + * CPUID functions returning a single datum
26811 +static inline unsigned int cpuid_eax(unsigned int op)
26813 + unsigned int eax, ebx, ecx, edx;
26815 + cpuid(op, &eax, &ebx, &ecx, &edx);
26818 +static inline unsigned int cpuid_ebx(unsigned int op)
26820 + unsigned int eax, ebx, ecx, edx;
26822 + cpuid(op, &eax, &ebx, &ecx, &edx);
26825 +static inline unsigned int cpuid_ecx(unsigned int op)
26827 + unsigned int eax, ebx, ecx, edx;
26829 + cpuid(op, &eax, &ebx, &ecx, &edx);
26832 +static inline unsigned int cpuid_edx(unsigned int op)
26834 + unsigned int eax, ebx, ecx, edx;
26836 + cpuid(op, &eax, &ebx, &ecx, &edx);
26840 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26841 +static inline void rep_nop(void)
26843 + __asm__ __volatile__("rep;nop": : :"memory");
26846 +/* Stop speculative execution */
26847 +static inline void sync_core(void)
26850 + asm volatile("cpuid" : "=a" (tmp) : "0" (1)
26851 + : "ebx", "ecx", "edx", "memory");
26854 +#define cpu_relax() rep_nop()
26856 +static inline void __monitor(const void *eax, unsigned long ecx,
26857 + unsigned long edx)
26859 + /* "monitor %eax,%ecx,%edx;" */
26861 + ".byte 0x0f,0x01,0xc8;"
26862 + : :"a" (eax), "c" (ecx), "d"(edx));
26865 +static inline void __mwait(unsigned long eax, unsigned long ecx)
26867 + /* "mwait %eax,%ecx;" */
26869 + ".byte 0x0f,0x01,0xc9;"
26870 + : :"a" (eax), "c" (ecx));
26873 +static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
26875 + /* "mwait %eax,%ecx;" */
26877 + "sti; .byte 0x0f,0x01,0xc9;"
26878 + : :"a" (eax), "c" (ecx));
26881 +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
26883 +extern int force_mwait;
26885 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
26887 +extern unsigned long boot_option_idle_override;
26889 +extern void enable_sep_cpu(void);
26890 +extern int sysenter_setup(void);
26892 +/* Defined in head.S */
26893 +extern struct desc_ptr early_gdt_descr;
26895 +extern void cpu_set_gdt(int);
26896 +extern void switch_to_new_gdt(void);
26897 +extern void cpu_init(void);
26898 +extern void init_gdt(int cpu);
26900 +/* from system description table in BIOS. Mostly for MCA use, but
26901 + * others may find it useful. */
26902 +extern unsigned int machine_id;
26903 +extern unsigned int machine_submodel_id;
26904 +extern unsigned int BIOS_revision;
26906 +/* Boot loader type from the setup header */
26907 +extern int bootloader_type;
26909 +extern char ignore_fpu_irq;
26910 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26912 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
26913 +#define ARCH_HAS_PREFETCHW
26914 +#define ARCH_HAS_SPINLOCK_PREFETCH
26916 +#ifdef CONFIG_X86_32
26917 +#define BASE_PREFETCH ASM_NOP4
26918 +#define ARCH_HAS_PREFETCH
26920 +#define BASE_PREFETCH "prefetcht0 (%1)"
26923 +/* Prefetch instructions for Pentium III and AMD Athlon */
26924 +/* It's not worth caring about 3dnow! prefetches for the K6
26925 + because they are microcoded there and very slow.
26926 + However, we don't currently do prefetches for pre-XP Athlons.
26927 + That should be fixed. */
26928 +static inline void prefetch(const void *x)
26930 + alternative_input(BASE_PREFETCH,
26931 + "prefetchnta (%1)",
26936 +/* 3dnow! prefetch to get an exclusive cache line. Useful for
26937 + spinlocks to avoid one state transition in the cache coherency protocol. */
26938 +static inline void prefetchw(const void *x)
26940 + alternative_input(BASE_PREFETCH,
26941 + "prefetchw (%1)",
26942 + X86_FEATURE_3DNOW,
26946 +#define spin_lock_prefetch(x) prefetchw(x)
26947 #ifdef CONFIG_X86_32
26948 -# include "processor_32.h"
26950 + * User space process size: 3GB (default).
26952 +#define TASK_SIZE (PAGE_OFFSET)
26953 +#define STACK_TOP TASK_SIZE
26954 +#define STACK_TOP_MAX STACK_TOP
26956 +#define INIT_THREAD { \
26957 + .sp0 = sizeof(init_stack) + (long)&init_stack, \
26958 + .vm86_info = NULL, \
26959 + .sysenter_cs = __KERNEL_CS, \
26960 + .io_bitmap_ptr = NULL, \
26961 + .fs = __KERNEL_PERCPU, \
26965 + * Note that the .io_bitmap member must be extra-big. This is because
26966 + * the CPU will access an additional byte beyond the end of the IO
26967 + * permission bitmap. The extra byte must be all 1 bits, and must
26968 + * be within the limit.
26970 +#define INIT_TSS { \
26972 + .sp0 = sizeof(init_stack) + (long)&init_stack, \
26973 + .ss0 = __KERNEL_DS, \
26974 + .ss1 = __KERNEL_CS, \
26975 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
26977 + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
26980 +#define start_thread(regs, new_eip, new_esp) do { \
26981 + __asm__("movl %0,%%gs": :"r" (0)); \
26983 + set_fs(USER_DS); \
26984 + regs->ds = __USER_DS; \
26985 + regs->es = __USER_DS; \
26986 + regs->ss = __USER_DS; \
26987 + regs->cs = __USER_CS; \
26988 + regs->ip = new_eip; \
26989 + regs->sp = new_esp; \
26993 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
26995 +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
26996 +#define KSTK_TOP(info) \
26998 + unsigned long *__ptr = (unsigned long *)(info); \
26999 + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
27003 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
27004 + * This is necessary to guarantee that the entire "struct pt_regs"
27005 + * is accessible even if the CPU hasn't stored the SS/ESP registers
27006 + * on the stack (interrupt gate does not save these registers
27007 + * when switching to the same priv ring).
27008 + * Therefore beware: accessing the ss/esp fields of the
27009 + * "struct pt_regs" is possible, but they may contain the
27010 + * completely wrong values.
27012 +#define task_pt_regs(task) \
27014 + struct pt_regs *__regs__; \
27015 + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
27019 +#define KSTK_ESP(task) (task_pt_regs(task)->sp)
27022 -# include "processor_64.h"
27024 + * User space process size. 47bits minus one guard page.
27026 +#define TASK_SIZE64 (0x800000000000UL - 4096)
27028 +/* This decides where the kernel will search for a free chunk of vm
27029 + * space during mmap's.
27031 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
27032 + 0xc0000000 : 0xFFFFe000)
27034 +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
27035 + IA32_PAGE_OFFSET : TASK_SIZE64)
27036 +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
27037 + IA32_PAGE_OFFSET : TASK_SIZE64)
27039 +#define STACK_TOP TASK_SIZE
27040 +#define STACK_TOP_MAX TASK_SIZE64
27042 +#define INIT_THREAD { \
27043 + .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
27046 +#define INIT_TSS { \
27047 + .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
27050 +#define start_thread(regs, new_rip, new_rsp) do { \
27051 + asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
27052 + load_gs_index(0); \
27053 + (regs)->ip = (new_rip); \
27054 + (regs)->sp = (new_rsp); \
27055 + write_pda(oldrsp, (new_rsp)); \
27056 + (regs)->cs = __USER_CS; \
27057 + (regs)->ss = __USER_DS; \
27058 + (regs)->flags = 0x200; \
27059 + set_fs(USER_DS); \
27063 + * Return saved PC of a blocked thread.
27064 + * What is this good for? It will always be the scheduler or ret_from_fork.
27066 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
27068 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
27069 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
27070 +#endif /* CONFIG_X86_64 */
27072 +/* This decides where the kernel will search for a free chunk of vm
27073 + * space during mmap's.
27075 +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
27077 +#define KSTK_EIP(task) (task_pt_regs(task)->ip)
27080 --- a/include/asm-x86/mach-xen/asm/segment_32.h
27083 -#ifndef _ASM_SEGMENT_H
27084 -#define _ASM_SEGMENT_H
27087 - * The layout of the per-CPU GDT under Linux:
27094 - * 4 - unused <==== new cacheline
27097 - * ------- start of TLS (Thread-Local Storage) segments:
27099 - * 6 - TLS segment #1 [ glibc's TLS segment ]
27100 - * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
27101 - * 8 - TLS segment #3
27106 - * ------- start of kernel segments:
27108 - * 12 - kernel code segment <==== new cacheline
27109 - * 13 - kernel data segment
27110 - * 14 - default user CS
27111 - * 15 - default user DS
27114 - * 18 - PNPBIOS support (16->32 gate)
27115 - * 19 - PNPBIOS support
27116 - * 20 - PNPBIOS support
27117 - * 21 - PNPBIOS support
27118 - * 22 - PNPBIOS support
27119 - * 23 - APM BIOS support
27120 - * 24 - APM BIOS support
27121 - * 25 - APM BIOS support
27123 - * 26 - ESPFIX small SS
27124 - * 27 - per-cpu [ offset to per-cpu data area ]
27128 - * 31 - TSS for double fault handler
27130 -#define GDT_ENTRY_TLS_ENTRIES 3
27131 -#define GDT_ENTRY_TLS_MIN 6
27132 -#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
27134 -#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
27136 -#define GDT_ENTRY_DEFAULT_USER_CS 14
27137 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
27139 -#define GDT_ENTRY_DEFAULT_USER_DS 15
27140 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
27142 -#define GDT_ENTRY_KERNEL_BASE 12
27144 -#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
27145 -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
27147 -#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
27148 -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
27150 -#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
27151 -#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
27153 -#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
27154 -#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
27156 -#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
27157 -#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
27159 -#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
27161 -#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
27163 -#define __KERNEL_PERCPU 0
27166 -#define GDT_ENTRY_DOUBLEFAULT_TSS 31
27169 - * The GDT has 32 entries
27171 -#define GDT_ENTRIES 32
27172 -#define GDT_SIZE (GDT_ENTRIES * 8)
27174 -/* Simple and small GDT entries for booting only */
27176 -#define GDT_ENTRY_BOOT_CS 2
27177 -#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
27179 -#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
27180 -#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
27182 -/* The PnP BIOS entries in the GDT */
27183 -#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
27184 -#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
27185 -#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
27186 -#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
27187 -#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
27189 -/* The PnP BIOS selectors */
27190 -#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
27191 -#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
27192 -#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
27193 -#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
27194 -#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
27197 - * The interrupt descriptor table has room for 256 idt's,
27198 - * the global descriptor table is dependent on the number
27199 - * of tasks we can have..
27201 -#define IDT_ENTRIES 256
27203 -/* Bottom two bits of selector give the ring privilege level */
27204 -#define SEGMENT_RPL_MASK 0x3
27205 -/* Bit 2 is table indicator (LDT/GDT) */
27206 -#define SEGMENT_TI_MASK 0x4
27208 -/* User mode is privilege level 3 */
27209 -#define USER_RPL 0x3
27210 -/* LDT segment has TI set, GDT has it cleared */
27211 -#define SEGMENT_LDT 0x4
27212 -#define SEGMENT_GDT 0x0
27214 -#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
27217 - * Matching rules for certain types of segments.
27220 -/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
27221 -#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
27222 - || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
27224 -/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
27225 -#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
27226 - || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
27227 - || ((x) & ~3) == (FLAT_USER_CS & ~3))
27229 -/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
27230 -#define SEGMENT_IS_PNP_CODE(x) (((x) & ~0x0b) == GDT_ENTRY_PNPBIOS_BASE * 8)
27233 --- a/include/asm-x86/mach-xen/asm/segment.h
27234 +++ b/include/asm-x86/mach-xen/asm/segment.h
27236 +#ifndef _ASM_X86_SEGMENT_H_
27237 +#define _ASM_X86_SEGMENT_H_
27239 +/* Simple and small GDT entries for booting only */
27241 +#define GDT_ENTRY_BOOT_CS 2
27242 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
27244 +#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
27245 +#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
27247 +#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
27248 +#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
27250 #ifdef CONFIG_X86_32
27251 -# include "segment_32.h"
27253 + * The layout of the per-CPU GDT under Linux:
27260 + * 4 - unused <==== new cacheline
27263 + * ------- start of TLS (Thread-Local Storage) segments:
27265 + * 6 - TLS segment #1 [ glibc's TLS segment ]
27266 + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
27267 + * 8 - TLS segment #3
27272 + * ------- start of kernel segments:
27274 + * 12 - kernel code segment <==== new cacheline
27275 + * 13 - kernel data segment
27276 + * 14 - default user CS
27277 + * 15 - default user DS
27280 + * 18 - PNPBIOS support (16->32 gate)
27281 + * 19 - PNPBIOS support
27282 + * 20 - PNPBIOS support
27283 + * 21 - PNPBIOS support
27284 + * 22 - PNPBIOS support
27285 + * 23 - APM BIOS support
27286 + * 24 - APM BIOS support
27287 + * 25 - APM BIOS support
27289 + * 26 - ESPFIX small SS
27290 + * 27 - per-cpu [ offset to per-cpu data area ]
27294 + * 31 - TSS for double fault handler
27296 +#define GDT_ENTRY_TLS_MIN 6
27297 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
27299 +#define GDT_ENTRY_DEFAULT_USER_CS 14
27300 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
27302 +#define GDT_ENTRY_DEFAULT_USER_DS 15
27303 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
27305 +#define GDT_ENTRY_KERNEL_BASE 12
27307 +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
27308 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
27310 +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
27311 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
27313 +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
27314 +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
27316 +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
27317 +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
27319 +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
27320 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
27322 +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
27324 +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
27326 -# include "../../segment_64.h"
27327 +#define __KERNEL_PERCPU 0
27330 +#define GDT_ENTRY_DOUBLEFAULT_TSS 31
27333 + * The GDT has 32 entries
27335 +#define GDT_ENTRIES 32
27337 +/* The PnP BIOS entries in the GDT */
27338 +#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
27339 +#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
27340 +#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
27341 +#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
27342 +#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
27344 +/* The PnP BIOS selectors */
27345 +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
27346 +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
27347 +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
27348 +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
27349 +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
27351 +/* Bottom two bits of selector give the ring privilege level */
27352 +#define SEGMENT_RPL_MASK 0x3
27353 +/* Bit 2 is table indicator (LDT/GDT) */
27354 +#define SEGMENT_TI_MASK 0x4
27356 +/* User mode is privilege level 3 */
27357 +#define USER_RPL 0x3
27358 +/* LDT segment has TI set, GDT has it cleared */
27359 +#define SEGMENT_LDT 0x4
27360 +#define SEGMENT_GDT 0x0
27363 + * Matching rules for certain types of segments.
27366 +/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
27367 +#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
27368 + || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
27370 +/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
27371 +#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
27372 + || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
27373 + || ((x) & ~3) == (FLAT_USER_CS & ~3))
27375 +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
27376 +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
27378 +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
27381 +#include <asm/cache.h>
27383 +#define __KERNEL_CS 0x10
27384 +#define __KERNEL_DS 0x18
27386 +#define __KERNEL32_CS 0x08
27389 + * we cannot use the same code segment descriptor for user and kernel
27390 + * -- not even in the long flat mode, because of different DPL /kkeil
27391 + * The segment offset needs to contain a RPL. Grr. -AK
27392 + * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
27395 +#define __USER32_CS 0x23 /* 4*8+3 */
27396 +#define __USER_DS 0x2b /* 5*8+3 */
27397 +#define __USER_CS 0x33 /* 6*8+3 */
27398 +#define __USER32_DS __USER_DS
27400 +#define GDT_ENTRY_TSS 8 /* needs two entries */
27401 +#define GDT_ENTRY_LDT 10 /* needs two entries */
27402 +#define GDT_ENTRY_TLS_MIN 12
27403 +#define GDT_ENTRY_TLS_MAX 14
27405 +#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
27406 +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
27408 +/* TLS indexes for 64bit - hardcoded in arch_prctl */
27412 +#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
27413 +#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
27415 +#define GDT_ENTRIES 16
27419 +/* User mode is privilege level 3 */
27420 +#define USER_RPL 0x3
27421 +/* LDT segment has TI set, GDT has it cleared */
27422 +#define SEGMENT_LDT 0x4
27423 +#define SEGMENT_GDT 0x0
27425 +/* Bottom two bits of selector give the ring privilege level */
27426 +#define SEGMENT_RPL_MASK 0x3
27427 +/* Bit 2 is table indicator (LDT/GDT) */
27428 +#define SEGMENT_TI_MASK 0x4
27430 +#define IDT_ENTRIES 256
27431 +#define GDT_SIZE (GDT_ENTRIES * 8)
27432 +#define GDT_ENTRY_TLS_ENTRIES 3
27433 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
27436 +#ifndef __ASSEMBLY__
27437 +extern const char early_idt_handlers[IDT_ENTRIES][10];
27442 --- a/include/asm-x86/mach-xen/asm/smp_32.h
27443 +++ b/include/asm-x86/mach-xen/asm/smp_32.h
27445 #ifndef __ASM_SMP_H
27446 #define __ASM_SMP_H
27448 +#ifndef __ASSEMBLY__
27449 +#include <linux/cpumask.h>
27450 +#include <linux/init.h>
27453 * We need the APIC definitions automatically as part of 'smp.h'
27455 -#ifndef __ASSEMBLY__
27456 -#include <linux/kernel.h>
27457 -#include <linux/threads.h>
27458 -#include <linux/cpumask.h>
27459 +#ifdef CONFIG_X86_LOCAL_APIC
27460 +# include <asm/mpspec.h>
27461 +# include <asm/apic.h>
27462 +# ifdef CONFIG_X86_IO_APIC
27463 +# include <asm/io_apic.h>
27467 -#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
27468 -#include <linux/bitops.h>
27469 -#include <asm/mpspec.h>
27470 -#include <asm/apic.h>
27471 -#ifdef CONFIG_X86_IO_APIC
27472 -#include <asm/io_apic.h>
27475 +#define cpu_callout_map cpu_possible_map
27476 +#define cpu_callin_map cpu_possible_map
27478 -#define BAD_APICID 0xFFu
27480 -#ifndef __ASSEMBLY__
27481 +extern int smp_num_siblings;
27482 +extern unsigned int num_processors;
27485 - * Private routines/data
27488 extern void smp_alloc_memory(void);
27489 -extern int pic_mode;
27490 -extern int smp_num_siblings;
27491 -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27492 -DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27493 +extern void lock_ipi_call_lock(void);
27494 +extern void unlock_ipi_call_lock(void);
27496 extern void (*mtrr_hook) (void);
27497 extern void zap_low_mappings (void);
27498 -extern void lock_ipi_call_lock(void);
27499 -extern void unlock_ipi_call_lock(void);
27501 -#define MAX_APICID 256
27502 -extern u8 __initdata x86_cpu_to_apicid_init[];
27503 -extern void *x86_cpu_to_apicid_ptr;
27504 +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27505 +DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27506 +DECLARE_PER_CPU(u8, cpu_llc_id);
27507 DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
27509 -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27511 #ifdef CONFIG_HOTPLUG_CPU
27512 extern void cpu_exit_clear(void);
27513 extern void cpu_uninit(void);
27520 +/* Globals due to paravirt */
27521 +extern void set_cpu_sibling_map(int cpu);
27525 void (*smp_prepare_boot_cpu)(void);
27526 @@ -104,11 +99,11 @@ void native_smp_prepare_cpus(unsigned in
27527 int native_cpu_up(unsigned int cpunum);
27528 void native_smp_cpus_done(unsigned int max_cpus);
27530 -#define startup_ipi_hook(phys_apicid, start_eip, start_esp) \
27534 +#ifndef CONFIG_PARAVIRT
27535 +#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
27538 +#else /* CONFIG_XEN */
27540 void xen_smp_send_stop(void);
27541 void xen_smp_send_reschedule(int cpu);
27542 @@ -120,7 +115,12 @@ int xen_smp_call_function_mask(cpumask_t
27543 #define smp_send_reschedule xen_smp_send_reschedule
27544 #define smp_call_function_mask xen_smp_call_function_mask
27547 +extern void prefill_possible_map(void);
27549 +#endif /* CONFIG_XEN */
27551 +extern int __cpu_disable(void);
27552 +extern void __cpu_die(unsigned int cpu);
27555 * This function is needed by all SMP systems. It must _always_ be valid
27556 @@ -130,64 +130,49 @@ int xen_smp_call_function_mask(cpumask_t
27557 DECLARE_PER_CPU(int, cpu_number);
27558 #define raw_smp_processor_id() (x86_read_percpu(cpu_number))
27560 -extern cpumask_t cpu_possible_map;
27561 -#define cpu_callin_map cpu_possible_map
27562 +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27564 +#define safe_smp_processor_id() smp_processor_id()
27566 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
27567 static inline int num_booting_cpus(void)
27569 - return cpus_weight(cpu_possible_map);
27570 + return cpus_weight(cpu_callout_map);
27573 -#define safe_smp_processor_id() smp_processor_id()
27574 -extern int __cpu_disable(void);
27575 -extern void __cpu_die(unsigned int cpu);
27576 -extern void prefill_possible_map(void);
27577 -extern unsigned int num_processors;
27579 -#endif /* !__ASSEMBLY__ */
27581 #else /* CONFIG_SMP */
27583 #define safe_smp_processor_id() 0
27584 #define cpu_physical_id(cpu) boot_cpu_physical_apicid
27586 -#define NO_PROC_ID 0xFF /* No processor magic marker */
27588 -#endif /* CONFIG_SMP */
27590 -#ifndef __ASSEMBLY__
27591 +#endif /* !CONFIG_SMP */
27593 #ifdef CONFIG_X86_LOCAL_APIC
27595 -#ifdef APIC_DEFINITION
27596 +static __inline int logical_smp_processor_id(void)
27598 + /* we don't want to mark this access volatile - bad code generation */
27599 + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27602 +# ifdef APIC_DEFINITION
27603 extern int hard_smp_processor_id(void);
27605 -#include <mach_apicdef.h>
27607 +# include <mach_apicdef.h>
27608 static inline int hard_smp_processor_id(void)
27610 /* we don't want to mark this access volatile - bad code generation */
27611 - return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
27612 + return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27614 -#endif /* APIC_DEFINITION */
27615 +# endif /* APIC_DEFINITION */
27617 #else /* CONFIG_X86_LOCAL_APIC */
27619 -#ifndef CONFIG_SMP
27620 -#define hard_smp_processor_id() 0
27622 +# ifndef CONFIG_SMP
27623 +# define hard_smp_processor_id() 0
27626 #endif /* CONFIG_X86_LOCAL_APIC */
27628 -extern u8 apicid_2_node[];
27630 -#ifdef CONFIG_X86_LOCAL_APIC
27631 -static __inline int logical_smp_processor_id(void)
27633 - /* we don't want to mark this access volatile - bad code generation */
27634 - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27639 +#endif /* !ASSEMBLY */
27641 --- a/include/asm-x86/mach-xen/asm/smp_64.h
27642 +++ b/include/asm-x86/mach-xen/asm/smp_64.h
27643 @@ -1,139 +1,103 @@
27644 #ifndef __ASM_SMP_H
27645 #define __ASM_SMP_H
27648 - * We need the APIC definitions automatically as part of 'smp.h'
27650 -#include <linux/threads.h>
27651 #include <linux/cpumask.h>
27652 -#include <linux/bitops.h>
27653 #include <linux/init.h>
27654 -extern int disable_apic;
27656 #ifdef CONFIG_X86_LOCAL_APIC
27657 -#include <asm/mpspec.h>
27659 + * We need the APIC definitions automatically as part of 'smp.h'
27661 #include <asm/apic.h>
27662 #ifdef CONFIG_X86_IO_APIC
27663 #include <asm/io_apic.h>
27665 -#include <asm/thread_info.h>
27666 +#include <asm/mpspec.h>
27671 #include <asm/pda.h>
27672 +#include <asm/thread_info.h>
27676 -extern cpumask_t cpu_present_mask;
27677 -extern cpumask_t cpu_possible_map;
27678 -extern cpumask_t cpu_online_map;
27679 extern cpumask_t cpu_initialized;
27682 - * Private routines/data
27685 +extern int smp_num_siblings;
27686 +extern unsigned int num_processors;
27688 extern void smp_alloc_memory(void);
27689 -extern volatile unsigned long smp_invalidate_needed;
27690 extern void lock_ipi_call_lock(void);
27691 extern void unlock_ipi_call_lock(void);
27692 -extern int smp_num_siblings;
27693 -extern void smp_send_reschedule(int cpu);
27695 extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
27696 void *info, int wait);
27699 - * cpu_sibling_map and cpu_core_map now live
27700 - * in the per cpu area
27702 - * extern cpumask_t cpu_sibling_map[NR_CPUS];
27703 - * extern cpumask_t cpu_core_map[NR_CPUS];
27705 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27706 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27707 -DECLARE_PER_CPU(u8, cpu_llc_id);
27709 -#define SMP_TRAMPOLINE_BASE 0x6000
27710 +DECLARE_PER_CPU(u16, cpu_llc_id);
27711 +DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
27712 +DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
27715 - * On x86 all CPUs are mapped 1:1 to the APIC space.
27716 - * This simplifies scheduling and IPI sending and
27717 - * compresses data structures.
27720 -static inline int num_booting_cpus(void)
27721 +#ifdef CONFIG_X86_LOCAL_APIC
27722 +static inline int cpu_present_to_apicid(int mps_cpu)
27724 - return cpus_weight(cpu_possible_map);
27725 + if (cpu_present(mps_cpu))
27726 + return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
27728 + return BAD_APICID;
27732 -#define raw_smp_processor_id() read_pda(cpunumber)
27735 +#define SMP_TRAMPOLINE_BASE 0x6000
27737 extern int __cpu_disable(void);
27738 extern void __cpu_die(unsigned int cpu);
27739 extern void prefill_possible_map(void);
27740 -extern unsigned num_processors;
27741 extern unsigned __cpuinitdata disabled_cpus;
27743 -#define NO_PROC_ID 0xFF /* No processor magic marker */
27745 -#endif /* CONFIG_SMP */
27746 +#define raw_smp_processor_id() read_pda(cpunumber)
27747 +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27749 -#define safe_smp_processor_id() smp_processor_id()
27751 -#ifdef CONFIG_X86_LOCAL_APIC
27752 -static inline int hard_smp_processor_id(void)
27754 - /* we don't want to mark this access volatile - bad code generation */
27755 - return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
27758 +#define stack_smp_processor_id() \
27760 + struct thread_info *ti; \
27761 + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
27766 - * Some lowlevel functions might want to know about
27767 - * the real APIC ID <-> CPU # mapping.
27768 + * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
27769 + * scheduling and IPI sending and compresses data structures.
27771 -extern u8 __initdata x86_cpu_to_apicid_init[];
27772 -extern void *x86_cpu_to_apicid_ptr;
27773 -DECLARE_PER_CPU(u8, x86_cpu_to_apicid); /* physical ID */
27774 -extern u8 bios_cpu_apicid[];
27776 -#ifdef CONFIG_X86_LOCAL_APIC
27777 -static inline int cpu_present_to_apicid(int mps_cpu)
27778 +static inline int num_booting_cpus(void)
27780 - if (mps_cpu < NR_CPUS)
27781 - return (int)bios_cpu_apicid[mps_cpu];
27783 - return BAD_APICID;
27784 + return cpus_weight(cpu_possible_map);
27788 -#ifndef CONFIG_SMP
27789 +extern void smp_send_reschedule(int cpu);
27791 +#else /* CONFIG_SMP */
27793 +extern unsigned int boot_cpu_id;
27794 +#define cpu_physical_id(cpu) boot_cpu_id
27795 #define stack_smp_processor_id() 0
27796 -#define cpu_logical_map(x) (x)
27798 -#include <asm/thread_info.h>
27799 -#define stack_smp_processor_id() \
27801 - struct thread_info *ti; \
27802 - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
27807 +#endif /* !CONFIG_SMP */
27809 +#define safe_smp_processor_id() smp_processor_id()
27811 #ifdef CONFIG_X86_LOCAL_APIC
27812 static __inline int logical_smp_processor_id(void)
27814 /* we don't want to mark this access volatile - bad code generation */
27815 - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27816 + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27819 +static inline int hard_smp_processor_id(void)
27821 + /* we don't want to mark this access volatile - bad code generation */
27822 + return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27827 -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27829 -extern unsigned int boot_cpu_id;
27830 -#define cpu_physical_id(cpu) boot_cpu_id
27831 -#endif /* !CONFIG_SMP */
27835 +++ b/include/asm-x86/mach-xen/asm/spinlock.h
27837 +#ifndef _X86_SPINLOCK_H_
27838 +#define _X86_SPINLOCK_H_
27840 +#include <asm/atomic.h>
27841 +#include <asm/rwlock.h>
27842 +#include <asm/page.h>
27843 +#include <asm/processor.h>
27844 +#include <linux/compiler.h>
27847 + * Your basic SMP spinlocks, allowing only a single CPU anywhere
27849 + * Simple spin lock operations. There are two variants, one clears IRQ's
27850 + * on the local processor, one does not.
27852 + * These are fair FIFO ticket locks, which are currently limited to 256
27855 + * (the type definitions are in asm/spinlock_types.h)
27858 +#ifdef CONFIG_X86_32
27859 +# define LOCK_PTR_REG "a"
27860 +# define REG_PTR_MODE "k"
27862 +# define LOCK_PTR_REG "D"
27863 +# define REG_PTR_MODE "q"
27866 +#if defined(CONFIG_X86_32) && \
27867 + (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
27869 + * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
27870 + * (PPro errata 66, 92)
27872 +# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
27874 +# define UNLOCK_LOCK_PREFIX
27877 +int xen_spinlock_init(unsigned int cpu);
27878 +void xen_spinlock_cleanup(unsigned int cpu);
27879 +extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
27880 +extern int xen_spin_wait_flags(raw_spinlock_t *, unsigned int *token,
27881 + unsigned int flags);
27882 +extern unsigned int xen_spin_adjust(raw_spinlock_t *, unsigned int token);
27883 +extern void xen_spin_kick(raw_spinlock_t *, unsigned int token);
27886 + * Ticket locks are conceptually two parts, one indicating the current head of
27887 + * the queue, and the other indicating the current tail. The lock is acquired
27888 + * by atomically noting the tail and incrementing it by one (thus adding
27889 + * ourself to the queue and noting our position), then waiting until the head
27890 + * becomes equal to the initial value of the tail.
27892 + * We use an xadd covering *both* parts of the lock, to increment the tail and
27893 + * also load the position of the head, which takes care of memory ordering
27894 + * issues and should be optimal for the uncontended case. Note the tail must be
27895 + * in the high part, because a wide xadd increment of the low part would carry
27896 + * up and contaminate the high part.
27898 + * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
27899 + * save some instructions and make the code more elegant. There really isn't
27900 + * much between them in performance though, especially as locks are out of line.
27902 +#if (NR_CPUS < 256)
27903 +#define TICKET_SHIFT 8
27904 +#define __raw_spin_lock_preamble \
27905 + asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
27906 + "cmpb %h0, %b0\n\t" \
27908 + : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
27910 + : "memory", "cc")
27911 +#define __raw_spin_lock_body \
27913 + "cmpb %h0, %b0\n\t" \
27917 + "rep ; nop\n\t" \
27918 + "movb %2, %b0\n\t" \
27919 + /* don't need lfence here, because loads are in-order */ \
27922 + : "+Q" (token), "+g" (count) \
27923 + : "m" (lock->slock) \
27924 + : "memory", "cc")
27927 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27931 + asm("movzwl %2, %0\n\t"
27932 + "cmpb %h0, %b0\n\t"
27933 + "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
27935 + LOCK_PREFIX "cmpxchgw %w1, %2\n\t"
27938 + "movzbl %b1, %0\n\t"
27939 + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
27941 + : "memory", "cc");
27946 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
27948 + unsigned int token;
27949 + unsigned char kick;
27951 + asm(UNLOCK_LOCK_PREFIX "incb %2\n\t"
27952 + "movzwl %2, %0\n\t"
27953 + "cmpb %h0, %b0\n\t"
27955 + : "=&Q" (token), "=qm" (kick), "+m" (lock->slock)
27957 + : "memory", "cc");
27959 + xen_spin_kick(lock, token);
27962 +#define TICKET_SHIFT 16
27963 +#define __raw_spin_lock_preamble \
27965 + unsigned int tmp; \
27966 + asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
27967 + "shldl $16, %0, %3\n\t" \
27968 + "cmpw %w3, %w0\n\t" \
27970 + : "=&r" (token), "=qm" (free), "+m" (lock->slock), \
27972 + : "0" (0x00010000) \
27973 + : "memory", "cc"); \
27975 +#define __raw_spin_lock_body \
27977 + unsigned int tmp; \
27978 + asm("shldl $16, %0, %2\n" \
27980 + "cmpw %w2, %w0\n\t" \
27984 + "rep ; nop\n\t" \
27985 + "movw %3, %w0\n\t" \
27986 + /* don't need lfence here, because loads are in-order */ \
27989 + : "+r" (token), "+g" (count), "=&g" (tmp) \
27990 + : "m" (lock->slock) \
27991 + : "memory", "cc"); \
27994 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27999 + asm("movl %2, %0\n\t"
28000 + "movl %0, %1\n\t"
28001 + "roll $16, %0\n\t"
28002 + "cmpl %0, %1\n\t"
28003 + "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
28005 + LOCK_PREFIX "cmpxchgl %1, %2\n"
28008 + "movzbl %b1, %0\n\t"
28009 + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
28011 + : "memory", "cc");
28016 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
28018 + unsigned int token, tmp;
28021 + asm(UNLOCK_LOCK_PREFIX "incw %2\n\t"
28022 + "movl %2, %0\n\t"
28023 + "shldl $16, %0, %3\n\t"
28024 + "cmpw %w3, %w0\n\t"
28026 + : "=&r" (token), "=qm" (kick), "+m" (lock->slock), "=&r" (tmp)
28028 + : "memory", "cc");
28030 + xen_spin_kick(lock, token);
28034 +static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
28036 + int tmp = *(volatile signed int *)(&(lock)->slock);
28038 + return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
28041 +static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
28043 + int tmp = *(volatile signed int *)(&(lock)->slock);
28045 + return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
28048 +static inline void __raw_spin_lock(raw_spinlock_t *lock)
28050 + unsigned int token, count;
28053 + __raw_spin_lock_preamble;
28054 + if (unlikely(!free))
28055 + token = xen_spin_adjust(lock, token);
28058 + __raw_spin_lock_body;
28059 + } while (unlikely(!count) && !xen_spin_wait(lock, token));
28062 +static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
28063 + unsigned long flags)
28065 + unsigned int token, count;
28068 + __raw_spin_lock_preamble;
28069 + if (unlikely(!free))
28070 + token = xen_spin_adjust(lock, token);
28073 + __raw_spin_lock_body;
28074 + } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
28077 +static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
28079 + while (__raw_spin_is_locked(lock))
28084 + * Read-write spinlocks, allowing multiple readers
28085 + * but only one writer.
28087 + * NOTE! it is quite common to have readers in interrupts
28088 + * but no interrupt writers. For those circumstances we
28089 + * can "mix" irq-safe locks - any writer needs to get an
28090 + * irq-safe write-lock, but readers can get non-irqsafe
28093 + * On x86, we implement read-write locks as a 32-bit counter
28094 + * with the high bit (sign) being the "contended" bit.
28098 + * read_can_lock - would read_trylock() succeed?
28099 + * @lock: the rwlock in question.
28101 +static inline int __raw_read_can_lock(raw_rwlock_t *lock)
28103 + return (int)(lock)->lock > 0;
28107 + * write_can_lock - would write_trylock() succeed?
28108 + * @lock: the rwlock in question.
28110 +static inline int __raw_write_can_lock(raw_rwlock_t *lock)
28112 + return (lock)->lock == RW_LOCK_BIAS;
28115 +static inline void __raw_read_lock(raw_rwlock_t *rw)
28117 + asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
28119 + "call __read_lock_failed\n\t"
28121 + ::LOCK_PTR_REG (rw) : "memory");
28124 +static inline void __raw_write_lock(raw_rwlock_t *rw)
28126 + asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
28128 + "call __write_lock_failed\n\t"
28130 + ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
28133 +static inline int __raw_read_trylock(raw_rwlock_t *lock)
28135 + atomic_t *count = (atomic_t *)lock;
28137 + atomic_dec(count);
28138 + if (atomic_read(count) >= 0)
28140 + atomic_inc(count);
28144 +static inline int __raw_write_trylock(raw_rwlock_t *lock)
28146 + atomic_t *count = (atomic_t *)lock;
28148 + if (atomic_sub_and_test(RW_LOCK_BIAS, count))
28150 + atomic_add(RW_LOCK_BIAS, count);
28154 +static inline void __raw_read_unlock(raw_rwlock_t *rw)
28156 + asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
28159 +static inline void __raw_write_unlock(raw_rwlock_t *rw)
28161 + asm volatile(LOCK_PREFIX "addl %1, %0"
28162 + : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
28165 +#define _raw_spin_relax(lock) cpu_relax()
28166 +#define _raw_read_relax(lock) cpu_relax()
28167 +#define _raw_write_relax(lock) cpu_relax()
28170 --- a/include/asm-x86/mach-xen/asm/system_32.h
28173 -#ifndef __ASM_SYSTEM_H
28174 -#define __ASM_SYSTEM_H
28176 -#include <linux/kernel.h>
28177 -#include <asm/segment.h>
28178 -#include <asm/cpufeature.h>
28179 -#include <asm/cmpxchg.h>
28180 -#include <asm/synch_bitops.h>
28181 -#include <asm/hypervisor.h>
28184 -#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
28186 -struct task_struct; /* one of the stranger aspects of C forward declarations.. */
28187 -extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
28190 - * Saving eflags is important. It switches not only IOPL between tasks,
28191 - * it also protects other tasks from NT leaking through sysenter etc.
28193 -#define switch_to(prev,next,last) do { \
28194 - unsigned long esi,edi; \
28195 - asm volatile("pushfl\n\t" /* Save flags */ \
28196 - "pushl %%ebp\n\t" \
28197 - "movl %%esp,%0\n\t" /* save ESP */ \
28198 - "movl %5,%%esp\n\t" /* restore ESP */ \
28199 - "movl $1f,%1\n\t" /* save EIP */ \
28200 - "pushl %6\n\t" /* restore EIP */ \
28201 - "jmp __switch_to\n" \
28203 - "popl %%ebp\n\t" \
28205 - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
28206 - "=a" (last),"=S" (esi),"=D" (edi) \
28207 - :"m" (next->thread.esp),"m" (next->thread.eip), \
28208 - "2" (prev), "d" (next)); \
28211 -#define _set_base(addr,base) do { unsigned long __pr; \
28212 -__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28213 - "rorl $16,%%edx\n\t" \
28214 - "movb %%dl,%2\n\t" \
28217 - :"m" (*((addr)+2)), \
28218 - "m" (*((addr)+4)), \
28219 - "m" (*((addr)+7)), \
28223 -#define _set_limit(addr,limit) do { unsigned long __lr; \
28224 -__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28225 - "rorl $16,%%edx\n\t" \
28226 - "movb %2,%%dh\n\t" \
28227 - "andb $0xf0,%%dh\n\t" \
28228 - "orb %%dh,%%dl\n\t" \
28231 - :"m" (*(addr)), \
28232 - "m" (*((addr)+6)), \
28236 -#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
28237 -#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
28240 - * Load a segment. Fall back on loading the zero
28241 - * segment if something goes wrong..
28243 -#define loadsegment(seg,value) \
28244 - asm volatile("\n" \
28246 - "mov %0,%%" #seg "\n" \
28248 - ".section .fixup,\"ax\"\n" \
28251 - "popl %%" #seg "\n\t" \
28254 - ".section __ex_table,\"a\"\n\t" \
28256 - ".long 1b,3b\n" \
28261 - * Save a segment register away
28263 -#define savesegment(seg, value) \
28264 - asm volatile("mov %%" #seg ",%0":"=rm" (value))
28266 -static inline void xen_clts(void)
28268 - HYPERVISOR_fpu_taskswitch(0);
28271 -static inline unsigned long xen_read_cr0(void)
28273 - unsigned long val;
28274 - asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
28278 -static inline void xen_write_cr0(unsigned long val)
28280 - asm volatile("movl %0,%%cr0": :"r" (val));
28283 -#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28285 -static inline void xen_write_cr2(unsigned long val)
28287 - asm volatile("movl %0,%%cr2": :"r" (val));
28290 -static inline unsigned long xen_read_cr3(void)
28292 - unsigned long val;
28293 - asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
28294 - return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28297 -static inline void xen_write_cr3(unsigned long val)
28299 - val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28300 - asm volatile("movl %0,%%cr3": :"r" (val));
28303 -static inline unsigned long xen_read_cr4(void)
28305 - unsigned long val;
28306 - asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
28310 -static inline unsigned long xen_read_cr4_safe(void)
28312 - unsigned long val;
28313 - /* This could fault if %cr4 does not exist */
28314 - asm volatile("1: movl %%cr4, %0 \n"
28316 - ".section __ex_table,\"a\" \n"
28319 - : "=r" (val): "0" (0));
28323 -static inline void xen_write_cr4(unsigned long val)
28325 - asm volatile("movl %0,%%cr4": :"r" (val));
28328 -static inline void xen_wbinvd(void)
28330 - asm volatile("wbinvd": : :"memory");
28333 -static inline void clflush(volatile void *__p)
28335 - asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28338 -#define read_cr0() (xen_read_cr0())
28339 -#define write_cr0(x) (xen_write_cr0(x))
28340 -#define read_cr2() (xen_read_cr2())
28341 -#define write_cr2(x) (xen_write_cr2(x))
28342 -#define read_cr3() (xen_read_cr3())
28343 -#define write_cr3(x) (xen_write_cr3(x))
28344 -#define read_cr4() (xen_read_cr4())
28345 -#define read_cr4_safe() (xen_read_cr4_safe())
28346 -#define write_cr4(x) (xen_write_cr4(x))
28347 -#define wbinvd() (xen_wbinvd())
28349 -/* Clear the 'TS' bit */
28350 -#define clts() (xen_clts())
28352 -/* Set the 'TS' bit */
28353 -#define stts() (HYPERVISOR_fpu_taskswitch(1))
28355 -#endif /* __KERNEL__ */
28357 -static inline unsigned long get_limit(unsigned long segment)
28359 - unsigned long __limit;
28360 - __asm__("lsll %1,%0"
28361 - :"=r" (__limit):"r" (segment));
28362 - return __limit+1;
28365 -#define nop() __asm__ __volatile__ ("nop")
28368 - * Force strict CPU ordering.
28369 - * And yes, this is required on UP too when we're talking
28372 - * For now, "wmb()" doesn't actually do anything, as all
28373 - * Intel CPU's follow what Intel calls a *Processor Order*,
28374 - * in which all writes are seen in the program order even
28375 - * outside the CPU.
28377 - * I expect future Intel CPU's to have a weaker ordering,
28378 - * but I'd also expect them to finally get their act together
28379 - * and add some real memory barriers if so.
28381 - * Some non intel clones support out of order store. wmb() ceases to be a
28386 -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28387 -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28388 -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28391 - * read_barrier_depends - Flush all pending reads that subsequents reads
28394 - * No data-dependent reads from memory-like regions are ever reordered
28395 - * over this barrier. All reads preceding this primitive are guaranteed
28396 - * to access memory (but not necessarily other CPUs' caches) before any
28397 - * reads following this primitive that depend on the data return by
28398 - * any of the preceding reads. This primitive is much lighter weight than
28399 - * rmb() on most CPUs, and is never heavier weight than is
28402 - * These ordering constraints are respected by both the local CPU
28403 - * and the compiler.
28405 - * Ordering is not guaranteed by anything other than these primitives,
28406 - * not even by data dependencies. See the documentation for
28407 - * memory_barrier() for examples and URLs to more information.
28409 - * For example, the following code would force ordering (the initial
28410 - * value of "a" is zero, "b" is one, and "p" is "&a"):
28412 - * <programlisting>
28416 - * memory_barrier();
28418 - * read_barrier_depends();
28420 - * </programlisting>
28422 - * because the read of "*q" depends on the read of "p" and these
28423 - * two reads are separated by a read_barrier_depends(). However,
28424 - * the following code, with the same initial values for "a" and "b":
28426 - * <programlisting>
28430 - * memory_barrier();
28432 - * read_barrier_depends();
28434 - * </programlisting>
28436 - * does not enforce ordering, since there is no data dependency between
28437 - * the read of "a" and the read of "b". Therefore, on some CPUs, such
28438 - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
28439 - * in cases like this where there are no data dependencies.
28442 -#define read_barrier_depends() do { } while(0)
28445 -#define smp_mb() mb()
28446 -#ifdef CONFIG_X86_PPRO_FENCE
28447 -# define smp_rmb() rmb()
28449 -# define smp_rmb() barrier()
28451 -#ifdef CONFIG_X86_OOSTORE
28452 -# define smp_wmb() wmb()
28454 -# define smp_wmb() barrier()
28456 -#define smp_read_barrier_depends() read_barrier_depends()
28457 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28459 -#define smp_mb() barrier()
28460 -#define smp_rmb() barrier()
28461 -#define smp_wmb() barrier()
28462 -#define smp_read_barrier_depends() do { } while(0)
28463 -#define set_mb(var, value) do { var = value; barrier(); } while (0)
28466 -#include <linux/irqflags.h>
28469 - * disable hlt during certain critical i/o operations
28471 -#define HAVE_DISABLE_HLT
28472 -void disable_hlt(void);
28473 -void enable_hlt(void);
28475 -extern int es7000_plat;
28476 -void cpu_idle_wait(void);
28478 -extern unsigned long arch_align_stack(unsigned long sp);
28479 -extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28481 -void default_idle(void);
28482 -void __show_registers(struct pt_regs *, int all);
28485 --- a/include/asm-x86/mach-xen/asm/system_64.h
28486 +++ b/include/asm-x86/mach-xen/asm/system_64.h
28488 #ifndef __ASM_SYSTEM_H
28489 #define __ASM_SYSTEM_H
28491 -#include <linux/kernel.h>
28492 #include <asm/segment.h>
28493 #include <asm/cmpxchg.h>
28495 -#include <asm/synch_bitops.h>
28496 -#include <asm/hypervisor.h>
28497 -#include <xen/interface/arch-x86_64.h>
28501 -/* entries in ARCH_DLINFO: */
28502 -#ifdef CONFIG_IA32_EMULATION
28503 -# define AT_VECTOR_SIZE_ARCH 2
28505 -# define AT_VECTOR_SIZE_ARCH 1
28508 -#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
28509 -#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
28511 -/* frame pointer must be last for get_wchan */
28512 -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
28513 -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
28515 -#define __EXTRA_CLOBBER \
28516 - ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
28518 -/* Save restore flags to clear handle leaking NT */
28519 -#define switch_to(prev,next,last) \
28520 - asm volatile(SAVE_CONTEXT \
28521 - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
28522 - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
28523 - "call __switch_to\n\t" \
28524 - ".globl thread_return\n" \
28525 - "thread_return:\n\t" \
28526 - "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
28527 - "movq %P[thread_info](%%rsi),%%r8\n\t" \
28528 - LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
28529 - "movq %%rax,%%rdi\n\t" \
28530 - "jc ret_from_fork\n\t" \
28531 - RESTORE_CONTEXT \
28533 - : [next] "S" (next), [prev] "D" (prev), \
28534 - [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
28535 - [ti_flags] "i" (offsetof(struct thread_info, flags)),\
28536 - [tif_fork] "i" (TIF_FORK), \
28537 - [thread_info] "i" (offsetof(struct task_struct, stack)), \
28538 - [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
28539 - : "memory", "cc" __EXTRA_CLOBBER)
28541 -extern void load_gs_index(unsigned);
28544 - * Load a segment. Fall back on loading the zero
28545 - * segment if something goes wrong..
28547 -#define loadsegment(seg,value) \
28548 - asm volatile("\n" \
28550 - "movl %k0,%%" #seg "\n" \
28552 - ".section .fixup,\"ax\"\n" \
28554 - "movl %1,%%" #seg "\n\t" \
28557 - ".section __ex_table,\"a\"\n\t" \
28559 - ".quad 1b,3b\n" \
28561 - : :"r" (value), "r" (0))
28564 - * Clear and set 'TS' bit respectively
28566 -#define clts() (HYPERVISOR_fpu_taskswitch(0))
28568 -static inline unsigned long read_cr0(void)
28570 - unsigned long cr0;
28571 - asm volatile("movq %%cr0,%0" : "=r" (cr0));
28575 -static inline void write_cr0(unsigned long val)
28577 - asm volatile("movq %0,%%cr0" :: "r" (val));
28580 -#define read_cr2() current_vcpu_info()->arch.cr2
28582 -#define write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28584 -#define read_cr3() ({ \
28585 - unsigned long __dummy; \
28586 - asm volatile("movq %%cr3,%0" : "=r" (__dummy)); \
28587 - machine_to_phys(__dummy); \
28590 -static inline void write_cr3(unsigned long val)
28592 - val = phys_to_machine(val);
28593 - asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
28596 -static inline unsigned long read_cr4(void)
28598 - unsigned long cr4;
28599 - asm volatile("movq %%cr4,%0" : "=r" (cr4));
28603 -static inline void write_cr4(unsigned long val)
28605 - asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
28608 static inline unsigned long read_cr8(void)
28610 @@ -128,52 +15,6 @@ static inline void write_cr8(unsigned lo
28614 -#define stts() (HYPERVISOR_fpu_taskswitch(1))
28616 -#define wbinvd() \
28617 - __asm__ __volatile__ ("wbinvd": : :"memory")
28619 -#endif /* __KERNEL__ */
28621 -static inline void clflush(volatile void *__p)
28623 - asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28626 -#define nop() __asm__ __volatile__ ("nop")
28629 -#define smp_mb() mb()
28630 -#define smp_rmb() barrier()
28631 -#define smp_wmb() barrier()
28632 -#define smp_read_barrier_depends() do {} while(0)
28634 -#define smp_mb() barrier()
28635 -#define smp_rmb() barrier()
28636 -#define smp_wmb() barrier()
28637 -#define smp_read_barrier_depends() do {} while(0)
28642 - * Force strict CPU ordering.
28643 - * And yes, this is required on UP too when we're talking
28646 -#define mb() asm volatile("mfence":::"memory")
28647 -#define rmb() asm volatile("lfence":::"memory")
28648 -#define wmb() asm volatile("sfence" ::: "memory")
28650 -#define read_barrier_depends() do {} while(0)
28651 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28653 -#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
28655 #include <linux/irqflags.h>
28657 -void cpu_idle_wait(void);
28659 -extern unsigned long arch_align_stack(unsigned long sp);
28660 -extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28663 --- a/include/asm-x86/mach-xen/asm/system.h
28664 +++ b/include/asm-x86/mach-xen/asm/system.h
28666 +#ifndef _ASM_X86_SYSTEM_H_
28667 +#define _ASM_X86_SYSTEM_H_
28669 +#include <asm/asm.h>
28670 +#include <asm/segment.h>
28671 +#include <asm/cpufeature.h>
28672 +#include <asm/cmpxchg.h>
28673 +#include <asm/nops.h>
28674 +#include <asm/hypervisor.h>
28676 +#include <linux/kernel.h>
28677 +#include <linux/irqflags.h>
28679 +/* entries in ARCH_DLINFO: */
28680 +#ifdef CONFIG_IA32_EMULATION
28681 +# define AT_VECTOR_SIZE_ARCH 2
28683 +# define AT_VECTOR_SIZE_ARCH 1
28686 +#ifdef CONFIG_X86_32
28688 +struct task_struct; /* one of the stranger aspects of C forward declarations */
28689 +struct task_struct *__switch_to(struct task_struct *prev,
28690 + struct task_struct *next);
28693 + * Saving eflags is important. It switches not only IOPL between tasks,
28694 + * it also protects other tasks from NT leaking through sysenter etc.
28696 +#define switch_to(prev, next, last) do { \
28697 + unsigned long esi, edi; \
28698 + asm volatile("pushfl\n\t" /* Save flags */ \
28699 + "pushl %%ebp\n\t" \
28700 + "movl %%esp,%0\n\t" /* save ESP */ \
28701 + "movl %5,%%esp\n\t" /* restore ESP */ \
28702 + "movl $1f,%1\n\t" /* save EIP */ \
28703 + "pushl %6\n\t" /* restore EIP */ \
28704 + "jmp __switch_to\n" \
28706 + "popl %%ebp\n\t" \
28708 + :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \
28709 + "=a" (last), "=S" (esi), "=D" (edi) \
28710 + :"m" (next->thread.sp), "m" (next->thread.ip), \
28711 + "2" (prev), "d" (next)); \
28715 + * disable hlt during certain critical i/o operations
28717 +#define HAVE_DISABLE_HLT
28719 +#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
28720 +#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
28722 +/* frame pointer must be last for get_wchan */
28723 +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
28724 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
28726 +#define __EXTRA_CLOBBER \
28727 + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
28728 + "r12", "r13", "r14", "r15"
28730 +/* Save and restore flags to handle a leaking NT flag */
28731 +#define switch_to(prev, next, last) \
28732 + asm volatile(SAVE_CONTEXT \
28733 + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
28734 + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
28735 + "call __switch_to\n\t" \
28736 + ".globl thread_return\n" \
28737 + "thread_return:\n\t" \
28738 + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
28739 + "movq %P[thread_info](%%rsi),%%r8\n\t" \
28740 + LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
28741 + "movq %%rax,%%rdi\n\t" \
28742 + "jc ret_from_fork\n\t" \
28743 + RESTORE_CONTEXT \
28745 + : [next] "S" (next), [prev] "D" (prev), \
28746 + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
28747 + [ti_flags] "i" (offsetof(struct thread_info, flags)), \
28748 + [tif_fork] "i" (TIF_FORK), \
28749 + [thread_info] "i" (offsetof(struct task_struct, stack)), \
28750 + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
28751 + : "memory", "cc" __EXTRA_CLOBBER)
28755 +#define _set_base(addr, base) do { unsigned long __pr; \
28756 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28757 + "rorl $16,%%edx\n\t" \
28758 + "movb %%dl,%2\n\t" \
28761 + :"m" (*((addr)+2)), \
28762 + "m" (*((addr)+4)), \
28763 + "m" (*((addr)+7)), \
28767 +#define _set_limit(addr, limit) do { unsigned long __lr; \
28768 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28769 + "rorl $16,%%edx\n\t" \
28770 + "movb %2,%%dh\n\t" \
28771 + "andb $0xf0,%%dh\n\t" \
28772 + "orb %%dh,%%dl\n\t" \
28775 + :"m" (*(addr)), \
28776 + "m" (*((addr)+6)), \
28780 +#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
28781 +#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
28783 +extern void load_gs_index(unsigned);
28786 + * Load a segment. Fall back on loading the zero
28787 + * segment if something goes wrong..
28789 +#define loadsegment(seg, value) \
28790 + asm volatile("\n" \
28792 + "movl %k0,%%" #seg "\n" \
28794 + ".section .fixup,\"ax\"\n" \
28796 + "movl %k1, %%" #seg "\n\t" \
28799 + _ASM_EXTABLE(1b,3b) \
28800 + : :"r" (value), "r" (0))
28804 + * Save a segment register away
28806 +#define savesegment(seg, value) \
28807 + asm volatile("mov %%" #seg ",%0":"=rm" (value))
28809 +static inline unsigned long get_limit(unsigned long segment)
28811 + unsigned long __limit;
28812 + __asm__("lsll %1,%0"
28813 + :"=r" (__limit):"r" (segment));
28814 + return __limit+1;
28817 +static inline void xen_clts(void)
28819 + HYPERVISOR_fpu_taskswitch(0);
28822 +static inline void xen_stts(void)
28824 + HYPERVISOR_fpu_taskswitch(1);
28828 + * Volatile isn't enough to prevent the compiler from reordering the
28829 + * read/write functions for the control registers and messing everything up.
28830 + * A memory clobber would solve the problem, but would prevent reordering of
28831 + * all loads and stores around it, which can hurt performance. The solution is to
28832 + * use a variable and mimic reads and writes to it to enforce serialization
28834 +static unsigned long __force_order;
28836 +static inline unsigned long xen_read_cr0(void)
28838 + unsigned long val;
28839 + asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
28843 +static inline void xen_write_cr0(unsigned long val)
28845 + asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
28848 +#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28849 +#define xen_write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28851 +static inline unsigned long xen_read_cr3(void)
28853 + unsigned long val;
28854 + asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
28855 +#ifdef CONFIG_X86_32
28856 + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28858 + return machine_to_phys(val);
28862 +static inline void xen_write_cr3(unsigned long val)
28864 +#ifdef CONFIG_X86_32
28865 + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28867 + val = phys_to_machine(val);
28869 + asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
28872 +static inline unsigned long xen_read_cr4(void)
28874 + unsigned long val;
28875 + asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
28879 +#define xen_read_cr4_safe() xen_read_cr4()
28881 +static inline void xen_write_cr4(unsigned long val)
28883 + asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
28886 +#ifdef CONFIG_X86_64
28887 +static inline unsigned long xen_read_cr8(void)
28892 +static inline void xen_write_cr8(unsigned long val)
28898 +static inline void xen_wbinvd(void)
28900 + asm volatile("wbinvd": : :"memory");
28902 +#define read_cr0() (xen_read_cr0())
28903 +#define write_cr0(x) (xen_write_cr0(x))
28904 +#define read_cr2() (xen_read_cr2())
28905 +#define write_cr2(x) (xen_write_cr2(x))
28906 +#define read_cr3() (xen_read_cr3())
28907 +#define write_cr3(x) (xen_write_cr3(x))
28908 +#define read_cr4() (xen_read_cr4())
28909 +#define read_cr4_safe() (xen_read_cr4_safe())
28910 +#define write_cr4(x) (xen_write_cr4(x))
28911 +#define wbinvd() (xen_wbinvd())
28912 +#ifdef CONFIG_X86_64
28913 +#define read_cr8() (xen_read_cr8())
28914 +#define write_cr8(x) (xen_write_cr8(x))
28917 +/* Clear the 'TS' bit */
28918 +#define clts() (xen_clts())
28919 +#define stts() (xen_stts())
28921 +#endif /* __KERNEL__ */
28923 +static inline void clflush(volatile void *__p)
28925 + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
28928 +#define nop() __asm__ __volatile__ ("nop")
28930 +void disable_hlt(void);
28931 +void enable_hlt(void);
28933 +extern int es7000_plat;
28934 +void cpu_idle_wait(void);
28936 +extern unsigned long arch_align_stack(unsigned long sp);
28937 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28939 +void default_idle(void);
28942 + * Force strict CPU ordering.
28943 + * And yes, this is required on UP too when we're talking
28946 #ifdef CONFIG_X86_32
28947 -# include "system_32.h"
28949 + * For now, "wmb()" doesn't actually do anything, as all
28950 + * Intel CPU's follow what Intel calls a *Processor Order*,
28951 + * in which all writes are seen in the program order even
28952 + * outside the CPU.
28954 + * I expect future Intel CPUs to have a weaker ordering,
28955 + * but I'd also expect them to finally get their act together
28956 + * and add some real memory barriers if so.
28958 + * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
28961 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28962 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28963 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28965 +#define mb() asm volatile("mfence":::"memory")
28966 +#define rmb() asm volatile("lfence":::"memory")
28967 +#define wmb() asm volatile("sfence" ::: "memory")
28971 + * read_barrier_depends - Flush all pending reads that subsequent reads
28974 + * No data-dependent reads from memory-like regions are ever reordered
28975 + * over this barrier. All reads preceding this primitive are guaranteed
28976 + * to access memory (but not necessarily other CPUs' caches) before any
28977 + * reads following this primitive that depend on the data return by
28978 + * any of the preceding reads. This primitive is much lighter weight than
28979 + * rmb() on most CPUs, and is never heavier weight than is
28982 + * These ordering constraints are respected by both the local CPU
28983 + * and the compiler.
28985 + * Ordering is not guaranteed by anything other than these primitives,
28986 + * not even by data dependencies. See the documentation for
28987 + * memory_barrier() for examples and URLs to more information.
28989 + * For example, the following code would force ordering (the initial
28990 + * value of "a" is zero, "b" is one, and "p" is "&a"):
28992 + * <programlisting>
28996 + * memory_barrier();
28998 + * read_barrier_depends();
29000 + * </programlisting>
29002 + * because the read of "*q" depends on the read of "p" and these
29003 + * two reads are separated by a read_barrier_depends(). However,
29004 + * the following code, with the same initial values for "a" and "b":
29006 + * <programlisting>
29010 + * memory_barrier();
29012 + * read_barrier_depends();
29014 + * </programlisting>
29016 + * does not enforce ordering, since there is no data dependency between
29017 + * the read of "a" and the read of "b". Therefore, on some CPUs, such
29018 + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
29019 + * in cases like this where there are no data dependencies.
29022 +#define read_barrier_depends() do { } while (0)
29025 +#define smp_mb() mb()
29026 +#ifdef CONFIG_X86_PPRO_FENCE
29027 +# define smp_rmb() rmb()
29029 -# include "system_64.h"
29030 +# define smp_rmb() barrier()
29032 +#ifdef CONFIG_X86_OOSTORE
29033 +# define smp_wmb() wmb()
29035 +# define smp_wmb() barrier()
29037 +#define smp_read_barrier_depends() read_barrier_depends()
29038 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
29040 +#define smp_mb() barrier()
29041 +#define smp_rmb() barrier()
29042 +#define smp_wmb() barrier()
29043 +#define smp_read_barrier_depends() do { } while (0)
29044 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
29048 + * Stop RDTSC speculation. This is needed when you need to use RDTSC
29049 + * (or get_cycles or vread that possibly accesses the TSC) in a defined
29052 + * (Could use an alternative three way for this if there was one.)
29054 +static inline void rdtsc_barrier(void)
29056 + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
29057 + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
29061 --- a/include/asm-x86/mach-xen/asm/tlbflush_32.h
29064 -#ifndef _I386_TLBFLUSH_H
29065 -#define _I386_TLBFLUSH_H
29067 -#include <linux/mm.h>
29068 -#include <asm/processor.h>
29070 -#define __flush_tlb() xen_tlb_flush()
29071 -#define __flush_tlb_global() xen_tlb_flush()
29072 -#define __flush_tlb_all() xen_tlb_flush()
29074 -#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
29076 -#define __flush_tlb_single(addr) xen_invlpg(addr)
29078 -#define __flush_tlb_one(addr) __flush_tlb_single(addr)
29083 - * - flush_tlb() flushes the current mm struct TLBs
29084 - * - flush_tlb_all() flushes all processes TLBs
29085 - * - flush_tlb_mm(mm) flushes the specified mm context TLB's
29086 - * - flush_tlb_page(vma, vmaddr) flushes one page
29087 - * - flush_tlb_range(vma, start, end) flushes a range of pages
29088 - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
29090 - * ..but the i386 has somewhat limited tlb flushing capabilities,
29091 - * and page-granular flushes are available only on i486 and up.
29094 -#define TLB_FLUSH_ALL 0xffffffff
29097 -#ifndef CONFIG_SMP
29099 -#include <linux/sched.h>
29101 -#define flush_tlb() __flush_tlb()
29102 -#define flush_tlb_all() __flush_tlb_all()
29103 -#define local_flush_tlb() __flush_tlb()
29105 -static inline void flush_tlb_mm(struct mm_struct *mm)
29107 - if (mm == current->active_mm)
29111 -static inline void flush_tlb_page(struct vm_area_struct *vma,
29112 - unsigned long addr)
29114 - if (vma->vm_mm == current->active_mm)
29115 - __flush_tlb_one(addr);
29118 -static inline void flush_tlb_range(struct vm_area_struct *vma,
29119 - unsigned long start, unsigned long end)
29121 - if (vma->vm_mm == current->active_mm)
29127 -#include <asm/smp.h>
29129 -#define local_flush_tlb() \
29132 -#define flush_tlb_all xen_tlb_flush_all
29133 -#define flush_tlb_current_task() xen_tlb_flush_mask(¤t->mm->cpu_vm_mask)
29134 -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29135 -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29137 -#define flush_tlb() flush_tlb_current_task()
29139 -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
29141 - flush_tlb_mm(vma->vm_mm);
29144 -#define TLBSTATE_OK 1
29145 -#define TLBSTATE_LAZY 2
29149 - struct mm_struct *active_mm;
29151 - char __cacheline_padding[L1_CACHE_BYTES-8];
29153 -DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
29156 -static inline void flush_tlb_kernel_range(unsigned long start,
29157 - unsigned long end)
29162 -#endif /* _I386_TLBFLUSH_H */
29163 --- a/include/asm-x86/mach-xen/asm/tlbflush_64.h
29166 -#ifndef _X8664_TLBFLUSH_H
29167 -#define _X8664_TLBFLUSH_H
29169 -#include <linux/mm.h>
29170 -#include <linux/sched.h>
29171 -#include <asm/processor.h>
29172 -#include <asm/system.h>
29174 -#define __flush_tlb() xen_tlb_flush()
29177 - * Global pages have to be flushed a bit differently. Not a real
29178 - * performance problem because this does not happen often.
29180 -#define __flush_tlb_global() xen_tlb_flush()
29182 -#define __flush_tlb_all() __flush_tlb_global()
29184 -#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
29190 - * - flush_tlb() flushes the current mm struct TLBs
29191 - * - flush_tlb_all() flushes all processes TLBs
29192 - * - flush_tlb_mm(mm) flushes the specified mm context TLB's
29193 - * - flush_tlb_page(vma, vmaddr) flushes one page
29194 - * - flush_tlb_range(vma, start, end) flushes a range of pages
29195 - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
29197 - * x86-64 can only flush individual pages or full VMs. For a range flush
29198 - * we always do the full VM. Might be worth trying if for a small
29199 - * range a few INVLPGs in a row are a win.
29202 -#ifndef CONFIG_SMP
29204 -#define flush_tlb() __flush_tlb()
29205 -#define flush_tlb_all() __flush_tlb_all()
29206 -#define local_flush_tlb() __flush_tlb()
29208 -static inline void flush_tlb_mm(struct mm_struct *mm)
29210 - if (mm == current->active_mm)
29214 -static inline void flush_tlb_page(struct vm_area_struct *vma,
29215 - unsigned long addr)
29217 - if (vma->vm_mm == current->active_mm)
29218 - __flush_tlb_one(addr);
29221 -static inline void flush_tlb_range(struct vm_area_struct *vma,
29222 - unsigned long start, unsigned long end)
29224 - if (vma->vm_mm == current->active_mm)
29230 -#include <asm/smp.h>
29232 -#define local_flush_tlb() \
29235 -#define flush_tlb_all xen_tlb_flush_all
29236 -#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
29237 -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29238 -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29240 -#define flush_tlb() flush_tlb_current_task()
29242 -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
29244 - flush_tlb_mm(vma->vm_mm);
29247 -#define TLBSTATE_OK 1
29248 -#define TLBSTATE_LAZY 2
29250 -/* Roughly an IPI every 20MB with 4k pages for freeing page table
29251 - ranges. Cost is about 42k of memory for each CPU. */
29252 -#define ARCH_FREE_PTE_NR 5350
29256 -static inline void flush_tlb_kernel_range(unsigned long start,
29257 - unsigned long end)
29262 -#endif /* _X8664_TLBFLUSH_H */
29263 --- a/include/asm-x86/mach-xen/asm/tlbflush.h
29264 +++ b/include/asm-x86/mach-xen/asm/tlbflush.h
29266 +#ifndef _ASM_X86_TLBFLUSH_H
29267 +#define _ASM_X86_TLBFLUSH_H
29269 +#include <linux/mm.h>
29270 +#include <linux/sched.h>
29272 +#include <asm/processor.h>
29273 +#include <asm/system.h>
29275 +#define __flush_tlb() xen_tlb_flush()
29276 +#define __flush_tlb_global() xen_tlb_flush()
29277 +#define __flush_tlb_single(addr) xen_invlpg(addr)
29278 +#define __flush_tlb_all() xen_tlb_flush()
29279 +#define __flush_tlb_one(addr) xen_invlpg(addr)
29281 #ifdef CONFIG_X86_32
29282 -# include "tlbflush_32.h"
29283 +# define TLB_FLUSH_ALL 0xffffffff
29285 -# include "tlbflush_64.h"
29286 +# define TLB_FLUSH_ALL -1ULL
29292 + * - flush_tlb() flushes the current mm struct TLBs
29293 + * - flush_tlb_all() flushes all processes TLBs
29294 + * - flush_tlb_mm(mm) flushes the specified mm context TLB's
29295 + * - flush_tlb_page(vma, vmaddr) flushes one page
29296 + * - flush_tlb_range(vma, start, end) flushes a range of pages
29297 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
29299 + * ..but the i386 has somewhat limited tlb flushing capabilities,
29300 + * and page-granular flushes are available only on i486 and up.
29302 + * x86-64 can only flush individual pages or full VMs. For a range flush
29303 + * we always do the full VM. Might be worth trying if for a small
29304 + * range a few INVLPGs in a row are a win.
29307 +#ifndef CONFIG_SMP
29309 +#define flush_tlb() __flush_tlb()
29310 +#define flush_tlb_all() __flush_tlb_all()
29311 +#define local_flush_tlb() __flush_tlb()
29313 +static inline void flush_tlb_mm(struct mm_struct *mm)
29315 + if (mm == current->active_mm)
29319 +static inline void flush_tlb_page(struct vm_area_struct *vma,
29320 + unsigned long addr)
29322 + if (vma->vm_mm == current->active_mm)
29323 + __flush_tlb_one(addr);
29326 +static inline void flush_tlb_range(struct vm_area_struct *vma,
29327 + unsigned long start, unsigned long end)
29329 + if (vma->vm_mm == current->active_mm)
29335 +#include <asm/smp.h>
29337 +#define local_flush_tlb() __flush_tlb()
29339 +#define flush_tlb_all xen_tlb_flush_all
29340 +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
29341 +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29342 +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29344 +#define flush_tlb() flush_tlb_current_task()
29346 +static inline void flush_tlb_range(struct vm_area_struct *vma,
29347 + unsigned long start, unsigned long end)
29349 + flush_tlb_mm(vma->vm_mm);
29352 +#define TLBSTATE_OK 1
29353 +#define TLBSTATE_LAZY 2
29355 +#ifdef CONFIG_X86_32
29358 + struct mm_struct *active_mm;
29360 + char __cacheline_padding[L1_CACHE_BYTES-8];
29362 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
29367 +static inline void flush_tlb_kernel_range(unsigned long start,
29368 + unsigned long end)
29373 +#endif /* _ASM_X86_TLBFLUSH_H */
29374 --- a/include/asm-x86/mach-xen/irq_vectors.h
29375 +++ b/include/asm-x86/mach-xen/irq_vectors.h
29378 #define RESCHEDULE_VECTOR 0
29379 #define CALL_FUNCTION_VECTOR 1
29381 +#define SPIN_UNLOCK_VECTOR 2
29385 * The maximum number of vectors supported by i386 processors
29386 --- a/include/asm-x86/mmu.h
29387 +++ b/include/asm-x86/mmu.h
29388 @@ -23,7 +23,7 @@ typedef struct {
29393 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
29394 void leave_mm(int cpu);
29396 static inline void leave_mm(int cpu)
29397 --- a/include/asm-x86/ptrace.h
29398 +++ b/include/asm-x86/ptrace.h
29399 @@ -249,7 +249,9 @@ extern void user_enable_single_step(stru
29400 extern void user_disable_single_step(struct task_struct *);
29402 extern void user_enable_block_step(struct task_struct *);
29403 -#ifdef CONFIG_X86_DEBUGCTLMSR
29404 +#if defined(CONFIG_XEN)
29405 +#define arch_has_block_step() (0)
29406 +#elif defined(CONFIG_X86_DEBUGCTLMSR)
29407 #define arch_has_block_step() (1)
29409 #define arch_has_block_step() (boot_cpu_data.x86 >= 6)
29410 --- a/include/asm-x86/thread_info.h
29411 +++ b/include/asm-x86/thread_info.h
29412 @@ -94,6 +94,9 @@ struct thread_info {
29413 #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
29414 #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
29415 #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
29416 +#ifdef CONFIG_X86_XEN
29417 +#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */
29420 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
29421 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
29422 @@ -118,6 +121,7 @@ struct thread_info {
29423 #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
29424 #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
29425 #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
29426 +#define _TIF_CSTAR (1 << TIF_CSTAR)
29428 /* work to do in syscall_trace_enter() */
29429 #define _TIF_WORK_SYSCALL_ENTRY \
29430 @@ -147,12 +151,12 @@ struct thread_info {
29431 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
29432 _TIF_NOTSC|_TIF_PERFMON_CTXSW)
29434 -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29435 -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29437 -#define _TIF_WORK_CTXSW_NEXT (_TIF_NOTSC | _TIF_DEBUG)
29438 -#define _TIF_WORK_CTXSW_PREV (_TIF_NOTSC)
29439 +#define _TIF_WORK_CTXSW (_TIF_NOTSC \
29440 + /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/)
29442 +#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29443 +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29445 #define PREEMPT_ACTIVE 0x10000000
29447 --- a/include/asm-x86/time.h
29448 +++ b/include/asm-x86/time.h
29449 @@ -58,4 +58,10 @@ static inline int native_set_wallclock(u
29451 extern unsigned long __init calibrate_cpu(void);
29454 +extern int xen_independent_wallclock(void);
29455 +extern unsigned long xen_read_persistent_clock(void);
29456 +extern int xen_update_persistent_clock(void);
29460 --- a/include/linux/page-flags.h
29461 +++ b/include/linux/page-flags.h
29462 @@ -101,8 +101,8 @@ enum pageflags {
29463 PG_foreign, /* Page is owned by foreign allocator. */
29464 PG_pinned, /* Cannot alias with PG_owner_priv_1 since
29465 * bad_page() checks include this bit.
29466 - * Also cannot use PG_arch_1 since that now
29467 - * has a different purpose on x86. */
29468 + * Should not use PG_arch_1 as that may have
29469 + * a different purpose elsewhere. */
29473 --- a/include/linux/pci.h
29474 +++ b/include/linux/pci.h
29475 @@ -644,6 +644,9 @@ int pcie_set_readrq(struct pci_dev *dev,
29476 void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
29477 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
29478 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
29480 +void pci_restore_bars(struct pci_dev *);
29483 /* ROM control related routines */
29484 void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
29485 --- a/include/xen/evtchn.h
29486 +++ b/include/xen/evtchn.h
29487 @@ -130,12 +130,37 @@ static inline void clear_evtchn(int port
29488 synch_clear_bit(port, s->evtchn_pending);
29491 +static inline void set_evtchn(int port)
29493 + shared_info_t *s = HYPERVISOR_shared_info;
29494 + synch_set_bit(port, s->evtchn_pending);
29497 +static inline int test_evtchn(int port)
29499 + shared_info_t *s = HYPERVISOR_shared_info;
29500 + return synch_test_bit(port, s->evtchn_pending);
29503 static inline void notify_remote_via_evtchn(int port)
29505 struct evtchn_send send = { .port = port };
29506 VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
29509 +/* Clear an irq's pending state, in preparation for polling on it. */
29510 +void xen_clear_irq_pending(int irq);
29512 +/* Set an irq's pending state, to avoid blocking on it. */
29513 +void xen_set_irq_pending(int irq);
29515 +/* Test an irq's pending state. */
29516 +int xen_test_irq_pending(int irq);
29518 +/* Poll waiting for an irq to become pending. In the usual case, the
29519 + irq will be disabled so it won't deliver an interrupt. */
29520 +void xen_poll_irq(int irq);
29523 * Use these to access the event channel underlying the IRQ handle returned
29524 * by bind_*_to_irqhandler().
29525 --- a/kernel/sysctl_check.c
29526 +++ b/kernel/sysctl_check.c
29527 @@ -899,7 +899,7 @@ static const struct trans_ctl_table tran
29531 -static struct trans_ctl_table trans_xen_table[] = {
29532 +static const struct trans_ctl_table trans_xen_table[] = {
29533 { CTL_XEN_INDEPENDENT_WALLCLOCK, "independent_wallclock" },
29534 { CTL_XEN_PERMITTED_CLOCK_JITTER, "permitted_clock_jitter" },
29536 --- a/lib/swiotlb-xen.c
29537 +++ b/lib/swiotlb-xen.c
29539 #include <asm/gnttab_dma.h>
29542 -EXPORT_SYMBOL(swiotlb);
29544 #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
29546 @@ -289,6 +288,15 @@ __sync_single(struct phys_addr buffer, c
29550 +static inline unsigned int is_span_boundary(unsigned int index,
29551 + unsigned int nslots,
29552 + unsigned long offset_slots,
29553 + unsigned long max_slots)
29555 + unsigned long offset = (offset_slots + index) & (max_slots - 1);
29556 + return offset + nslots > max_slots;
29560 * Allocates bounce buffer and returns its kernel virtual address.
29562 @@ -300,6 +308,15 @@ map_single(struct device *hwdev, struct
29563 unsigned int nslots, stride, index, wrap;
29564 struct phys_addr slot_buf;
29566 + unsigned long mask;
29567 + unsigned long offset_slots;
29568 + unsigned long max_slots;
29570 + mask = dma_get_seg_boundary(hwdev);
29571 + offset_slots = -IO_TLB_SEGSIZE;
29572 + max_slots = mask + 1
29573 + ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
29574 + : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
29577 * For mappings greater than a page, we limit the stride (and
29578 @@ -319,12 +336,21 @@ map_single(struct device *hwdev, struct
29580 spin_lock_irqsave(&io_tlb_lock, flags);
29582 - wrap = index = ALIGN(io_tlb_index, stride);
29584 + index = ALIGN(io_tlb_index, stride);
29585 if (index >= iotlb_nslabs)
29586 - wrap = index = 0;
29591 + while (is_span_boundary(index, nslots, offset_slots,
29594 + if (index >= iotlb_nslabs)
29596 + if (index == wrap)
29601 * If we find a slot that indicates we have 'nslots'
29602 * number of contiguous buffers, we allocate the
29603 @@ -359,6 +385,7 @@ map_single(struct device *hwdev, struct
29605 } while (index != wrap);
29608 spin_unlock_irqrestore(&io_tlb_lock, flags);