From: kernel.org
Subject: 2.6.25
Patch-mainline: 2.6.25

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches.py

---
 arch/x86/Kconfig | 18
 arch/x86/Kconfig.debug | 1
 arch/x86/ia32/ia32entry-xen.S | 12
 arch/x86/kernel/Makefile | 3
 arch/x86/kernel/acpi/boot.c | 3
 arch/x86/kernel/acpi/sleep-xen.c | 95 +
 arch/x86/kernel/acpi/sleep_32-xen.c | 117 --
 arch/x86/kernel/acpi/sleep_64-xen.c | 125 --
 arch/x86/kernel/apic_32-xen.c | 2
 arch/x86/kernel/apic_64-xen.c | 73 -
 arch/x86/kernel/asm-offsets_32.c | 2
 arch/x86/kernel/cpu/common-xen.c | 214 +--
 arch/x86/kernel/cpu/mtrr/main-xen.c | 19
 arch/x86/kernel/e820_32-xen.c | 275 -----
 arch/x86/kernel/e820_64-xen.c | 485 +++++---
 arch/x86/kernel/early_printk-xen.c | 2
 arch/x86/kernel/entry_32-xen.S | 195 +++
 arch/x86/kernel/entry_64-xen.S | 91 -
 arch/x86/kernel/fixup.c | 2
 arch/x86/kernel/genapic_64-xen.c | 15
 arch/x86/kernel/head64-xen.c | 63 +
 arch/x86/kernel/head_32-xen.S | 3
 arch/x86/kernel/init_task-xen.c | 2
 arch/x86/kernel/io_apic_32-xen.c | 15
 arch/x86/kernel/io_apic_64-xen.c | 110 +-
 arch/x86/kernel/ioport-xen.c | 112 ++
 arch/x86/kernel/ioport_32-xen.c | 121 --
 arch/x86/kernel/ioport_64-xen.c | 99 -
 arch/x86/kernel/irq_32-xen.c | 22
 arch/x86/kernel/irq_64-xen.c | 43
 arch/x86/kernel/ldt-xen.c | 272 +++++
 arch/x86/kernel/ldt_32-xen.c | 265 ----
 arch/x86/kernel/ldt_64-xen.c | 271 ----
 arch/x86/kernel/machine_kexec_64.c | 2
 arch/x86/kernel/microcode-xen.c | 2
 arch/x86/kernel/mpparse_32-xen.c | 49
 arch/x86/kernel/mpparse_64-xen.c | 30
 arch/x86/kernel/pci-dma-xen.c | 20
 arch/x86/kernel/process_32-xen.c | 438 ++------
 arch/x86/kernel/process_64-xen.c | 303 ++---
 arch/x86/kernel/quirks-xen.c | 82 -
 arch/x86/kernel/rtc.c | 8
 arch/x86/kernel/setup64-xen.c | 70 +
 arch/x86/kernel/setup_32-xen.c | 311 ++++-
 arch/x86/kernel/setup_64-xen.c | 686 ++++++------
 arch/x86/kernel/smp_32-xen.c | 5
 arch/x86/kernel/smp_64-xen.c | 91 -
 arch/x86/kernel/time_32-xen.c | 136 --
 arch/x86/kernel/traps_32-xen.c | 320 +++--
 arch/x86/kernel/traps_64-xen.c | 371 +++---
 arch/x86/kernel/vsyscall_64-xen.c | 60 -
 arch/x86/kernel/xen_entry_64.S | 36
 arch/x86/mach-xen/setup.c | 11
 arch/x86/mm/fault-xen.c | 1026 ++++++++++++++++++
 arch/x86/mm/fault_32-xen.c | 757 -------------
 arch/x86/mm/fault_64-xen.c | 686 ------------
 arch/x86/mm/highmem_32-xen.c | 45
 arch/x86/mm/hypervisor.c | 10
 arch/x86/mm/init_32-xen.c | 464 +++-----
 arch/x86/mm/init_64-xen.c | 517 ++++-----
 arch/x86/mm/ioremap-xen.c | 685 ++++++++++++
 arch/x86/mm/ioremap_32-xen.c | 445 --------
 arch/x86/mm/pageattr-xen.c | 1412 ++++++++++++++++++++++++++
 arch/x86/mm/pageattr_64-xen.c | 542 ---------
 arch/x86/mm/pgtable_32-xen.c | 672 ++----------
 arch/x86/pci/irq-xen.c | 24
 arch/x86/vdso/Makefile | 1
 arch/x86/vdso/vdso32-setup-xen.c | 506 +++++++++
 arch/x86/vdso/vdso32-setup.c | 34
 arch/x86/vdso/vdso32.S | 12
 arch/x86/vdso/vdso32/syscall.S | 2
 drivers/pci/msi-xen.c | 98 -
 drivers/pci/pci.c | 5
 drivers/xen/balloon/sysfs.c | 2
 drivers/xen/blkback/blkback.c | 5
 drivers/xen/blkfront/blkfront.c | 9
 drivers/xen/blktap/blktap.c | 8
 drivers/xen/core/Makefile | 1
 drivers/xen/core/evtchn.c | 46
 drivers/xen/core/hypervisor_sysfs.c | 2
 drivers/xen/core/smpboot.c | 29
 drivers/xen/core/spinlock.c | 161 ++
 drivers/xen/core/xen_sysfs.c | 30
 drivers/xen/gntdev/gntdev.c | 4
 drivers/xen/scsifront/scsifront.c | 49
 drivers/xen/xenoprof/xenoprofile.c | 2
 include/asm-x86/mach-xen/asm/agp.h | 9
 include/asm-x86/mach-xen/asm/desc.h | 403 +++++++
 include/asm-x86/mach-xen/asm/desc_32.h | 262 ----
 include/asm-x86/mach-xen/asm/desc_64.h | 228 ----
 include/asm-x86/mach-xen/asm/dma-mapping_32.h | 18
 include/asm-x86/mach-xen/asm/fixmap_32.h | 24
 include/asm-x86/mach-xen/asm/fixmap_64.h | 25
 include/asm-x86/mach-xen/asm/highmem.h | 10
 include/asm-x86/mach-xen/asm/hypervisor.h | 19
 include/asm-x86/mach-xen/asm/io_32.h | 69 -
 include/asm-x86/mach-xen/asm/io_64.h | 62 -
 include/asm-x86/mach-xen/asm/irqflags.h | 248 ++++
 include/asm-x86/mach-xen/asm/irqflags_32.h | 212 ---
 include/asm-x86/mach-xen/asm/irqflags_64.h | 178 ---
 include/asm-x86/mach-xen/asm/maddr_32.h | 21
 include/asm-x86/mach-xen/asm/maddr_64.h | 19
 include/asm-x86/mach-xen/asm/mmu_context_32.h | 2
 include/asm-x86/mach-xen/asm/mmu_context_64.h | 12
 include/asm-x86/mach-xen/asm/page.h | 238 ++++
 include/asm-x86/mach-xen/asm/page_64.h | 196 ---
 include/asm-x86/mach-xen/asm/pci.h | 17
 include/asm-x86/mach-xen/asm/pci_64.h | 1
 include/asm-x86/mach-xen/asm/pgalloc_32.h | 116 +-
 include/asm-x86/mach-xen/asm/pgalloc_64.h | 87 -
 include/asm-x86/mach-xen/asm/pgtable-3level.h | 107 -
 include/asm-x86/mach-xen/asm/pgtable.h | 449 ++++++++
 include/asm-x86/mach-xen/asm/pgtable_32.h | 361 ------
 include/asm-x86/mach-xen/asm/pgtable_64.h | 400 +------
 include/asm-x86/mach-xen/asm/processor.h | 792 ++++++++++++++
 include/asm-x86/mach-xen/asm/processor_32.h | 751 -------------
 include/asm-x86/mach-xen/asm/processor_64.h | 461 --------
 include/asm-x86/mach-xen/asm/segment.h | 203 +++
 include/asm-x86/mach-xen/asm/segment_32.h | 150 --
 include/asm-x86/mach-xen/asm/smp_32.h | 125 +-
 include/asm-x86/mach-xen/asm/smp_64.h | 138 --
 include/asm-x86/mach-xen/asm/spinlock.h | 333 ++++++
 include/asm-x86/mach-xen/asm/system.h | 392 +++++++
 include/asm-x86/mach-xen/asm/system_32.h | 312 -----
 include/asm-x86/mach-xen/asm/system_64.h | 159 --
 include/asm-x86/mach-xen/asm/tlbflush.h | 105 +
 include/asm-x86/mach-xen/asm/tlbflush_32.h | 99 -
 include/asm-x86/mach-xen/asm/tlbflush_64.h | 97 -
 include/asm-x86/mach-xen/irq_vectors.h | 3
 include/asm-x86/mmu.h | 2
 include/asm-x86/ptrace.h | 4
 include/asm-x86/thread_info.h | 12
 include/asm-x86/time.h | 6
 include/linux/page-flags.h | 4
 include/linux/pci.h | 3
 include/xen/evtchn.h | 25
 kernel/sysctl_check.c | 2
 lib/swiotlb-xen.c | 35
 138 files changed, 11322 insertions(+), 11153 deletions(-)

150--- a/arch/x86/ia32/ia32entry-xen.S
151+++ b/arch/x86/ia32/ia32entry-xen.S
152@@ -12,7 +12,6 @@
153 #include <asm/ia32_unistd.h>
154 #include <asm/thread_info.h>
155 #include <asm/segment.h>
156-#include <asm/vsyscall32.h>
157 #include <asm/irqflags.h>
158 #include <linux/linkage.h>
159
160@@ -99,10 +98,11 @@ ENTRY(ia32_sysenter_target)
161 CFI_RESTORE rcx
162 movl %ebp,%ebp /* zero extension */
163 movl %eax,%eax
164+ movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
165 movl $__USER32_DS,40(%rsp)
166 movq %rbp,32(%rsp)
167 movl $__USER32_CS,16(%rsp)
168- movl $VSYSCALL32_SYSEXIT,8(%rsp)
169+ movq %r10,8(%rsp)
170 movq %rax,(%rsp)
171 cld
172 SAVE_ARGS 0,0,1
173@@ -582,8 +582,8 @@ ia32_sys_call_table:
174 .quad compat_sys_futex /* 240 */
175 .quad compat_sys_sched_setaffinity
176 .quad compat_sys_sched_getaffinity
177- .quad sys32_set_thread_area
178- .quad sys32_get_thread_area
179+ .quad sys_set_thread_area
180+ .quad sys_get_thread_area
181 .quad compat_sys_io_setup /* 245 */
182 .quad sys_io_destroy
183 .quad compat_sys_io_getevents
184@@ -661,7 +661,9 @@ ia32_sys_call_table:
185 .quad sys_epoll_pwait
186 .quad compat_sys_utimensat /* 320 */
187 .quad compat_sys_signalfd
188- .quad compat_sys_timerfd
189+ .quad sys_timerfd_create
190 .quad sys_eventfd
191 .quad sys32_fallocate
192+ .quad compat_sys_timerfd_settime /* 325 */
193+ .quad compat_sys_timerfd_gettime
194 ia32_syscall_end:
195--- a/arch/x86/Kconfig
196+++ b/arch/x86/Kconfig
197@@ -27,7 +27,7 @@ config X86
198 select HAVE_KRETPROBES
199 select HAVE_DYNAMIC_FTRACE
200 select HAVE_FTRACE
201- select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
202+ select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
203 select HAVE_ARCH_KGDB if !X86_VOYAGER
204 select HAVE_ARCH_TRACEHOOK
205 select HAVE_GENERIC_DMA_COHERENT if X86_32
206@@ -208,14 +208,12 @@ config X86_TRAMPOLINE
207 default y
208
209 config X86_NO_TSS
210- bool
211+ def_bool y
212 depends on XEN
213- default y
214
215 config X86_NO_IDT
216- bool
217+ def_bool y
218 depends on XEN
219- default y
220
221 config KTIME_SCALAR
222 def_bool X86_32
223@@ -724,9 +722,8 @@ config X86_VISWS_APIC
224 depends on X86_32 && X86_VISWS
225
226 config X86_XEN_GENAPIC
227- bool
228+ def_bool y
229 depends on X86_64_XEN
230- default y
231
232 config X86_MCE
233 bool "Machine Check Exception"
234@@ -1113,7 +1110,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
235
236 config ARCH_SPARSEMEM_DEFAULT
237 def_bool y
238- depends on X86_64
239+ depends on X86_64 && !X86_64_XEN
240
241 config ARCH_SPARSEMEM_ENABLE
242 def_bool y
243@@ -1743,10 +1740,10 @@ config PCI_MMCONFIG
244 depends on X86_64 && PCI && ACPI
245
246 config XEN_PCIDEV_FRONTEND
247- bool "Xen PCI Frontend" if X86_64
248+ def_bool y
249+ prompt "Xen PCI Frontend" if X86_64
250 depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
251 select HOTPLUG
252- default y
253 help
254 The PCI device frontend driver allows the kernel to import arbitrary
255 PCI devices from a PCI backend to support PCI driver domains.
256@@ -1754,7 +1751,6 @@ config XEN_PCIDEV_FRONTEND
257 config XEN_PCIDEV_FE_DEBUG
258 bool "Xen PCI Frontend Debugging"
259 depends on XEN_PCIDEV_FRONTEND
260- default n
261 help
262 Enables some debug statements within the PCI Frontend.
263
264--- a/arch/x86/Kconfig.debug
265+++ b/arch/x86/Kconfig.debug
266@@ -266,6 +266,7 @@ config DEBUG_BOOT_PARAMS
267 bool "Debug boot parameters"
268 depends on DEBUG_KERNEL
269 depends on DEBUG_FS
270+ depends on !XEN
271 help
272 This option will cause struct boot_params to be exported via debugfs.
273
274--- a/arch/x86/kernel/acpi/boot.c
275+++ b/arch/x86/kernel/acpi/boot.c
276@@ -133,6 +133,9 @@ char *__init __acpi_map_table(unsigned l
277 #ifndef CONFIG_XEN
278 if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
279 return __va(phys);
280+#else
281+ if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
282+ return isa_bus_to_virt(phys);
283 #endif
284
285 offset = phys & (PAGE_SIZE - 1);
286--- a/arch/x86/kernel/acpi/sleep_32-xen.c
287+++ /dev/null
288@@ -1,117 +0,0 @@
289-/*
290- * sleep.c - x86-specific ACPI sleep support.
291- *
292- * Copyright (C) 2001-2003 Patrick Mochel
293- * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
294- */
295-
296-#include <linux/acpi.h>
297-#include <linux/bootmem.h>
298-#include <linux/dmi.h>
299-#include <linux/cpumask.h>
300-
301-#include <asm/smp.h>
302-
303-#ifndef CONFIG_ACPI_PV_SLEEP
304-/* address in low memory of the wakeup routine. */
305-unsigned long acpi_wakeup_address = 0;
306-unsigned long acpi_realmode_flags;
307-extern char wakeup_start, wakeup_end;
308-
309-extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
310-#endif
311-
312-/**
313- * acpi_save_state_mem - save kernel state
314- *
315- * Create an identity mapped page table and copy the wakeup routine to
316- * low memory.
317- */
318-int acpi_save_state_mem(void)
319-{
320-#ifndef CONFIG_ACPI_PV_SLEEP
321- if (!acpi_wakeup_address)
322- return 1;
323- memcpy((void *)acpi_wakeup_address, &wakeup_start,
324- &wakeup_end - &wakeup_start);
325- acpi_copy_wakeup_routine(acpi_wakeup_address);
326-#endif
327- return 0;
328-}
329-
330-/*
331- * acpi_restore_state - undo effects of acpi_save_state_mem
332- */
333-void acpi_restore_state_mem(void)
334-{
335-}
336-
337-/**
338- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
339- *
340- * We allocate a page from the first 1MB of memory for the wakeup
341- * routine for when we come back from a sleep state. The
342- * runtime allocator allows specification of <16MB pages, but not
343- * <1MB pages.
344- */
345-void __init acpi_reserve_bootmem(void)
346-{
347-#ifndef CONFIG_ACPI_PV_SLEEP
348- if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
349- printk(KERN_ERR
350- "ACPI: Wakeup code way too big, S3 disabled.\n");
351- return;
352- }
353-
354- acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
355- if (!acpi_wakeup_address)
356- printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
357-#endif
358-}
359-
360-#ifndef CONFIG_ACPI_PV_SLEEP
361-static int __init acpi_sleep_setup(char *str)
362-{
363- while ((str != NULL) && (*str != '\0')) {
364- if (strncmp(str, "s3_bios", 7) == 0)
365- acpi_realmode_flags |= 1;
366- if (strncmp(str, "s3_mode", 7) == 0)
367- acpi_realmode_flags |= 2;
368- if (strncmp(str, "s3_beep", 7) == 0)
369- acpi_realmode_flags |= 4;
370- str = strchr(str, ',');
371- if (str != NULL)
372- str += strspn(str, ", \t");
373- }
374- return 1;
375-}
376-
377-__setup("acpi_sleep=", acpi_sleep_setup);
378-
379-/* Ouch, we want to delete this. We already have better version in userspace, in
380- s2ram from suspend.sf.net project */
381-static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
382-{
383- acpi_realmode_flags |= 2;
384- return 0;
385-}
386-
387-static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
388- { /* Reset video mode after returning from ACPI S3 sleep */
389- .callback = reset_videomode_after_s3,
390- .ident = "Toshiba Satellite 4030cdt",
391- .matches = {
392- DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
393- },
394- },
395- {}
396-};
397-
398-static int __init acpisleep_dmi_init(void)
399-{
400- dmi_check_system(acpisleep_dmi_table);
401- return 0;
402-}
403-
404-core_initcall(acpisleep_dmi_init);
405-#endif /* CONFIG_ACPI_PV_SLEEP */
406--- a/arch/x86/kernel/acpi/sleep_64-xen.c
407+++ /dev/null
408@@ -1,125 +0,0 @@
409-/*
410- * acpi.c - Architecture-Specific Low-Level ACPI Support
411- *
412- * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
413- * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
414- * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
415- * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
416- * Copyright (C) 2003 Pavel Machek, SuSE Labs
417- *
418- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
419- *
420- * This program is free software; you can redistribute it and/or modify
421- * it under the terms of the GNU General Public License as published by
422- * the Free Software Foundation; either version 2 of the License, or
423- * (at your option) any later version.
424- *
425- * This program is distributed in the hope that it will be useful,
426- * but WITHOUT ANY WARRANTY; without even the implied warranty of
427- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
428- * GNU General Public License for more details.
429- *
430- * You should have received a copy of the GNU General Public License
431- * along with this program; if not, write to the Free Software
432- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
433- *
434- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
435- */
436-
437-#include <linux/kernel.h>
438-#include <linux/init.h>
439-#include <linux/types.h>
440-#include <linux/stddef.h>
441-#include <linux/slab.h>
442-#include <linux/pci.h>
443-#include <linux/bootmem.h>
444-#include <linux/acpi.h>
445-#include <linux/cpumask.h>
446-
447-#include <asm/mpspec.h>
448-#include <asm/io.h>
449-#include <asm/apic.h>
450-#include <asm/apicdef.h>
451-#include <asm/page.h>
452-#include <asm/pgtable.h>
453-#include <asm/pgalloc.h>
454-#include <asm/io_apic.h>
455-#include <asm/proto.h>
456-#include <asm/tlbflush.h>
457-
458-/* --------------------------------------------------------------------------
459- Low-Level Sleep Support
460- -------------------------------------------------------------------------- */
461-
462-#ifndef CONFIG_ACPI_PV_SLEEP
463-/* address in low memory of the wakeup routine. */
464-unsigned long acpi_wakeup_address = 0;
465-unsigned long acpi_realmode_flags;
466-extern char wakeup_start, wakeup_end;
467-
468-extern unsigned long acpi_copy_wakeup_routine(unsigned long);
469-#endif
470-
471-/**
472- * acpi_save_state_mem - save kernel state
473- *
474- * Create an identity mapped page table and copy the wakeup routine to
475- * low memory.
476- */
477-int acpi_save_state_mem(void)
478-{
479-#ifndef CONFIG_ACPI_PV_SLEEP
480- memcpy((void *)acpi_wakeup_address, &wakeup_start,
481- &wakeup_end - &wakeup_start);
482- acpi_copy_wakeup_routine(acpi_wakeup_address);
483-#endif
484- return 0;
485-}
486-
487-/*
488- * acpi_restore_state
489- */
490-void acpi_restore_state_mem(void)
491-{
492-}
493-
494-/**
495- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
496- *
497- * We allocate a page in low memory for the wakeup
498- * routine for when we come back from a sleep state. The
499- * runtime allocator allows specification of <16M pages, but not
500- * <1M pages.
501- */
502-void __init acpi_reserve_bootmem(void)
503-{
504-#ifndef CONFIG_ACPI_PV_SLEEP
505- acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
506- if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
507- printk(KERN_CRIT
508- "ACPI: Wakeup code way too big, will crash on attempt"
509- " to suspend\n");
510-#endif
511-}
512-
513-#ifndef CONFIG_ACPI_PV_SLEEP
514-static int __init acpi_sleep_setup(char *str)
515-{
516- while ((str != NULL) && (*str != '\0')) {
517- if (strncmp(str, "s3_bios", 7) == 0)
518- acpi_realmode_flags |= 1;
519- if (strncmp(str, "s3_mode", 7) == 0)
520- acpi_realmode_flags |= 2;
521- if (strncmp(str, "s3_beep", 7) == 0)
522- acpi_realmode_flags |= 4;
523- str = strchr(str, ',');
524- if (str != NULL)
525- str += strspn(str, ", \t");
526- }
527-
528- return 1;
529-}
530-
531-__setup("acpi_sleep=", acpi_sleep_setup);
532-#endif /* CONFIG_ACPI_PV_SLEEP */
533-
534--- /dev/null
535+++ b/arch/x86/kernel/acpi/sleep-xen.c
536@@ -0,0 +1,95 @@
537+/*
538+ * sleep.c - x86-specific ACPI sleep support.
539+ *
540+ * Copyright (C) 2001-2003 Patrick Mochel
541+ * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
542+ */
543+
544+#include <linux/acpi.h>
545+#include <linux/bootmem.h>
546+#include <linux/dmi.h>
547+#include <linux/cpumask.h>
548+
549+#include <asm/smp.h>
550+
551+#ifndef CONFIG_ACPI_PV_SLEEP
552+/* address in low memory of the wakeup routine. */
553+unsigned long acpi_wakeup_address = 0;
554+unsigned long acpi_realmode_flags;
555+extern char wakeup_start, wakeup_end;
556+
557+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
558+#endif
559+
560+/**
561+ * acpi_save_state_mem - save kernel state
562+ *
563+ * Create an identity mapped page table and copy the wakeup routine to
564+ * low memory.
565+ */
566+int acpi_save_state_mem(void)
567+{
568+#ifndef CONFIG_ACPI_PV_SLEEP
569+ if (!acpi_wakeup_address) {
570+ printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
571+ return -ENOMEM;
572+ }
573+ memcpy((void *)acpi_wakeup_address, &wakeup_start,
574+ &wakeup_end - &wakeup_start);
575+ acpi_copy_wakeup_routine(acpi_wakeup_address);
576+#endif
577+
578+ return 0;
579+}
580+
581+/*
582+ * acpi_restore_state - undo effects of acpi_save_state_mem
583+ */
584+void acpi_restore_state_mem(void)
585+{
586+}
587+
588+
589+/**
590+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
591+ *
592+ * We allocate a page from the first 1MB of memory for the wakeup
593+ * routine for when we come back from a sleep state. The
594+ * runtime allocator allows specification of <16MB pages, but not
595+ * <1MB pages.
596+ */
597+void __init acpi_reserve_bootmem(void)
598+{
599+#ifndef CONFIG_ACPI_PV_SLEEP
600+ if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
601+ printk(KERN_ERR
602+ "ACPI: Wakeup code way too big, S3 disabled.\n");
603+ return;
604+ }
605+
606+ acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
607+ if (!acpi_wakeup_address)
608+ printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
609+#endif
610+}
611+
612+
613+#ifndef CONFIG_ACPI_PV_SLEEP
614+static int __init acpi_sleep_setup(char *str)
615+{
616+ while ((str != NULL) && (*str != '\0')) {
617+ if (strncmp(str, "s3_bios", 7) == 0)
618+ acpi_realmode_flags |= 1;
619+ if (strncmp(str, "s3_mode", 7) == 0)
620+ acpi_realmode_flags |= 2;
621+ if (strncmp(str, "s3_beep", 7) == 0)
622+ acpi_realmode_flags |= 4;
623+ str = strchr(str, ',');
624+ if (str != NULL)
625+ str += strspn(str, ", \t");
626+ }
627+ return 1;
628+}
629+
630+__setup("acpi_sleep=", acpi_sleep_setup);
631+#endif /* CONFIG_ACPI_PV_SLEEP */
632--- a/arch/x86/kernel/apic_32-xen.c
633+++ b/arch/x86/kernel/apic_32-xen.c
634@@ -86,7 +86,7 @@ int setup_profiling_timer(unsigned int m
635 * This initializes the IO-APIC and APIC hardware if this is
636 * a UP kernel.
637 */
638-int __init APIC_init_uniprocessor (void)
639+int __init APIC_init_uniprocessor(void)
640 {
641 #ifdef CONFIG_X86_IO_APIC
642 if (smp_found_config)
643--- a/arch/x86/kernel/apic_64-xen.c
644+++ b/arch/x86/kernel/apic_64-xen.c
645@@ -34,34 +34,17 @@
646 #include <asm/hpet.h>
647 #include <asm/idle.h>
648
649-int apic_verbosity;
650+int disable_apic;
651
652 /*
653- * 'what should we do if we get a hw irq event on an illegal vector'.
654- * each architecture has to answer this themselves.
655+ * Debug level, exported for io_apic.c
656 */
657-void ack_bad_irq(unsigned int irq)
658-{
659- printk("unexpected IRQ trap at irq %02x\n", irq);
660- /*
661- * Currently unexpected vectors happen only on SMP and APIC.
662- * We _must_ ack these because every local APIC has only N
663- * irq slots per priority level, and a 'hanging, unacked' IRQ
664- * holds up an irq slot - in excessive cases (when multiple
665- * unexpected vectors occur) that might lock up the APIC
666- * completely.
667- * But don't ack when the APIC is disabled. -AK
668- */
669- if (!disable_apic)
670- ack_APIC_irq();
671-}
672-
673-int setup_profiling_timer(unsigned int multiplier)
674-{
675- return -EINVAL;
676-}
677+int apic_verbosity;
678
679-void smp_local_timer_interrupt(void)
680+/*
681+ * The guts of the apic timer interrupt
682+ */
683+static void local_apic_timer_interrupt(void)
684 {
685 #ifndef CONFIG_XEN
686 int cpu = smp_processor_id();
687@@ -121,11 +104,34 @@ void smp_apic_timer_interrupt(struct pt_
688 */
689 exit_idle();
690 irq_enter();
691- smp_local_timer_interrupt();
692+ local_apic_timer_interrupt();
693 irq_exit();
694 set_irq_regs(old_regs);
695 }
696
697+int setup_profiling_timer(unsigned int multiplier)
698+{
699+ return -EINVAL;
700+}
701+
702+/*
703+ * This initializes the IO-APIC and APIC hardware if this is
704+ * a UP kernel.
705+ */
706+int __init APIC_init_uniprocessor(void)
707+{
708+#ifdef CONFIG_X86_IO_APIC
709+ if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
710+ setup_IO_APIC();
711+#endif
712+
713+ return 1;
714+}
715+
716+/*
717+ * Local APIC interrupts
718+ */
719+
720 /*
721 * This interrupt should _never_ happen with our APIC/SMP architecture
722 */
723@@ -150,7 +156,6 @@ asmlinkage void smp_spurious_interrupt(v
724 /*
725 * This interrupt should never happen with our APIC/SMP architecture
726 */
727-
728 asmlinkage void smp_error_interrupt(void)
729 {
730 unsigned int v, v1;
731@@ -178,19 +183,3 @@ asmlinkage void smp_error_interrupt(void
732 smp_processor_id(), v , v1);
733 irq_exit();
734 }
735-
736-int disable_apic;
737-
738-/*
739- * This initializes the IO-APIC and APIC hardware if this is
740- * a UP kernel.
741- */
742-int __init APIC_init_uniprocessor (void)
743-{
744-#ifdef CONFIG_X86_IO_APIC
745- if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
746- setup_IO_APIC();
747-#endif
748-
749- return 1;
750-}
751--- a/arch/x86/kernel/asm-offsets_32.c
752+++ b/arch/x86/kernel/asm-offsets_32.c
753@@ -23,8 +23,10 @@
754 #include <xen/interface/xen.h>
755 #endif
756
757+#ifdef CONFIG_LGUEST_GUEST
758 #include <linux/lguest.h>
759 #include "../../../drivers/lguest/lg.h"
760+#endif
761
762 /* workaround for a warning with -Wmissing-prototypes */
763 void foo(void);
764--- a/arch/x86/kernel/cpu/common-xen.c
765+++ b/arch/x86/kernel/cpu/common-xen.c
766@@ -27,45 +27,50 @@
767 #include "cpu.h"
768
769 DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
770- [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
771- [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
772- [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
773- [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
774+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
775+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
776+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
777+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
778 #ifndef CONFIG_XEN
779 /*
780 * Segments used for calling PnP BIOS have byte granularity.
781 * They code segments and data segments have fixed 64k limits,
782 * the transfer segment sizes are set at run time.
783 */
784- [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
785- [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
786- [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
787- [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
788- [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
789+ /* 32-bit code */
790+ [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
791+ /* 16-bit code */
792+ [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
793+ /* 16-bit data */
794+ [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
795+ /* 16-bit data */
796+ [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
797+ /* 16-bit data */
798+ [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
799 /*
800 * The APM segments have byte granularity and their bases
801 * are set at run time. All have 64k limits.
802 */
803- [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
804+ /* 32-bit code */
805+ [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
806 /* 16-bit code */
807- [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
808- [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
809+ [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
810+ /* data */
811+ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
812
813- [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
814+ [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
815 #endif
816- [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
817+ [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
818 } };
819 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
820
821+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
822+
823 static int cachesize_override __cpuinitdata = -1;
824-static int disable_x86_fxsr __cpuinitdata;
825 static int disable_x86_serial_nr __cpuinitdata = 1;
826-static int disable_x86_sep __cpuinitdata;
827
828 struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
829
830-extern int disable_pse;
831-
832 static void __cpuinit default_init(struct cpuinfo_x86 * c)
833 {
834 /* Not much we can do here... */
835@@ -214,16 +219,8 @@ static void __cpuinit get_cpu_vendor(str
836
837 static int __init x86_fxsr_setup(char * s)
838 {
839- /* Tell all the other CPUs to not use it... */
840- disable_x86_fxsr = 1;
841-
842- /*
843- * ... and clear the bits early in the boot_cpu_data
844- * so that the bootup process doesn't try to do this
845- * either.
846- */
847- clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
848- clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
849+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
850+ setup_clear_cpu_cap(X86_FEATURE_XMM);
851 return 1;
852 }
853 __setup("nofxsr", x86_fxsr_setup);
854@@ -231,7 +228,7 @@ __setup("nofxsr", x86_fxsr_setup);
855
856 static int __init x86_sep_setup(char * s)
857 {
858- disable_x86_sep = 1;
859+ setup_clear_cpu_cap(X86_FEATURE_SEP);
860 return 1;
861 }
862 __setup("nosep", x86_sep_setup);
863@@ -268,10 +265,10 @@ static int __cpuinit have_cpuid_p(void)
864 void __init cpu_detect(struct cpuinfo_x86 *c)
865 {
866 /* Get vendor name */
867- cpuid(0x00000000, &c->cpuid_level,
868- (int *)&c->x86_vendor_id[0],
869- (int *)&c->x86_vendor_id[8],
870- (int *)&c->x86_vendor_id[4]);
871+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
872+ (unsigned int *)&c->x86_vendor_id[0],
873+ (unsigned int *)&c->x86_vendor_id[8],
874+ (unsigned int *)&c->x86_vendor_id[4]);
875
876 c->x86 = 4;
877 if (c->cpuid_level >= 0x00000001) {
878@@ -284,9 +281,38 @@ void __init cpu_detect(struct cpuinfo_x8
879 if (c->x86 >= 0x6)
880 c->x86_model += ((tfms >> 16) & 0xF) << 4;
881 c->x86_mask = tfms & 15;
882- if (cap0 & (1<<19))
883+ if (cap0 & (1<<19)) {
884 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
885+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
886+ }
887+ }
888+}
889+static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
890+{
891+ u32 tfms, xlvl;
892+ unsigned int ebx;
893+
894+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
895+ if (have_cpuid_p()) {
896+ /* Intel-defined flags: level 0x00000001 */
897+ if (c->cpuid_level >= 0x00000001) {
898+ u32 capability, excap;
899+ cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
900+ c->x86_capability[0] = capability;
901+ c->x86_capability[4] = excap;
902+ }
903+
904+ /* AMD-defined flags: level 0x80000001 */
905+ xlvl = cpuid_eax(0x80000000);
906+ if ((xlvl & 0xffff0000) == 0x80000000) {
907+ if (xlvl >= 0x80000001) {
908+ c->x86_capability[1] = cpuid_edx(0x80000001);
909+ c->x86_capability[6] = cpuid_ecx(0x80000001);
910+ }
911+ }
912+
913 }
914+
915 }
916
917 /* Do minimum CPU detection early.
918@@ -300,6 +326,7 @@ static void __init early_cpu_detect(void
919 struct cpuinfo_x86 *c = &boot_cpu_data;
920
921 c->x86_cache_alignment = 32;
922+ c->x86_clflush_size = 32;
923
924 if (!have_cpuid_p())
925 return;
926@@ -307,19 +334,30 @@ static void __init early_cpu_detect(void
927 cpu_detect(c);
928
929 get_cpu_vendor(c, 1);
930+
931+ switch (c->x86_vendor) {
932+ case X86_VENDOR_AMD:
933+ early_init_amd(c);
934+ break;
935+ case X86_VENDOR_INTEL:
936+ early_init_intel(c);
937+ break;
938+ }
939+
940+ early_get_cap(c);
941 }
942
943 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
944 {
945 u32 tfms, xlvl;
946- int ebx;
947+ unsigned int ebx;
948
949 if (have_cpuid_p()) {
950 /* Get vendor name */
951- cpuid(0x00000000, &c->cpuid_level,
952- (int *)&c->x86_vendor_id[0],
953- (int *)&c->x86_vendor_id[8],
954- (int *)&c->x86_vendor_id[4]);
955+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
956+ (unsigned int *)&c->x86_vendor_id[0],
957+ (unsigned int *)&c->x86_vendor_id[8],
958+ (unsigned int *)&c->x86_vendor_id[4]);
959
960 get_cpu_vendor(c, 0);
961 /* Initialize the standard set of capabilities */
962@@ -364,8 +402,6 @@ static void __cpuinit generic_identify(s
963 init_scattered_cpuid_features(c);
964 }
965
966- early_intel_workaround(c);
967-
968 #ifdef CONFIG_X86_HT
969 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
970 #endif
971@@ -399,7 +435,7 @@ __setup("serialnumber", x86_serial_nr_se
972 /*
973 * This does the hard work of actually picking apart the CPU stuff...
974 */
975-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
976+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
977 {
978 int i;
979
980@@ -425,20 +461,9 @@ static void __cpuinit identify_cpu(struc
981
982 generic_identify(c);
983
984- printk(KERN_DEBUG "CPU: After generic identify, caps:");
985- for (i = 0; i < NCAPINTS; i++)
986- printk(" %08lx", c->x86_capability[i]);
987- printk("\n");
988-
989- if (this_cpu->c_identify) {
990+ if (this_cpu->c_identify)
991 this_cpu->c_identify(c);
992
993- printk(KERN_DEBUG "CPU: After vendor identify, caps:");
994- for (i = 0; i < NCAPINTS; i++)
995- printk(" %08lx", c->x86_capability[i]);
996- printk("\n");
997- }
998-
999 /*
1000 * Vendor-specific initialization. In this section we
1001 * canonicalize the feature flags, meaning if there are
1002@@ -460,23 +485,6 @@ static void __cpuinit identify_cpu(struc
1003 * we do "generic changes."
1004 */
1005
1006- /* TSC disabled? */
1007- if ( tsc_disable )
1008- clear_bit(X86_FEATURE_TSC, c->x86_capability);
1009-
1010- /* FXSR disabled? */
1011- if (disable_x86_fxsr) {
1012- clear_bit(X86_FEATURE_FXSR, c->x86_capability);
1013- clear_bit(X86_FEATURE_XMM, c->x86_capability);
1014- }
1015-
1016- /* SEP disabled? */
1017- if (disable_x86_sep)
1018- clear_bit(X86_FEATURE_SEP, c->x86_capability);
1019-
1020- if (disable_pse)
1021- clear_bit(X86_FEATURE_PSE, c->x86_capability);
1022-
1023 /* If the model name is still unset, do table lookup. */
1024 if ( !c->x86_model_id[0] ) {
1025 char *p;
1026@@ -489,13 +497,6 @@ static void __cpuinit identify_cpu(struc
1027 c->x86, c->x86_model);
1028 }
1029
1030- /* Now the feature flags better reflect actual CPU features! */
1031-
1032- printk(KERN_DEBUG "CPU: After all inits, caps:");
1033- for (i = 0; i < NCAPINTS; i++)
1034- printk(" %08lx", c->x86_capability[i]);
1035- printk("\n");
1036-
1037 /*
1038 * On SMP, boot_cpu_data holds the common feature set between
1039 * all CPUs; so make sure that we indicate which features are
1040@@ -508,8 +509,14 @@ static void __cpuinit identify_cpu(struc
1041 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1042 }
1043
1044+ /* Clear all flags overriden by options */
1045+ for (i = 0; i < NCAPINTS; i++)
1046+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
1047+
1048 /* Init Machine Check Exception if available. */
1049 mcheck_init(c);
1050+
1051+ select_idle_routine(c);
1052 }
1053
1054 void __init identify_boot_cpu(void)
1055@@ -517,7 +524,6 @@ void __init identify_boot_cpu(void)
1056 identify_cpu(&boot_cpu_data);
1057 sysenter_setup();
1058 enable_sep_cpu();
1059- mtrr_bp_init();
1060 }
1061
1062 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1063@@ -574,6 +580,13 @@ void __cpuinit detect_ht(struct cpuinfo_
1064 }
1065 #endif
1066
1067+static __init int setup_noclflush(char *arg)
1068+{
1069+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1070+ return 1;
1071+}
1072+__setup("noclflush", setup_noclflush);
1073+
1074 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1075 {
1076 char *vendor = NULL;
1077@@ -597,6 +610,17 @@ void __cpuinit print_cpu_info(struct cpu
1078 printk("\n");
1079 }
1080
1081+static __init int setup_disablecpuid(char *arg)
1082+{
1083+ int bit;
1084+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1085+ setup_clear_cpu_cap(bit);
1086+ else
1087+ return 0;
1088+ return 1;
1089+}
1090+__setup("clearcpuid=", setup_disablecpuid);
1091+
1092 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1093
1094 /* This is hacky. :)
1095@@ -606,16 +630,6 @@ cpumask_t cpu_initialized __cpuinitdata
1096 * They will insert themselves into the cpu_devs structure.
1097 * Then, when cpu_init() is called, we can just iterate over that array.
1098 */
1099-
1100-extern int intel_cpu_init(void);
1101-extern int cyrix_init_cpu(void);
1102-extern int nsc_init_cpu(void);
1103-extern int amd_init_cpu(void);
1104-extern int centaur_init_cpu(void);
1105-extern int transmeta_init_cpu(void);
1106-extern int nexgen_init_cpu(void);
1107-extern int umc_init_cpu(void);
1108-
1109 void __init early_cpu_init(void)
1110 {
1111 intel_cpu_init();
1112@@ -627,21 +641,13 @@ void __init early_cpu_init(void)
1113 nexgen_init_cpu();
1114 umc_init_cpu();
1115 early_cpu_detect();
1116-
1117-#ifdef CONFIG_DEBUG_PAGEALLOC
1118- /* pse is not compatible with on-the-fly unmapping,
1119- * disable it even if the cpus claim to support it.
1120- */
1121- clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
1122- disable_pse = 1;
1123-#endif
1124 }
1125
1126 /* Make sure %fs is initialized properly in idle threads */
1127-struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
1128+struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
1129 {
1130 memset(regs, 0, sizeof(struct pt_regs));
1131- regs->xfs = __KERNEL_PERCPU;
1132+ regs->fs = __KERNEL_PERCPU;
1133 return regs;
1134 }
1135
1136@@ -649,7 +655,7 @@ struct pt_regs * __devinit idle_regs(str
1137 * it's on the real one. */
1138 void switch_to_new_gdt(void)
1139 {
1140- struct Xgt_desc_struct gdt_descr;
1141+ struct desc_ptr gdt_descr;
1142 unsigned long va, frames[16];
1143 int f;
1144
1145@@ -692,12 +698,6 @@ void __cpuinit cpu_init(void)
1146
1147 if (cpu_has_vme || cpu_has_de)
1148 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1149- if (tsc_disable && cpu_has_tsc) {
1150- printk(KERN_NOTICE "Disabling TSC...\n");
1151- /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
1152- clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
1153- set_in_cr4(X86_CR4_TSD);
1154- }
1155
1156 switch_to_new_gdt();
1157
1158@@ -710,7 +710,7 @@ void __cpuinit cpu_init(void)
1159 BUG();
1160 enter_lazy_tlb(&init_mm, curr);
1161
1162- load_esp0(t, thread);
1163+ load_sp0(t, thread);
1164
1165 load_LDT(&init_mm.context);
1166
1167--- a/arch/x86/kernel/cpu/mtrr/main-xen.c
1168+++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
1169@@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = {
1170
1171 struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
1172 unsigned int num_var_ranges;
1173-unsigned int *usage_table;
1174+unsigned int mtrr_usage_table[MAX_VAR_RANGES];
1175
1176 static void __init set_num_var_ranges(void)
1177 {
1178@@ -52,17 +52,12 @@ static void __init init_table(void)
1179 int i, max;
1180
1181 max = num_var_ranges;
1182- if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
1183- == NULL) {
1184- printk(KERN_ERR "mtrr: could not allocate\n");
1185- return;
1186- }
1187 for (i = 0; i < max; i++)
1188- usage_table[i] = 0;
1189+ mtrr_usage_table[i] = 0;
1190 }
1191
1192 int mtrr_add_page(unsigned long base, unsigned long size,
1193- unsigned int type, char increment)
1194+ unsigned int type, bool increment)
1195 {
1196 int error;
1197 struct xen_platform_op op;
1198@@ -81,7 +76,7 @@ int mtrr_add_page(unsigned long base, un
1199 }
1200
1201 if (increment)
1202- ++usage_table[op.u.add_memtype.reg];
1203+ ++mtrr_usage_table[op.u.add_memtype.reg];
1204
1205 mutex_unlock(&mtrr_mutex);
1206
1207@@ -103,7 +98,7 @@ static int mtrr_check(unsigned long base
1208
1209 int
1210 mtrr_add(unsigned long base, unsigned long size, unsigned int type,
1211- char increment)
1212+ bool increment)
1213 {
1214 if (mtrr_check(base, size))
1215 return -EINVAL;
1216@@ -136,11 +131,11 @@ int mtrr_del_page(int reg, unsigned long
1217 goto out;
1218 }
1219 }
1220- if (usage_table[reg] < 1) {
1221+ if (mtrr_usage_table[reg] < 1) {
1222 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
1223 goto out;
1224 }
1225- if (--usage_table[reg] < 1) {
1226+ if (--mtrr_usage_table[reg] < 1) {
1227 op.cmd = XENPF_del_memtype;
1228 op.u.del_memtype.handle = 0;
1229 op.u.del_memtype.reg = reg;
1230--- a/arch/x86/kernel/e820_32-xen.c
1231+++ b/arch/x86/kernel/e820_32-xen.c
1232@@ -7,7 +7,6 @@
1233 #include <linux/kexec.h>
1234 #include <linux/module.h>
1235 #include <linux/mm.h>
1236-#include <linux/efi.h>
1237 #include <linux/pfn.h>
1238 #include <linux/uaccess.h>
1239 #include <linux/suspend.h>
1240@@ -18,11 +17,6 @@
1241 #include <asm/setup.h>
1242 #include <xen/interface/memory.h>
1243
1244-#ifdef CONFIG_EFI
1245-int efi_enabled = 0;
1246-EXPORT_SYMBOL(efi_enabled);
1247-#endif
1248-
1249 struct e820map e820;
1250 struct change_member {
1251 struct e820entry *pbios; /* pointer to original bios entry */
1252@@ -38,26 +32,6 @@ unsigned long pci_mem_start = 0x10000000
1253 EXPORT_SYMBOL(pci_mem_start);
1254 #endif
1255 extern int user_defined_memmap;
1256-struct resource data_resource = {
1257- .name = "Kernel data",
1258- .start = 0,
1259- .end = 0,
1260- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1261-};
1262-
1263-struct resource code_resource = {
1264- .name = "Kernel code",
1265- .start = 0,
1266- .end = 0,
1267- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1268-};
1269-
1270-struct resource bss_resource = {
1271- .name = "Kernel bss",
1272- .start = 0,
1273- .end = 0,
1274- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1275-};
1276
1277 static struct resource system_rom_resource = {
1278 .name = "System ROM",
1279@@ -112,60 +86,6 @@ static struct resource video_rom_resourc
1280 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
1281 };
1282
1283-static struct resource video_ram_resource = {
1284- .name = "Video RAM area",
1285- .start = 0xa0000,
1286- .end = 0xbffff,
1287- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1288-};
1289-
1290-static struct resource standard_io_resources[] = { {
1291- .name = "dma1",
1292- .start = 0x0000,
1293- .end = 0x001f,
1294- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1295-}, {
1296- .name = "pic1",
1297- .start = 0x0020,
1298- .end = 0x0021,
1299- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1300-}, {
1301- .name = "timer0",
1302- .start = 0x0040,
1303- .end = 0x0043,
1304- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1305-}, {
1306- .name = "timer1",
1307- .start = 0x0050,
1308- .end = 0x0053,
1309- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1310-}, {
1311- .name = "keyboard",
1312- .start = 0x0060,
1313- .end = 0x006f,
1314- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1315-}, {
1316- .name = "dma page reg",
1317- .start = 0x0080,
1318- .end = 0x008f,
1319- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1320-}, {
1321- .name = "pic2",
1322- .start = 0x00a0,
1323- .end = 0x00a1,
1324- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1325-}, {
1326- .name = "dma2",
1327- .start = 0x00c0,
1328- .end = 0x00df,
1329- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1330-}, {
1331- .name = "fpu",
1332- .start = 0x00f0,
1333- .end = 0x00ff,
1334- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1335-} };
1336-
1337 #define ROMSIGNATURE 0xaa55
1338
1339 static int __init romsignature(const unsigned char *rom)
1340@@ -272,10 +192,9 @@ static struct e820map machine_e820;
1341 * Request address space for all standard RAM and ROM resources
1342 * and also for regions reported as reserved by the e820.
1343 */
1344-static void __init
1345-legacy_init_iomem_resources(struct resource *code_resource,
1346- struct resource *data_resource,
1347- struct resource *bss_resource)
1348+void __init init_iomem_resources(struct resource *code_resource,
1349+ struct resource *data_resource,
1350+ struct resource *bss_resource)
1351 {
1352 int i;
1353
1354@@ -324,39 +243,6 @@ legacy_init_iomem_resources(struct resou
1355
1356 #undef e820
1357
1358-/*
1359- * Request address space for all standard resources
1360- *
1361- * This is called just before pcibios_init(), which is also a
1362- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
1363- */
1364-static int __init request_standard_resources(void)
1365-{
1366- int i;
1367-
1368- /* Nothing to do if not running in dom0. */
1369- if (!is_initial_xendomain())
1370- return 0;
1371-
1372- printk("Setting up standard PCI resources\n");
1373- if (efi_enabled)
1374- efi_initialize_iomem_resources(&code_resource,
1375- &data_resource, &bss_resource);
1376- else
1377- legacy_init_iomem_resources(&code_resource,
1378- &data_resource, &bss_resource);
1379-
1380- /* EFI systems may still have VGA */
1381- request_resource(&iomem_resource, &video_ram_resource);
1382-
1383- /* request I/O space for devices used on all i[345]86 PCs */
1384- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
1385- request_resource(&ioport_resource, &standard_io_resources[i]);
1386- return 0;
1387-}
1388-
1389-subsys_initcall(request_standard_resources);
1390-
1391 #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
1392 /**
1393 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
1394@@ -393,19 +279,17 @@ void __init add_memory_region(unsigned l
1395 {
1396 int x;
1397
1398- if (!efi_enabled) {
1399- x = e820.nr_map;
1400-
1401- if (x == E820MAX) {
1402- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1403- return;
1404- }
1405+ x = e820.nr_map;
1406
1407- e820.map[x].addr = start;
1408- e820.map[x].size = size;
1409- e820.map[x].type = type;
1410- e820.nr_map++;
1411+ if (x == E820MAX) {
1412+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1413+ return;
1414 }
1415+
1416+ e820.map[x].addr = start;
1417+ e820.map[x].size = size;
1418+ e820.map[x].type = type;
1419+ e820.nr_map++;
1420 } /* add_memory_region */
1421
1422 /*
1423@@ -642,29 +526,6 @@ int __init copy_e820_map(struct e820entr
1424 }
1425
1426 /*
1427- * Callback for efi_memory_walk.
1428- */
1429-static int __init
1430-efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
1431-{
1432- unsigned long *max_pfn = arg, pfn;
1433-
1434- if (start < end) {
1435- pfn = PFN_UP(end -1);
1436- if (pfn > *max_pfn)
1437- *max_pfn = pfn;
1438- }
1439- return 0;
1440-}
1441-
1442-static int __init
1443-efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
1444-{
1445- memory_present(0, PFN_UP(start), PFN_DOWN(end));
1446- return 0;
1447-}
1448-
1449-/*
1450 * Find the highest page frame number we have available
1451 */
1452 void __init find_max_pfn(void)
1453@@ -672,11 +533,6 @@ void __init find_max_pfn(void)
1454 int i;
1455
1456 max_pfn = 0;
1457- if (efi_enabled) {
1458- efi_memmap_walk(efi_find_max_pfn, &max_pfn);
1459- efi_memmap_walk(efi_memory_present_wrapper, NULL);
1460- return;
1461- }
1462
1463 for (i = 0; i < e820.nr_map; i++) {
1464 unsigned long start, end;
1465@@ -694,34 +550,12 @@ void __init find_max_pfn(void)
1466 }
1467
1468 /*
1469- * Free all available memory for boot time allocation. Used
1470- * as a callback function by efi_memory_walk()
1471- */
1472-
1473-static int __init
1474-free_available_memory(unsigned long start, unsigned long end, void *arg)
1475-{
1476- /* check max_low_pfn */
1477- if (start >= (max_low_pfn << PAGE_SHIFT))
1478- return 0;
1479- if (end >= (max_low_pfn << PAGE_SHIFT))
1480- end = max_low_pfn << PAGE_SHIFT;
1481- if (start < end)
1482- free_bootmem(start, end - start);
1483-
1484- return 0;
1485-}
1486-/*
1487 * Register fully available low RAM pages with the bootmem allocator.
1488 */
1489 void __init register_bootmem_low_pages(unsigned long max_low_pfn)
1490 {
1491 int i;
1492
1493- if (efi_enabled) {
1494- efi_memmap_walk(free_available_memory, NULL);
1495- return;
1496- }
1497 for (i = 0; i < e820.nr_map; i++) {
1498 unsigned long curr_pfn, last_pfn, size;
1499 /*
1500@@ -855,56 +689,12 @@ void __init print_memory_map(char *who)
1501 }
1502 }
1503
1504-static __init __always_inline void efi_limit_regions(unsigned long long size)
1505-{
1506- unsigned long long current_addr = 0;
1507- efi_memory_desc_t *md, *next_md;
1508- void *p, *p1;
1509- int i, j;
1510-
1511- j = 0;
1512- p1 = memmap.map;
1513- for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
1514- md = p;
1515- next_md = p1;
1516- current_addr = md->phys_addr +
1517- PFN_PHYS(md->num_pages);
1518- if (is_available_memory(md)) {
1519- if (md->phys_addr >= size) continue;
1520- memcpy(next_md, md, memmap.desc_size);
1521- if (current_addr >= size) {
1522- next_md->num_pages -=
1523- PFN_UP(current_addr-size);
1524- }
1525- p1 += memmap.desc_size;
1526- next_md = p1;
1527- j++;
1528- } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
1529- EFI_MEMORY_RUNTIME) {
1530- /* In order to make runtime services
1531- * available we have to include runtime
1532- * memory regions in memory map */
1533- memcpy(next_md, md, memmap.desc_size);
1534- p1 += memmap.desc_size;
1535- next_md = p1;
1536- j++;
1537- }
1538- }
1539- memmap.nr_map = j;
1540- memmap.map_end = memmap.map +
1541- (memmap.nr_map * memmap.desc_size);
1542-}
1543-
1544 void __init limit_regions(unsigned long long size)
1545 {
1546 unsigned long long current_addr = 0;
1547 int i;
1548
1549 print_memory_map("limit_regions start");
1550- if (efi_enabled) {
1551- efi_limit_regions(size);
1552- return;
1553- }
1554 for (i = 0; i < e820.nr_map; i++) {
1555 current_addr = e820.map[i].addr + e820.map[i].size;
1556 if (current_addr < size)
1557@@ -1056,3 +846,44 @@ static int __init parse_memmap(char *arg
1558 return 0;
1559 }
1560 early_param("memmap", parse_memmap);
1561+
1562+#ifndef CONFIG_XEN
1563+void __init update_memory_range(u64 start, u64 size, unsigned old_type,
1564+ unsigned new_type)
1565+{
1566+ int i;
1567+
1568+ BUG_ON(old_type == new_type);
1569+
1570+ for (i = 0; i < e820.nr_map; i++) {
1571+ struct e820entry *ei = &e820.map[i];
1572+ u64 final_start, final_end;
1573+ if (ei->type != old_type)
1574+ continue;
1575+ /* totally covered? */
1576+ if (ei->addr >= start && ei->size <= size) {
1577+ ei->type = new_type;
1578+ continue;
1579+ }
1580+ /* partially covered */
1581+ final_start = max(start, ei->addr);
1582+ final_end = min(start + size, ei->addr + ei->size);
1583+ if (final_start >= final_end)
1584+ continue;
1585+ add_memory_region(final_start, final_end - final_start,
1586+ new_type);
1587+ }
1588+}
1589+
1590+void __init update_e820(void)
1591+{
1592+ u8 nr_map;
1593+
1594+ nr_map = e820.nr_map;
1595+ if (sanitize_e820_map(e820.map, &nr_map))
1596+ return;
1597+ e820.nr_map = nr_map;
1598+ printk(KERN_INFO "modified physical RAM map:\n");
1599+ print_memory_map("modified");
1600+}
1601+#endif
1602--- a/arch/x86/kernel/e820_64-xen.c
1603+++ b/arch/x86/kernel/e820_64-xen.c
1604@@ -1,4 +1,4 @@
1605-/*
1606+/*
1607 * Handle the memory map.
1608 * The functions here do the job until bootmem takes over.
1609 *
1610@@ -26,6 +26,7 @@
1611 #include <asm/proto.h>
1612 #include <asm/setup.h>
1613 #include <asm/sections.h>
1614+#include <asm/kdebug.h>
1615 #include <xen/interface/memory.h>
1616
1617 struct e820map e820 __initdata;
1618@@ -33,96 +34,103 @@ struct e820map e820 __initdata;
1619 struct e820map machine_e820;
1620 #endif
1621
1622-/*
1623+/*
1624 * PFN of last memory page.
1625 */
1626-unsigned long end_pfn;
1627-EXPORT_SYMBOL(end_pfn);
1628+unsigned long end_pfn;
1629
1630-/*
1631+/*
1632 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
1633 * The direct mapping extends to end_pfn_map, so that we can directly access
1634 * apertures, ACPI and other tables without having to play with fixmaps.
1635- */
1636-unsigned long end_pfn_map;
1637+ */
1638+unsigned long end_pfn_map;
1639
1640-/*
1641+/*
1642 * Last pfn which the user wants to use.
1643 */
1644 static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
1645
1646-extern struct resource code_resource, data_resource, bss_resource;
1647-
1648-/* Check for some hardcoded bad areas that early boot is not allowed to touch */
1649-static inline int bad_addr(unsigned long *addrp, unsigned long size)
1650-{
1651- unsigned long addr = *addrp, last = addr + size;
1652+/*
1653+ * Early reserved memory areas.
1654+ */
1655+#define MAX_EARLY_RES 20
1656
1657+struct early_res {
1658+ unsigned long start, end;
1659+ char name[16];
1660+};
1661+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
1662 #ifndef CONFIG_XEN
1663- /* various gunk below that needed for SMP startup */
1664- if (addr < 0x8000) {
1665- *addrp = PAGE_ALIGN(0x8000);
1666- return 1;
1667- }
1668-
1669- /* direct mapping tables of the kernel */
1670- if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
1671- *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
1672- return 1;
1673- }
1674-
1675- /* initrd */
1676-#ifdef CONFIG_BLK_DEV_INITRD
1677- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
1678- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
1679- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
1680- unsigned long ramdisk_end = ramdisk_image+ramdisk_size;
1681-
1682- if (last >= ramdisk_image && addr < ramdisk_end) {
1683- *addrp = PAGE_ALIGN(ramdisk_end);
1684- return 1;
1685- }
1686- }
1687+ { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
1688+#ifdef CONFIG_SMP
1689+ { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
1690 #endif
1691- /* kernel code */
1692- if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
1693- *addrp = PAGE_ALIGN(__pa_symbol(&_end));
1694- return 1;
1695- }
1696+#endif
1697+ {}
1698+};
1699
1700- if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
1701- *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
1702- return 1;
1703+void __init reserve_early(unsigned long start, unsigned long end, char *name)
1704+{
1705+ int i;
1706+ struct early_res *r;
1707+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1708+ r = &early_res[i];
1709+ if (end > r->start && start < r->end)
1710+ panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
1711+ start, end - 1, name?name:"", r->start, r->end - 1, r->name);
1712 }
1713+ if (i >= MAX_EARLY_RES)
1714+ panic("Too many early reservations");
1715+ r = &early_res[i];
1716+ r->start = start;
1717+ r->end = end;
1718+ if (name)
1719+ strncpy(r->name, name, sizeof(r->name) - 1);
1720+}
1721
1722-#ifdef CONFIG_NUMA
1723- /* NUMA memory to node map */
1724- if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
1725- *addrp = nodemap_addr + nodemap_size;
1726- return 1;
1727+void __init early_res_to_bootmem(void)
1728+{
1729+ int i;
1730+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1731+ struct early_res *r = &early_res[i];
1732+ printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
1733+ r->start, r->end - 1, r->name);
1734+ reserve_bootmem_generic(r->start, r->end - r->start);
1735 }
1736-#endif
1737- /* XXX ramdisk image here? */
1738-#else
1739- if (last < (table_end<<PAGE_SHIFT)) {
1740- *addrp = table_end << PAGE_SHIFT;
1741- return 1;
1742+}
1743+
1744+/* Check for already reserved areas */
1745+static inline int bad_addr(unsigned long *addrp, unsigned long size)
1746+{
1747+ int i;
1748+ unsigned long addr = *addrp, last;
1749+ int changed = 0;
1750+again:
1751+ last = addr + size;
1752+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1753+ struct early_res *r = &early_res[i];
1754+ if (last >= r->start && addr < r->end) {
1755+ *addrp = addr = r->end;
1756+ changed = 1;
1757+ goto again;
1758+ }
1759 }
1760-#endif
1761- return 0;
1762-}
1763+ return changed;
1764+}
1765
1766 /*
1767 * This function checks if any part of the range <start,end> is mapped
1768 * with type.
1769 */
1770-int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1771-{
1772+int
1773+e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1774+{
1775 int i;
1776
1777 #ifndef CONFIG_XEN
1778- for (i = 0; i < e820.nr_map; i++) {
1779- struct e820entry *ei = &e820.map[i];
1780+ for (i = 0; i < e820.nr_map; i++) {
1781+ struct e820entry *ei = &e820.map[i];
1782 #else
1783 if (!is_initial_xendomain())
1784 return 0;
1785@@ -130,12 +138,12 @@ int e820_any_mapped(unsigned long start,
1786 const struct e820entry *ei = &machine_e820.map[i];
1787 #endif
1788
1789- if (type && ei->type != type)
1790+ if (type && ei->type != type)
1791 continue;
1792 if (ei->addr >= end || ei->addr + ei->size <= start)
1793- continue;
1794- return 1;
1795- }
1796+ continue;
1797+ return 1;
1798+ }
1799 return 0;
1800 }
1801 EXPORT_SYMBOL_GPL(e820_any_mapped);
1802@@ -146,7 +154,8 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
1803 * Note: this function only works correct if the e820 table is sorted and
1804 * not-overlapping, which is the case
1805 */
1806-int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
1807+int __init e820_all_mapped(unsigned long start, unsigned long end,
1808+ unsigned type)
1809 {
1810 int i;
1811
1812@@ -171,65 +180,77 @@ int __init e820_all_mapped(unsigned long
1813 */
1814 if (ei->addr <= start)
1815 start = ei->addr + ei->size;
1816- /* if start is now at or beyond end, we're done, full coverage */
1817+ /*
1818+ * if start is now at or beyond end, we're done, full
1819+ * coverage
1820+ */
1821 if (start >= end)
1822- return 1; /* we're done */
1823+ return 1;
1824 }
1825 return 0;
1826 }
1827
1828-/*
1829- * Find a free area in a specific range.
1830- */
1831-unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
1832-{
1833- int i;
1834- for (i = 0; i < e820.nr_map; i++) {
1835- struct e820entry *ei = &e820.map[i];
1836- unsigned long addr = ei->addr, last;
1837- if (ei->type != E820_RAM)
1838- continue;
1839- if (addr < start)
1840+/*
1841+ * Find a free area with specified alignment in a specific range.
1842+ */
1843+unsigned long __init find_e820_area(unsigned long start, unsigned long end,
1844+ unsigned size, unsigned long align)
1845+{
1846+ int i;
1847+ unsigned long mask = ~(align - 1);
1848+
1849+ for (i = 0; i < e820.nr_map; i++) {
1850+ struct e820entry *ei = &e820.map[i];
1851+ unsigned long addr = ei->addr, last;
1852+
1853+ if (ei->type != E820_RAM)
1854+ continue;
1855+ if (addr < start)
1856 addr = start;
1857- if (addr > ei->addr + ei->size)
1858- continue;
1859+ if (addr > ei->addr + ei->size)
1860+ continue;
1861 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
1862 ;
1863- last = PAGE_ALIGN(addr) + size;
1864+ addr = (addr + align - 1) & mask;
1865+ last = addr + size;
1866 if (last > ei->addr + ei->size)
1867 continue;
1868- if (last > end)
1869+ if (last > end)
1870 continue;
1871- return addr;
1872- }
1873- return -1UL;
1874-}
1875+ return addr;
1876+ }
1877+ return -1UL;
1878+}
1879
1880 /*
1881 * Find the highest page frame number we have available
1882 */
1883 unsigned long __init e820_end_of_ram(void)
1884 {
1885- unsigned long end_pfn = 0;
1886+ unsigned long end_pfn;
1887+
1888 end_pfn = find_max_pfn_with_active_regions();
1889-
1890- if (end_pfn > end_pfn_map)
1891+
1892+ if (end_pfn > end_pfn_map)
1893 end_pfn_map = end_pfn;
1894 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
1895 end_pfn_map = MAXMEM>>PAGE_SHIFT;
1896 if (end_pfn > end_user_pfn)
1897 end_pfn = end_user_pfn;
1898- if (end_pfn > end_pfn_map)
1899- end_pfn = end_pfn_map;
1900+ if (end_pfn > end_pfn_map)
1901+ end_pfn = end_pfn_map;
1902
1903- printk("end_pfn_map = %lu\n", end_pfn_map);
1904- return end_pfn;
1905+ printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
1906+ return end_pfn;
1907 }
1908
1909 /*
1910 * Mark e820 reserved areas as busy for the resource manager.
1911 */
1912-void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
1913+void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
1914+ struct resource *code_resource,
1915+ struct resource *data_resource,
1916+ struct resource *bss_resource)
1917 {
1918 int i;
1919 for (i = 0; i < nr_map; i++) {
1920@@ -247,14 +268,14 @@ void __init e820_reserve_resources(struc
1921 request_resource(&iomem_resource, res);
1922 if (e820[i].type == E820_RAM) {
1923 /*
1924- * We don't know which RAM region contains kernel data,
1925- * so we try it repeatedly and let the resource manager
1926- * test it.
1927+ * We don't know which RAM region contains kernel data,
1928+ * so we try it repeatedly and let the resource manager
1929+ * test it.
1930 */
1931 #ifndef CONFIG_XEN
1932- request_resource(res, &code_resource);
1933- request_resource(res, &data_resource);
1934- request_resource(res, &bss_resource);
1935+ request_resource(res, code_resource);
1936+ request_resource(res, data_resource);
1937+ request_resource(res, bss_resource);
1938 #endif
1939 #ifdef CONFIG_KEXEC
1940 if (crashk_res.start != crashk_res.end)
1941@@ -357,9 +378,9 @@ e820_register_active_regions(int nid, un
1942 add_active_range(nid, ei_startpfn, ei_endpfn);
1943 }
1944
1945-/*
1946+/*
1947 * Add a memory region to the kernel e820 map.
1948- */
1949+ */
1950 void __init add_memory_region(unsigned long start, unsigned long size, int type)
1951 {
1952 int x = e820.nr_map;
1953@@ -384,9 +405,7 @@ unsigned long __init e820_hole_size(unsi
1954 {
1955 unsigned long start_pfn = start >> PAGE_SHIFT;
1956 unsigned long end_pfn = end >> PAGE_SHIFT;
1957- unsigned long ei_startpfn;
1958- unsigned long ei_endpfn;
1959- unsigned long ram = 0;
1960+ unsigned long ei_startpfn, ei_endpfn, ram = 0;
1961 int i;
1962
1963 for (i = 0; i < e820.nr_map; i++) {
1964@@ -398,28 +417,31 @@ unsigned long __init e820_hole_size(unsi
1965 return end - start - (ram << PAGE_SHIFT);
1966 }
1967
1968-void __init e820_print_map(char *who)
1969+static void __init e820_print_map(char *who)
1970 {
1971 int i;
1972
1973 for (i = 0; i < e820.nr_map; i++) {
1974 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1975- (unsigned long long) e820.map[i].addr,
1976- (unsigned long long) (e820.map[i].addr + e820.map[i].size));
1977+ (unsigned long long) e820.map[i].addr,
1978+ (unsigned long long)
1979+ (e820.map[i].addr + e820.map[i].size));
1980 switch (e820.map[i].type) {
1981- case E820_RAM: printk("(usable)\n");
1982- break;
1983+ case E820_RAM:
1984+ printk(KERN_CONT "(usable)\n");
1985+ break;
1986 case E820_RESERVED:
1987- printk("(reserved)\n");
1988- break;
1989+ printk(KERN_CONT "(reserved)\n");
1990+ break;
1991 case E820_ACPI:
1992- printk("(ACPI data)\n");
1993- break;
1994+ printk(KERN_CONT "(ACPI data)\n");
1995+ break;
1996 case E820_NVS:
1997- printk("(ACPI NVS)\n");
1998- break;
1999- default: printk("type %u\n", e820.map[i].type);
2000- break;
2001+ printk(KERN_CONT "(ACPI NVS)\n");
2002+ break;
2003+ default:
2004+ printk(KERN_CONT "type %u\n", e820.map[i].type);
2005+ break;
2006 }
2007 }
2008 }
2009@@ -427,11 +449,11 @@ void __init e820_print_map(char *who)
2010 /*
2011 * Sanitize the BIOS e820 map.
2012 *
2013- * Some e820 responses include overlapping entries. The following
2014+ * Some e820 responses include overlapping entries. The following
2015 * replaces the original e820 map with a new one, removing overlaps.
2016 *
2017 */
2018-static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
2019+static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
2020 {
2021 struct change_member {
2022 struct e820entry *pbios; /* pointer to original bios entry */
2023@@ -451,7 +473,8 @@ static int __init sanitize_e820_map(stru
2024 int i;
2025
2026 /*
2027- Visually we're performing the following (1,2,3,4 = memory types)...
2028+ Visually we're performing the following
2029+ (1,2,3,4 = memory types)...
2030
2031 Sample memory map (w/overlaps):
2032 ____22__________________
2033@@ -493,22 +516,23 @@ static int __init sanitize_e820_map(stru
2034 old_nr = *pnr_map;
2035
2036 /* bail out if we find any unreasonable addresses in bios map */
2037- for (i=0; i<old_nr; i++)
2038+ for (i = 0; i < old_nr; i++)
2039 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
2040 return -1;
2041
2042 /* create pointers for initial change-point information (for sorting) */
2043- for (i=0; i < 2*old_nr; i++)
2044+ for (i = 0; i < 2 * old_nr; i++)
2045 change_point[i] = &change_point_list[i];
2046
2047 /* record all known change-points (starting and ending addresses),
2048 omitting those that are for empty memory regions */
2049 chgidx = 0;
2050- for (i=0; i < old_nr; i++) {
2051+ for (i = 0; i < old_nr; i++) {
2052 if (biosmap[i].size != 0) {
2053 change_point[chgidx]->addr = biosmap[i].addr;
2054 change_point[chgidx++]->pbios = &biosmap[i];
2055- change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
2056+ change_point[chgidx]->addr = biosmap[i].addr +
2057+ biosmap[i].size;
2058 change_point[chgidx++]->pbios = &biosmap[i];
2059 }
2060 }
2061@@ -518,75 +542,106 @@ static int __init sanitize_e820_map(stru
2062 still_changing = 1;
2063 while (still_changing) {
2064 still_changing = 0;
2065- for (i=1; i < chg_nr; i++) {
2066- /* if <current_addr> > <last_addr>, swap */
2067- /* or, if current=<start_addr> & last=<end_addr>, swap */
2068- if ((change_point[i]->addr < change_point[i-1]->addr) ||
2069- ((change_point[i]->addr == change_point[i-1]->addr) &&
2070- (change_point[i]->addr == change_point[i]->pbios->addr) &&
2071- (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
2072- )
2073- {
2074+ for (i = 1; i < chg_nr; i++) {
2075+ unsigned long long curaddr, lastaddr;
2076+ unsigned long long curpbaddr, lastpbaddr;
2077+
2078+ curaddr = change_point[i]->addr;
2079+ lastaddr = change_point[i - 1]->addr;
2080+ curpbaddr = change_point[i]->pbios->addr;
2081+ lastpbaddr = change_point[i - 1]->pbios->addr;
2082+
2083+ /*
2084+ * swap entries, when:
2085+ *
2086+ * curaddr > lastaddr or
2087+ * curaddr == lastaddr and curaddr == curpbaddr and
2088+ * lastaddr != lastpbaddr
2089+ */
2090+ if (curaddr < lastaddr ||
2091+ (curaddr == lastaddr && curaddr == curpbaddr &&
2092+ lastaddr != lastpbaddr)) {
2093 change_tmp = change_point[i];
2094 change_point[i] = change_point[i-1];
2095 change_point[i-1] = change_tmp;
2096- still_changing=1;
2097+ still_changing = 1;
2098 }
2099 }
2100 }
2101
2102 /* create a new bios memory map, removing overlaps */
2103- overlap_entries=0; /* number of entries in the overlap table */
2104- new_bios_entry=0; /* index for creating new bios map entries */
2105+ overlap_entries = 0; /* number of entries in the overlap table */
2106+ new_bios_entry = 0; /* index for creating new bios map entries */
2107 last_type = 0; /* start with undefined memory type */
2108 last_addr = 0; /* start with 0 as last starting address */
2109+
2110 /* loop through change-points, determining affect on the new bios map */
2111- for (chgidx=0; chgidx < chg_nr; chgidx++)
2112- {
2113+ for (chgidx = 0; chgidx < chg_nr; chgidx++) {
2114 /* keep track of all overlapping bios entries */
2115- if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
2116- {
2117- /* add map entry to overlap list (> 1 entry implies an overlap) */
2118- overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
2119- }
2120- else
2121- {
2122- /* remove entry from list (order independent, so swap with last) */
2123- for (i=0; i<overlap_entries; i++)
2124- {
2125- if (overlap_list[i] == change_point[chgidx]->pbios)
2126- overlap_list[i] = overlap_list[overlap_entries-1];
2127+ if (change_point[chgidx]->addr ==
2128+ change_point[chgidx]->pbios->addr) {
2129+ /*
2130+ * add map entry to overlap list (> 1 entry
2131+ * implies an overlap)
2132+ */
2133+ overlap_list[overlap_entries++] =
2134+ change_point[chgidx]->pbios;
2135+ } else {
2136+ /*
2137+ * remove entry from list (order independent,
2138+ * so swap with last)
2139+ */
2140+ for (i = 0; i < overlap_entries; i++) {
2141+ if (overlap_list[i] ==
2142+ change_point[chgidx]->pbios)
2143+ overlap_list[i] =
2144+ overlap_list[overlap_entries-1];
2145 }
2146 overlap_entries--;
2147 }
2148- /* if there are overlapping entries, decide which "type" to use */
2149- /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
2150+ /*
2151+ * if there are overlapping entries, decide which
2152+ * "type" to use (larger value takes precedence --
2153+ * 1=usable, 2,3,4,4+=unusable)
2154+ */
2155 current_type = 0;
2156- for (i=0; i<overlap_entries; i++)
2157+ for (i = 0; i < overlap_entries; i++)
2158 if (overlap_list[i]->type > current_type)
2159 current_type = overlap_list[i]->type;
2160- /* continue building up new bios map based on this information */
2161+ /*
2162+ * continue building up new bios map based on this
2163+ * information
2164+ */
2165 if (current_type != last_type) {
2166 if (last_type != 0) {
2167 new_bios[new_bios_entry].size =
2168 change_point[chgidx]->addr - last_addr;
2169- /* move forward only if the new size was non-zero */
2170+ /*
2171+ * move forward only if the new size
2172+ * was non-zero
2173+ */
2174 if (new_bios[new_bios_entry].size != 0)
2175+ /*
2176+ * no more space left for new
2177+ * bios entries ?
2178+ */
2179 if (++new_bios_entry >= E820MAX)
2180- break; /* no more space left for new bios entries */
2181+ break;
2182 }
2183 if (current_type != 0) {
2184- new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
2185+ new_bios[new_bios_entry].addr =
2186+ change_point[chgidx]->addr;
2187 new_bios[new_bios_entry].type = current_type;
2188- last_addr=change_point[chgidx]->addr;
2189+ last_addr = change_point[chgidx]->addr;
2190 }
2191 last_type = current_type;
2192 }
2193 }
2194- new_nr = new_bios_entry; /* retain count for new bios entries */
2195+ /* retain count for new bios entries */
2196+ new_nr = new_bios_entry;
2197
2198 /* copy new bios mapping into original location */
2199- memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
2200+ memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
2201 *pnr_map = new_nr;
2202
2203 return 0;
2204@@ -601,7 +656,7 @@ static int __init sanitize_e820_map(stru
2205 * will have given us a memory map that we can use to properly
2206 * set up memory. If we aren't, we'll fake a memory map.
2207 */
2208-static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
2209+static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
2210 {
2211 #ifndef CONFIG_XEN
2212 /* Only one memory region (or negative)? Ignore it */
2213@@ -622,7 +677,7 @@ static int __init copy_e820_map(struct e
2214 return -1;
2215
2216 add_memory_region(start, size, type);
2217- } while (biosmap++,--nr_map);
2218+ } while (biosmap++, --nr_map);
2219
2220 #ifdef CONFIG_XEN
2221 if (is_initial_xendomain()) {
2222@@ -641,15 +696,17 @@ static int __init copy_e820_map(struct e
2223 return 0;
2224 }
2225
2226-void early_panic(char *msg)
2227+static void early_panic(char *msg)
2228 {
2229 early_printk(msg);
2230 panic(msg);
2231 }
2232
2233-#ifndef CONFIG_XEN
2234-void __init setup_memory_region(void)
2235+/* We're not void only for x86 32-bit compat */
2236+char * __init machine_specific_memory_setup(void)
2237 {
2238+#ifndef CONFIG_XEN
2239+ char *who = "BIOS-e820";
2240 /*
2241 * Try to copy the BIOS-supplied E820-map.
2242 *
2243@@ -659,14 +716,8 @@ void __init setup_memory_region(void)
2244 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
2245 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
2246 early_panic("Cannot find a valid memory map");
2247- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2248- e820_print_map("BIOS-e820");
2249-}
2250-
2251 #else /* CONFIG_XEN */
2252-
2253-void __init setup_memory_region(void)
2254-{
2255+ char *who = "Xen";
2256 int rc;
2257 struct xen_memory_map memmap;
2258 /*
2259@@ -694,11 +745,13 @@ void __init setup_memory_region(void)
2260
2261 if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
2262 early_panic("Cannot find a valid memory map");
2263-
2264+#endif
2265 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2266- e820_print_map("Xen");
2267+ e820_print_map(who);
2268+
2269+ /* In case someone cares... */
2270+ return who;
2271 }
2272-#endif
2273
2274 static int __init parse_memopt(char *p)
2275 {
2276@@ -709,7 +762,7 @@ static int __init parse_memopt(char *p)
2277 if (!p)
2278 return -EINVAL;
2279 end_user_pfn = memparse(p, &p);
2280- end_user_pfn >>= PAGE_SHIFT;
2281+ end_user_pfn >>= PAGE_SHIFT;
2282
2283 end = end_user_pfn<<PAGE_SHIFT;
2284 i = e820.nr_map-1;
2285@@ -727,7 +780,7 @@ static int __init parse_memopt(char *p)
2286 }
2287
2288 return 0;
2289-}
2290+}
2291 early_param("mem", parse_memopt);
2292
2293 static int userdef __initdata;
2294@@ -739,9 +792,9 @@ static int __init parse_memmap_opt(char
2295
2296 if (!strcmp(p, "exactmap")) {
2297 #ifdef CONFIG_CRASH_DUMP
2298- /* If we are doing a crash dump, we
2299- * still need to know the real mem
2300- * size before original memory map is
2301+ /*
2302+ * If we are doing a crash dump, we still need to know
2303+ * the real mem size before original memory map is
2304 * reset.
2305 */
2306 e820_register_active_regions(0, 0, -1UL);
2307@@ -758,6 +811,8 @@ static int __init parse_memmap_opt(char
2308 mem_size = memparse(p, &p);
2309 if (p == oldp)
2310 return -EINVAL;
2311+
2312+ userdef = 1;
2313 if (*p == '@') {
2314 start_at = memparse(p+1, &p);
2315 add_memory_region(start_at, mem_size, E820_RAM);
2316@@ -777,11 +832,58 @@ early_param("memmap", parse_memmap_opt);
2317 void __init finish_e820_parsing(void)
2318 {
2319 if (userdef) {
2320+ char nr = e820.nr_map;
2321+
2322+ if (sanitize_e820_map(e820.map, &nr) < 0)
2323+ early_panic("Invalid user supplied memory map");
2324+ e820.nr_map = nr;
2325+
2326 printk(KERN_INFO "user-defined physical RAM map:\n");
2327 e820_print_map("user");
2328 }
2329 }
2330
2331+#ifndef CONFIG_XEN
2332+void __init update_memory_range(u64 start, u64 size, unsigned old_type,
2333+ unsigned new_type)
2334+{
2335+ int i;
2336+
2337+ BUG_ON(old_type == new_type);
2338+
2339+ for (i = 0; i < e820.nr_map; i++) {
2340+ struct e820entry *ei = &e820.map[i];
2341+ u64 final_start, final_end;
2342+ if (ei->type != old_type)
2343+ continue;
2344+ /* totally covered? */
2345+ if (ei->addr >= start && ei->size <= size) {
2346+ ei->type = new_type;
2347+ continue;
2348+ }
2349+ /* partially covered */
2350+ final_start = max(start, ei->addr);
2351+ final_end = min(start + size, ei->addr + ei->size);
2352+ if (final_start >= final_end)
2353+ continue;
2354+ add_memory_region(final_start, final_end - final_start,
2355+ new_type);
2356+ }
2357+}
2358+
2359+void __init update_e820(void)
2360+{
2361+ u8 nr_map;
2362+
2363+ nr_map = e820.nr_map;
2364+ if (sanitize_e820_map(e820.map, &nr_map))
2365+ return;
2366+ e820.nr_map = nr_map;
2367+ printk(KERN_INFO "modified physical RAM map:\n");
2368+ e820_print_map("modified");
2369+}
2370+#endif
2371+
2372 unsigned long pci_mem_start = 0xaeedbabe;
2373 EXPORT_SYMBOL(pci_mem_start);
2374
2375@@ -825,8 +927,10 @@ __init void e820_setup_gap(struct e820en
2376
2377 if (!found) {
2378 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
2379- printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
2380- KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
2381+ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2382+ "address range\n"
2383+ KERN_ERR "PCI: Unassigned devices with 32bit resource "
2384+ "registers may break!\n");
2385 }
2386
2387 /*
2388@@ -839,8 +943,9 @@ __init void e820_setup_gap(struct e820en
2389 /* Fun with two's complement */
2390 pci_mem_start = (gapstart + round) & -round;
2391
2392- printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2393- pci_mem_start, gapstart, gapsize);
2394+ printk(KERN_INFO
2395+ "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2396+ pci_mem_start, gapstart, gapsize);
2397 }
2398
2399 int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
2400--- a/arch/x86/kernel/early_printk-xen.c
2401+++ b/arch/x86/kernel/early_printk-xen.c
2402@@ -222,7 +222,7 @@ static struct console simnow_console = {
2403 };
2404
2405 /* Direct interface for emergencies */
2406-struct console *early_console = &early_vga_console;
2407+static struct console *early_console = &early_vga_console;
2408 static int early_console_initialized = 0;
2409
2410 void early_printk(const char *fmt, ...)
2411--- a/arch/x86/kernel/entry_32-xen.S
2412+++ b/arch/x86/kernel/entry_32-xen.S
2413@@ -59,7 +59,7 @@
2414 * for paravirtualization. The following will never clobber any registers:
2415 * INTERRUPT_RETURN (aka. "iret")
2416 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
2417- * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
2418+ * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
2419 *
2420 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
2421 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
2422@@ -282,16 +282,21 @@ END(resume_kernel)
2423 #endif
2424 CFI_ENDPROC
2425
2426+ .macro test_tif ti_reg # system call tracing in operation / emulation
2427+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2428+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
2429+ .endm
2430+
2431 /* SYSENTER_RETURN points to after the "sysenter" instruction in
2432 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
2433
2434 # sysenter call handler stub
2435-ENTRY(sysenter_entry)
2436+ENTRY(ia32_sysenter_target)
2437 CFI_STARTPROC simple
2438 CFI_SIGNAL_FRAME
2439 CFI_DEF_CFA esp, 0
2440 CFI_REGISTER esp, ebp
2441- movl SYSENTER_stack_esp0(%esp),%esp
2442+ movl SYSENTER_stack_sp0(%esp),%esp
2443 sysenter_past_esp:
2444 /*
2445 * No need to follow this irqs on/off section: the syscall
2446@@ -334,9 +339,7 @@ sysenter_past_esp:
2447 CFI_ADJUST_CFA_OFFSET 4
2448 SAVE_ALL
2449 GET_THREAD_INFO(%ebp)
2450-
2451- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2452- testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2453+ test_tif %ebp
2454 jnz syscall_trace_entry
2455 cmpl $(nr_syscalls), %eax
2456 jae syscall_badsys
2457@@ -354,7 +357,7 @@ sysenter_past_esp:
2458 xorl %ebp,%ebp
2459 TRACE_IRQS_ON
2460 1: mov PT_FS(%esp), %fs
2461- ENABLE_INTERRUPTS_SYSEXIT
2462+ ENABLE_INTERRUPTS_SYSCALL_RET
2463 CFI_ENDPROC
2464 .pushsection .fixup,"ax"
2465 2: movl $0,PT_FS(%esp)
2466@@ -363,10 +366,10 @@ sysenter_past_esp:
2467 .align 4
2468 .long 1b,2b
2469 .popsection
2470-ENDPROC(sysenter_entry)
2471+ENDPROC(ia32_sysenter_target)
2472
2473 # pv sysenter call handler stub
2474-ENTRY(sysenter_entry_pv)
2475+ENTRY(ia32pv_sysenter_target)
2476 RING0_INT_FRAME
2477 movl $__USER_DS,16(%esp)
2478 movl %ebp,12(%esp)
2479@@ -389,7 +392,7 @@ ENTRY(sysenter_entry_pv)
2480 .previous
2481 /* fall through */
2482 CFI_ENDPROC
2483-ENDPROC(sysenter_entry_pv)
2484+ENDPROC(ia32pv_sysenter_target)
2485
2486 # system call handler stub
2487 ENTRY(system_call)
2488@@ -398,9 +401,7 @@ ENTRY(system_call)
2489 CFI_ADJUST_CFA_OFFSET 4
2490 SAVE_ALL
2491 GET_THREAD_INFO(%ebp)
2492- # system call tracing in operation / emulation
2493- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2494- testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2495+ test_tif %ebp
2496 jnz syscall_trace_entry
2497 cmpl $(nr_syscalls), %eax
2498 jae syscall_badsys
2499@@ -452,7 +453,8 @@ restore_nocheck_notrace:
2500 RESTORE_REGS
2501 addl $4, %esp # skip orig_eax/error_code
2502 CFI_ADJUST_CFA_OFFSET -4
2503-1: INTERRUPT_RETURN
2504+irq_return:
2505+ INTERRUPT_RETURN
2506 .section .fixup,"ax"
2507 iret_exc:
2508 pushl $0 # no error code
2509@@ -461,7 +463,7 @@ iret_exc:
2510 .previous
2511 .section __ex_table,"a"
2512 .align 4
2513- .long 1b,iret_exc
2514+ .long irq_return,iret_exc
2515 .previous
2516
2517 CFI_RESTORE_STATE
2518@@ -657,7 +659,7 @@ END(syscall_badsys)
2519 * Build the entry stubs and pointer table with
2520 * some assembler magic.
2521 */
2522-.data
2523+.section .rodata,"a"
2524 ENTRY(interrupt)
2525 .text
2526
2527@@ -959,7 +961,7 @@ END(device_not_available)
2528 * that sets up the real kernel stack. Check here, since we can't
2529 * allow the wrong stack to be used.
2530 *
2531- * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
2532+ * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
2533 * already pushed 3 words if it hits on the sysenter instruction:
2534 * eflags, cs and eip.
2535 *
2536@@ -971,7 +973,7 @@ END(device_not_available)
2537 cmpw $__KERNEL_CS,4(%esp); \
2538 jne ok; \
2539 label: \
2540- movl SYSENTER_stack_esp0+offset(%esp),%esp; \
2541+ movl SYSENTER_stack_sp0+offset(%esp),%esp; \
2542 CFI_DEF_CFA esp, 0; \
2543 CFI_UNDEFINED eip; \
2544 pushfl; \
2545@@ -986,7 +988,7 @@ label: \
2546 KPROBE_ENTRY(debug)
2547 RING0_INT_FRAME
2548 #ifndef CONFIG_XEN
2549- cmpl $sysenter_entry,(%esp)
2550+ cmpl $ia32_sysenter_target,(%esp)
2551 jne debug_stack_correct
2552 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
2553 debug_stack_correct:
2554@@ -1019,7 +1021,7 @@ KPROBE_ENTRY(nmi)
2555 popl %eax
2556 CFI_ADJUST_CFA_OFFSET -4
2557 je nmi_espfix_stack
2558- cmpl $sysenter_entry,(%esp)
2559+ cmpl $ia32_sysenter_target,(%esp)
2560 je nmi_stack_fixup
2561 pushl %eax
2562 CFI_ADJUST_CFA_OFFSET 4
2563@@ -1032,7 +1034,7 @@ KPROBE_ENTRY(nmi)
2564 popl %eax
2565 CFI_ADJUST_CFA_OFFSET -4
2566 jae nmi_stack_correct
2567- cmpl $sysenter_entry,12(%esp)
2568+ cmpl $ia32_sysenter_target,12(%esp)
2569 je nmi_debug_stack_check
2570 nmi_stack_correct:
2571 /* We have a RING0_INT_FRAME here */
2572@@ -1085,12 +1087,8 @@ nmi_espfix_stack:
2573 RESTORE_REGS
2574 lss 12+4(%esp), %esp # back to espfix stack
2575 CFI_ADJUST_CFA_OFFSET -24
2576-1: INTERRUPT_RETURN
2577+ jmp irq_return
2578 CFI_ENDPROC
2579-.section __ex_table,"a"
2580- .align 4
2581- .long 1b,iret_exc
2582-.previous
2583 #else
2584 KPROBE_ENTRY(nmi)
2585 RING0_INT_FRAME
2586@@ -1108,17 +1106,17 @@ KPROBE_END(nmi)
2587
2588 #ifdef CONFIG_PARAVIRT
2589 ENTRY(native_iret)
2590-1: iret
2591+ iret
2592 .section __ex_table,"a"
2593 .align 4
2594- .long 1b,iret_exc
2595+ .long native_iret, iret_exc
2596 .previous
2597 END(native_iret)
2598
2599-ENTRY(native_irq_enable_sysexit)
2600+ENTRY(native_irq_enable_syscall_ret)
2601 sti
2602 sysexit
2603-END(native_irq_enable_sysexit)
2604+END(native_irq_enable_syscall_ret)
2605 #endif
2606
2607 KPROBE_ENTRY(int3)
2608@@ -1267,7 +1265,144 @@ ENTRY(kernel_thread_helper)
2609 CFI_ENDPROC
2610 ENDPROC(kernel_thread_helper)
2611
2612+#include <asm/alternative-asm.h>
2613+
2614+ # pv syscall call handler stub
2615+ENTRY(ia32pv_cstar_target)
2616+ RING0_INT_FRAME
2617+ movl $__USER_DS,16(%esp)
2618+ movl %ebp,%ecx
2619+ movl $__USER_CS,4(%esp)
2620+ movl 12(%esp),%ebp
2621+ pushl %eax # save orig_eax
2622+ CFI_ADJUST_CFA_OFFSET 4
2623+/*
2624+ * Load the potential sixth argument from user stack.
2625+ * Careful about security.
2626+ */
2627+ cmpl $__PAGE_OFFSET-4,%ebp
2628+ CFI_REMEMBER_STATE
2629+ ja cstar_fault
2630+1: movl (%ebp),%ebp
2631+.section __ex_table,"a"
2632+ .align 4
2633+ .long 1b,cstar_fault
2634+.previous
2635+ SAVE_ALL
2636+ GET_THREAD_INFO(%ebp)
2637+ test_tif %ebp
2638+ jnz cstar_trace_entry
2639+ cmpl $nr_syscalls,%eax
2640+ jae cstar_badsys
2641+.Lcstar_call:
2642+ btl %eax,cstar_special
2643+ jc .Lcstar_special
2644+ call *cstar_call_table(,%eax,4)
2645+ movl %eax,PT_EAX(%esp) # store the return value
2646+.Lcstar_exit:
2647+ movl PT_ECX(%esp),%ecx
2648+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
2649+ jmp syscall_exit
2650+.Lcstar_special:
2651+ movl PT_ECX(%esp),%ecx
2652+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
2653+ jmp syscall_call
2654+cstar_set_tif:
2655+ movl $cstar_clear_tif,(%esp) # replace return address
2656+ LOCK_PREFIX
2657+ orl $_TIF_CSTAR,TI_flags(%ebp)
2658+ jmp *sys_call_table(,%eax,4)
2659+cstar_clear_tif:
2660+ movl %eax,PT_EAX(%esp) # store the return value
2661+ LOCK_PREFIX
2662+ andl $~_TIF_CSTAR,TI_flags(%ebp)
2663+ jmp .Lcstar_exit
2664+cstar_trace_entry:
2665+ movl $-ENOSYS,PT_EAX(%esp)
2666+ cmpl $nr_syscalls,%eax
2667+ jae 1f
2668+ btl %eax,cstar_special
2669+ jc .Lcstar_trace_special
2670+1: movl %esp,%eax
2671+ xorl %edx,%edx
2672+ LOCK_PREFIX
2673+ orl $_TIF_CSTAR,TI_flags(%ebp)
2674+ call do_syscall_trace
2675+ LOCK_PREFIX
2676+ andl $~_TIF_CSTAR,TI_flags(%ebp)
2677+ testl %eax,%eax
2678+ jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
2679+ # so must skip actual syscall
2680+ movl PT_ORIG_EAX(%esp),%eax
2681+ cmpl $nr_syscalls,%eax
2682+ jb .Lcstar_call
2683+ jmp .Lcstar_exit
2684+.Lcstar_trace_special:
2685+ movl PT_ECX(%esp),%ecx
2686+ movl %esp,%eax
2687+ xorl %edx,%edx
2688+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
2689+ call do_syscall_trace
2690+ testl %eax,%eax
2691+ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2692+ # so must skip actual syscall
2693+ movl PT_ORIG_EAX(%esp),%eax
2694+ cmpl $nr_syscalls,%eax
2695+ jb syscall_call
2696+ jmp syscall_exit
2697+cstar_badsys:
2698+ movl $-ENOSYS,PT_EAX(%esp)
2699+.Lcstar_resume:
2700+ movl PT_ECX(%esp),%ecx
2701+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
2702+ jmp resume_userspace
2703+ CFI_RESTORE_STATE
2704+cstar_fault:
2705+ movl $-EFAULT,%eax
2706+ SAVE_ALL
2707+ GET_THREAD_INFO(%ebp)
2708+ jmp .Lcstar_resume
2709+ CFI_ENDPROC
2710+ENDPROC(ia32pv_cstar_target)
2711+
2712+ENTRY(cstar_ret_from_fork)
2713+ CFI_STARTPROC
2714+ movl PT_ECX(%esp),%ecx
2715+ GET_THREAD_INFO(%ebp)
2716+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
2717+ LOCK_PREFIX
2718+ andl $~_TIF_CSTAR,TI_flags(%ebp)
2719+ jmp ret_from_fork
2720+ CFI_ENDPROC
2721+END(ret_from_fork)
2722+
2723 .section .rodata,"a"
2724 #include "syscall_table_32.S"
2725
2726 syscall_table_size=(.-sys_call_table)
2727+
2728+#include <asm/unistd.h>
2729+cstar_special:
2730+nr=0
2731+mask=0
2732+.rept nr_syscalls+31
2733+ .irp n, __NR_sigreturn, __NR_rt_sigreturn
2734+ .if nr == \n
2735+ mask = mask | (1 << (\n & 31))
2736+ .endif
2737+ .endr
2738+ nr = nr + 1
2739+ .if (nr & 31) == 0
2740+ .long mask
2741+ mask = 0
2742+ .endif
2743+.endr
2744+#define sys_call_table cstar_call_table
2745+#define sys_fork cstar_set_tif
2746+#define sys_clone cstar_set_tif
2747+#define sys_vfork cstar_set_tif
2748+#include "syscall_table_32.S"
2749+#undef sys_call_table
2750+#undef sys_fork
2751+#undef sys_clone
2752+#undef sys_vfork
2753--- a/arch/x86/kernel/entry_64-xen.S
2754+++ b/arch/x86/kernel/entry_64-xen.S
2755@@ -54,17 +54,22 @@
2756 #include <asm/page.h>
2757 #include <asm/irqflags.h>
2758 #include <asm/errno.h>
2759-#include <xen/interface/arch-x86_64.h>
2760+#include <xen/interface/xen.h>
2761 #include <xen/interface/features.h>
2762
2763-#include "xen_entry_64.S"
2764-
2765 .code64
2766
2767 #ifndef CONFIG_PREEMPT
2768 #define retint_kernel retint_restore_args
2769 #endif
2770
2771+#ifdef CONFIG_PARAVIRT
2772+ENTRY(native_irq_enable_syscall_ret)
2773+ movq %gs:pda_oldrsp,%rsp
2774+ swapgs
2775+ sysretq
2776+#endif /* CONFIG_PARAVIRT */
2777+
2778
2779 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
2780 #ifdef CONFIG_TRACE_IRQFLAGS
2781@@ -277,7 +282,7 @@ ret_from_sys_call:
2782 sysret_check:
2783 LOCKDEP_SYS_EXIT
2784 GET_THREAD_INFO(%rcx)
2785- XEN_BLOCK_EVENTS(%rsi)
2786+ DISABLE_INTERRUPTS(CLBR_NONE)
2787 TRACE_IRQS_OFF
2788 movl threadinfo_flags(%rcx),%edx
2789 andl %edi,%edx
2790@@ -287,7 +292,7 @@ sysret_check:
2791 * sysretq will re-enable interrupts:
2792 */
2793 TRACE_IRQS_ON
2794- XEN_UNBLOCK_EVENTS(%rsi)
2795+ ENABLE_INTERRUPTS(CLBR_NONE)
2796 RESTORE_ARGS 0,8,0
2797 HYPERVISOR_IRET VGCF_IN_SYSCALL
2798
2799@@ -298,7 +303,7 @@ sysret_careful:
2800 bt $TIF_NEED_RESCHED,%edx
2801 jnc sysret_signal
2802 TRACE_IRQS_ON
2803- XEN_UNBLOCK_EVENTS(%rsi)
2804+ ENABLE_INTERRUPTS(CLBR_NONE)
2805 pushq %rdi
2806 CFI_ADJUST_CFA_OFFSET 8
2807 call schedule
2808@@ -309,9 +314,8 @@ sysret_careful:
2809 /* Handle a signal */
2810 sysret_signal:
2811 TRACE_IRQS_ON
2812-/* sti */
2813- XEN_UNBLOCK_EVENTS(%rsi)
2814- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2815+ ENABLE_INTERRUPTS(CLBR_NONE)
2816+ testl $_TIF_DO_NOTIFY_MASK,%edx
2817 jz 1f
2818
2819 /* Really a signal */
2820@@ -323,7 +327,7 @@ sysret_signal:
2821 1: movl $_TIF_NEED_RESCHED,%edi
2822 /* Use IRET because user could have changed frame. This
2823 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
2824- XEN_BLOCK_EVENTS(%rsi)
2825+ DISABLE_INTERRUPTS(CLBR_NONE)
2826 TRACE_IRQS_OFF
2827 jmp int_with_check
2828
2829@@ -355,7 +359,7 @@ tracesys:
2830 */
2831 .globl int_ret_from_sys_call
2832 int_ret_from_sys_call:
2833- XEN_BLOCK_EVENTS(%rsi)
2834+ DISABLE_INTERRUPTS(CLBR_NONE)
2835 TRACE_IRQS_OFF
2836 testb $3,CS-ARGOFFSET(%rsp)
2837 jnz 1f
2838@@ -381,22 +385,20 @@ int_careful:
2839 bt $TIF_NEED_RESCHED,%edx
2840 jnc int_very_careful
2841 TRACE_IRQS_ON
2842-/* sti */
2843- XEN_UNBLOCK_EVENTS(%rsi)
2844+ ENABLE_INTERRUPTS(CLBR_NONE)
2845 pushq %rdi
2846 CFI_ADJUST_CFA_OFFSET 8
2847 call schedule
2848 popq %rdi
2849 CFI_ADJUST_CFA_OFFSET -8
2850- XEN_BLOCK_EVENTS(%rsi)
2851+ DISABLE_INTERRUPTS(CLBR_NONE)
2852 TRACE_IRQS_OFF
2853 jmp int_with_check
2854
2855 /* handle signals and tracing -- both require a full stack frame */
2856 int_very_careful:
2857 TRACE_IRQS_ON
2858-/* sti */
2859- XEN_UNBLOCK_EVENTS(%rsi)
2860+ ENABLE_INTERRUPTS(CLBR_NONE)
2861 SAVE_REST
2862 /* Check for syscall exit trace */
2863 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
2864@@ -411,7 +413,7 @@ int_very_careful:
2865 jmp int_restore_rest
2866
2867 int_signal:
2868- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2869+ testl $_TIF_DO_NOTIFY_MASK,%edx
2870 jz 1f
2871 movq %rsp,%rdi # &ptregs -> arg1
2872 xorl %esi,%esi # oldset -> arg2
2873@@ -419,7 +421,7 @@ int_signal:
2874 1: movl $_TIF_NEED_RESCHED,%edi
2875 int_restore_rest:
2876 RESTORE_REST
2877- XEN_BLOCK_EVENTS(%rsi)
2878+ DISABLE_INTERRUPTS(CLBR_NONE)
2879 TRACE_IRQS_OFF
2880 jmp int_with_check
2881 CFI_ENDPROC
2882@@ -474,6 +476,7 @@ ENTRY(stub_execve)
2883 CFI_REGISTER rip, r11
2884 SAVE_REST
2885 FIXUP_TOP_OF_STACK %r11
2886+ movq %rsp, %rcx
2887 call sys_execve
2888 RESTORE_TOP_OF_STACK %r11
2889 movq %rax,RAX(%rsp)
2890@@ -526,11 +529,10 @@ retint_check:
2891 retint_restore_args: /* return to kernel space */
2892 movl EFLAGS-REST_SKIP(%rsp), %eax
2893 shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
2894- XEN_GET_VCPU_INFO(%rsi)
2895+ GET_VCPU_INFO
2896 andb evtchn_upcall_mask(%rsi),%al
2897 andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
2898 jnz restore_all_enable_events # != 0 => enable event delivery
2899- XEN_PUT_VCPU_INFO(%rsi)
2900
2901 RESTORE_ARGS 0,8,0
2902 HYPERVISOR_IRET 0
2903@@ -541,31 +543,29 @@ retint_careful:
2904 bt $TIF_NEED_RESCHED,%edx
2905 jnc retint_signal
2906 TRACE_IRQS_ON
2907- XEN_UNBLOCK_EVENTS(%rsi)
2908-/* sti */
2909+ ENABLE_INTERRUPTS(CLBR_NONE)
2910 pushq %rdi
2911 CFI_ADJUST_CFA_OFFSET 8
2912 call schedule
2913 popq %rdi
2914 CFI_ADJUST_CFA_OFFSET -8
2915 GET_THREAD_INFO(%rcx)
2916- XEN_BLOCK_EVENTS(%rsi)
2917-/* cli */
2918+ DISABLE_INTERRUPTS(CLBR_NONE)
2919 TRACE_IRQS_OFF
2920 jmp retint_check
2921
2922 retint_signal:
2923- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2924+ testl $_TIF_DO_NOTIFY_MASK,%edx
2925 jz retint_restore_args
2926 TRACE_IRQS_ON
2927- XEN_UNBLOCK_EVENTS(%rsi)
2928+ ENABLE_INTERRUPTS(CLBR_NONE)
2929 SAVE_REST
2930 movq $-1,ORIG_RAX(%rsp)
2931 xorl %esi,%esi # oldset
2932 movq %rsp,%rdi # &pt_regs
2933 call do_notify_resume
2934 RESTORE_REST
2935- XEN_BLOCK_EVENTS(%rsi)
2936+ DISABLE_INTERRUPTS(CLBR_NONE)
2937 TRACE_IRQS_OFF
2938 movl $_TIF_NEED_RESCHED,%edi
2939 GET_THREAD_INFO(%rcx)
2940@@ -702,7 +702,7 @@ END(spurious_interrupt)
2941 rdmsr
2942 testl %edx,%edx
2943 js 1f
2944- swapgs
2945+ SWAPGS
2946 xorl %ebx,%ebx
2947 1:
2948 #endif
2949@@ -719,8 +719,7 @@ END(spurious_interrupt)
2950 .if \ist
2951 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
2952 .endif
2953-/* cli */
2954- XEN_BLOCK_EVENTS(%rsi)
2955+ DISABLE_INTERRUPTS(CLBR_NONE)
2956 .if \irqtrace
2957 TRACE_IRQS_OFF
2958 .endif
2959@@ -749,10 +748,10 @@ paranoid_swapgs\trace:
2960 .if \trace
2961 TRACE_IRQS_IRETQ 0
2962 .endif
2963- swapgs
2964+ SWAPGS_UNSAFE_STACK
2965 paranoid_restore\trace:
2966 RESTORE_ALL 8
2967- iretq
2968+ jmp irq_return
2969 paranoid_userspace\trace:
2970 GET_THREAD_INFO(%rcx)
2971 movl threadinfo_flags(%rcx),%ebx
2972@@ -767,11 +766,11 @@ paranoid_userspace\trace:
2973 .if \trace
2974 TRACE_IRQS_ON
2975 .endif
2976- sti
2977+ ENABLE_INTERRUPTS(CLBR_NONE)
2978 xorl %esi,%esi /* arg2: oldset */
2979 movq %rsp,%rdi /* arg1: &pt_regs */
2980 call do_notify_resume
2981- cli
2982+ DISABLE_INTERRUPTS(CLBR_NONE)
2983 .if \trace
2984 TRACE_IRQS_OFF
2985 .endif
2986@@ -780,9 +779,9 @@ paranoid_schedule\trace:
2987 .if \trace
2988 TRACE_IRQS_ON
2989 .endif
2990- sti
2991+ ENABLE_INTERRUPTS(CLBR_ANY)
2992 call schedule
2993- cli
2994+ DISABLE_INTERRUPTS(CLBR_ANY)
2995 .if \trace
2996 TRACE_IRQS_OFF
2997 .endif
2998@@ -846,8 +845,7 @@ error_call_handler:
2999 call *%rax
3000 error_exit:
3001 RESTORE_REST
3002-/* cli */
3003- XEN_BLOCK_EVENTS(%rsi)
3004+ DISABLE_INTERRUPTS(CLBR_NONE)
3005 TRACE_IRQS_OFF
3006 GET_THREAD_INFO(%rcx)
3007 testb $3,CS-ARGOFFSET(%rsp)
3008@@ -875,7 +873,7 @@ error_kernelspace:
3009 iret run with kernel gs again, so don't set the user space flag.
3010 B stepping K8s sometimes report an truncated RIP for IRET
3011 exceptions returning to compat mode. Check for these here too. */
3012- leaq iret_label(%rip),%rbp
3013+ leaq irq_return(%rip),%rbp
3014 cmpq %rbp,RIP(%rsp)
3015 je error_swapgs
3016 movl %ebp,%ebp /* zero extend */
3017@@ -930,19 +928,17 @@ END(do_hypervisor_callback)
3018 restore_all_enable_events:
3019 CFI_DEFAULT_STACK adj=1
3020 TRACE_IRQS_ON
3021- XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
3022+ __ENABLE_INTERRUPTS
3023
3024 scrit: /**** START OF CRITICAL REGION ****/
3025- XEN_TEST_PENDING(%rsi)
3026+ __TEST_PENDING
3027 CFI_REMEMBER_STATE
3028 jnz 14f # process more events if necessary...
3029- XEN_PUT_VCPU_INFO(%rsi)
3030 RESTORE_ARGS 0,8,0
3031 HYPERVISOR_IRET 0
3032
3033 CFI_RESTORE_STATE
3034-14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
3035- XEN_PUT_VCPU_INFO(%rsi)
3036+14: __DISABLE_INTERRUPTS
3037 SAVE_REST
3038 movq %rsp,%rdi # set the argument again
3039 jmp 11b
3040@@ -1086,15 +1082,16 @@ ENDPROC(child_rip)
3041 * rdi: name, rsi: argv, rdx: envp
3042 *
3043 * We want to fallback into:
3044- * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
3045+ * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
3046 *
3047 * do_sys_execve asm fallback arguments:
3048- * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
3049+ * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
3050 */
3051 ENTRY(kernel_execve)
3052 CFI_STARTPROC
3053 FAKE_STACK_FRAME $0
3054 SAVE_ALL
3055+ movq %rsp,%rcx
3056 call sys_execve
3057 movq %rax, RAX(%rsp)
3058 RESTORE_REST
3059@@ -1144,7 +1141,7 @@ do_nmi_callback:
3060 call do_nmi
3061 orl $NMI_MASK,EFLAGS(%rsp)
3062 RESTORE_REST
3063- XEN_BLOCK_EVENTS(%rsi)
3064+ DISABLE_INTERRUPTS(CLBR_NONE)
3065 TRACE_IRQS_OFF
3066 GET_THREAD_INFO(%rcx)
3067 jmp retint_restore_args
3068--- a/arch/x86/kernel/fixup.c
3069+++ b/arch/x86/kernel/fixup.c
3070@@ -36,7 +36,7 @@
3071
3072 #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
3073
3074-fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
3075+void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
3076 {
3077 static unsigned long printed = 0;
3078 char info[100];
3079--- a/arch/x86/kernel/genapic_64-xen.c
3080+++ b/arch/x86/kernel/genapic_64-xen.c
3081@@ -24,20 +24,13 @@
3082 #include <acpi/acpi_bus.h>
3083 #endif
3084
3085-/*
3086- * which logical CPU number maps to which CPU (physical APIC ID)
3087- *
3088- * The following static array is used during kernel startup
3089- * and the x86_cpu_to_apicid_ptr contains the address of the
3090- * array during this time. Is it zeroed when the per_cpu
3091- * data area is removed.
3092- */
3093+/* which logical CPU number maps to which CPU (physical APIC ID) */
3094 #ifndef CONFIG_XEN
3095-u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
3096+u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
3097 = { [0 ... NR_CPUS-1] = BAD_APICID };
3098-void *x86_cpu_to_apicid_ptr;
3099+void *x86_cpu_to_apicid_early_ptr;
3100 #endif
3101-DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
3102+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
3103 EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
3104
3105 #ifndef CONFIG_XEN
3106--- a/arch/x86/kernel/head_32-xen.S
3107+++ b/arch/x86/kernel/head_32-xen.S
3108@@ -3,6 +3,7 @@
3109 .text
3110 #include <linux/elfnote.h>
3111 #include <linux/threads.h>
3112+#include <linux/init.h>
3113 #include <linux/linkage.h>
3114 #include <asm/segment.h>
3115 #include <asm/page.h>
3116@@ -88,7 +89,7 @@ ENTRY(_stext)
3117 */
3118 .section ".bss.page_aligned","wa"
3119 .align PAGE_SIZE_asm
3120-ENTRY(swapper_pg_pmd)
3121+ENTRY(swapper_pg_fixmap)
3122 .fill 1024,4,0
3123 ENTRY(empty_zero_page)
3124 .fill 4096,1,0
3125--- a/arch/x86/kernel/head64-xen.c
3126+++ b/arch/x86/kernel/head64-xen.c
3127@@ -16,6 +16,7 @@
3128 #include <linux/kernel.h>
3129 #include <linux/string.h>
3130 #include <linux/percpu.h>
3131+#include <linux/start_kernel.h>
3132 #include <linux/module.h>
3133
3134 #include <asm/processor.h>
3135@@ -26,6 +27,8 @@
3136 #include <asm/pgtable.h>
3137 #include <asm/tlbflush.h>
3138 #include <asm/sections.h>
3139+#include <asm/kdebug.h>
3140+#include <asm/e820.h>
3141
3142 unsigned long start_pfn;
3143
3144@@ -34,7 +37,7 @@ static void __init zap_identity_mappings
3145 {
3146 pgd_t *pgd = pgd_offset_k(0UL);
3147 pgd_clear(pgd);
3148- __flush_tlb();
3149+ __flush_tlb_all();
3150 }
3151
3152 /* Don't add a printk in there. printk relies on the PDA which is not initialized
3153@@ -72,6 +75,37 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
3154 unsigned int machine_to_phys_order;
3155 EXPORT_SYMBOL(machine_to_phys_order);
3156
3157+#define EBDA_ADDR_POINTER 0x40E
3158+
3159+static __init void reserve_ebda(void)
3160+{
3161+#ifndef CONFIG_XEN
3162+ unsigned ebda_addr, ebda_size;
3163+
3164+ /*
3165+ * there is a real-mode segmented pointer pointing to the
3166+ * 4K EBDA area at 0x40E
3167+ */
3168+ ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
3169+ ebda_addr <<= 4;
3170+
3171+ if (!ebda_addr)
3172+ return;
3173+
3174+ ebda_size = *(unsigned short *)__va(ebda_addr);
3175+
3176+ /* Round EBDA up to pages */
3177+ if (ebda_size == 0)
3178+ ebda_size = 1;
3179+ ebda_size <<= 10;
3180+ ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
3181+ if (ebda_size > 64*1024)
3182+ ebda_size = 64*1024;
3183+
3184+ reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
3185+#endif
3186+}
3187+
3188 void __init x86_64_start_kernel(char * real_mode_data)
3189 {
3190 struct xen_machphys_mapping mapping;
3191@@ -103,8 +137,16 @@ void __init x86_64_start_kernel(char * r
3192 /* Make NULL pointers segfault */
3193 zap_identity_mappings();
3194
3195- for (i = 0; i < IDT_ENTRIES; i++)
3196+ /* Cleanup the over mapped high alias */
3197+ cleanup_highmap();
3198+
3199+ for (i = 0; i < IDT_ENTRIES; i++) {
3200+#ifdef CONFIG_EARLY_PRINTK
3201+ set_intr_gate(i, &early_idt_handlers[i]);
3202+#else
3203 set_intr_gate(i, early_idt_handler);
3204+#endif
3205+ }
3206 load_idt((const struct desc_ptr *)&idt_descr);
3207 #endif
3208
3209@@ -115,8 +157,19 @@ void __init x86_64_start_kernel(char * r
3210
3211 pda_init(0);
3212 copy_bootdata(__va(real_mode_data));
3213-#ifdef CONFIG_SMP
3214- cpu_set(0, cpu_online_map);
3215-#endif
3216+
3217+ reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
3218+
3219+ reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
3220+ start_pfn << PAGE_SHIFT, "Xen provided");
3221+
3222+ reserve_ebda();
3223+
3224+ /*
3225+ * At this point everything still needed from the boot loader
3226+ * or BIOS or kernel text should be early reserved or marked not
3227+ * RAM in e820. All other memory is free game.
3228+ */
3229+
3230 start_kernel();
3231 }
3232--- a/arch/x86/kernel/init_task-xen.c
3233+++ b/arch/x86/kernel/init_task-xen.c
3234@@ -19,7 +19,7 @@ static struct sighand_struct init_sighan
3235 #endif
3236 struct mm_struct init_mm = INIT_MM(init_mm);
3237 #undef swapper_pg_dir
3238-EXPORT_SYMBOL(init_mm);
3239+EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
3240
3241 /*
3242 * Initial thread structure.
3243--- a/arch/x86/kernel/io_apic_32-xen.c
3244+++ b/arch/x86/kernel/io_apic_32-xen.c
3245@@ -35,6 +35,7 @@
3246 #include <linux/htirq.h>
3247 #include <linux/freezer.h>
3248 #include <linux/kthread.h>
3249+#include <linux/jiffies.h> /* time_after() */
3250
3251 #include <asm/io.h>
3252 #include <asm/smp.h>
3253@@ -48,8 +49,6 @@
3254 #include <mach_apic.h>
3255 #include <mach_apicdef.h>
3256
3257-#include "io_ports.h"
3258-
3259 #ifdef CONFIG_XEN
3260 #include <xen/interface/xen.h>
3261 #include <xen/interface/physdev.h>
3262@@ -400,7 +399,7 @@ static void set_ioapic_affinity_irq(unsi
3263 # include <asm/processor.h> /* kernel_thread() */
3264 # include <linux/kernel_stat.h> /* kstat */
3265 # include <linux/slab.h> /* kmalloc() */
3266-# include <linux/timer.h> /* time_after() */
3267+# include <linux/timer.h>
3268
3269 #define IRQBALANCE_CHECK_ARCH -999
3270 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
3271@@ -777,7 +776,7 @@ late_initcall(balanced_irq_init);
3272 #endif
3273
3274 #ifndef CONFIG_SMP
3275-void fastcall send_IPI_self(int vector)
3276+void send_IPI_self(int vector)
3277 {
3278 #ifndef CONFIG_XEN
3279 unsigned int cfg;
3280@@ -1959,7 +1958,7 @@ static int __init timer_irq_works(void)
3281 * might have cached one ExtINT interrupt. Finally, at
3282 * least one tick may be lost due to delays.
3283 */
3284- if (jiffies - t1 > 4)
3285+ if (time_after(jiffies, t1 + 4))
3286 return 1;
3287
3288 return 0;
3289@@ -2142,7 +2141,7 @@ static struct irq_chip lapic_chip __read
3290 .eoi = ack_apic,
3291 };
3292
3293-static void setup_nmi (void)
3294+static void __init setup_nmi(void)
3295 {
3296 /*
3297 * Dirty trick to enable the NMI watchdog ...
3298@@ -2155,7 +2154,7 @@ static void setup_nmi (void)
3299 */
3300 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
3301
3302- on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
3303+ enable_NMI_through_LVT0();
3304
3305 apic_printk(APIC_VERBOSE, " done.\n");
3306 }
3307@@ -2479,7 +2478,7 @@ static int ioapic_resume(struct sys_devi
3308 }
3309
3310 static struct sysdev_class ioapic_sysdev_class = {
3311- set_kset_name("ioapic"),
3312+ .name = "ioapic",
3313 .suspend = ioapic_suspend,
3314 .resume = ioapic_resume,
3315 };
3316--- a/arch/x86/kernel/io_apic_64-xen.c
3317+++ b/arch/x86/kernel/io_apic_64-xen.c
3318@@ -32,9 +32,11 @@
3319 #include <linux/msi.h>
3320 #include <linux/htirq.h>
3321 #include <linux/dmar.h>
3322+#include <linux/jiffies.h>
3323 #ifdef CONFIG_ACPI
3324 #include <acpi/acpi_bus.h>
3325 #endif
3326+#include <linux/bootmem.h>
3327
3328 #include <asm/idle.h>
3329 #include <asm/io.h>
3330@@ -1064,7 +1066,7 @@ void __apicdebuginit print_local_APIC(vo
3331 v = apic_read(APIC_LVR);
3332 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
3333 ver = GET_APIC_VERSION(v);
3334- maxlvt = get_maxlvt();
3335+ maxlvt = lapic_get_maxlvt();
3336
3337 v = apic_read(APIC_TASKPRI);
3338 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
3339@@ -1165,7 +1167,7 @@ void __apicdebuginit print_PIC(void)
3340 }
3341 #endif /* !CONFIG_XEN */
3342
3343-static void __init enable_IO_APIC(void)
3344+void __init enable_IO_APIC(void)
3345 {
3346 union IO_APIC_reg_01 reg_01;
3347 #ifndef CONFIG_XEN
3348@@ -1299,7 +1301,7 @@ static int __init timer_irq_works(void)
3349 */
3350
3351 /* jiffies wrap? */
3352- if (jiffies - t1 > 4)
3353+ if (time_after(jiffies, t1 + 4))
3354 return 1;
3355 return 0;
3356 }
3357@@ -1412,7 +1414,7 @@ static void irq_complete_move(unsigned i
3358 if (likely(!cfg->move_in_progress))
3359 return;
3360
3361- vector = ~get_irq_regs()->orig_rax;
3362+ vector = ~get_irq_regs()->orig_ax;
3363 me = smp_processor_id();
3364 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
3365 cpumask_t cleanup_mask;
3366@@ -1439,7 +1441,7 @@ static void ack_apic_level(unsigned int
3367 int do_unmask_irq = 0;
3368
3369 irq_complete_move(irq);
3370-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
3371+#ifdef CONFIG_GENERIC_PENDING_IRQ
3372 /* If we are moving the irq we need to mask it */
3373 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
3374 do_unmask_irq = 1;
3375@@ -1570,7 +1572,7 @@ static struct hw_interrupt_type lapic_ir
3376 .end = end_lapic_irq,
3377 };
3378
3379-static void setup_nmi (void)
3380+static void __init setup_nmi(void)
3381 {
3382 /*
3383 * Dirty trick to enable the NMI watchdog ...
3384@@ -1583,7 +1585,7 @@ static void setup_nmi (void)
3385 */
3386 printk(KERN_INFO "activating NMI Watchdog ...");
3387
3388- enable_NMI_through_LVT0(NULL);
3389+ enable_NMI_through_LVT0();
3390
3391 printk(" done.\n");
3392 }
3393@@ -1659,7 +1661,7 @@ static inline void unlock_ExtINT_logic(v
3394 *
3395 * FIXME: really need to revamp this for modern platforms only.
3396 */
3397-static inline void check_timer(void)
3398+static inline void __init check_timer(void)
3399 {
3400 struct irq_cfg *cfg = irq_cfg + 0;
3401 int apic1, pin1, apic2, pin2;
3402@@ -1863,7 +1865,7 @@ static int ioapic_resume(struct sys_devi
3403 }
3404
3405 static struct sysdev_class ioapic_sysdev_class = {
3406- set_kset_name("ioapic"),
3407+ .name = "ioapic",
3408 .suspend = ioapic_suspend,
3409 .resume = ioapic_resume,
3410 };
3411@@ -2303,5 +2305,93 @@ void __init setup_ioapic_dest(void)
3412 }
3413 }
3414 #endif
3415-#endif /* !CONFIG_XEN */
3416
3417+#define IOAPIC_RESOURCE_NAME_SIZE 11
3418+
3419+static struct resource *ioapic_resources;
3420+
3421+static struct resource * __init ioapic_setup_resources(void)
3422+{
3423+ unsigned long n;
3424+ struct resource *res;
3425+ char *mem;
3426+ int i;
3427+
3428+ if (nr_ioapics <= 0)
3429+ return NULL;
3430+
3431+ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
3432+ n *= nr_ioapics;
3433+
3434+ mem = alloc_bootmem(n);
3435+ res = (void *)mem;
3436+
3437+ if (mem != NULL) {
3438+ memset(mem, 0, n);
3439+ mem += sizeof(struct resource) * nr_ioapics;
3440+
3441+ for (i = 0; i < nr_ioapics; i++) {
3442+ res[i].name = mem;
3443+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3444+ sprintf(mem, "IOAPIC %u", i);
3445+ mem += IOAPIC_RESOURCE_NAME_SIZE;
3446+ }
3447+ }
3448+
3449+ ioapic_resources = res;
3450+
3451+ return res;
3452+}
3453+
3454+void __init ioapic_init_mappings(void)
3455+{
3456+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
3457+ struct resource *ioapic_res;
3458+ int i;
3459+
3460+ ioapic_res = ioapic_setup_resources();
3461+ for (i = 0; i < nr_ioapics; i++) {
3462+ if (smp_found_config) {
3463+ ioapic_phys = mp_ioapics[i].mpc_apicaddr;
3464+ } else {
3465+ ioapic_phys = (unsigned long)
3466+ alloc_bootmem_pages(PAGE_SIZE);
3467+ ioapic_phys = __pa(ioapic_phys);
3468+ }
3469+ set_fixmap_nocache(idx, ioapic_phys);
3470+ apic_printk(APIC_VERBOSE,
3471+ "mapped IOAPIC to %016lx (%016lx)\n",
3472+ __fix_to_virt(idx), ioapic_phys);
3473+ idx++;
3474+
3475+ if (ioapic_res != NULL) {
3476+ ioapic_res->start = ioapic_phys;
3477+ ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
3478+ ioapic_res++;
3479+ }
3480+ }
3481+}
3482+
3483+static int __init ioapic_insert_resources(void)
3484+{
3485+ int i;
3486+ struct resource *r = ioapic_resources;
3487+
3488+ if (!r) {
3489+ printk(KERN_ERR
3490+ "IO APIC resources could be not be allocated.\n");
3491+ return -1;
3492+ }
3493+
3494+ for (i = 0; i < nr_ioapics; i++) {
3495+ insert_resource(&iomem_resource, r);
3496+ r++;
3497+ }
3498+
3499+ return 0;
3500+}
3501+
3502+/* Insert the IO APIC resources after PCI initialization has occured to handle
3503+ * IO APICS that are mapped in on a BAR in PCI space. */
3504+late_initcall(ioapic_insert_resources);
3505+#endif /* !CONFIG_XEN */
3506--- a/arch/x86/kernel/ioport_32-xen.c
3507+++ /dev/null
3508@@ -1,121 +0,0 @@
3509-/*
3510- * This contains the io-permission bitmap code - written by obz, with changes
3511- * by Linus.
3512- */
3513-
3514-#include <linux/sched.h>
3515-#include <linux/kernel.h>
3516-#include <linux/capability.h>
3517-#include <linux/errno.h>
3518-#include <linux/types.h>
3519-#include <linux/ioport.h>
3520-#include <linux/smp.h>
3521-#include <linux/stddef.h>
3522-#include <linux/slab.h>
3523-#include <linux/thread_info.h>
3524-#include <linux/syscalls.h>
3525-#include <xen/interface/physdev.h>
3526-
3527-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3528-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3529-{
3530- unsigned long mask;
3531- unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
3532- unsigned int low_index = base & (BITS_PER_LONG-1);
3533- int length = low_index + extent;
3534-
3535- if (low_index != 0) {
3536- mask = (~0UL << low_index);
3537- if (length < BITS_PER_LONG)
3538- mask &= ~(~0UL << length);
3539- if (new_value)
3540- *bitmap_base++ |= mask;
3541- else
3542- *bitmap_base++ &= ~mask;
3543- length -= BITS_PER_LONG;
3544- }
3545-
3546- mask = (new_value ? ~0UL : 0UL);
3547- while (length >= BITS_PER_LONG) {
3548- *bitmap_base++ = mask;
3549- length -= BITS_PER_LONG;
3550- }
3551-
3552- if (length > 0) {
3553- mask = ~(~0UL << length);
3554- if (new_value)
3555- *bitmap_base++ |= mask;
3556- else
3557- *bitmap_base++ &= ~mask;
3558- }
3559-}
3560-
3561-
3562-/*
3563- * this changes the io permissions bitmap in the current task.
3564- */
3565-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3566-{
3567- struct thread_struct * t = &current->thread;
3568- unsigned long *bitmap;
3569- struct physdev_set_iobitmap set_iobitmap;
3570-
3571- if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3572- return -EINVAL;
3573- if (turn_on && !capable(CAP_SYS_RAWIO))
3574- return -EPERM;
3575-
3576- /*
3577- * If it's the first ioperm() call in this thread's lifetime, set the
3578- * IO bitmap up. ioperm() is much less timing critical than clone(),
3579- * this is why we delay this operation until now:
3580- */
3581- if (!t->io_bitmap_ptr) {
3582- bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3583- if (!bitmap)
3584- return -ENOMEM;
3585-
3586- memset(bitmap, 0xff, IO_BITMAP_BYTES);
3587- t->io_bitmap_ptr = bitmap;
3588- set_thread_flag(TIF_IO_BITMAP);
3589-
3590- set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3591- set_iobitmap.nr_ports = IO_BITMAP_BITS;
3592- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3593- &set_iobitmap));
3594- }
3595-
3596- set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3597-
3598- return 0;
3599-}
3600-
3601-/*
3602- * sys_iopl has to be used when you want to access the IO ports
3603- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3604- * you'd need 8kB of bitmaps/process, which is a bit excessive.
3605- *
3606- * Here we just change the eflags value on the stack: we allow
3607- * only the super-user to do it. This depends on the stack-layout
3608- * on system-call entry - see also fork() and the signal handling
3609- * code.
3610- */
3611-
3612-asmlinkage long sys_iopl(unsigned long unused)
3613-{
3614- volatile struct pt_regs * regs = (struct pt_regs *) &unused;
3615- unsigned int level = regs->ebx;
3616- struct thread_struct *t = &current->thread;
3617- unsigned int old = (t->iopl >> 12) & 3;
3618-
3619- if (level > 3)
3620- return -EINVAL;
3621- /* Trying to gain more privileges? */
3622- if (level > old) {
3623- if (!capable(CAP_SYS_RAWIO))
3624- return -EPERM;
3625- }
3626- t->iopl = level << 12;
3627- set_iopl_mask(t->iopl);
3628- return 0;
3629-}
3630--- a/arch/x86/kernel/ioport_64-xen.c
3631+++ /dev/null
3632@@ -1,99 +0,0 @@
3633-/*
3634- * This contains the io-permission bitmap code - written by obz, with changes
3635- * by Linus.
3636- */
3637-
3638-#include <linux/sched.h>
3639-#include <linux/kernel.h>
3640-#include <linux/capability.h>
3641-#include <linux/errno.h>
3642-#include <linux/types.h>
3643-#include <linux/ioport.h>
3644-#include <linux/mm.h>
3645-#include <linux/smp.h>
3646-#include <linux/stddef.h>
3647-#include <linux/slab.h>
3648-#include <linux/thread_info.h>
3649-#include <linux/syscalls.h>
3650-#include <xen/interface/physdev.h>
3651-
3652-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3653-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3654-{
3655- int i;
3656-
3657- if (new_value)
3658- for (i = base; i < base + extent; i++)
3659- __set_bit(i, bitmap);
3660- else
3661- for (i = base; i < base + extent; i++)
3662- clear_bit(i, bitmap);
3663-}
3664-
3665-/*
3666- * this changes the io permissions bitmap in the current task.
3667- */
3668-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3669-{
3670- struct thread_struct * t = &current->thread;
3671- unsigned long *bitmap;
3672- struct physdev_set_iobitmap set_iobitmap;
3673-
3674- if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3675- return -EINVAL;
3676- if (turn_on && !capable(CAP_SYS_RAWIO))
3677- return -EPERM;
3678-
3679- /*
3680- * If it's the first ioperm() call in this thread's lifetime, set the
3681- * IO bitmap up. ioperm() is much less timing critical than clone(),
3682- * this is why we delay this operation until now:
3683- */
3684- if (!t->io_bitmap_ptr) {
3685- bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3686- if (!bitmap)
3687- return -ENOMEM;
3688-
3689- memset(bitmap, 0xff, IO_BITMAP_BYTES);
3690- t->io_bitmap_ptr = bitmap;
3691- set_thread_flag(TIF_IO_BITMAP);
3692-
3693- set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3694- set_iobitmap.nr_ports = IO_BITMAP_BITS;
3695- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3696- &set_iobitmap));
3697- }
3698-
3699- set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3700-
3701- return 0;
3702-}
3703-
3704-/*
3705- * sys_iopl has to be used when you want to access the IO ports
3706- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3707- * you'd need 8kB of bitmaps/process, which is a bit excessive.
3708- *
3709- */
3710-
3711-asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
3712-{
3713- unsigned int old_iopl = current->thread.iopl;
3714- struct physdev_set_iopl set_iopl;
3715-
3716- if (new_iopl > 3)
3717- return -EINVAL;
3718-
3719- /* Need "raw I/O" privileges for direct port access. */
3720- if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
3721- return -EPERM;
3722-
3723- /* Change our version of the privilege levels. */
3724- current->thread.iopl = new_iopl;
3725-
3726- /* Force the change at ring 0. */
3727- set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
3728- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
3729-
3730- return 0;
3731-}
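
The "8kB of bitmaps/process" figure quoted in the comment above follows directly from the bitmap layout: one permission bit per I/O port, 65536 ports in total. An illustrative arithmetic sketch (editorial, not part of the patch; the EX_ names are made up):

/* Editorial sketch only -- matches the 8 kB/process figure in the comment. */
#define EX_IO_BITMAP_BITS   65536                      /* ports 0x0000..0xffff    */
#define EX_IO_BITMAP_BYTES  (EX_IO_BITMAP_BITS / 8)    /* = 8192 bytes, i.e. 8 kB */
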
3732--- /dev/null
3733+++ b/arch/x86/kernel/ioport-xen.c
3734@@ -0,0 +1,112 @@
3735+/*
3736+ * This contains the io-permission bitmap code - written by obz, with changes
3737+ * by Linus. 32/64 bits code unification by Miguel Botón.
3738+ */
3739+
3740+#include <linux/sched.h>
3741+#include <linux/kernel.h>
3742+#include <linux/capability.h>
3743+#include <linux/errno.h>
3744+#include <linux/types.h>
3745+#include <linux/ioport.h>
3746+#include <linux/smp.h>
3747+#include <linux/stddef.h>
3748+#include <linux/slab.h>
3749+#include <linux/thread_info.h>
3750+#include <linux/syscalls.h>
3751+#include <xen/interface/physdev.h>
3752+
3753+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3754+static void set_bitmap(unsigned long *bitmap, unsigned int base,
3755+ unsigned int extent, int new_value)
3756+{
3757+ unsigned int i;
3758+
3759+ for (i = base; i < base + extent; i++) {
3760+ if (new_value)
3761+ __set_bit(i, bitmap);
3762+ else
3763+ __clear_bit(i, bitmap);
3764+ }
3765+}
3766+
3767+/*
3768+ * this changes the io permissions bitmap in the current task.
3769+ */
3770+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3771+{
3772+ struct thread_struct * t = &current->thread;
3773+ struct physdev_set_iobitmap set_iobitmap;
3774+
3775+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3776+ return -EINVAL;
3777+ if (turn_on && !capable(CAP_SYS_RAWIO))
3778+ return -EPERM;
3779+
3780+ /*
3781+ * If it's the first ioperm() call in this thread's lifetime, set the
3782+ * IO bitmap up. ioperm() is much less timing critical than clone(),
3783+ * this is why we delay this operation until now:
3784+ */
3785+ if (!t->io_bitmap_ptr) {
3786+ unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3787+
3788+ if (!bitmap)
3789+ return -ENOMEM;
3790+
3791+ memset(bitmap, 0xff, IO_BITMAP_BYTES);
3792+ t->io_bitmap_ptr = bitmap;
3793+ set_thread_flag(TIF_IO_BITMAP);
3794+
3795+ set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3796+ set_iobitmap.nr_ports = IO_BITMAP_BITS;
3797+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3798+ &set_iobitmap));
3799+ }
3800+
3801+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3802+
3803+ return 0;
3804+}
3805+
3806+/*
3807+ * sys_iopl has to be used when you want to access the IO ports
3808+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3809+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
3810+ */
3811+static int do_iopl(unsigned int level, struct thread_struct *t)
3812+{
3813+ unsigned int old = t->iopl >> 12;
3814+
3815+ if (level > 3)
3816+ return -EINVAL;
3817+ /* Trying to gain more privileges? */
3818+ if (level > old) {
3819+ if (!capable(CAP_SYS_RAWIO))
3820+ return -EPERM;
3821+ }
3822+
3823+ return 0;
3824+}
3825+
3826+#ifdef CONFIG_X86_32
3827+asmlinkage long sys_iopl(unsigned long regsp)
3828+{
3829+ struct pt_regs *regs = (struct pt_regs *)&regsp;
3830+ unsigned int level = regs->bx;
3831+#else
3832+asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
3833+{
3834+#endif
3835+ struct thread_struct *t = &current->thread;
3836+ int rc;
3837+
3838+ rc = do_iopl(level, t);
3839+ if (rc < 0)
3840+ goto out;
3841+
3842+ t->iopl = level << 12;
3843+ set_iopl_mask(t->iopl);
3844+out:
3845+ return rc;
3846+}
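
Both the removed files and the new unified ioport-xen.c store the requested level as level << 12 and read it back with >> 12. The shift is not arbitrary: the IOPL field of EFLAGS occupies bits 12-13, so thread.iopl is kept in register format. A minimal illustrative sketch (editorial, not part of the patch; the ex_ names are made up):

/* Editorial sketch: why do_iopl() above shifts by 12.  EFLAGS bits 12-13
 * hold the I/O privilege level, so a pre-shifted thread.iopl can be
 * merged straight into a flags image.
 */
#define EX_IOPL_SHIFT 12
#define EX_IOPL_MASK  (3UL << EX_IOPL_SHIFT)

static inline unsigned int ex_iopl_from_flags(unsigned long flags)
{
	return (unsigned int)((flags & EX_IOPL_MASK) >> EX_IOPL_SHIFT);
}

static inline unsigned long ex_flags_with_iopl(unsigned long flags, unsigned int level)
{
	return (flags & ~EX_IOPL_MASK) | ((unsigned long)level << EX_IOPL_SHIFT);
}
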
3847--- a/arch/x86/kernel/irq_32-xen.c
3848+++ b/arch/x86/kernel/irq_32-xen.c
3849@@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPU
3850 * SMP cross-CPU interrupts have their own specific
3851 * handlers).
3852 */
3853-fastcall unsigned int do_IRQ(struct pt_regs *regs)
3854+unsigned int do_IRQ(struct pt_regs *regs)
3855 {
3856 struct pt_regs *old_regs;
3857 /* high bit used in ret_from_ code */
3858- int irq = ~regs->orig_eax;
3859+ int irq = ~regs->orig_ax;
3860 struct irq_desc *desc = irq_desc + irq;
3861 #ifdef CONFIG_4KSTACKS
3862 union irq_ctx *curctx, *irqctx;
3863@@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_r
3864 #ifdef CONFIG_DEBUG_STACKOVERFLOW
3865 /* Debugging check for stack overflow: is there less than 1KB free? */
3866 {
3867- long esp;
3868+ long sp;
3869
3870 __asm__ __volatile__("andl %%esp,%0" :
3871- "=r" (esp) : "0" (THREAD_SIZE - 1));
3872- if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
3873+ "=r" (sp) : "0" (THREAD_SIZE - 1));
3874+ if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
3875 printk("do_IRQ: stack overflow: %ld\n",
3876- esp - sizeof(struct thread_info));
3877+ sp - sizeof(struct thread_info));
3878 dump_stack();
3879 }
3880 }
3881@@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_r
3882 * current stack (which is the irq stack already after all)
3883 */
3884 if (curctx != irqctx) {
3885- int arg1, arg2, ebx;
3886+ int arg1, arg2, bx;
3887
3888 /* build the stack frame on the IRQ stack */
3889 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
3890@@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_r
3891 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
3892
3893 asm volatile(
3894- " xchgl %%ebx,%%esp \n"
3895- " call *%%edi \n"
3896- " movl %%ebx,%%esp \n"
3897- : "=a" (arg1), "=d" (arg2), "=b" (ebx)
3898+ " xchgl %%ebx,%%esp \n"
3899+ " call *%%edi \n"
3900+ " movl %%ebx,%%esp \n"
3901+ : "=a" (arg1), "=d" (arg2), "=b" (bx)
3902 : "0" (irq), "1" (desc), "2" (isp),
3903 "D" (desc->handle_irq)
3904 : "memory", "cc"
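
The inline assembly in the hunk above ands the stack pointer with THREAD_SIZE-1; since the thread stack is THREAD_SIZE-aligned and THREAD_SIZE is a power of two, the result is the pointer's offset from the bottom of the stack, where struct thread_info lives. The warning fires when less than STACK_WARN bytes remain above it. A C restatement of the same check (editorial sketch, not part of the patch):

/* Editorial sketch of the check done in inline asm above. */
static inline int ex_stack_nearly_full(unsigned long sp,
				       unsigned long thread_size,      /* power of two */
				       unsigned long thread_info_size,
				       unsigned long stack_warn)
{
	unsigned long offset = sp & (thread_size - 1);  /* bytes above stack bottom */

	return offset < thread_info_size + stack_warn;
}
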
3905--- a/arch/x86/kernel/irq_64-xen.c
3906+++ b/arch/x86/kernel/irq_64-xen.c
3907@@ -20,6 +20,28 @@
3908
3909 atomic_t irq_err_count;
3910
3911+/*
3912+ * 'what should we do if we get a hw irq event on an illegal vector'.
3913+ * each architecture has to answer this themselves.
3914+ */
3915+void ack_bad_irq(unsigned int irq)
3916+{
3917+ printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq);
3918+#ifdef CONFIG_X86_LOCAL_APIC
3919+ /*
3920+ * Currently unexpected vectors happen only on SMP and APIC.
3921+ * We _must_ ack these because every local APIC has only N
3922+ * irq slots per priority level, and a 'hanging, unacked' IRQ
3923+ * holds up an irq slot - in excessive cases (when multiple
3924+ * unexpected vectors occur) that might lock up the APIC
3925+ * completely.
3926+ * But don't ack when the APIC is disabled. -AK
3927+ */
3928+ if (!disable_apic)
3929+ ack_APIC_irq();
3930+#endif
3931+}
3932+
3933 #ifdef CONFIG_DEBUG_STACKOVERFLOW
3934 /*
3935 * Probabilistic stack overflow check:
3936@@ -33,11 +55,11 @@ static inline void stack_overflow_check(
3937 u64 curbase = (u64)task_stack_page(current);
3938 static unsigned long warned = -60*HZ;
3939
3940- if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
3941- regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
3942+ if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
3943+ regs->sp < curbase + sizeof(struct thread_info) + 128 &&
3944 time_after(jiffies, warned + 60*HZ)) {
3945- printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
3946- current->comm, curbase, regs->rsp);
3947+ printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
3948+ current->comm, curbase, regs->sp);
3949 show_stack(NULL,NULL);
3950 warned = jiffies;
3951 }
3952@@ -150,7 +172,7 @@ asmlinkage unsigned int do_IRQ(struct pt
3953 struct pt_regs *old_regs = set_irq_regs(regs);
3954
3955 /* high bit used in ret_from_ code */
3956- unsigned irq = ~regs->orig_rax;
3957+ unsigned irq = ~regs->orig_ax;
3958
3959 /*exit_idle();*/
3960 /*irq_enter();*/
3961@@ -251,14 +273,3 @@ asmlinkage void do_softirq(void)
3962 }
3963 local_irq_restore(flags);
3964 }
3965-
3966-#ifndef CONFIG_X86_LOCAL_APIC
3967-/*
3968- * 'what should we do if we get a hw irq event on an illegal vector'.
3969- * each architecture has to answer this themselves.
3970- */
3971-void ack_bad_irq(unsigned int irq)
3972-{
3973- printk("unexpected IRQ trap at irq %02x\n", irq);
3974-}
3975-#endif
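
stack_overflow_check() in the hunk above throttles its printk with an idiom worth noting: a static timestamp initialized to -60*HZ so the very first warning fires immediately, after which at most one warning per minute is printed. An editorial sketch of that idiom (not part of the patch):

/* Editorial sketch of the jiffies-based rate limit used above. */
#include <linux/jiffies.h>

static int ex_warn_rate_limited(void)
{
	static unsigned long warned = -60 * HZ;  /* first call always warns */

	if (!time_after(jiffies, warned + 60 * HZ))
		return 0;
	warned = jiffies;
	return 1;
}
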
3976--- a/arch/x86/kernel/ldt_32-xen.c
3977+++ /dev/null
3978@@ -1,265 +0,0 @@
3979-/*
3980- * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3981- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
3982- */
3983-
3984-#include <linux/errno.h>
3985-#include <linux/sched.h>
3986-#include <linux/string.h>
3987-#include <linux/mm.h>
3988-#include <linux/smp.h>
3989-#include <linux/vmalloc.h>
3990-#include <linux/slab.h>
3991-
3992-#include <asm/uaccess.h>
3993-#include <asm/system.h>
3994-#include <asm/ldt.h>
3995-#include <asm/desc.h>
3996-#include <asm/mmu_context.h>
3997-
3998-#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
3999-static void flush_ldt(void *null)
4000-{
4001- if (current->active_mm)
4002- load_LDT(&current->active_mm->context);
4003-}
4004-#endif
4005-
4006-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
4007-{
4008- void *oldldt;
4009- void *newldt;
4010- int oldsize;
4011-
4012- if (mincount <= pc->size)
4013- return 0;
4014- oldsize = pc->size;
4015- mincount = (mincount+511)&(~511);
4016- if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4017- newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4018- else
4019- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4020-
4021- if (!newldt)
4022- return -ENOMEM;
4023-
4024- if (oldsize)
4025- memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4026- oldldt = pc->ldt;
4027- memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4028- pc->ldt = newldt;
4029- wmb();
4030- pc->size = mincount;
4031- wmb();
4032-
4033- if (reload) {
4034-#ifdef CONFIG_SMP
4035- cpumask_t mask;
4036- preempt_disable();
4037-#endif
4038- make_pages_readonly(
4039- pc->ldt,
4040- (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4041- XENFEAT_writable_descriptor_tables);
4042- load_LDT(pc);
4043-#ifdef CONFIG_SMP
4044- mask = cpumask_of_cpu(smp_processor_id());
4045- if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4046- smp_call_function(flush_ldt, NULL, 1, 1);
4047- preempt_enable();
4048-#endif
4049- }
4050- if (oldsize) {
4051- make_pages_writable(
4052- oldldt,
4053- (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4054- XENFEAT_writable_descriptor_tables);
4055- if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4056- vfree(oldldt);
4057- else
4058- kfree(oldldt);
4059- }
4060- return 0;
4061-}
4062-
4063-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4064-{
4065- int err = alloc_ldt(new, old->size, 0);
4066- if (err < 0)
4067- return err;
4068- memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4069- make_pages_readonly(
4070- new->ldt,
4071- (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4072- XENFEAT_writable_descriptor_tables);
4073- return 0;
4074-}
4075-
4076-/*
4077- * we do not have to muck with descriptors here, that is
4078- * done in switch_mm() as needed.
4079- */
4080-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4081-{
4082- struct mm_struct * old_mm;
4083- int retval = 0;
4084-
4085- mutex_init(&mm->context.lock);
4086- mm->context.size = 0;
4087- mm->context.has_foreign_mappings = 0;
4088- old_mm = current->mm;
4089- if (old_mm && old_mm->context.size > 0) {
4090- mutex_lock(&old_mm->context.lock);
4091- retval = copy_ldt(&mm->context, &old_mm->context);
4092- mutex_unlock(&old_mm->context.lock);
4093- }
4094- return retval;
4095-}
4096-
4097-/*
4098- * No need to lock the MM as we are the last user
4099- */
4100-void destroy_context(struct mm_struct *mm)
4101-{
4102- if (mm->context.size) {
4103- if (mm == current->active_mm)
4104- clear_LDT();
4105- make_pages_writable(
4106- mm->context.ldt,
4107- (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4108- XENFEAT_writable_descriptor_tables);
4109- if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4110- vfree(mm->context.ldt);
4111- else
4112- kfree(mm->context.ldt);
4113- mm->context.size = 0;
4114- }
4115-}
4116-
4117-static int read_ldt(void __user * ptr, unsigned long bytecount)
4118-{
4119- int err;
4120- unsigned long size;
4121- struct mm_struct * mm = current->mm;
4122-
4123- if (!mm->context.size)
4124- return 0;
4125- if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4126- bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4127-
4128- mutex_lock(&mm->context.lock);
4129- size = mm->context.size*LDT_ENTRY_SIZE;
4130- if (size > bytecount)
4131- size = bytecount;
4132-
4133- err = 0;
4134- if (copy_to_user(ptr, mm->context.ldt, size))
4135- err = -EFAULT;
4136- mutex_unlock(&mm->context.lock);
4137- if (err < 0)
4138- goto error_return;
4139- if (size != bytecount) {
4140- /* zero-fill the rest */
4141- if (clear_user(ptr+size, bytecount-size) != 0) {
4142- err = -EFAULT;
4143- goto error_return;
4144- }
4145- }
4146- return bytecount;
4147-error_return:
4148- return err;
4149-}
4150-
4151-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4152-{
4153- int err;
4154- unsigned long size;
4155-
4156- err = 0;
4157- size = 5*sizeof(struct desc_struct);
4158- if (size > bytecount)
4159- size = bytecount;
4160-
4161- err = size;
4162- if (clear_user(ptr, size))
4163- err = -EFAULT;
4164-
4165- return err;
4166-}
4167-
4168-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4169-{
4170- struct mm_struct * mm = current->mm;
4171- __u32 entry_1, entry_2;
4172- int error;
4173- struct user_desc ldt_info;
4174-
4175- error = -EINVAL;
4176- if (bytecount != sizeof(ldt_info))
4177- goto out;
4178- error = -EFAULT;
4179- if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4180- goto out;
4181-
4182- error = -EINVAL;
4183- if (ldt_info.entry_number >= LDT_ENTRIES)
4184- goto out;
4185- if (ldt_info.contents == 3) {
4186- if (oldmode)
4187- goto out;
4188- if (ldt_info.seg_not_present == 0)
4189- goto out;
4190- }
4191-
4192- mutex_lock(&mm->context.lock);
4193- if (ldt_info.entry_number >= mm->context.size) {
4194- error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
4195- if (error < 0)
4196- goto out_unlock;
4197- }
4198-
4199- /* Allow LDTs to be cleared by the user. */
4200- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4201- if (oldmode || LDT_empty(&ldt_info)) {
4202- entry_1 = 0;
4203- entry_2 = 0;
4204- goto install;
4205- }
4206- }
4207-
4208- entry_1 = LDT_entry_a(&ldt_info);
4209- entry_2 = LDT_entry_b(&ldt_info);
4210- if (oldmode)
4211- entry_2 &= ~(1 << 20);
4212-
4213- /* Install the new entry ... */
4214-install:
4215- error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
4216- entry_1, entry_2);
4217-
4218-out_unlock:
4219- mutex_unlock(&mm->context.lock);
4220-out:
4221- return error;
4222-}
4223-
4224-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4225-{
4226- int ret = -ENOSYS;
4227-
4228- switch (func) {
4229- case 0:
4230- ret = read_ldt(ptr, bytecount);
4231- break;
4232- case 1:
4233- ret = write_ldt(ptr, bytecount, 1);
4234- break;
4235- case 2:
4236- ret = read_default_ldt(ptr, bytecount);
4237- break;
4238- case 0x11:
4239- ret = write_ldt(ptr, bytecount, 0);
4240- break;
4241- }
4242- return ret;
4243-}
4244--- a/arch/x86/kernel/ldt_64-xen.c
4245+++ /dev/null
4246@@ -1,271 +0,0 @@
4247-/*
4248- * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4249- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4250- * Copyright (C) 2002 Andi Kleen
4251- *
4252- * This handles calls from both 32bit and 64bit mode.
4253- */
4254-
4255-#include <linux/errno.h>
4256-#include <linux/sched.h>
4257-#include <linux/string.h>
4258-#include <linux/mm.h>
4259-#include <linux/smp.h>
4260-#include <linux/vmalloc.h>
4261-#include <linux/slab.h>
4262-
4263-#include <asm/uaccess.h>
4264-#include <asm/system.h>
4265-#include <asm/ldt.h>
4266-#include <asm/desc.h>
4267-#include <asm/proto.h>
4268-#include <asm/pgalloc.h>
4269-
4270-#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
4271-static void flush_ldt(void *null)
4272-{
4273- if (current->active_mm)
4274- load_LDT(&current->active_mm->context);
4275-}
4276-#endif
4277-
4278-static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
4279-{
4280- void *oldldt;
4281- void *newldt;
4282- unsigned oldsize;
4283-
4284- if (mincount <= (unsigned)pc->size)
4285- return 0;
4286- oldsize = pc->size;
4287- mincount = (mincount+511)&(~511);
4288- if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4289- newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4290- else
4291- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4292-
4293- if (!newldt)
4294- return -ENOMEM;
4295-
4296- if (oldsize)
4297- memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4298- oldldt = pc->ldt;
4299- memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4300- wmb();
4301- pc->ldt = newldt;
4302- wmb();
4303- pc->size = mincount;
4304- wmb();
4305- if (reload) {
4306-#ifdef CONFIG_SMP
4307- cpumask_t mask;
4308-
4309- preempt_disable();
4310-#endif
4311- make_pages_readonly(
4312- pc->ldt,
4313- (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4314- XENFEAT_writable_descriptor_tables);
4315- load_LDT(pc);
4316-#ifdef CONFIG_SMP
4317- mask = cpumask_of_cpu(smp_processor_id());
4318- if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4319- smp_call_function(flush_ldt, NULL, 1, 1);
4320- preempt_enable();
4321-#endif
4322- }
4323- if (oldsize) {
4324- make_pages_writable(
4325- oldldt,
4326- (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4327- XENFEAT_writable_descriptor_tables);
4328- if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4329- vfree(oldldt);
4330- else
4331- kfree(oldldt);
4332- }
4333- return 0;
4334-}
4335-
4336-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4337-{
4338- int err = alloc_ldt(new, old->size, 0);
4339- if (err < 0)
4340- return err;
4341- memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4342- make_pages_readonly(
4343- new->ldt,
4344- (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4345- XENFEAT_writable_descriptor_tables);
4346- return 0;
4347-}
4348-
4349-/*
4350- * we do not have to muck with descriptors here, that is
4351- * done in switch_mm() as needed.
4352- */
4353-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4354-{
4355- struct mm_struct * old_mm;
4356- int retval = 0;
4357-
4358- memset(&mm->context, 0, sizeof(mm->context));
4359- mutex_init(&mm->context.lock);
4360- old_mm = current->mm;
4361- if (old_mm)
4362- mm->context.vdso = old_mm->context.vdso;
4363- if (old_mm && old_mm->context.size > 0) {
4364- mutex_lock(&old_mm->context.lock);
4365- retval = copy_ldt(&mm->context, &old_mm->context);
4366- mutex_unlock(&old_mm->context.lock);
4367- }
4368- return retval;
4369-}
4370-
4371-/*
4372- *
4373- * Don't touch the LDT register - we're already in the next thread.
4374- */
4375-void destroy_context(struct mm_struct *mm)
4376-{
4377- if (mm->context.size) {
4378- if (mm == current->active_mm)
4379- clear_LDT();
4380- make_pages_writable(
4381- mm->context.ldt,
4382- (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4383- XENFEAT_writable_descriptor_tables);
4384- if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4385- vfree(mm->context.ldt);
4386- else
4387- kfree(mm->context.ldt);
4388- mm->context.size = 0;
4389- }
4390-}
4391-
4392-static int read_ldt(void __user * ptr, unsigned long bytecount)
4393-{
4394- int err;
4395- unsigned long size;
4396- struct mm_struct * mm = current->mm;
4397-
4398- if (!mm->context.size)
4399- return 0;
4400- if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4401- bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4402-
4403- mutex_lock(&mm->context.lock);
4404- size = mm->context.size*LDT_ENTRY_SIZE;
4405- if (size > bytecount)
4406- size = bytecount;
4407-
4408- err = 0;
4409- if (copy_to_user(ptr, mm->context.ldt, size))
4410- err = -EFAULT;
4411- mutex_unlock(&mm->context.lock);
4412- if (err < 0)
4413- goto error_return;
4414- if (size != bytecount) {
4415- /* zero-fill the rest */
4416- if (clear_user(ptr+size, bytecount-size) != 0) {
4417- err = -EFAULT;
4418- goto error_return;
4419- }
4420- }
4421- return bytecount;
4422-error_return:
4423- return err;
4424-}
4425-
4426-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4427-{
4428- /* Arbitrary number */
4429- /* x86-64 default LDT is all zeros */
4430- if (bytecount > 128)
4431- bytecount = 128;
4432- if (clear_user(ptr, bytecount))
4433- return -EFAULT;
4434- return bytecount;
4435-}
4436-
4437-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4438-{
4439- struct task_struct *me = current;
4440- struct mm_struct * mm = me->mm;
4441- __u32 entry_1, entry_2, *lp;
4442- unsigned long mach_lp;
4443- int error;
4444- struct user_desc ldt_info;
4445-
4446- error = -EINVAL;
4447-
4448- if (bytecount != sizeof(ldt_info))
4449- goto out;
4450- error = -EFAULT;
4451- if (copy_from_user(&ldt_info, ptr, bytecount))
4452- goto out;
4453-
4454- error = -EINVAL;
4455- if (ldt_info.entry_number >= LDT_ENTRIES)
4456- goto out;
4457- if (ldt_info.contents == 3) {
4458- if (oldmode)
4459- goto out;
4460- if (ldt_info.seg_not_present == 0)
4461- goto out;
4462- }
4463-
4464- mutex_lock(&mm->context.lock);
4465- if (ldt_info.entry_number >= (unsigned)mm->context.size) {
4466- error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
4467- if (error < 0)
4468- goto out_unlock;
4469- }
4470-
4471- lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
4472- mach_lp = arbitrary_virt_to_machine(lp);
4473-
4474- /* Allow LDTs to be cleared by the user. */
4475- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4476- if (oldmode || LDT_empty(&ldt_info)) {
4477- entry_1 = 0;
4478- entry_2 = 0;
4479- goto install;
4480- }
4481- }
4482-
4483- entry_1 = LDT_entry_a(&ldt_info);
4484- entry_2 = LDT_entry_b(&ldt_info);
4485- if (oldmode)
4486- entry_2 &= ~(1 << 20);
4487-
4488- /* Install the new entry ... */
4489-install:
4490- error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
4491-
4492-out_unlock:
4493- mutex_unlock(&mm->context.lock);
4494-out:
4495- return error;
4496-}
4497-
4498-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4499-{
4500- int ret = -ENOSYS;
4501-
4502- switch (func) {
4503- case 0:
4504- ret = read_ldt(ptr, bytecount);
4505- break;
4506- case 1:
4507- ret = write_ldt(ptr, bytecount, 1);
4508- break;
4509- case 2:
4510- ret = read_default_ldt(ptr, bytecount);
4511- break;
4512- case 0x11:
4513- ret = write_ldt(ptr, bytecount, 0);
4514- break;
4515- }
4516- return ret;
4517-}
4518--- /dev/null
4519+++ b/arch/x86/kernel/ldt-xen.c
4520@@ -0,0 +1,272 @@
4521+/*
4522+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4523+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4524+ * Copyright (C) 2002 Andi Kleen
4525+ *
4526+ * This handles calls from both 32bit and 64bit mode.
4527+ */
4528+
4529+#include <linux/errno.h>
4530+#include <linux/sched.h>
4531+#include <linux/string.h>
4532+#include <linux/mm.h>
4533+#include <linux/smp.h>
4534+#include <linux/vmalloc.h>
4535+
4536+#include <asm/uaccess.h>
4537+#include <asm/system.h>
4538+#include <asm/ldt.h>
4539+#include <asm/desc.h>
4540+#include <asm/mmu_context.h>
4541+
4542+#ifdef CONFIG_SMP
4543+static void flush_ldt(void *null)
4544+{
4545+ if (current->active_mm)
4546+ load_LDT(&current->active_mm->context);
4547+}
4548+#endif
4549+
4550+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
4551+{
4552+ void *oldldt, *newldt;
4553+ int oldsize;
4554+
4555+ if (mincount <= pc->size)
4556+ return 0;
4557+ oldsize = pc->size;
4558+ mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
4559+ (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
4560+ if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
4561+ newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
4562+ else
4563+ newldt = (void *)__get_free_page(GFP_KERNEL);
4564+
4565+ if (!newldt)
4566+ return -ENOMEM;
4567+
4568+ if (oldsize)
4569+ memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
4570+ oldldt = pc->ldt;
4571+ memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
4572+ (mincount - oldsize) * LDT_ENTRY_SIZE);
4573+
4574+#ifdef CONFIG_X86_64
4575+ /* CHECKME: Do we really need this ? */
4576+ wmb();
4577+#endif
4578+ pc->ldt = newldt;
4579+ wmb();
4580+ pc->size = mincount;
4581+ wmb();
4582+
4583+ if (reload) {
4584+#ifdef CONFIG_SMP
4585+ cpumask_t mask;
4586+
4587+ preempt_disable();
4588+#endif
4589+ make_pages_readonly(newldt,
4590+ (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
4591+ XENFEAT_writable_descriptor_tables);
4592+ load_LDT(pc);
4593+#ifdef CONFIG_SMP
4594+ mask = cpumask_of_cpu(smp_processor_id());
4595+ if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4596+ smp_call_function(flush_ldt, NULL, 1, 1);
4597+ preempt_enable();
4598+#endif
4599+ }
4600+ if (oldsize) {
4601+ make_pages_writable(oldldt,
4602+ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4603+ XENFEAT_writable_descriptor_tables);
4604+ if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
4605+ vfree(oldldt);
4606+ else
4607+ put_page(virt_to_page(oldldt));
4608+ }
4609+ return 0;
4610+}
4611+
4612+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4613+{
4614+ int err = alloc_ldt(new, old->size, 0);
4615+
4616+ if (err < 0)
4617+ return err;
4618+ memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
4619+ make_pages_readonly(new->ldt,
4620+ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4621+ XENFEAT_writable_descriptor_tables);
4622+ return 0;
4623+}
4624+
4625+/*
4626+ * we do not have to muck with descriptors here, that is
4627+ * done in switch_mm() as needed.
4628+ */
4629+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4630+{
4631+ struct mm_struct *old_mm;
4632+ int retval = 0;
4633+
4634+ memset(&mm->context, 0, sizeof(mm->context));
4635+ mutex_init(&mm->context.lock);
4636+ old_mm = current->mm;
4637+ if (old_mm)
4638+ mm->context.vdso = old_mm->context.vdso;
4639+ if (old_mm && old_mm->context.size > 0) {
4640+ mutex_lock(&old_mm->context.lock);
4641+ retval = copy_ldt(&mm->context, &old_mm->context);
4642+ mutex_unlock(&old_mm->context.lock);
4643+ }
4644+ return retval;
4645+}
4646+
4647+/*
4648+ * No need to lock the MM as we are the last user
4649+ *
4650+ * 64bit: Don't touch the LDT register - we're already in the next thread.
4651+ */
4652+void destroy_context(struct mm_struct *mm)
4653+{
4654+ if (mm->context.size) {
4655+ /* CHECKME: Can this ever happen ? */
4656+ if (mm == current->active_mm)
4657+ clear_LDT();
4658+ make_pages_writable(mm->context.ldt,
4659+ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4660+ XENFEAT_writable_descriptor_tables);
4661+ if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
4662+ vfree(mm->context.ldt);
4663+ else
4664+ put_page(virt_to_page(mm->context.ldt));
4665+ mm->context.size = 0;
4666+ }
4667+}
4668+
4669+static int read_ldt(void __user *ptr, unsigned long bytecount)
4670+{
4671+ int err;
4672+ unsigned long size;
4673+ struct mm_struct *mm = current->mm;
4674+
4675+ if (!mm->context.size)
4676+ return 0;
4677+ if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
4678+ bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
4679+
4680+ mutex_lock(&mm->context.lock);
4681+ size = mm->context.size * LDT_ENTRY_SIZE;
4682+ if (size > bytecount)
4683+ size = bytecount;
4684+
4685+ err = 0;
4686+ if (copy_to_user(ptr, mm->context.ldt, size))
4687+ err = -EFAULT;
4688+ mutex_unlock(&mm->context.lock);
4689+ if (err < 0)
4690+ goto error_return;
4691+ if (size != bytecount) {
4692+ /* zero-fill the rest */
4693+ if (clear_user(ptr + size, bytecount - size) != 0) {
4694+ err = -EFAULT;
4695+ goto error_return;
4696+ }
4697+ }
4698+ return bytecount;
4699+error_return:
4700+ return err;
4701+}
4702+
4703+static int read_default_ldt(void __user *ptr, unsigned long bytecount)
4704+{
4705+ /* CHECKME: Can we use _one_ random number ? */
4706+#ifdef CONFIG_X86_32
4707+ unsigned long size = 5 * sizeof(struct desc_struct);
4708+#else
4709+ unsigned long size = 128;
4710+#endif
4711+ if (bytecount > size)
4712+ bytecount = size;
4713+ if (clear_user(ptr, bytecount))
4714+ return -EFAULT;
4715+ return bytecount;
4716+}
4717+
4718+static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
4719+{
4720+ struct mm_struct *mm = current->mm;
4721+ struct desc_struct ldt;
4722+ int error;
4723+ struct user_desc ldt_info;
4724+
4725+ error = -EINVAL;
4726+ if (bytecount != sizeof(ldt_info))
4727+ goto out;
4728+ error = -EFAULT;
4729+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4730+ goto out;
4731+
4732+ error = -EINVAL;
4733+ if (ldt_info.entry_number >= LDT_ENTRIES)
4734+ goto out;
4735+ if (ldt_info.contents == 3) {
4736+ if (oldmode)
4737+ goto out;
4738+ if (ldt_info.seg_not_present == 0)
4739+ goto out;
4740+ }
4741+
4742+ mutex_lock(&mm->context.lock);
4743+ if (ldt_info.entry_number >= mm->context.size) {
4744+ error = alloc_ldt(&current->mm->context,
4745+ ldt_info.entry_number + 1, 1);
4746+ if (error < 0)
4747+ goto out_unlock;
4748+ }
4749+
4750+ /* Allow LDTs to be cleared by the user. */
4751+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4752+ if (oldmode || LDT_empty(&ldt_info)) {
4753+ memset(&ldt, 0, sizeof(ldt));
4754+ goto install;
4755+ }
4756+ }
4757+
4758+ fill_ldt(&ldt, &ldt_info);
4759+ if (oldmode)
4760+ ldt.avl = 0;
4761+
4762+ /* Install the new entry ... */
4763+install:
4764+ error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
4765+
4766+out_unlock:
4767+ mutex_unlock(&mm->context.lock);
4768+out:
4769+ return error;
4770+}
4771+
4772+asmlinkage int sys_modify_ldt(int func, void __user *ptr,
4773+ unsigned long bytecount)
4774+{
4775+ int ret = -ENOSYS;
4776+
4777+ switch (func) {
4778+ case 0:
4779+ ret = read_ldt(ptr, bytecount);
4780+ break;
4781+ case 1:
4782+ ret = write_ldt(ptr, bytecount, 1);
4783+ break;
4784+ case 2:
4785+ ret = read_default_ldt(ptr, bytecount);
4786+ break;
4787+ case 0x11:
4788+ ret = write_ldt(ptr, bytecount, 0);
4789+ break;
4790+ }
4791+ return ret;
4792+}
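
alloc_ldt() in the new unified file rounds mincount up to a whole page of descriptors. With LDT_ENTRY_SIZE of 8 bytes and a 4096-byte page that is 512 entries, which matches the hard-coded (mincount+511)&(~511) in the two deleted files. An editorial sketch of the rounding, assuming those values (not part of the patch):

/* Editorial sketch, assuming LDT_ENTRY_SIZE == 8 and PAGE_SIZE == 4096. */
static inline int ex_round_to_page_of_entries(int mincount)
{
	const int entries_per_page = 4096 / 8;            /* 512 */

	return (mincount + entries_per_page - 1) & ~(entries_per_page - 1);
}
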
4793--- a/arch/x86/kernel/machine_kexec_64.c
4794+++ b/arch/x86/kernel/machine_kexec_64.c
4795@@ -300,7 +300,9 @@ void machine_kexec(struct kimage *image)
4796
4797 void arch_crash_save_vmcoreinfo(void)
4798 {
4799+#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
4800 VMCOREINFO_SYMBOL(phys_base);
4801+#endif
4802 VMCOREINFO_SYMBOL(init_level4_pgt);
4803
4804 #ifdef CONFIG_NUMA
4805--- a/arch/x86/kernel/Makefile
4806+++ b/arch/x86/kernel/Makefile
4807@@ -120,11 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
4808
4809 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
4810
4811+ obj-$(CONFIG_XEN) += nmi_64.o
4812 time_64-$(CONFIG_XEN) += time_32.o
4813 pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
4814 endif
4815
4816 disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
4817 smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
4818-disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o
4819-%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) :=
4820--- a/arch/x86/kernel/microcode-xen.c
4821+++ b/arch/x86/kernel/microcode-xen.c
4822@@ -167,7 +167,7 @@ static int request_microcode(void)
4823 }
4824
4825 op.cmd = XENPF_microcode_update;
4826- set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
4827+ set_xen_guest_handle(op.u.microcode.data, firmware->data);
4828 op.u.microcode.length = firmware->size;
4829 error = HYPERVISOR_platform_op(&op);
4830
4831--- a/arch/x86/kernel/mpparse_32-xen.c
4832+++ b/arch/x86/kernel/mpparse_32-xen.c
4833@@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
4834 /* Processor that is doing the boot up */
4835 unsigned int boot_cpu_physical_apicid = -1U;
4836 /* Internal processor count */
4837-unsigned int __cpuinitdata num_processors;
4838+unsigned int num_processors;
4839
4840 /* Bitmask of physically existing CPUs */
4841 physid_mask_t phys_cpu_present_map;
4842@@ -265,7 +265,7 @@ static void __init MP_ioapic_info (struc
4843 if (!(m->mpc_flags & MPC_APIC_USABLE))
4844 return;
4845
4846- printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
4847+ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
4848 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
4849 if (nr_ioapics >= MAX_IO_APICS) {
4850 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
4851@@ -412,9 +412,9 @@ static int __init smp_read_mpc(struct mp
4852
4853 mps_oem_check(mpc, oem, str);
4854
4855- printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
4856+ printk("APIC at: 0x%X\n", mpc->mpc_lapic);
4857
4858- /*
4859+ /*
4860 * Save the local APIC address (it might be non-default) -- but only
4861 * if we're not using ACPI.
4862 */
4863@@ -728,7 +728,7 @@ static int __init smp_scan_config (unsig
4864 unsigned long *bp = isa_bus_to_virt(base);
4865 struct intel_mp_floating *mpf;
4866
4867- Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
4868+ printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
4869 if (sizeof(*mpf) != 16)
4870 printk("Error: MPF size\n");
4871
4872@@ -742,9 +742,10 @@ static int __init smp_scan_config (unsig
4873
4874 smp_found_config = 1;
4875 #ifndef CONFIG_XEN
4876- printk(KERN_INFO "found SMP MP-table at %08lx\n",
4877- virt_to_phys(mpf));
4878- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
4879+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4880+ mpf, virt_to_phys(mpf));
4881+ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
4882+ BOOTMEM_DEFAULT);
4883 if (mpf->mpf_physptr) {
4884 /*
4885 * We cannot access to MPC table to compute
4886@@ -759,11 +760,12 @@ static int __init smp_scan_config (unsig
4887 unsigned long end = max_low_pfn * PAGE_SIZE;
4888 if (mpf->mpf_physptr + size > end)
4889 size = end - mpf->mpf_physptr;
4890- reserve_bootmem(mpf->mpf_physptr, size);
4891+ reserve_bootmem(mpf->mpf_physptr, size,
4892+ BOOTMEM_DEFAULT);
4893 }
4894 #else
4895- printk(KERN_INFO "found SMP MP-table at %08lx\n",
4896- ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
4897+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4898+ mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
4899 #endif
4900
4901 mpf_found = mpf;
4902@@ -940,14 +942,14 @@ void __init mp_register_ioapic(u8 id, u3
4903 */
4904 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
4905 mp_ioapic_routing[idx].gsi_base = gsi_base;
4906- mp_ioapic_routing[idx].gsi_end = gsi_base +
4907+ mp_ioapic_routing[idx].gsi_end = gsi_base +
4908 io_apic_get_redir_entries(idx);
4909
4910- printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
4911- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4912- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4913- mp_ioapic_routing[idx].gsi_base,
4914- mp_ioapic_routing[idx].gsi_end);
4915+ printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4916+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4917+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4918+ mp_ioapic_routing[idx].gsi_base,
4919+ mp_ioapic_routing[idx].gsi_end);
4920 }
4921
4922 void __init
4923@@ -1063,15 +1065,16 @@ void __init mp_config_acpi_legacy_irqs (
4924 }
4925
4926 #define MAX_GSI_NUM 4096
4927+#define IRQ_COMPRESSION_START 64
4928
4929 int mp_register_gsi(u32 gsi, int triggering, int polarity)
4930 {
4931 int ioapic = -1;
4932 int ioapic_pin = 0;
4933 int idx, bit = 0;
4934- static int pci_irq = 16;
4935+ static int pci_irq = IRQ_COMPRESSION_START;
4936 /*
4937- * Mapping between Global System Interrups, which
4938+ * Mapping between Global System Interrupts, which
4939 * represent all possible interrupts, and IRQs
4940 * assigned to actual devices.
4941 */
4942@@ -1108,12 +1111,16 @@ int mp_register_gsi(u32 gsi, int trigger
4943 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
4944 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
4945 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
4946- return gsi_to_irq[gsi];
4947+ return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
4948 }
4949
4950 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
4951
4952- if (triggering == ACPI_LEVEL_SENSITIVE) {
4953+ /*
4954+ * For GSI >= 64, use IRQ compression
4955+ */
4956+ if ((gsi >= IRQ_COMPRESSION_START)
4957+ && (triggering == ACPI_LEVEL_SENSITIVE)) {
4958 /*
4959 * For PCI devices assign IRQs in order, avoiding gaps
4960 * due to unused I/O APIC pins.
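
The IRQ_COMPRESSION_START change above means GSIs below 64 keep a 1:1 GSI-to-IRQ mapping, while level-triggered (PCI) GSIs at or above 64 are packed onto consecutive IRQ numbers starting at pci_irq and remembered in gsi_to_irq[]. A much-simplified editorial sketch of that mapping (the ex_ names are made up; the real code also handles legacy IRQs, already-programmed pins and the MAX_GSI_NUM overflow case):

/* Greatly simplified editorial sketch of the GSI -> IRQ compression above. */
#define EX_IRQ_COMPRESSION_START 64
#define EX_MAX_GSI_NUM           4096

static unsigned int ex_gsi_to_irq[EX_MAX_GSI_NUM];
static unsigned int ex_next_pci_irq = EX_IRQ_COMPRESSION_START;

static unsigned int ex_map_gsi(unsigned int gsi, int level_triggered)
{
	if (gsi < EX_IRQ_COMPRESSION_START || !level_triggered)
		return gsi;                           /* identity mapping */

	ex_gsi_to_irq[gsi] = ex_next_pci_irq;         /* compress onto next free IRQ */
	return ex_next_pci_irq++;
}
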
4961--- a/arch/x86/kernel/mpparse_64-xen.c
4962+++ b/arch/x86/kernel/mpparse_64-xen.c
4963@@ -60,14 +60,20 @@ unsigned int boot_cpu_id = -1U;
4964 EXPORT_SYMBOL(boot_cpu_id);
4965
4966 /* Internal processor count */
4967-unsigned int num_processors __cpuinitdata = 0;
4968+unsigned int num_processors;
4969
4970 unsigned disabled_cpus __cpuinitdata;
4971
4972 /* Bitmask of physically existing CPUs */
4973 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
4974
4975-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
4976+#ifndef CONFIG_XEN
4977+u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
4978+ = { [0 ... NR_CPUS-1] = BAD_APICID };
4979+void *x86_bios_cpu_apicid_early_ptr;
4980+#endif
4981+DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
4982+EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
4983
4984
4985 /*
4986@@ -119,24 +125,22 @@ static void __cpuinit MP_processor_info(
4987 physid_set(m->mpc_apicid, phys_cpu_present_map);
4988 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4989 /*
4990- * bios_cpu_apicid is required to have processors listed
4991+ * x86_bios_cpu_apicid is required to have processors listed
4992 * in same order as logical cpu numbers. Hence the first
4993 * entry is BSP, and so on.
4994 */
4995 cpu = 0;
4996 }
4997- bios_cpu_apicid[cpu] = m->mpc_apicid;
4998- /*
4999- * We get called early in the the start_kernel initialization
5000- * process when the per_cpu data area is not yet setup, so we
5001- * use a static array that is removed after the per_cpu data
5002- * area is created.
5003- */
5004- if (x86_cpu_to_apicid_ptr) {
5005- u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
5006- x86_cpu_to_apicid[cpu] = m->mpc_apicid;
5007+ /* are we being called early in kernel startup? */
5008+ if (x86_cpu_to_apicid_early_ptr) {
5009+ u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
5010+ u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
5011+
5012+ cpu_to_apicid[cpu] = m->mpc_apicid;
5013+ bios_cpu_apicid[cpu] = m->mpc_apicid;
5014 } else {
5015 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
5016+ per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
5017 }
5018
5019 cpu_set(cpu, cpu_possible_map);
5020--- a/arch/x86/kernel/pci-dma-xen.c
5021+++ b/arch/x86/kernel/pci-dma-xen.c
5022@@ -434,3 +434,23 @@ dma_sync_single_for_device(struct device
5023 swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
5024 }
5025 EXPORT_SYMBOL(dma_sync_single_for_device);
5026+
5027+void
5028+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
5029+ enum dma_data_direction direction)
5030+{
5031+ if (swiotlb)
5032+ swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
5033+ flush_write_buffers();
5034+}
5035+EXPORT_SYMBOL(dma_sync_sg_for_cpu);
5036+
5037+void
5038+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
5039+ enum dma_data_direction direction)
5040+{
5041+ if (swiotlb)
5042+ swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
5043+ flush_write_buffers();
5044+}
5045+EXPORT_SYMBOL(dma_sync_sg_for_device);
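
The two helpers added above mirror the existing single-buffer variants for scatterlists: ownership of the buffers is handed to the CPU before it reads device-written data, and back to the device before the mapping is reused. A hedged usage sketch (editorial; dev, sglist and nents stand for a driver's own state and are not defined by this patch):

/* Editorial usage sketch; the driver objects are hypothetical. */
static void ex_complete_rx(struct device *dev, struct scatterlist *sglist, int nents)
{
	/* give the buffers to the CPU before reading device-written data */
	dma_sync_sg_for_cpu(dev, sglist, nents, DMA_FROM_DEVICE);

	/* ... inspect the received data here ... */

	/* hand the same mapping back to the device for the next transfer */
	dma_sync_sg_for_device(dev, sglist, nents, DMA_FROM_DEVICE);
}
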
5046--- a/arch/x86/kernel/process_32-xen.c
5047+++ b/arch/x86/kernel/process_32-xen.c
5048@@ -23,7 +23,6 @@
5049 #include <linux/slab.h>
5050 #include <linux/vmalloc.h>
5051 #include <linux/user.h>
5052-#include <linux/a.out.h>
5053 #include <linux/interrupt.h>
5054 #include <linux/utsname.h>
5055 #include <linux/delay.h>
5056@@ -59,8 +58,10 @@
5057
5058 #include <asm/tlbflush.h>
5059 #include <asm/cpu.h>
5060+#include <asm/kdebug.h>
5061
5062 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
5063+asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
5064
5065 static int hlt_counter;
5066
5067@@ -78,7 +79,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
5068 */
5069 unsigned long thread_saved_pc(struct task_struct *tsk)
5070 {
5071- return ((unsigned long *)tsk->thread.esp)[3];
5072+ return ((unsigned long *)tsk->thread.sp)[3];
5073 }
5074
5075 /*
5076@@ -86,7 +87,6 @@ unsigned long thread_saved_pc(struct tas
5077 */
5078 void (*pm_idle)(void);
5079 EXPORT_SYMBOL(pm_idle);
5080-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
5081
5082 void disable_hlt(void)
5083 {
5084@@ -107,7 +107,7 @@ EXPORT_SYMBOL(enable_hlt);
5085 * to poll the ->work.need_resched flag instead of waiting for the
5086 * cross-CPU IPI to arrive. Use this option with caution.
5087 */
5088-static void poll_idle (void)
5089+static void poll_idle(void)
5090 {
5091 cpu_relax();
5092 }
5093@@ -122,10 +122,19 @@ static void xen_idle(void)
5094 smp_mb();
5095
5096 local_irq_disable();
5097- if (!need_resched())
5098+ if (!need_resched()) {
5099+ ktime_t t0, t1;
5100+ u64 t0n, t1n;
5101+
5102+ t0 = ktime_get();
5103+ t0n = ktime_to_ns(t0);
5104 safe_halt(); /* enables interrupts racelessly */
5105- else
5106- local_irq_enable();
5107+ local_irq_disable();
5108+ t1 = ktime_get();
5109+ t1n = ktime_to_ns(t1);
5110+ sched_clock_idle_wakeup_event(t1n - t0n);
5111+ }
5112+ local_irq_enable();
5113 current_thread_info()->status |= TS_POLLING;
5114 }
5115 #ifdef CONFIG_APM_MODULE
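
The xen_idle() change above brackets the halt with ktime_get() so the idle period can be reported via sched_clock_idle_wakeup_event(); note that safe_halt() re-enables interrupts, which is why they are disabled again before the second timestamp is taken. A condensed editorial restatement of that pattern (not part of the patch):

/* Editorial restatement of the accounting added to xen_idle() above. */
static void ex_halt_and_account(void)
{
	ktime_t t0 = ktime_get();

	safe_halt();            /* enables interrupts and blocks until an event */
	local_irq_disable();    /* safe_halt() left interrupts enabled */
	sched_clock_idle_wakeup_event(ktime_to_ns(ktime_get()) - ktime_to_ns(t0));
}
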
5116@@ -168,13 +177,13 @@ void cpu_idle(void)
5117 while (!need_resched()) {
5118 void (*idle)(void);
5119
5120- if (__get_cpu_var(cpu_idle_state))
5121- __get_cpu_var(cpu_idle_state) = 0;
5122-
5123 check_pgt_cache();
5124 rmb();
5125 idle = xen_idle; /* no alternatives */
5126
5127+ if (rcu_pending(cpu))
5128+ rcu_check_callbacks(cpu, 0);
5129+
5130 if (cpu_is_offline(cpu))
5131 play_dead();
5132
5133@@ -192,40 +201,19 @@ static void do_nothing(void *unused)
5134 {
5135 }
5136
5137+/*
5138+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
5139+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
5140+ * handler on SMP systems.
5141+ *
5142+ * Caller must have changed pm_idle to the new value before the call. Old
5143+ * pm_idle value will not be used by any CPU after the return of this function.
5144+ */
5145 void cpu_idle_wait(void)
5146 {
5147- unsigned int cpu, this_cpu = get_cpu();
5148- cpumask_t map, tmp = current->cpus_allowed;
5149-
5150- set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5151- put_cpu();
5152-
5153- cpus_clear(map);
5154- for_each_online_cpu(cpu) {
5155- per_cpu(cpu_idle_state, cpu) = 1;
5156- cpu_set(cpu, map);
5157- }
5158-
5159- __get_cpu_var(cpu_idle_state) = 0;
5160-
5161- wmb();
5162- do {
5163- ssleep(1);
5164- for_each_online_cpu(cpu) {
5165- if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
5166- cpu_clear(cpu, map);
5167- }
5168- cpus_and(map, map, cpu_online_map);
5169- /*
5170- * We waited 1 sec, if a CPU still did not call idle
5171- * it may be because it is in idle and not waking up
5172- * because it has nothing to do.
5173- * Give all the remaining CPUS a kick.
5174- */
5175- smp_call_function_mask(map, do_nothing, 0, 0);
5176- } while (!cpus_empty(map));
5177-
5178- set_cpus_allowed(current, tmp);
5179+ smp_mb();
5180+ /* kick all the CPUs so that they exit out of pm_idle */
5181+ smp_call_function(do_nothing, NULL, 0, 1);
5182 }
5183 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5184
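
Per the new comment block, cpu_idle_wait() only guarantees anything if the caller has already switched pm_idle. A tiny usage sketch (editorial; my_new_idle is a hypothetical handler, not something this patch adds):

/* Editorial usage sketch; my_new_idle is a hypothetical idle handler. */
static void my_new_idle(void)
{
	safe_halt();
}

static void ex_install_idle_handler(void)
{
	pm_idle = my_new_idle;   /* switch handlers first ...                 */
	cpu_idle_wait();         /* ... then kick CPUs still in the old one   */
}
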
5185@@ -251,15 +239,15 @@ void __show_registers(struct pt_regs *re
5186 {
5187 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
5188 unsigned long d0, d1, d2, d3, d6, d7;
5189- unsigned long esp;
5190+ unsigned long sp;
5191 unsigned short ss, gs;
5192
5193 if (user_mode_vm(regs)) {
5194- esp = regs->esp;
5195- ss = regs->xss & 0xffff;
5196+ sp = regs->sp;
5197+ ss = regs->ss & 0xffff;
5198 savesegment(gs, gs);
5199 } else {
5200- esp = (unsigned long) (&regs->esp);
5201+ sp = (unsigned long) (&regs->sp);
5202 savesegment(ss, ss);
5203 savesegment(gs, gs);
5204 }
5205@@ -272,17 +260,17 @@ void __show_registers(struct pt_regs *re
5206 init_utsname()->version);
5207
5208 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
5209- 0xffff & regs->xcs, regs->eip, regs->eflags,
5210+ 0xffff & regs->cs, regs->ip, regs->flags,
5211 smp_processor_id());
5212- print_symbol("EIP is at %s\n", regs->eip);
5213+ print_symbol("EIP is at %s\n", regs->ip);
5214
5215 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
5216- regs->eax, regs->ebx, regs->ecx, regs->edx);
5217+ regs->ax, regs->bx, regs->cx, regs->dx);
5218 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
5219- regs->esi, regs->edi, regs->ebp, esp);
5220+ regs->si, regs->di, regs->bp, sp);
5221 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
5222- regs->xds & 0xffff, regs->xes & 0xffff,
5223- regs->xfs & 0xffff, gs, ss);
5224+ regs->ds & 0xffff, regs->es & 0xffff,
5225+ regs->fs & 0xffff, gs, ss);
5226
5227 if (!all)
5228 return;
5229@@ -310,12 +298,12 @@ void __show_registers(struct pt_regs *re
5230 void show_regs(struct pt_regs *regs)
5231 {
5232 __show_registers(regs, 1);
5233- show_trace(NULL, regs, &regs->esp);
5234+ show_trace(NULL, regs, &regs->sp, regs->bp);
5235 }
5236
5237 /*
5238- * This gets run with %ebx containing the
5239- * function to call, and %edx containing
5240+ * This gets run with %bx containing the
5241+ * function to call, and %dx containing
5242 * the "args".
5243 */
5244 extern void kernel_thread_helper(void);
5245@@ -329,16 +317,16 @@ int kernel_thread(int (*fn)(void *), voi
5246
5247 memset(&regs, 0, sizeof(regs));
5248
5249- regs.ebx = (unsigned long) fn;
5250- regs.edx = (unsigned long) arg;
5251+ regs.bx = (unsigned long) fn;
5252+ regs.dx = (unsigned long) arg;
5253
5254- regs.xds = __USER_DS;
5255- regs.xes = __USER_DS;
5256- regs.xfs = __KERNEL_PERCPU;
5257- regs.orig_eax = -1;
5258- regs.eip = (unsigned long) kernel_thread_helper;
5259- regs.xcs = __KERNEL_CS | get_kernel_rpl();
5260- regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5261+ regs.ds = __USER_DS;
5262+ regs.es = __USER_DS;
5263+ regs.fs = __KERNEL_PERCPU;
5264+ regs.orig_ax = -1;
5265+ regs.ip = (unsigned long) kernel_thread_helper;
5266+ regs.cs = __KERNEL_CS | get_kernel_rpl();
5267+ regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5268
5269 /* Ok, create the new process.. */
5270 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
5271@@ -368,7 +356,12 @@ void flush_thread(void)
5272 {
5273 struct task_struct *tsk = current;
5274
5275- memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
5276+ tsk->thread.debugreg0 = 0;
5277+ tsk->thread.debugreg1 = 0;
5278+ tsk->thread.debugreg2 = 0;
5279+ tsk->thread.debugreg3 = 0;
5280+ tsk->thread.debugreg6 = 0;
5281+ tsk->thread.debugreg7 = 0;
5282 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5283 clear_tsk_thread_flag(tsk, TIF_DEBUG);
5284 /*
5285@@ -393,7 +386,7 @@ void prepare_to_copy(struct task_struct
5286 unlazy_fpu(tsk);
5287 }
5288
5289-int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
5290+int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
5291 unsigned long unused,
5292 struct task_struct * p, struct pt_regs * regs)
5293 {
5294@@ -403,17 +396,19 @@ int copy_thread(int nr, unsigned long cl
5295
5296 childregs = task_pt_regs(p);
5297 *childregs = *regs;
5298- childregs->eax = 0;
5299- childregs->esp = esp;
5300+ childregs->ax = 0;
5301+ childregs->sp = sp;
5302
5303- p->thread.esp = (unsigned long) childregs;
5304- p->thread.esp0 = (unsigned long) (childregs+1);
5305+ p->thread.sp = (unsigned long) childregs;
5306+ p->thread.sp0 = (unsigned long) (childregs+1);
5307
5308- p->thread.eip = (unsigned long) ret_from_fork;
5309+ p->thread.ip = (unsigned long) ret_from_fork;
5310
5311- savesegment(gs,p->thread.gs);
5312+ savesegment(gs, p->thread.gs);
5313
5314 tsk = current;
5315+ if (test_tsk_thread_flag(tsk, TIF_CSTAR))
5316+ p->thread.ip = (unsigned long) cstar_ret_from_fork;
5317 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
5318 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
5319 IO_BITMAP_BYTES, GFP_KERNEL);
5320@@ -424,34 +419,17 @@ int copy_thread(int nr, unsigned long cl
5321 set_tsk_thread_flag(p, TIF_IO_BITMAP);
5322 }
5323
5324+ err = 0;
5325+
5326 /*
5327 * Set a new TLS for the child thread?
5328 */
5329- if (clone_flags & CLONE_SETTLS) {
5330- struct desc_struct *desc;
5331- struct user_desc info;
5332- int idx;
5333-
5334- err = -EFAULT;
5335- if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
5336- goto out;
5337- err = -EINVAL;
5338- if (LDT_empty(&info))
5339- goto out;
5340-
5341- idx = info.entry_number;
5342- if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5343- goto out;
5344-
5345- desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5346- desc->a = LDT_entry_a(&info);
5347- desc->b = LDT_entry_b(&info);
5348- }
5349+ if (clone_flags & CLONE_SETTLS)
5350+ err = do_set_thread_area(p, -1,
5351+ (struct user_desc __user *)childregs->si, 0);
5352
5353 p->thread.iopl = current->thread.iopl;
5354
5355- err = 0;
5356- out:
5357 if (err && p->thread.io_bitmap_ptr) {
5358 kfree(p->thread.io_bitmap_ptr);
5359 p->thread.io_bitmap_max = 0;
5360@@ -459,67 +437,8 @@ int copy_thread(int nr, unsigned long cl
5361 return err;
5362 }
5363
5364-/*
5365- * fill in the user structure for a core dump..
5366- */
5367-void dump_thread(struct pt_regs * regs, struct user * dump)
5368-{
5369- int i;
5370-
5371-/* changed the size calculations - should hopefully work better. lbt */
5372- dump->magic = CMAGIC;
5373- dump->start_code = 0;
5374- dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
5375- dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
5376- dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
5377- dump->u_dsize -= dump->u_tsize;
5378- dump->u_ssize = 0;
5379- for (i = 0; i < 8; i++)
5380- dump->u_debugreg[i] = current->thread.debugreg[i];
5381-
5382- if (dump->start_stack < TASK_SIZE)
5383- dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
5384-
5385- dump->regs.ebx = regs->ebx;
5386- dump->regs.ecx = regs->ecx;
5387- dump->regs.edx = regs->edx;
5388- dump->regs.esi = regs->esi;
5389- dump->regs.edi = regs->edi;
5390- dump->regs.ebp = regs->ebp;
5391- dump->regs.eax = regs->eax;
5392- dump->regs.ds = regs->xds;
5393- dump->regs.es = regs->xes;
5394- dump->regs.fs = regs->xfs;
5395- savesegment(gs,dump->regs.gs);
5396- dump->regs.orig_eax = regs->orig_eax;
5397- dump->regs.eip = regs->eip;
5398- dump->regs.cs = regs->xcs;
5399- dump->regs.eflags = regs->eflags;
5400- dump->regs.esp = regs->esp;
5401- dump->regs.ss = regs->xss;
5402-
5403- dump->u_fpvalid = dump_fpu (regs, &dump->i387);
5404-}
5405-EXPORT_SYMBOL(dump_thread);
5406-
5407-/*
5408- * Capture the user space registers if the task is not running (in user space)
5409- */
5410-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
5411-{
5412- struct pt_regs ptregs = *task_pt_regs(tsk);
5413- ptregs.xcs &= 0xffff;
5414- ptregs.xds &= 0xffff;
5415- ptregs.xes &= 0xffff;
5416- ptregs.xss &= 0xffff;
5417-
5418- elf_core_copy_regs(regs, &ptregs);
5419-
5420- return 1;
5421-}
5422-
5423 #ifdef CONFIG_SECCOMP
5424-void hard_disable_TSC(void)
5425+static void hard_disable_TSC(void)
5426 {
5427 write_cr4(read_cr4() | X86_CR4_TSD);
5428 }
5429@@ -534,7 +453,7 @@ void disable_TSC(void)
5430 hard_disable_TSC();
5431 preempt_enable();
5432 }
5433-void hard_enable_TSC(void)
5434+static void hard_enable_TSC(void)
5435 {
5436 write_cr4(read_cr4() & ~X86_CR4_TSD);
5437 }
5438@@ -543,18 +462,32 @@ void hard_enable_TSC(void)
5439 static noinline void
5440 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
5441 {
5442- struct thread_struct *next;
5443+ struct thread_struct *prev, *next;
5444+ unsigned long debugctl;
5445
5446+ prev = &prev_p->thread;
5447 next = &next_p->thread;
5448
5449+ debugctl = prev->debugctlmsr;
5450+ if (next->ds_area_msr != prev->ds_area_msr) {
5451+ /* we clear debugctl to make sure DS
5452+ * is not in use when we change it */
5453+ debugctl = 0;
5454+ wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
5455+ wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
5456+ }
5457+
5458+ if (next->debugctlmsr != debugctl)
5459+ wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
5460+
5461 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5462- set_debugreg(next->debugreg[0], 0);
5463- set_debugreg(next->debugreg[1], 1);
5464- set_debugreg(next->debugreg[2], 2);
5465- set_debugreg(next->debugreg[3], 3);
5466+ set_debugreg(next->debugreg0, 0);
5467+ set_debugreg(next->debugreg1, 1);
5468+ set_debugreg(next->debugreg2, 2);
5469+ set_debugreg(next->debugreg3, 3);
5470 /* no 4 and 5 */
5471- set_debugreg(next->debugreg[6], 6);
5472- set_debugreg(next->debugreg[7], 7);
5473+ set_debugreg(next->debugreg6, 6);
5474+ set_debugreg(next->debugreg7, 7);
5475 }
5476
5477 #ifdef CONFIG_SECCOMP
5478@@ -567,6 +500,14 @@ __switch_to_xtra(struct task_struct *pre
5479 hard_enable_TSC();
5480 }
5481 #endif
5482+
5483+#ifdef X86_BTS
5484+ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
5485+ ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
5486+
5487+ if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
5488+ ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
5489+#endif
5490 }
5491
5492 /*
5493@@ -592,11 +533,11 @@ __switch_to_xtra(struct task_struct *pre
5494 * More important, however, is the fact that this allows us much
5495 * more flexibility.
5496 *
5497- * The return value (in %eax) will be the "prev" task after
5498+ * The return value (in %ax) will be the "prev" task after
5499 * the task-switch, and shows up in ret_from_fork in entry.S,
5500 * for example.
5501 */
5502-struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5503+struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5504 {
5505 struct thread_struct *prev = &prev_p->thread,
5506 *next = &next_p->thread;
5507@@ -632,12 +573,12 @@ struct task_struct fastcall * __switch_t
5508 #endif
5509
5510 /*
5511- * Reload esp0.
5512- * This is load_esp0(tss, next) with a multicall.
5513+ * Reload sp0.
5514+ * This is load_sp0(tss, next) with a multicall.
5515 */
5516 mcl->op = __HYPERVISOR_stack_switch;
5517 mcl->args[0] = __KERNEL_DS;
5518- mcl->args[1] = next->esp0;
5519+ mcl->args[1] = next->sp0;
5520 mcl++;
5521
5522 /*
5523@@ -734,7 +675,7 @@ struct task_struct fastcall * __switch_t
5524
5525 asmlinkage int sys_fork(struct pt_regs regs)
5526 {
5527- return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5528+ return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5529 }
5530
5531 asmlinkage int sys_clone(struct pt_regs regs)
5532@@ -743,12 +684,12 @@ asmlinkage int sys_clone(struct pt_regs
5533 unsigned long newsp;
5534 int __user *parent_tidptr, *child_tidptr;
5535
5536- clone_flags = regs.ebx;
5537- newsp = regs.ecx;
5538- parent_tidptr = (int __user *)regs.edx;
5539- child_tidptr = (int __user *)regs.edi;
5540+ clone_flags = regs.bx;
5541+ newsp = regs.cx;
5542+ parent_tidptr = (int __user *)regs.dx;
5543+ child_tidptr = (int __user *)regs.di;
5544 if (!newsp)
5545- newsp = regs.esp;
5546+ newsp = regs.sp;
5547 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
5548 }
5549
5550@@ -764,7 +705,7 @@ asmlinkage int sys_clone(struct pt_regs
5551 */
5552 asmlinkage int sys_vfork(struct pt_regs regs)
5553 {
5554- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5555+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5556 }
5557
5558 /*
5559@@ -775,18 +716,15 @@ asmlinkage int sys_execve(struct pt_regs
5560 int error;
5561 char * filename;
5562
5563- filename = getname((char __user *) regs.ebx);
5564+ filename = getname((char __user *) regs.bx);
5565 error = PTR_ERR(filename);
5566 if (IS_ERR(filename))
5567 goto out;
5568 error = do_execve(filename,
5569- (char __user * __user *) regs.ecx,
5570- (char __user * __user *) regs.edx,
5571+ (char __user * __user *) regs.cx,
5572+ (char __user * __user *) regs.dx,
5573 &regs);
5574 if (error == 0) {
5575- task_lock(current);
5576- current->ptrace &= ~PT_DTRACE;
5577- task_unlock(current);
5578 /* Make sure we don't return using sysenter.. */
5579 set_thread_flag(TIF_IRET);
5580 }
5581@@ -800,145 +738,37 @@ out:
5582
5583 unsigned long get_wchan(struct task_struct *p)
5584 {
5585- unsigned long ebp, esp, eip;
5586+ unsigned long bp, sp, ip;
5587 unsigned long stack_page;
5588 int count = 0;
5589 if (!p || p == current || p->state == TASK_RUNNING)
5590 return 0;
5591 stack_page = (unsigned long)task_stack_page(p);
5592- esp = p->thread.esp;
5593- if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
5594+ sp = p->thread.sp;
5595+ if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
5596 return 0;
5597- /* include/asm-i386/system.h:switch_to() pushes ebp last. */
5598- ebp = *(unsigned long *) esp;
5599+ /* include/asm-i386/system.h:switch_to() pushes bp last. */
5600+ bp = *(unsigned long *) sp;
5601 do {
5602- if (ebp < stack_page || ebp > top_ebp+stack_page)
5603+ if (bp < stack_page || bp > top_ebp+stack_page)
5604 return 0;
5605- eip = *(unsigned long *) (ebp+4);
5606- if (!in_sched_functions(eip))
5607- return eip;
5608- ebp = *(unsigned long *) ebp;
5609+ ip = *(unsigned long *) (bp+4);
5610+ if (!in_sched_functions(ip))
5611+ return ip;
5612+ bp = *(unsigned long *) bp;
5613 } while (count++ < 16);
5614 return 0;
5615 }
5616
5617-/*
5618- * sys_alloc_thread_area: get a yet unused TLS descriptor index.
5619- */
5620-static int get_free_idx(void)
5621-{
5622- struct thread_struct *t = &current->thread;
5623- int idx;
5624-
5625- for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
5626- if (desc_empty(t->tls_array + idx))
5627- return idx + GDT_ENTRY_TLS_MIN;
5628- return -ESRCH;
5629-}
5630-
5631-/*
5632- * Set a given TLS descriptor:
5633- */
5634-asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
5635-{
5636- struct thread_struct *t = &current->thread;
5637- struct user_desc info;
5638- struct desc_struct *desc;
5639- int cpu, idx;
5640-
5641- if (copy_from_user(&info, u_info, sizeof(info)))
5642- return -EFAULT;
5643- idx = info.entry_number;
5644-
5645- /*
5646- * index -1 means the kernel should try to find and
5647- * allocate an empty descriptor:
5648- */
5649- if (idx == -1) {
5650- idx = get_free_idx();
5651- if (idx < 0)
5652- return idx;
5653- if (put_user(idx, &u_info->entry_number))
5654- return -EFAULT;
5655- }
5656-
5657- if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5658- return -EINVAL;
5659-
5660- desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
5661-
5662- /*
5663- * We must not get preempted while modifying the TLS.
5664- */
5665- cpu = get_cpu();
5666-
5667- if (LDT_empty(&info)) {
5668- desc->a = 0;
5669- desc->b = 0;
5670- } else {
5671- desc->a = LDT_entry_a(&info);
5672- desc->b = LDT_entry_b(&info);
5673- }
5674- load_TLS(t, cpu);
5675-
5676- put_cpu();
5677-
5678- return 0;
5679-}
5680-
5681-/*
5682- * Get the current Thread-Local Storage area:
5683- */
5684-
5685-#define GET_BASE(desc) ( \
5686- (((desc)->a >> 16) & 0x0000ffff) | \
5687- (((desc)->b << 16) & 0x00ff0000) | \
5688- ( (desc)->b & 0xff000000) )
5689-
5690-#define GET_LIMIT(desc) ( \
5691- ((desc)->a & 0x0ffff) | \
5692- ((desc)->b & 0xf0000) )
5693-
5694-#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
5695-#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
5696-#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
5697-#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
5698-#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
5699-#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
5700-
5701-asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
5702-{
5703- struct user_desc info;
5704- struct desc_struct *desc;
5705- int idx;
5706-
5707- if (get_user(idx, &u_info->entry_number))
5708- return -EFAULT;
5709- if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5710- return -EINVAL;
5711-
5712- memset(&info, 0, sizeof(info));
5713-
5714- desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5715-
5716- info.entry_number = idx;
5717- info.base_addr = GET_BASE(desc);
5718- info.limit = GET_LIMIT(desc);
5719- info.seg_32bit = GET_32BIT(desc);
5720- info.contents = GET_CONTENTS(desc);
5721- info.read_exec_only = !GET_WRITABLE(desc);
5722- info.limit_in_pages = GET_LIMIT_PAGES(desc);
5723- info.seg_not_present = !GET_PRESENT(desc);
5724- info.useable = GET_USEABLE(desc);
5725-
5726- if (copy_to_user(u_info, &info, sizeof(info)))
5727- return -EFAULT;
5728- return 0;
5729-}
5730-
5731 unsigned long arch_align_stack(unsigned long sp)
5732 {
5733 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5734 sp -= get_random_int() % 8192;
5735 return sp & ~0xf;
5736 }
5737+
5738+unsigned long arch_randomize_brk(struct mm_struct *mm)
5739+{
5740+ unsigned long range_end = mm->brk + 0x02000000;
5741+ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
5742+}
5743--- a/arch/x86/kernel/process_64-xen.c
5744+++ b/arch/x86/kernel/process_64-xen.c
5745@@ -3,7 +3,7 @@
5746 *
5747 * Pentium III FXSR, SSE support
5748 * Gareth Hughes <gareth@valinux.com>, May 2000
5749- *
5750+ *
5751 * X86-64 port
5752 * Andi Kleen.
5753 *
5754@@ -22,19 +22,18 @@
5755 #include <linux/cpu.h>
5756 #include <linux/errno.h>
5757 #include <linux/sched.h>
5758+#include <linux/fs.h>
5759 #include <linux/kernel.h>
5760 #include <linux/mm.h>
5761-#include <linux/fs.h>
5762 #include <linux/elfcore.h>
5763 #include <linux/smp.h>
5764 #include <linux/slab.h>
5765 #include <linux/user.h>
5766-#include <linux/module.h>
5767-#include <linux/a.out.h>
5768 #include <linux/interrupt.h>
5769+#include <linux/utsname.h>
5770 #include <linux/delay.h>
5771+#include <linux/module.h>
5772 #include <linux/ptrace.h>
5773-#include <linux/utsname.h>
5774 #include <linux/random.h>
5775 #include <linux/notifier.h>
5776 #include <linux/kprobes.h>
5777@@ -73,7 +72,6 @@ EXPORT_SYMBOL(boot_option_idle_override)
5778 */
5779 void (*pm_idle)(void);
5780 EXPORT_SYMBOL(pm_idle);
5781-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
5782
5783 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
5784
5785@@ -81,13 +79,6 @@ void idle_notifier_register(struct notif
5786 {
5787 atomic_notifier_chain_register(&idle_notifier, n);
5788 }
5789-EXPORT_SYMBOL_GPL(idle_notifier_register);
5790-
5791-void idle_notifier_unregister(struct notifier_block *n)
5792-{
5793- atomic_notifier_chain_unregister(&idle_notifier, n);
5794-}
5795-EXPORT_SYMBOL(idle_notifier_unregister);
5796
5797 void enter_idle(void)
5798 {
5799@@ -116,7 +107,7 @@ void exit_idle(void)
5800 * to poll the ->need_resched flag instead of waiting for the
5801 * cross-CPU IPI to arrive. Use this option with caution.
5802 */
5803-static void poll_idle (void)
5804+static void poll_idle(void)
5805 {
5806 local_irq_enable();
5807 cpu_relax();
5808@@ -131,10 +122,19 @@ static void xen_idle(void)
5809 */
5810 smp_mb();
5811 local_irq_disable();
5812- if (!need_resched())
5813- safe_halt();
5814- else
5815- local_irq_enable();
5816+ if (!need_resched()) {
5817+ ktime_t t0, t1;
5818+ u64 t0n, t1n;
5819+
5820+ t0 = ktime_get();
5821+ t0n = ktime_to_ns(t0);
5822+ safe_halt(); /* enables interrupts racelessly */
5823+ local_irq_disable();
5824+ t1 = ktime_get();
5825+ t1n = ktime_to_ns(t1);
5826+ sched_clock_idle_wakeup_event(t1n - t0n);
5827+ }
5828+ local_irq_enable();
5829 current_thread_info()->status |= TS_POLLING;
5830 }
5831
5832@@ -161,19 +161,15 @@ static inline void play_dead(void)
5833 * low exit latency (ie sit in a loop waiting for
5834 * somebody to say that they'd like to reschedule)
5835 */
5836-void cpu_idle (void)
5837+void cpu_idle(void)
5838 {
5839 current_thread_info()->status |= TS_POLLING;
5840 /* endless idle loop with no priority at all */
5841 while (1) {
5842+ tick_nohz_stop_sched_tick();
5843 while (!need_resched()) {
5844 void (*idle)(void);
5845
5846- if (__get_cpu_var(cpu_idle_state))
5847- __get_cpu_var(cpu_idle_state) = 0;
5848-
5849- tick_nohz_stop_sched_tick();
5850-
5851 rmb();
5852 idle = xen_idle; /* no alternatives */
5853 if (cpu_is_offline(smp_processor_id()))
5854@@ -203,49 +199,27 @@ static void do_nothing(void *unused)
5855 {
5856 }
5857
5858+/*
5859+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
5860+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
5861+ * handler on SMP systems.
5862+ *
5863+ * Caller must have changed pm_idle to the new value before the call. Old
5864+ * pm_idle value will not be used by any CPU after the return of this function.
5865+ */
5866 void cpu_idle_wait(void)
5867 {
5868- unsigned int cpu, this_cpu = get_cpu();
5869- cpumask_t map, tmp = current->cpus_allowed;
5870-
5871- set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5872- put_cpu();
5873-
5874- cpus_clear(map);
5875- for_each_online_cpu(cpu) {
5876- per_cpu(cpu_idle_state, cpu) = 1;
5877- cpu_set(cpu, map);
5878- }
5879-
5880- __get_cpu_var(cpu_idle_state) = 0;
5881-
5882- wmb();
5883- do {
5884- ssleep(1);
5885- for_each_online_cpu(cpu) {
5886- if (cpu_isset(cpu, map) &&
5887- !per_cpu(cpu_idle_state, cpu))
5888- cpu_clear(cpu, map);
5889- }
5890- cpus_and(map, map, cpu_online_map);
5891- /*
5892- * We waited 1 sec, if a CPU still did not call idle
5893- * it may be because it is in idle and not waking up
5894- * because it has nothing to do.
5895- * Give all the remaining CPUS a kick.
5896- */
5897- smp_call_function_mask(map, do_nothing, 0, 0);
5898- } while (!cpus_empty(map));
5899-
5900- set_cpus_allowed(current, tmp);
5901+ smp_mb();
5902+ /* kick all the CPUs so that they exit out of pm_idle */
5903+ smp_call_function(do_nothing, NULL, 0, 1);
5904 }
5905 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5906
5907-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5908+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5909 {
5910 }
5911
5912-static int __init idle_setup (char *str)
5913+static int __init idle_setup(char *str)
5914 {
5915 if (!strcmp(str, "poll")) {
5916 printk("using polling idle threads.\n");
5917@@ -260,13 +234,13 @@ static int __init idle_setup (char *str)
5918 }
5919 early_param("idle", idle_setup);
5920
5921-/* Prints also some state that isn't saved in the pt_regs */
5922+/* Prints also some state that isn't saved in the pt_regs */
5923 void __show_regs(struct pt_regs * regs)
5924 {
5925 unsigned long fs, gs, shadowgs;
5926 unsigned long d0, d1, d2, d3, d6, d7;
5927- unsigned int fsindex,gsindex;
5928- unsigned int ds,cs,es;
5929+ unsigned int fsindex, gsindex;
5930+ unsigned int ds, cs, es;
5931
5932 printk("\n");
5933 print_modules();
5934@@ -275,16 +249,16 @@ void __show_regs(struct pt_regs * regs)
5935 init_utsname()->release,
5936 (int)strcspn(init_utsname()->version, " "),
5937 init_utsname()->version);
5938- printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
5939- printk_address(regs->rip);
5940- printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
5941- regs->eflags);
5942+ printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
5943+ printk_address(regs->ip, 1);
5944+ printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
5945+ regs->flags);
5946 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
5947- regs->rax, regs->rbx, regs->rcx);
5948+ regs->ax, regs->bx, regs->cx);
5949 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
5950- regs->rdx, regs->rsi, regs->rdi);
5951+ regs->dx, regs->si, regs->di);
5952 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
5953- regs->rbp, regs->r8, regs->r9);
5954+ regs->bp, regs->r8, regs->r9);
5955 printk("R10: %016lx R11: %016lx R12: %016lx\n",
5956 regs->r10, regs->r11, regs->r12);
5957 printk("R13: %016lx R14: %016lx R15: %016lx\n",
5958@@ -318,7 +292,7 @@ void show_regs(struct pt_regs *regs)
5959 {
5960 printk("CPU %d:", smp_processor_id());
5961 __show_regs(regs);
5962- show_trace(NULL, regs, (void *)(regs + 1));
5963+ show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
5964 }
5965
5966 /*
5967@@ -329,7 +303,7 @@ void exit_thread(void)
5968 struct task_struct *me = current;
5969 struct thread_struct *t = &me->thread;
5970
5971- if (me->thread.io_bitmap_ptr) {
5972+ if (me->thread.io_bitmap_ptr) {
5973 #ifndef CONFIG_X86_NO_TSS
5974 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
5975 #endif
5976@@ -382,7 +356,7 @@ void flush_thread(void)
5977 tsk->thread.debugreg3 = 0;
5978 tsk->thread.debugreg6 = 0;
5979 tsk->thread.debugreg7 = 0;
5980- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5981+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5982 /*
5983 * Forget coprocessor state..
5984 */
5985@@ -405,26 +379,21 @@ void release_thread(struct task_struct *
5986
5987 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
5988 {
5989- struct user_desc ud = {
5990+ struct user_desc ud = {
5991 .base_addr = addr,
5992 .limit = 0xfffff,
5993 .seg_32bit = 1,
5994 .limit_in_pages = 1,
5995 .useable = 1,
5996 };
5997- struct n_desc_struct *desc = (void *)t->thread.tls_array;
5998+ struct desc_struct *desc = t->thread.tls_array;
5999 desc += tls;
6000- desc->a = LDT_entry_a(&ud);
6001- desc->b = LDT_entry_b(&ud);
6002+ fill_ldt(desc, &ud);
6003 }
6004
6005 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
6006 {
6007- struct desc_struct *desc = (void *)t->thread.tls_array;
6008- desc += tls;
6009- return desc->base0 |
6010- (((u32)desc->base1) << 16) |
6011- (((u32)desc->base2) << 24);
6012+ return get_desc_base(&t->thread.tls_array[tls]);
6013 }
6014
6015 /*
6016@@ -436,7 +405,7 @@ void prepare_to_copy(struct task_struct
6017 unlazy_fpu(tsk);
6018 }
6019
6020-int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
6021+int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
6022 unsigned long unused,
6023 struct task_struct * p, struct pt_regs * regs)
6024 {
6025@@ -448,14 +417,14 @@ int copy_thread(int nr, unsigned long cl
6026 (THREAD_SIZE + task_stack_page(p))) - 1;
6027 *childregs = *regs;
6028
6029- childregs->rax = 0;
6030- childregs->rsp = rsp;
6031- if (rsp == ~0UL)
6032- childregs->rsp = (unsigned long)childregs;
6033-
6034- p->thread.rsp = (unsigned long) childregs;
6035- p->thread.rsp0 = (unsigned long) (childregs+1);
6036- p->thread.userrsp = me->thread.userrsp;
6037+ childregs->ax = 0;
6038+ childregs->sp = sp;
6039+ if (sp == ~0UL)
6040+ childregs->sp = (unsigned long)childregs;
6041+
6042+ p->thread.sp = (unsigned long) childregs;
6043+ p->thread.sp0 = (unsigned long) (childregs+1);
6044+ p->thread.usersp = me->thread.usersp;
6045
6046 set_tsk_thread_flag(p, TIF_FORK);
6047
6048@@ -476,7 +445,7 @@ int copy_thread(int nr, unsigned long cl
6049 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
6050 IO_BITMAP_BYTES);
6051 set_tsk_thread_flag(p, TIF_IO_BITMAP);
6052- }
6053+ }
6054
6055 /*
6056 * Set a new TLS for the child thread?
6057@@ -484,7 +453,8 @@ int copy_thread(int nr, unsigned long cl
6058 if (clone_flags & CLONE_SETTLS) {
6059 #ifdef CONFIG_IA32_EMULATION
6060 if (test_thread_flag(TIF_IA32))
6061- err = ia32_child_tls(p, childregs);
6062+ err = do_set_thread_area(p, -1,
6063+ (struct user_desc __user *)childregs->si, 0);
6064 else
6065 #endif
6066 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
6067@@ -502,26 +472,32 @@ out:
6068 return err;
6069 }
6070
6071-static inline void __save_init_fpu( struct task_struct *tsk )
6072-{
6073- asm volatile( "rex64 ; fxsave %0 ; fnclex"
6074- : "=m" (tsk->thread.i387.fxsave));
6075- tsk->thread_info->status &= ~TS_USEDFPU;
6076-}
6077-
6078 /*
6079 * This special macro can be used to load a debugging register
6080 */
6081-#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
6082+#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
6083
6084 static inline void __switch_to_xtra(struct task_struct *prev_p,
6085- struct task_struct *next_p)
6086+ struct task_struct *next_p)
6087 {
6088 struct thread_struct *prev, *next;
6089+ unsigned long debugctl;
6090
6091 prev = &prev_p->thread,
6092 next = &next_p->thread;
6093
6094+ debugctl = prev->debugctlmsr;
6095+ if (next->ds_area_msr != prev->ds_area_msr) {
6096+ /* we clear debugctl to make sure DS
6097+ * is not in use when we change it */
6098+ debugctl = 0;
6099+ wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
6100+ wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
6101+ }
6102+
6103+ if (next->debugctlmsr != debugctl)
6104+ wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
6105+
6106 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
6107 loaddebug(next, 0);
6108 loaddebug(next, 1);
6109@@ -531,12 +507,20 @@ static inline void __switch_to_xtra(stru
6110 loaddebug(next, 6);
6111 loaddebug(next, 7);
6112 }
6113+
6114+#ifdef X86_BTS
6115+ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
6116+ ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
6117+
6118+ if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
6119+ ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
6120+#endif
6121 }
6122
6123 /*
6124 * switch_to(x,y) should switch tasks from x to y.
6125 *
6126- * This could still be optimized:
6127+ * This could still be optimized:
6128 * - fold all the options into a flag word and test it with a single test.
6129 * - could test fs/gs bitsliced
6130 *
6131@@ -547,7 +531,7 @@ __switch_to(struct task_struct *prev_p,
6132 {
6133 struct thread_struct *prev = &prev_p->thread,
6134 *next = &next_p->thread;
6135- int cpu = smp_processor_id();
6136+ int cpu = smp_processor_id();
6137 #ifndef CONFIG_X86_NO_TSS
6138 struct tss_struct *tss = &per_cpu(init_tss, cpu);
6139 #endif
6140@@ -581,11 +565,12 @@ __switch_to(struct task_struct *prev_p,
6141 prev_p->fpu_counter = 0;
6142
6143 /*
6144- * Reload esp0, LDT and the page table pointer:
6145+ * Reload sp0.
6146+ * This is load_sp0(tss, next) with a multicall.
6147 */
6148 mcl->op = __HYPERVISOR_stack_switch;
6149 mcl->args[0] = __KERNEL_DS;
6150- mcl->args[1] = next->rsp0;
6151+ mcl->args[1] = next->sp0;
6152 mcl++;
6153
6154 /*
6155@@ -593,11 +578,12 @@ __switch_to(struct task_struct *prev_p,
6156 * This is load_TLS(next, cpu) with multicalls.
6157 */
6158 #define C(i) do { \
6159- if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
6160+ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
6161+ next->tls_array[i].b != prev->tls_array[i].b)) { \
6162 mcl->op = __HYPERVISOR_update_descriptor; \
6163 mcl->args[0] = virt_to_machine( \
6164- &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
6165- mcl->args[1] = next->tls_array[i]; \
6166+ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
6167+ mcl->args[1] = *(u64 *)&next->tls_array[i]; \
6168 mcl++; \
6169 } \
6170 } while (0)
6171@@ -605,7 +591,7 @@ __switch_to(struct task_struct *prev_p,
6172 #undef C
6173
6174 if (unlikely(prev->iopl != next->iopl)) {
6175- iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
6176+ iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
6177 #if CONFIG_XEN_COMPAT > 0x030002
6178 mcl->op = __HYPERVISOR_physdev_op;
6179 mcl->args[0] = PHYSDEVOP_set_iopl;
6180@@ -669,8 +655,8 @@ __switch_to(struct task_struct *prev_p,
6181 /*
6182 * Switch the PDA context.
6183 */
6184- prev->userrsp = read_pda(oldrsp);
6185- write_pda(oldrsp, next->userrsp);
6186+ prev->usersp = read_pda(oldrsp);
6187+ write_pda(oldrsp, next->usersp);
6188 write_pda(pcurrent, next_p);
6189 write_pda(kernelstack,
6190 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
6191@@ -687,7 +673,8 @@ __switch_to(struct task_struct *prev_p,
6192 /*
6193 * Now maybe reload the debug registers
6194 */
6195- if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
6196+ if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
6197+ task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
6198 __switch_to_xtra(prev_p, next_p);
6199
6200 /* If the task has used fpu the last 5 timeslices, just do a full
6201@@ -702,23 +689,18 @@ __switch_to(struct task_struct *prev_p,
6202 /*
6203 * sys_execve() executes a new program.
6204 */
6205-asmlinkage
6206+asmlinkage
6207 long sys_execve(char __user *name, char __user * __user *argv,
6208- char __user * __user *envp, struct pt_regs regs)
6209+ char __user * __user *envp, struct pt_regs *regs)
6210 {
6211 long error;
6212 char * filename;
6213
6214 filename = getname(name);
6215 error = PTR_ERR(filename);
6216- if (IS_ERR(filename))
6217+ if (IS_ERR(filename))
6218 return error;
6219- error = do_execve(filename, argv, envp, &regs);
6220- if (error == 0) {
6221- task_lock(current);
6222- current->ptrace &= ~PT_DTRACE;
6223- task_unlock(current);
6224- }
6225+ error = do_execve(filename, argv, envp, regs);
6226 putname(filename);
6227 return error;
6228 }
6229@@ -728,18 +710,18 @@ void set_personality_64bit(void)
6230 /* inherit personality from parent */
6231
6232 /* Make sure to be in 64bit mode */
6233- clear_thread_flag(TIF_IA32);
6234+ clear_thread_flag(TIF_IA32);
6235
6236 /* TBD: overwrites user setup. Should have two bits.
6237 But 64bit processes have always behaved this way,
6238 so it's not too bad. The main problem is just that
6239- 32bit childs are affected again. */
6240+ 32bit childs are affected again. */
6241 current->personality &= ~READ_IMPLIES_EXEC;
6242 }
6243
6244 asmlinkage long sys_fork(struct pt_regs *regs)
6245 {
6246- return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
6247+ return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
6248 }
6249
6250 asmlinkage long
6251@@ -747,7 +729,7 @@ sys_clone(unsigned long clone_flags, uns
6252 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
6253 {
6254 if (!newsp)
6255- newsp = regs->rsp;
6256+ newsp = regs->sp;
6257 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
6258 }
6259
6260@@ -763,29 +745,29 @@ sys_clone(unsigned long clone_flags, uns
6261 */
6262 asmlinkage long sys_vfork(struct pt_regs *regs)
6263 {
6264- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
6265+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
6266 NULL, NULL);
6267 }
6268
6269 unsigned long get_wchan(struct task_struct *p)
6270 {
6271 unsigned long stack;
6272- u64 fp,rip;
6273+ u64 fp,ip;
6274 int count = 0;
6275
6276 if (!p || p == current || p->state==TASK_RUNNING)
6277 return 0;
6278 stack = (unsigned long)task_stack_page(p);
6279- if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
6280+ if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
6281 return 0;
6282- fp = *(u64 *)(p->thread.rsp);
6283+ fp = *(u64 *)(p->thread.sp);
6284 do {
6285 if (fp < (unsigned long)stack ||
6286 fp > (unsigned long)stack+THREAD_SIZE)
6287 return 0;
6288- rip = *(u64 *)(fp+8);
6289- if (!in_sched_functions(rip))
6290- return rip;
6291+ ip = *(u64 *)(fp+8);
6292+ if (!in_sched_functions(ip))
6293+ return ip;
6294 fp = *(u64 *)fp;
6295 } while (count++ < 16);
6296 return 0;
6297@@ -827,19 +809,19 @@ long do_arch_prctl(struct task_struct *t
6298 /* Not strictly needed for fs, but do it for symmetry
6299 with gs */
6300 if (addr >= TASK_SIZE_OF(task))
6301- return -EPERM;
6302+ return -EPERM;
6303 cpu = get_cpu();
6304- /* handle small bases via the GDT because that's faster to
6305+ /* handle small bases via the GDT because that's faster to
6306 switch. */
6307- if (addr <= 0xffffffff) {
6308+ if (addr <= 0xffffffff) {
6309 set_32bit_tls(task, FS_TLS, addr);
6310- if (doit) {
6311- load_TLS(&task->thread, cpu);
6312+ if (doit) {
6313+ load_TLS(&task->thread, cpu);
6314 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
6315 }
6316 task->thread.fsindex = FS_TLS_SEL;
6317 task->thread.fs = 0;
6318- } else {
6319+ } else {
6320 task->thread.fsindex = 0;
6321 task->thread.fs = addr;
6322 if (doit) {
6323@@ -852,24 +834,24 @@ long do_arch_prctl(struct task_struct *t
6324 }
6325 put_cpu();
6326 break;
6327- case ARCH_GET_FS: {
6328- unsigned long base;
6329+ case ARCH_GET_FS: {
6330+ unsigned long base;
6331 if (task->thread.fsindex == FS_TLS_SEL)
6332 base = read_32bit_tls(task, FS_TLS);
6333 else if (doit)
6334 rdmsrl(MSR_FS_BASE, base);
6335 else
6336 base = task->thread.fs;
6337- ret = put_user(base, (unsigned long __user *)addr);
6338- break;
6339+ ret = put_user(base, (unsigned long __user *)addr);
6340+ break;
6341 }
6342- case ARCH_GET_GS: {
6343+ case ARCH_GET_GS: {
6344 unsigned long base;
6345 unsigned gsindex;
6346 if (task->thread.gsindex == GS_TLS_SEL)
6347 base = read_32bit_tls(task, GS_TLS);
6348 else if (doit) {
6349- asm("movl %%gs,%0" : "=r" (gsindex));
6350+ asm("movl %%gs,%0" : "=r" (gsindex));
6351 if (gsindex)
6352 rdmsrl(MSR_KERNEL_GS_BASE, base);
6353 else
6354@@ -877,40 +859,21 @@ long do_arch_prctl(struct task_struct *t
6355 }
6356 else
6357 base = task->thread.gs;
6358- ret = put_user(base, (unsigned long __user *)addr);
6359+ ret = put_user(base, (unsigned long __user *)addr);
6360 break;
6361 }
6362
6363 default:
6364 ret = -EINVAL;
6365 break;
6366- }
6367+ }
6368
6369- return ret;
6370-}
6371+ return ret;
6372+}
6373
6374 long sys_arch_prctl(int code, unsigned long addr)
6375 {
6376 return do_arch_prctl(current, code, addr);
6377-}
6378-
6379-/*
6380- * Capture the user space registers if the task is not running (in user space)
6381- */
6382-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
6383-{
6384- struct pt_regs *pp, ptregs;
6385-
6386- pp = task_pt_regs(tsk);
6387-
6388- ptregs = *pp;
6389- ptregs.cs &= 0xffff;
6390- ptregs.ss &= 0xffff;
6391-
6392- elf_core_copy_regs(regs, &ptregs);
6393-
6394- boot_option_idle_override = 1;
6395- return 1;
6396 }
6397
6398 unsigned long arch_align_stack(unsigned long sp)
6399@@ -919,3 +882,9 @@ unsigned long arch_align_stack(unsigned
6400 sp -= get_random_int() % 8192;
6401 return sp & ~0xf;
6402 }
6403+
6404+unsigned long arch_randomize_brk(struct mm_struct *mm)
6405+{
6406+ unsigned long range_end = mm->brk + 0x02000000;
6407+ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
6408+}
6409--- a/arch/x86/kernel/quirks-xen.c
6410+++ b/arch/x86/kernel/quirks-xen.c
6411@@ -9,7 +9,7 @@
6412 static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
6413 {
6414 u8 config, rev;
6415- u32 word;
6416+ u16 word;
6417
6418 /* BIOS may enable hardware IRQ balancing for
6419 * E7520/E7320/E7525(revision ID 0x9 and below)
6420@@ -24,14 +24,17 @@ static void __devinit quirk_intel_irqbal
6421 pci_read_config_byte(dev, 0xf4, &config);
6422 pci_write_config_byte(dev, 0xf4, config|0x2);
6423
6424- /* read xTPR register */
6425- raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
6426+ /*
6427+ * read xTPR register. We may not have a pci_dev for device 8
6428+ * because it might be hidden until the above write.
6429+ */
6430+ pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);
6431
6432 if (!(word & (1 << 13))) {
6433 struct xen_platform_op op;
6434
6435- printk(KERN_INFO "Intel E7520/7320/7525 detected. "
6436- "Disabling irq balancing and affinity\n");
6437+ dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
6438+ "disabling irq balancing and affinity\n");
6439 op.cmd = XENPF_platform_quirk;
6440 op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
6441 WARN_ON(HYPERVISOR_platform_op(&op));
6442@@ -102,14 +105,16 @@ static void ich_force_enable_hpet(struct
6443 pci_read_config_dword(dev, 0xF0, &rcba);
6444 rcba &= 0xFFFFC000;
6445 if (rcba == 0) {
6446- printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n");
6447+ dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
6448+ "cannot force enable HPET\n");
6449 return;
6450 }
6451
6452 /* use bits 31:14, 16 kB aligned */
6453 rcba_base = ioremap_nocache(rcba, 0x4000);
6454 if (rcba_base == NULL) {
6455- printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n");
6456+ dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
6457+ "cannot force enable HPET\n");
6458 return;
6459 }
6460
6461@@ -120,8 +125,8 @@ static void ich_force_enable_hpet(struct
6462 /* HPET is enabled in HPTC. Just not reported by BIOS */
6463 val = val & 0x3;
6464 force_hpet_address = 0xFED00000 | (val << 12);
6465- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6466- force_hpet_address);
6467+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6468+ "0x%lx\n", force_hpet_address);
6469 iounmap(rcba_base);
6470 return;
6471 }
6472@@ -140,11 +145,12 @@ static void ich_force_enable_hpet(struct
6473 if (err) {
6474 force_hpet_address = 0;
6475 iounmap(rcba_base);
6476- printk(KERN_DEBUG "Failed to force enable HPET\n");
6477+ dev_printk(KERN_DEBUG, &dev->dev,
6478+ "Failed to force enable HPET\n");
6479 } else {
6480 force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
6481- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6482- force_hpet_address);
6483+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6484+ "0x%lx\n", force_hpet_address);
6485 }
6486 }
6487
6488@@ -160,6 +166,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
6489 ich_force_enable_hpet);
6490 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
6491 ich_force_enable_hpet);
6492+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
6493+ ich_force_enable_hpet);
6494
6495
6496 static struct pci_dev *cached_dev;
6497@@ -204,8 +212,8 @@ static void old_ich_force_enable_hpet(st
6498 if (val & 0x4) {
6499 val &= 0x3;
6500 force_hpet_address = 0xFED00000 | (val << 12);
6501- printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6502- force_hpet_address);
6503+ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6504+ force_hpet_address);
6505 return;
6506 }
6507
6508@@ -225,14 +233,14 @@ static void old_ich_force_enable_hpet(st
6509 /* HPET is enabled in HPTC. Just not reported by BIOS */
6510 val &= 0x3;
6511 force_hpet_address = 0xFED00000 | (val << 12);
6512- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6513- force_hpet_address);
6514+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6515+ "0x%lx\n", force_hpet_address);
6516 cached_dev = dev;
6517 force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
6518 return;
6519 }
6520
6521- printk(KERN_DEBUG "Failed to force enable HPET\n");
6522+ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6523 }
6524
6525 /*
6526@@ -290,8 +298,8 @@ static void vt8237_force_enable_hpet(str
6527 */
6528 if (val & 0x80) {
6529 force_hpet_address = (val & ~0x3ff);
6530- printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6531- force_hpet_address);
6532+ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6533+ force_hpet_address);
6534 return;
6535 }
6536
6537@@ -305,14 +313,14 @@ static void vt8237_force_enable_hpet(str
6538 pci_read_config_dword(dev, 0x68, &val);
6539 if (val & 0x80) {
6540 force_hpet_address = (val & ~0x3ff);
6541- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6542- force_hpet_address);
6543+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6544+ "0x%lx\n", force_hpet_address);
6545 cached_dev = dev;
6546 force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
6547 return;
6548 }
6549
6550- printk(KERN_DEBUG "Failed to force enable HPET\n");
6551+ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6552 }
6553
6554 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
6555@@ -340,7 +348,7 @@ static void nvidia_force_enable_hpet(str
6556 pci_read_config_dword(dev, 0x44, &val);
6557 force_hpet_address = val & 0xfffffffe;
6558 force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
6559- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6560+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
6561 force_hpet_address);
6562 cached_dev = dev;
6563 return;
6564@@ -353,6 +361,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6565 nvidia_force_enable_hpet);
6566
6567 /* LPC bridges */
6568+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
6569+ nvidia_force_enable_hpet);
6570 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
6571 nvidia_force_enable_hpet);
6572 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
6573@@ -373,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6574 void force_hpet_resume(void)
6575 {
6576 switch (force_hpet_resume_type) {
6577- case ICH_FORCE_HPET_RESUME:
6578- return ich_force_hpet_resume();
6579-
6580- case OLD_ICH_FORCE_HPET_RESUME:
6581- return old_ich_force_hpet_resume();
6582-
6583- case VT8237_FORCE_HPET_RESUME:
6584- return vt8237_force_hpet_resume();
6585-
6586- case NVIDIA_FORCE_HPET_RESUME:
6587- return nvidia_force_hpet_resume();
6588-
6589- default:
6590+ case ICH_FORCE_HPET_RESUME:
6591+ ich_force_hpet_resume();
6592+ return;
6593+ case OLD_ICH_FORCE_HPET_RESUME:
6594+ old_ich_force_hpet_resume();
6595+ return;
6596+ case VT8237_FORCE_HPET_RESUME:
6597+ vt8237_force_hpet_resume();
6598+ return;
6599+ case NVIDIA_FORCE_HPET_RESUME:
6600+ nvidia_force_hpet_resume();
6601+ return;
6602+ default:
6603 break;
6604 }
6605 }
6606--- a/arch/x86/kernel/rtc.c
6607+++ b/arch/x86/kernel/rtc.c
6608@@ -181,6 +181,10 @@ unsigned long read_persistent_clock(void
6609 {
6610 unsigned long retval, flags;
6611
6612+#ifdef CONFIG_XEN
6613+ if (!is_initial_xendomain())
6614+ return xen_read_persistent_clock();
6615+#endif
6616 spin_lock_irqsave(&rtc_lock, flags);
6617 retval = get_wallclock();
6618 spin_unlock_irqrestore(&rtc_lock, flags);
6619@@ -190,6 +194,10 @@ unsigned long read_persistent_clock(void
6620
6621 int update_persistent_clock(struct timespec now)
6622 {
6623+#ifdef CONFIG_XEN
6624+ if (xen_update_persistent_clock() < 0 || xen_independent_wallclock())
6625+ return 0;
6626+#endif
6627 return set_rtc_mmss(now.tv_sec);
6628 }
6629
6630--- a/arch/x86/kernel/setup_32-xen.c
6631+++ b/arch/x86/kernel/setup_32-xen.c
6632@@ -47,9 +47,12 @@
6633 #include <linux/crash_dump.h>
6634 #include <linux/dmi.h>
6635 #include <linux/pfn.h>
6636+#include <linux/pci.h>
6637+#include <linux/init_ohci1394_dma.h>
6638
6639 #include <video/edid.h>
6640
6641+#include <asm/mtrr.h>
6642 #include <asm/apic.h>
6643 #include <asm/e820.h>
6644 #include <asm/mpspec.h>
6645@@ -79,14 +82,83 @@ static struct notifier_block xen_panic_b
6646 xen_panic_event, NULL, 0 /* try to go last */
6647 };
6648
6649-int disable_pse __cpuinitdata = 0;
6650-
6651 /*
6652 * Machine setup..
6653 */
6654-extern struct resource code_resource;
6655-extern struct resource data_resource;
6656-extern struct resource bss_resource;
6657+static struct resource data_resource = {
6658+ .name = "Kernel data",
6659+ .start = 0,
6660+ .end = 0,
6661+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6662+};
6663+
6664+static struct resource code_resource = {
6665+ .name = "Kernel code",
6666+ .start = 0,
6667+ .end = 0,
6668+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6669+};
6670+
6671+static struct resource bss_resource = {
6672+ .name = "Kernel bss",
6673+ .start = 0,
6674+ .end = 0,
6675+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6676+};
6677+
6678+static struct resource video_ram_resource = {
6679+ .name = "Video RAM area",
6680+ .start = 0xa0000,
6681+ .end = 0xbffff,
6682+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6683+};
6684+
6685+static struct resource standard_io_resources[] = { {
6686+ .name = "dma1",
6687+ .start = 0x0000,
6688+ .end = 0x001f,
6689+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6690+}, {
6691+ .name = "pic1",
6692+ .start = 0x0020,
6693+ .end = 0x0021,
6694+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6695+}, {
6696+ .name = "timer0",
6697+ .start = 0x0040,
6698+ .end = 0x0043,
6699+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6700+}, {
6701+ .name = "timer1",
6702+ .start = 0x0050,
6703+ .end = 0x0053,
6704+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6705+}, {
6706+ .name = "keyboard",
6707+ .start = 0x0060,
6708+ .end = 0x006f,
6709+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6710+}, {
6711+ .name = "dma page reg",
6712+ .start = 0x0080,
6713+ .end = 0x008f,
6714+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6715+}, {
6716+ .name = "pic2",
6717+ .start = 0x00a0,
6718+ .end = 0x00a1,
6719+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6720+}, {
6721+ .name = "dma2",
6722+ .start = 0x00c0,
6723+ .end = 0x00df,
6724+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6725+}, {
6726+ .name = "fpu",
6727+ .start = 0x00f0,
6728+ .end = 0x00ff,
6729+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6730+} };
6731
6732 /* cpu data as detected by the assembly code in head.S */
6733 struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6734@@ -94,13 +166,16 @@ struct cpuinfo_x86 new_cpu_data __cpuini
6735 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6736 EXPORT_SYMBOL(boot_cpu_data);
6737
6738+#ifndef CONFIG_X86_PAE
6739 unsigned long mmu_cr4_features;
6740+#else
6741+unsigned long mmu_cr4_features = X86_CR4_PAE;
6742+#endif
6743
6744 /* for MCA, but anyone else can use it if they want */
6745 unsigned int machine_id;
6746 unsigned int machine_submodel_id;
6747 unsigned int BIOS_revision;
6748-unsigned int mca_pentium_flag;
6749
6750 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
6751 int bootloader_type;
6752@@ -131,13 +206,17 @@ extern int root_mountflags;
6753
6754 unsigned long saved_videomode;
6755
6756-#define RAMDISK_IMAGE_START_MASK 0x07FF
6757+#define RAMDISK_IMAGE_START_MASK 0x07FF
6758 #define RAMDISK_PROMPT_FLAG 0x8000
6759-#define RAMDISK_LOAD_FLAG 0x4000
6760+#define RAMDISK_LOAD_FLAG 0x4000
6761
6762 static char __initdata command_line[COMMAND_LINE_SIZE];
6763
6764+#ifndef CONFIG_DEBUG_BOOT_PARAMS
6765 struct boot_params __initdata boot_params;
6766+#else
6767+struct boot_params boot_params;
6768+#endif
6769
6770 /*
6771 * Point at the empty zero page to start with. We map the real shared_info
6772@@ -198,8 +277,7 @@ static int __init parse_mem(char *arg)
6773 return -EINVAL;
6774
6775 if (strcmp(arg, "nopentium") == 0) {
6776- clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6777- disable_pse = 1;
6778+ setup_clear_cpu_cap(X86_FEATURE_PSE);
6779 } else {
6780 /* If the user specifies memory size, we
6781 * limit the BIOS-provided memory map to
6782@@ -208,7 +286,7 @@ static int __init parse_mem(char *arg)
6783 * trim the existing memory map.
6784 */
6785 unsigned long long mem_size;
6786-
6787+
6788 mem_size = memparse(arg, &arg);
6789 limit_regions(mem_size);
6790 user_defined_memmap = 1;
6791@@ -350,7 +428,7 @@ static void __init reserve_ebda_region(v
6792 unsigned int addr;
6793 addr = get_bios_ebda();
6794 if (addr)
6795- reserve_bootmem(addr, PAGE_SIZE);
6796+ reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
6797 }
6798 #endif
6799
6800@@ -365,8 +443,6 @@ static unsigned long __init setup_memory
6801 min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
6802 xen_start_info->nr_pt_frames;
6803
6804- find_max_pfn();
6805-
6806 max_low_pfn = find_max_low_pfn();
6807
6808 #ifdef CONFIG_HIGHMEM
6809@@ -447,7 +523,8 @@ static void __init reserve_crashkernel(v
6810 (unsigned long)(total_mem >> 20));
6811 crashk_res.start = crash_base;
6812 crashk_res.end = crash_base + crash_size - 1;
6813- reserve_bootmem(crash_base, crash_size);
6814+ reserve_bootmem(crash_base, crash_size,
6815+ BOOTMEM_DEFAULT);
6816 } else
6817 printk(KERN_INFO "crashkernel reservation failed - "
6818 "you have to specify a base address\n");
6819@@ -461,6 +538,99 @@ static inline void __init reserve_crashk
6820 {}
6821 #endif
6822
6823+#ifdef CONFIG_BLK_DEV_INITRD
6824+
6825+static bool do_relocate_initrd = false;
6826+
6827+static void __init reserve_initrd(void)
6828+{
6829+ unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6830+ unsigned long ramdisk_size = xen_start_info->mod_len;
6831+ unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
6832+ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6833+ unsigned long ramdisk_here;
6834+
6835+ initrd_start = 0;
6836+
6837+ if (!xen_start_info->mod_start || !ramdisk_size)
6838+ return; /* No initrd provided by bootloader */
6839+
6840+ if (ramdisk_end < ramdisk_image) {
6841+ printk(KERN_ERR "initrd wraps around end of memory, "
6842+ "disabling initrd\n");
6843+ return;
6844+ }
6845+ if (ramdisk_size >= end_of_lowmem/2) {
6846+ printk(KERN_ERR "initrd too large to handle, "
6847+ "disabling initrd\n");
6848+ return;
6849+ }
6850+ if (ramdisk_end <= end_of_lowmem) {
6851+ /* All in lowmem, easy case */
6852+ reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
6853+ initrd_start = ramdisk_image + PAGE_OFFSET;
6854+ initrd_end = initrd_start+ramdisk_size;
6855+ return;
6856+ }
6857+
6858+ /* We need to move the initrd down into lowmem */
6859+ ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
6860+
6861+ /* Note: this includes all the lowmem currently occupied by
6862+ the initrd, we rely on that fact to keep the data intact. */
6863+ reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
6864+ initrd_start = ramdisk_here + PAGE_OFFSET;
6865+ initrd_end = initrd_start + ramdisk_size;
6866+
6867+ do_relocate_initrd = true;
6868+}
6869+
6870+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
6871+
6872+static void __init relocate_initrd(void)
6873+{
6874+ unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
6875+ unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
6876+ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6877+ unsigned long ramdisk_here;
6878+ unsigned long slop, clen, mapaddr;
6879+ char *p, *q;
6880+
6881+ if (!do_relocate_initrd)
6882+ return;
6883+
6884+ ramdisk_here = initrd_start - PAGE_OFFSET;
6885+
6886+ q = (char *)initrd_start;
6887+
6888+ /* Copy any lowmem portion of the initrd */
6889+ if (ramdisk_image < end_of_lowmem) {
6890+ clen = end_of_lowmem - ramdisk_image;
6891+ p = (char *)__va(ramdisk_image);
6892+ memcpy(q, p, clen);
6893+ q += clen;
6894+ ramdisk_image += clen;
6895+ ramdisk_size -= clen;
6896+ }
6897+
6898+ /* Copy the highmem portion of the initrd */
6899+ while (ramdisk_size) {
6900+ slop = ramdisk_image & ~PAGE_MASK;
6901+ clen = ramdisk_size;
6902+ if (clen > MAX_MAP_CHUNK-slop)
6903+ clen = MAX_MAP_CHUNK-slop;
6904+ mapaddr = ramdisk_image & PAGE_MASK;
6905+ p = early_ioremap(mapaddr, clen+slop);
6906+ memcpy(q, p+slop, clen);
6907+ early_iounmap(p, clen+slop);
6908+ q += clen;
6909+ ramdisk_image += clen;
6910+ ramdisk_size -= clen;
6911+ }
6912+}
6913+
6914+#endif /* CONFIG_BLK_DEV_INITRD */
6915+
6916 void __init setup_bootmem_allocator(void)
6917 {
6918 unsigned long bootmap_size;
6919@@ -478,14 +648,15 @@ void __init setup_bootmem_allocator(void
6920 * bootmem allocator with an invalid RAM area.
6921 */
6922 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
6923- bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
6924+ bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
6925+ BOOTMEM_DEFAULT);
6926
6927 #ifndef CONFIG_XEN
6928 /*
6929 * reserve physical page 0 - it's a special BIOS page on many boxes,
6930 * enabling clean reboots, SMP operation, laptop functions.
6931 */
6932- reserve_bootmem(0, PAGE_SIZE);
6933+ reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
6934
6935 /* reserve EBDA region, it's a 4K region */
6936 reserve_ebda_region();
6937@@ -495,7 +666,7 @@ void __init setup_bootmem_allocator(void
6938 unless you have no PS/2 mouse plugged in. */
6939 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
6940 boot_cpu_data.x86 == 6)
6941- reserve_bootmem(0xa0000 - 4096, 4096);
6942+ reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
6943
6944 #ifdef CONFIG_SMP
6945 /*
6946@@ -503,7 +674,7 @@ void __init setup_bootmem_allocator(void
6947 * FIXME: Don't need the extra page at 4K, but need to fix
6948 * trampoline before removing it. (see the GDT stuff)
6949 */
6950- reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
6951+ reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
6952 #endif
6953 #ifdef CONFIG_ACPI_SLEEP
6954 /*
6955@@ -511,29 +682,12 @@ void __init setup_bootmem_allocator(void
6956 */
6957 acpi_reserve_bootmem();
6958 #endif
6959- numa_kva_reserve();
6960 #endif /* !CONFIG_XEN */
6961
6962 #ifdef CONFIG_BLK_DEV_INITRD
6963- if (xen_start_info->mod_start) {
6964- unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6965- unsigned long ramdisk_size = xen_start_info->mod_len;
6966- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
6967- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6968-
6969- if (ramdisk_end <= end_of_lowmem) {
6970- /*reserve_bootmem(ramdisk_image, ramdisk_size);*/
6971- initrd_start = ramdisk_image + PAGE_OFFSET;
6972- initrd_end = initrd_start+ramdisk_size;
6973- initrd_below_start_ok = 1;
6974- } else {
6975- printk(KERN_ERR "initrd extends beyond end of memory "
6976- "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
6977- ramdisk_end, end_of_lowmem);
6978- initrd_start = 0;
6979- }
6980- }
6981+ reserve_initrd();
6982 #endif
6983+ numa_kva_reserve();
6984 reserve_crashkernel();
6985 }
6986
6987@@ -600,20 +754,14 @@ void __init setup_arch(char **cmdline_p)
6988 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
6989 pre_setup_arch_hook();
6990 early_cpu_init();
6991+ early_ioremap_init();
6992 #ifdef CONFIG_SMP
6993 prefill_possible_map();
6994 #endif
6995
6996- /*
6997- * FIXME: This isn't an official loader_type right
6998- * now but does currently work with elilo.
6999- * If we were configured as an EFI kernel, check to make
7000- * sure that we were loaded correctly from elilo and that
7001- * the system table is valid. If not, then initialize normally.
7002- */
7003 #ifdef CONFIG_EFI
7004- if ((boot_params.hdr.type_of_loader == 0x50) &&
7005- boot_params.efi_info.efi_systab)
7006+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
7007+ "EL32", 4))
7008 efi_enabled = 1;
7009 #endif
7010
7011@@ -653,12 +801,9 @@ void __init setup_arch(char **cmdline_p)
7012 #endif
7013
7014 ARCH_SETUP
7015- if (efi_enabled)
7016- efi_init();
7017- else {
7018- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7019- print_memory_map(memory_setup());
7020- }
7021+
7022+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7023+ print_memory_map(memory_setup());
7024
7025 copy_edd();
7026
7027@@ -691,6 +836,17 @@ void __init setup_arch(char **cmdline_p)
7028 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
7029 *cmdline_p = command_line;
7030
7031+ if (efi_enabled)
7032+ efi_init();
7033+
7034+ /* update e820 for memory not covered by WB MTRRs */
7035+ find_max_pfn();
7036+ mtrr_bp_init();
7037+#ifndef CONFIG_XEN
7038+ if (mtrr_trim_uncached_memory(max_pfn))
7039+ find_max_pfn();
7040+#endif
7041+
7042 max_low_pfn = setup_memory();
7043
7044 #ifdef CONFIG_VMI
7045@@ -715,6 +871,16 @@ void __init setup_arch(char **cmdline_p)
7046 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
7047 #endif
7048 paging_init();
7049+
7050+ /*
7051+ * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
7052+ */
7053+
7054+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7055+ if (init_ohci1394_dma_early)
7056+ init_ohci1394_dma_on_all_controllers();
7057+#endif
7058+
7059 remapped_pgdat_init();
7060 sparse_init();
7061 zone_sizes_init();
7062@@ -800,16 +966,20 @@ void __init setup_arch(char **cmdline_p)
7063 * NOTE: at this point the bootmem allocator is fully available.
7064 */
7065
7066+#ifdef CONFIG_BLK_DEV_INITRD
7067+ relocate_initrd();
7068+#endif
7069+
7070 paravirt_post_allocator_init();
7071
7072 if (is_initial_xendomain())
7073 dmi_scan_machine();
7074
7075+ io_delay_init();
7076+
7077 #ifdef CONFIG_X86_GENERICARCH
7078 generic_apic_probe();
7079-#endif
7080- if (efi_enabled)
7081- efi_map_memmap();
7082+#endif
7083
7084 set_iopl.iopl = 1;
7085 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
7086@@ -827,7 +997,7 @@ void __init setup_arch(char **cmdline_p)
7087 acpi_boot_table_init();
7088 #endif
7089
7090-#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7091+#ifndef CONFIG_XEN
7092 early_quirks();
7093 #endif
7094
7095@@ -873,3 +1043,30 @@ xen_panic_event(struct notifier_block *t
7096 /* we're never actually going to get here... */
7097 return NOTIFY_DONE;
7098 }
7099+
7100+/*
7101+ * Request address space for all standard resources
7102+ *
7103+ * This is called just before pcibios_init(), which is also a
7104+ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
7105+ */
7106+static int __init request_standard_resources(void)
7107+{
7108+ int i;
7109+
7110+ /* Nothing to do if not running in dom0. */
7111+ if (!is_initial_xendomain())
7112+ return 0;
7113+
7114+ printk(KERN_INFO "Setting up standard PCI resources\n");
7115+ init_iomem_resources(&code_resource, &data_resource, &bss_resource);
7116+
7117+ request_resource(&iomem_resource, &video_ram_resource);
7118+
7119+ /* request I/O space for devices used on all i[345]86 PCs */
7120+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7121+ request_resource(&ioport_resource, &standard_io_resources[i]);
7122+ return 0;
7123+}
7124+
7125+subsys_initcall(request_standard_resources);
7126--- a/arch/x86/kernel/setup_64-xen.c
7127+++ b/arch/x86/kernel/setup_64-xen.c
7128@@ -15,7 +15,6 @@
7129 #include <linux/ptrace.h>
7130 #include <linux/slab.h>
7131 #include <linux/user.h>
7132-#include <linux/a.out.h>
7133 #include <linux/screen_info.h>
7134 #include <linux/ioport.h>
7135 #include <linux/delay.h>
7136@@ -30,6 +29,7 @@
7137 #include <linux/crash_dump.h>
7138 #include <linux/root_dev.h>
7139 #include <linux/pci.h>
7140+#include <linux/efi.h>
7141 #include <linux/acpi.h>
7142 #include <linux/kallsyms.h>
7143 #include <linux/edd.h>
7144@@ -39,10 +39,13 @@
7145 #include <linux/dmi.h>
7146 #include <linux/dma-mapping.h>
7147 #include <linux/ctype.h>
7148+#include <linux/uaccess.h>
7149+#include <linux/init_ohci1394_dma.h>
7150
7151 #include <asm/mtrr.h>
7152 #include <asm/uaccess.h>
7153 #include <asm/system.h>
7154+#include <asm/vsyscall.h>
7155 #include <asm/io.h>
7156 #include <asm/smp.h>
7157 #include <asm/msr.h>
7158@@ -50,6 +53,7 @@
7159 #include <video/edid.h>
7160 #include <asm/e820.h>
7161 #include <asm/dma.h>
7162+#include <asm/gart.h>
7163 #include <asm/mpspec.h>
7164 #include <asm/mmu_context.h>
7165 #include <asm/proto.h>
7166@@ -59,6 +63,9 @@
7167 #include <asm/sections.h>
7168 #include <asm/dmi.h>
7169 #include <asm/cacheflush.h>
7170+#include <asm/mce.h>
7171+#include <asm/ds.h>
7172+#include <asm/topology.h>
7173 #ifdef CONFIG_XEN
7174 #include <linux/percpu.h>
7175 #include <xen/interface/physdev.h>
7176@@ -108,6 +115,8 @@ EXPORT_SYMBOL(xen_start_info);
7177 struct cpuinfo_x86 boot_cpu_data __read_mostly;
7178 EXPORT_SYMBOL(boot_cpu_data);
7179
7180+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
7181+
7182 unsigned long mmu_cr4_features;
7183
7184 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
7185@@ -117,7 +126,7 @@ unsigned long saved_video_mode;
7186
7187 int force_mwait __cpuinitdata;
7188
7189-/*
7190+/*
7191 * Early DMI memory
7192 */
7193 int dmi_alloc_index;
7194@@ -163,25 +172,27 @@ struct resource standard_io_resources[]
7195
7196 #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
7197
7198-struct resource data_resource = {
7199+static struct resource data_resource = {
7200 .name = "Kernel data",
7201 .start = 0,
7202 .end = 0,
7203 .flags = IORESOURCE_RAM,
7204 };
7205-struct resource code_resource = {
7206+static struct resource code_resource = {
7207 .name = "Kernel code",
7208 .start = 0,
7209 .end = 0,
7210 .flags = IORESOURCE_RAM,
7211 };
7212-struct resource bss_resource = {
7213+static struct resource bss_resource = {
7214 .name = "Kernel bss",
7215 .start = 0,
7216 .end = 0,
7217 .flags = IORESOURCE_RAM,
7218 };
7219
7220+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
7221+
7222 #ifdef CONFIG_PROC_VMCORE
7223 /* elfcorehdr= specifies the location of elf core header
7224 * stored by the crashed kernel. This option will be passed
7225@@ -205,9 +216,10 @@ contig_initmem_init(unsigned long start_
7226 unsigned long bootmap_size, bootmap;
7227
7228 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
7229- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
7230+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
7231+ PAGE_SIZE);
7232 if (bootmap == -1L)
7233- panic("Cannot find bootmem map of size %ld\n",bootmap_size);
7234+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
7235 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
7236 e820_register_active_regions(0, start_pfn, end_pfn);
7237 #ifdef CONFIG_XEN
7238@@ -215,8 +227,8 @@ contig_initmem_init(unsigned long start_
7239 #else
7240 free_bootmem_with_active_regions(0, end_pfn);
7241 #endif
7242- reserve_bootmem(bootmap, bootmap_size);
7243-}
7244+ reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
7245+}
7246 #endif
7247
7248 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
7249@@ -249,27 +261,35 @@ static inline void copy_edd(void)
7250 #ifndef CONFIG_XEN
7251 static void __init reserve_crashkernel(void)
7252 {
7253- unsigned long long free_mem;
7254+ unsigned long long total_mem;
7255 unsigned long long crash_size, crash_base;
7256 int ret;
7257
7258- free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7259+ total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7260
7261- ret = parse_crashkernel(boot_command_line, free_mem,
7262+ ret = parse_crashkernel(boot_command_line, total_mem,
7263 &crash_size, &crash_base);
7264 if (ret == 0 && crash_size) {
7265- if (crash_base > 0) {
7266- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7267- "for crashkernel (System RAM: %ldMB)\n",
7268- (unsigned long)(crash_size >> 20),
7269- (unsigned long)(crash_base >> 20),
7270- (unsigned long)(free_mem >> 20));
7271- crashk_res.start = crash_base;
7272- crashk_res.end = crash_base + crash_size - 1;
7273- reserve_bootmem(crash_base, crash_size);
7274- } else
7275+ if (crash_base <= 0) {
7276 printk(KERN_INFO "crashkernel reservation failed - "
7277 "you have to specify a base address\n");
7278+ return;
7279+ }
7280+
7281+ if (reserve_bootmem(crash_base, crash_size,
7282+ BOOTMEM_EXCLUSIVE) < 0) {
7283+ printk(KERN_INFO "crashkernel reservation failed - "
7284+ "memory is in use\n");
7285+ return;
7286+ }
7287+
7288+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7289+ "for crashkernel (System RAM: %ldMB)\n",
7290+ (unsigned long)(crash_size >> 20),
7291+ (unsigned long)(crash_base >> 20),
7292+ (unsigned long)(total_mem >> 20));
7293+ crashk_res.start = crash_base;
7294+ crashk_res.end = crash_base + crash_size - 1;
7295 }
7296 }
7297 #else
7298@@ -280,37 +300,21 @@ static inline void __init reserve_crashk
7299 {}
7300 #endif
7301
7302-#ifndef CONFIG_XEN
7303-#define EBDA_ADDR_POINTER 0x40E
7304-
7305-unsigned __initdata ebda_addr;
7306-unsigned __initdata ebda_size;
7307-
7308-static void discover_ebda(void)
7309+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
7310+void __attribute__((weak)) __init memory_setup(void)
7311 {
7312- /*
7313- * there is a real-mode segmented pointer pointing to the
7314- * 4K EBDA area at 0x40E
7315- */
7316- ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
7317- ebda_addr <<= 4;
7318-
7319- ebda_size = *(unsigned short *)__va(ebda_addr);
7320-
7321- /* Round EBDA up to pages */
7322- if (ebda_size == 0)
7323- ebda_size = 1;
7324- ebda_size <<= 10;
7325- ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
7326- if (ebda_size > 64*1024)
7327- ebda_size = 64*1024;
7328+ machine_specific_memory_setup();
7329 }
7330-#else
7331-#define discover_ebda() ((void)0)
7332-#endif
7333
7334+/*
7335+ * setup_arch - architecture-specific boot-time initializations
7336+ *
7337+ * Note: On x86_64, fixmaps are ready for use even before this is called.
7338+ */
7339 void __init setup_arch(char **cmdline_p)
7340 {
7341+ unsigned i;
7342+
7343 #ifdef CONFIG_XEN
7344 extern struct e820map machine_e820;
7345
7346@@ -319,6 +323,11 @@ void __init setup_arch(char **cmdline_p)
7347 /* Register a call for panic conditions. */
7348 atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
7349
7350+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7351+ VMASST_TYPE_writable_pagetables));
7352+
7353+ early_ioremap_init();
7354+
7355 ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
7356 screen_info = boot_params.screen_info;
7357
7358@@ -335,11 +344,6 @@ void __init setup_arch(char **cmdline_p)
7359 screen_info.orig_video_isVGA = 0;
7360
7361 copy_edid();
7362-
7363- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7364- VMASST_TYPE_writable_pagetables));
7365-
7366- ARCH_SETUP
7367 #else
7368 printk(KERN_INFO "Command line: %s\n", boot_command_line);
7369
7370@@ -355,7 +359,15 @@ void __init setup_arch(char **cmdline_p)
7371 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
7372 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
7373 #endif
7374- setup_memory_region();
7375+#ifdef CONFIG_EFI
7376+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
7377+ "EL64", 4))
7378+ efi_enabled = 1;
7379+#endif
7380+
7381+ ARCH_SETUP
7382+
7383+ memory_setup();
7384 copy_edd();
7385
7386 if (!boot_params.hdr.root_flags)
7387@@ -379,28 +391,51 @@ void __init setup_arch(char **cmdline_p)
7388
7389 parse_early_param();
7390
7391+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7392+ if (init_ohci1394_dma_early)
7393+ init_ohci1394_dma_on_all_controllers();
7394+#endif
7395+
7396 finish_e820_parsing();
7397
7398+ early_gart_iommu_check();
7399+
7400 e820_register_active_regions(0, 0, -1UL);
7401 /*
7402 * partially used pages are not usable - thus
7403 * we are rounding upwards:
7404 */
7405 end_pfn = e820_end_of_ram();
7406+ /* update e820 for memory not covered by WB MTRRs */
7407+ mtrr_bp_init();
7408+#ifndef CONFIG_XEN
7409+ if (mtrr_trim_uncached_memory(end_pfn)) {
7410+ e820_register_active_regions(0, 0, -1UL);
7411+ end_pfn = e820_end_of_ram();
7412+ }
7413+#endif
7414+
7415 num_physpages = end_pfn;
7416+ max_mapnr = end_pfn;
7417
7418 check_efer();
7419
7420- discover_ebda();
7421-
7422 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
7423+ if (efi_enabled)
7424+ efi_init();
7425
7426 if (is_initial_xendomain())
7427 dmi_scan_machine();
7428
7429+ io_delay_init();
7430+
7431 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
7432- /* setup to use the static apicid table during kernel startup */
7433- x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
7434+ /* setup to use the early static init tables during kernel startup */
7435+ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
7436+ x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
7437+#ifdef CONFIG_NUMA
7438+ x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
7439+#endif
7440 #endif
7441
7442 /* How many end-of-memory variables you have, grandma! */
7443@@ -419,54 +454,25 @@ void __init setup_arch(char **cmdline_p)
7444 #endif
7445
7446 #ifdef CONFIG_NUMA
7447- numa_initmem_init(0, end_pfn);
7448+ numa_initmem_init(0, end_pfn);
7449 #else
7450 contig_initmem_init(0, end_pfn);
7451 #endif
7452
7453-#ifdef CONFIG_XEN
7454- /*
7455- * Reserve kernel, physmap, start info, initial page tables, and
7456- * direct mapping.
7457- */
7458- reserve_bootmem_generic(__pa_symbol(&_text),
7459- (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
7460-#else
7461- /* Reserve direct mapping */
7462- reserve_bootmem_generic(table_start << PAGE_SHIFT,
7463- (table_end - table_start) << PAGE_SHIFT);
7464-
7465- /* reserve kernel */
7466- reserve_bootmem_generic(__pa_symbol(&_text),
7467- __pa_symbol(&_end) - __pa_symbol(&_text));
7468+ early_res_to_bootmem();
7469
7470+#ifndef CONFIG_XEN
7471+#ifdef CONFIG_ACPI_SLEEP
7472 /*
7473- * reserve physical page 0 - it's a special BIOS page on many boxes,
7474- * enabling clean reboots, SMP operation, laptop functions.
7475+ * Reserve low memory region for sleep support.
7476 */
7477- reserve_bootmem_generic(0, PAGE_SIZE);
7478-
7479- /* reserve ebda region */
7480- if (ebda_addr)
7481- reserve_bootmem_generic(ebda_addr, ebda_size);
7482-#ifdef CONFIG_NUMA
7483- /* reserve nodemap region */
7484- if (nodemap_addr)
7485- reserve_bootmem_generic(nodemap_addr, nodemap_size);
7486+ acpi_reserve_bootmem();
7487 #endif
7488
7489-#ifdef CONFIG_SMP
7490- /* Reserve SMP trampoline */
7491- reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
7492-#endif
7493+ if (efi_enabled)
7494+ efi_reserve_bootmem();
7495 #endif
7496
7497-#ifdef CONFIG_ACPI_SLEEP
7498- /*
7499- * Reserve low memory region for sleep support.
7500- */
7501- acpi_reserve_bootmem();
7502-#endif
7503 #ifdef CONFIG_BLK_DEV_INITRD
7504 #ifdef CONFIG_XEN
7505 if (xen_start_info->mod_start) {
7506@@ -490,6 +496,8 @@ void __init setup_arch(char **cmdline_p)
7507 initrd_below_start_ok = 1;
7508 #endif
7509 } else {
7510+ /* Assumes everything on node 0 */
7511+ free_bootmem(ramdisk_image, ramdisk_size);
7512 printk(KERN_ERR "initrd extends beyond end of memory "
7513 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
7514 ramdisk_end, end_of_mem);
7515@@ -499,10 +507,11 @@ void __init setup_arch(char **cmdline_p)
7516 #endif
7517 reserve_crashkernel();
7518 paging_init();
7519+ map_vsyscall();
7520 #ifdef CONFIG_X86_LOCAL_APIC
7521 /*
7522- * Find and reserve possible boot-time SMP configuration:
7523- */
7524+ * Find and reserve possible boot-time SMP configuration:
7525+ */
7526 find_smp_config();
7527 #endif
7528 #ifdef CONFIG_XEN
7529@@ -590,16 +599,10 @@ void __init setup_arch(char **cmdline_p)
7530 #endif
7531 #endif
7532
7533-#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7534+#ifndef CONFIG_XEN
7535 early_quirks();
7536 #endif
7537
7538- /*
7539- * set this early, so we dont allocate cpu0
7540- * if MADT list doesnt list BSP first
7541- * mpparse.c/MP_processor_info() allocates logical cpu numbers.
7542- */
7543- cpu_set(0, cpu_present_map);
7544 #ifdef CONFIG_ACPI
7545 /*
7546 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
7547@@ -623,6 +626,7 @@ void __init setup_arch(char **cmdline_p)
7548 get_smp_config();
7549 #ifndef CONFIG_XEN
7550 init_apic_mappings();
7551+ ioapic_init_mappings();
7552 #endif
7553 #endif
7554 #if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
7555@@ -634,18 +638,17 @@ void __init setup_arch(char **cmdline_p)
7556 */
7557 #ifdef CONFIG_XEN
7558 if (is_initial_xendomain())
7559- e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
7560+ e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
7561+ &code_resource, &data_resource, &bss_resource);
7562 #else
7563- e820_reserve_resources(e820.map, e820.nr_map);
7564+ e820_reserve_resources(e820.map, e820.nr_map,
7565+ &code_resource, &data_resource, &bss_resource);
7566 e820_mark_nosave_regions();
7567 #endif
7568
7569- {
7570- unsigned i;
7571 /* request I/O space for devices used on all i[345]86 PCs */
7572 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7573 request_resource(&ioport_resource, &standard_io_resources[i]);
7574- }
7575
7576 #ifdef CONFIG_XEN
7577 if (is_initial_xendomain())
7578@@ -679,7 +682,8 @@ void __init setup_arch(char **cmdline_p)
7579
7580 #ifdef CONFIG_VT
7581 #if defined(CONFIG_VGA_CONSOLE)
7582- conswitchp = &vga_con;
7583+ if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
7584+ conswitchp = &vga_con;
7585 #elif defined(CONFIG_DUMMY_CONSOLE)
7586 conswitchp = &dummy_con;
7587 #endif
7588@@ -723,9 +727,10 @@ static void __cpuinit display_cacheinfo(
7589
7590 if (n >= 0x80000005) {
7591 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
7592- printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
7593- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7594- c->x86_cache_size=(ecx>>24)+(edx>>24);
7595+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
7596+ "D cache %dK (%d bytes/line)\n",
7597+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7598+ c->x86_cache_size = (ecx>>24) + (edx>>24);
7599 /* On K8 L1 TLB is inclusive, so don't count it */
7600 c->x86_tlbsize = 0;
7601 }
7602@@ -739,27 +744,25 @@ static void __cpuinit display_cacheinfo(
7603 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
7604 c->x86_cache_size, ecx & 0xFF);
7605 }
7606-
7607- if (n >= 0x80000007)
7608- cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
7609 if (n >= 0x80000008) {
7610- cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7611+ cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7612 c->x86_virt_bits = (eax >> 8) & 0xff;
7613 c->x86_phys_bits = eax & 0xff;
7614 }
7615 }
7616
7617 #ifdef CONFIG_NUMA
7618-static int nearby_node(int apicid)
7619+static int __cpuinit nearby_node(int apicid)
7620 {
7621- int i;
7622+ int i, node;
7623+
7624 for (i = apicid - 1; i >= 0; i--) {
7625- int node = apicid_to_node[i];
7626+ node = apicid_to_node[i];
7627 if (node != NUMA_NO_NODE && node_online(node))
7628 return node;
7629 }
7630 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
7631- int node = apicid_to_node[i];
7632+ node = apicid_to_node[i];
7633 if (node != NUMA_NO_NODE && node_online(node))
7634 return node;
7635 }
7636@@ -771,7 +774,7 @@ static int nearby_node(int apicid)
7637 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
7638 * Assumes number of cores is a power of two.
7639 */
7640-static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
7641+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
7642 {
7643 #ifdef CONFIG_SMP
7644 unsigned bits;
7645@@ -780,7 +783,54 @@ static void __init amd_detect_cmp(struct
7646 int node = 0;
7647 unsigned apicid = hard_smp_processor_id();
7648 #endif
7649- unsigned ecx = cpuid_ecx(0x80000008);
7650+ bits = c->x86_coreid_bits;
7651+
7652+ /* Low order bits define the core id (index of core in socket) */
7653+ c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7654+ /* Convert the APIC ID into the socket ID */
7655+ c->phys_proc_id = phys_pkg_id(bits);
7656+
7657+#ifdef CONFIG_NUMA
7658+ node = c->phys_proc_id;
7659+ if (apicid_to_node[apicid] != NUMA_NO_NODE)
7660+ node = apicid_to_node[apicid];
7661+ if (!node_online(node)) {
7662+ /* Two possibilities here:
7663+ - The CPU is missing memory and no node was created.
7664+ In that case try picking one from a nearby CPU
7665+ - The APIC IDs differ from the HyperTransport node IDs
7666+ which the K8 northbridge parsing fills in.
7667+ Assume they are all increased by a constant offset,
7668+ but in the same order as the HT nodeids.
7669+ If that doesn't result in a usable node fall back to the
7670+ path for the previous case. */
7671+
7672+ int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7673+
7674+ if (ht_nodeid >= 0 &&
7675+ apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7676+ node = apicid_to_node[ht_nodeid];
7677+ /* Pick a nearby node */
7678+ if (!node_online(node))
7679+ node = nearby_node(apicid);
7680+ }
7681+ numa_set_node(cpu, node);
7682+
7683+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7684+#endif
7685+#endif
7686+}
7687+
7688+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
7689+{
7690+#ifdef CONFIG_SMP
7691+ unsigned bits, ecx;
7692+
7693+ /* Multi core CPU? */
7694+ if (c->extended_cpuid_level < 0x80000008)
7695+ return;
7696+
7697+ ecx = cpuid_ecx(0x80000008);
7698
7699 c->x86_max_cores = (ecx & 0xff) + 1;
7700
7701@@ -793,37 +843,8 @@ static void __init amd_detect_cmp(struct
7702 bits++;
7703 }
7704
7705- /* Low order bits define the core id (index of core in socket) */
7706- c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7707- /* Convert the APIC ID into the socket ID */
7708- c->phys_proc_id = phys_pkg_id(bits);
7709-
7710-#ifdef CONFIG_NUMA
7711- node = c->phys_proc_id;
7712- if (apicid_to_node[apicid] != NUMA_NO_NODE)
7713- node = apicid_to_node[apicid];
7714- if (!node_online(node)) {
7715- /* Two possibilities here:
7716- - The CPU is missing memory and no node was created.
7717- In that case try picking one from a nearby CPU
7718- - The APIC IDs differ from the HyperTransport node IDs
7719- which the K8 northbridge parsing fills in.
7720- Assume they are all increased by a constant offset,
7721- but in the same order as the HT nodeids.
7722- If that doesn't result in a usable node fall back to the
7723- path for the previous case. */
7724- int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7725- if (ht_nodeid >= 0 &&
7726- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7727- node = apicid_to_node[ht_nodeid];
7728- /* Pick a nearby node */
7729- if (!node_online(node))
7730- node = nearby_node(apicid);
7731- }
7732- numa_set_node(cpu, node);
7733+ c->x86_coreid_bits = bits;
7734
7735- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7736-#endif
7737 #endif
7738 }
7739
7740@@ -840,8 +861,8 @@ static void __init amd_detect_cmp(struct
7741 /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
7742 static __cpuinit int amd_apic_timer_broken(void)
7743 {
7744- u32 lo, hi;
7745- u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7746+ u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7747+
7748 switch (eax & CPUID_XFAM) {
7749 case CPUID_XFAM_K8:
7750 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
7751@@ -860,6 +881,15 @@ static __cpuinit int amd_apic_timer_brok
7752 }
7753 #endif
7754
7755+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
7756+{
7757+ early_init_amd_mc(c);
7758+
7759+ /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7760+ if (c->x86_power & (1<<8))
7761+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7762+}
7763+
7764 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
7765 {
7766 unsigned level;
7767@@ -870,7 +900,7 @@ static void __cpuinit init_amd(struct cp
7768 /*
7769 * Disable TLB flush filter by setting HWCR.FFDIS on K8
7770 * bit 6 of msr C001_0015
7771- *
7772+ *
7773 * Errata 63 for SH-B3 steppings
7774 * Errata 122 for all steppings (F+ have it disabled by default)
7775 */
7776@@ -883,35 +913,32 @@ static void __cpuinit init_amd(struct cp
7777
7778 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
7779 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
7780- clear_bit(0*32+31, &c->x86_capability);
7781-
7782+ clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
7783+
7784 /* On C+ stepping K8 rep microcode works well for copy/memset */
7785 level = cpuid_eax(1);
7786- if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
7787- set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7788+ if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
7789+ level >= 0x0f58))
7790+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7791 if (c->x86 == 0x10 || c->x86 == 0x11)
7792- set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7793+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7794
7795 /* Enable workaround for FXSAVE leak */
7796 if (c->x86 >= 6)
7797- set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
7798+ set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
7799
7800 level = get_model_name(c);
7801 if (!level) {
7802- switch (c->x86) {
7803+ switch (c->x86) {
7804 case 15:
7805 /* Should distinguish Models here, but this is only
7806 a fallback anyways. */
7807 strcpy(c->x86_model_id, "Hammer");
7808- break;
7809- }
7810- }
7811+ break;
7812+ }
7813+ }
7814 display_cacheinfo(c);
7815
7816- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7817- if (c->x86_power & (1<<8))
7818- set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7819-
7820 /* Multi core CPU? */
7821 if (c->extended_cpuid_level >= 0x80000008)
7822 amd_detect_cmp(c);
7823@@ -923,14 +950,10 @@ static void __cpuinit init_amd(struct cp
7824 num_cache_leaves = 3;
7825
7826 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
7827- set_bit(X86_FEATURE_K8, &c->x86_capability);
7828-
7829- /* RDTSC can be speculated around */
7830- clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7831+ set_cpu_cap(c, X86_FEATURE_K8);
7832
7833- /* Family 10 doesn't support C states in MWAIT so don't use it */
7834- if (c->x86 == 0x10 && !force_mwait)
7835- clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
7836+ /* MFENCE stops RDTSC speculation */
7837+ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
7838
7839 #ifndef CONFIG_XEN
7840 if (amd_apic_timer_broken())
7841@@ -938,28 +961,29 @@ static void __cpuinit init_amd(struct cp
7842 #endif
7843 }
7844
7845-static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7846+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7847 {
7848 #ifdef CONFIG_SMP
7849- u32 eax, ebx, ecx, edx;
7850- int index_msb, core_bits;
7851+ u32 eax, ebx, ecx, edx;
7852+ int index_msb, core_bits;
7853
7854 cpuid(1, &eax, &ebx, &ecx, &edx);
7855
7856
7857 if (!cpu_has(c, X86_FEATURE_HT))
7858 return;
7859- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7860+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7861 goto out;
7862
7863 smp_num_siblings = (ebx & 0xff0000) >> 16;
7864
7865 if (smp_num_siblings == 1) {
7866 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
7867- } else if (smp_num_siblings > 1 ) {
7868+ } else if (smp_num_siblings > 1) {
7869
7870 if (smp_num_siblings > NR_CPUS) {
7871- printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
7872+ printk(KERN_WARNING "CPU: Unsupported number of "
7873+ "siblings %d", smp_num_siblings);
7874 smp_num_siblings = 1;
7875 return;
7876 }
7877@@ -969,7 +993,7 @@ static void __cpuinit detect_ht(struct c
7878
7879 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
7880
7881- index_msb = get_count_order(smp_num_siblings) ;
7882+ index_msb = get_count_order(smp_num_siblings);
7883
7884 core_bits = get_count_order(c->x86_max_cores);
7885
7886@@ -978,8 +1002,10 @@ static void __cpuinit detect_ht(struct c
7887 }
7888 out:
7889 if ((c->x86_max_cores * smp_num_siblings) > 1) {
7890- printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
7891- printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
7892+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
7893+ c->phys_proc_id);
7894+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
7895+ c->cpu_core_id);
7896 }
7897
7898 #endif
7899@@ -1003,7 +1029,7 @@ static int __cpuinit intel_num_cpu_cores
7900 return 1;
7901 }
7902
7903-static void srat_detect_node(void)
7904+static void __cpuinit srat_detect_node(void)
7905 {
7906 #ifdef CONFIG_NUMA
7907 unsigned node;
7908@@ -1013,7 +1039,7 @@ static void srat_detect_node(void)
7909 /* Don't do the funky fallback heuristics the AMD version employs
7910 for now. */
7911 node = apicid_to_node[apicid];
7912- if (node == NUMA_NO_NODE)
7913+ if (node == NUMA_NO_NODE || !node_online(node))
7914 node = first_node(node_online_map);
7915 numa_set_node(cpu, node);
7916
7917@@ -1021,28 +1047,39 @@ static void srat_detect_node(void)
7918 #endif
7919 }
7920
7921+static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
7922+{
7923+ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7924+ (c->x86 == 0x6 && c->x86_model >= 0x0e))
7925+ set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7926+}
7927+
7928 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
7929 {
7930 /* Cache sizes */
7931 unsigned n;
7932
7933 init_intel_cacheinfo(c);
7934- if (c->cpuid_level > 9 ) {
7935+ if (c->cpuid_level > 9) {
7936 unsigned eax = cpuid_eax(10);
7937 /* Check for version and the number of counters */
7938 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
7939- set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
7940+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
7941 }
7942
7943 if (cpu_has_ds) {
7944 unsigned int l1, l2;
7945 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
7946 if (!(l1 & (1<<11)))
7947- set_bit(X86_FEATURE_BTS, c->x86_capability);
7948+ set_cpu_cap(c, X86_FEATURE_BTS);
7949 if (!(l1 & (1<<12)))
7950- set_bit(X86_FEATURE_PEBS, c->x86_capability);
7951+ set_cpu_cap(c, X86_FEATURE_PEBS);
7952 }
7953
7954+
7955+ if (cpu_has_bts)
7956+ ds_init_intel(c);
7957+
7958 n = c->extended_cpuid_level;
7959 if (n >= 0x80000008) {
7960 unsigned eax = cpuid_eax(0x80000008);
7961@@ -1059,14 +1096,11 @@ static void __cpuinit init_intel(struct
7962 c->x86_cache_alignment = c->x86_clflush_size * 2;
7963 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7964 (c->x86 == 0x6 && c->x86_model >= 0x0e))
7965- set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7966+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7967 if (c->x86 == 6)
7968- set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7969- if (c->x86 == 15)
7970- set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7971- else
7972- clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7973- c->x86_max_cores = intel_num_cpu_cores(c);
7974+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7975+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
7976+ c->x86_max_cores = intel_num_cpu_cores(c);
7977
7978 srat_detect_node();
7979 }
7980@@ -1083,18 +1117,12 @@ static void __cpuinit get_cpu_vendor(str
7981 c->x86_vendor = X86_VENDOR_UNKNOWN;
7982 }
7983
7984-struct cpu_model_info {
7985- int vendor;
7986- int family;
7987- char *model_names[16];
7988-};
7989-
7990 /* Do some early cpuid on the boot CPU to get some parameter that are
7991 needed before check_bugs. Everything advanced is in identify_cpu
7992 below. */
7993-void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7994+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7995 {
7996- u32 tfms;
7997+ u32 tfms, xlvl;
7998
7999 c->loops_per_jiffy = loops_per_jiffy;
8000 c->x86_cache_size = -1;
8001@@ -1105,6 +1133,7 @@ void __cpuinit early_identify_cpu(struct
8002 c->x86_clflush_size = 64;
8003 c->x86_cache_alignment = c->x86_clflush_size;
8004 c->x86_max_cores = 1;
8005+ c->x86_coreid_bits = 0;
8006 c->extended_cpuid_level = 0;
8007 memset(&c->x86_capability, 0, sizeof c->x86_capability);
8008
8009@@ -1113,7 +1142,7 @@ void __cpuinit early_identify_cpu(struct
8010 (unsigned int *)&c->x86_vendor_id[0],
8011 (unsigned int *)&c->x86_vendor_id[8],
8012 (unsigned int *)&c->x86_vendor_id[4]);
8013-
8014+
8015 get_cpu_vendor(c);
8016
8017 /* Initialize the standard set of capabilities */
8018@@ -1131,7 +1160,7 @@ void __cpuinit early_identify_cpu(struct
8019 c->x86 += (tfms >> 20) & 0xff;
8020 if (c->x86 >= 0x6)
8021 c->x86_model += ((tfms >> 16) & 0xF) << 4;
8022- if (c->x86_capability[0] & (1<<19))
8023+ if (c->x86_capability[0] & (1<<19))
8024 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
8025 } else {
8026 /* Have CPUID level 0 only - unheard of */
8027@@ -1141,18 +1170,6 @@ void __cpuinit early_identify_cpu(struct
8028 #ifdef CONFIG_SMP
8029 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
8030 #endif
8031-}
8032-
8033-/*
8034- * This does the hard work of actually picking apart the CPU stuff...
8035- */
8036-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8037-{
8038- int i;
8039- u32 xlvl;
8040-
8041- early_identify_cpu(c);
8042-
8043 /* AMD-defined flags: level 0x80000001 */
8044 xlvl = cpuid_eax(0x80000000);
8045 c->extended_cpuid_level = xlvl;
8046@@ -1173,6 +1190,30 @@ void __cpuinit identify_cpu(struct cpuin
8047 c->x86_capability[2] = cpuid_edx(0x80860001);
8048 }
8049
8050+ c->extended_cpuid_level = cpuid_eax(0x80000000);
8051+ if (c->extended_cpuid_level >= 0x80000007)
8052+ c->x86_power = cpuid_edx(0x80000007);
8053+
8054+ switch (c->x86_vendor) {
8055+ case X86_VENDOR_AMD:
8056+ early_init_amd(c);
8057+ break;
8058+ case X86_VENDOR_INTEL:
8059+ early_init_intel(c);
8060+ break;
8061+ }
8062+
8063+}
8064+
8065+/*
8066+ * This does the hard work of actually picking apart the CPU stuff...
8067+ */
8068+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8069+{
8070+ int i;
8071+
8072+ early_identify_cpu(c);
8073+
8074 init_scattered_cpuid_features(c);
8075
8076 c->apicid = phys_pkg_id(0);
8077@@ -1202,8 +1243,7 @@ void __cpuinit identify_cpu(struct cpuin
8078 break;
8079 }
8080
8081- select_idle_routine(c);
8082- detect_ht(c);
8083+ detect_ht(c);
8084
8085 /*
8086 * On SMP, boot_cpu_data holds the common feature set between
8087@@ -1213,31 +1253,55 @@ void __cpuinit identify_cpu(struct cpuin
8088 */
8089 if (c != &boot_cpu_data) {
8090 /* AND the already accumulated flags with these */
8091- for (i = 0 ; i < NCAPINTS ; i++)
8092+ for (i = 0; i < NCAPINTS; i++)
8093 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
8094 }
8095
8096+ /* Clear all flags overriden by options */
8097+ for (i = 0; i < NCAPINTS; i++)
8098+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
8099+
8100 #ifdef CONFIG_X86_MCE
8101 mcheck_init(c);
8102 #endif
8103+ select_idle_routine(c);
8104+
8105 if (c != &boot_cpu_data)
8106 mtrr_ap_init();
8107 #ifdef CONFIG_NUMA
8108 numa_add_cpu(smp_processor_id());
8109 #endif
8110+
8111 }
8112-
8113+
8114+static __init int setup_noclflush(char *arg)
8115+{
8116+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
8117+ return 1;
8118+}
8119+__setup("noclflush", setup_noclflush);
8120
8121 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
8122 {
8123 if (c->x86_model_id[0])
8124- printk("%s", c->x86_model_id);
8125+ printk(KERN_CONT "%s", c->x86_model_id);
8126+
8127+ if (c->x86_mask || c->cpuid_level >= 0)
8128+ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
8129+ else
8130+ printk(KERN_CONT "\n");
8131+}
8132
8133- if (c->x86_mask || c->cpuid_level >= 0)
8134- printk(" stepping %02x\n", c->x86_mask);
8135+static __init int setup_disablecpuid(char *arg)
8136+{
8137+ int bit;
8138+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
8139+ setup_clear_cpu_cap(bit);
8140 else
8141- printk("\n");
8142+ return 0;
8143+ return 1;
8144 }
8145+__setup("clearcpuid=", setup_disablecpuid);
8146
8147 /*
8148 * Get CPU information for use by the procfs.
8149@@ -1246,116 +1310,41 @@ void __cpuinit print_cpu_info(struct cpu
8150 static int show_cpuinfo(struct seq_file *m, void *v)
8151 {
8152 struct cpuinfo_x86 *c = v;
8153- int cpu = 0;
8154-
8155- /*
8156- * These flag bits must match the definitions in <asm/cpufeature.h>.
8157- * NULL means this bit is undefined or reserved; either way it doesn't
8158- * have meaning as far as Linux is concerned. Note that it's important
8159- * to realize there is a difference between this table and CPUID -- if
8160- * applications want to get the raw CPUID data, they should access
8161- * /dev/cpu/<cpu_nr>/cpuid instead.
8162- */
8163- static const char *const x86_cap_flags[] = {
8164- /* Intel-defined */
8165- "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
8166- "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
8167- "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
8168- "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
8169-
8170- /* AMD-defined */
8171- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8172- NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
8173- NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
8174- NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
8175- "3dnowext", "3dnow",
8176-
8177- /* Transmeta-defined */
8178- "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
8179- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8180- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8181- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8182-
8183- /* Other (Linux-defined) */
8184- "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
8185- NULL, NULL, NULL, NULL,
8186- "constant_tsc", "up", NULL, "arch_perfmon",
8187- "pebs", "bts", NULL, "sync_rdtsc",
8188- "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8189- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8190-
8191- /* Intel-defined (#2) */
8192- "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
8193- "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
8194- NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
8195- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8196-
8197- /* VIA/Cyrix/Centaur-defined */
8198- NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
8199- "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
8200- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8201- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8202-
8203- /* AMD-defined (#2) */
8204- "lahf_lm", "cmp_legacy", "svm", "extapic",
8205- "cr8_legacy", "abm", "sse4a", "misalignsse",
8206- "3dnowprefetch", "osvw", "ibs", "sse5",
8207- "skinit", "wdt", NULL, NULL,
8208- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8209- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8210-
8211- /* Auxiliary (Linux-defined) */
8212- "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8213- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8214- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8215- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8216- };
8217- static const char *const x86_power_flags[] = {
8218- "ts", /* temperature sensor */
8219- "fid", /* frequency id control */
8220- "vid", /* voltage id control */
8221- "ttp", /* thermal trip */
8222- "tm",
8223- "stc",
8224- "100mhzsteps",
8225- "hwpstate",
8226- "", /* tsc invariant mapped to constant_tsc */
8227- /* nothing */
8228- };
8229-
8230+ int cpu = 0, i;
8231
8232 #ifdef CONFIG_SMP
8233 cpu = c->cpu_index;
8234 #endif
8235
8236- seq_printf(m,"processor\t: %u\n"
8237- "vendor_id\t: %s\n"
8238- "cpu family\t: %d\n"
8239- "model\t\t: %d\n"
8240- "model name\t: %s\n",
8241- (unsigned)cpu,
8242- c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8243- c->x86,
8244- (int)c->x86_model,
8245- c->x86_model_id[0] ? c->x86_model_id : "unknown");
8246-
8247+ seq_printf(m, "processor\t: %u\n"
8248+ "vendor_id\t: %s\n"
8249+ "cpu family\t: %d\n"
8250+ "model\t\t: %d\n"
8251+ "model name\t: %s\n",
8252+ (unsigned)cpu,
8253+ c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8254+ c->x86,
8255+ (int)c->x86_model,
8256+ c->x86_model_id[0] ? c->x86_model_id : "unknown");
8257+
8258 if (c->x86_mask || c->cpuid_level >= 0)
8259 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
8260 else
8261 seq_printf(m, "stepping\t: unknown\n");
8262-
8263- if (cpu_has(c,X86_FEATURE_TSC)) {
8264+
8265+ if (cpu_has(c, X86_FEATURE_TSC)) {
8266 unsigned int freq = cpufreq_quick_get((unsigned)cpu);
8267+
8268 if (!freq)
8269 freq = cpu_khz;
8270 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
8271- freq / 1000, (freq % 1000));
8272+ freq / 1000, (freq % 1000));
8273 }
8274
8275 /* Cache size */
8276- if (c->x86_cache_size >= 0)
8277+ if (c->x86_cache_size >= 0)
8278 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
8279-
8280+
8281 #ifdef CONFIG_SMP
8282 if (smp_num_siblings * c->x86_max_cores > 1) {
8283 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
8284@@ -1364,48 +1353,43 @@ static int show_cpuinfo(struct seq_file
8285 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
8286 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
8287 }
8288-#endif
8289+#endif
8290
8291 seq_printf(m,
8292- "fpu\t\t: yes\n"
8293- "fpu_exception\t: yes\n"
8294- "cpuid level\t: %d\n"
8295- "wp\t\t: yes\n"
8296- "flags\t\t:",
8297+ "fpu\t\t: yes\n"
8298+ "fpu_exception\t: yes\n"
8299+ "cpuid level\t: %d\n"
8300+ "wp\t\t: yes\n"
8301+ "flags\t\t:",
8302 c->cpuid_level);
8303
8304- {
8305- int i;
8306- for ( i = 0 ; i < 32*NCAPINTS ; i++ )
8307- if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8308- seq_printf(m, " %s", x86_cap_flags[i]);
8309- }
8310-
8311+ for (i = 0; i < 32*NCAPINTS; i++)
8312+ if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8313+ seq_printf(m, " %s", x86_cap_flags[i]);
8314+
8315 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
8316 c->loops_per_jiffy/(500000/HZ),
8317 (c->loops_per_jiffy/(5000/HZ)) % 100);
8318
8319- if (c->x86_tlbsize > 0)
8320+ if (c->x86_tlbsize > 0)
8321 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
8322 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
8323 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
8324
8325- seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8326+ seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8327 c->x86_phys_bits, c->x86_virt_bits);
8328
8329 seq_printf(m, "power management:");
8330- {
8331- unsigned i;
8332- for (i = 0; i < 32; i++)
8333- if (c->x86_power & (1 << i)) {
8334- if (i < ARRAY_SIZE(x86_power_flags) &&
8335- x86_power_flags[i])
8336- seq_printf(m, "%s%s",
8337- x86_power_flags[i][0]?" ":"",
8338- x86_power_flags[i]);
8339- else
8340- seq_printf(m, " [%d]", i);
8341- }
8342+ for (i = 0; i < 32; i++) {
8343+ if (c->x86_power & (1 << i)) {
8344+ if (i < ARRAY_SIZE(x86_power_flags) &&
8345+ x86_power_flags[i])
8346+ seq_printf(m, "%s%s",
8347+ x86_power_flags[i][0]?" ":"",
8348+ x86_power_flags[i]);
8349+ else
8350+ seq_printf(m, " [%d]", i);
8351+ }
8352 }
8353
8354 seq_printf(m, "\n\n");
8355@@ -1432,8 +1416,8 @@ static void c_stop(struct seq_file *m, v
8356 {
8357 }
8358
8359-struct seq_operations cpuinfo_op = {
8360- .start =c_start,
8361+const struct seq_operations cpuinfo_op = {
8362+ .start = c_start,
8363 .next = c_next,
8364 .stop = c_stop,
8365 .show = show_cpuinfo,
8366--- a/arch/x86/kernel/setup64-xen.c
8367+++ b/arch/x86/kernel/setup64-xen.c
8368@@ -31,7 +31,11 @@
8369 #include <asm/hypervisor.h>
8370 #endif
8371
8372+#ifndef CONFIG_DEBUG_BOOT_PARAMS
8373 struct boot_params __initdata boot_params;
8374+#else
8375+struct boot_params boot_params;
8376+#endif
8377
8378 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
8379
8380@@ -47,6 +51,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __attr
8381
8382 unsigned long __supported_pte_mask __read_mostly = ~0UL;
8383 EXPORT_SYMBOL(__supported_pte_mask);
8384+
8385 static int do_not_nx __cpuinitdata = 0;
8386
8387 /* noexec=on|off
8388@@ -90,6 +95,45 @@ static int __init nonx32_setup(char *str
8389 __setup("noexec32=", nonx32_setup);
8390
8391 /*
8392+ * Copy data used in early init routines from the initial arrays to the
8393+ * per cpu data areas. These arrays then become expendable and the
8394+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
8395+ */
8396+static void __init setup_per_cpu_maps(void)
8397+{
8398+#ifndef CONFIG_XEN
8399+ int cpu;
8400+
8401+ for_each_possible_cpu(cpu) {
8402+#ifdef CONFIG_SMP
8403+ if (per_cpu_offset(cpu)) {
8404+#endif
8405+ per_cpu(x86_cpu_to_apicid, cpu) =
8406+ x86_cpu_to_apicid_init[cpu];
8407+ per_cpu(x86_bios_cpu_apicid, cpu) =
8408+ x86_bios_cpu_apicid_init[cpu];
8409+#ifdef CONFIG_NUMA
8410+ per_cpu(x86_cpu_to_node_map, cpu) =
8411+ x86_cpu_to_node_map_init[cpu];
8412+#endif
8413+#ifdef CONFIG_SMP
8414+ }
8415+ else
8416+ printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
8417+ cpu);
8418+#endif
8419+ }
8420+
8421+ /* indicate the early static arrays will soon be gone */
8422+ x86_cpu_to_apicid_early_ptr = NULL;
8423+ x86_bios_cpu_apicid_early_ptr = NULL;
8424+#ifdef CONFIG_NUMA
8425+ x86_cpu_to_node_map_early_ptr = NULL;
8426+#endif
8427+#endif
8428+}
8429+
8430+/*
8431 * Great future plan:
8432 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
8433 * Always point %gs to its beginning
8434@@ -109,19 +153,24 @@ void __init setup_per_cpu_areas(void)
8435 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
8436 for_each_cpu_mask (i, cpu_possible_map) {
8437 char *ptr;
8438+#ifndef CONFIG_NEED_MULTIPLE_NODES
8439+ ptr = alloc_bootmem_pages(size);
8440+#else
8441+ int node = early_cpu_to_node(i);
8442
8443- if (!NODE_DATA(cpu_to_node(i))) {
8444- printk("cpu with no node %d, num_online_nodes %d\n",
8445- i, num_online_nodes());
8446+ if (!node_online(node) || !NODE_DATA(node))
8447 ptr = alloc_bootmem_pages(size);
8448- } else {
8449- ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
8450- }
8451+ else
8452+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
8453+#endif
8454 if (!ptr)
8455 panic("Cannot allocate cpu data for CPU %d\n", i);
8456 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
8457 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
8458 }
8459+
8460+ /* setup percpu data maps early */
8461+ setup_per_cpu_maps();
8462 }
8463
8464 #ifdef CONFIG_XEN
8465@@ -224,7 +273,8 @@ void syscall_init(void)
8466 wrmsrl(MSR_CSTAR, ignore_sysret);
8467
8468 /* Flags to clear on syscall */
8469- wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
8470+ wrmsrl(MSR_SYSCALL_MASK,
8471+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
8472 #endif
8473 #ifdef CONFIG_IA32_EMULATION
8474 syscall32_cpu_init ();
8475@@ -303,7 +353,7 @@ void __cpuinit cpu_init (void)
8476 */
8477 #ifndef CONFIG_XEN
8478 if (cpu)
8479- memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
8480+ memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
8481 #endif
8482
8483 cpu_gdt_descr[cpu].size = GDT_SIZE;
8484@@ -334,10 +384,10 @@ void __cpuinit cpu_init (void)
8485 v, cpu);
8486 }
8487 estacks += PAGE_SIZE << order[v];
8488- orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
8489+ orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
8490 }
8491
8492- t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
8493+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
8494 /*
8495 * <= is required because the CPU will access up to
8496 * 8 bits beyond the end of the IO permission bitmap.
8497--- a/arch/x86/kernel/smp_32-xen.c
8498+++ b/arch/x86/kernel/smp_32-xen.c
8499@@ -168,7 +168,7 @@ void __send_IPI_shortcut(unsigned int sh
8500 }
8501 }
8502
8503-void fastcall send_IPI_self(int vector)
8504+void send_IPI_self(int vector)
8505 {
8506 __send_IPI_shortcut(APIC_DEST_SELF, vector);
8507 }
8508@@ -224,13 +224,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
8509 * We need to reload %cr3 since the page tables may be going
8510 * away from under us..
8511 */
8512-void leave_mm(unsigned long cpu)
8513+void leave_mm(int cpu)
8514 {
8515 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
8516 BUG();
8517 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
8518 load_cr3(swapper_pg_dir);
8519 }
8520+EXPORT_SYMBOL_GPL(leave_mm);
8521
8522 /*
8523 *
8524--- a/arch/x86/kernel/smp_64-xen.c
8525+++ b/arch/x86/kernel/smp_64-xen.c
8526@@ -33,7 +33,7 @@
8527
8528 #ifndef CONFIG_XEN
8529 /*
8530- * Smarter SMP flushing macros.
8531+ * Smarter SMP flushing macros.
8532 * c/o Linus Torvalds.
8533 *
8534 * These mean you can really definitely utterly forget about
8535@@ -41,15 +41,15 @@
8536 *
8537 * Optimizations Manfred Spraul <manfred@colorfullife.com>
8538 *
8539- * More scalable flush, from Andi Kleen
8540+ * More scalable flush, from Andi Kleen
8541 *
8542- * To avoid global state use 8 different call vectors.
8543- * Each CPU uses a specific vector to trigger flushes on other
8544- * CPUs. Depending on the received vector the target CPUs look into
8545+ * To avoid global state use 8 different call vectors.
8546+ * Each CPU uses a specific vector to trigger flushes on other
8547+ * CPUs. Depending on the received vector the target CPUs look into
8548 * the right per cpu variable for the flush data.
8549 *
8550- * With more than 8 CPUs they are hashed to the 8 available
8551- * vectors. The limited global vector space forces us to this right now.
8552+ * With more than 8 CPUs they are hashed to the 8 available
8553+ * vectors. The limited global vector space forces us to this right now.
8554 * In future when interrupts are split into per CPU domains this could be
8555 * fixed, at the cost of triggering multiple IPIs in some cases.
8556 */
8557@@ -59,7 +59,6 @@ union smp_flush_state {
8558 cpumask_t flush_cpumask;
8559 struct mm_struct *flush_mm;
8560 unsigned long flush_va;
8561-#define FLUSH_ALL -1ULL
8562 spinlock_t tlbstate_lock;
8563 };
8564 char pad[SMP_CACHE_BYTES];
8565@@ -71,16 +70,17 @@ union smp_flush_state {
8566 static DEFINE_PER_CPU(union smp_flush_state, flush_state);
8567
8568 /*
8569- * We cannot call mmdrop() because we are in interrupt context,
8570+ * We cannot call mmdrop() because we are in interrupt context,
8571 * instead update mm->cpu_vm_mask.
8572 */
8573-static inline void leave_mm(unsigned long cpu)
8574+void leave_mm(int cpu)
8575 {
8576 if (read_pda(mmu_state) == TLBSTATE_OK)
8577 BUG();
8578 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
8579 load_cr3(swapper_pg_dir);
8580 }
8581+EXPORT_SYMBOL_GPL(leave_mm);
8582
8583 /*
8584 *
8585@@ -89,25 +89,25 @@ static inline void leave_mm(unsigned lon
8586 * 1) switch_mm() either 1a) or 1b)
8587 * 1a) thread switch to a different mm
8588 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
8589- * Stop ipi delivery for the old mm. This is not synchronized with
8590- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
8591- * for the wrong mm, and in the worst case we perform a superfluous
8592- * tlb flush.
8593+ * Stop ipi delivery for the old mm. This is not synchronized with
8594+ * the other cpus, but smp_invalidate_interrupt ignore flush ipis
8595+ * for the wrong mm, and in the worst case we perform a superfluous
8596+ * tlb flush.
8597 * 1a2) set cpu mmu_state to TLBSTATE_OK
8598- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8599+ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8600 * was in lazy tlb mode.
8601 * 1a3) update cpu active_mm
8602- * Now cpu0 accepts tlb flushes for the new mm.
8603+ * Now cpu0 accepts tlb flushes for the new mm.
8604 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
8605- * Now the other cpus will send tlb flush ipis.
8606+ * Now the other cpus will send tlb flush ipis.
8607 * 1a4) change cr3.
8608 * 1b) thread switch without mm change
8609 * cpu active_mm is correct, cpu0 already handles
8610 * flush ipis.
8611 * 1b1) set cpu mmu_state to TLBSTATE_OK
8612 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
8613- * Atomically set the bit [other cpus will start sending flush ipis],
8614- * and test the bit.
8615+ * Atomically set the bit [other cpus will start sending flush ipis],
8616+ * and test the bit.
8617 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
8618 * 2) switch %%esp, ie current
8619 *
8620@@ -141,12 +141,12 @@ asmlinkage void smp_invalidate_interrupt
8621 * orig_rax contains the negated interrupt vector.
8622 * Use that to determine where the sender put the data.
8623 */
8624- sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
8625+ sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
8626 f = &per_cpu(flush_state, sender);
8627
8628 if (!cpu_isset(cpu, f->flush_cpumask))
8629 goto out;
8630- /*
8631+ /*
8632 * This was a BUG() but until someone can quote me the
8633 * line from the intel manual that guarantees an IPI to
8634 * multiple CPUs is retried _only_ on the erroring CPUs
8635@@ -154,10 +154,10 @@ asmlinkage void smp_invalidate_interrupt
8636 *
8637 * BUG();
8638 */
8639-
8640+
8641 if (f->flush_mm == read_pda(active_mm)) {
8642 if (read_pda(mmu_state) == TLBSTATE_OK) {
8643- if (f->flush_va == FLUSH_ALL)
8644+ if (f->flush_va == TLB_FLUSH_ALL)
8645 local_flush_tlb();
8646 else
8647 __flush_tlb_one(f->flush_va);
8648@@ -170,19 +170,22 @@ out:
8649 add_pda(irq_tlb_count, 1);
8650 }
8651
8652-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
8653- unsigned long va)
8654+void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
8655+ unsigned long va)
8656 {
8657 int sender;
8658 union smp_flush_state *f;
8659+ cpumask_t cpumask = *cpumaskp;
8660
8661 /* Caller has disabled preemption */
8662 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
8663 f = &per_cpu(flush_state, sender);
8664
8665- /* Could avoid this lock when
8666- num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8667- probably not worth checking this for a cache-hot lock. */
8668+ /*
8669+ * Could avoid this lock when
8670+ * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8671+ * probably not worth checking this for a cache-hot lock.
8672+ */
8673 spin_lock(&f->tlbstate_lock);
8674
8675 f->flush_mm = mm;
8676@@ -206,14 +209,14 @@ static void flush_tlb_others(cpumask_t c
8677 int __cpuinit init_smp_flush(void)
8678 {
8679 int i;
8680+
8681 for_each_cpu_mask(i, cpu_possible_map) {
8682 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
8683 }
8684 return 0;
8685 }
8686-
8687 core_initcall(init_smp_flush);
8688-
8689+
8690 void flush_tlb_current_task(void)
8691 {
8692 struct mm_struct *mm = current->mm;
8693@@ -225,10 +228,9 @@ void flush_tlb_current_task(void)
8694
8695 local_flush_tlb();
8696 if (!cpus_empty(cpu_mask))
8697- flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8698+ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8699 preempt_enable();
8700 }
8701-EXPORT_SYMBOL(flush_tlb_current_task);
8702
8703 void flush_tlb_mm (struct mm_struct * mm)
8704 {
8705@@ -245,11 +247,10 @@ void flush_tlb_mm (struct mm_struct * mm
8706 leave_mm(smp_processor_id());
8707 }
8708 if (!cpus_empty(cpu_mask))
8709- flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8710+ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8711
8712 preempt_enable();
8713 }
8714-EXPORT_SYMBOL(flush_tlb_mm);
8715
8716 void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
8717 {
8718@@ -263,8 +264,8 @@ void flush_tlb_page(struct vm_area_struc
8719 if (current->active_mm == mm) {
8720 if(current->mm)
8721 __flush_tlb_one(va);
8722- else
8723- leave_mm(smp_processor_id());
8724+ else
8725+ leave_mm(smp_processor_id());
8726 }
8727
8728 if (!cpus_empty(cpu_mask))
8729@@ -272,7 +273,6 @@ void flush_tlb_page(struct vm_area_struc
8730
8731 preempt_enable();
8732 }
8733-EXPORT_SYMBOL(flush_tlb_page);
8734
8735 static void do_flush_tlb_all(void* info)
8736 {
8737@@ -330,11 +330,9 @@ void unlock_ipi_call_lock(void)
8738 * this function sends a 'generic call function' IPI to all other CPU
8739 * of the system defined in the mask.
8740 */
8741-
8742-static int
8743-__smp_call_function_mask(cpumask_t mask,
8744- void (*func)(void *), void *info,
8745- int wait)
8746+static int __smp_call_function_mask(cpumask_t mask,
8747+ void (*func)(void *), void *info,
8748+ int wait)
8749 {
8750 struct call_data_struct data;
8751 cpumask_t allbutself;
8752@@ -422,11 +420,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
8753 */
8754
8755 int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
8756- int nonatomic, int wait)
8757+ int nonatomic, int wait)
8758 {
8759 /* prevent preemption and reschedule on another processor */
8760- int ret;
8761- int me = get_cpu();
8762+ int ret, me = get_cpu();
8763
8764 /* Can deadlock when called with interrupts disabled */
8765 WARN_ON(irqs_disabled());
8766@@ -476,9 +473,9 @@ static void stop_this_cpu(void *dummy)
8767 */
8768 cpu_clear(smp_processor_id(), cpu_online_map);
8769 disable_all_local_evtchn();
8770- for (;;)
8771+ for (;;)
8772 halt();
8773-}
8774+}
8775
8776 void smp_send_stop(void)
8777 {
8778--- a/arch/x86/kernel/time_32-xen.c
8779+++ b/arch/x86/kernel/time_32-xen.c
8780@@ -28,21 +28,9 @@
8781 * serialize accesses to xtime/lost_ticks).
8782 */
8783
8784-#include <linux/errno.h>
8785-#include <linux/sched.h>
8786-#include <linux/kernel.h>
8787-#include <linux/param.h>
8788-#include <linux/string.h>
8789-#include <linux/mm.h>
8790+#include <linux/init.h>
8791 #include <linux/interrupt.h>
8792 #include <linux/time.h>
8793-#include <linux/delay.h>
8794-#include <linux/init.h>
8795-#include <linux/smp.h>
8796-#include <linux/module.h>
8797-#include <linux/sysdev.h>
8798-#include <linux/bcd.h>
8799-#include <linux/efi.h>
8800 #include <linux/mca.h>
8801 #include <linux/sysctl.h>
8802 #include <linux/percpu.h>
8803@@ -50,26 +38,10 @@
8804 #include <linux/posix-timers.h>
8805 #include <linux/cpufreq.h>
8806 #include <linux/clocksource.h>
8807+#include <linux/sysdev.h>
8808
8809-#include <asm/io.h>
8810-#include <asm/smp.h>
8811-#include <asm/irq.h>
8812-#include <asm/msr.h>
8813 #include <asm/delay.h>
8814-#include <asm/mpspec.h>
8815-#include <asm/uaccess.h>
8816-#include <asm/processor.h>
8817-#include <asm/timer.h>
8818 #include <asm/time.h>
8819-#include <asm/sections.h>
8820-
8821-#include "mach_time.h"
8822-
8823-#include <linux/timex.h>
8824-
8825-#include <asm/hpet.h>
8826-
8827-#include <asm/arch_hooks.h>
8828
8829 #include <xen/evtchn.h>
8830 #include <xen/sysctl.h>
8831@@ -89,9 +61,6 @@ volatile unsigned long __jiffies __secti
8832 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
8833 EXPORT_SYMBOL(cpu_khz);
8834
8835-DEFINE_SPINLOCK(rtc_lock);
8836-EXPORT_SYMBOL(rtc_lock);
8837-
8838 /* These are peridically updated in shared_info, and then copied here. */
8839 struct shadow_time_info {
8840 u64 tsc_timestamp; /* TSC at last update of time vals. */
8841@@ -154,6 +123,11 @@ static int __init __independent_wallcloc
8842 }
8843 __setup("independent_wallclock", __independent_wallclock);
8844
8845+int xen_independent_wallclock(void)
8846+{
8847+ return independent_wallclock;
8848+}
8849+
8850 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
8851 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
8852 static int __init __permitted_clock_jitter(char *str)
8853@@ -223,7 +197,6 @@ static inline u64 get64(volatile u64 *pt
8854 return cmpxchg64(ptr, 0, 0);
8855 #else
8856 return *ptr;
8857-#define cmpxchg64 cmpxchg
8858 #endif
8859 }
8860
8861@@ -233,7 +206,6 @@ static inline u64 get64_local(volatile u
8862 return cmpxchg64_local(ptr, 0, 0);
8863 #else
8864 return *ptr;
8865-#define cmpxchg64_local cmpxchg_local
8866 #endif
8867 }
8868
8869@@ -341,35 +313,6 @@ static inline int time_values_up_to_date
8870 return (dst->version == src->version);
8871 }
8872
8873-/*
8874- * This is a special lock that is owned by the CPU and holds the index
8875- * register we are working with. It is required for NMI access to the
8876- * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
8877- */
8878-volatile unsigned long cmos_lock = 0;
8879-EXPORT_SYMBOL(cmos_lock);
8880-
8881-/* Routines for accessing the CMOS RAM/RTC. */
8882-unsigned char rtc_cmos_read(unsigned char addr)
8883-{
8884- unsigned char val;
8885- lock_cmos_prefix(addr);
8886- outb_p(addr, RTC_PORT(0));
8887- val = inb_p(RTC_PORT(1));
8888- lock_cmos_suffix(addr);
8889- return val;
8890-}
8891-EXPORT_SYMBOL(rtc_cmos_read);
8892-
8893-void rtc_cmos_write(unsigned char val, unsigned char addr)
8894-{
8895- lock_cmos_prefix(addr);
8896- outb_p(addr, RTC_PORT(0));
8897- outb_p(val, RTC_PORT(1));
8898- lock_cmos_suffix(addr);
8899-}
8900-EXPORT_SYMBOL(rtc_cmos_write);
8901-
8902 static void sync_xen_wallclock(unsigned long dummy);
8903 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
8904 static void sync_xen_wallclock(unsigned long dummy)
8905@@ -378,7 +321,8 @@ static void sync_xen_wallclock(unsigned
8906 s64 nsec;
8907 struct xen_platform_op op;
8908
8909- if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
8910+ BUG_ON(!is_initial_xendomain());
8911+ if (!ntp_synced() || independent_wallclock)
8912 return;
8913
8914 write_seqlock_irq(&xtime_lock);
8915@@ -401,23 +345,6 @@ static void sync_xen_wallclock(unsigned
8916 mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
8917 }
8918
8919-static int set_rtc_mmss(unsigned long nowtime)
8920-{
8921- int retval;
8922- unsigned long flags;
8923-
8924- if (independent_wallclock || !is_initial_xendomain())
8925- return 0;
8926-
8927- /* gets recalled with irq locally disabled */
8928- /* XXX - does irqsave resolve this? -johnstul */
8929- spin_lock_irqsave(&rtc_lock, flags);
8930- retval = set_wallclock(nowtime);
8931- spin_unlock_irqrestore(&rtc_lock, flags);
8932-
8933- return retval;
8934-}
8935-
8936 static unsigned long long local_clock(void)
8937 {
8938 unsigned int cpu = get_cpu();
8939@@ -500,28 +427,24 @@ unsigned long profile_pc(struct pt_regs
8940
8941 #if defined(CONFIG_SMP) || defined(__x86_64__)
8942 # ifdef __i386__
8943- if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs)
8944+ if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs)
8945 # else
8946 if (!user_mode(regs)
8947 # endif
8948 && in_lock_functions(pc)) {
8949 # ifdef CONFIG_FRAME_POINTER
8950-# ifdef __i386__
8951- return ((unsigned long *)regs->ebp)[1];
8952-# else
8953- return ((unsigned long *)regs->rbp)[1];
8954-# endif
8955+ return ((unsigned long *)regs->bp)[1];
8956 # else
8957 # ifdef __i386__
8958- unsigned long *sp = (unsigned long *)&regs->esp;
8959+ unsigned long *sp = (unsigned long *)&regs->sp;
8960 # else
8961- unsigned long *sp = (unsigned long *)regs->rsp;
8962+ unsigned long *sp = (unsigned long *)regs->sp;
8963 # endif
8964
8965 /* Return address is either directly at stack pointer
8966- or above a saved eflags. Eflags has bits 22-31 zero,
8967+ or above a saved flags. Eflags has bits 22-31 zero,
8968 kernel addresses don't. */
8969- if (sp[0] >> 22)
8970+ if (sp[0] >> 22)
8971 return sp[0];
8972 if (sp[1] >> 22)
8973 return sp[1];
8974@@ -750,25 +673,32 @@ static void init_missing_ticks_accountin
8975 runstate->time[RUNSTATE_offline];
8976 }
8977
8978-/* not static: needed by APM */
8979-unsigned long read_persistent_clock(void)
8980+unsigned long xen_read_persistent_clock(void)
8981 {
8982- unsigned long retval;
8983- unsigned long flags;
8984-
8985- spin_lock_irqsave(&rtc_lock, flags);
8986+ const shared_info_t *s = HYPERVISOR_shared_info;
8987+ u32 version, sec, nsec;
8988+ u64 delta;
8989
8990- retval = get_wallclock();
8991+ do {
8992+ version = s->wc_version;
8993+ rmb();
8994+ sec = s->wc_sec;
8995+ nsec = s->wc_nsec;
8996+ rmb();
8997+ } while ((s->wc_version & 1) | (version ^ s->wc_version));
8998
8999- spin_unlock_irqrestore(&rtc_lock, flags);
9000+ delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
9001+ do_div(delta, NSEC_PER_SEC);
9002
9003- return retval;
9004+ return delta;
9005 }
9006
9007-int update_persistent_clock(struct timespec now)
9008+int xen_update_persistent_clock(void)
9009 {
9010+ if (!is_initial_xendomain())
9011+ return -1;
9012 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
9013- return set_rtc_mmss(now.tv_sec);
9014+ return 0;
9015 }
9016
9017 extern void (*late_time_init)(void);
9018--- a/arch/x86/kernel/traps_32-xen.c
9019+++ b/arch/x86/kernel/traps_32-xen.c
9020@@ -79,7 +79,8 @@ char ignore_fpu_irq = 0;
9021 * F0 0F bug workaround.. We have a special link segment
9022 * for this.
9023 */
9024-struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
9025+gate_desc idt_table[256]
9026+ __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
9027 #endif
9028
9029 asmlinkage void divide_error(void);
9030@@ -109,6 +110,34 @@ asmlinkage void machine_check(void);
9031 int kstack_depth_to_print = 24;
9032 static unsigned int code_bytes = 64;
9033
9034+void printk_address(unsigned long address, int reliable)
9035+{
9036+#ifdef CONFIG_KALLSYMS
9037+ unsigned long offset = 0, symsize;
9038+ const char *symname;
9039+ char *modname;
9040+ char *delim = ":";
9041+ char namebuf[128];
9042+ char reliab[4] = "";
9043+
9044+ symname = kallsyms_lookup(address, &symsize, &offset,
9045+ &modname, namebuf);
9046+ if (!symname) {
9047+ printk(" [<%08lx>]\n", address);
9048+ return;
9049+ }
9050+ if (!reliable)
9051+ strcpy(reliab, "? ");
9052+
9053+ if (!modname)
9054+ modname = delim = "";
9055+ printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
9056+ address, reliab, delim, modname, delim, symname, offset, symsize);
9057+#else
9058+ printk(" [<%08lx>]\n", address);
9059+#endif
9060+}
9061+
9062 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
9063 {
9064 return p > (void *)tinfo &&
9065@@ -122,48 +151,35 @@ struct stack_frame {
9066 };
9067
9068 static inline unsigned long print_context_stack(struct thread_info *tinfo,
9069- unsigned long *stack, unsigned long ebp,
9070+ unsigned long *stack, unsigned long bp,
9071 const struct stacktrace_ops *ops, void *data)
9072 {
9073-#ifdef CONFIG_FRAME_POINTER
9074- struct stack_frame *frame = (struct stack_frame *)ebp;
9075- while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
9076- struct stack_frame *next;
9077- unsigned long addr;
9078+ struct stack_frame *frame = (struct stack_frame *)bp;
9079
9080- addr = frame->return_address;
9081- ops->address(data, addr);
9082- /*
9083- * break out of recursive entries (such as
9084- * end_of_stack_stop_unwind_function). Also,
9085- * we can never allow a frame pointer to
9086- * move downwards!
9087- */
9088- next = frame->next_frame;
9089- if (next <= frame)
9090- break;
9091- frame = next;
9092- }
9093-#else
9094 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
9095 unsigned long addr;
9096
9097- addr = *stack++;
9098- if (__kernel_text_address(addr))
9099- ops->address(data, addr);
9100+ addr = *stack;
9101+ if (__kernel_text_address(addr)) {
9102+ if ((unsigned long) stack == bp + 4) {
9103+ ops->address(data, addr, 1);
9104+ frame = frame->next_frame;
9105+ bp = (unsigned long) frame;
9106+ } else {
9107+ ops->address(data, addr, bp == 0);
9108+ }
9109+ }
9110+ stack++;
9111 }
9112-#endif
9113- return ebp;
9114+ return bp;
9115 }
9116
9117 #define MSG(msg) ops->warning(data, msg)
9118
9119 void dump_trace(struct task_struct *task, struct pt_regs *regs,
9120- unsigned long *stack,
9121+ unsigned long *stack, unsigned long bp,
9122 const struct stacktrace_ops *ops, void *data)
9123 {
9124- unsigned long ebp = 0;
9125-
9126 if (!task)
9127 task = current;
9128
9129@@ -171,17 +187,17 @@ void dump_trace(struct task_struct *task
9130 unsigned long dummy;
9131 stack = &dummy;
9132 if (task != current)
9133- stack = (unsigned long *)task->thread.esp;
9134+ stack = (unsigned long *)task->thread.sp;
9135 }
9136
9137 #ifdef CONFIG_FRAME_POINTER
9138- if (!ebp) {
9139+ if (!bp) {
9140 if (task == current) {
9141- /* Grab ebp right from our regs */
9142- asm ("movl %%ebp, %0" : "=r" (ebp) : );
9143+ /* Grab bp right from our regs */
9144+ asm ("movl %%ebp, %0" : "=r" (bp) : );
9145 } else {
9146- /* ebp is the last reg pushed by switch_to */
9147- ebp = *(unsigned long *) task->thread.esp;
9148+ /* bp is the last reg pushed by switch_to */
9149+ bp = *(unsigned long *) task->thread.sp;
9150 }
9151 }
9152 #endif
9153@@ -190,7 +206,7 @@ void dump_trace(struct task_struct *task
9154 struct thread_info *context;
9155 context = (struct thread_info *)
9156 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
9157- ebp = print_context_stack(context, stack, ebp, ops, data);
9158+ bp = print_context_stack(context, stack, bp, ops, data);
9159 /* Should be after the line below, but somewhere
9160 in early boot context comes out corrupted and we
9161 can't reference it -AK */
9162@@ -225,9 +241,11 @@ static int print_trace_stack(void *data,
9163 /*
9164 * Print one address/symbol entries per line.
9165 */
9166-static void print_trace_address(void *data, unsigned long addr)
9167+static void print_trace_address(void *data, unsigned long addr, int reliable)
9168 {
9169 printk("%s [<%08lx>] ", (char *)data, addr);
9170+ if (!reliable)
9171+ printk("? ");
9172 print_symbol("%s\n", addr);
9173 touch_nmi_watchdog();
9174 }
9175@@ -241,32 +259,32 @@ static const struct stacktrace_ops print
9176
9177 static void
9178 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
9179- unsigned long * stack, char *log_lvl)
9180+ unsigned long *stack, unsigned long bp, char *log_lvl)
9181 {
9182- dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
9183+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
9184 printk("%s =======================\n", log_lvl);
9185 }
9186
9187 void show_trace(struct task_struct *task, struct pt_regs *regs,
9188- unsigned long * stack)
9189+ unsigned long *stack, unsigned long bp)
9190 {
9191- show_trace_log_lvl(task, regs, stack, "");
9192+ show_trace_log_lvl(task, regs, stack, bp, "");
9193 }
9194
9195 static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
9196- unsigned long *esp, char *log_lvl)
9197+ unsigned long *sp, unsigned long bp, char *log_lvl)
9198 {
9199 unsigned long *stack;
9200 int i;
9201
9202- if (esp == NULL) {
9203+ if (sp == NULL) {
9204 if (task)
9205- esp = (unsigned long*)task->thread.esp;
9206+ sp = (unsigned long*)task->thread.sp;
9207 else
9208- esp = (unsigned long *)&esp;
9209+ sp = (unsigned long *)&sp;
9210 }
9211
9212- stack = esp;
9213+ stack = sp;
9214 for(i = 0; i < kstack_depth_to_print; i++) {
9215 if (kstack_end(stack))
9216 break;
9217@@ -275,13 +293,13 @@ static void show_stack_log_lvl(struct ta
9218 printk("%08lx ", *stack++);
9219 }
9220 printk("\n%sCall Trace:\n", log_lvl);
9221- show_trace_log_lvl(task, regs, esp, log_lvl);
9222+ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
9223 }
9224
9225-void show_stack(struct task_struct *task, unsigned long *esp)
9226+void show_stack(struct task_struct *task, unsigned long *sp)
9227 {
9228 printk(" ");
9229- show_stack_log_lvl(task, NULL, esp, "");
9230+ show_stack_log_lvl(task, NULL, sp, 0, "");
9231 }
9232
9233 /*
9234@@ -290,13 +308,19 @@ void show_stack(struct task_struct *task
9235 void dump_stack(void)
9236 {
9237 unsigned long stack;
9238+ unsigned long bp = 0;
9239+
9240+#ifdef CONFIG_FRAME_POINTER
9241+ if (!bp)
9242+ asm("movl %%ebp, %0" : "=r" (bp):);
9243+#endif
9244
9245 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
9246 current->pid, current->comm, print_tainted(),
9247 init_utsname()->release,
9248 (int)strcspn(init_utsname()->version, " "),
9249 init_utsname()->version);
9250- show_trace(current, NULL, &stack);
9251+ show_trace(current, NULL, &stack, bp);
9252 }
9253
9254 EXPORT_SYMBOL(dump_stack);
9255@@ -315,30 +339,30 @@ void show_registers(struct pt_regs *regs
9256 * time of the fault..
9257 */
9258 if (!user_mode_vm(regs)) {
9259- u8 *eip;
9260+ u8 *ip;
9261 unsigned int code_prologue = code_bytes * 43 / 64;
9262 unsigned int code_len = code_bytes;
9263 unsigned char c;
9264
9265 printk("\n" KERN_EMERG "Stack: ");
9266- show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG);
9267+ show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
9268
9269 printk(KERN_EMERG "Code: ");
9270
9271- eip = (u8 *)regs->eip - code_prologue;
9272- if (eip < (u8 *)PAGE_OFFSET ||
9273- probe_kernel_address(eip, c)) {
9274+ ip = (u8 *)regs->ip - code_prologue;
9275+ if (ip < (u8 *)PAGE_OFFSET ||
9276+ probe_kernel_address(ip, c)) {
9277 /* try starting at EIP */
9278- eip = (u8 *)regs->eip;
9279+ ip = (u8 *)regs->ip;
9280 code_len = code_len - code_prologue + 1;
9281 }
9282- for (i = 0; i < code_len; i++, eip++) {
9283- if (eip < (u8 *)PAGE_OFFSET ||
9284- probe_kernel_address(eip, c)) {
9285+ for (i = 0; i < code_len; i++, ip++) {
9286+ if (ip < (u8 *)PAGE_OFFSET ||
9287+ probe_kernel_address(ip, c)) {
9288 printk(" Bad EIP value.");
9289 break;
9290 }
9291- if (eip == (u8 *)regs->eip)
9292+ if (ip == (u8 *)regs->ip)
9293 printk("<%02x> ", c);
9294 else
9295 printk("%02x ", c);
9296@@ -347,18 +371,57 @@ void show_registers(struct pt_regs *regs
9297 printk("\n");
9298 }
9299
9300-int is_valid_bugaddr(unsigned long eip)
9301+int is_valid_bugaddr(unsigned long ip)
9302 {
9303 unsigned short ud2;
9304
9305- if (eip < PAGE_OFFSET)
9306+ if (ip < PAGE_OFFSET)
9307 return 0;
9308- if (probe_kernel_address((unsigned short *)eip, ud2))
9309+ if (probe_kernel_address((unsigned short *)ip, ud2))
9310 return 0;
9311
9312 return ud2 == 0x0b0f;
9313 }
9314
9315+static int die_counter;
9316+
9317+int __kprobes __die(const char * str, struct pt_regs * regs, long err)
9318+{
9319+ unsigned long sp;
9320+ unsigned short ss;
9321+
9322+ printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
9323+#ifdef CONFIG_PREEMPT
9324+ printk("PREEMPT ");
9325+#endif
9326+#ifdef CONFIG_SMP
9327+ printk("SMP ");
9328+#endif
9329+#ifdef CONFIG_DEBUG_PAGEALLOC
9330+ printk("DEBUG_PAGEALLOC");
9331+#endif
9332+ printk("\n");
9333+
9334+ if (notify_die(DIE_OOPS, str, regs, err,
9335+ current->thread.trap_no, SIGSEGV) !=
9336+ NOTIFY_STOP) {
9337+ show_registers(regs);
9338+ /* Executive summary in case the oops scrolled away */
9339+ sp = (unsigned long) (&regs->sp);
9340+ savesegment(ss, ss);
9341+ if (user_mode(regs)) {
9342+ sp = regs->sp;
9343+ ss = regs->ss & 0xffff;
9344+ }
9345+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
9346+ print_symbol("%s", regs->ip);
9347+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
9348+ return 0;
9349+ } else {
9350+ return 1;
9351+ }
9352+}
9353+
9354 /*
9355 * This is gone through when something in the kernel has done something bad and
9356 * is about to be terminated.
9357@@ -374,7 +437,6 @@ void die(const char * str, struct pt_reg
9358 .lock_owner = -1,
9359 .lock_owner_depth = 0
9360 };
9361- static int die_counter;
9362 unsigned long flags;
9363
9364 oops_enter();
9365@@ -390,43 +452,13 @@ void die(const char * str, struct pt_reg
9366 raw_local_irq_save(flags);
9367
9368 if (++die.lock_owner_depth < 3) {
9369- unsigned long esp;
9370- unsigned short ss;
9371-
9372- report_bug(regs->eip, regs);
9373-
9374- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff,
9375- ++die_counter);
9376-#ifdef CONFIG_PREEMPT
9377- printk("PREEMPT ");
9378-#endif
9379-#ifdef CONFIG_SMP
9380- printk("SMP ");
9381-#endif
9382-#ifdef CONFIG_DEBUG_PAGEALLOC
9383- printk("DEBUG_PAGEALLOC");
9384-#endif
9385- printk("\n");
9386+ report_bug(regs->ip, regs);
9387
9388- if (notify_die(DIE_OOPS, str, regs, err,
9389- current->thread.trap_no, SIGSEGV) !=
9390- NOTIFY_STOP) {
9391- show_registers(regs);
9392- /* Executive summary in case the oops scrolled away */
9393- esp = (unsigned long) (&regs->esp);
9394- savesegment(ss, ss);
9395- if (user_mode(regs)) {
9396- esp = regs->esp;
9397- ss = regs->xss & 0xffff;
9398- }
9399- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
9400- print_symbol("%s", regs->eip);
9401- printk(" SS:ESP %04x:%08lx\n", ss, esp);
9402- }
9403- else
9404+ if (__die(str, regs, err))
9405 regs = NULL;
9406- } else
9407+ } else {
9408 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
9409+ }
9410
9411 bust_spinlocks(0);
9412 die.lock_owner = -1;
9413@@ -462,7 +494,7 @@ static void __kprobes do_trap(int trapnr
9414 {
9415 struct task_struct *tsk = current;
9416
9417- if (regs->eflags & VM_MASK) {
9418+ if (regs->flags & VM_MASK) {
9419 if (vm86)
9420 goto vm86_trap;
9421 goto trap_signal;
9422@@ -508,7 +540,7 @@ static void __kprobes do_trap(int trapnr
9423 }
9424
9425 #define DO_ERROR(trapnr, signr, str, name) \
9426-fastcall void do_##name(struct pt_regs * regs, long error_code) \
9427+void do_##name(struct pt_regs * regs, long error_code) \
9428 { \
9429 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9430 == NOTIFY_STOP) \
9431@@ -517,7 +549,7 @@ fastcall void do_##name(struct pt_regs *
9432 }
9433
9434 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
9435-fastcall void do_##name(struct pt_regs * regs, long error_code) \
9436+void do_##name(struct pt_regs * regs, long error_code) \
9437 { \
9438 siginfo_t info; \
9439 if (irq) \
9440@@ -533,7 +565,7 @@ fastcall void do_##name(struct pt_regs *
9441 }
9442
9443 #define DO_VM86_ERROR(trapnr, signr, str, name) \
9444-fastcall void do_##name(struct pt_regs * regs, long error_code) \
9445+void do_##name(struct pt_regs * regs, long error_code) \
9446 { \
9447 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9448 == NOTIFY_STOP) \
9449@@ -542,7 +574,7 @@ fastcall void do_##name(struct pt_regs *
9450 }
9451
9452 #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
9453-fastcall void do_##name(struct pt_regs * regs, long error_code) \
9454+void do_##name(struct pt_regs * regs, long error_code) \
9455 { \
9456 siginfo_t info; \
9457 info.si_signo = signr; \
9458@@ -556,13 +588,13 @@ fastcall void do_##name(struct pt_regs *
9459 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
9460 }
9461
9462-DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
9463+DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
9464 #ifndef CONFIG_KPROBES
9465 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
9466 #endif
9467 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
9468 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
9469-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
9470+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
9471 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
9472 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
9473 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
9474@@ -570,10 +602,10 @@ DO_ERROR(12, SIGBUS, "stack segment", s
9475 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
9476 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
9477
9478-fastcall void __kprobes do_general_protection(struct pt_regs * regs,
9479+void __kprobes do_general_protection(struct pt_regs * regs,
9480 long error_code)
9481 {
9482- if (regs->eflags & VM_MASK)
9483+ if (regs->flags & VM_MASK)
9484 goto gp_in_vm86;
9485
9486 if (!user_mode(regs))
9487@@ -582,11 +614,14 @@ fastcall void __kprobes do_general_prote
9488 current->thread.error_code = error_code;
9489 current->thread.trap_no = 13;
9490 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
9491- printk_ratelimit())
9492+ printk_ratelimit()) {
9493 printk(KERN_INFO
9494- "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
9495+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
9496 current->comm, task_pid_nr(current),
9497- regs->eip, regs->esp, error_code);
9498+ regs->ip, regs->sp, error_code);
9499+ print_vma_addr(" in ", regs->ip);
9500+ printk("\n");
9501+ }
9502
9503 force_sig(SIGSEGV, current);
9504 return;
9505@@ -675,8 +710,8 @@ void __kprobes die_nmi(struct pt_regs *r
9506 */
9507 bust_spinlocks(1);
9508 printk(KERN_EMERG "%s", msg);
9509- printk(" on CPU%d, eip %08lx, registers:\n",
9510- smp_processor_id(), regs->eip);
9511+ printk(" on CPU%d, ip %08lx, registers:\n",
9512+ smp_processor_id(), regs->ip);
9513 show_registers(regs);
9514 console_silent();
9515 spin_unlock(&nmi_print_lock);
9516@@ -733,7 +768,7 @@ static __kprobes void default_do_nmi(str
9517
9518 static int ignore_nmis;
9519
9520-fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
9521+__kprobes void do_nmi(struct pt_regs * regs, long error_code)
9522 {
9523 int cpu;
9524
9525@@ -762,7 +797,7 @@ void restart_nmi(void)
9526 }
9527
9528 #ifdef CONFIG_KPROBES
9529-fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
9530+void __kprobes do_int3(struct pt_regs *regs, long error_code)
9531 {
9532 trace_hardirqs_fixup();
9533
9534@@ -798,7 +833,7 @@ fastcall void __kprobes do_int3(struct p
9535 * find every occurrence of the TF bit that could be saved away even
9536 * by user code)
9537 */
9538-fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
9539+void __kprobes do_debug(struct pt_regs * regs, long error_code)
9540 {
9541 unsigned int condition;
9542 struct task_struct *tsk = current;
9543@@ -807,24 +842,30 @@ fastcall void __kprobes do_debug(struct
9544
9545 get_debugreg(condition, 6);
9546
9547+ /*
9548+ * The processor cleared BTF, so don't mark that we need it set.
9549+ */
9550+ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
9551+ tsk->thread.debugctlmsr = 0;
9552+
9553 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
9554 SIGTRAP) == NOTIFY_STOP)
9555 return;
9556 /* It's safe to allow irq's after DR6 has been saved */
9557- if (regs->eflags & X86_EFLAGS_IF)
9558+ if (regs->flags & X86_EFLAGS_IF)
9559 local_irq_enable();
9560
9561 /* Mask out spurious debug traps due to lazy DR7 setting */
9562 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
9563- if (!tsk->thread.debugreg[7])
9564+ if (!tsk->thread.debugreg7)
9565 goto clear_dr7;
9566 }
9567
9568- if (regs->eflags & VM_MASK)
9569+ if (regs->flags & VM_MASK)
9570 goto debug_vm86;
9571
9572 /* Save debug status register where ptrace can see it */
9573- tsk->thread.debugreg[6] = condition;
9574+ tsk->thread.debugreg6 = condition;
9575
9576 /*
9577 * Single-stepping through TF: make sure we ignore any events in
9578@@ -856,7 +897,7 @@ debug_vm86:
9579
9580 clear_TF_reenable:
9581 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
9582- regs->eflags &= ~TF_MASK;
9583+ regs->flags &= ~TF_MASK;
9584 return;
9585 }
9586
9587@@ -865,7 +906,7 @@ clear_TF_reenable:
9588 * the correct behaviour even in the presence of the asynchronous
9589 * IRQ13 behaviour
9590 */
9591-void math_error(void __user *eip)
9592+void math_error(void __user *ip)
9593 {
9594 struct task_struct * task;
9595 siginfo_t info;
9596@@ -881,7 +922,7 @@ void math_error(void __user *eip)
9597 info.si_signo = SIGFPE;
9598 info.si_errno = 0;
9599 info.si_code = __SI_FAULT;
9600- info.si_addr = eip;
9601+ info.si_addr = ip;
9602 /*
9603 * (~cwd & swd) will mask out exceptions that are not set to unmasked
9604 * status. 0x3f is the exception bits in these regs, 0x200 is the
9605@@ -924,13 +965,13 @@ void math_error(void __user *eip)
9606 force_sig_info(SIGFPE, &info, task);
9607 }
9608
9609-fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
9610+void do_coprocessor_error(struct pt_regs * regs, long error_code)
9611 {
9612 ignore_fpu_irq = 1;
9613- math_error((void __user *)regs->eip);
9614+ math_error((void __user *)regs->ip);
9615 }
9616
9617-static void simd_math_error(void __user *eip)
9618+static void simd_math_error(void __user *ip)
9619 {
9620 struct task_struct * task;
9621 siginfo_t info;
9622@@ -946,7 +987,7 @@ static void simd_math_error(void __user
9623 info.si_signo = SIGFPE;
9624 info.si_errno = 0;
9625 info.si_code = __SI_FAULT;
9626- info.si_addr = eip;
9627+ info.si_addr = ip;
9628 /*
9629 * The SIMD FPU exceptions are handled a little differently, as there
9630 * is only a single status/control register. Thus, to determine which
9631@@ -978,19 +1019,19 @@ static void simd_math_error(void __user
9632 force_sig_info(SIGFPE, &info, task);
9633 }
9634
9635-fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
9636+void do_simd_coprocessor_error(struct pt_regs * regs,
9637 long error_code)
9638 {
9639 if (cpu_has_xmm) {
9640 /* Handle SIMD FPU exceptions on PIII+ processors. */
9641 ignore_fpu_irq = 1;
9642- simd_math_error((void __user *)regs->eip);
9643+ simd_math_error((void __user *)regs->ip);
9644 } else {
9645 /*
9646 * Handle strange cache flush from user space exception
9647 * in all other cases. This is undocumented behaviour.
9648 */
9649- if (regs->eflags & VM_MASK) {
9650+ if (regs->flags & VM_MASK) {
9651 handle_vm86_fault((struct kernel_vm86_regs *)regs,
9652 error_code);
9653 return;
9654@@ -1003,7 +1044,7 @@ fastcall void do_simd_coprocessor_error(
9655 }
9656
9657 #ifndef CONFIG_XEN
9658-fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
9659+void do_spurious_interrupt_bug(struct pt_regs * regs,
9660 long error_code)
9661 {
9662 #if 0
9663@@ -1012,7 +1053,7 @@ fastcall void do_spurious_interrupt_bug(
9664 #endif
9665 }
9666
9667-fastcall unsigned long patch_espfix_desc(unsigned long uesp,
9668+unsigned long patch_espfix_desc(unsigned long uesp,
9669 unsigned long kesp)
9670 {
9671 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
9672@@ -1072,7 +1113,7 @@ asmlinkage void math_emulate(long arg)
9673 * NB. All these are "trap gates" (i.e. events_mask isn't set) except
9674 * for those that specify <dpl>|4 in the second field.
9675 */
9676-static trap_info_t __cpuinitdata trap_table[] = {
9677+static const trap_info_t __cpuinitconst trap_table[] = {
9678 { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
9679 { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
9680 { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
9681@@ -1105,17 +1146,12 @@ void __init trap_init(void)
9682 if (ret)
9683 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
9684
9685+ /*
9686+ * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9687+ * Generate a build-time error if the alignment is wrong.
9688+ */
9689+ BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
9690 if (cpu_has_fxsr) {
9691- /*
9692- * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9693- * Generates a compile-time "error: zero width for bit-field" if
9694- * the alignment is wrong.
9695- */
9696- struct fxsrAlignAssert {
9697- int _:!(offsetof(struct task_struct,
9698- thread.i387.fxsave) & 15);
9699- };
9700-
9701 printk(KERN_INFO "Enabling fast FPU save and restore... ");
9702 set_in_cr4(X86_CR4_OSFXSR);
9703 printk("done.\n");
9704--- a/arch/x86/kernel/traps_64-xen.c
9705+++ b/arch/x86/kernel/traps_64-xen.c
9706@@ -74,38 +74,41 @@ asmlinkage void alignment_check(void);
9707 asmlinkage void machine_check(void);
9708 asmlinkage void spurious_interrupt_bug(void);
9709
9710+static unsigned int code_bytes = 64;
9711+
9712 static inline void conditional_sti(struct pt_regs *regs)
9713 {
9714- if (regs->eflags & X86_EFLAGS_IF)
9715+ if (regs->flags & X86_EFLAGS_IF)
9716 local_irq_enable();
9717 }
9718
9719 static inline void preempt_conditional_sti(struct pt_regs *regs)
9720 {
9721- preempt_disable();
9722- if (regs->eflags & X86_EFLAGS_IF)
9723+ inc_preempt_count();
9724+ if (regs->flags & X86_EFLAGS_IF)
9725 local_irq_enable();
9726 }
9727
9728 static inline void preempt_conditional_cli(struct pt_regs *regs)
9729 {
9730- if (regs->eflags & X86_EFLAGS_IF)
9731+ if (regs->flags & X86_EFLAGS_IF)
9732 local_irq_disable();
9733 /* Make sure to not schedule here because we could be running
9734 on an exception stack. */
9735- preempt_enable_no_resched();
9736+ dec_preempt_count();
9737 }
9738
9739 int kstack_depth_to_print = 12;
9740
9741-#ifdef CONFIG_KALLSYMS
9742-void printk_address(unsigned long address)
9743+void printk_address(unsigned long address, int reliable)
9744 {
9745+#ifdef CONFIG_KALLSYMS
9746 unsigned long offset = 0, symsize;
9747 const char *symname;
9748 char *modname;
9749 char *delim = ":";
9750- char namebuf[128];
9751+ char namebuf[KSYM_NAME_LEN];
9752+ char reliab[4] = "";
9753
9754 symname = kallsyms_lookup(address, &symsize, &offset,
9755 &modname, namebuf);
9756@@ -113,17 +116,17 @@ void printk_address(unsigned long addres
9757 printk(" [<%016lx>]\n", address);
9758 return;
9759 }
9760+ if (!reliable)
9761+ strcpy(reliab, "? ");
9762+
9763 if (!modname)
9764- modname = delim = "";
9765- printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
9766- address, delim, modname, delim, symname, offset, symsize);
9767-}
9768+ modname = delim = "";
9769+ printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
9770+ address, reliab, delim, modname, delim, symname, offset, symsize);
9771 #else
9772-void printk_address(unsigned long address)
9773-{
9774 printk(" [<%016lx>]\n", address);
9775-}
9776 #endif
9777+}
9778
9779 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
9780 unsigned *usedp, char **idp)
9781@@ -210,14 +213,53 @@ static unsigned long *in_exception_stack
9782 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
9783 */
9784
9785-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
9786+static inline int valid_stack_ptr(struct thread_info *tinfo,
9787+ void *p, unsigned int size, void *end)
9788 {
9789- void *t = (void *)tinfo;
9790- return p > t && p < t + THREAD_SIZE - 3;
9791+ void *t = tinfo;
9792+ if (end) {
9793+ if (p < end && p >= (end-THREAD_SIZE))
9794+ return 1;
9795+ else
9796+ return 0;
9797+ }
9798+ return p > t && p < t + THREAD_SIZE - size;
9799+}
9800+
9801+/* The form of the top of the frame on the stack */
9802+struct stack_frame {
9803+ struct stack_frame *next_frame;
9804+ unsigned long return_address;
9805+};
9806+
9807+
9808+static inline unsigned long print_context_stack(struct thread_info *tinfo,
9809+ unsigned long *stack, unsigned long bp,
9810+ const struct stacktrace_ops *ops, void *data,
9811+ unsigned long *end)
9812+{
9813+ struct stack_frame *frame = (struct stack_frame *)bp;
9814+
9815+ while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
9816+ unsigned long addr;
9817+
9818+ addr = *stack;
9819+ if (__kernel_text_address(addr)) {
9820+ if ((unsigned long) stack == bp + 8) {
9821+ ops->address(data, addr, 1);
9822+ frame = frame->next_frame;
9823+ bp = (unsigned long) frame;
9824+ } else {
9825+ ops->address(data, addr, bp == 0);
9826+ }
9827+ }
9828+ stack++;
9829+ }
9830+ return bp;
9831 }
9832
9833 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
9834- unsigned long *stack,
9835+ unsigned long *stack, unsigned long bp,
9836 const struct stacktrace_ops *ops, void *data)
9837 {
9838 const unsigned cpu = get_cpu();
9839@@ -227,36 +269,28 @@ void dump_trace(struct task_struct *tsk,
9840
9841 if (!tsk)
9842 tsk = current;
9843+ tinfo = task_thread_info(tsk);
9844
9845 if (!stack) {
9846 unsigned long dummy;
9847 stack = &dummy;
9848 if (tsk && tsk != current)
9849- stack = (unsigned long *)tsk->thread.rsp;
9850+ stack = (unsigned long *)tsk->thread.sp;
9851 }
9852
9853- /*
9854- * Print function call entries within a stack. 'cond' is the
9855- * "end of stackframe" condition, that the 'stack++'
9856- * iteration will eventually trigger.
9857- */
9858-#define HANDLE_STACK(cond) \
9859- do while (cond) { \
9860- unsigned long addr = *stack++; \
9861- /* Use unlocked access here because except for NMIs \
9862- we should be already protected against module unloads */ \
9863- if (__kernel_text_address(addr)) { \
9864- /* \
9865- * If the address is either in the text segment of the \
9866- * kernel, or in the region which contains vmalloc'ed \
9867- * memory, it *may* be the address of a calling \
9868- * routine; if so, print it so that someone tracing \
9869- * down the cause of the crash will be able to figure \
9870- * out the call path that was taken. \
9871- */ \
9872- ops->address(data, addr); \
9873- } \
9874- } while (0)
9875+#ifdef CONFIG_FRAME_POINTER
9876+ if (!bp) {
9877+ if (tsk == current) {
9878+ /* Grab bp right from our regs */
9879+ asm("movq %%rbp, %0" : "=r" (bp):);
9880+ } else {
9881+ /* bp is the last reg pushed by switch_to */
9882+ bp = *(unsigned long *) tsk->thread.sp;
9883+ }
9884+ }
9885+#endif
9886+
9887+
9888
9889 /*
9890 * Print function call entries in all stacks, starting at the
9891@@ -272,7 +306,9 @@ void dump_trace(struct task_struct *tsk,
9892 if (estack_end) {
9893 if (ops->stack(data, id) < 0)
9894 break;
9895- HANDLE_STACK (stack < estack_end);
9896+
9897+ bp = print_context_stack(tinfo, stack, bp, ops,
9898+ data, estack_end);
9899 ops->stack(data, "<EOE>");
9900 /*
9901 * We link to the next stack via the
9902@@ -290,7 +326,8 @@ void dump_trace(struct task_struct *tsk,
9903 if (stack >= irqstack && stack < irqstack_end) {
9904 if (ops->stack(data, "IRQ") < 0)
9905 break;
9906- HANDLE_STACK (stack < irqstack_end);
9907+ bp = print_context_stack(tinfo, stack, bp,
9908+ ops, data, irqstack_end);
9909 /*
9910 * We link to the next stack (which would be
9911 * the process stack normally) the last
9912@@ -308,9 +345,7 @@ void dump_trace(struct task_struct *tsk,
9913 /*
9914 * This handles the process stack:
9915 */
9916- tinfo = task_thread_info(tsk);
9917- HANDLE_STACK (valid_stack_ptr(tinfo, stack));
9918-#undef HANDLE_STACK
9919+ bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
9920 put_cpu();
9921 }
9922 EXPORT_SYMBOL(dump_trace);
9923@@ -333,10 +368,10 @@ static int print_trace_stack(void *data,
9924 return 0;
9925 }
9926
9927-static void print_trace_address(void *data, unsigned long addr)
9928+static void print_trace_address(void *data, unsigned long addr, int reliable)
9929 {
9930 touch_nmi_watchdog();
9931- printk_address(addr);
9932+ printk_address(addr, reliable);
9933 }
9934
9935 static const struct stacktrace_ops print_trace_ops = {
9936@@ -347,15 +382,17 @@ static const struct stacktrace_ops print
9937 };
9938
9939 void
9940-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
9941+show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
9942+ unsigned long bp)
9943 {
9944 printk("\nCall Trace:\n");
9945- dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
9946+ dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
9947 printk("\n");
9948 }
9949
9950 static void
9951-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
9952+_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
9953+ unsigned long bp)
9954 {
9955 unsigned long *stack;
9956 int i;
9957@@ -366,14 +403,14 @@ _show_stack(struct task_struct *tsk, str
9958 // debugging aid: "show_stack(NULL, NULL);" prints the
9959 // back trace for this cpu.
9960
9961- if (rsp == NULL) {
9962+ if (sp == NULL) {
9963 if (tsk)
9964- rsp = (unsigned long *)tsk->thread.rsp;
9965+ sp = (unsigned long *)tsk->thread.sp;
9966 else
9967- rsp = (unsigned long *)&rsp;
9968+ sp = (unsigned long *)&sp;
9969 }
9970
9971- stack = rsp;
9972+ stack = sp;
9973 for(i=0; i < kstack_depth_to_print; i++) {
9974 if (stack >= irqstack && stack <= irqstack_end) {
9975 if (stack == irqstack_end) {
9976@@ -389,12 +426,12 @@ _show_stack(struct task_struct *tsk, str
9977 printk(" %016lx", *stack++);
9978 touch_nmi_watchdog();
9979 }
9980- show_trace(tsk, regs, rsp);
9981+ show_trace(tsk, regs, sp, bp);
9982 }
9983
9984-void show_stack(struct task_struct *tsk, unsigned long * rsp)
9985+void show_stack(struct task_struct *tsk, unsigned long * sp)
9986 {
9987- _show_stack(tsk, NULL, rsp);
9988+ _show_stack(tsk, NULL, sp, 0);
9989 }
9990
9991 /*
9992@@ -403,13 +440,19 @@ void show_stack(struct task_struct *tsk,
9993 void dump_stack(void)
9994 {
9995 unsigned long dummy;
9996+ unsigned long bp = 0;
9997+
9998+#ifdef CONFIG_FRAME_POINTER
9999+ if (!bp)
10000+ asm("movq %%rbp, %0" : "=r" (bp):);
10001+#endif
10002
10003 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
10004 current->pid, current->comm, print_tainted(),
10005 init_utsname()->release,
10006 (int)strcspn(init_utsname()->version, " "),
10007 init_utsname()->version);
10008- show_trace(NULL, NULL, &dummy);
10009+ show_trace(NULL, NULL, &dummy, bp);
10010 }
10011
10012 EXPORT_SYMBOL(dump_stack);
10013@@ -417,12 +460,15 @@ EXPORT_SYMBOL(dump_stack);
10014 void show_registers(struct pt_regs *regs)
10015 {
10016 int i;
10017- int in_kernel = !user_mode(regs);
10018- unsigned long rsp;
10019+ unsigned long sp;
10020 const int cpu = smp_processor_id();
10021 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
10022+ u8 *ip;
10023+ unsigned int code_prologue = code_bytes * 43 / 64;
10024+ unsigned int code_len = code_bytes;
10025
10026- rsp = regs->rsp;
10027+ sp = regs->sp;
10028+ ip = (u8 *) regs->ip - code_prologue;
10029 printk("CPU %d ", cpu);
10030 __show_regs(regs);
10031 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
10032@@ -432,45 +478,43 @@ void show_registers(struct pt_regs *regs
10033 * When in-kernel, we also print out the stack and code at the
10034 * time of the fault..
10035 */
10036- if (in_kernel) {
10037+ if (!user_mode(regs)) {
10038+ unsigned char c;
10039 printk("Stack: ");
10040- _show_stack(NULL, regs, (unsigned long*)rsp);
10041+ _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
10042+ printk("\n");
10043
10044- printk("\nCode: ");
10045- if (regs->rip < PAGE_OFFSET)
10046- goto bad;
10047-
10048- for (i=0; i<20; i++) {
10049- unsigned char c;
10050- if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
10051-bad:
10052+ printk(KERN_EMERG "Code: ");
10053+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
10054+ /* try starting at RIP */
10055+ ip = (u8 *) regs->ip;
10056+ code_len = code_len - code_prologue + 1;
10057+ }
10058+ for (i = 0; i < code_len; i++, ip++) {
10059+ if (ip < (u8 *)PAGE_OFFSET ||
10060+ probe_kernel_address(ip, c)) {
10061 printk(" Bad RIP value.");
10062 break;
10063 }
10064- printk("%02x ", c);
10065+ if (ip == (u8 *)regs->ip)
10066+ printk("<%02x> ", c);
10067+ else
10068+ printk("%02x ", c);
10069 }
10070 }
10071 printk("\n");
10072 }
10073
10074-int is_valid_bugaddr(unsigned long rip)
10075+int is_valid_bugaddr(unsigned long ip)
10076 {
10077 unsigned short ud2;
10078
10079- if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
10080+ if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
10081 return 0;
10082
10083 return ud2 == 0x0b0f;
10084 }
10085
10086-#ifdef CONFIG_BUG
10087-void out_of_line_bug(void)
10088-{
10089- BUG();
10090-}
10091-EXPORT_SYMBOL(out_of_line_bug);
10092-#endif
10093-
10094 static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
10095 static int die_owner = -1;
10096 static unsigned int die_nest_count;
10097@@ -498,7 +542,7 @@ unsigned __kprobes long oops_begin(void)
10098 return flags;
10099 }
10100
10101-void __kprobes oops_end(unsigned long flags)
10102+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
10103 {
10104 die_owner = -1;
10105 bust_spinlocks(0);
10106@@ -507,12 +551,17 @@ void __kprobes oops_end(unsigned long fl
10107 /* Nest count reaches zero, release the lock. */
10108 __raw_spin_unlock(&die_lock);
10109 raw_local_irq_restore(flags);
10110+ if (!regs) {
10111+ oops_exit();
10112+ return;
10113+ }
10114 if (panic_on_oops)
10115 panic("Fatal exception");
10116 oops_exit();
10117+ do_exit(signr);
10118 }
10119
10120-void __kprobes __die(const char * str, struct pt_regs * regs, long err)
10121+int __kprobes __die(const char * str, struct pt_regs * regs, long err)
10122 {
10123 static int die_counter;
10124 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
10125@@ -526,15 +575,17 @@ void __kprobes __die(const char * str, s
10126 printk("DEBUG_PAGEALLOC");
10127 #endif
10128 printk("\n");
10129- notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
10130+ if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
10131+ return 1;
10132 show_registers(regs);
10133 add_taint(TAINT_DIE);
10134 /* Executive summary in case the oops scrolled away */
10135 printk(KERN_ALERT "RIP ");
10136- printk_address(regs->rip);
10137- printk(" RSP <%016lx>\n", regs->rsp);
10138+ printk_address(regs->ip, 1);
10139+ printk(" RSP <%016lx>\n", regs->sp);
10140 if (kexec_should_crash(current))
10141 crash_kexec(regs);
10142+ return 0;
10143 }
10144
10145 void die(const char * str, struct pt_regs * regs, long err)
10146@@ -542,11 +593,11 @@ void die(const char * str, struct pt_reg
10147 unsigned long flags = oops_begin();
10148
10149 if (!user_mode(regs))
10150- report_bug(regs->rip, regs);
10151+ report_bug(regs->ip, regs);
10152
10153- __die(str, regs, err);
10154- oops_end(flags);
10155- do_exit(SIGSEGV);
10156+ if (__die(str, regs, err))
10157+ regs = NULL;
10158+ oops_end(flags, regs, SIGSEGV);
10159 }
10160
10161 #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
10162@@ -564,10 +615,10 @@ void __kprobes die_nmi(char *str, struct
10163 crash_kexec(regs);
10164 if (do_panic || panic_on_oops)
10165 panic("Non maskable interrupt");
10166- oops_end(flags);
10167+ oops_end(flags, NULL, SIGBUS);
10168 nmi_exit();
10169 local_irq_enable();
10170- do_exit(SIGSEGV);
10171+ do_exit(SIGBUS);
10172 }
10173 #endif
10174
10175@@ -592,11 +643,14 @@ static void __kprobes do_trap(int trapnr
10176 tsk->thread.trap_no = trapnr;
10177
10178 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
10179- printk_ratelimit())
10180+ printk_ratelimit()) {
10181 printk(KERN_INFO
10182- "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
10183+ "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
10184 tsk->comm, tsk->pid, str,
10185- regs->rip, regs->rsp, error_code);
10186+ regs->ip, regs->sp, error_code);
10187+ print_vma_addr(" in ", regs->ip);
10188+ printk("\n");
10189+ }
10190
10191 if (info)
10192 force_sig_info(signr, info, tsk);
10193@@ -606,19 +660,12 @@ static void __kprobes do_trap(int trapnr
10194 }
10195
10196
10197- /* kernel trap */
10198- {
10199- const struct exception_table_entry *fixup;
10200- fixup = search_exception_tables(regs->rip);
10201- if (fixup)
10202- regs->rip = fixup->fixup;
10203- else {
10204- tsk->thread.error_code = error_code;
10205- tsk->thread.trap_no = trapnr;
10206- die(str, regs, error_code);
10207- }
10208- return;
10209+ if (!fixup_exception(regs)) {
10210+ tsk->thread.error_code = error_code;
10211+ tsk->thread.trap_no = trapnr;
10212+ die(str, regs, error_code);
10213 }
10214+ return;
10215 }
10216
10217 #define DO_ERROR(trapnr, signr, str, name) \
10218@@ -647,10 +694,10 @@ asmlinkage void do_##name(struct pt_regs
10219 do_trap(trapnr, signr, str, regs, error_code, &info); \
10220 }
10221
10222-DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
10223+DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
10224 DO_ERROR( 4, SIGSEGV, "overflow", overflow)
10225 DO_ERROR( 5, SIGSEGV, "bounds", bounds)
10226-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
10227+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
10228 DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
10229 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
10230 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
10231@@ -698,32 +745,28 @@ asmlinkage void __kprobes do_general_pro
10232 tsk->thread.trap_no = 13;
10233
10234 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
10235- printk_ratelimit())
10236+ printk_ratelimit()) {
10237 printk(KERN_INFO
10238- "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
10239+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
10240 tsk->comm, tsk->pid,
10241- regs->rip, regs->rsp, error_code);
10242+ regs->ip, regs->sp, error_code);
10243+ print_vma_addr(" in ", regs->ip);
10244+ printk("\n");
10245+ }
10246
10247 force_sig(SIGSEGV, tsk);
10248 return;
10249 }
10250
10251- /* kernel gp */
10252- {
10253- const struct exception_table_entry *fixup;
10254- fixup = search_exception_tables(regs->rip);
10255- if (fixup) {
10256- regs->rip = fixup->fixup;
10257- return;
10258- }
10259+ if (fixup_exception(regs))
10260+ return;
10261
10262- tsk->thread.error_code = error_code;
10263- tsk->thread.trap_no = 13;
10264- if (notify_die(DIE_GPF, "general protection fault", regs,
10265- error_code, 13, SIGSEGV) == NOTIFY_STOP)
10266- return;
10267- die("general protection fault", regs, error_code);
10268- }
10269+ tsk->thread.error_code = error_code;
10270+ tsk->thread.trap_no = 13;
10271+ if (notify_die(DIE_GPF, "general protection fault", regs,
10272+ error_code, 13, SIGSEGV) == NOTIFY_STOP)
10273+ return;
10274+ die("general protection fault", regs, error_code);
10275 }
10276
10277 static __kprobes void
10278@@ -833,15 +876,15 @@ asmlinkage __kprobes struct pt_regs *syn
10279 {
10280 struct pt_regs *regs = eregs;
10281 /* Did already sync */
10282- if (eregs == (struct pt_regs *)eregs->rsp)
10283+ if (eregs == (struct pt_regs *)eregs->sp)
10284 ;
10285 /* Exception from user space */
10286 else if (user_mode(eregs))
10287 regs = task_pt_regs(current);
10288 /* Exception from kernel and interrupts are enabled. Move to
10289 kernel process stack. */
10290- else if (eregs->eflags & X86_EFLAGS_IF)
10291- regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
10292+ else if (eregs->flags & X86_EFLAGS_IF)
10293+ regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
10294 if (eregs != regs)
10295 *regs = *eregs;
10296 return regs;
10297@@ -859,6 +902,12 @@ asmlinkage void __kprobes do_debug(struc
10298
10299 get_debugreg(condition, 6);
10300
10301+ /*
10302+ * The processor cleared BTF, so don't mark that we need it set.
10303+ */
10304+ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
10305+ tsk->thread.debugctlmsr = 0;
10306+
10307 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
10308 SIGTRAP) == NOTIFY_STOP)
10309 return;
10310@@ -874,27 +923,14 @@ asmlinkage void __kprobes do_debug(struc
10311
10312 tsk->thread.debugreg6 = condition;
10313
10314- /* Mask out spurious TF errors due to lazy TF clearing */
10315+
10316+ /*
10317+ * Single-stepping through TF: make sure we ignore any events in
10318+ * kernel space (but re-enable TF when returning to user mode).
10319+ */
10320 if (condition & DR_STEP) {
10321- /*
10322- * The TF error should be masked out only if the current
10323- * process is not traced and if the TRAP flag has been set
10324- * previously by a tracing process (condition detected by
10325- * the PT_DTRACE flag); remember that the i386 TRAP flag
10326- * can be modified by the process itself in user mode,
10327- * allowing programs to debug themselves without the ptrace()
10328- * interface.
10329- */
10330 if (!user_mode(regs))
10331 goto clear_TF_reenable;
10332- /*
10333- * Was the TF flag set by a debugger? If so, clear it now,
10334- * so that register information is correct.
10335- */
10336- if (tsk->ptrace & PT_DTRACE) {
10337- regs->eflags &= ~TF_MASK;
10338- tsk->ptrace &= ~PT_DTRACE;
10339- }
10340 }
10341
10342 /* Ok, finally something we can handle */
10343@@ -903,7 +939,7 @@ asmlinkage void __kprobes do_debug(struc
10344 info.si_signo = SIGTRAP;
10345 info.si_errno = 0;
10346 info.si_code = TRAP_BRKPT;
10347- info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
10348+ info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
10349 force_sig_info(SIGTRAP, &info, tsk);
10350
10351 clear_dr7:
10352@@ -913,18 +949,15 @@ clear_dr7:
10353
10354 clear_TF_reenable:
10355 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
10356- regs->eflags &= ~TF_MASK;
10357+ regs->flags &= ~X86_EFLAGS_TF;
10358 preempt_conditional_cli(regs);
10359 }
10360
10361 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
10362 {
10363- const struct exception_table_entry *fixup;
10364- fixup = search_exception_tables(regs->rip);
10365- if (fixup) {
10366- regs->rip = fixup->fixup;
10367+ if (fixup_exception(regs))
10368 return 1;
10369- }
10370+
10371 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
10372 /* Illegal floating point operation in the kernel */
10373 current->thread.trap_no = trapnr;
10374@@ -939,7 +972,7 @@ static int kernel_math_error(struct pt_r
10375 */
10376 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
10377 {
10378- void __user *rip = (void __user *)(regs->rip);
10379+ void __user *ip = (void __user *)(regs->ip);
10380 struct task_struct * task;
10381 siginfo_t info;
10382 unsigned short cwd, swd;
10383@@ -959,7 +992,7 @@ asmlinkage void do_coprocessor_error(str
10384 info.si_signo = SIGFPE;
10385 info.si_errno = 0;
10386 info.si_code = __SI_FAULT;
10387- info.si_addr = rip;
10388+ info.si_addr = ip;
10389 /*
10390 * (~cwd & swd) will mask out exceptions that are not set to unmasked
10391 * status. 0x3f is the exception bits in these regs, 0x200 is the
10392@@ -1008,7 +1041,7 @@ asmlinkage void bad_intr(void)
10393
10394 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
10395 {
10396- void __user *rip = (void __user *)(regs->rip);
10397+ void __user *ip = (void __user *)(regs->ip);
10398 struct task_struct * task;
10399 siginfo_t info;
10400 unsigned short mxcsr;
10401@@ -1028,7 +1061,7 @@ asmlinkage void do_simd_coprocessor_erro
10402 info.si_signo = SIGFPE;
10403 info.si_errno = 0;
10404 info.si_code = __SI_FAULT;
10405- info.si_addr = rip;
10406+ info.si_addr = ip;
10407 /*
10408 * The SIMD FPU exceptions are handled a little differently, as there
10409 * is only a single status/control register. Thus, to determine which
10410@@ -1092,13 +1125,14 @@ asmlinkage void math_state_restore(void)
10411 task_thread_info(me)->status |= TS_USEDFPU;
10412 me->fpu_counter++;
10413 }
10414+EXPORT_SYMBOL_GPL(math_state_restore);
10415
10416
10417 /*
10418 * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
10419 * specify <dpl>|4 in the second field.
10420 */
10421-static trap_info_t __cpuinitdata trap_table[] = {
10422+static const trap_info_t __cpuinitconst trap_table[] = {
10423 { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
10424 { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
10425 { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
10426@@ -1169,3 +1203,14 @@ static int __init kstack_setup(char *s)
10427 return 0;
10428 }
10429 early_param("kstack", kstack_setup);
10430+
10431+
10432+static int __init code_bytes_setup(char *s)
10433+{
10434+ code_bytes = simple_strtoul(s, NULL, 0);
10435+ if (code_bytes > 8192)
10436+ code_bytes = 8192;
10437+
10438+ return 1;
10439+}
10440+__setup("code_bytes=", code_bytes_setup);
10441--- a/arch/x86/kernel/vsyscall_64-xen.c
10442+++ b/arch/x86/kernel/vsyscall_64-xen.c
10443@@ -43,12 +43,7 @@
10444 #include <asm/vgtod.h>
10445
10446 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
10447-#define __syscall_clobber "r11","rcx","memory"
10448-#define __pa_vsymbol(x) \
10449- ({unsigned long v; \
10450- extern char __vsyscall_0; \
10451- asm("" : "=r" (v) : "0" (x)); \
10452- ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
10453+#define __syscall_clobber "r11","cx","memory"
10454
10455 /*
10456 * vsyscall_gtod_data contains data that is :
10457@@ -102,7 +97,7 @@ static __always_inline void do_get_tz(st
10458 static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
10459 {
10460 int ret;
10461- asm volatile("vsysc2: syscall"
10462+ asm volatile("syscall"
10463 : "=a" (ret)
10464 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
10465 : __syscall_clobber );
10466@@ -112,7 +107,7 @@ static __always_inline int gettimeofday(
10467 static __always_inline long time_syscall(long *t)
10468 {
10469 long secs;
10470- asm volatile("vsysc1: syscall"
10471+ asm volatile("syscall"
10472 : "=a" (secs)
10473 : "0" (__NR_time),"D" (t) : __syscall_clobber);
10474 return secs;
10475@@ -190,7 +185,7 @@ time_t __vsyscall(1) vtime(time_t *t)
10476 long __vsyscall(2)
10477 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
10478 {
10479- unsigned int dummy, p;
10480+ unsigned int p;
10481 unsigned long j = 0;
10482
10483 /* Fast cache - only recompute value once per jiffies and avoid
10484@@ -205,7 +200,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
10485 p = tcache->blob[1];
10486 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
10487 /* Load per CPU data from RDTSCP */
10488- rdtscp(dummy, dummy, p);
10489+ native_read_tscp(&p);
10490 } else {
10491 /* Load per CPU data from GDT */
10492 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
10493@@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void)
10494
10495 #ifdef CONFIG_SYSCTL
10496
10497-#define SYSCALL 0x050f
10498-#define NOP2 0x9090
10499-
10500-/*
10501- * NOP out syscall in vsyscall page when not needed.
10502- */
10503-static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10504- void __user *buffer, size_t *lenp, loff_t *ppos)
10505+static int
10506+vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10507+ void __user *buffer, size_t *lenp, loff_t *ppos)
10508 {
10509- extern u16 vsysc1, vsysc2;
10510- u16 __iomem *map1;
10511- u16 __iomem *map2;
10512- int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10513- if (!write)
10514- return ret;
10515- /* gcc has some trouble with __va(__pa()), so just do it this
10516- way. */
10517- map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
10518- if (!map1)
10519- return -ENOMEM;
10520- map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
10521- if (!map2) {
10522- ret = -ENOMEM;
10523- goto out;
10524- }
10525- if (!vsyscall_gtod_data.sysctl_enabled) {
10526- writew(SYSCALL, map1);
10527- writew(SYSCALL, map2);
10528- } else {
10529- writew(NOP2, map1);
10530- writew(NOP2, map2);
10531- }
10532- iounmap(map2);
10533-out:
10534- iounmap(map1);
10535- return ret;
10536+ return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10537 }
10538
10539 static ctl_table kernel_table2[] = {
10540@@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] =
10541 .child = kernel_table2 },
10542 {}
10543 };
10544-
10545 #endif
10546
10547 /* Assume __initcall executes before all user space. Hopefully kmod
10548@@ -301,7 +264,7 @@ static void __cpuinit vsyscall_set_cpu(i
10549 d |= cpu;
10550 d |= (node & 0xf) << 12;
10551 d |= (node >> 4) << 48;
10552- if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
10553+ if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
10554 + GDT_ENTRY_PER_CPU),
10555 d))
10556 BUG();
10557@@ -322,7 +285,7 @@ cpu_vsyscall_notifier(struct notifier_bl
10558 return NOTIFY_DONE;
10559 }
10560
10561-static void __init map_vsyscall(void)
10562+void __init map_vsyscall(void)
10563 {
10564 extern char __vsyscall_0;
10565 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
10566@@ -338,7 +301,6 @@ static int __init vsyscall_init(void)
10567 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
10568 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
10569 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
10570- map_vsyscall();
10571 #ifdef CONFIG_XEN
10572 vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofay() */
10573 if (boot_cpu_has(X86_FEATURE_RDTSCP))
10574--- a/arch/x86/kernel/xen_entry_64.S
10575+++ /dev/null
10576@@ -1,36 +0,0 @@
10577-/*
10578- * Copied from arch/xen/i386/kernel/entry.S
10579- */
10580-/* Offsets into shared_info_t. */
10581-#define evtchn_upcall_pending /* 0 */
10582-#define evtchn_upcall_mask 1
10583-
10584-#define sizeof_vcpu_shift 6
10585-
10586-#ifdef CONFIG_SMP
10587-//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
10588-//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
10589-#define preempt_disable(reg)
10590-#define preempt_enable(reg)
10591-#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
10592- movq %gs:pda_cpunumber,reg ; \
10593- shl $32, reg ; \
10594- shr $32-sizeof_vcpu_shift,reg ; \
10595- addq HYPERVISOR_shared_info,reg
10596-#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
10597-#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
10598-#else
10599-#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
10600-#define XEN_PUT_VCPU_INFO(reg)
10601-#define XEN_PUT_VCPU_INFO_fixup
10602-#endif
10603-
10604-#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
10605-#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
10606-#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
10607- XEN_LOCKED_BLOCK_EVENTS(reg) ; \
10608- XEN_PUT_VCPU_INFO(reg)
10609-#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
10610- XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
10611- XEN_PUT_VCPU_INFO(reg)
10612-#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
10613--- a/arch/x86/mach-xen/setup.c
10614+++ b/arch/x86/mach-xen/setup.c
10615@@ -161,15 +161,12 @@ void __init machine_specific_arch_setup(
10616
10617 /* Do an early initialization of the fixmap area */
10618 {
10619- extern pte_t swapper_pg_pmd[PTRS_PER_PTE];
10620+ extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
10621 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
10622- pgd_t *pgd = (pgd_t *)xen_start_info->pt_base;
10623- pud_t *pud = pud_offset(pgd + pgd_index(addr), addr);
10624+ pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
10625 pmd_t *pmd = pmd_offset(pud, addr);
10626
10627- swapper_pg_dir = pgd;
10628- init_mm.pgd = pgd;
10629- make_lowmem_page_readonly(swapper_pg_pmd, XENFEAT_writable_page_tables);
10630- set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_pmd) | _PAGE_TABLE));
10631+ make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
10632+ set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
10633 }
10634 }
10635--- a/arch/x86/mm/fault_32-xen.c
10636+++ /dev/null
10637@@ -1,757 +0,0 @@
10638-/*
10639- * linux/arch/i386/mm/fault.c
10640- *
10641- * Copyright (C) 1995 Linus Torvalds
10642- */
10643-
10644-#include <linux/signal.h>
10645-#include <linux/sched.h>
10646-#include <linux/kernel.h>
10647-#include <linux/errno.h>
10648-#include <linux/string.h>
10649-#include <linux/types.h>
10650-#include <linux/ptrace.h>
10651-#include <linux/mman.h>
10652-#include <linux/mm.h>
10653-#include <linux/smp.h>
10654-#include <linux/interrupt.h>
10655-#include <linux/init.h>
10656-#include <linux/tty.h>
10657-#include <linux/vt_kern.h> /* For unblank_screen() */
10658-#include <linux/highmem.h>
10659-#include <linux/bootmem.h> /* for max_low_pfn */
10660-#include <linux/vmalloc.h>
10661-#include <linux/module.h>
10662-#include <linux/kprobes.h>
10663-#include <linux/uaccess.h>
10664-#include <linux/kdebug.h>
10665-#include <linux/kprobes.h>
10666-
10667-#include <asm/system.h>
10668-#include <asm/desc.h>
10669-#include <asm/segment.h>
10670-
10671-extern void die(const char *,struct pt_regs *,long);
10672-
10673-#ifdef CONFIG_KPROBES
10674-static inline int notify_page_fault(struct pt_regs *regs)
10675-{
10676- int ret = 0;
10677-
10678- /* kprobe_running() needs smp_processor_id() */
10679- if (!user_mode_vm(regs)) {
10680- preempt_disable();
10681- if (kprobe_running() && kprobe_fault_handler(regs, 14))
10682- ret = 1;
10683- preempt_enable();
10684- }
10685-
10686- return ret;
10687-}
10688-#else
10689-static inline int notify_page_fault(struct pt_regs *regs)
10690-{
10691- return 0;
10692-}
10693-#endif
10694-
10695-/*
10696- * Return EIP plus the CS segment base. The segment limit is also
10697- * adjusted, clamped to the kernel/user address space (whichever is
10698- * appropriate), and returned in *eip_limit.
10699- *
10700- * The segment is checked, because it might have been changed by another
10701- * task between the original faulting instruction and here.
10702- *
10703- * If CS is no longer a valid code segment, or if EIP is beyond the
10704- * limit, or if it is a kernel address when CS is not a kernel segment,
10705- * then the returned value will be greater than *eip_limit.
10706- *
10707- * This is slow, but is very rarely executed.
10708- */
10709-static inline unsigned long get_segment_eip(struct pt_regs *regs,
10710- unsigned long *eip_limit)
10711-{
10712- unsigned long eip = regs->eip;
10713- unsigned seg = regs->xcs & 0xffff;
10714- u32 seg_ar, seg_limit, base, *desc;
10715-
10716- /* Unlikely, but must come before segment checks. */
10717- if (unlikely(regs->eflags & VM_MASK)) {
10718- base = seg << 4;
10719- *eip_limit = base + 0xffff;
10720- return base + (eip & 0xffff);
10721- }
10722-
10723- /* The standard kernel/user address space limit. */
10724- *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
10725-
10726- /* By far the most common cases. */
10727- if (likely(SEGMENT_IS_FLAT_CODE(seg)))
10728- return eip;
10729-
10730- /* Check the segment exists, is within the current LDT/GDT size,
10731- that kernel/user (ring 0..3) has the appropriate privilege,
10732- that it's a code segment, and get the limit. */
10733- __asm__ ("larl %3,%0; lsll %3,%1"
10734- : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
10735- if ((~seg_ar & 0x9800) || eip > seg_limit) {
10736- *eip_limit = 0;
10737- return 1; /* So that returned eip > *eip_limit. */
10738- }
10739-
10740- /* Get the GDT/LDT descriptor base.
10741- When you look for races in this code remember that
10742- LDT and other horrors are only used in user space. */
10743- if (seg & (1<<2)) {
10744- /* Must lock the LDT while reading it. */
10745- mutex_lock(&current->mm->context.lock);
10746- desc = current->mm->context.ldt;
10747- desc = (void *)desc + (seg & ~7);
10748- } else {
10749- /* Must disable preemption while reading the GDT. */
10750- desc = (u32 *)get_cpu_gdt_table(get_cpu());
10751- desc = (void *)desc + (seg & ~7);
10752- }
10753-
10754- /* Decode the code segment base from the descriptor */
10755- base = get_desc_base((unsigned long *)desc);
10756-
10757- if (seg & (1<<2)) {
10758- mutex_unlock(&current->mm->context.lock);
10759- } else
10760- put_cpu();
10761-
10762- /* Adjust EIP and segment limit, and clamp at the kernel limit.
10763- It's legitimate for segments to wrap at 0xffffffff. */
10764- seg_limit += base;
10765- if (seg_limit < *eip_limit && seg_limit >= base)
10766- *eip_limit = seg_limit;
10767- return eip + base;
10768-}
10769-
10770-/*
10771- * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
10772- * Check that here and ignore it.
10773- */
10774-static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
10775-{
10776- unsigned long limit;
10777- unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
10778- int scan_more = 1;
10779- int prefetch = 0;
10780- int i;
10781-
10782- for (i = 0; scan_more && i < 15; i++) {
10783- unsigned char opcode;
10784- unsigned char instr_hi;
10785- unsigned char instr_lo;
10786-
10787- if (instr > (unsigned char *)limit)
10788- break;
10789- if (probe_kernel_address(instr, opcode))
10790- break;
10791-
10792- instr_hi = opcode & 0xf0;
10793- instr_lo = opcode & 0x0f;
10794- instr++;
10795-
10796- switch (instr_hi) {
10797- case 0x20:
10798- case 0x30:
10799- /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
10800- scan_more = ((instr_lo & 7) == 0x6);
10801- break;
10802-
10803- case 0x60:
10804- /* 0x64 thru 0x67 are valid prefixes in all modes. */
10805- scan_more = (instr_lo & 0xC) == 0x4;
10806- break;
10807- case 0xF0:
10808- /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
10809- scan_more = !instr_lo || (instr_lo>>1) == 1;
10810- break;
10811- case 0x00:
10812- /* Prefetch instruction is 0x0F0D or 0x0F18 */
10813- scan_more = 0;
10814- if (instr > (unsigned char *)limit)
10815- break;
10816- if (probe_kernel_address(instr, opcode))
10817- break;
10818- prefetch = (instr_lo == 0xF) &&
10819- (opcode == 0x0D || opcode == 0x18);
10820- break;
10821- default:
10822- scan_more = 0;
10823- break;
10824- }
10825- }
10826- return prefetch;
10827-}
10828-
10829-static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
10830- unsigned long error_code)
10831-{
10832- if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
10833- boot_cpu_data.x86 >= 6)) {
10834- /* Catch an obscure case of prefetch inside an NX page. */
10835- if (nx_enabled && (error_code & 16))
10836- return 0;
10837- return __is_prefetch(regs, addr);
10838- }
10839- return 0;
10840-}
10841-
10842-static noinline void force_sig_info_fault(int si_signo, int si_code,
10843- unsigned long address, struct task_struct *tsk)
10844-{
10845- siginfo_t info;
10846-
10847- info.si_signo = si_signo;
10848- info.si_errno = 0;
10849- info.si_code = si_code;
10850- info.si_addr = (void __user *)address;
10851- force_sig_info(si_signo, &info, tsk);
10852-}
10853-
10854-fastcall void do_invalid_op(struct pt_regs *, unsigned long);
10855-
10856-#ifdef CONFIG_X86_PAE
10857-static void dump_fault_path(unsigned long address)
10858-{
10859- unsigned long *p, page;
10860- unsigned long mfn;
10861-
10862- page = read_cr3();
10863- p = (unsigned long *)__va(page);
10864- p += (address >> 30) * 2;
10865- printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
10866- if (p[0] & _PAGE_PRESENT) {
10867- mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
10868- page = mfn_to_pfn(mfn) << PAGE_SHIFT;
10869- p = (unsigned long *)__va(page);
10870- address &= 0x3fffffff;
10871- p += (address >> 21) * 2;
10872- printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
10873- page, p[1], p[0]);
10874- mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
10875-#ifdef CONFIG_HIGHPTE
10876- if (mfn_to_pfn(mfn) >= highstart_pfn)
10877- return;
10878-#endif
10879- if ((p[0] & _PAGE_PRESENT) && !(p[0] & _PAGE_PSE)) {
10880- page = mfn_to_pfn(mfn) << PAGE_SHIFT;
10881- p = (unsigned long *) __va(page);
10882- address &= 0x001fffff;
10883- p += (address >> 12) * 2;
10884- printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
10885- page, p[1], p[0]);
10886- }
10887- }
10888-}
10889-#else
10890-static void dump_fault_path(unsigned long address)
10891-{
10892- unsigned long page;
10893-
10894- page = read_cr3();
10895- page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
10896- printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
10897- machine_to_phys(page));
10898- /*
10899- * We must not directly access the pte in the highpte
10900- * case if the page table is located in highmem.
10901- * And let's rather not kmap-atomic the pte, just in case
10902- * it's allocated already.
10903- */
10904- if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
10905- && (page & _PAGE_PRESENT)
10906- && !(page & _PAGE_PSE)) {
10907- page = machine_to_phys(page & PAGE_MASK);
10908- page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
10909- & (PTRS_PER_PTE - 1)];
10910- printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
10911- machine_to_phys(page));
10912- }
10913-}
10914-#endif
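
Editor's note (not part of the patch): the "* 2" scaling in the PAE branch of dump_fault_path() comes from reading each 64-bit table entry as a pair of 32-bit unsigned longs, and the shifts select the 2-bit PDPT index, the 9-bit page-directory index and the 9-bit page-table index. A quick user-space sketch of that index arithmetic, assuming the standard 32-bit PAE split of 2 + 9 + 9 + 12 bits (the sample address is made up):

/* Index arithmetic for a 32-bit PAE virtual address: 2-bit PDPT index,
 * 9-bit page-directory index, 9-bit page-table index, 12-bit offset.
 * Mirrors the shifts/masks used by dump_fault_path() above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t address = 0xc0123456;	/* example kernel-space address */

	unsigned pdpt = address >> 30;		/* top 2 bits */
	unsigned pde = (address >> 21) & 0x1ff;	/* next 9 bits */
	unsigned pte = (address >> 12) & 0x1ff;	/* next 9 bits */
	unsigned off = address & 0xfff;		/* page offset */

	printf("pdpt=%u pde=%u pte=%u offset=0x%x\n", pdpt, pde, pte, off);
	return 0;
}
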
10915-
10916-static int spurious_fault(struct pt_regs *regs,
10917- unsigned long address,
10918- unsigned long error_code)
10919-{
10920- pgd_t *pgd;
10921- pud_t *pud;
10922- pmd_t *pmd;
10923- pte_t *pte;
10924-
10925- /* Reserved-bit violation or user access to kernel space? */
10926- if (error_code & 0x0c)
10927- return 0;
10928-
10929- pgd = init_mm.pgd + pgd_index(address);
10930- if (!pgd_present(*pgd))
10931- return 0;
10932-
10933- pud = pud_offset(pgd, address);
10934- if (!pud_present(*pud))
10935- return 0;
10936-
10937- pmd = pmd_offset(pud, address);
10938- if (!pmd_present(*pmd))
10939- return 0;
10940-
10941- pte = pte_offset_kernel(pmd, address);
10942- if (!pte_present(*pte))
10943- return 0;
10944- if ((error_code & 0x02) && !pte_write(*pte))
10945- return 0;
10946-#ifdef CONFIG_X86_PAE
10947- if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
10948- return 0;
10949-#endif
10950-
10951- return 1;
10952-}
10953-
10954-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
10955-{
10956- unsigned index = pgd_index(address);
10957- pgd_t *pgd_k;
10958- pud_t *pud, *pud_k;
10959- pmd_t *pmd, *pmd_k;
10960-
10961- pgd += index;
10962- pgd_k = init_mm.pgd + index;
10963-
10964- if (!pgd_present(*pgd_k))
10965- return NULL;
10966-
10967- /*
10968- * set_pgd(pgd, *pgd_k); here would be useless on PAE
10969- * and redundant with the set_pmd() on non-PAE. As would
10970- * set_pud.
10971- */
10972-
10973- pud = pud_offset(pgd, address);
10974- pud_k = pud_offset(pgd_k, address);
10975- if (!pud_present(*pud_k))
10976- return NULL;
10977-
10978- pmd = pmd_offset(pud, address);
10979- pmd_k = pmd_offset(pud_k, address);
10980- if (!pmd_present(*pmd_k))
10981- return NULL;
10982- if (!pmd_present(*pmd)) {
10983- bool lazy = x86_read_percpu(xen_lazy_mmu);
10984-
10985- x86_write_percpu(xen_lazy_mmu, false);
10986-#if CONFIG_XEN_COMPAT > 0x030002
10987- set_pmd(pmd, *pmd_k);
10988-#else
10989- /*
10990- * When running on older Xen we must launder *pmd_k through
10991- * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
10992- */
10993- set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
10994-#endif
10995- x86_write_percpu(xen_lazy_mmu, lazy);
10996- } else
10997- BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
10998- return pmd_k;
10999-}
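
Editor's note (not part of the patch): vmalloc_sync_one() copies a single missing kernel PMD entry from the reference page table (init_mm.pgd) into the faulting task's page directory; nothing is propagated eagerly. The toy user-space model below illustrates that copy-on-fault idea with plain arrays standing in for page tables; every name in it is invented for the example.

/* Toy model of lazy kernel-mapping propagation: a per-process table
 * picks up missing entries from a shared reference table only when a
 * lookup ("fault") hits a hole.  Purely illustrative. */
#include <stdio.h>

#define ENTRIES 8

static int reference[ENTRIES] = { 0, 0, 0, 0, 11, 12, 13, 14 };
static int process_tbl[ENTRIES];	/* starts empty */

static int lookup(int idx)
{
	if (process_tbl[idx] == 0 && reference[idx] != 0) {
		/* "vmalloc fault": sync the one missing entry and retry */
		process_tbl[idx] = reference[idx];
		printf("synced entry %d from the reference table\n", idx);
	}
	return process_tbl[idx];
}

int main(void)
{
	printf("lookup(5) -> %d\n", lookup(5));
	printf("lookup(5) -> %d (already synced)\n", lookup(5));
	return 0;
}
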
11000-
11001-/*
11002- * Handle a fault on the vmalloc or module mapping area
11003- *
11004- * This assumes no large pages in there.
11005- */
11006-static inline int vmalloc_fault(unsigned long address)
11007-{
11008- unsigned long pgd_paddr;
11009- pmd_t *pmd_k;
11010- pte_t *pte_k;
11011- /*
11012- * Synchronize this task's top level page-table
11013- * with the 'reference' page table.
11014- *
11015- * Do _not_ use "current" here. We might be inside
11016- * an interrupt in the middle of a task switch..
11017- */
11018- pgd_paddr = read_cr3();
11019- pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
11020- if (!pmd_k)
11021- return -1;
11022- pte_k = pte_offset_kernel(pmd_k, address);
11023- if (!pte_present(*pte_k))
11024- return -1;
11025- return 0;
11026-}
11027-
11028-int show_unhandled_signals = 1;
11029-
11030-/*
11031- * This routine handles page faults. It determines the address,
11032- * and the problem, and then passes it off to one of the appropriate
11033- * routines.
11034- *
11035- * error_code:
11036- * bit 0 == 0 means no page found, 1 means protection fault
11037- * bit 1 == 0 means read, 1 means write
11038- * bit 2 == 0 means kernel, 1 means user-mode
11039- * bit 3 == 1 means use of reserved bit detected
11040- * bit 4 == 1 means fault was an instruction fetch
11041- */
11042-fastcall void __kprobes do_page_fault(struct pt_regs *regs,
11043- unsigned long error_code)
11044-{
11045- struct task_struct *tsk;
11046- struct mm_struct *mm;
11047- struct vm_area_struct * vma;
11048- unsigned long address;
11049- int write, si_code;
11050- int fault;
11051-
11052- /*
11053- * We can fault from pretty much anywhere, with unknown IRQ state.
11054- */
11055- trace_hardirqs_fixup();
11056-
11057- /* get the address */
11058- address = read_cr2();
11059-
11060- /* Set the "privileged fault" bit to something sane. */
11061- error_code &= ~4;
11062- error_code |= (regs->xcs & 2) << 1;
11063- if (regs->eflags & X86_EFLAGS_VM)
11064- error_code |= 4;
11065-
11066- tsk = current;
11067-
11068- si_code = SEGV_MAPERR;
11069-
11070- /*
11071- * We fault-in kernel-space virtual memory on-demand. The
11072- * 'reference' page table is init_mm.pgd.
11073- *
11074- * NOTE! We MUST NOT take any locks for this case. We may
11075- * be in an interrupt or a critical region, and should
11076- * only copy the information from the master page table,
11077- * nothing more.
11078- *
11079- * This verifies that the fault happens in kernel space
11080- * (error_code & 4) == 0, and that the fault was not a
11081- * protection error (error_code & 9) == 0.
11082- */
11083- if (unlikely(address >= TASK_SIZE)) {
11084-#ifdef CONFIG_XEN
11085- /* Faults in hypervisor area can never be patched up. */
11086- if (address >= hypervisor_virt_start)
11087- goto bad_area_nosemaphore;
11088-#endif
11089- if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
11090- return;
11091- /* Can take a spurious fault if mapping changes R/O -> R/W. */
11092- if (spurious_fault(regs, address, error_code))
11093- return;
11094- if (notify_page_fault(regs))
11095- return;
11096- /*
11097- * Don't take the mm semaphore here. If we fixup a prefetch
11098- * fault we could otherwise deadlock.
11099- */
11100- goto bad_area_nosemaphore;
11101- }
11102-
11103- if (notify_page_fault(regs))
11104- return;
11105-
11106- /* It's safe to allow irq's after cr2 has been saved and the vmalloc
11107- fault has been handled. */
11108- if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
11109- local_irq_enable();
11110-
11111- mm = tsk->mm;
11112-
11113- /*
11114- * If we're in an interrupt, have no user context or are running in an
11115- * atomic region then we must not take the fault..
11116- */
11117- if (in_atomic() || !mm)
11118- goto bad_area_nosemaphore;
11119-
11120- /* When running in the kernel we expect faults to occur only to
11121- * addresses in user space. All other faults represent errors in the
11122- * kernel and should generate an OOPS. Unfortunately, in the case of an
11123- * erroneous fault occurring in a code path which already holds mmap_sem
11124- * we will deadlock attempting to validate the fault against the
11125- * address space. Luckily the kernel only validly references user
11126- * space from well defined areas of code, which are listed in the
11127- * exceptions table.
11128- *
11129- * As the vast majority of faults will be valid we will only perform
11130- * the source reference check when there is a possibility of a deadlock.
11131- * Attempt to lock the address space, if we cannot we then validate the
11132- * source. If this is invalid we can skip the address space check,
11133- * thus avoiding the deadlock.
11134- */
11135- if (!down_read_trylock(&mm->mmap_sem)) {
11136- if ((error_code & 4) == 0 &&
11137- !search_exception_tables(regs->eip))
11138- goto bad_area_nosemaphore;
11139- down_read(&mm->mmap_sem);
11140- }
11141-
11142- vma = find_vma(mm, address);
11143- if (!vma)
11144- goto bad_area;
11145- if (vma->vm_start <= address)
11146- goto good_area;
11147- if (!(vma->vm_flags & VM_GROWSDOWN))
11148- goto bad_area;
11149- if (error_code & 4) {
11150- /*
11151- * Accessing the stack below %esp is always a bug.
11152- * The large cushion allows instructions like enter
11153- * and pusha to work. ("enter $65535,$31" pushes
11154- * 32 pointers and then decrements %esp by 65535.)
11155- */
11156- if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
11157- goto bad_area;
11158- }
11159- if (expand_stack(vma, address))
11160- goto bad_area;
11161-/*
11162- * Ok, we have a good vm_area for this memory access, so
11163- * we can handle it..
11164- */
11165-good_area:
11166- si_code = SEGV_ACCERR;
11167- write = 0;
11168- switch (error_code & 3) {
11169- default: /* 3: write, present */
11170- /* fall through */
11171- case 2: /* write, not present */
11172- if (!(vma->vm_flags & VM_WRITE))
11173- goto bad_area;
11174- write++;
11175- break;
11176- case 1: /* read, present */
11177- goto bad_area;
11178- case 0: /* read, not present */
11179- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
11180- goto bad_area;
11181- }
11182-
11183- survive:
11184- /*
11185- * If for any reason at all we couldn't handle the fault,
11186- * make sure we exit gracefully rather than endlessly redo
11187- * the fault.
11188- */
11189- fault = handle_mm_fault(mm, vma, address, write);
11190- if (unlikely(fault & VM_FAULT_ERROR)) {
11191- if (fault & VM_FAULT_OOM)
11192- goto out_of_memory;
11193- else if (fault & VM_FAULT_SIGBUS)
11194- goto do_sigbus;
11195- BUG();
11196- }
11197- if (fault & VM_FAULT_MAJOR)
11198- tsk->maj_flt++;
11199- else
11200- tsk->min_flt++;
11201-
11202- /*
11203- * Did it hit the DOS screen memory VA from vm86 mode?
11204- */
11205- if (regs->eflags & VM_MASK) {
11206- unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
11207- if (bit < 32)
11208- tsk->thread.screen_bitmap |= 1 << bit;
11209- }
11210- up_read(&mm->mmap_sem);
11211- return;
11212-
11213-/*
11214- * Something tried to access memory that isn't in our memory map..
11215- * Fix it, but check if it's kernel or user first..
11216- */
11217-bad_area:
11218- up_read(&mm->mmap_sem);
11219-
11220-bad_area_nosemaphore:
11221- /* User mode accesses just cause a SIGSEGV */
11222- if (error_code & 4) {
11223- /*
11224- * It's possible to have interrupts off here.
11225- */
11226- local_irq_enable();
11227-
11228- /*
11229- * Valid to do another page fault here because this one came
11230- * from user space.
11231- */
11232- if (is_prefetch(regs, address, error_code))
11233- return;
11234-
11235- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
11236- printk_ratelimit()) {
11237- printk("%s%s[%d]: segfault at %08lx eip %08lx "
11238- "esp %08lx error %lx\n",
11239- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
11240- tsk->comm, task_pid_nr(tsk), address, regs->eip,
11241- regs->esp, error_code);
11242- }
11243- tsk->thread.cr2 = address;
11244- /* Kernel addresses are always protection faults */
11245- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
11246- tsk->thread.trap_no = 14;
11247- force_sig_info_fault(SIGSEGV, si_code, address, tsk);
11248- return;
11249- }
11250-
11251-#ifdef CONFIG_X86_F00F_BUG
11252- /*
11253- * Pentium F0 0F C7 C8 bug workaround.
11254- */
11255- if (boot_cpu_data.f00f_bug) {
11256- unsigned long nr;
11257-
11258- nr = (address - idt_descr.address) >> 3;
11259-
11260- if (nr == 6) {
11261- do_invalid_op(regs, 0);
11262- return;
11263- }
11264- }
11265-#endif
11266-
11267-no_context:
11268- /* Are we prepared to handle this kernel fault? */
11269- if (fixup_exception(regs))
11270- return;
11271-
11272- /*
11273- * Valid to do another page fault here, because if this fault
11274- * had been triggered by is_prefetch fixup_exception would have
11275- * handled it.
11276- */
11277- if (is_prefetch(regs, address, error_code))
11278- return;
11279-
11280-/*
11281- * Oops. The kernel tried to access some bad page. We'll have to
11282- * terminate things with extreme prejudice.
11283- */
11284-
11285- bust_spinlocks(1);
11286-
11287- if (oops_may_print()) {
11288-#ifdef CONFIG_X86_PAE
11289- if (error_code & 16) {
11290- pte_t *pte = lookup_address(address);
11291-
11292- if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
11293- printk(KERN_CRIT "kernel tried to execute "
11294- "NX-protected page - exploit attempt? "
11295- "(uid: %d)\n", current->uid);
11296- }
11297-#endif
11298- if (address < PAGE_SIZE)
11299- printk(KERN_ALERT "BUG: unable to handle kernel NULL "
11300- "pointer dereference");
11301- else
11302- printk(KERN_ALERT "BUG: unable to handle kernel paging"
11303- " request");
11304- printk(" at virtual address %08lx\n",address);
11305- printk(KERN_ALERT "printing eip: %08lx\n", regs->eip);
11306- dump_fault_path(address);
11307- }
11308- tsk->thread.cr2 = address;
11309- tsk->thread.trap_no = 14;
11310- tsk->thread.error_code = error_code;
11311- die("Oops", regs, error_code);
11312- bust_spinlocks(0);
11313- do_exit(SIGKILL);
11314-
11315-/*
11316- * We ran out of memory, or some other thing happened to us that made
11317- * us unable to handle the page fault gracefully.
11318- */
11319-out_of_memory:
11320- up_read(&mm->mmap_sem);
11321- if (is_global_init(tsk)) {
11322- yield();
11323- down_read(&mm->mmap_sem);
11324- goto survive;
11325- }
11326- printk("VM: killing process %s\n", tsk->comm);
11327- if (error_code & 4)
11328- do_group_exit(SIGKILL);
11329- goto no_context;
11330-
11331-do_sigbus:
11332- up_read(&mm->mmap_sem);
11333-
11334- /* Kernel mode? Handle exceptions or die */
11335- if (!(error_code & 4))
11336- goto no_context;
11337-
11338- /* User space => ok to do another page fault */
11339- if (is_prefetch(regs, address, error_code))
11340- return;
11341-
11342- tsk->thread.cr2 = address;
11343- tsk->thread.error_code = error_code;
11344- tsk->thread.trap_no = 14;
11345- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
11346-}
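
Editor's note (not part of the patch): the stack-growth cushion checked above (see the "enter $65535,$31" comment) permits an access below %esp only by up to 65536 + 32*sizeof(long) bytes. The sketch below just evaluates that same inequality in user space with made-up register values.

/* The stack-growth heuristic used by do_page_fault() above: an access
 * may fall below %esp by at most 65536 + 32*sizeof(long) bytes, which
 * is what "enter $65535,$31" can legitimately touch.  Sample values. */
#include <stdio.h>

int main(void)
{
	unsigned long esp = 0xbfff0000UL;	/* hypothetical user %esp */
	unsigned long cushion = 65536 + 32 * sizeof(unsigned long);

	unsigned long ok_access = esp - 100;		/* just below %esp */
	unsigned long bad_access = esp - 0x20000;	/* far below %esp */

	printf("cushion = %lu bytes\n", cushion);
	printf("access %#lx allowed: %d\n", ok_access,
	       ok_access + cushion >= esp);
	printf("access %#lx allowed: %d\n", bad_access,
	       bad_access + cushion >= esp);
	return 0;
}
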
11347-
11348-void vmalloc_sync_all(void)
11349-{
11350- /*
11351- * Note that races in the updates of insync and start aren't
11352- * problematic: insync can only get set bits added, and updates to
11353- * start are only improving performance (without affecting correctness
11354- * if undone).
11355- * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
11356- * This change works just fine with 2-level paging too.
11357- */
11358-#define sync_index(a) ((a) >> PMD_SHIFT)
11359- static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
11360- static unsigned long start = TASK_SIZE;
11361- unsigned long address;
11362-
11363- if (SHARED_KERNEL_PMD)
11364- return;
11365-
11366- BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
11367- for (address = start;
11368- address >= TASK_SIZE && address < hypervisor_virt_start;
11369- address += 1UL << PMD_SHIFT) {
11370- if (!test_bit(sync_index(address), insync)) {
11371- unsigned long flags;
11372- struct page *page;
11373-
11374- spin_lock_irqsave(&pgd_lock, flags);
11375- /* XEN: failure path assumes non-empty pgd_list. */
11376- if (unlikely(!pgd_list)) {
11377- spin_unlock_irqrestore(&pgd_lock, flags);
11378- return;
11379- }
11380- for (page = pgd_list; page; page =
11381- (struct page *)page->index)
11382- if (!vmalloc_sync_one(page_address(page),
11383- address)) {
11384- BUG_ON(page != pgd_list);
11385- break;
11386- }
11387- spin_unlock_irqrestore(&pgd_lock, flags);
11388- if (!page)
11389- set_bit(sync_index(address), insync);
11390- }
11391- if (address == start && test_bit(sync_index(address), insync))
11392- start = address + (1UL << PMD_SHIFT);
11393- }
11394-}
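
Editor's note (not part of the patch): the insync bitmap plus the moving "start" hint in vmalloc_sync_all() is a small bookkeeping idiom: ranges synced once are marked in a bitmap so later calls skip them, and "start" is advanced past a fully-synced prefix so the loop does not revisit it at all. A generic user-space sketch of the same bookkeeping, with the per-range work stubbed out and all names invented:

/* Generic version of the insync-bitmap bookkeeping used above: mark
 * ranges as done and advance a start hint past the leading run of
 * completed ranges.  The per-range work is a stub. */
#include <stdio.h>
#include <stdbool.h>

#define NR_RANGES 16

static bool insync[NR_RANGES];
static unsigned start;

static bool sync_range(unsigned idx)
{
	(void)idx;
	/* stand-in for the real per-PMD sync; pretend it always succeeds */
	return true;
}

static void sync_all(void)
{
	unsigned idx;

	for (idx = start; idx < NR_RANGES; idx++) {
		if (!insync[idx] && sync_range(idx))
			insync[idx] = true;
		if (idx == start && insync[idx])
			start = idx + 1;	/* skip this prefix next time */
	}
}

int main(void)
{
	sync_all();
	printf("next call starts at range %u of %u\n", start, NR_RANGES);
	return 0;
}
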
11395--- a/arch/x86/mm/fault_64-xen.c
11396+++ /dev/null
11397@@ -1,686 +0,0 @@
11398-/*
11399- * linux/arch/x86-64/mm/fault.c
11400- *
11401- * Copyright (C) 1995 Linus Torvalds
11402- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
11403- */
11404-
11405-#include <linux/signal.h>
11406-#include <linux/sched.h>
11407-#include <linux/kernel.h>
11408-#include <linux/errno.h>
11409-#include <linux/string.h>
11410-#include <linux/types.h>
11411-#include <linux/ptrace.h>
11412-#include <linux/mman.h>
11413-#include <linux/mm.h>
11414-#include <linux/smp.h>
11415-#include <linux/interrupt.h>
11416-#include <linux/init.h>
11417-#include <linux/tty.h>
11418-#include <linux/vt_kern.h> /* For unblank_screen() */
11419-#include <linux/compiler.h>
11420-#include <linux/vmalloc.h>
11421-#include <linux/module.h>
11422-#include <linux/kprobes.h>
11423-#include <linux/uaccess.h>
11424-#include <linux/kdebug.h>
11425-#include <linux/kprobes.h>
11426-
11427-#include <asm/system.h>
11428-#include <asm/pgalloc.h>
11429-#include <asm/smp.h>
11430-#include <asm/tlbflush.h>
11431-#include <asm/proto.h>
11432-#include <asm-generic/sections.h>
11433-
11434-/* Page fault error code bits */
11435-#define PF_PROT (1<<0) /* or no page found */
11436-#define PF_WRITE (1<<1)
11437-#define PF_USER (1<<2)
11438-#define PF_RSVD (1<<3)
11439-#define PF_INSTR (1<<4)
11440-
11441-#ifdef CONFIG_KPROBES
11442-static inline int notify_page_fault(struct pt_regs *regs)
11443-{
11444- int ret = 0;
11445-
11446- /* kprobe_running() needs smp_processor_id() */
11447- if (!user_mode(regs)) {
11448- preempt_disable();
11449- if (kprobe_running() && kprobe_fault_handler(regs, 14))
11450- ret = 1;
11451- preempt_enable();
11452- }
11453-
11454- return ret;
11455-}
11456-#else
11457-static inline int notify_page_fault(struct pt_regs *regs)
11458-{
11459- return 0;
11460-}
11461-#endif
11462-
11463-/* Sometimes the CPU reports invalid exceptions on prefetch.
11464-   Check that here and ignore it.
11465- Opcode checker based on code by Richard Brunner */
11466-static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
11467- unsigned long error_code)
11468-{
11469- unsigned char *instr;
11470- int scan_more = 1;
11471- int prefetch = 0;
11472- unsigned char *max_instr;
11473-
11474-	/* If it was an exec fault, ignore it */
11475- if (error_code & PF_INSTR)
11476- return 0;
11477-
11478- instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
11479- max_instr = instr + 15;
11480-
11481- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
11482- return 0;
11483-
11484- while (scan_more && instr < max_instr) {
11485- unsigned char opcode;
11486- unsigned char instr_hi;
11487- unsigned char instr_lo;
11488-
11489- if (probe_kernel_address(instr, opcode))
11490- break;
11491-
11492- instr_hi = opcode & 0xf0;
11493- instr_lo = opcode & 0x0f;
11494- instr++;
11495-
11496- switch (instr_hi) {
11497- case 0x20:
11498- case 0x30:
11499- /* Values 0x26,0x2E,0x36,0x3E are valid x86
11500- prefixes. In long mode, the CPU will signal
11501- invalid opcode if some of these prefixes are
11502- present so we will never get here anyway */
11503- scan_more = ((instr_lo & 7) == 0x6);
11504- break;
11505-
11506- case 0x40:
11507-		/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
11508- Need to figure out under what instruction mode the
11509- instruction was issued ... */
11510- /* Could check the LDT for lm, but for now it's good
11511- enough to assume that long mode only uses well known
11512- segments or kernel. */
11513- scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
11514- break;
11515-
11516- case 0x60:
11517- /* 0x64 thru 0x67 are valid prefixes in all modes. */
11518- scan_more = (instr_lo & 0xC) == 0x4;
11519- break;
11520- case 0xF0:
11521- /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
11522- scan_more = !instr_lo || (instr_lo>>1) == 1;
11523- break;
11524- case 0x00:
11525- /* Prefetch instruction is 0x0F0D or 0x0F18 */
11526- scan_more = 0;
11527- if (probe_kernel_address(instr, opcode))
11528- break;
11529- prefetch = (instr_lo == 0xF) &&
11530- (opcode == 0x0D || opcode == 0x18);
11531- break;
11532- default:
11533- scan_more = 0;
11534- break;
11535- }
11536- }
11537- return prefetch;
11538-}
11539-
11540-static int bad_address(void *p)
11541-{
11542- unsigned long dummy;
11543- return probe_kernel_address((unsigned long *)p, dummy);
11544-}
11545-
11546-void dump_pagetable(unsigned long address)
11547-{
11548- pgd_t *pgd;
11549- pud_t *pud;
11550- pmd_t *pmd;
11551- pte_t *pte;
11552-
11553- pgd = (pgd_t *)read_cr3();
11554-
11555- pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
11556- pgd += pgd_index(address);
11557- if (bad_address(pgd)) goto bad;
11558- printk("PGD %lx ", pgd_val(*pgd));
11559- if (!pgd_present(*pgd)) goto ret;
11560-
11561- pud = pud_offset(pgd, address);
11562- if (bad_address(pud)) goto bad;
11563- printk("PUD %lx ", pud_val(*pud));
11564- if (!pud_present(*pud)) goto ret;
11565-
11566- pmd = pmd_offset(pud, address);
11567- if (bad_address(pmd)) goto bad;
11568- printk("PMD %lx ", pmd_val(*pmd));
11569- if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
11570-
11571- pte = pte_offset_kernel(pmd, address);
11572- if (bad_address(pte)) goto bad;
11573- printk("PTE %lx", pte_val(*pte));
11574-ret:
11575- printk("\n");
11576- return;
11577-bad:
11578- printk("BAD\n");
11579-}
11580-
11581-static const char errata93_warning[] =
11582-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
11583-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
11584-KERN_ERR "******* Please consider a BIOS update.\n"
11585-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
11586-
11587-/* Workaround for K8 erratum #93 & buggy BIOS.
11588- BIOS SMM functions are required to use a specific workaround
11589- to avoid corruption of the 64bit RIP register on C stepping K8.
11590- A lot of BIOS that didn't get tested properly miss this.
11591- The OS sees this as a page fault with the upper 32bits of RIP cleared.
11592- Try to work around it here.
11593- Note we only handle faults in kernel here. */
11594-
11595-static int is_errata93(struct pt_regs *regs, unsigned long address)
11596-{
11597- static int warned;
11598- if (address != regs->rip)
11599- return 0;
11600- if ((address >> 32) != 0)
11601- return 0;
11602- address |= 0xffffffffUL << 32;
11603- if ((address >= (u64)_stext && address <= (u64)_etext) ||
11604- (address >= MODULES_VADDR && address <= MODULES_END)) {
11605- if (!warned) {
11606- printk(errata93_warning);
11607- warned = 1;
11608- }
11609- regs->rip = address;
11610- return 1;
11611- }
11612- return 0;
11613-}
11614-
11615-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
11616- unsigned long error_code)
11617-{
11618- unsigned long flags = oops_begin();
11619- struct task_struct *tsk;
11620-
11621- printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
11622- current->comm, address);
11623- dump_pagetable(address);
11624- tsk = current;
11625- tsk->thread.cr2 = address;
11626- tsk->thread.trap_no = 14;
11627- tsk->thread.error_code = error_code;
11628- __die("Bad pagetable", regs, error_code);
11629- oops_end(flags);
11630- do_exit(SIGKILL);
11631-}
11632-
11633-/*
11634- * Handle a fault on the vmalloc area
11635- *
11636- * This assumes no large pages in there.
11637- */
11638-static int vmalloc_fault(unsigned long address)
11639-{
11640- pgd_t *pgd, *pgd_ref;
11641- pud_t *pud, *pud_ref;
11642- pmd_t *pmd, *pmd_ref;
11643- pte_t *pte, *pte_ref;
11644-
11645- /* Copy kernel mappings over when needed. This can also
11646-	   happen within a race in page table update. In the latter
11647- case just flush. */
11648-
11649- /* On Xen the line below does not always work. Needs investigating! */
11650- /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
11651- pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
11652- pgd += pgd_index(address);
11653- pgd_ref = pgd_offset_k(address);
11654- if (pgd_none(*pgd_ref))
11655- return -1;
11656- if (pgd_none(*pgd))
11657- set_pgd(pgd, *pgd_ref);
11658- else
11659- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
11660-
11661- /* Below here mismatches are bugs because these lower tables
11662- are shared */
11663-
11664- pud = pud_offset(pgd, address);
11665- pud_ref = pud_offset(pgd_ref, address);
11666- if (pud_none(*pud_ref))
11667- return -1;
11668- if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
11669- BUG();
11670- pmd = pmd_offset(pud, address);
11671- pmd_ref = pmd_offset(pud_ref, address);
11672- if (pmd_none(*pmd_ref))
11673- return -1;
11674- if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
11675- BUG();
11676- pte_ref = pte_offset_kernel(pmd_ref, address);
11677- if (!pte_present(*pte_ref))
11678- return -1;
11679- pte = pte_offset_kernel(pmd, address);
11680- /* Don't use pte_page here, because the mappings can point
11681- outside mem_map, and the NUMA hash lookup cannot handle
11682- that. */
11683- if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
11684- BUG();
11685- return 0;
11686-}
11687-
11688-int show_unhandled_signals = 1;
11689-
11690-
11691-#define MEM_VERBOSE 1
11692-
11693-#ifdef MEM_VERBOSE
11694-#define MEM_LOG(_f, _a...) \
11695- printk("fault.c:[%d]-> " _f "\n", \
11696- __LINE__ , ## _a )
11697-#else
11698-#define MEM_LOG(_f, _a...) ((void)0)
11699-#endif
11700-
11701-static int spurious_fault(struct pt_regs *regs,
11702- unsigned long address,
11703- unsigned long error_code)
11704-{
11705- pgd_t *pgd;
11706- pud_t *pud;
11707- pmd_t *pmd;
11708- pte_t *pte;
11709-
11710-#ifdef CONFIG_XEN
11711- /* Faults in hypervisor area are never spurious. */
11712- if ((address >= HYPERVISOR_VIRT_START) &&
11713- (address < HYPERVISOR_VIRT_END))
11714- return 0;
11715-#endif
11716-
11717- /* Reserved-bit violation or user access to kernel space? */
11718- if (error_code & (PF_RSVD|PF_USER))
11719- return 0;
11720-
11721- pgd = init_mm.pgd + pgd_index(address);
11722- if (!pgd_present(*pgd))
11723- return 0;
11724-
11725- pud = pud_offset(pgd, address);
11726- if (!pud_present(*pud))
11727- return 0;
11728-
11729- pmd = pmd_offset(pud, address);
11730- if (!pmd_present(*pmd))
11731- return 0;
11732-
11733- pte = pte_offset_kernel(pmd, address);
11734- if (!pte_present(*pte))
11735- return 0;
11736- if ((error_code & PF_WRITE) && !pte_write(*pte))
11737- return 0;
11738- if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
11739- return 0;
11740-
11741- return 1;
11742-}
11743-
11744-/*
11745- * This routine handles page faults. It determines the address,
11746- * and the problem, and then passes it off to one of the appropriate
11747- * routines.
11748- */
11749-asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
11750- unsigned long error_code)
11751-{
11752- struct task_struct *tsk;
11753- struct mm_struct *mm;
11754- struct vm_area_struct * vma;
11755- unsigned long address;
11756- const struct exception_table_entry *fixup;
11757- int write, fault;
11758- unsigned long flags;
11759- siginfo_t info;
11760-
11761- if (!user_mode(regs))
11762- error_code &= ~PF_USER; /* means kernel */
11763-
11764- /*
11765- * We can fault from pretty much anywhere, with unknown IRQ state.
11766- */
11767- trace_hardirqs_fixup();
11768-
11769- tsk = current;
11770- mm = tsk->mm;
11771- prefetchw(&mm->mmap_sem);
11772-
11773- /* get the address */
11774- address = read_cr2();
11775-
11776- info.si_code = SEGV_MAPERR;
11777-
11778-
11779- /*
11780- * We fault-in kernel-space virtual memory on-demand. The
11781- * 'reference' page table is init_mm.pgd.
11782- *
11783- * NOTE! We MUST NOT take any locks for this case. We may
11784- * be in an interrupt or a critical region, and should
11785- * only copy the information from the master page table,
11786- * nothing more.
11787- *
11788- * This verifies that the fault happens in kernel space
11789- * (error_code & 4) == 0, and that the fault was not a
11790- * protection error (error_code & 9) == 0.
11791- */
11792- if (unlikely(address >= TASK_SIZE64)) {
11793- /*
11794- * Don't check for the module range here: its PML4
11795- * is always initialized because it's shared with the main
11796- * kernel text. Only vmalloc may need PML4 syncups.
11797- */
11798- if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
11799- ((address >= VMALLOC_START && address < VMALLOC_END))) {
11800- if (vmalloc_fault(address) >= 0)
11801- return;
11802- }
11803- /* Can take a spurious fault if mapping changes R/O -> R/W. */
11804- if (spurious_fault(regs, address, error_code))
11805- return;
11806- if (notify_page_fault(regs))
11807- return;
11808- /*
11809- * Don't take the mm semaphore here. If we fixup a prefetch
11810- * fault we could otherwise deadlock.
11811- */
11812- goto bad_area_nosemaphore;
11813- }
11814-
11815- if (notify_page_fault(regs))
11816- return;
11817-
11818- if (likely(regs->eflags & X86_EFLAGS_IF))
11819- local_irq_enable();
11820-
11821- if (unlikely(error_code & PF_RSVD))
11822- pgtable_bad(address, regs, error_code);
11823-
11824- /*
11825- * If we're in an interrupt or have no user
11826- * context, we must not take the fault..
11827- */
11828- if (unlikely(in_atomic() || !mm))
11829- goto bad_area_nosemaphore;
11830-
11831- /*
11832- * User-mode registers count as a user access even for any
11833- * potential system fault or CPU buglet.
11834- */
11835- if (user_mode_vm(regs))
11836- error_code |= PF_USER;
11837-
11838- again:
11839- /* When running in the kernel we expect faults to occur only to
11840- * addresses in user space. All other faults represent errors in the
11841- * kernel and should generate an OOPS. Unfortunately, in the case of an
11842- * erroneous fault occurring in a code path which already holds mmap_sem
11843- * we will deadlock attempting to validate the fault against the
11844- * address space. Luckily the kernel only validly references user
11845- * space from well defined areas of code, which are listed in the
11846- * exceptions table.
11847- *
11848- * As the vast majority of faults will be valid we will only perform
11849- * the source reference check when there is a possibility of a deadlock.
11850- * Attempt to lock the address space, if we cannot we then validate the
11851- * source. If this is invalid we can skip the address space check,
11852- * thus avoiding the deadlock.
11853- */
11854- if (!down_read_trylock(&mm->mmap_sem)) {
11855- if ((error_code & PF_USER) == 0 &&
11856- !search_exception_tables(regs->rip))
11857- goto bad_area_nosemaphore;
11858- down_read(&mm->mmap_sem);
11859- }
11860-
11861- vma = find_vma(mm, address);
11862- if (!vma)
11863- goto bad_area;
11864- if (likely(vma->vm_start <= address))
11865- goto good_area;
11866- if (!(vma->vm_flags & VM_GROWSDOWN))
11867- goto bad_area;
11868- if (error_code & 4) {
11869- /* Allow userspace just enough access below the stack pointer
11870- * to let the 'enter' instruction work.
11871- */
11872- if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
11873- goto bad_area;
11874- }
11875- if (expand_stack(vma, address))
11876- goto bad_area;
11877-/*
11878- * Ok, we have a good vm_area for this memory access, so
11879- * we can handle it..
11880- */
11881-good_area:
11882- info.si_code = SEGV_ACCERR;
11883- write = 0;
11884- switch (error_code & (PF_PROT|PF_WRITE)) {
11885- default: /* 3: write, present */
11886- /* fall through */
11887- case PF_WRITE: /* write, not present */
11888- if (!(vma->vm_flags & VM_WRITE))
11889- goto bad_area;
11890- write++;
11891- break;
11892- case PF_PROT: /* read, present */
11893- goto bad_area;
11894- case 0: /* read, not present */
11895- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
11896- goto bad_area;
11897- }
11898-
11899- /*
11900- * If for any reason at all we couldn't handle the fault,
11901- * make sure we exit gracefully rather than endlessly redo
11902- * the fault.
11903- */
11904- fault = handle_mm_fault(mm, vma, address, write);
11905- if (unlikely(fault & VM_FAULT_ERROR)) {
11906- if (fault & VM_FAULT_OOM)
11907- goto out_of_memory;
11908- else if (fault & VM_FAULT_SIGBUS)
11909- goto do_sigbus;
11910- BUG();
11911- }
11912- if (fault & VM_FAULT_MAJOR)
11913- tsk->maj_flt++;
11914- else
11915- tsk->min_flt++;
11916- up_read(&mm->mmap_sem);
11917- return;
11918-
11919-/*
11920- * Something tried to access memory that isn't in our memory map..
11921- * Fix it, but check if it's kernel or user first..
11922- */
11923-bad_area:
11924- up_read(&mm->mmap_sem);
11925-
11926-bad_area_nosemaphore:
11927- /* User mode accesses just cause a SIGSEGV */
11928- if (error_code & PF_USER) {
11929-
11930- /*
11931- * It's possible to have interrupts off here.
11932- */
11933- local_irq_enable();
11934-
11935- if (is_prefetch(regs, address, error_code))
11936- return;
11937-
11938-		/* Work around K8 erratum #100: K8 in compat mode
11939- occasionally jumps to illegal addresses >4GB. We
11940- catch this here in the page fault handler because
11941- these addresses are not reachable. Just detect this
11942- case and return. Any code segment in LDT is
11943- compatibility mode. */
11944- if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
11945- (address >> 32))
11946- return;
11947-
11948- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
11949- printk_ratelimit()) {
11950- printk(
11951- "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
11952- tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
11953- tsk->comm, tsk->pid, address, regs->rip,
11954- regs->rsp, error_code);
11955- }
11956-
11957- tsk->thread.cr2 = address;
11958- /* Kernel addresses are always protection faults */
11959- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
11960- tsk->thread.trap_no = 14;
11961- info.si_signo = SIGSEGV;
11962- info.si_errno = 0;
11963- /* info.si_code has been set above */
11964- info.si_addr = (void __user *)address;
11965- force_sig_info(SIGSEGV, &info, tsk);
11966- return;
11967- }
11968-
11969-no_context:
11970-
11971- /* Are we prepared to handle this kernel fault? */
11972- fixup = search_exception_tables(regs->rip);
11973- if (fixup) {
11974- regs->rip = fixup->fixup;
11975- return;
11976- }
11977-
11978- /*
11979- * Hall of shame of CPU/BIOS bugs.
11980- */
11981-
11982- if (is_prefetch(regs, address, error_code))
11983- return;
11984-
11985- if (is_errata93(regs, address))
11986- return;
11987-
11988-/*
11989- * Oops. The kernel tried to access some bad page. We'll have to
11990- * terminate things with extreme prejudice.
11991- */
11992-
11993- flags = oops_begin();
11994-
11995- if (address < PAGE_SIZE)
11996- printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
11997- else
11998- printk(KERN_ALERT "Unable to handle kernel paging request");
11999- printk(" at %016lx RIP: \n" KERN_ALERT,address);
12000- printk_address(regs->rip);
12001- dump_pagetable(address);
12002- tsk->thread.cr2 = address;
12003- tsk->thread.trap_no = 14;
12004- tsk->thread.error_code = error_code;
12005- __die("Oops", regs, error_code);
12006- /* Executive summary in case the body of the oops scrolled away */
12007- printk(KERN_EMERG "CR2: %016lx\n", address);
12008- oops_end(flags);
12009- do_exit(SIGKILL);
12010-
12011-/*
12012- * We ran out of memory, or some other thing happened to us that made
12013- * us unable to handle the page fault gracefully.
12014- */
12015-out_of_memory:
12016- up_read(&mm->mmap_sem);
12017- if (is_global_init(current)) {
12018- yield();
12019- goto again;
12020- }
12021- printk("VM: killing process %s\n", tsk->comm);
12022- if (error_code & 4)
12023- do_group_exit(SIGKILL);
12024- goto no_context;
12025-
12026-do_sigbus:
12027- up_read(&mm->mmap_sem);
12028-
12029- /* Kernel mode? Handle exceptions or die */
12030- if (!(error_code & PF_USER))
12031- goto no_context;
12032-
12033- tsk->thread.cr2 = address;
12034- tsk->thread.error_code = error_code;
12035- tsk->thread.trap_no = 14;
12036- info.si_signo = SIGBUS;
12037- info.si_errno = 0;
12038- info.si_code = BUS_ADRERR;
12039- info.si_addr = (void __user *)address;
12040- force_sig_info(SIGBUS, &info, tsk);
12041- return;
12042-}
12043-
12044-DEFINE_SPINLOCK(pgd_lock);
12045-LIST_HEAD(pgd_list);
12046-
12047-void vmalloc_sync_all(void)
12048-{
12049- /* Note that races in the updates of insync and start aren't
12050- problematic:
12051- insync can only get set bits added, and updates to start are only
12052- improving performance (without affecting correctness if undone). */
12053- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
12054- static unsigned long start = VMALLOC_START & PGDIR_MASK;
12055- unsigned long address;
12056-
12057- for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
12058- if (!test_bit(pgd_index(address), insync)) {
12059- const pgd_t *pgd_ref = pgd_offset_k(address);
12060- struct page *page;
12061-
12062- if (pgd_none(*pgd_ref))
12063- continue;
12064- spin_lock(&pgd_lock);
12065- list_for_each_entry(page, &pgd_list, lru) {
12066- pgd_t *pgd;
12067- pgd = (pgd_t *)page_address(page) + pgd_index(address);
12068- if (pgd_none(*pgd))
12069- set_pgd(pgd, *pgd_ref);
12070- else
12071- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12072- }
12073- spin_unlock(&pgd_lock);
12074- set_bit(pgd_index(address), insync);
12075- }
12076- if (address == start)
12077- start = address + PGDIR_SIZE;
12078- }
12079- /* Check that there is no need to do the same for the modules area. */
12080- BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
12081- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
12082- (__START_KERNEL & PGDIR_MASK)));
12083-}
12084--- /dev/null
12085+++ b/arch/x86/mm/fault-xen.c
12086@@ -0,0 +1,1026 @@
12087+/*
12088+ * Copyright (C) 1995 Linus Torvalds
12089+ * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
12090+ */
12091+
12092+#include <linux/signal.h>
12093+#include <linux/sched.h>
12094+#include <linux/kernel.h>
12095+#include <linux/errno.h>
12096+#include <linux/string.h>
12097+#include <linux/types.h>
12098+#include <linux/ptrace.h>
12099+#include <linux/mman.h>
12100+#include <linux/mm.h>
12101+#include <linux/smp.h>
12102+#include <linux/interrupt.h>
12103+#include <linux/init.h>
12104+#include <linux/tty.h>
12105+#include <linux/vt_kern.h> /* For unblank_screen() */
12106+#include <linux/compiler.h>
12107+#include <linux/highmem.h>
12108+#include <linux/bootmem.h> /* for max_low_pfn */
12109+#include <linux/vmalloc.h>
12110+#include <linux/module.h>
12111+#include <linux/kprobes.h>
12112+#include <linux/uaccess.h>
12113+#include <linux/kdebug.h>
12114+
12115+#include <asm/system.h>
12116+#include <asm/desc.h>
12117+#include <asm/segment.h>
12118+#include <asm/pgalloc.h>
12119+#include <asm/smp.h>
12120+#include <asm/tlbflush.h>
12121+#include <asm/proto.h>
12122+#include <asm-generic/sections.h>
12123+
12124+/*
12125+ * Page fault error code bits
12126+ * bit 0 == 0 means no page found, 1 means protection fault
12127+ * bit 1 == 0 means read, 1 means write
12128+ * bit 2 == 0 means kernel, 1 means user-mode
12129+ * bit 3 == 1 means use of reserved bit detected
12130+ * bit 4 == 1 means fault was an instruction fetch
12131+ */
12132+#define PF_PROT (1<<0)
12133+#define PF_WRITE (1<<1)
12134+#define PF_USER (1<<2)
12135+#define PF_RSVD (1<<3)
12136+#define PF_INSTR (1<<4)
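
Editor's note (not part of the patch): the PF_* bits above mirror the hardware page-fault error code documented in the comment. A small user-space helper that decodes a raw value into the same terms can make oops output easier to read; the sketch below duplicates the constant definitions so it stands alone.

/* Decode an x86 page-fault error code using the same bit meanings as
 * the PF_* constants defined above. */
#include <stdio.h>

#define PF_PROT		(1 << 0)	/* protection fault vs. not-present */
#define PF_WRITE	(1 << 1)	/* write access vs. read */
#define PF_USER		(1 << 2)	/* user mode vs. kernel mode */
#define PF_RSVD		(1 << 3)	/* reserved bit set in a PTE */
#define PF_INSTR	(1 << 4)	/* instruction fetch */

static void decode(unsigned long error_code)
{
	printf("%#lx: %s %s in %s mode%s%s\n", error_code,
	       error_code & PF_WRITE ? "write" : "read",
	       error_code & PF_PROT ? "protection fault" : "on missing page",
	       error_code & PF_USER ? "user" : "kernel",
	       error_code & PF_RSVD ? ", reserved bit set" : "",
	       error_code & PF_INSTR ? ", instruction fetch" : "");
}

int main(void)
{
	decode(0x6);	/* user-mode write to a not-present page */
	decode(0x11);	/* kernel instruction fetch hitting a protection fault */
	return 0;
}
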
12137+
12138+static inline int notify_page_fault(struct pt_regs *regs)
12139+{
12140+#ifdef CONFIG_KPROBES
12141+ int ret = 0;
12142+
12143+ /* kprobe_running() needs smp_processor_id() */
12144+#ifdef CONFIG_X86_32
12145+ if (!user_mode_vm(regs)) {
12146+#else
12147+ if (!user_mode(regs)) {
12148+#endif
12149+ preempt_disable();
12150+ if (kprobe_running() && kprobe_fault_handler(regs, 14))
12151+ ret = 1;
12152+ preempt_enable();
12153+ }
12154+
12155+ return ret;
12156+#else
12157+ return 0;
12158+#endif
12159+}
12160+
12161+/*
12162+ * X86_32
12163+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
12164+ * Check that here and ignore it.
12165+ *
12166+ * X86_64
12167+ * Sometimes the CPU reports invalid exceptions on prefetch.
12168+ * Check that here and ignore it.
12169+ *
12170+ * Opcode checker based on code by Richard Brunner
12171+ */
12172+static int is_prefetch(struct pt_regs *regs, unsigned long addr,
12173+ unsigned long error_code)
12174+{
12175+ unsigned char *instr;
12176+ int scan_more = 1;
12177+ int prefetch = 0;
12178+ unsigned char *max_instr;
12179+
12180+ /*
12181+	 * If it was an exec (instruction fetch) fault on an NX page, then
12182+ * do not ignore the fault:
12183+ */
12184+ if (error_code & PF_INSTR)
12185+ return 0;
12186+
12187+ instr = (unsigned char *)convert_ip_to_linear(current, regs);
12188+ max_instr = instr + 15;
12189+
12190+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
12191+ return 0;
12192+
12193+ while (scan_more && instr < max_instr) {
12194+ unsigned char opcode;
12195+ unsigned char instr_hi;
12196+ unsigned char instr_lo;
12197+
12198+ if (probe_kernel_address(instr, opcode))
12199+ break;
12200+
12201+ instr_hi = opcode & 0xf0;
12202+ instr_lo = opcode & 0x0f;
12203+ instr++;
12204+
12205+ switch (instr_hi) {
12206+ case 0x20:
12207+ case 0x30:
12208+ /*
12209+ * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
12210+ * In X86_64 long mode, the CPU will signal invalid
12211+ * opcode if some of these prefixes are present so
12212+ * X86_64 will never get here anyway
12213+ */
12214+ scan_more = ((instr_lo & 7) == 0x6);
12215+ break;
12216+#ifdef CONFIG_X86_64
12217+ case 0x40:
12218+ /*
12219+		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
12220+ * Need to figure out under what instruction mode the
12221+ * instruction was issued. Could check the LDT for lm,
12222+ * but for now it's good enough to assume that long
12223+ * mode only uses well known segments or kernel.
12224+ */
12225+ scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
12226+ break;
12227+#endif
12228+ case 0x60:
12229+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
12230+ scan_more = (instr_lo & 0xC) == 0x4;
12231+ break;
12232+ case 0xF0:
12233+ /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
12234+ scan_more = !instr_lo || (instr_lo>>1) == 1;
12235+ break;
12236+ case 0x00:
12237+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
12238+ scan_more = 0;
12239+
12240+ if (probe_kernel_address(instr, opcode))
12241+ break;
12242+ prefetch = (instr_lo == 0xF) &&
12243+ (opcode == 0x0D || opcode == 0x18);
12244+ break;
12245+ default:
12246+ scan_more = 0;
12247+ break;
12248+ }
12249+ }
12250+ return prefetch;
12251+}
12252+
12253+static void force_sig_info_fault(int si_signo, int si_code,
12254+ unsigned long address, struct task_struct *tsk)
12255+{
12256+ siginfo_t info;
12257+
12258+ info.si_signo = si_signo;
12259+ info.si_errno = 0;
12260+ info.si_code = si_code;
12261+ info.si_addr = (void __user *)address;
12262+ force_sig_info(si_signo, &info, tsk);
12263+}
12264+
12265+#ifdef CONFIG_X86_64
12266+static int bad_address(void *p)
12267+{
12268+ unsigned long dummy;
12269+ return probe_kernel_address((unsigned long *)p, dummy);
12270+}
12271+#endif
12272+
12273+static void dump_pagetable(unsigned long address)
12274+{
12275+#ifdef CONFIG_X86_32
12276+ __typeof__(pte_val(__pte(0))) page;
12277+
12278+ page = read_cr3();
12279+ page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
12280+#ifdef CONFIG_X86_PAE
12281+ printk("*pdpt = %016Lx ", page);
12282+ if ((page & _PAGE_PRESENT)
12283+ && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) {
12284+ page = mfn_to_pfn(page >> PAGE_SHIFT);
12285+ page <<= PAGE_SHIFT;
12286+ page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
12287+ & (PTRS_PER_PMD - 1)];
12288+ printk(KERN_CONT "*pde = %016Lx ", page);
12289+ page &= ~_PAGE_NX;
12290+ }
12291+#else
12292+ printk("*pde = %08lx ", page);
12293+#endif
12294+
12295+ /*
12296+ * We must not directly access the pte in the highpte
12297+ * case if the page table is located in highmem.
12298+ * And let's rather not kmap-atomic the pte, just in case
12299+ * it's allocated already.
12300+ */
12301+ if ((page & _PAGE_PRESENT)
12302+ && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
12303+ && !(page & _PAGE_PSE)) {
12304+ page = mfn_to_pfn(page >> PAGE_SHIFT);
12305+ page <<= PAGE_SHIFT;
12306+ page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
12307+ & (PTRS_PER_PTE - 1)];
12308+ printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
12309+ }
12310+
12311+ printk(KERN_CONT "\n");
12312+#else /* CONFIG_X86_64 */
12313+ pgd_t *pgd;
12314+ pud_t *pud;
12315+ pmd_t *pmd;
12316+ pte_t *pte;
12317+
12318+ pgd = (pgd_t *)read_cr3();
12319+
12320+ pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
12321+ pgd += pgd_index(address);
12322+ if (bad_address(pgd)) goto bad;
12323+ printk("PGD %lx ", pgd_val(*pgd));
12324+ if (!pgd_present(*pgd)) goto ret;
12325+
12326+ pud = pud_offset(pgd, address);
12327+ if (bad_address(pud)) goto bad;
12328+ printk(KERN_CONT "PUD %lx ", pud_val(*pud));
12329+ if (!pud_present(*pud) || pud_large(*pud))
12330+ goto ret;
12331+
12332+ pmd = pmd_offset(pud, address);
12333+ if (bad_address(pmd)) goto bad;
12334+ printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
12335+ if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
12336+
12337+ pte = pte_offset_kernel(pmd, address);
12338+ if (bad_address(pte)) goto bad;
12339+ printk(KERN_CONT "PTE %lx", pte_val(*pte));
12340+ret:
12341+ printk(KERN_CONT "\n");
12342+ return;
12343+bad:
12344+ printk("BAD\n");
12345+#endif
12346+}
12347+
12348+#ifdef CONFIG_X86_32
12349+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
12350+{
12351+ unsigned index = pgd_index(address);
12352+ pgd_t *pgd_k;
12353+ pud_t *pud, *pud_k;
12354+ pmd_t *pmd, *pmd_k;
12355+
12356+ pgd += index;
12357+ pgd_k = init_mm.pgd + index;
12358+
12359+ if (!pgd_present(*pgd_k))
12360+ return NULL;
12361+
12362+ /*
12363+ * set_pgd(pgd, *pgd_k); here would be useless on PAE
12364+ * and redundant with the set_pmd() on non-PAE. As would
12365+ * set_pud.
12366+ */
12367+
12368+ pud = pud_offset(pgd, address);
12369+ pud_k = pud_offset(pgd_k, address);
12370+ if (!pud_present(*pud_k))
12371+ return NULL;
12372+
12373+ pmd = pmd_offset(pud, address);
12374+ pmd_k = pmd_offset(pud_k, address);
12375+ if (!pmd_present(*pmd_k))
12376+ return NULL;
12377+ if (!pmd_present(*pmd)) {
12378+ bool lazy = x86_read_percpu(xen_lazy_mmu);
12379+
12380+ x86_write_percpu(xen_lazy_mmu, false);
12381+#if CONFIG_XEN_COMPAT > 0x030002
12382+ set_pmd(pmd, *pmd_k);
12383+#else
12384+ /*
12385+ * When running on older Xen we must launder *pmd_k through
12386+ * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
12387+ */
12388+ set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
12389+#endif
12390+ x86_write_percpu(xen_lazy_mmu, lazy);
12391+ } else
12392+ BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
12393+ return pmd_k;
12394+}
12395+#endif
12396+
12397+#ifdef CONFIG_X86_64
12398+static const char errata93_warning[] =
12399+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
12400+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
12401+KERN_ERR "******* Please consider a BIOS update.\n"
12402+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
12403+#endif
12404+
12405+/* Workaround for K8 erratum #93 & buggy BIOS.
12406+ BIOS SMM functions are required to use a specific workaround
12407+ to avoid corruption of the 64bit RIP register on C stepping K8.
12408+ A lot of BIOS that didn't get tested properly miss this.
12409+ The OS sees this as a page fault with the upper 32bits of RIP cleared.
12410+ Try to work around it here.
12411+ Note we only handle faults in kernel here.
12412+   Does nothing for X86_32.
12413+ */
12414+static int is_errata93(struct pt_regs *regs, unsigned long address)
12415+{
12416+#ifdef CONFIG_X86_64
12417+ static int warned;
12418+ if (address != regs->ip)
12419+ return 0;
12420+ if ((address >> 32) != 0)
12421+ return 0;
12422+ address |= 0xffffffffUL << 32;
12423+ if ((address >= (u64)_stext && address <= (u64)_etext) ||
12424+ (address >= MODULES_VADDR && address <= MODULES_END)) {
12425+ if (!warned) {
12426+ printk(errata93_warning);
12427+ warned = 1;
12428+ }
12429+ regs->ip = address;
12430+ return 1;
12431+ }
12432+#endif
12433+ return 0;
12434+}
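
Editor's note (not part of the patch): the fix-up in is_errata93() is essentially a sign extension. The buggy SMM path clears the upper 32 bits of a kernel RIP, and since kernel text and modules live in the top 2 GiB of the address space (addresses of the form 0xffffffffXXXXXXXX), OR-ing the upper bits back in recovers the original address if it then falls inside a text range. A stand-alone illustration; the _stext/_etext values below are invented for the example.

/* Sign-extend a RIP whose upper 32 bits were clobbered (K8 erratum #93)
 * and check whether the result lands in a plausible kernel text range.
 * The text-range constants here are made up for the example. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t stext = 0xffffffff80200000ULL;	/* example only */
	const uint64_t etext = 0xffffffff80600000ULL;	/* example only */

	uint64_t rip = 0x80345678ULL;		/* upper 32 bits lost */
	uint64_t fixed = rip | (0xffffffffULL << 32);

	if (fixed >= stext && fixed <= etext)
		printf("recovered RIP %#llx\n", (unsigned long long)fixed);
	else
		printf("%#llx is not in kernel text\n",
		       (unsigned long long)fixed);
	return 0;
}
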
12435+
12436+/*
12437+ * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
12438+ * addresses >4GB. We catch this in the page fault handler because these
12439+ * addresses are not reachable. Just detect this case and return. Any code
12440+ * segment in LDT is compatibility mode.
12441+ */
12442+static int is_errata100(struct pt_regs *regs, unsigned long address)
12443+{
12444+#ifdef CONFIG_X86_64
12445+ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
12446+ (address >> 32))
12447+ return 1;
12448+#endif
12449+ return 0;
12450+}
12451+
12452+void do_invalid_op(struct pt_regs *, unsigned long);
12453+
12454+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
12455+{
12456+#ifdef CONFIG_X86_F00F_BUG
12457+ unsigned long nr;
12458+ /*
12459+ * Pentium F0 0F C7 C8 bug workaround.
12460+ */
12461+ if (boot_cpu_data.f00f_bug) {
12462+ nr = (address - idt_descr.address) >> 3;
12463+
12464+ if (nr == 6) {
12465+ do_invalid_op(regs, 0);
12466+ return 1;
12467+ }
12468+ }
12469+#endif
12470+ return 0;
12471+}
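
Editor's note (not part of the patch): the F00F workaround relies on the IDT being aliased through a read-only mapping, so the locked access performed by the buggy "lock cmpxchg8b" encoding raises a page fault whose address falls inside the IDT instead of hanging the CPU. Dividing the offset from the IDT base by the 8-byte gate size recovers the vector, and vector 6 (#UD) is rerouted to do_invalid_op(). A quick check of that arithmetic with an invented IDT base:

/* Vector recovery used by the F00F workaround above: (fault_address -
 * idt_base) divided by the 8-byte gate size gives the vector number.
 * The base address is made up for the example. */
#include <stdio.h>

int main(void)
{
	unsigned long idt_base = 0xc0400000UL;		/* hypothetical */
	unsigned long fault_addr = idt_base + 6 * 8;	/* gate for vector 6 */

	unsigned long nr = (fault_addr - idt_base) >> 3;

	printf("faulting vector = %lu (%s)\n", nr,
	       nr == 6 ? "invalid opcode" : "other");
	return 0;
}
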
12472+
12473+static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
12474+ unsigned long address)
12475+{
12476+#ifdef CONFIG_X86_32
12477+ if (!oops_may_print())
12478+ return;
12479+#endif
12480+
12481+#ifdef CONFIG_X86_PAE
12482+ if (error_code & PF_INSTR) {
12483+ unsigned int level;
12484+ pte_t *pte = lookup_address(address, &level);
12485+
12486+ if (pte && pte_present(*pte) && !pte_exec(*pte))
12487+ printk(KERN_CRIT "kernel tried to execute "
12488+ "NX-protected page - exploit attempt? "
12489+ "(uid: %d)\n", current->uid);
12490+ }
12491+#endif
12492+
12493+ printk(KERN_ALERT "BUG: unable to handle kernel ");
12494+ if (address < PAGE_SIZE)
12495+ printk(KERN_CONT "NULL pointer dereference");
12496+ else
12497+ printk(KERN_CONT "paging request");
12498+#ifdef CONFIG_X86_32
12499+ printk(KERN_CONT " at %08lx\n", address);
12500+#else
12501+ printk(KERN_CONT " at %016lx\n", address);
12502+#endif
12503+ printk(KERN_ALERT "IP:");
12504+ printk_address(regs->ip, 1);
12505+ dump_pagetable(address);
12506+}
12507+
12508+#ifdef CONFIG_X86_64
12509+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
12510+ unsigned long error_code)
12511+{
12512+ unsigned long flags = oops_begin();
12513+ struct task_struct *tsk;
12514+
12515+ printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
12516+ current->comm, address);
12517+ dump_pagetable(address);
12518+ tsk = current;
12519+ tsk->thread.cr2 = address;
12520+ tsk->thread.trap_no = 14;
12521+ tsk->thread.error_code = error_code;
12522+ if (__die("Bad pagetable", regs, error_code))
12523+ regs = NULL;
12524+ oops_end(flags, regs, SIGKILL);
12525+}
12526+#endif
12527+
12528+static int spurious_fault_check(unsigned long error_code, pte_t *pte)
12529+{
12530+ if ((error_code & PF_WRITE) && !pte_write(*pte))
12531+ return 0;
12532+ if ((error_code & PF_INSTR) && !pte_exec(*pte))
12533+ return 0;
12534+
12535+ return 1;
12536+}
12537+
12538+/*
12539+ * Handle a spurious fault caused by a stale TLB entry. This allows
12540+ * us to lazily refresh the TLB when increasing the permissions of a
12541+ * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
12542+ * expensive since that implies doing a full cross-processor TLB
12543+ * flush, even if no stale TLB entries exist on other processors.
12544+ * There are no security implications to leaving a stale TLB when
12545+ * increasing the permissions on a page.
12546+ */
12547+static int spurious_fault(unsigned long address,
12548+ unsigned long error_code)
12549+{
12550+ pgd_t *pgd;
12551+ pud_t *pud;
12552+ pmd_t *pmd;
12553+ pte_t *pte;
12554+
12555+ /* Reserved-bit violation or user access to kernel space? */
12556+ if (error_code & (PF_USER | PF_RSVD))
12557+ return 0;
12558+
12559+ pgd = init_mm.pgd + pgd_index(address);
12560+ if (!pgd_present(*pgd))
12561+ return 0;
12562+
12563+ pud = pud_offset(pgd, address);
12564+ if (!pud_present(*pud))
12565+ return 0;
12566+
12567+ if (pud_large(*pud))
12568+ return spurious_fault_check(error_code, (pte_t *) pud);
12569+
12570+ pmd = pmd_offset(pud, address);
12571+ if (!pmd_present(*pmd))
12572+ return 0;
12573+
12574+ if (pmd_large(*pmd))
12575+ return spurious_fault_check(error_code, (pte_t *) pmd);
12576+
12577+ pte = pte_offset_kernel(pmd, address);
12578+ if (!pte_present(*pte))
12579+ return 0;
12580+
12581+ return spurious_fault_check(error_code, pte);
12582+}
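
Editor's note (not part of the patch): spurious_fault() declares a fault spurious only when the in-memory page-table entry already permits the access, which means the fault must have come from a stale TLB entry left over from a permission upgrade (RO -> RW or NX -> X). The tiny model below restates the spurious_fault_check() logic with flat integers standing in for a PTE; the PTE_* flag values are invented for the sketch and are not real page-table bits.

/* Model of spurious_fault_check(): a fault is spurious only if the
 * in-memory "pte" already grants what the access wanted.  PTE_* values
 * are invented for the sketch and are not real page-table bits. */
#include <stdio.h>

#define PF_WRITE	(1 << 1)
#define PF_INSTR	(1 << 4)

#define PTE_WRITE	(1 << 0)	/* invented flag: page is writable */
#define PTE_EXEC	(1 << 1)	/* invented flag: page is executable */

static int fault_is_spurious(unsigned long error_code, unsigned pte_flags)
{
	if ((error_code & PF_WRITE) && !(pte_flags & PTE_WRITE))
		return 0;	/* write fault, page still read-only */
	if ((error_code & PF_INSTR) && !(pte_flags & PTE_EXEC))
		return 0;	/* fetch fault, page still no-exec */
	return 1;		/* permissions already allow it: stale TLB */
}

int main(void)
{
	/* write fault on a page that was meanwhile made writable */
	printf("spurious: %d\n", fault_is_spurious(PF_WRITE, PTE_WRITE));
	/* write fault on a genuinely read-only page */
	printf("spurious: %d\n", fault_is_spurious(PF_WRITE, 0));
	return 0;
}
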
12583+
12584+/*
12585+ * X86_32
12586+ * Handle a fault on the vmalloc or module mapping area
12587+ *
12588+ * X86_64
12589+ * Handle a fault on the vmalloc area
12590+ *
12591+ * This assumes no large pages in there.
12592+ */
12593+static int vmalloc_fault(unsigned long address)
12594+{
12595+#ifdef CONFIG_X86_32
12596+ unsigned long pgd_paddr;
12597+ pmd_t *pmd_k;
12598+ pte_t *pte_k;
12599+ /*
12600+ * Synchronize this task's top level page-table
12601+ * with the 'reference' page table.
12602+ *
12603+ * Do _not_ use "current" here. We might be inside
12604+ * an interrupt in the middle of a task switch..
12605+ */
12606+ pgd_paddr = read_cr3();
12607+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
12608+ if (!pmd_k)
12609+ return -1;
12610+ pte_k = pte_offset_kernel(pmd_k, address);
12611+ if (!pte_present(*pte_k))
12612+ return -1;
12613+ return 0;
12614+#else
12615+ pgd_t *pgd, *pgd_ref;
12616+ pud_t *pud, *pud_ref;
12617+ pmd_t *pmd, *pmd_ref;
12618+ pte_t *pte, *pte_ref;
12619+
12620+ /* Make sure we are in vmalloc area */
12621+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
12622+ return -1;
12623+
12624+ /* Copy kernel mappings over when needed. This can also
12625+	   happen within a race in page table update. In the latter
12626+ case just flush. */
12627+
12628+ /* On Xen the line below does not always work. Needs investigating! */
12629+ /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
12630+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
12631+ pgd += pgd_index(address);
12632+ pgd_ref = pgd_offset_k(address);
12633+ if (pgd_none(*pgd_ref))
12634+ return -1;
12635+ if (pgd_none(*pgd))
12636+ set_pgd(pgd, *pgd_ref);
12637+ else
12638+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12639+
12640+ /* Below here mismatches are bugs because these lower tables
12641+ are shared */
12642+
12643+ pud = pud_offset(pgd, address);
12644+ pud_ref = pud_offset(pgd_ref, address);
12645+ if (pud_none(*pud_ref))
12646+ return -1;
12647+ if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
12648+ BUG();
12649+ pmd = pmd_offset(pud, address);
12650+ pmd_ref = pmd_offset(pud_ref, address);
12651+ if (pmd_none(*pmd_ref))
12652+ return -1;
12653+ if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
12654+ BUG();
12655+ pte_ref = pte_offset_kernel(pmd_ref, address);
12656+ if (!pte_present(*pte_ref))
12657+ return -1;
12658+ pte = pte_offset_kernel(pmd, address);
12659+ /* Don't use pte_page here, because the mappings can point
12660+ outside mem_map, and the NUMA hash lookup cannot handle
12661+ that. */
12662+ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
12663+ BUG();
12664+ return 0;
12665+#endif
12666+}
12667+
12668+int show_unhandled_signals = 1;
12669+
12670+/*
12671+ * This routine handles page faults. It determines the address,
12672+ * and the problem, and then passes it off to one of the appropriate
12673+ * routines.
12674+ */
12675+#ifdef CONFIG_X86_64
12676+asmlinkage
12677+#endif
12678+void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
12679+{
12680+ struct task_struct *tsk;
12681+ struct mm_struct *mm;
12682+ struct vm_area_struct *vma;
12683+ unsigned long address;
12684+ int write, si_code;
12685+ int fault;
12686+#ifdef CONFIG_X86_64
12687+ unsigned long flags;
12688+#endif
12689+
12690+ /*
12691+ * We can fault from pretty much anywhere, with unknown IRQ state.
12692+ */
12693+ trace_hardirqs_fixup();
12694+
12695+ /* Set the "privileged fault" bit to something sane. */
12696+ if (user_mode_vm(regs))
12697+ error_code |= PF_USER;
12698+ else
12699+ error_code &= ~PF_USER;
12700+
12701+ tsk = current;
12702+ mm = tsk->mm;
12703+ prefetchw(&mm->mmap_sem);
12704+
12705+ /* get the address */
12706+ address = read_cr2();
12707+
12708+ si_code = SEGV_MAPERR;
12709+
12710+ if (notify_page_fault(regs))
12711+ return;
12712+
12713+ /*
12714+ * We fault-in kernel-space virtual memory on-demand. The
12715+ * 'reference' page table is init_mm.pgd.
12716+ *
12717+ * NOTE! We MUST NOT take any locks for this case. We may
12718+ * be in an interrupt or a critical region, and should
12719+ * only copy the information from the master page table,
12720+ * nothing more.
12721+ *
12722+ * This verifies that the fault happens in kernel space
12723+ * (error_code & 4) == 0, and that the fault was not a
12724+ * protection error (error_code & 9) == 0.
12725+ */
12726+#ifdef CONFIG_X86_32
12727+ if (unlikely(address >= TASK_SIZE)) {
12728+#else
12729+ if (unlikely(address >= TASK_SIZE64)) {
12730+#endif
12731+ /* Faults in hypervisor area can never be patched up. */
12732+#if defined(CONFIG_X86_XEN)
12733+ if (address >= hypervisor_virt_start)
12734+ goto bad_area_nosemaphore;
12735+#elif defined(CONFIG_X86_64_XEN)
12736+ /* Faults in hypervisor area are never spurious. */
12737+ if (address >= HYPERVISOR_VIRT_START
12738+ && address < HYPERVISOR_VIRT_END)
12739+ goto bad_area_nosemaphore;
12740+#endif
12741+ if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
12742+ vmalloc_fault(address) >= 0)
12743+ return;
12744+
12745+ /* Can handle a stale RO->RW TLB */
12746+ if (spurious_fault(address, error_code))
12747+ return;
12748+
12749+ /*
12750+ * Don't take the mm semaphore here. If we fixup a prefetch
12751+ * fault we could otherwise deadlock.
12752+ */
12753+ goto bad_area_nosemaphore;
12754+ }
12755+
12756+
12757+#ifdef CONFIG_X86_32
12758+ /* It's safe to allow irq's after cr2 has been saved and the vmalloc
12759+ fault has been handled. */
12760+ if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
12761+ local_irq_enable();
12762+
12763+ /*
12764+ * If we're in an interrupt, have no user context or are running in an
12765+ * atomic region then we must not take the fault.
12766+ */
12767+ if (in_atomic() || !mm)
12768+ goto bad_area_nosemaphore;
12769+#else /* CONFIG_X86_64 */
12770+ if (likely(regs->flags & X86_EFLAGS_IF))
12771+ local_irq_enable();
12772+
12773+ if (unlikely(error_code & PF_RSVD))
12774+ pgtable_bad(address, regs, error_code);
12775+
12776+ /*
12777+ * If we're in an interrupt, have no user context or are running in an
12778+ * atomic region then we must not take the fault.
12779+ */
12780+ if (unlikely(in_atomic() || !mm))
12781+ goto bad_area_nosemaphore;
12782+
12783+ /*
12784+ * User-mode registers count as a user access even for any
12785+ * potential system fault or CPU buglet.
12786+ */
12787+ if (user_mode_vm(regs))
12788+ error_code |= PF_USER;
12789+again:
12790+#endif
12791+ /* When running in the kernel we expect faults to occur only to
12792+ * addresses in user space. All other faults represent errors in the
12793+ * kernel and should generate an OOPS. Unfortunately, in the case of an
12794+ * erroneous fault occurring in a code path which already holds mmap_sem
12795+ * we will deadlock attempting to validate the fault against the
12796+ * address space. Luckily the kernel only validly references user
12797+ * space from well defined areas of code, which are listed in the
12798+ * exceptions table.
12799+ *
12800+ * As the vast majority of faults will be valid we will only perform
12801+ * the source reference check when there is a possibility of a deadlock.
12802+ * Attempt to lock the address space, if we cannot we then validate the
12803+ * source. If this is invalid we can skip the address space check,
12804+ * thus avoiding the deadlock.
12805+ */
12806+ if (!down_read_trylock(&mm->mmap_sem)) {
12807+ if ((error_code & PF_USER) == 0 &&
12808+ !search_exception_tables(regs->ip))
12809+ goto bad_area_nosemaphore;
12810+ down_read(&mm->mmap_sem);
12811+ }
12812+
12813+ vma = find_vma(mm, address);
12814+ if (!vma)
12815+ goto bad_area;
12816+ if (vma->vm_start <= address)
12817+ goto good_area;
12818+ if (!(vma->vm_flags & VM_GROWSDOWN))
12819+ goto bad_area;
12820+ if (error_code & PF_USER) {
12821+ /*
12822+ * Accessing the stack below %sp is always a bug.
12823+ * The large cushion allows instructions like enter
12824+ * and pusha to work. ("enter $65535,$31" pushes
12825+ * 32 pointers and then decrements %sp by 65535.)
12826+ */
12827+ if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
12828+ goto bad_area;
12829+ }
12830+ if (expand_stack(vma, address))
12831+ goto bad_area;
12832+/*
12833+ * Ok, we have a good vm_area for this memory access, so
12834+ * we can handle it..
12835+ */
12836+good_area:
12837+ si_code = SEGV_ACCERR;
12838+ write = 0;
12839+ switch (error_code & (PF_PROT|PF_WRITE)) {
12840+ default: /* 3: write, present */
12841+ /* fall through */
12842+ case PF_WRITE: /* write, not present */
12843+ if (!(vma->vm_flags & VM_WRITE))
12844+ goto bad_area;
12845+ write++;
12846+ break;
12847+ case PF_PROT: /* read, present */
12848+ goto bad_area;
12849+ case 0: /* read, not present */
12850+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
12851+ goto bad_area;
12852+ }
12853+
12854+#ifdef CONFIG_X86_32
12855+survive:
12856+#endif
12857+ /*
12858+ * If for any reason at all we couldn't handle the fault,
12859+ * make sure we exit gracefully rather than endlessly redo
12860+ * the fault.
12861+ */
12862+ fault = handle_mm_fault(mm, vma, address, write);
12863+ if (unlikely(fault & VM_FAULT_ERROR)) {
12864+ if (fault & VM_FAULT_OOM)
12865+ goto out_of_memory;
12866+ else if (fault & VM_FAULT_SIGBUS)
12867+ goto do_sigbus;
12868+ BUG();
12869+ }
12870+ if (fault & VM_FAULT_MAJOR)
12871+ tsk->maj_flt++;
12872+ else
12873+ tsk->min_flt++;
12874+
12875+#ifdef CONFIG_X86_32
12876+ /*
12877+ * Did it hit the DOS screen memory VA from vm86 mode?
12878+ */
12879+ if (v8086_mode(regs)) {
12880+ unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
12881+ if (bit < 32)
12882+ tsk->thread.screen_bitmap |= 1 << bit;
12883+ }
12884+#endif
12885+ up_read(&mm->mmap_sem);
12886+ return;
12887+
12888+/*
12889+ * Something tried to access memory that isn't in our memory map..
12890+ * Fix it, but check if it's kernel or user first..
12891+ */
12892+bad_area:
12893+ up_read(&mm->mmap_sem);
12894+
12895+bad_area_nosemaphore:
12896+ /* User mode accesses just cause a SIGSEGV */
12897+ if (error_code & PF_USER) {
12898+ /*
12899+ * It's possible to have interrupts off here.
12900+ */
12901+ local_irq_enable();
12902+
12903+ /*
12904+ * Valid to do another page fault here because this one came
12905+ * from user space.
12906+ */
12907+ if (is_prefetch(regs, address, error_code))
12908+ return;
12909+
12910+ if (is_errata100(regs, address))
12911+ return;
12912+
12913+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
12914+ printk_ratelimit()) {
12915+ printk(
12916+#ifdef CONFIG_X86_32
12917+ "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
12918+#else
12919+ "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
12920+#endif
12921+ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
12922+ tsk->comm, task_pid_nr(tsk), address, regs->ip,
12923+ regs->sp, error_code);
12924+ print_vma_addr(" in ", regs->ip);
12925+ printk("\n");
12926+ }
12927+
12928+ tsk->thread.cr2 = address;
12929+ /* Kernel addresses are always protection faults */
12930+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
12931+ tsk->thread.trap_no = 14;
12932+ force_sig_info_fault(SIGSEGV, si_code, address, tsk);
12933+ return;
12934+ }
12935+
12936+ if (is_f00f_bug(regs, address))
12937+ return;
12938+
12939+no_context:
12940+ /* Are we prepared to handle this kernel fault? */
12941+ if (fixup_exception(regs))
12942+ return;
12943+
12944+ /*
12945+ * X86_32
12946+ * Valid to do another page fault here, because if this fault
12947+ * had been triggered by is_prefetch fixup_exception would have
12948+ * handled it.
12949+ *
12950+ * X86_64
12951+ * Hall of shame of CPU/BIOS bugs.
12952+ */
12953+ if (is_prefetch(regs, address, error_code))
12954+ return;
12955+
12956+ if (is_errata93(regs, address))
12957+ return;
12958+
12959+/*
12960+ * Oops. The kernel tried to access some bad page. We'll have to
12961+ * terminate things with extreme prejudice.
12962+ */
12963+#ifdef CONFIG_X86_32
12964+ bust_spinlocks(1);
12965+#else
12966+ flags = oops_begin();
12967+#endif
12968+
12969+ show_fault_oops(regs, error_code, address);
12970+
12971+ tsk->thread.cr2 = address;
12972+ tsk->thread.trap_no = 14;
12973+ tsk->thread.error_code = error_code;
12974+
12975+#ifdef CONFIG_X86_32
12976+ die("Oops", regs, error_code);
12977+ bust_spinlocks(0);
12978+ do_exit(SIGKILL);
12979+#else
12980+ if (__die("Oops", regs, error_code))
12981+ regs = NULL;
12982+ /* Executive summary in case the body of the oops scrolled away */
12983+ printk(KERN_EMERG "CR2: %016lx\n", address);
12984+ oops_end(flags, regs, SIGKILL);
12985+#endif
12986+
12987+/*
12988+ * We ran out of memory, or some other thing happened to us that made
12989+ * us unable to handle the page fault gracefully.
12990+ */
12991+out_of_memory:
12992+ up_read(&mm->mmap_sem);
12993+ if (is_global_init(tsk)) {
12994+ yield();
12995+#ifdef CONFIG_X86_32
12996+ down_read(&mm->mmap_sem);
12997+ goto survive;
12998+#else
12999+ goto again;
13000+#endif
13001+ }
13002+
13003+ printk("VM: killing process %s\n", tsk->comm);
13004+ if (error_code & PF_USER)
13005+ do_group_exit(SIGKILL);
13006+ goto no_context;
13007+
13008+do_sigbus:
13009+ up_read(&mm->mmap_sem);
13010+
13011+ /* Kernel mode? Handle exceptions or die */
13012+ if (!(error_code & PF_USER))
13013+ goto no_context;
13014+#ifdef CONFIG_X86_32
13015+ /* User space => ok to do another page fault */
13016+ if (is_prefetch(regs, address, error_code))
13017+ return;
13018+#endif
13019+ tsk->thread.cr2 = address;
13020+ tsk->thread.error_code = error_code;
13021+ tsk->thread.trap_no = 14;
13022+ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
13023+}
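
A stand-alone sketch, not part of the patch, of the page-fault error-code bits that do_page_fault() above tests (for example the "(error_code & 4) == 0" and "error_code & (PF_PROT|PF_WRITE)" checks). The bit values follow the x86 hardware layout; the decode() helper is illustrative only.

#include <stdio.h>

enum {
	PF_PROT  = 1 << 0,  /* 0: not-present page, 1: protection violation */
	PF_WRITE = 1 << 1,  /* 0: read access,      1: write access         */
	PF_USER  = 1 << 2,  /* 0: kernel mode,      1: user mode            */
	PF_RSVD  = 1 << 3,  /* reserved bit set in a paging entry           */
	PF_INSTR = 1 << 4,  /* instruction fetch (relevant with NX)         */
};

static void decode(unsigned long error_code)
{
	printf("%-6s %-5s %s\n",
	       (error_code & PF_USER)  ? "user"  : "kernel",
	       (error_code & PF_WRITE) ? "write" : "read",
	       (error_code & PF_PROT)  ? "protection" : "not-present");
}

int main(void)
{
	decode(PF_USER | PF_WRITE);   /* user write to a missing page      */
	decode(PF_PROT | PF_WRITE);   /* kernel write to a read-only page  */
	return 0;
}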
13024+
13025+DEFINE_SPINLOCK(pgd_lock);
13026+LIST_HEAD(pgd_list);
13027+
13028+void vmalloc_sync_all(void)
13029+{
13030+#ifdef CONFIG_X86_32
13031+ /*
13032+ * Note that races in the updates of insync and start aren't
13033+ * problematic: insync can only get set bits added, and updates to
13034+ * start are only improving performance (without affecting correctness
13035+ * if undone).
13036+ * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
13037+ * This change works just fine with 2-level paging too.
13038+ */
13039+#define sync_index(a) ((a) >> PMD_SHIFT)
13040+ static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
13041+ static unsigned long start = TASK_SIZE;
13042+ unsigned long address;
13043+
13044+ if (SHARED_KERNEL_PMD)
13045+ return;
13046+
13047+ BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
13048+ for (address = start;
13049+ address < hypervisor_virt_start;
13050+ address += PMD_SIZE) {
13051+ if (!test_bit(sync_index(address), insync)) {
13052+ unsigned long flags;
13053+ struct page *page;
13054+
13055+ spin_lock_irqsave(&pgd_lock, flags);
13056+ /* XEN: failure path assumes non-empty pgd_list. */
13057+ if (unlikely(list_empty(&pgd_list))) {
13058+ spin_unlock_irqrestore(&pgd_lock, flags);
13059+ return;
13060+ }
13061+ list_for_each_entry(page, &pgd_list, lru) {
13062+ if (!vmalloc_sync_one(page_address(page),
13063+ address))
13064+ break;
13065+ }
13066+ spin_unlock_irqrestore(&pgd_lock, flags);
13067+ if (!page)
13068+ set_bit(sync_index(address), insync);
13069+ }
13070+ if (address == start && test_bit(sync_index(address), insync))
13071+ start = address + PMD_SIZE;
13072+ }
13073+#else /* CONFIG_X86_64 */
13074+ /*
13075+ * Note that races in the updates of insync and start aren't
13076+ * problematic: insync can only get set bits added, and updates to
13077+ * start are only improving performance (without affecting correctness
13078+ * if undone).
13079+ */
13080+ static DECLARE_BITMAP(insync, PTRS_PER_PGD);
13081+ static unsigned long start = VMALLOC_START & PGDIR_MASK;
13082+ unsigned long address;
13083+
13084+ for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
13085+ if (!test_bit(pgd_index(address), insync)) {
13086+ const pgd_t *pgd_ref = pgd_offset_k(address);
13087+ unsigned long flags;
13088+ struct page *page;
13089+
13090+ if (pgd_none(*pgd_ref))
13091+ continue;
13092+ spin_lock_irqsave(&pgd_lock, flags);
13093+ list_for_each_entry(page, &pgd_list, lru) {
13094+ pgd_t *pgd;
13095+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
13096+ if (pgd_none(*pgd))
13097+ set_pgd(pgd, *pgd_ref);
13098+ else
13099+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
13100+ }
13101+ spin_unlock_irqrestore(&pgd_lock, flags);
13102+ set_bit(pgd_index(address), insync);
13103+ }
13104+ if (address == start)
13105+ start = address + PGDIR_SIZE;
13106+ }
13107+ /* Check that there is no need to do the same for the modules area. */
13108+ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
13109+ BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
13110+ (__START_KERNEL & PGDIR_MASK)));
13111+#endif
13112+}
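
A user-space sketch, not part of the patch, of the "insync" bitmap argument in vmalloc_sync_all() above: bits are only ever set, so a racing or lost update can at worst cause a redundant re-sync, never a wrong skip. The helpers below are simplified stand-ins for the kernel's test_bit/set_bit.

#include <stdio.h>

static unsigned long long insync;      /* 64 "PMD ranges" worth of flags  */
static int sync_count;

static int test_bit_(int n)  { return (insync >> n) & 1; }
static void set_bit_(int n)  { insync |= 1ULL << n; }

static void sync_range(void)
{
	int i;

	for (i = 0; i < 64; i++) {
		if (test_bit_(i))
			continue;      /* already synced: skip            */
		sync_count++;          /* (re)do the expensive sync work  */
		set_bit_(i);           /* a lost set here only costs a    */
	}                              /* wasted redo on the next pass    */
}

int main(void)
{
	sync_range();
	sync_range();                  /* second pass finds nothing to do */
	printf("sync operations: %d\n", sync_count);   /* prints 64       */
	return 0;
}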
13113--- a/arch/x86/mm/highmem_32-xen.c
13114+++ b/arch/x86/mm/highmem_32-xen.c
13115@@ -18,6 +18,49 @@ void kunmap(struct page *page)
13116 kunmap_high(page);
13117 }
13118
13119+static void debug_kmap_atomic_prot(enum km_type type)
13120+{
13121+#ifdef CONFIG_DEBUG_HIGHMEM
13122+ static unsigned warn_count = 10;
13123+
13124+ if (unlikely(warn_count == 0))
13125+ return;
13126+
13127+ if (unlikely(in_interrupt())) {
13128+ if (in_irq()) {
13129+ if (type != KM_IRQ0 && type != KM_IRQ1 &&
13130+ type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
13131+ type != KM_BOUNCE_READ) {
13132+ WARN_ON(1);
13133+ warn_count--;
13134+ }
13135+ } else if (!irqs_disabled()) { /* softirq */
13136+ if (type != KM_IRQ0 && type != KM_IRQ1 &&
13137+ type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
13138+ type != KM_SKB_SUNRPC_DATA &&
13139+ type != KM_SKB_DATA_SOFTIRQ &&
13140+ type != KM_BOUNCE_READ) {
13141+ WARN_ON(1);
13142+ warn_count--;
13143+ }
13144+ }
13145+ }
13146+
13147+ if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
13148+ type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
13149+ if (!irqs_disabled()) {
13150+ WARN_ON(1);
13151+ warn_count--;
13152+ }
13153+ } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
13154+ if (irq_count() == 0 && !irqs_disabled()) {
13155+ WARN_ON(1);
13156+ warn_count--;
13157+ }
13158+ }
13159+#endif
13160+}
13161+
13162 /*
13163 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
13164 * no global lock is needed and because the kmap code must perform a global TLB
13165@@ -37,6 +80,8 @@ void *kmap_atomic_prot(struct page *page
13166 if (!PageHighMem(page))
13167 return page_address(page);
13168
13169+ debug_kmap_atomic_prot(type);
13170+
13171 idx = type + KM_TYPE_NR*smp_processor_id();
13172 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
13173 BUG_ON(!pte_none(*(kmap_pte-idx)));
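
A small sketch, not part of the patch, of the kmap_atomic slot arithmetic shown in the hunk above: each CPU owns KM_TYPE_NR consecutive fixmap slots, and the km_type selects one slot inside that per-CPU window. The KM_TYPE_NR value and the cpu/type numbers below are illustrative, not the kernel's constants.

#include <stdio.h>

#define KM_TYPE_NR 16   /* assumed number of atomic-kmap types per CPU */

static int kmap_slot(int cpu, int type)
{
	return type + KM_TYPE_NR * cpu;   /* same formula as kmap_atomic_prot() */
}

int main(void)
{
	printf("cpu 0, type 3 -> slot %d\n", kmap_slot(0, 3));   /* 3  */
	printf("cpu 2, type 3 -> slot %d\n", kmap_slot(2, 3));   /* 35 */
	return 0;
}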
13174--- a/arch/x86/mm/hypervisor.c
13175+++ b/arch/x86/mm/hypervisor.c
13176@@ -831,15 +831,11 @@ int xen_limit_pages_to_max_mfn(
13177 }
13178 EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
13179
13180-#ifdef __i386__
13181-int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
13182+int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
13183 {
13184- __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
13185- maddr_t mach_lp = arbitrary_virt_to_machine(lp);
13186- return HYPERVISOR_update_descriptor(
13187- mach_lp, (u64)entry_a | ((u64)entry_b<<32));
13188+ maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry);
13189+ return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
13190 }
13191-#endif
13192
13193 #define MAX_BATCHED_FULL_PTES 32
13194
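
A sketch, not part of the patch, of what the write_ldt_entry() change above amounts to: the old code combined two 32-bit descriptor halves into a 64-bit value, while the new code reads the 8-byte descriptor as a single u64. On little-endian x86 both forms yield the same bits; the descriptor halves below are made-up values.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint32_t entry_a = 0x0000ffff;   /* illustrative low descriptor half  */
	uint32_t entry_b = 0x00cf9a00;   /* illustrative high descriptor half */

	/* old form: (u64)entry_a | ((u64)entry_b << 32) */
	uint64_t old_val = (uint64_t)entry_a | ((uint64_t)entry_b << 32);

	/* new form: lay the halves out as an 8-byte descriptor and read it
	 * back as one 64-bit value (little-endian layout assumed) */
	unsigned char desc[8];
	uint64_t new_val;

	memcpy(desc, &entry_a, 4);
	memcpy(desc + 4, &entry_b, 4);
	memcpy(&new_val, desc, 8);

	printf("old=%016llx new=%016llx equal=%d\n",
	       (unsigned long long)old_val, (unsigned long long)new_val,
	       old_val == new_val);
	return 0;
}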
13195--- a/arch/x86/mm/init_32-xen.c
13196+++ b/arch/x86/mm/init_32-xen.c
13197@@ -27,13 +27,13 @@
13198 #include <linux/bootmem.h>
13199 #include <linux/slab.h>
13200 #include <linux/proc_fs.h>
13201-#include <linux/efi.h>
13202 #include <linux/memory_hotplug.h>
13203 #include <linux/initrd.h>
13204 #include <linux/cpumask.h>
13205 #include <linux/dma-mapping.h>
13206 #include <linux/scatterlist.h>
13207
13208+#include <asm/asm.h>
13209 #include <asm/processor.h>
13210 #include <asm/system.h>
13211 #include <asm/uaccess.h>
13212@@ -42,18 +42,22 @@
13213 #include <asm/fixmap.h>
13214 #include <asm/e820.h>
13215 #include <asm/apic.h>
13216+#include <asm/bugs.h>
13217 #include <asm/tlb.h>
13218 #include <asm/tlbflush.h>
13219+#include <asm/pgalloc.h>
13220 #include <asm/sections.h>
13221 #include <asm/hypervisor.h>
13222 #include <asm/swiotlb.h>
13223+#include <asm/setup.h>
13224+#include <asm/cacheflush.h>
13225
13226 unsigned int __VMALLOC_RESERVE = 128 << 20;
13227
13228 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
13229 unsigned long highstart_pfn, highend_pfn;
13230
13231-static int noinline do_test_wp_bit(void);
13232+static noinline int do_test_wp_bit(void);
13233
13234 /*
13235 * Creates a middle page table and puts a pointer to it in the
13236@@ -64,17 +68,16 @@ static pmd_t * __init one_md_table_init(
13237 {
13238 pud_t *pud;
13239 pmd_t *pmd_table;
13240-
13241+
13242 #ifdef CONFIG_X86_PAE
13243 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
13244 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
13245
13246- paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
13247+ paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
13248 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
13249 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
13250 pud = pud_offset(pgd, 0);
13251- if (pmd_table != pmd_offset(pud, 0))
13252- BUG();
13253+ BUG_ON(pmd_table != pmd_offset(pud, 0));
13254 }
13255 #endif
13256 pud = pud_offset(pgd, 0);
13257@@ -85,7 +88,7 @@ static pmd_t * __init one_md_table_init(
13258
13259 /*
13260 * Create a page table and place a pointer to it in a middle page
13261- * directory entry.
13262+ * directory entry:
13263 */
13264 static pte_t * __init one_page_table_init(pmd_t *pmd)
13265 {
13266@@ -99,9 +102,10 @@ static pte_t * __init one_page_table_ini
13267 #ifdef CONFIG_DEBUG_PAGEALLOC
13268 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
13269 #endif
13270- if (!page_table)
13271+ if (!page_table) {
13272 page_table =
13273 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
13274+ }
13275
13276 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
13277 make_lowmem_page_readonly(page_table,
13278@@ -114,22 +118,21 @@ static pte_t * __init one_page_table_ini
13279 }
13280
13281 /*
13282- * This function initializes a certain range of kernel virtual memory
13283+ * This function initializes a certain range of kernel virtual memory
13284 * with new bootmem page tables, everywhere page tables are missing in
13285 * the given range.
13286- */
13287-
13288-/*
13289- * NOTE: The pagetables are allocated contiguous on the physical space
13290- * so we can cache the place of the first one and move around without
13291+ *
13292+ * NOTE: The pagetables are allocated contiguous on the physical space
13293+ * so we can cache the place of the first one and move around without
13294 * checking the pgd every time.
13295 */
13296-static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
13297+static void __init
13298+page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
13299 {
13300- pgd_t *pgd;
13301- pmd_t *pmd;
13302 int pgd_idx, pmd_idx;
13303 unsigned long vaddr;
13304+ pgd_t *pgd;
13305+ pmd_t *pmd;
13306
13307 vaddr = start;
13308 pgd_idx = pgd_index(vaddr);
13309@@ -139,7 +142,8 @@ static void __init page_table_range_init
13310 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
13311 pmd = one_md_table_init(pgd);
13312 pmd = pmd + pmd_index(vaddr);
13313- for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
13314+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
13315+ pmd++, pmd_idx++) {
13316 if (vaddr < hypervisor_virt_start)
13317 one_page_table_init(pmd);
13318
13319@@ -157,17 +161,17 @@ static inline int is_kernel_text(unsigne
13320 }
13321
13322 /*
13323- * This maps the physical memory to kernel virtual address space, a total
13324- * of max_low_pfn pages, by creating page tables starting from address
13325- * PAGE_OFFSET.
13326+ * This maps the physical memory to kernel virtual address space, a total
13327+ * of max_low_pfn pages, by creating page tables starting from address
13328+ * PAGE_OFFSET:
13329 */
13330 static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
13331 {
13332+ int pgd_idx, pmd_idx, pte_ofs;
13333 unsigned long pfn;
13334 pgd_t *pgd;
13335 pmd_t *pmd;
13336 pte_t *pte;
13337- int pgd_idx, pmd_idx, pte_ofs;
13338
13339 unsigned long max_ram_pfn = xen_start_info->nr_pages;
13340 if (max_ram_pfn > max_low_pfn)
13341@@ -195,36 +199,49 @@ static void __init kernel_physical_mappi
13342 if (pfn >= max_low_pfn)
13343 continue;
13344 pmd += pmd_idx;
13345- for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
13346- unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
13347- if (address >= hypervisor_virt_start)
13348+ for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
13349+ pmd++, pmd_idx++) {
13350+ unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
13351+
13352+ if (addr >= hypervisor_virt_start)
13353 continue;
13354
13355- /* Map with big pages if possible, otherwise create normal page tables. */
13356+ /*
13357+ * Map with big pages if possible, otherwise
13358+ * create normal page tables:
13359+ */
13360 if (cpu_has_pse) {
13361- unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
13362- if (is_kernel_text(address) || is_kernel_text(address2))
13363- set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
13364- else
13365- set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
13366+ unsigned int addr2;
13367+ pgprot_t prot = PAGE_KERNEL_LARGE;
13368+
13369+ addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
13370+ PAGE_OFFSET + PAGE_SIZE-1;
13371+
13372+ if (is_kernel_text(addr) ||
13373+ is_kernel_text(addr2))
13374+ prot = PAGE_KERNEL_LARGE_EXEC;
13375+
13376+ set_pmd(pmd, pfn_pmd(pfn, prot));
13377
13378 pfn += PTRS_PER_PTE;
13379- } else {
13380- pte = one_page_table_init(pmd);
13381+ continue;
13382+ }
13383+ pte = one_page_table_init(pmd);
13384+
13385+ for (pte += pte_ofs;
13386+ pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13387+ pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
13388+ pgprot_t prot = PAGE_KERNEL;
13389+
13390+ /* XEN: Only map initial RAM allocation. */
13391+ if ((pfn >= max_ram_pfn) || pte_present(*pte))
13392+ continue;
13393+ if (is_kernel_text(addr))
13394+ prot = PAGE_KERNEL_EXEC;
13395
13396- for (pte += pte_ofs;
13397- pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13398- pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
13399- /* XEN: Only map initial RAM allocation. */
13400- if ((pfn >= max_ram_pfn) || pte_present(*pte))
13401- continue;
13402- if (is_kernel_text(address))
13403- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
13404- else
13405- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
13406- }
13407- pte_ofs = 0;
13408+ set_pte(pte, pfn_pte(pfn, prot));
13409 }
13410+ pte_ofs = 0;
13411 }
13412 pmd_idx = 0;
13413 }
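
A worked example, not part of the patch, of how much address space one PSE large page covers in the cpu_has_pse branch above: PTRS_PER_PTE small pages per large mapping. The PTRS_PER_PTE values used are the usual 32-bit x86 ones (1024 without PAE, 512 with PAE).

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long large_page_bytes(unsigned long ptrs_per_pte)
{
	return ptrs_per_pte * PAGE_SIZE;
}

int main(void)
{
	printf("non-PAE: %lu KiB\n", large_page_bytes(1024) >> 10); /* 4096 KiB = 4 MiB */
	printf("PAE:     %lu KiB\n", large_page_bytes(512)  >> 10); /* 2048 KiB = 2 MiB */
	return 0;
}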
13414@@ -245,57 +262,23 @@ static inline int page_kills_ppro(unsign
13415
13416 #endif
13417
13418-int page_is_ram(unsigned long pagenr)
13419-{
13420- int i;
13421- unsigned long addr, end;
13422-
13423- if (efi_enabled) {
13424- efi_memory_desc_t *md;
13425- void *p;
13426-
13427- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
13428- md = p;
13429- if (!is_available_memory(md))
13430- continue;
13431- addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13432- end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
13433-
13434- if ((pagenr >= addr) && (pagenr < end))
13435- return 1;
13436- }
13437- return 0;
13438- }
13439-
13440- for (i = 0; i < e820.nr_map; i++) {
13441-
13442- if (e820.map[i].type != E820_RAM) /* not usable memory */
13443- continue;
13444- /*
13445- * !!!FIXME!!! Some BIOSen report areas as RAM that
13446- * are not. Notably the 640->1Mb area. We need a sanity
13447- * check here.
13448- */
13449- addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13450- end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
13451- if ((pagenr >= addr) && (pagenr < end))
13452- return 1;
13453- }
13454- return 0;
13455-}
13456-
13457 #ifdef CONFIG_HIGHMEM
13458 pte_t *kmap_pte;
13459 pgprot_t kmap_prot;
13460
13461-#define kmap_get_fixmap_pte(vaddr) \
13462- pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
13463+static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
13464+{
13465+ return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
13466+ vaddr), vaddr), vaddr);
13467+}
13468
13469 static void __init kmap_init(void)
13470 {
13471 unsigned long kmap_vstart;
13472
13473- /* cache the first kmap pte */
13474+ /*
13475+ * Cache the first kmap pte:
13476+ */
13477 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
13478 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
13479
13480@@ -304,11 +287,11 @@ static void __init kmap_init(void)
13481
13482 static void __init permanent_kmaps_init(pgd_t *pgd_base)
13483 {
13484+ unsigned long vaddr;
13485 pgd_t *pgd;
13486 pud_t *pud;
13487 pmd_t *pmd;
13488 pte_t *pte;
13489- unsigned long vaddr;
13490
13491 vaddr = PKMAP_BASE;
13492 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
13493@@ -317,7 +300,7 @@ static void __init permanent_kmaps_init(
13494 pud = pud_offset(pgd, vaddr);
13495 pmd = pmd_offset(pud, vaddr);
13496 pte = pte_offset_kernel(pmd, vaddr);
13497- pkmap_page_table = pte;
13498+ pkmap_page_table = pte;
13499 }
13500
13501 static void __meminit free_new_highpage(struct page *page, int pfn)
13502@@ -337,7 +320,8 @@ void __init add_one_highpage_init(struct
13503 SetPageReserved(page);
13504 }
13505
13506-static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13507+static int __meminit
13508+add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13509 {
13510 free_new_highpage(page, pfn);
13511 totalram_pages++;
13512@@ -345,6 +329,7 @@ static int __meminit add_one_highpage_ho
13513 max_mapnr = max(pfn, max_mapnr);
13514 #endif
13515 num_physpages++;
13516+
13517 return 0;
13518 }
13519
13520@@ -352,7 +337,7 @@ static int __meminit add_one_highpage_ho
13521 * Not currently handling the NUMA case.
13522 * Assuming single node and all memory that
13523 * has been added dynamically that would be
13524- * onlined here is in HIGHMEM
13525+ * onlined here is in HIGHMEM.
13526 */
13527 void __meminit online_page(struct page *page)
13528 {
13529@@ -360,13 +345,11 @@ void __meminit online_page(struct page *
13530 add_one_highpage_hotplug(page, page_to_pfn(page));
13531 }
13532
13533-
13534-#ifdef CONFIG_NUMA
13535-extern void set_highmem_pages_init(int);
13536-#else
13537+#ifndef CONFIG_NUMA
13538 static void __init set_highmem_pages_init(int bad_ppro)
13539 {
13540 int pfn;
13541+
13542 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
13543 /*
13544 * Holes under sparsemem might not have no mem_map[]:
13545@@ -376,23 +359,18 @@ static void __init set_highmem_pages_ini
13546 }
13547 totalram_pages += totalhigh_pages;
13548 }
13549-#endif /* CONFIG_FLATMEM */
13550+#endif /* !CONFIG_NUMA */
13551
13552 #else
13553-#define kmap_init() do { } while (0)
13554-#define permanent_kmaps_init(pgd_base) do { } while (0)
13555-#define set_highmem_pages_init(bad_ppro) do { } while (0)
13556+# define kmap_init() do { } while (0)
13557+# define permanent_kmaps_init(pgd_base) do { } while (0)
13558+# define set_highmem_pages_init(bad_ppro) do { } while (0)
13559 #endif /* CONFIG_HIGHMEM */
13560
13561-unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
13562+pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
13563 EXPORT_SYMBOL(__PAGE_KERNEL);
13564-unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13565
13566-#ifdef CONFIG_NUMA
13567-extern void __init remap_numa_kva(void);
13568-#else
13569-#define remap_numa_kva() do {} while (0)
13570-#endif
13571+pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13572
13573 pgd_t *swapper_pg_dir;
13574
13575@@ -410,9 +388,8 @@ static void __init xen_pagetable_setup_d
13576 * the boot process.
13577 *
13578 * If we're booting on native hardware, this will be a pagetable
13579- * constructed in arch/i386/kernel/head.S, and not running in PAE mode
13580- * (even if we'll end up running in PAE). The root of the pagetable
13581- * will be swapper_pg_dir.
13582+ * constructed in arch/x86/kernel/head_32.S. The root of the
13583+ * pagetable will be swapper_pg_dir.
13584 *
13585 * If we're booting paravirtualized under a hypervisor, then there are
13586 * more options: we may already be running PAE, and the pagetable may
13587@@ -424,10 +401,10 @@ static void __init xen_pagetable_setup_d
13588 * be partially populated, and so it avoids stomping on any existing
13589 * mappings.
13590 */
13591-static void __init pagetable_init (void)
13592+static void __init pagetable_init(void)
13593 {
13594- unsigned long vaddr, end;
13595 pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
13596+ unsigned long vaddr, end;
13597
13598 xen_pagetable_setup_start(pgd_base);
13599
13600@@ -449,34 +426,36 @@ static void __init pagetable_init (void)
13601 * Fixed mappings, only the page table structure has to be
13602 * created - mappings will be set by set_fixmap():
13603 */
13604+ early_ioremap_clear();
13605 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
13606 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
13607 page_table_range_init(vaddr, end, pgd_base);
13608+ early_ioremap_reset();
13609
13610 permanent_kmaps_init(pgd_base);
13611
13612 xen_pagetable_setup_done(pgd_base);
13613 }
13614
13615-#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
13616+#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
13617 /*
13618- * Swap suspend & friends need this for resume because things like the intel-agp
13619+ * ACPI suspend needs this for resume, because things like the intel-agp
13620 * driver might have split up a kernel 4MB mapping.
13621 */
13622-char __nosavedata swsusp_pg_dir[PAGE_SIZE]
13623- __attribute__ ((aligned (PAGE_SIZE)));
13624+char swsusp_pg_dir[PAGE_SIZE]
13625+ __attribute__ ((aligned(PAGE_SIZE)));
13626
13627 static inline void save_pg_dir(void)
13628 {
13629 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
13630 }
13631-#else
13632+#else /* !CONFIG_ACPI_SLEEP */
13633 static inline void save_pg_dir(void)
13634 {
13635 }
13636-#endif
13637+#endif /* !CONFIG_ACPI_SLEEP */
13638
13639-void zap_low_mappings (void)
13640+void zap_low_mappings(void)
13641 {
13642 int i;
13643
13644@@ -488,22 +467,24 @@ void zap_low_mappings (void)
13645 * Note that "pgd_clear()" doesn't do it for
13646 * us, because pgd_clear() is a no-op on i386.
13647 */
13648- for (i = 0; i < USER_PTRS_PER_PGD; i++)
13649+ for (i = 0; i < USER_PTRS_PER_PGD; i++) {
13650 #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13651 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
13652 #else
13653 set_pgd(swapper_pg_dir+i, __pgd(0));
13654 #endif
13655+ }
13656 flush_tlb_all();
13657 }
13658
13659-int nx_enabled = 0;
13660+int nx_enabled;
13661+
13662+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
13663+EXPORT_SYMBOL_GPL(__supported_pte_mask);
13664
13665 #ifdef CONFIG_X86_PAE
13666
13667-static int disable_nx __initdata = 0;
13668-u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
13669-EXPORT_SYMBOL_GPL(__supported_pte_mask);
13670+static int disable_nx __initdata;
13671
13672 /*
13673 * noexec = on|off
13674@@ -520,11 +501,14 @@ static int __init noexec_setup(char *str
13675 __supported_pte_mask |= _PAGE_NX;
13676 disable_nx = 0;
13677 }
13678- } else if (!strcmp(str,"off")) {
13679- disable_nx = 1;
13680- __supported_pte_mask &= ~_PAGE_NX;
13681- } else
13682- return -EINVAL;
13683+ } else {
13684+ if (!strcmp(str, "off")) {
13685+ disable_nx = 1;
13686+ __supported_pte_mask &= ~_PAGE_NX;
13687+ } else {
13688+ return -EINVAL;
13689+ }
13690+ }
13691
13692 return 0;
13693 }
13694@@ -536,6 +520,7 @@ static void __init set_nx(void)
13695
13696 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
13697 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
13698+
13699 if ((v[3] & (1 << 20)) && !disable_nx) {
13700 rdmsr(MSR_EFER, l, h);
13701 l |= EFER_NX;
13702@@ -545,35 +530,6 @@ static void __init set_nx(void)
13703 }
13704 }
13705 }
13706-
13707-/*
13708- * Enables/disables executability of a given kernel page and
13709- * returns the previous setting.
13710- */
13711-int __init set_kernel_exec(unsigned long vaddr, int enable)
13712-{
13713- pte_t *pte;
13714- int ret = 1;
13715-
13716- if (!nx_enabled)
13717- goto out;
13718-
13719- pte = lookup_address(vaddr);
13720- BUG_ON(!pte);
13721-
13722- if (!pte_exec_kernel(*pte))
13723- ret = 0;
13724-
13725- if (enable)
13726- pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
13727- else
13728- pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
13729- pte_update_defer(&init_mm, vaddr, pte);
13730- __flush_tlb_all();
13731-out:
13732- return ret;
13733-}
13734-
13735 #endif
13736
13737 /*
13738@@ -590,21 +546,10 @@ void __init paging_init(void)
13739 #ifdef CONFIG_X86_PAE
13740 set_nx();
13741 if (nx_enabled)
13742- printk("NX (Execute Disable) protection: active\n");
13743+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
13744 #endif
13745-
13746 pagetable_init();
13747
13748-#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13749- /*
13750- * We will bail out later - printk doesn't work right now so
13751- * the user would just see a hanging kernel.
13752- * when running as xen domain we are already in PAE mode at
13753- * this point.
13754- */
13755- if (cpu_has_pae)
13756- set_in_cr4(X86_CR4_PAE);
13757-#endif
13758 __flush_tlb_all();
13759
13760 kmap_init();
13761@@ -631,10 +576,10 @@ void __init paging_init(void)
13762 * used to involve black magic jumps to work around some nasty CPU bugs,
13763 * but fortunately the switch to using exceptions got rid of all that.
13764 */
13765-
13766 static void __init test_wp_bit(void)
13767 {
13768- printk("Checking if this processor honours the WP bit even in supervisor mode... ");
13769+ printk(KERN_INFO
13770+ "Checking if this processor honours the WP bit even in supervisor mode...");
13771
13772 /* Any page-aligned address will do, the test is non-destructive */
13773 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
13774@@ -642,23 +587,22 @@ static void __init test_wp_bit(void)
13775 clear_fixmap(FIX_WP_TEST);
13776
13777 if (!boot_cpu_data.wp_works_ok) {
13778- printk("No.\n");
13779+ printk(KERN_CONT "No.\n");
13780 #ifdef CONFIG_X86_WP_WORKS_OK
13781- panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13782+ panic(
13783+ "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13784 #endif
13785 } else {
13786- printk("Ok.\n");
13787+ printk(KERN_CONT "Ok.\n");
13788 }
13789 }
13790
13791-static struct kcore_list kcore_mem, kcore_vmalloc;
13792+static struct kcore_list kcore_mem, kcore_vmalloc;
13793
13794 void __init mem_init(void)
13795 {
13796- extern int ppro_with_ram_bug(void);
13797 int codesize, reservedpages, datasize, initsize;
13798- int tmp;
13799- int bad_ppro;
13800+ int tmp, bad_ppro;
13801 unsigned long pfn;
13802
13803 #if defined(CONFIG_SWIOTLB)
13804@@ -668,19 +612,19 @@ void __init mem_init(void)
13805 #ifdef CONFIG_FLATMEM
13806 BUG_ON(!mem_map);
13807 #endif
13808-
13809 bad_ppro = ppro_with_ram_bug();
13810
13811 #ifdef CONFIG_HIGHMEM
13812 /* check that fixmap and pkmap do not overlap */
13813- if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13814- printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
13815+ if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13816+ printk(KERN_ERR
13817+ "fixmap and kmap areas overlap - this will crash\n");
13818 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
13819- PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
13820+ PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
13821+ FIXADDR_START);
13822 BUG();
13823 }
13824 #endif
13825-
13826 /* this will put all low memory onto the freelists */
13827 totalram_pages += free_all_bootmem();
13828 /* XEN: init and count low-mem pages outside initial allocation. */
13829@@ -693,7 +637,7 @@ void __init mem_init(void)
13830 reservedpages = 0;
13831 for (tmp = 0; tmp < max_low_pfn; tmp++)
13832 /*
13833- * Only count reserved RAM pages
13834+ * Only count reserved RAM pages:
13835 */
13836 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
13837 reservedpages++;
13838@@ -704,11 +648,12 @@ void __init mem_init(void)
13839 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
13840 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
13841
13842- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13843- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13844+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13845+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13846 VMALLOC_END-VMALLOC_START);
13847
13848- printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
13849+ printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
13850+ "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
13851 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
13852 num_physpages << (PAGE_SHIFT-10),
13853 codesize >> 10,
13854@@ -719,54 +664,53 @@ void __init mem_init(void)
13855 );
13856
13857 #if 1 /* double-sanity-check paranoia */
13858- printk("virtual kernel memory layout:\n"
13859- " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13860+ printk(KERN_INFO "virtual kernel memory layout:\n"
13861+ " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13862 #ifdef CONFIG_HIGHMEM
13863- " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13864+ " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13865 #endif
13866- " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
13867- " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
13868- " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
13869- " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
13870- " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
13871- FIXADDR_START, FIXADDR_TOP,
13872- (FIXADDR_TOP - FIXADDR_START) >> 10,
13873+ " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
13874+ " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
13875+ " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
13876+ " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
13877+ " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
13878+ FIXADDR_START, FIXADDR_TOP,
13879+ (FIXADDR_TOP - FIXADDR_START) >> 10,
13880
13881 #ifdef CONFIG_HIGHMEM
13882- PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13883- (LAST_PKMAP*PAGE_SIZE) >> 10,
13884+ PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13885+ (LAST_PKMAP*PAGE_SIZE) >> 10,
13886 #endif
13887
13888- VMALLOC_START, VMALLOC_END,
13889- (VMALLOC_END - VMALLOC_START) >> 20,
13890+ VMALLOC_START, VMALLOC_END,
13891+ (VMALLOC_END - VMALLOC_START) >> 20,
13892
13893- (unsigned long)__va(0), (unsigned long)high_memory,
13894- ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13895+ (unsigned long)__va(0), (unsigned long)high_memory,
13896+ ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13897
13898- (unsigned long)&__init_begin, (unsigned long)&__init_end,
13899- ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
13900+ (unsigned long)&__init_begin, (unsigned long)&__init_end,
13901+ ((unsigned long)&__init_end -
13902+ (unsigned long)&__init_begin) >> 10,
13903
13904- (unsigned long)&_etext, (unsigned long)&_edata,
13905- ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13906+ (unsigned long)&_etext, (unsigned long)&_edata,
13907+ ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13908
13909- (unsigned long)&_text, (unsigned long)&_etext,
13910- ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13911+ (unsigned long)&_text, (unsigned long)&_etext,
13912+ ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13913
13914 #ifdef CONFIG_HIGHMEM
13915- BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
13916- BUG_ON(VMALLOC_END > PKMAP_BASE);
13917+ BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
13918+ BUG_ON(VMALLOC_END > PKMAP_BASE);
13919 #endif
13920- BUG_ON(VMALLOC_START > VMALLOC_END);
13921- BUG_ON((unsigned long)high_memory > VMALLOC_START);
13922+ BUG_ON(VMALLOC_START > VMALLOC_END);
13923+ BUG_ON((unsigned long)high_memory > VMALLOC_START);
13924 #endif /* double-sanity-check paranoia */
13925
13926-#ifdef CONFIG_X86_PAE
13927- if (!cpu_has_pae)
13928- panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
13929-#endif
13930 if (boot_cpu_data.wp_works_ok < 0)
13931 test_wp_bit();
13932
13933+ cpa_init();
13934+
13935 /*
13936 * Subtle. SMP is doing it's boot stuff late (because it has to
13937 * fork idle threads) - but it also needs low mappings for the
13938@@ -790,49 +734,35 @@ int arch_add_memory(int nid, u64 start,
13939
13940 return __add_pages(zone, start_pfn, nr_pages);
13941 }
13942-
13943 #endif
13944
13945-struct kmem_cache *pmd_cache;
13946-
13947-void __init pgtable_cache_init(void)
13948-{
13949- if (PTRS_PER_PMD > 1)
13950- pmd_cache = kmem_cache_create("pmd",
13951- PTRS_PER_PMD*sizeof(pmd_t),
13952- PTRS_PER_PMD*sizeof(pmd_t),
13953- SLAB_PANIC,
13954- pmd_ctor);
13955-}
13956-
13957 /*
13958 * This function cannot be __init, since exceptions don't work in that
13959 * section. Put this after the callers, so that it cannot be inlined.
13960 */
13961-static int noinline do_test_wp_bit(void)
13962+static noinline int do_test_wp_bit(void)
13963 {
13964 char tmp_reg;
13965 int flag;
13966
13967 __asm__ __volatile__(
13968- " movb %0,%1 \n"
13969- "1: movb %1,%0 \n"
13970- " xorl %2,%2 \n"
13971+ " movb %0, %1 \n"
13972+ "1: movb %1, %0 \n"
13973+ " xorl %2, %2 \n"
13974 "2: \n"
13975- ".section __ex_table,\"a\"\n"
13976- " .align 4 \n"
13977- " .long 1b,2b \n"
13978- ".previous \n"
13979+ _ASM_EXTABLE(1b,2b)
13980 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
13981 "=q" (tmp_reg),
13982 "=r" (flag)
13983 :"2" (1)
13984 :"memory");
13985-
13986+
13987 return flag;
13988 }
13989
13990 #ifdef CONFIG_DEBUG_RODATA
13991+const int rodata_test_data = 0xC3;
13992+EXPORT_SYMBOL_GPL(rodata_test_data);
13993
13994 void mark_rodata_ro(void)
13995 {
13996@@ -845,32 +775,58 @@ void mark_rodata_ro(void)
13997 if (num_possible_cpus() <= 1)
13998 #endif
13999 {
14000- change_page_attr(virt_to_page(start),
14001- size >> PAGE_SHIFT, PAGE_KERNEL_RX);
14002- printk("Write protecting the kernel text: %luk\n", size >> 10);
14003+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
14004+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
14005+ size >> 10);
14006+
14007+#ifdef CONFIG_CPA_DEBUG
14008+ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
14009+ start, start+size);
14010+ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
14011+
14012+ printk(KERN_INFO "Testing CPA: write protecting again\n");
14013+ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
14014+#endif
14015 }
14016 #endif
14017 start += size;
14018 size = (unsigned long)__end_rodata - start;
14019- change_page_attr(virt_to_page(start),
14020- size >> PAGE_SHIFT, PAGE_KERNEL_RO);
14021- printk("Write protecting the kernel read-only data: %luk\n",
14022- size >> 10);
14023+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
14024+ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
14025+ size >> 10);
14026+ rodata_test();
14027+
14028+#ifdef CONFIG_CPA_DEBUG
14029+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
14030+ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
14031
14032- /*
14033- * change_page_attr() requires a global_flush_tlb() call after it.
14034- * We do this after the printk so that if something went wrong in the
14035- * change, the printk gets out at least to give a better debug hint
14036- * of who is the culprit.
14037- */
14038- global_flush_tlb();
14039+ printk(KERN_INFO "Testing CPA: write protecting again\n");
14040+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
14041+#endif
14042 }
14043 #endif
14044
14045 void free_init_pages(char *what, unsigned long begin, unsigned long end)
14046 {
14047+#ifdef CONFIG_DEBUG_PAGEALLOC
14048+ /*
14049+ * If debugging page accesses then do not free this memory but
14050+ * mark them not present - any buggy init-section access will
14051+ * create a kernel page fault:
14052+ */
14053+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
14054+ begin, PAGE_ALIGN(end));
14055+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
14056+#else
14057 unsigned long addr;
14058
14059+ /*
14060+ * We just marked the kernel text read only above, now that
14061+ * we are going to free part of that, we need to make that
14062+ * writeable first.
14063+ */
14064+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
14065+
14066 for (addr = begin; addr < end; addr += PAGE_SIZE) {
14067 ClearPageReserved(virt_to_page(addr));
14068 init_page_count(virt_to_page(addr));
14069@@ -879,6 +835,7 @@ void free_init_pages(char *what, unsigne
14070 totalram_pages++;
14071 }
14072 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
14073+#endif
14074 }
14075
14076 void free_initmem(void)
14077@@ -894,4 +851,3 @@ void free_initrd_mem(unsigned long start
14078 free_init_pages("initrd memory", start, end);
14079 }
14080 #endif
14081-
14082--- a/arch/x86/mm/init_64-xen.c
14083+++ b/arch/x86/mm/init_64-xen.c
14084@@ -46,14 +46,13 @@
14085 #include <asm/proto.h>
14086 #include <asm/smp.h>
14087 #include <asm/sections.h>
14088+#include <asm/kdebug.h>
14089+#include <asm/numa.h>
14090+#include <asm/cacheflush.h>
14091
14092 #include <xen/features.h>
14093
14094-#ifndef Dprintk
14095-#define Dprintk(x...)
14096-#endif
14097-
14098-const struct dma_mapping_ops* dma_ops;
14099+const struct dma_mapping_ops *dma_ops;
14100 EXPORT_SYMBOL(dma_ops);
14101
14102 #if CONFIG_XEN_COMPAT <= 0x030002
14103@@ -80,7 +79,21 @@ extern pte_t level1_fixmap_pgt[PTRS_PER_
14104 (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
14105 __START_KERNEL_map)))
14106
14107-static void __meminit early_make_page_readonly(void *va, unsigned int feature)
14108+pmd_t *__init early_get_pmd(unsigned long va)
14109+{
14110+ unsigned long addr;
14111+ unsigned long *page = (unsigned long *)init_level4_pgt;
14112+
14113+ addr = page[pgd_index(va)];
14114+ addr_to_page(addr, page);
14115+
14116+ addr = page[pud_index(va)];
14117+ addr_to_page(addr, page);
14118+
14119+ return (pmd_t *)&page[pmd_index(va)];
14120+}
14121+
14122+void __meminit early_make_page_readonly(void *va, unsigned int feature)
14123 {
14124 unsigned long addr, _va = (unsigned long)va;
14125 pte_t pte, *ptep;
14126@@ -107,76 +120,6 @@ static void __meminit early_make_page_re
14127 BUG();
14128 }
14129
14130-static void __make_page_readonly(void *va)
14131-{
14132- pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
14133- unsigned long addr = (unsigned long) va;
14134-
14135- pgd = pgd_offset_k(addr);
14136- pud = pud_offset(pgd, addr);
14137- pmd = pmd_offset(pud, addr);
14138- ptep = pte_offset_kernel(pmd, addr);
14139-
14140- pte.pte = ptep->pte & ~_PAGE_RW;
14141- if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14142- xen_l1_entry_update(ptep, pte); /* fallback */
14143-
14144- if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14145- __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
14146-}
14147-
14148-static void __make_page_writable(void *va)
14149-{
14150- pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
14151- unsigned long addr = (unsigned long) va;
14152-
14153- pgd = pgd_offset_k(addr);
14154- pud = pud_offset(pgd, addr);
14155- pmd = pmd_offset(pud, addr);
14156- ptep = pte_offset_kernel(pmd, addr);
14157-
14158- pte.pte = ptep->pte | _PAGE_RW;
14159- if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14160- xen_l1_entry_update(ptep, pte); /* fallback */
14161-
14162- if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14163- __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
14164-}
14165-
14166-void make_page_readonly(void *va, unsigned int feature)
14167-{
14168- if (!xen_feature(feature))
14169- __make_page_readonly(va);
14170-}
14171-
14172-void make_page_writable(void *va, unsigned int feature)
14173-{
14174- if (!xen_feature(feature))
14175- __make_page_writable(va);
14176-}
14177-
14178-void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
14179-{
14180- if (xen_feature(feature))
14181- return;
14182-
14183- while (nr-- != 0) {
14184- __make_page_readonly(va);
14185- va = (void*)((unsigned long)va + PAGE_SIZE);
14186- }
14187-}
14188-
14189-void make_pages_writable(void *va, unsigned nr, unsigned int feature)
14190-{
14191- if (xen_feature(feature))
14192- return;
14193-
14194- while (nr-- != 0) {
14195- __make_page_writable(va);
14196- va = (void*)((unsigned long)va + PAGE_SIZE);
14197- }
14198-}
14199-
14200 /*
14201 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
14202 * physical space so we can cache the place of the first one and move
14203@@ -187,22 +130,26 @@ void show_mem(void)
14204 {
14205 long i, total = 0, reserved = 0;
14206 long shared = 0, cached = 0;
14207- pg_data_t *pgdat;
14208 struct page *page;
14209+ pg_data_t *pgdat;
14210
14211 printk(KERN_INFO "Mem-info:\n");
14212 show_free_areas();
14213- printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
14214+ printk(KERN_INFO "Free swap: %6ldkB\n",
14215+ nr_swap_pages << (PAGE_SHIFT-10));
14216
14217 for_each_online_pgdat(pgdat) {
14218- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14219- /* this loop can take a while with 256 GB and 4k pages
14220- so update the NMI watchdog */
14221- if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
14222+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14223+ /*
14224+ * This loop can take a while with 256 GB and
14225+ * 4k pages so defer the NMI watchdog:
14226+ */
14227+ if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
14228 touch_nmi_watchdog();
14229- }
14230+
14231 if (!pfn_valid(pgdat->node_start_pfn + i))
14232 continue;
14233+
14234 page = pfn_to_page(pgdat->node_start_pfn + i);
14235 total++;
14236 if (PageReserved(page))
14237@@ -211,58 +158,67 @@ void show_mem(void)
14238 cached++;
14239 else if (page_count(page))
14240 shared += page_count(page) - 1;
14241- }
14242+ }
14243 }
14244- printk(KERN_INFO "%lu pages of RAM\n", total);
14245- printk(KERN_INFO "%lu reserved pages\n",reserved);
14246- printk(KERN_INFO "%lu pages shared\n",shared);
14247- printk(KERN_INFO "%lu pages swap cached\n",cached);
14248+ printk(KERN_INFO "%lu pages of RAM\n", total);
14249+ printk(KERN_INFO "%lu reserved pages\n", reserved);
14250+ printk(KERN_INFO "%lu pages shared\n", shared);
14251+ printk(KERN_INFO "%lu pages swap cached\n", cached);
14252 }
14253
14254+static unsigned long __meminitdata table_start;
14255+static unsigned long __meminitdata table_end;
14256
14257 static __init void *spp_getpage(void)
14258-{
14259+{
14260 void *ptr;
14261+
14262 if (after_bootmem)
14263- ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14264+ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14265 else if (start_pfn < table_end) {
14266 ptr = __va(start_pfn << PAGE_SHIFT);
14267 start_pfn++;
14268 memset(ptr, 0, PAGE_SIZE);
14269 } else
14270 ptr = alloc_bootmem_pages(PAGE_SIZE);
14271- if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
14272- panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
14273
14274- Dprintk("spp_getpage %p\n", ptr);
14275+ if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
14276+ panic("set_pte_phys: cannot allocate page data %s\n",
14277+ after_bootmem ? "after bootmem" : "");
14278+ }
14279+
14280+ pr_debug("spp_getpage %p\n", ptr);
14281+
14282 return ptr;
14283-}
14284+}
14285
14286 #define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
14287 #define pud_offset_u(address) (level3_user_pgt + pud_index(address))
14288
14289-static __init void set_pte_phys(unsigned long vaddr,
14290- unsigned long phys, pgprot_t prot, int user_mode)
14291+static __init void
14292+set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
14293 {
14294 pgd_t *pgd;
14295 pud_t *pud;
14296 pmd_t *pmd;
14297 pte_t *pte, new_pte;
14298
14299- Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14300+ pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
14301
14302 pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
14303 if (pgd_none(*pgd)) {
14304- printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14305+ printk(KERN_ERR
14306+ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14307 return;
14308 }
14309 pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
14310 if (pud_none(*pud)) {
14311- pmd = (pmd_t *) spp_getpage();
14312+ pmd = (pmd_t *) spp_getpage();
14313 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14314 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14315 if (pmd != pmd_offset(pud, 0)) {
14316- printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14317+ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14318+ pmd, pmd_offset(pud, 0));
14319 return;
14320 }
14321 }
14322@@ -272,7 +228,7 @@ static __init void set_pte_phys(unsigned
14323 make_page_readonly(pte, XENFEAT_writable_page_tables);
14324 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14325 if (pte != pte_offset_kernel(pmd, 0)) {
14326- printk("PAGETABLE BUG #02!\n");
14327+ printk(KERN_ERR "PAGETABLE BUG #02!\n");
14328 return;
14329 }
14330 }
14331@@ -294,30 +250,30 @@ static __init void set_pte_phys(unsigned
14332 __flush_tlb_one(vaddr);
14333 }
14334
14335-static __init void set_pte_phys_ma(unsigned long vaddr,
14336- unsigned long phys, pgprot_t prot)
14337+static __init void
14338+set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
14339 {
14340 pgd_t *pgd;
14341 pud_t *pud;
14342 pmd_t *pmd;
14343 pte_t *pte, new_pte;
14344
14345- Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14346+ pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
14347
14348 pgd = pgd_offset_k(vaddr);
14349 if (pgd_none(*pgd)) {
14350- printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14351+ printk(KERN_ERR
14352+ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14353 return;
14354 }
14355 pud = pud_offset(pgd, vaddr);
14356 if (pud_none(*pud)) {
14357-
14358- pmd = (pmd_t *) spp_getpage();
14359+ pmd = (pmd_t *) spp_getpage();
14360 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14361 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14362 if (pmd != pmd_offset(pud, 0)) {
14363- printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14364- return;
14365+ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14366+ pmd, pmd_offset(pud, 0));
14367 }
14368 }
14369 pmd = pmd_offset(pud, vaddr);
14370@@ -326,7 +282,7 @@ static __init void set_pte_phys_ma(unsig
14371 make_page_readonly(pte, XENFEAT_writable_page_tables);
14372 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14373 if (pte != pte_offset_kernel(pmd, 0)) {
14374- printk("PAGETABLE BUG #02!\n");
14375+ printk(KERN_ERR "PAGETABLE BUG #02!\n");
14376 return;
14377 }
14378 }
14379@@ -350,14 +306,44 @@ static __init void set_pte_phys_ma(unsig
14380 __flush_tlb_one(vaddr);
14381 }
14382
14383+#ifndef CONFIG_XEN
14384+/*
14385+ * The head.S code sets up the kernel high mapping:
14386+ *
14387+ * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
14388+ *
14389+ * phys_addr holds the negative offset to the kernel, which is added
14390+ * to the compile time generated pmds. This results in invalid pmds up
14391+ * to the point where we hit the physaddr 0 mapping.
14392+ *
14393+ * We limit the mappings to the region from _text to _end. _end is
14394+ * rounded up to the 2MB boundary. This catches the invalid pmds as
14395+ * well, as they are located before _text:
14396+ */
14397+void __init cleanup_highmap(void)
14398+{
14399+ unsigned long vaddr = __START_KERNEL_map;
14400+ unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
14401+ pmd_t *pmd = level2_kernel_pgt;
14402+ pmd_t *last_pmd = pmd + PTRS_PER_PMD;
14403+
14404+ for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
14405+ if (!pmd_present(*pmd))
14406+ continue;
14407+ if (vaddr < (unsigned long) _text || vaddr > end)
14408+ set_pmd(pmd, __pmd(0));
14409+ }
14410+}
14411+#endif
14412+
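
A worked example, not part of the patch, of the rounding done by cleanup_highmap() above: _end is rounded up to the 2 MiB PMD boundary. The round_up() macro below is one common formulation for a power-of-two size, and the sample address is made up.

#include <stdio.h>

#define PMD_SIZE (2UL * 1024 * 1024)                      /* 2 MiB on x86-64      */
#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1))     /* y: power of two      */

int main(void)
{
	unsigned long end = 0xffffffff8073abcdUL;         /* pretend value of _end */
	printf("last mapped address: %#lx\n",
	       round_up(end, PMD_SIZE) - 1);              /* end of the 2 MiB PMD  */
	return 0;
}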
14413 /* NOTE: this is meant to be run only at boot */
14414-void __init
14415-__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14416+void __init
14417+__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14418 {
14419 unsigned long address = __fix_to_virt(idx);
14420
14421 if (idx >= __end_of_fixed_addresses) {
14422- printk("Invalid __set_fixmap\n");
14423+ printk(KERN_ERR "Invalid __set_fixmap\n");
14424 return;
14425 }
14426 switch (idx) {
14427@@ -375,16 +361,14 @@ __set_fixmap (enum fixed_addresses idx,
14428 }
14429 }
14430
14431-unsigned long __meminitdata table_start, table_end;
14432-
14433 static __meminit void *alloc_static_page(unsigned long *phys)
14434 {
14435 unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
14436
14437 if (after_bootmem) {
14438 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
14439-
14440 *phys = __pa(adr);
14441+
14442 return adr;
14443 }
14444
14445@@ -396,7 +380,7 @@ static __meminit void *alloc_static_page
14446
14447 #define PTE_SIZE PAGE_SIZE
14448
14449-static inline int make_readonly(unsigned long paddr)
14450+static inline int __meminit make_readonly(unsigned long paddr)
14451 {
14452 extern char __vsyscall_0;
14453 int readonly = 0;
14454@@ -430,33 +414,38 @@ static inline int make_readonly(unsigned
14455 /* Must run before zap_low_mappings */
14456 __meminit void *early_ioremap(unsigned long addr, unsigned long size)
14457 {
14458- unsigned long vaddr;
14459 pmd_t *pmd, *last_pmd;
14460+ unsigned long vaddr;
14461 int i, pmds;
14462
14463 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14464 vaddr = __START_KERNEL_map;
14465 pmd = level2_kernel_pgt;
14466 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
14467+
14468 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
14469 for (i = 0; i < pmds; i++) {
14470 if (pmd_present(pmd[i]))
14471- goto next;
14472+ goto continue_outer_loop;
14473 }
14474 vaddr += addr & ~PMD_MASK;
14475 addr &= PMD_MASK;
14476+
14477 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
14478- set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
14479- __flush_tlb();
14480+ set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
14481+ __flush_tlb_all();
14482+
14483 return (void *)vaddr;
14484- next:
14485+continue_outer_loop:
14486 ;
14487 }
14488 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
14489 return NULL;
14490 }
14491
14492-/* To avoid virtual aliases later */
14493+/*
14494+ * To avoid virtual aliases later:
14495+ */
14496 __meminit void early_iounmap(void *addr, unsigned long size)
14497 {
14498 unsigned long vaddr;
14499@@ -466,9 +455,11 @@ __meminit void early_iounmap(void *addr,
14500 vaddr = (unsigned long)addr;
14501 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14502 pmd = level2_kernel_pgt + pmd_index(vaddr);
14503+
14504 for (i = 0; i < pmds; i++)
14505 pmd_clear(pmd + i);
14506- __flush_tlb();
14507+
14508+ __flush_tlb_all();
14509 }
14510 #endif
14511
14512@@ -517,18 +508,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
14513 static void __meminit
14514 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
14515 {
14516- pmd_t *pmd = pmd_offset(pud,0);
14517+ pmd_t *pmd = pmd_offset(pud, 0);
14518 spin_lock(&init_mm.page_table_lock);
14519 phys_pmd_init(pmd, address, end);
14520 spin_unlock(&init_mm.page_table_lock);
14521 __flush_tlb_all();
14522 }
14523
14524-static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14525-{
14526+static void __meminit
14527+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14528+{
14529 int i = pud_index(addr);
14530
14531- for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
14532+ for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
14533 unsigned long pmd_phys;
14534 pud_t *pud = pud_page + pud_index(addr);
14535 pmd_t *pmd;
14536@@ -550,8 +542,8 @@ static void __meminit phys_pud_init(pud_
14537
14538 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
14539 }
14540- __flush_tlb();
14541-}
14542+ __flush_tlb_all();
14543+}
14544
14545 void __init xen_init_pt(void)
14546 {
14547@@ -632,6 +624,7 @@ void __init xen_init_pt(void)
14548 static void __init extend_init_mapping(unsigned long tables_space)
14549 {
14550 unsigned long va = __START_KERNEL_map;
14551+ unsigned long start = start_pfn;
14552 unsigned long phys, addr, *pte_page;
14553 pmd_t *pmd;
14554 pte_t *pte, new_pte;
14555@@ -682,6 +675,10 @@ static void __init extend_init_mapping(u
14556 BUG();
14557 va += PAGE_SIZE;
14558 }
14559+
14560+ if (start_pfn > start)
14561+ reserve_early(start << PAGE_SHIFT,
14562+ start_pfn << PAGE_SHIFT, "INITMAP");
14563 }
14564
14565 static void __init find_early_table_space(unsigned long end)
14566@@ -706,7 +703,7 @@ static void __init find_early_table_spac
14567 (table_start << PAGE_SHIFT) + tables);
14568 }
14569
14570-static void xen_finish_init_mapping(void)
14571+static void __init xen_finish_init_mapping(void)
14572 {
14573 unsigned long i, start, end;
14574
14575@@ -742,8 +739,10 @@ static void xen_finish_init_mapping(void
14576 * Prefetch pte's for the bt_ioremap() area. It gets used before the
14577 * boot-time allocator is online, so allocate-on-demand would fail.
14578 */
14579+ early_ioremap_clear();
14580 for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
14581 __set_fixmap(i, 0, __pgprot(0));
14582+ early_ioremap_reset();
14583
14584 /* Switch to the real shared_info page, and clear the dummy page. */
14585 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
14586@@ -764,20 +763,23 @@ static void xen_finish_init_mapping(void
14587 table_end = start_pfn;
14588 }
14589
14590-/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
14591- This runs before bootmem is initialized and gets pages directly from the
14592- physical memory. To access them they are temporarily mapped. */
14593+/*
14594+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
14595+ * This runs before bootmem is initialized and gets pages directly from
14596+ * the physical memory. To access them they are temporarily mapped.
14597+ */
14598 void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
14599-{
14600+{
14601 unsigned long next;
14602
14603- Dprintk("init_memory_mapping\n");
14604+ pr_debug("init_memory_mapping\n");
14605
14606- /*
14607+ /*
14608 * Find space for the kernel direct mapping tables.
14609- * Later we should allocate these tables in the local node of the memory
14610- * mapped. Unfortunately this is done currently before the nodes are
14611- * discovered.
14612+ *
14613+ * Later we should allocate these tables in the local node of the
14614+ * memory mapped. Unfortunately this is done currently before the
14615+ * nodes are discovered.
14616 */
14617 if (!after_bootmem)
14618 find_early_table_space(end);
14619@@ -786,8 +788,8 @@ void __init_refok init_memory_mapping(un
14620 end = (unsigned long)__va(end);
14621
14622 for (; start < end; start = next) {
14623- unsigned long pud_phys;
14624 pgd_t *pgd = pgd_offset_k(start);
14625+ unsigned long pud_phys;
14626 pud_t *pud;
14627
14628 if (after_bootmem)
14629@@ -795,8 +797,8 @@ void __init_refok init_memory_mapping(un
14630 else
14631 pud = alloc_static_page(&pud_phys);
14632 next = start + PGDIR_SIZE;
14633- if (next > end)
14634- next = end;
14635+ if (next > end)
14636+ next = end;
14637 phys_pud_init(pud, __pa(start), __pa(next));
14638 if (!after_bootmem) {
14639 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
14640@@ -810,12 +812,17 @@ void __init_refok init_memory_mapping(un
14641 }
14642
14643 __flush_tlb_all();
14644+
14645+ if (!after_bootmem)
14646+ reserve_early(table_start << PAGE_SHIFT,
14647+ table_end << PAGE_SHIFT, "PGTABLE");
14648 }
14649
14650 #ifndef CONFIG_NUMA
14651 void __init paging_init(void)
14652 {
14653 unsigned long max_zone_pfns[MAX_NR_ZONES];
14654+
14655 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
14656 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
14657 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
14658@@ -829,40 +836,6 @@ void __init paging_init(void)
14659 }
14660 #endif
14661
14662-/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
14663- from the CPU leading to inconsistent cache lines. address and size
14664- must be aligned to 2MB boundaries.
14665- Does nothing when the mapping doesn't exist. */
14666-void __init clear_kernel_mapping(unsigned long address, unsigned long size)
14667-{
14668- unsigned long end = address + size;
14669-
14670- BUG_ON(address & ~LARGE_PAGE_MASK);
14671- BUG_ON(size & ~LARGE_PAGE_MASK);
14672-
14673- for (; address < end; address += LARGE_PAGE_SIZE) {
14674- pgd_t *pgd = pgd_offset_k(address);
14675- pud_t *pud;
14676- pmd_t *pmd;
14677- if (pgd_none(*pgd))
14678- continue;
14679- pud = pud_offset(pgd, address);
14680- if (pud_none(*pud))
14681- continue;
14682- pmd = pmd_offset(pud, address);
14683- if (!pmd || pmd_none(*pmd))
14684- continue;
14685- if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
14686- /* Could handle this, but it should not happen currently. */
14687- printk(KERN_ERR
14688- "clear_kernel_mapping: mapping has been split. will leak memory\n");
14689- pmd_ERROR(*pmd);
14690- }
14691- set_pmd(pmd, __pmd(0));
14692- }
14693- __flush_tlb_all();
14694-}
14695-
14696 /*
14697 * Memory hotplug specific functions
14698 */
14699@@ -888,16 +861,12 @@ int arch_add_memory(int nid, u64 start,
14700 unsigned long nr_pages = size >> PAGE_SHIFT;
14701 int ret;
14702
14703- init_memory_mapping(start, (start + size -1));
14704+ init_memory_mapping(start, start + size-1);
14705
14706 ret = __add_pages(zone, start_pfn, nr_pages);
14707- if (ret)
14708- goto error;
14709+ WARN_ON(1);
14710
14711 return ret;
14712-error:
14713- printk("%s: Problem encountered in __add_pages!\n", __func__);
14714- return ret;
14715 }
14716 EXPORT_SYMBOL_GPL(arch_add_memory);
14717
14718@@ -911,36 +880,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
14719
14720 #endif /* CONFIG_MEMORY_HOTPLUG */
14721
14722-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
14723-/*
14724- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
14725- * just online the pages.
14726- */
14727-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
14728-{
14729- int err = -EIO;
14730- unsigned long pfn;
14731- unsigned long total = 0, mem = 0;
14732- for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
14733- if (pfn_valid(pfn)) {
14734- online_page(pfn_to_page(pfn));
14735- err = 0;
14736- mem++;
14737- }
14738- total++;
14739- }
14740- if (!err) {
14741- z->spanned_pages += total;
14742- z->present_pages += mem;
14743- z->zone_pgdat->node_spanned_pages += total;
14744- z->zone_pgdat->node_present_pages += mem;
14745- }
14746- return err;
14747-}
14748-#endif
14749-
14750-static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
14751- kcore_vsyscall;
14752+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
14753+ kcore_modules, kcore_vsyscall;
14754
14755 void __init mem_init(void)
14756 {
14757@@ -949,8 +890,7 @@ void __init mem_init(void)
14758
14759 pci_iommu_alloc();
14760
14761- /* clear the zero-page */
14762- memset(empty_zero_page, 0, PAGE_SIZE);
14763+ /* clear_bss() already clear the empty_zero_page */
14764
14765 reservedpages = 0;
14766
14767@@ -968,7 +908,6 @@ void __init mem_init(void)
14768 }
14769 reservedpages = end_pfn - totalram_pages -
14770 absent_pages_in_range(0, end_pfn);
14771-
14772 after_bootmem = 1;
14773
14774 codesize = (unsigned long) &_etext - (unsigned long) &_text;
14775@@ -976,46 +915,64 @@ void __init mem_init(void)
14776 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
14777
14778 /* Register memory areas for /proc/kcore */
14779- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14780- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14781+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14782+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14783 VMALLOC_END-VMALLOC_START);
14784 kclist_add(&kcore_kernel, &_stext, _end - _stext);
14785 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
14786- kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14787+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14788 VSYSCALL_END - VSYSCALL_START);
14789
14790- printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
14791+ printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
14792+ "%ldk reserved, %ldk data, %ldk init)\n",
14793 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
14794 end_pfn << (PAGE_SHIFT-10),
14795 codesize >> 10,
14796 reservedpages << (PAGE_SHIFT-10),
14797 datasize >> 10,
14798 initsize >> 10);
14799+
14800+ cpa_init();
14801 }
14802
14803 void free_init_pages(char *what, unsigned long begin, unsigned long end)
14804 {
14805- unsigned long addr;
14806+ unsigned long addr = begin;
14807
14808- if (begin >= end)
14809+ if (addr >= end)
14810 return;
14811
14812+ /*
14813+ * If debugging page accesses then do not free this memory but
14814+ * mark them not present - any buggy init-section access will
14815+ * create a kernel page fault:
14816+ */
14817+#ifdef CONFIG_DEBUG_PAGEALLOC
14818+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
14819+ begin, PAGE_ALIGN(end));
14820+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
14821+#else
14822 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
14823- for (addr = begin; addr < end; addr += PAGE_SIZE) {
14824+
14825+ for (; addr < end; addr += PAGE_SIZE) {
14826 ClearPageReserved(virt_to_page(addr));
14827 init_page_count(virt_to_page(addr));
14828 memset((void *)(addr & ~(PAGE_SIZE-1)),
14829 POISON_FREE_INITMEM, PAGE_SIZE);
14830 if (addr >= __START_KERNEL_map) {
14831 /* make_readonly() reports all kernel addresses. */
14832- __make_page_writable(__va(__pa(addr)));
14833- change_page_attr_addr(addr, 1, __pgprot(0));
14834+ if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
14835+ pfn_pte(__pa(addr) >> PAGE_SHIFT,
14836+ PAGE_KERNEL),
14837+ 0))
14838+ BUG();
14839+ if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
14840+ BUG();
14841 }
14842 free_page(addr);
14843 totalram_pages++;
14844 }
14845- if (addr > __START_KERNEL_map)
14846- global_flush_tlb();
14847+#endif
14848 }
14849
14850 void free_initmem(void)
14851@@ -1026,6 +983,8 @@ void free_initmem(void)
14852 }
14853
14854 #ifdef CONFIG_DEBUG_RODATA
14855+const int rodata_test_data = 0xC3;
14856+EXPORT_SYMBOL_GPL(rodata_test_data);
14857
14858 void mark_rodata_ro(void)
14859 {
14860@@ -1047,18 +1006,27 @@ void mark_rodata_ro(void)
14861 if (end <= start)
14862 return;
14863
14864- change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
14865
14866 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
14867 (end - start) >> 10);
14868+ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
14869
14870 /*
14871- * change_page_attr_addr() requires a global_flush_tlb() call after it.
14872- * We do this after the printk so that if something went wrong in the
14873- * change, the printk gets out at least to give a better debug hint
14874- * of who is the culprit.
14875+ * The rodata section (but not the kernel text!) should also be
14876+ * not-executable.
14877 */
14878- global_flush_tlb();
14879+ start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
14880+ set_memory_nx(start, (end - start) >> PAGE_SHIFT);
14881+
14882+ rodata_test();
14883+
14884+#ifdef CONFIG_CPA_DEBUG
14885+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
14886+ set_memory_rw(start, (end-start) >> PAGE_SHIFT);
14887+
14888+ printk(KERN_INFO "Testing CPA: again\n");
14889+ set_memory_ro(start, (end-start) >> PAGE_SHIFT);
14890+#endif
14891 }
14892 #endif
14893
14894@@ -1069,17 +1037,21 @@ void free_initrd_mem(unsigned long start
14895 }
14896 #endif
14897
14898-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14899-{
14900+void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14901+{
14902 #ifdef CONFIG_NUMA
14903 int nid = phys_to_nid(phys);
14904 #endif
14905 unsigned long pfn = phys >> PAGE_SHIFT;
14906+
14907 if (pfn >= end_pfn) {
14908- /* This can happen with kdump kernels when accessing firmware
14909- tables. */
14910+ /*
14911+ * This can happen with kdump kernels when accessing
14912+ * firmware tables:
14913+ */
14914 if (pfn < end_pfn_map)
14915 return;
14916+
14917 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
14918 phys, len);
14919 return;
14920@@ -1087,9 +1059,9 @@ void __init reserve_bootmem_generic(unsi
14921
14922 /* Should check here against the e820 map to avoid double free */
14923 #ifdef CONFIG_NUMA
14924- reserve_bootmem_node(NODE_DATA(nid), phys, len);
14925-#else
14926- reserve_bootmem(phys, len);
14927+ reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
14928+#else
14929+ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
14930 #endif
14931 #ifndef CONFIG_XEN
14932 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
14933@@ -1101,46 +1073,49 @@ void __init reserve_bootmem_generic(unsi
14934 #endif
14935 }
14936
14937-int kern_addr_valid(unsigned long addr)
14938-{
14939+int kern_addr_valid(unsigned long addr)
14940+{
14941 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
14942- pgd_t *pgd;
14943- pud_t *pud;
14944- pmd_t *pmd;
14945- pte_t *pte;
14946+ pgd_t *pgd;
14947+ pud_t *pud;
14948+ pmd_t *pmd;
14949+ pte_t *pte;
14950
14951 if (above != 0 && above != -1UL)
14952- return 0;
14953-
14954+ return 0;
14955+
14956 pgd = pgd_offset_k(addr);
14957 if (pgd_none(*pgd))
14958 return 0;
14959
14960 pud = pud_offset(pgd, addr);
14961 if (pud_none(*pud))
14962- return 0;
14963+ return 0;
14964
14965 pmd = pmd_offset(pud, addr);
14966 if (pmd_none(*pmd))
14967 return 0;
14968+
14969 if (pmd_large(*pmd))
14970 return pfn_valid(pmd_pfn(*pmd));
14971
14972 pte = pte_offset_kernel(pmd, addr);
14973 if (pte_none(*pte))
14974 return 0;
14975+
14976 return pfn_valid(pte_pfn(*pte));
14977 }
14978
14979-/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
14980- covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14981- not need special handling anymore. */
14982-
14983+/*
14984+ * A pseudo VMA to allow ptrace access for the vsyscall page. This only
14985+ * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14986+ * not need special handling anymore:
14987+ */
14988 static struct vm_area_struct gate_vma = {
14989- .vm_start = VSYSCALL_START,
14990- .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
14991- .vm_page_prot = PAGE_READONLY_EXEC,
14992- .vm_flags = VM_READ | VM_EXEC
14993+ .vm_start = VSYSCALL_START,
14994+ .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
14995+ .vm_page_prot = PAGE_READONLY_EXEC,
14996+ .vm_flags = VM_READ | VM_EXEC
14997 };
14998
14999 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
15000@@ -1155,14 +1130,17 @@ struct vm_area_struct *get_gate_vma(stru
15001 int in_gate_area(struct task_struct *task, unsigned long addr)
15002 {
15003 struct vm_area_struct *vma = get_gate_vma(task);
15004+
15005 if (!vma)
15006 return 0;
15007+
15008 return (addr >= vma->vm_start) && (addr < vma->vm_end);
15009 }
15010
15011-/* Use this when you have no reliable task/vma, typically from interrupt
15012- * context. It is less reliable than using the task's vma and may give
15013- * false positives.
15014+/*
15015+ * Use this when you have no reliable task/vma, typically from interrupt
15016+ * context. It is less reliable than using the task's vma and may give
15017+ * false positives:
15018 */
15019 int in_gate_area_no_task(unsigned long addr)
15020 {
15021@@ -1182,8 +1160,8 @@ const char *arch_vma_name(struct vm_area
15022 /*
15023 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
15024 */
15025-int __meminit vmemmap_populate(struct page *start_page,
15026- unsigned long size, int node)
15027+int __meminit
15028+vmemmap_populate(struct page *start_page, unsigned long size, int node)
15029 {
15030 unsigned long addr = (unsigned long)start_page;
15031 unsigned long end = (unsigned long)(start_page + size);
15032@@ -1198,6 +1176,7 @@ int __meminit vmemmap_populate(struct pa
15033 pgd = vmemmap_pgd_populate(addr, node);
15034 if (!pgd)
15035 return -ENOMEM;
15036+
15037 pud = vmemmap_pud_populate(pgd, addr, node);
15038 if (!pud)
15039 return -ENOMEM;
15040@@ -1205,20 +1184,22 @@ int __meminit vmemmap_populate(struct pa
15041 pmd = pmd_offset(pud, addr);
15042 if (pmd_none(*pmd)) {
15043 pte_t entry;
15044- void *p = vmemmap_alloc_block(PMD_SIZE, node);
15045+ void *p;
15046+
15047+ p = vmemmap_alloc_block(PMD_SIZE, node);
15048 if (!p)
15049 return -ENOMEM;
15050
15051- entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
15052- mk_pte_huge(entry);
15053- set_pmd(pmd, __pmd(pte_val(entry)));
15054+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
15055+ PAGE_KERNEL_LARGE);
15056+ set_pmd(pmd, __pmd_ma(__pte_val(entry)));
15057
15058 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
15059 addr, addr + PMD_SIZE - 1, p, node);
15060- } else
15061+ } else {
15062 vmemmap_verify((pte_t *)pmd, node, addr, next);
15063+ }
15064 }
15065-
15066 return 0;
15067 }
15068 #endif
15069--- a/arch/x86/mm/ioremap_32-xen.c
15070+++ /dev/null
15071@@ -1,445 +0,0 @@
15072-/*
15073- * arch/i386/mm/ioremap.c
15074- *
15075- * Re-map IO memory to kernel address space so that we can access it.
15076- * This is needed for high PCI addresses that aren't mapped in the
15077- * 640k-1MB IO memory area on PC's
15078- *
15079- * (C) Copyright 1995 1996 Linus Torvalds
15080- */
15081-
15082-#include <linux/vmalloc.h>
15083-#include <linux/init.h>
15084-#include <linux/slab.h>
15085-#include <linux/module.h>
15086-#include <linux/io.h>
15087-#include <linux/sched.h>
15088-#include <asm/fixmap.h>
15089-#include <asm/cacheflush.h>
15090-#include <asm/tlbflush.h>
15091-#include <asm/pgtable.h>
15092-#include <asm/pgalloc.h>
15093-
15094-#define ISA_START_ADDRESS 0x0
15095-#define ISA_END_ADDRESS 0x100000
15096-
15097-static int direct_remap_area_pte_fn(pte_t *pte,
15098- struct page *pmd_page,
15099- unsigned long address,
15100- void *data)
15101-{
15102- mmu_update_t **v = (mmu_update_t **)data;
15103-
15104- BUG_ON(!pte_none(*pte));
15105-
15106- (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15107- PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15108- (*v)++;
15109-
15110- return 0;
15111-}
15112-
15113-static int __direct_remap_pfn_range(struct mm_struct *mm,
15114- unsigned long address,
15115- unsigned long mfn,
15116- unsigned long size,
15117- pgprot_t prot,
15118- domid_t domid)
15119-{
15120- int rc;
15121- unsigned long i, start_address;
15122- mmu_update_t *u, *v, *w;
15123-
15124- u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15125- if (u == NULL)
15126- return -ENOMEM;
15127-
15128- start_address = address;
15129-
15130- flush_cache_all();
15131-
15132- for (i = 0; i < size; i += PAGE_SIZE) {
15133- if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15134- /* Flush a full batch after filling in the PTE ptrs. */
15135- rc = apply_to_page_range(mm, start_address,
15136- address - start_address,
15137- direct_remap_area_pte_fn, &w);
15138- if (rc)
15139- goto out;
15140- rc = -EFAULT;
15141- if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15142- goto out;
15143- v = w = u;
15144- start_address = address;
15145- }
15146-
15147- /*
15148- * Fill in the machine address: PTE ptr is done later by
15149- * apply_to_page_range().
15150- */
15151- v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15152-
15153- mfn++;
15154- address += PAGE_SIZE;
15155- v++;
15156- }
15157-
15158- if (v != u) {
15159- /* Final batch. */
15160- rc = apply_to_page_range(mm, start_address,
15161- address - start_address,
15162- direct_remap_area_pte_fn, &w);
15163- if (rc)
15164- goto out;
15165- rc = -EFAULT;
15166- if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15167- goto out;
15168- }
15169-
15170- rc = 0;
15171-
15172- out:
15173- flush_tlb_all();
15174-
15175- free_page((unsigned long)u);
15176-
15177- return rc;
15178-}
15179-
15180-int direct_remap_pfn_range(struct vm_area_struct *vma,
15181- unsigned long address,
15182- unsigned long mfn,
15183- unsigned long size,
15184- pgprot_t prot,
15185- domid_t domid)
15186-{
15187- if (xen_feature(XENFEAT_auto_translated_physmap))
15188- return remap_pfn_range(vma, address, mfn, size, prot);
15189-
15190- if (domid == DOMID_SELF)
15191- return -EINVAL;
15192-
15193- vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15194-
15195- vma->vm_mm->context.has_foreign_mappings = 1;
15196-
15197- return __direct_remap_pfn_range(
15198- vma->vm_mm, address, mfn, size, prot, domid);
15199-}
15200-EXPORT_SYMBOL(direct_remap_pfn_range);
15201-
15202-int direct_kernel_remap_pfn_range(unsigned long address,
15203- unsigned long mfn,
15204- unsigned long size,
15205- pgprot_t prot,
15206- domid_t domid)
15207-{
15208- return __direct_remap_pfn_range(
15209- &init_mm, address, mfn, size, prot, domid);
15210-}
15211-EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15212-
15213-static int lookup_pte_fn(
15214- pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15215-{
15216- uint64_t *ptep = (uint64_t *)data;
15217- if (ptep)
15218- *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15219- PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15220- return 0;
15221-}
15222-
15223-int create_lookup_pte_addr(struct mm_struct *mm,
15224- unsigned long address,
15225- uint64_t *ptep)
15226-{
15227- return apply_to_page_range(mm, address, PAGE_SIZE,
15228- lookup_pte_fn, ptep);
15229-}
15230-
15231-EXPORT_SYMBOL(create_lookup_pte_addr);
15232-
15233-static int noop_fn(
15234- pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15235-{
15236- return 0;
15237-}
15238-
15239-int touch_pte_range(struct mm_struct *mm,
15240- unsigned long address,
15241- unsigned long size)
15242-{
15243- return apply_to_page_range(mm, address, size, noop_fn, NULL);
15244-}
15245-
15246-EXPORT_SYMBOL(touch_pte_range);
15247-
15248-/*
15249- * Does @address reside within a non-highmem page that is local to this virtual
15250- * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
15251- * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
15252- * why this works.
15253- */
15254-static inline int is_local_lowmem(unsigned long address)
15255-{
15256- extern unsigned long max_low_pfn;
15257- return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
15258-}
15259-
15260-/*
15261- * Generic mapping function (not visible outside):
15262- */
15263-
15264-/*
15265- * Remap an arbitrary physical address space into the kernel virtual
15266- * address space. Needed when the kernel wants to access high addresses
15267- * directly.
15268- *
15269- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15270- * have to convert them into an offset in a page-aligned mapping, but the
15271- * caller shouldn't need to know that small detail.
15272- */
15273-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
15274-{
15275- void __iomem * addr;
15276- struct vm_struct * area;
15277- unsigned long offset, last_addr;
15278- pgprot_t prot;
15279- domid_t domid = DOMID_IO;
15280-
15281- /* Don't allow wraparound or zero size */
15282- last_addr = phys_addr + size - 1;
15283- if (!size || last_addr < phys_addr)
15284- return NULL;
15285-
15286- /*
15287- * Don't remap the low PCI/ISA area, it's always mapped..
15288- */
15289- if (is_initial_xendomain() &&
15290- phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
15291- return (void __iomem *) isa_bus_to_virt(phys_addr);
15292-
15293- /*
15294- * Don't allow anybody to remap normal RAM that we're using..
15295- */
15296- if (is_local_lowmem(phys_addr)) {
15297- char *t_addr, *t_end;
15298- struct page *page;
15299-
15300- t_addr = bus_to_virt(phys_addr);
15301- t_end = t_addr + (size - 1);
15302-
15303- for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
15304- if(!PageReserved(page))
15305- return NULL;
15306-
15307- domid = DOMID_SELF;
15308- }
15309-
15310- prot = __pgprot(_KERNPG_TABLE | flags);
15311-
15312- /*
15313- * Mappings have to be page-aligned
15314- */
15315- offset = phys_addr & ~PAGE_MASK;
15316- phys_addr &= PAGE_MASK;
15317- size = PAGE_ALIGN(last_addr+1) - phys_addr;
15318-
15319- /*
15320- * Ok, go for it..
15321- */
15322- area = get_vm_area(size, VM_IOREMAP | (flags << 20));
15323- if (!area)
15324- return NULL;
15325- area->phys_addr = phys_addr;
15326- addr = (void __iomem *) area->addr;
15327- if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
15328- phys_addr>>PAGE_SHIFT,
15329- size, prot, domid)) {
15330- vunmap((void __force *) addr);
15331- return NULL;
15332- }
15333- return (void __iomem *) (offset + (char __iomem *)addr);
15334-}
15335-EXPORT_SYMBOL(__ioremap);
15336-
15337-/**
15338- * ioremap_nocache - map bus memory into CPU space
15339- * @offset: bus address of the memory
15340- * @size: size of the resource to map
15341- *
15342- * ioremap_nocache performs a platform specific sequence of operations to
15343- * make bus memory CPU accessible via the readb/readw/readl/writeb/
15344- * writew/writel functions and the other mmio helpers. The returned
15345- * address is not guaranteed to be usable directly as a virtual
15346- * address.
15347- *
15348- * This version of ioremap ensures that the memory is marked uncachable
15349- * on the CPU as well as honouring existing caching rules from things like
15350- * the PCI bus. Note that there are other caches and buffers on many
15351- * busses. In particular driver authors should read up on PCI writes
15352- *
15353- * It's useful if some control registers are in such an area and
15354- * write combining or read caching is not desirable:
15355- *
15356- * Must be freed with iounmap.
15357- */
15358-
15359-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
15360-{
15361- unsigned long last_addr;
15362- void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
15363- if (!p)
15364- return p;
15365-
15366- /* Guaranteed to be > phys_addr, as per __ioremap() */
15367- last_addr = phys_addr + size - 1;
15368-
15369- if (is_local_lowmem(last_addr)) {
15370- struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
15371- unsigned long npages;
15372-
15373- phys_addr &= PAGE_MASK;
15374-
15375- /* This might overflow and become zero.. */
15376- last_addr = PAGE_ALIGN(last_addr);
15377-
15378- /* .. but that's ok, because modulo-2**n arithmetic will make
15379- * the page-aligned "last - first" come out right.
15380- */
15381- npages = (last_addr - phys_addr) >> PAGE_SHIFT;
15382-
15383- if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
15384- iounmap(p);
15385- p = NULL;
15386- }
15387- global_flush_tlb();
15388- }
15389-
15390- return p;
15391-}
15392-EXPORT_SYMBOL(ioremap_nocache);
15393-
15394-/**
15395- * iounmap - Free a IO remapping
15396- * @addr: virtual address from ioremap_*
15397- *
15398- * Caller must ensure there is only one unmapping for the same pointer.
15399- */
15400-void iounmap(volatile void __iomem *addr)
15401-{
15402- struct vm_struct *p, *o;
15403-
15404- if ((void __force *)addr <= high_memory)
15405- return;
15406-
15407- /*
15408- * __ioremap special-cases the PCI/ISA range by not instantiating a
15409- * vm_area and by simply returning an address into the kernel mapping
15410- * of ISA space. So handle that here.
15411- */
15412- if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15413- return;
15414-
15415- addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
15416-
15417- /* Use the vm area unlocked, assuming the caller
15418- ensures there isn't another iounmap for the same address
15419- in parallel. Reuse of the virtual address is prevented by
15420- leaving it in the global lists until we're done with it.
15421- cpa takes care of the direct mappings. */
15422- read_lock(&vmlist_lock);
15423- for (p = vmlist; p; p = p->next) {
15424- if (p->addr == addr)
15425- break;
15426- }
15427- read_unlock(&vmlist_lock);
15428-
15429- if (!p) {
15430- printk("iounmap: bad address %p\n", addr);
15431- dump_stack();
15432- return;
15433- }
15434-
15435- /* Reset the direct mapping. Can block */
15436- if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
15437- change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
15438- get_vm_area_size(p) >> PAGE_SHIFT,
15439- PAGE_KERNEL);
15440- global_flush_tlb();
15441- }
15442-
15443- /* Finally remove it */
15444- o = remove_vm_area((void *)addr);
15445- BUG_ON(p != o || o == NULL);
15446- kfree(p);
15447-}
15448-EXPORT_SYMBOL(iounmap);
15449-
15450-void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
15451-{
15452- unsigned long offset, last_addr;
15453- unsigned int nrpages;
15454- enum fixed_addresses idx;
15455-
15456- /* Don't allow wraparound or zero size */
15457- last_addr = phys_addr + size - 1;
15458- if (!size || last_addr < phys_addr)
15459- return NULL;
15460-
15461- /*
15462- * Don't remap the low PCI/ISA area, it's always mapped..
15463- */
15464- if (is_initial_xendomain() &&
15465- phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
15466- return isa_bus_to_virt(phys_addr);
15467-
15468- /*
15469- * Mappings have to be page-aligned
15470- */
15471- offset = phys_addr & ~PAGE_MASK;
15472- phys_addr &= PAGE_MASK;
15473- size = PAGE_ALIGN(last_addr) - phys_addr;
15474-
15475- /*
15476- * Mappings have to fit in the FIX_BTMAP area.
15477- */
15478- nrpages = size >> PAGE_SHIFT;
15479- if (nrpages > NR_FIX_BTMAPS)
15480- return NULL;
15481-
15482- /*
15483- * Ok, go for it..
15484- */
15485- idx = FIX_BTMAP_BEGIN;
15486- while (nrpages > 0) {
15487- set_fixmap(idx, phys_addr);
15488- phys_addr += PAGE_SIZE;
15489- --idx;
15490- --nrpages;
15491- }
15492- return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
15493-}
15494-
15495-void __init bt_iounmap(void *addr, unsigned long size)
15496-{
15497- unsigned long virt_addr;
15498- unsigned long offset;
15499- unsigned int nrpages;
15500- enum fixed_addresses idx;
15501-
15502- virt_addr = (unsigned long)addr;
15503- if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
15504- return;
15505- if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15506- return;
15507- offset = virt_addr & ~PAGE_MASK;
15508- nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
15509-
15510- idx = FIX_BTMAP_BEGIN;
15511- while (nrpages > 0) {
15512- clear_fixmap(idx);
15513- --idx;
15514- --nrpages;
15515- }
15516-}
15517--- /dev/null
15518+++ b/arch/x86/mm/ioremap-xen.c
15519@@ -0,0 +1,685 @@
15520+/*
15521+ * Re-map IO memory to kernel address space so that we can access it.
15522+ * This is needed for high PCI addresses that aren't mapped in the
15523+ * 640k-1MB IO memory area on PC's
15524+ *
15525+ * (C) Copyright 1995 1996 Linus Torvalds
15526+ */
15527+
15528+#include <linux/bootmem.h>
15529+#include <linux/init.h>
15530+#include <linux/io.h>
15531+#include <linux/module.h>
15532+#include <linux/pfn.h>
15533+#include <linux/slab.h>
15534+#include <linux/vmalloc.h>
15535+
15536+#include <asm/cacheflush.h>
15537+#include <asm/e820.h>
15538+#include <asm/fixmap.h>
15539+#include <asm/pgtable.h>
15540+#include <asm/tlbflush.h>
15541+#include <asm/pgalloc.h>
15542+
15543+enum ioremap_mode {
15544+ IOR_MODE_UNCACHED,
15545+ IOR_MODE_CACHED,
15546+};
15547+
15548+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
15549+
15550+unsigned long __phys_addr(unsigned long x)
15551+{
15552+ if (x >= __START_KERNEL_map)
15553+ return x - __START_KERNEL_map + phys_base;
15554+ return x - PAGE_OFFSET;
15555+}
15556+EXPORT_SYMBOL(__phys_addr);
15557+
15558+#endif
15559+
15560+static int direct_remap_area_pte_fn(pte_t *pte,
15561+ struct page *pmd_page,
15562+ unsigned long address,
15563+ void *data)
15564+{
15565+ mmu_update_t **v = (mmu_update_t **)data;
15566+
15567+ BUG_ON(!pte_none(*pte));
15568+
15569+ (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15570+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15571+ (*v)++;
15572+
15573+ return 0;
15574+}
15575+
15576+static int __direct_remap_pfn_range(struct mm_struct *mm,
15577+ unsigned long address,
15578+ unsigned long mfn,
15579+ unsigned long size,
15580+ pgprot_t prot,
15581+ domid_t domid)
15582+{
15583+ int rc;
15584+ unsigned long i, start_address;
15585+ mmu_update_t *u, *v, *w;
15586+
15587+ u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15588+ if (u == NULL)
15589+ return -ENOMEM;
15590+
15591+ start_address = address;
15592+
15593+ flush_cache_all();
15594+
15595+ for (i = 0; i < size; i += PAGE_SIZE) {
15596+ if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15597+ /* Flush a full batch after filling in the PTE ptrs. */
15598+ rc = apply_to_page_range(mm, start_address,
15599+ address - start_address,
15600+ direct_remap_area_pte_fn, &w);
15601+ if (rc)
15602+ goto out;
15603+ rc = -EFAULT;
15604+ if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15605+ goto out;
15606+ v = w = u;
15607+ start_address = address;
15608+ }
15609+
15610+ /*
15611+ * Fill in the machine address: PTE ptr is done later by
15612+ * apply_to_page_range().
15613+ */
15614+ v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15615+
15616+ mfn++;
15617+ address += PAGE_SIZE;
15618+ v++;
15619+ }
15620+
15621+ if (v != u) {
15622+ /* Final batch. */
15623+ rc = apply_to_page_range(mm, start_address,
15624+ address - start_address,
15625+ direct_remap_area_pte_fn, &w);
15626+ if (rc)
15627+ goto out;
15628+ rc = -EFAULT;
15629+ if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15630+ goto out;
15631+ }
15632+
15633+ rc = 0;
15634+
15635+ out:
15636+ flush_tlb_all();
15637+
15638+ free_page((unsigned long)u);
15639+
15640+ return rc;
15641+}
15642+
15643+int direct_remap_pfn_range(struct vm_area_struct *vma,
15644+ unsigned long address,
15645+ unsigned long mfn,
15646+ unsigned long size,
15647+ pgprot_t prot,
15648+ domid_t domid)
15649+{
15650+ if (xen_feature(XENFEAT_auto_translated_physmap))
15651+ return remap_pfn_range(vma, address, mfn, size, prot);
15652+
15653+ if (domid == DOMID_SELF)
15654+ return -EINVAL;
15655+
15656+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15657+
15658+ vma->vm_mm->context.has_foreign_mappings = 1;
15659+
15660+ return __direct_remap_pfn_range(
15661+ vma->vm_mm, address, mfn, size, prot, domid);
15662+}
15663+EXPORT_SYMBOL(direct_remap_pfn_range);
15664+
15665+int direct_kernel_remap_pfn_range(unsigned long address,
15666+ unsigned long mfn,
15667+ unsigned long size,
15668+ pgprot_t prot,
15669+ domid_t domid)
15670+{
15671+ return __direct_remap_pfn_range(
15672+ &init_mm, address, mfn, size, prot, domid);
15673+}
15674+EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15675+
15676+static int lookup_pte_fn(
15677+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15678+{
15679+ uint64_t *ptep = (uint64_t *)data;
15680+ if (ptep)
15681+ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15682+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15683+ return 0;
15684+}
15685+
15686+int create_lookup_pte_addr(struct mm_struct *mm,
15687+ unsigned long address,
15688+ uint64_t *ptep)
15689+{
15690+ return apply_to_page_range(mm, address, PAGE_SIZE,
15691+ lookup_pte_fn, ptep);
15692+}
15693+
15694+EXPORT_SYMBOL(create_lookup_pte_addr);
15695+
15696+static int noop_fn(
15697+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15698+{
15699+ return 0;
15700+}
15701+
15702+int touch_pte_range(struct mm_struct *mm,
15703+ unsigned long address,
15704+ unsigned long size)
15705+{
15706+ return apply_to_page_range(mm, address, size, noop_fn, NULL);
15707+}
15708+
15709+EXPORT_SYMBOL(touch_pte_range);
15710+
15711+#ifdef CONFIG_X86_32
15712+int page_is_ram(unsigned long pagenr)
15713+{
15714+ unsigned long addr, end;
15715+ int i;
15716+
15717+#ifndef CONFIG_XEN
15718+ /*
15719+ * A special case is the first 4Kb of memory;
15720+ * This is a BIOS owned area, not kernel ram, but generally
15721+ * not listed as such in the E820 table.
15722+ */
15723+ if (pagenr == 0)
15724+ return 0;
15725+
15726+ /*
15727+ * Second special case: Some BIOSen report the PC BIOS
15728+ * area (640->1Mb) as ram even though it is not.
15729+ */
15730+ if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
15731+ pagenr < (BIOS_END >> PAGE_SHIFT))
15732+ return 0;
15733+#endif
15734+
15735+ for (i = 0; i < e820.nr_map; i++) {
15736+ /*
15737+ * Not usable memory:
15738+ */
15739+ if (e820.map[i].type != E820_RAM)
15740+ continue;
15741+ addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
15742+ end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
15743+
15744+
15745+ if ((pagenr >= addr) && (pagenr < end))
15746+ return 1;
15747+ }
15748+ return 0;
15749+}
15750+#endif
15751+
15752+/*
15753+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
15754+ * conflicts.
15755+ */
15756+static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
15757+ enum ioremap_mode mode)
15758+{
15759+ unsigned long nrpages = size >> PAGE_SHIFT;
15760+ int err;
15761+
15762+ switch (mode) {
15763+ case IOR_MODE_UNCACHED:
15764+ default:
15765+ err = set_memory_uc(vaddr, nrpages);
15766+ break;
15767+ case IOR_MODE_CACHED:
15768+ err = set_memory_wb(vaddr, nrpages);
15769+ break;
15770+ }
15771+
15772+ return err;
15773+}
15774+
15775+/*
15776+ * Remap an arbitrary physical address space into the kernel virtual
15777+ * address space. Needed when the kernel wants to access high addresses
15778+ * directly.
15779+ *
15780+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15781+ * have to convert them into an offset in a page-aligned mapping, but the
15782+ * caller shouldn't need to know that small detail.
15783+ */
15784+static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
15785+ enum ioremap_mode mode)
15786+{
15787+ unsigned long mfn, offset, last_addr, vaddr;
15788+ struct vm_struct *area;
15789+ pgprot_t prot;
15790+ domid_t domid = DOMID_IO;
15791+
15792+ /* Don't allow wraparound or zero size */
15793+ last_addr = phys_addr + size - 1;
15794+ if (!size || last_addr < phys_addr)
15795+ return NULL;
15796+
15797+ /*
15798+ * Don't remap the low PCI/ISA area, it's always mapped..
15799+ */
15800+ if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
15801+ return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
15802+
15803+ /*
15804+ * Don't allow anybody to remap normal RAM that we're using..
15805+ */
15806+ for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
15807+ unsigned long pfn = mfn_to_local_pfn(mfn);
15808+
15809+ if (pfn >= max_pfn)
15810+ continue;
15811+
15812+ domid = DOMID_SELF;
15813+
15814+ if (pfn >= max_pfn_mapped) /* bogus */
15815+ continue;
15816+
15817+ if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
15818+ return NULL;
15819+ }
15820+
15821+ switch (mode) {
15822+ case IOR_MODE_UNCACHED:
15823+ default:
15824+ /*
15825+ * FIXME: we will use UC MINUS for now, as video fb drivers
15826+ * depend on it. Upcoming ioremap_wc() will fix this behavior.
15827+ */
15828+ prot = PAGE_KERNEL_UC_MINUS;
15829+ break;
15830+ case IOR_MODE_CACHED:
15831+ prot = PAGE_KERNEL;
15832+ break;
15833+ }
15834+
15835+ /*
15836+ * Mappings have to be page-aligned
15837+ */
15838+ offset = phys_addr & ~PAGE_MASK;
15839+ phys_addr &= PAGE_MASK;
15840+ size = PAGE_ALIGN(last_addr+1) - phys_addr;
15841+
15842+ /*
15843+ * Ok, go for it..
15844+ */
15845+ area = get_vm_area(size, VM_IOREMAP | (mode << 20));
15846+ if (!area)
15847+ return NULL;
15848+ area->phys_addr = phys_addr;
15849+ vaddr = (unsigned long) area->addr;
15850+ if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
15851+ size, prot, domid)) {
15852+ free_vm_area(area);
15853+ return NULL;
15854+ }
15855+
15856+ if (ioremap_change_attr(vaddr, size, mode) < 0) {
15857+ iounmap((void __iomem *) vaddr);
15858+ return NULL;
15859+ }
15860+
15861+ return (void __iomem *) (vaddr + offset);
15862+}
15863+
15864+/**
15865+ * ioremap_nocache - map bus memory into CPU space
15866+ * @offset: bus address of the memory
15867+ * @size: size of the resource to map
15868+ *
15869+ * ioremap_nocache performs a platform specific sequence of operations to
15870+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
15871+ * writew/writel functions and the other mmio helpers. The returned
15872+ * address is not guaranteed to be usable directly as a virtual
15873+ * address.
15874+ *
15875+ * This version of ioremap ensures that the memory is marked uncachable
15876+ * on the CPU as well as honouring existing caching rules from things like
15877+ * the PCI bus. Note that there are other caches and buffers on many
15878+ * busses. In particular driver authors should read up on PCI writes
15879+ *
15880+ * It's useful if some control registers are in such an area and
15881+ * write combining or read caching is not desirable:
15882+ *
15883+ * Must be freed with iounmap.
15884+ */
15885+void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
15886+{
15887+ return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
15888+}
15889+EXPORT_SYMBOL(ioremap_nocache);
15890+
15891+void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
15892+{
15893+ return __ioremap(phys_addr, size, IOR_MODE_CACHED);
15894+}
15895+EXPORT_SYMBOL(ioremap_cache);
15896+
15897+/**
15898+ * iounmap - Free a IO remapping
15899+ * @addr: virtual address from ioremap_*
15900+ *
15901+ * Caller must ensure there is only one unmapping for the same pointer.
15902+ */
15903+void iounmap(volatile void __iomem *addr)
15904+{
15905+ struct vm_struct *p, *o;
15906+
15907+ if ((void __force *)addr <= high_memory)
15908+ return;
15909+
15910+ /*
15911+ * __ioremap special-cases the PCI/ISA range by not instantiating a
15912+ * vm_area and by simply returning an address into the kernel mapping
15913+ * of ISA space. So handle that here.
15914+ */
15915+ if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15916+ return;
15917+
15918+ addr = (volatile void __iomem *)
15919+ (PAGE_MASK & (unsigned long __force)addr);
15920+
15921+ /* Use the vm area unlocked, assuming the caller
15922+ ensures there isn't another iounmap for the same address
15923+ in parallel. Reuse of the virtual address is prevented by
15924+ leaving it in the global lists until we're done with it.
15925+ cpa takes care of the direct mappings. */
15926+ read_lock(&vmlist_lock);
15927+ for (p = vmlist; p; p = p->next) {
15928+ if (p->addr == addr)
15929+ break;
15930+ }
15931+ read_unlock(&vmlist_lock);
15932+
15933+ if (!p) {
15934+ printk(KERN_ERR "iounmap: bad address %p\n", addr);
15935+ dump_stack();
15936+ return;
15937+ }
15938+
15939+ if ((p->flags >> 20) != IOR_MODE_CACHED) {
15940+ unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
15941+ unsigned long mfn = p->phys_addr;
15942+ unsigned long va = (unsigned long)addr;
15943+
15944+ for (; n > 0; n--, mfn++, va += PAGE_SIZE)
15945+ if (mfn_to_local_pfn(mfn) < max_pfn)
15946+ set_memory_wb(va, 1);
15947+ }
15948+
15949+ /* Finally remove it */
15950+ o = remove_vm_area((void *)addr);
15951+ BUG_ON(p != o || o == NULL);
15952+ kfree(p);
15953+}
15954+EXPORT_SYMBOL(iounmap);
15955+
15956+int __initdata early_ioremap_debug;
15957+
15958+static int __init early_ioremap_debug_setup(char *str)
15959+{
15960+ early_ioremap_debug = 1;
15961+
15962+ return 0;
15963+}
15964+early_param("early_ioremap_debug", early_ioremap_debug_setup);
15965+
15966+static __initdata int after_paging_init;
15967+static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
15968+ __attribute__((aligned(PAGE_SIZE)));
15969+
15970+#ifdef CONFIG_X86_32
15971+static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
15972+{
15973+ /* Don't assume we're using swapper_pg_dir at this point */
15974+ pgd_t *base = __va(read_cr3());
15975+ pgd_t *pgd = &base[pgd_index(addr)];
15976+ pud_t *pud = pud_offset(pgd, addr);
15977+ pmd_t *pmd = pmd_offset(pud, addr);
15978+
15979+ return pmd;
15980+}
15981+#else
15982+#define early_ioremap_pmd early_get_pmd
15983+#define make_lowmem_page_readonly early_make_page_readonly
15984+#define make_lowmem_page_writable make_page_writable
15985+#endif
15986+
15987+static inline pte_t * __init early_ioremap_pte(unsigned long addr)
15988+{
15989+ return &bm_pte[pte_index(addr)];
15990+}
15991+
15992+void __init early_ioremap_init(void)
15993+{
15994+ pmd_t *pmd;
15995+
15996+ if (early_ioremap_debug)
15997+ printk(KERN_INFO "early_ioremap_init()\n");
15998+
15999+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
16000+ memset(bm_pte, 0, sizeof(bm_pte));
16001+ make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
16002+ pmd_populate_kernel(&init_mm, pmd, bm_pte);
16003+
16004+ /*
16005+ * The boot-ioremap range spans multiple pmds, for which
16006+ * we are not prepared:
16007+ */
16008+ if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
16009+ WARN_ON(1);
16010+ printk(KERN_WARNING "pmd %p != %p\n",
16011+ pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
16012+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
16013+ fix_to_virt(FIX_BTMAP_BEGIN));
16014+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
16015+ fix_to_virt(FIX_BTMAP_END));
16016+
16017+ printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
16018+ printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
16019+ FIX_BTMAP_BEGIN);
16020+ }
16021+}
16022+
16023+void __init early_ioremap_clear(void)
16024+{
16025+ pmd_t *pmd;
16026+
16027+ if (early_ioremap_debug)
16028+ printk(KERN_INFO "early_ioremap_clear()\n");
16029+
16030+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
16031+ pmd_clear(pmd);
16032+ make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
16033+ /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
16034+ __flush_tlb_all();
16035+}
16036+
16037+void __init early_ioremap_reset(void)
16038+{
16039+ enum fixed_addresses idx;
16040+ unsigned long addr, phys;
16041+ pte_t *pte;
16042+
16043+ after_paging_init = 1;
16044+ for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
16045+ addr = fix_to_virt(idx);
16046+ pte = early_ioremap_pte(addr);
16047+ if (pte_present(*pte)) {
16048+ phys = __pte_val(*pte) & PAGE_MASK;
16049+ set_fixmap(idx, phys);
16050+ }
16051+ }
16052+}
16053+
16054+static void __init __early_set_fixmap(enum fixed_addresses idx,
16055+ unsigned long phys, pgprot_t flags)
16056+{
16057+ unsigned long addr = __fix_to_virt(idx);
16058+ pte_t *pte;
16059+
16060+ if (idx >= __end_of_fixed_addresses) {
16061+ BUG();
16062+ return;
16063+ }
16064+ pte = early_ioremap_pte(addr);
16065+ if (pgprot_val(flags))
16066+ set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
16067+ else
16068+ pte_clear(NULL, addr, pte);
16069+ __flush_tlb_one(addr);
16070+}
16071+
16072+static inline void __init early_set_fixmap(enum fixed_addresses idx,
16073+ unsigned long phys)
16074+{
16075+ if (after_paging_init)
16076+ set_fixmap(idx, phys);
16077+ else
16078+ __early_set_fixmap(idx, phys, PAGE_KERNEL);
16079+}
16080+
16081+static inline void __init early_clear_fixmap(enum fixed_addresses idx)
16082+{
16083+ if (after_paging_init)
16084+ clear_fixmap(idx);
16085+ else
16086+ __early_set_fixmap(idx, 0, __pgprot(0));
16087+}
16088+
16089+
16090+int __initdata early_ioremap_nested;
16091+
16092+static int __init check_early_ioremap_leak(void)
16093+{
16094+ if (!early_ioremap_nested)
16095+ return 0;
16096+
16097+ printk(KERN_WARNING
16098+ "Debug warning: early ioremap leak of %d areas detected.\n",
16099+ early_ioremap_nested);
16100+ printk(KERN_WARNING
16101+ "please boot with early_ioremap_debug and report the dmesg.\n");
16102+ WARN_ON(1);
16103+
16104+ return 1;
16105+}
16106+late_initcall(check_early_ioremap_leak);
16107+
16108+void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
16109+{
16110+ unsigned long offset, last_addr;
16111+ unsigned int nrpages, nesting;
16112+ enum fixed_addresses idx0, idx;
16113+
16114+ WARN_ON(system_state != SYSTEM_BOOTING);
16115+
16116+ nesting = early_ioremap_nested;
16117+ if (early_ioremap_debug) {
16118+ printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
16119+ phys_addr, size, nesting);
16120+ dump_stack();
16121+ }
16122+
16123+ /* Don't allow wraparound or zero size */
16124+ last_addr = phys_addr + size - 1;
16125+ if (!size || last_addr < phys_addr) {
16126+ WARN_ON(1);
16127+ return NULL;
16128+ }
16129+
16130+ if (nesting >= FIX_BTMAPS_NESTING) {
16131+ WARN_ON(1);
16132+ return NULL;
16133+ }
16134+ early_ioremap_nested++;
16135+ /*
16136+ * Mappings have to be page-aligned
16137+ */
16138+ offset = phys_addr & ~PAGE_MASK;
16139+ phys_addr &= PAGE_MASK;
16140+ size = PAGE_ALIGN(last_addr) - phys_addr;
16141+
16142+ /*
16143+ * Mappings have to fit in the FIX_BTMAP area.
16144+ */
16145+ nrpages = size >> PAGE_SHIFT;
16146+ if (nrpages > NR_FIX_BTMAPS) {
16147+ WARN_ON(1);
16148+ return NULL;
16149+ }
16150+
16151+ /*
16152+ * Ok, go for it..
16153+ */
16154+ idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
16155+ idx = idx0;
16156+ while (nrpages > 0) {
16157+ early_set_fixmap(idx, phys_addr);
16158+ phys_addr += PAGE_SIZE;
16159+ --idx;
16160+ --nrpages;
16161+ }
16162+ if (early_ioremap_debug)
16163+ printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
16164+
16165+ return (void *) (offset + fix_to_virt(idx0));
16166+}
16167+
16168+void __init early_iounmap(void *addr, unsigned long size)
16169+{
16170+ unsigned long virt_addr;
16171+ unsigned long offset;
16172+ unsigned int nrpages;
16173+ enum fixed_addresses idx;
16174+ unsigned int nesting;
16175+
16176+ nesting = --early_ioremap_nested;
16177+ WARN_ON(nesting < 0);
16178+
16179+ if (early_ioremap_debug) {
16180+ printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
16181+ size, nesting);
16182+ dump_stack();
16183+ }
16184+
16185+ virt_addr = (unsigned long)addr;
16186+ if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
16187+ WARN_ON(1);
16188+ return;
16189+ }
16190+ offset = virt_addr & ~PAGE_MASK;
16191+ nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
16192+
16193+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
16194+ while (nrpages > 0) {
16195+ early_clear_fixmap(idx);
16196+ --idx;
16197+ --nrpages;
16198+ }
16199+}
16200+
16201+void __this_fixmap_does_not_exist(void)
16202+{
16203+ WARN_ON(1);
16204+}
16205--- a/arch/x86/mm/pageattr_64-xen.c
16206+++ /dev/null
16207@@ -1,542 +0,0 @@
16208-/*
16209- * Copyright 2002 Andi Kleen, SuSE Labs.
16210- * Thanks to Ben LaHaise for precious feedback.
16211- */
16212-
16213-#include <linux/mm.h>
16214-#include <linux/sched.h>
16215-#include <linux/highmem.h>
16216-#include <linux/module.h>
16217-#include <linux/slab.h>
16218-#include <asm/uaccess.h>
16219-#include <asm/processor.h>
16220-#include <asm/tlbflush.h>
16221-#include <asm/io.h>
16222-
16223-#ifdef CONFIG_XEN
16224-#include <asm/pgalloc.h>
16225-#include <asm/mmu_context.h>
16226-
16227-static void _pin_lock(struct mm_struct *mm, int lock) {
16228- if (lock)
16229- spin_lock(&mm->page_table_lock);
16230-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
16231- /* While mm->page_table_lock protects us against insertions and
16232- * removals of higher level page table pages, it doesn't protect
16233- * against updates of pte-s. Such updates, however, require the
16234- * pte pages to be in consistent state (unpinned+writable or
16235- * pinned+readonly). The pinning and attribute changes, however
16236- * cannot be done atomically, which is why such updates must be
16237- * prevented from happening concurrently.
16238- * Note that no pte lock can ever elsewhere be acquired nesting
16239- * with an already acquired one in the same mm, or with the mm's
16240- * page_table_lock already acquired, as that would break in the
16241- * non-split case (where all these are actually resolving to the
16242- * one page_table_lock). Thus acquiring all of them here is not
16243- * going to result in dead locks, and the order of acquires
16244- * doesn't matter.
16245- */
16246- {
16247- pgd_t *pgd = mm->pgd;
16248- unsigned g;
16249-
16250- for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16251- pud_t *pud;
16252- unsigned u;
16253-
16254- if (pgd_none(*pgd))
16255- continue;
16256- pud = pud_offset(pgd, 0);
16257- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16258- pmd_t *pmd;
16259- unsigned m;
16260-
16261- if (pud_none(*pud))
16262- continue;
16263- pmd = pmd_offset(pud, 0);
16264- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16265- spinlock_t *ptl;
16266-
16267- if (pmd_none(*pmd))
16268- continue;
16269- ptl = pte_lockptr(0, pmd);
16270- if (lock)
16271- spin_lock(ptl);
16272- else
16273- spin_unlock(ptl);
16274- }
16275- }
16276- }
16277- }
16278-#endif
16279- if (!lock)
16280- spin_unlock(&mm->page_table_lock);
16281-}
16282-#define pin_lock(mm) _pin_lock(mm, 1)
16283-#define pin_unlock(mm) _pin_lock(mm, 0)
16284-
16285-#define PIN_BATCH 8
16286-static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
16287-
16288-static inline unsigned int pgd_walk_set_prot(void *pt, pgprot_t flags,
16289- unsigned int cpu, unsigned int seq)
16290-{
16291- struct page *page = virt_to_page(pt);
16292- unsigned long pfn = page_to_pfn(page);
16293-
16294- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16295- (unsigned long)__va(pfn << PAGE_SHIFT),
16296- pfn_pte(pfn, flags), 0);
16297- if (unlikely(++seq == PIN_BATCH)) {
16298- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16299- PIN_BATCH, NULL)))
16300- BUG();
16301- seq = 0;
16302- }
16303-
16304- return seq;
16305-}
16306-
16307-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
16308-{
16309- pgd_t *pgd = pgd_base;
16310- pud_t *pud;
16311- pmd_t *pmd;
16312- pte_t *pte;
16313- int g,u,m;
16314- unsigned int cpu, seq;
16315- multicall_entry_t *mcl;
16316-
16317- cpu = get_cpu();
16318-
16319- /*
16320- * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
16321- * be the 'current' task's pagetables (e.g., current may be 32-bit,
16322- * but the pagetables may be for a 64-bit task).
16323- * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
16324- * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
16325- */
16326- for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16327- if (pgd_none(*pgd))
16328- continue;
16329- pud = pud_offset(pgd, 0);
16330- if (PTRS_PER_PUD > 1) /* not folded */
16331- seq = pgd_walk_set_prot(pud,flags,cpu,seq);
16332- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16333- if (pud_none(*pud))
16334- continue;
16335- pmd = pmd_offset(pud, 0);
16336- if (PTRS_PER_PMD > 1) /* not folded */
16337- seq = pgd_walk_set_prot(pmd,flags,cpu,seq);
16338- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16339- if (pmd_none(*pmd))
16340- continue;
16341- pte = pte_offset_kernel(pmd,0);
16342- seq = pgd_walk_set_prot(pte,flags,cpu,seq);
16343- }
16344- }
16345- }
16346-
16347- mcl = per_cpu(pb_mcl, cpu);
16348- if (unlikely(seq > PIN_BATCH - 2)) {
16349- if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
16350- BUG();
16351- seq = 0;
16352- }
16353- MULTI_update_va_mapping(mcl + seq,
16354- (unsigned long)__user_pgd(pgd_base),
16355- pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
16356- 0);
16357- MULTI_update_va_mapping(mcl + seq + 1,
16358- (unsigned long)pgd_base,
16359- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16360- UVMF_TLB_FLUSH);
16361- if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
16362- BUG();
16363-
16364- put_cpu();
16365-}
16366-
16367-static void __pgd_pin(pgd_t *pgd)
16368-{
16369- pgd_walk(pgd, PAGE_KERNEL_RO);
16370- xen_pgd_pin(__pa(pgd)); /* kernel */
16371- xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
16372- SetPagePinned(virt_to_page(pgd));
16373-}
16374-
16375-static void __pgd_unpin(pgd_t *pgd)
16376-{
16377- xen_pgd_unpin(__pa(pgd));
16378- xen_pgd_unpin(__pa(__user_pgd(pgd)));
16379- pgd_walk(pgd, PAGE_KERNEL);
16380- ClearPagePinned(virt_to_page(pgd));
16381-}
16382-
16383-void pgd_test_and_unpin(pgd_t *pgd)
16384-{
16385- if (PagePinned(virt_to_page(pgd)))
16386- __pgd_unpin(pgd);
16387-}
16388-
16389-void mm_pin(struct mm_struct *mm)
16390-{
16391- if (xen_feature(XENFEAT_writable_page_tables))
16392- return;
16393-
16394- pin_lock(mm);
16395- __pgd_pin(mm->pgd);
16396- pin_unlock(mm);
16397-}
16398-
16399-void mm_unpin(struct mm_struct *mm)
16400-{
16401- if (xen_feature(XENFEAT_writable_page_tables))
16402- return;
16403-
16404- pin_lock(mm);
16405- __pgd_unpin(mm->pgd);
16406- pin_unlock(mm);
16407-}
16408-
16409-void mm_pin_all(void)
16410-{
16411- struct page *page;
16412- unsigned long flags;
16413-
16414- if (xen_feature(XENFEAT_writable_page_tables))
16415- return;
16416-
16417- /*
16418- * Allow uninterrupted access to the pgd_list. Also protects
16419- * __pgd_pin() by disabling preemption.
16420- * All other CPUs must be at a safe point (e.g., in stop_machine
16421- * or offlined entirely).
16422- */
16423- spin_lock_irqsave(&pgd_lock, flags);
16424- list_for_each_entry(page, &pgd_list, lru) {
16425- if (!PagePinned(page))
16426- __pgd_pin((pgd_t *)page_address(page));
16427- }
16428- spin_unlock_irqrestore(&pgd_lock, flags);
16429-}
16430-
16431-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
16432-{
16433- if (!PagePinned(virt_to_page(mm->pgd)))
16434- mm_pin(mm);
16435-}
16436-
16437-void arch_exit_mmap(struct mm_struct *mm)
16438-{
16439- struct task_struct *tsk = current;
16440-
16441- task_lock(tsk);
16442-
16443- /*
16444- * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
16445- * *much* faster this way, as no tlb flushes means bigger wrpt batches.
16446- */
16447- if (tsk->active_mm == mm) {
16448- tsk->active_mm = &init_mm;
16449- atomic_inc(&init_mm.mm_count);
16450-
16451- switch_mm(mm, &init_mm, tsk);
16452-
16453- atomic_dec(&mm->mm_count);
16454- BUG_ON(atomic_read(&mm->mm_count) == 0);
16455- }
16456-
16457- task_unlock(tsk);
16458-
16459- if (PagePinned(virt_to_page(mm->pgd))
16460- && (atomic_read(&mm->mm_count) == 1)
16461- && !mm->context.has_foreign_mappings)
16462- mm_unpin(mm);
16463-}
16464-
16465-static void _pte_free(struct page *page, unsigned int order)
16466-{
16467- BUG_ON(order);
16468- pte_free(page);
16469-}
16470-
16471-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
16472-{
16473- struct page *pte;
16474-
16475- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
16476- if (pte) {
16477- SetPageForeign(pte, _pte_free);
16478- init_page_count(pte);
16479- }
16480- return pte;
16481-}
16482-
16483-void pte_free(struct page *pte)
16484-{
16485- unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
16486-
16487- if (!pte_write(*virt_to_ptep(va)))
16488- if (HYPERVISOR_update_va_mapping(
16489- va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
16490- BUG();
16491-
16492- ClearPageForeign(pte);
16493- init_page_count(pte);
16494-
16495- __free_page(pte);
16496-}
16497-#endif /* CONFIG_XEN */
16498-
16499-pte_t *lookup_address(unsigned long address)
16500-{
16501- pgd_t *pgd = pgd_offset_k(address);
16502- pud_t *pud;
16503- pmd_t *pmd;
16504- pte_t *pte;
16505- if (pgd_none(*pgd))
16506- return NULL;
16507- pud = pud_offset(pgd, address);
16508- if (!pud_present(*pud))
16509- return NULL;
16510- pmd = pmd_offset(pud, address);
16511- if (!pmd_present(*pmd))
16512- return NULL;
16513- if (pmd_large(*pmd))
16514- return (pte_t *)pmd;
16515- pte = pte_offset_kernel(pmd, address);
16516- if (pte && !pte_present(*pte))
16517- pte = NULL;
16518- return pte;
16519-}
16520-
16521-static struct page *split_large_page(unsigned long address, pgprot_t prot,
16522- pgprot_t ref_prot)
16523-{
16524- int i;
16525- unsigned long addr;
16526- struct page *base = alloc_pages(GFP_KERNEL, 0);
16527- pte_t *pbase;
16528- if (!base)
16529- return NULL;
16530- /*
16531- * page_private is used to track the number of entries in
16532- * the page table page have non standard attributes.
16533- */
16534- SetPagePrivate(base);
16535- page_private(base) = 0;
16536-
16537- address = __pa(address);
16538- addr = address & LARGE_PAGE_MASK;
16539- pbase = (pte_t *)page_address(base);
16540- for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
16541- pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
16542- addr == address ? prot : ref_prot);
16543- }
16544- return base;
16545-}
16546-
16547-void clflush_cache_range(void *adr, int size)
16548-{
16549- int i;
16550- for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
16551- clflush(adr+i);
16552-}
16553-
16554-static void flush_kernel_map(void *arg)
16555-{
16556- struct list_head *l = (struct list_head *)arg;
16557- struct page *pg;
16558-
16559- /* When clflush is available always use it because it is
16560- much cheaper than WBINVD. */
16561- /* clflush is still broken. Disable for now. */
16562- if (1 || !cpu_has_clflush)
16563- asm volatile("wbinvd" ::: "memory");
16564- else list_for_each_entry(pg, l, lru) {
16565- void *adr = page_address(pg);
16566- clflush_cache_range(adr, PAGE_SIZE);
16567- }
16568- __flush_tlb_all();
16569-}
16570-
16571-static inline void flush_map(struct list_head *l)
16572-{
16573- on_each_cpu(flush_kernel_map, l, 1, 1);
16574-}
16575-
16576-static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
16577-
16578-static inline void save_page(struct page *fpage)
16579-{
16580- if (!test_and_set_bit(PG_arch_1, &fpage->flags))
16581- list_add(&fpage->lru, &deferred_pages);
16582-}
16583-
16584-/*
16585- * No more special protections in this 2/4MB area - revert to a
16586- * large page again.
16587- */
16588-static void revert_page(unsigned long address, pgprot_t ref_prot)
16589-{
16590- pgd_t *pgd;
16591- pud_t *pud;
16592- pmd_t *pmd;
16593- pte_t large_pte;
16594- unsigned long pfn;
16595-
16596- pgd = pgd_offset_k(address);
16597- BUG_ON(pgd_none(*pgd));
16598- pud = pud_offset(pgd,address);
16599- BUG_ON(pud_none(*pud));
16600- pmd = pmd_offset(pud, address);
16601- BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
16602- pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
16603- large_pte = pfn_pte(pfn, ref_prot);
16604- large_pte = pte_mkhuge(large_pte);
16605- set_pte((pte_t *)pmd, large_pte);
16606-}
16607-
16608-static int
16609-__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
16610- pgprot_t ref_prot)
16611-{
16612- pte_t *kpte;
16613- struct page *kpte_page;
16614- pgprot_t ref_prot2;
16615-
16616- kpte = lookup_address(address);
16617- if (!kpte) return 0;
16618- kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
16619- BUG_ON(PageLRU(kpte_page));
16620- BUG_ON(PageCompound(kpte_page));
16621- if (pgprot_val(prot) != pgprot_val(ref_prot)) {
16622- if (!pte_huge(*kpte)) {
16623- set_pte(kpte, pfn_pte(pfn, prot));
16624- } else {
16625- /*
16626- * split_large_page will take the reference for this
16627- * change_page_attr on the split page.
16628- */
16629- struct page *split;
16630- ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
16631- split = split_large_page(address, prot, ref_prot2);
16632- if (!split)
16633- return -ENOMEM;
16634- pgprot_val(ref_prot2) &= ~_PAGE_NX;
16635- set_pte(kpte, mk_pte(split, ref_prot2));
16636- kpte_page = split;
16637- }
16638- page_private(kpte_page)++;
16639- } else if (!pte_huge(*kpte)) {
16640- set_pte(kpte, pfn_pte(pfn, ref_prot));
16641- BUG_ON(page_private(kpte_page) == 0);
16642- page_private(kpte_page)--;
16643- } else
16644- BUG();
16645-
16646- /* on x86-64 the direct mapping set at boot is not using 4k pages */
16647- /*
16648- * ..., but the XEN guest kernels (currently) do:
16649- * If the pte was reserved, it means it was created at boot
16650- * time (not via split_large_page) and in turn we must not
16651- * replace it with a large page.
16652- */
16653-#ifndef CONFIG_XEN
16654- BUG_ON(PageReserved(kpte_page));
16655-#else
16656- if (PageReserved(kpte_page))
16657- return 0;
16658-#endif
16659-
16660- save_page(kpte_page);
16661- if (page_private(kpte_page) == 0)
16662- revert_page(address, ref_prot);
16663- return 0;
16664-}
16665-
16666-/*
16667- * Change the page attributes of an page in the linear mapping.
16668- *
16669- * This should be used when a page is mapped with a different caching policy
16670- * than write-back somewhere - some CPUs do not like it when mappings with
16671- * different caching policies exist. This changes the page attributes of the
16672- * in kernel linear mapping too.
16673- *
16674- * The caller needs to ensure that there are no conflicting mappings elsewhere.
16675- * This function only deals with the kernel linear map.
16676- *
16677- * Caller must call global_flush_tlb() after this.
16678- */
16679-int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
16680-{
16681- int err = 0, kernel_map = 0;
16682- int i;
16683-
16684- if (address >= __START_KERNEL_map
16685- && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
16686- address = (unsigned long)__va(__pa(address));
16687- kernel_map = 1;
16688- }
16689-
16690- down_write(&init_mm.mmap_sem);
16691- for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
16692- unsigned long pfn = __pa(address) >> PAGE_SHIFT;
16693-
16694- if (!kernel_map || pte_present(pfn_pte(0, prot))) {
16695- err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
16696- if (err)
16697- break;
16698- }
16699- /* Handle kernel mapping too which aliases part of the
16700- * lowmem */
16701- if (__pa(address) < KERNEL_TEXT_SIZE) {
16702- unsigned long addr2;
16703- pgprot_t prot2;
16704- addr2 = __START_KERNEL_map + __pa(address);
16705- /* Make sure the kernel mappings stay executable */
16706- prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
16707- err = __change_page_attr(addr2, pfn, prot2,
16708- PAGE_KERNEL_EXEC);
16709- }
16710- }
16711- up_write(&init_mm.mmap_sem);
16712- return err;
16713-}
16714-
16715-/* Don't call this for MMIO areas that may not have a mem_map entry */
16716-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
16717-{
16718- unsigned long addr = (unsigned long)page_address(page);
16719- return change_page_attr_addr(addr, numpages, prot);
16720-}
16721-
16722-void global_flush_tlb(void)
16723-{
16724- struct page *pg, *next;
16725- struct list_head l;
16726-
16727- /*
16728- * Write-protect the semaphore, to exclude two contexts
16729- * doing a list_replace_init() call in parallel and to
16730- * exclude new additions to the deferred_pages list:
16731- */
16732- down_write(&init_mm.mmap_sem);
16733- list_replace_init(&deferred_pages, &l);
16734- up_write(&init_mm.mmap_sem);
16735-
16736- flush_map(&l);
16737-
16738- list_for_each_entry_safe(pg, next, &l, lru) {
16739- list_del(&pg->lru);
16740- clear_bit(PG_arch_1, &pg->flags);
16741- if (page_private(pg) != 0)
16742- continue;
16743- ClearPagePrivate(pg);
16744- __free_page(pg);
16745- }
16746-}
16747-
16748-EXPORT_SYMBOL(change_page_attr);
16749-EXPORT_SYMBOL(global_flush_tlb);
16750--- /dev/null
16751+++ b/arch/x86/mm/pageattr-xen.c
16752@@ -0,0 +1,1412 @@
16753+/*
16754+ * Copyright 2002 Andi Kleen, SuSE Labs.
16755+ * Thanks to Ben LaHaise for precious feedback.
16756+ */
16757+#include <linux/highmem.h>
16758+#include <linux/bootmem.h>
16759+#include <linux/module.h>
16760+#include <linux/sched.h>
16761+#include <linux/slab.h>
16762+#include <linux/mm.h>
16763+#include <linux/interrupt.h>
16764+
16765+#include <asm/e820.h>
16766+#include <asm/processor.h>
16767+#include <asm/tlbflush.h>
16768+#include <asm/sections.h>
16769+#include <asm/uaccess.h>
16770+#include <asm/pgalloc.h>
16771+#include <asm/proto.h>
16772+#include <asm/mmu_context.h>
16773+
16774+#ifndef CONFIG_X86_64
16775+#define TASK_SIZE64 TASK_SIZE
16776+#endif
16777+
16778+static void _pin_lock(struct mm_struct *mm, int lock) {
16779+ if (lock)
16780+ spin_lock(&mm->page_table_lock);
16781+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
16782+ /* While mm->page_table_lock protects us against insertions and
16783+ * removals of higher level page table pages, it doesn't protect
16784+ * against updates of pte-s. Such updates, however, require the
16785+ * pte pages to be in consistent state (unpinned+writable or
16786+ * pinned+readonly). The pinning and attribute changes, however
16787+ * cannot be done atomically, which is why such updates must be
16788+ * prevented from happening concurrently.
16789+ * Note that no pte lock can ever elsewhere be acquired nesting
16790+ * with an already acquired one in the same mm, or with the mm's
16791+ * page_table_lock already acquired, as that would break in the
16792+ * non-split case (where all these are actually resolving to the
16793+ * one page_table_lock). Thus acquiring all of them here is not
16794+	 * going to result in deadlocks, and the order of acquires
16795+ * doesn't matter.
16796+ */
16797+ {
16798+ pgd_t *pgd = mm->pgd;
16799+ unsigned g;
16800+
16801+ for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16802+ pud_t *pud;
16803+ unsigned u;
16804+
16805+ if (pgd_none(*pgd))
16806+ continue;
16807+ pud = pud_offset(pgd, 0);
16808+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16809+ pmd_t *pmd;
16810+ unsigned m;
16811+
16812+ if (pud_none(*pud))
16813+ continue;
16814+ pmd = pmd_offset(pud, 0);
16815+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16816+ spinlock_t *ptl;
16817+
16818+ if (pmd_none(*pmd))
16819+ continue;
16820+ ptl = pte_lockptr(0, pmd);
16821+ if (lock)
16822+ spin_lock(ptl);
16823+ else
16824+ spin_unlock(ptl);
16825+ }
16826+ }
16827+ }
16828+ }
16829+#endif
16830+ if (!lock)
16831+ spin_unlock(&mm->page_table_lock);
16832+}
16833+#define pin_lock(mm) _pin_lock(mm, 1)
16834+#define pin_unlock(mm) _pin_lock(mm, 0)
16835+
16836+#define PIN_BATCH sizeof(void *)
16837+static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
16838+
16839+static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
16840+ unsigned int cpu, unsigned int seq)
16841+{
16842+ unsigned long pfn = page_to_pfn(page);
16843+
16844+ if (PageHighMem(page)) {
16845+ if (pgprot_val(flags) & _PAGE_RW)
16846+ ClearPagePinned(page);
16847+ else
16848+ SetPagePinned(page);
16849+ } else {
16850+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16851+ (unsigned long)__va(pfn << PAGE_SHIFT),
16852+ pfn_pte(pfn, flags), 0);
16853+ if (unlikely(++seq == PIN_BATCH)) {
16854+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16855+ PIN_BATCH, NULL)))
16856+ BUG();
16857+ seq = 0;
16858+ }
16859+ }
16860+
16861+ return seq;
16862+}
16863+
16864+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
16865+{
16866+ pgd_t *pgd = pgd_base;
16867+ pud_t *pud;
16868+ pmd_t *pmd;
16869+ int g,u,m;
16870+ unsigned int cpu, seq;
16871+ multicall_entry_t *mcl;
16872+
16873+ if (xen_feature(XENFEAT_auto_translated_physmap))
16874+ return;
16875+
16876+ cpu = get_cpu();
16877+
16878+ /*
16879+ * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
16880+ * may not be the 'current' task's pagetables (e.g., current may be
16881+ * 32-bit, but the pagetables may be for a 64-bit task).
16882+ * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
16883+ * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
16884+ */
16885+ for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16886+ if (pgd_none(*pgd))
16887+ continue;
16888+ pud = pud_offset(pgd, 0);
16889+ if (PTRS_PER_PUD > 1) /* not folded */
16890+ seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
16891+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16892+ if (pud_none(*pud))
16893+ continue;
16894+ pmd = pmd_offset(pud, 0);
16895+ if (PTRS_PER_PMD > 1) /* not folded */
16896+ seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
16897+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16898+ if (pmd_none(*pmd))
16899+ continue;
16900+ seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
16901+ }
16902+ }
16903+ }
16904+
16905+ mcl = per_cpu(pb_mcl, cpu);
16906+#ifdef CONFIG_X86_64
16907+ if (unlikely(seq > PIN_BATCH - 2)) {
16908+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
16909+ BUG();
16910+ seq = 0;
16911+ }
16912+ MULTI_update_va_mapping(mcl + seq,
16913+ (unsigned long)__user_pgd(pgd_base),
16914+ pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
16915+ 0);
16916+ MULTI_update_va_mapping(mcl + seq + 1,
16917+ (unsigned long)pgd_base,
16918+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16919+ UVMF_TLB_FLUSH);
16920+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
16921+ BUG();
16922+#else
16923+ if (likely(seq != 0)) {
16924+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16925+ (unsigned long)pgd_base,
16926+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16927+ UVMF_TLB_FLUSH);
16928+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16929+ seq + 1, NULL)))
16930+ BUG();
16931+ } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
16932+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16933+ UVMF_TLB_FLUSH))
16934+ BUG();
16935+#endif
16936+
16937+ put_cpu();
16938+}
16939+
16940+static void __pgd_pin(pgd_t *pgd)
16941+{
16942+ pgd_walk(pgd, PAGE_KERNEL_RO);
16943+ kmap_flush_unused();
16944+ xen_pgd_pin(__pa(pgd)); /* kernel */
16945+#ifdef CONFIG_X86_64
16946+ xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
16947+#endif
16948+ SetPagePinned(virt_to_page(pgd));
16949+}
16950+
16951+static void __pgd_unpin(pgd_t *pgd)
16952+{
16953+ xen_pgd_unpin(__pa(pgd));
16954+#ifdef CONFIG_X86_64
16955+ xen_pgd_unpin(__pa(__user_pgd(pgd)));
16956+#endif
16957+ pgd_walk(pgd, PAGE_KERNEL);
16958+ ClearPagePinned(virt_to_page(pgd));
16959+}
16960+
16961+void pgd_test_and_unpin(pgd_t *pgd)
16962+{
16963+ if (PagePinned(virt_to_page(pgd)))
16964+ __pgd_unpin(pgd);
16965+}
16966+
16967+void mm_pin(struct mm_struct *mm)
16968+{
16969+ if (xen_feature(XENFEAT_writable_page_tables))
16970+ return;
16971+
16972+ pin_lock(mm);
16973+ __pgd_pin(mm->pgd);
16974+ pin_unlock(mm);
16975+}
16976+
16977+void mm_unpin(struct mm_struct *mm)
16978+{
16979+ if (xen_feature(XENFEAT_writable_page_tables))
16980+ return;
16981+
16982+ pin_lock(mm);
16983+ __pgd_unpin(mm->pgd);
16984+ pin_unlock(mm);
16985+}
16986+
16987+void mm_pin_all(void)
16988+{
16989+ struct page *page;
16990+ unsigned long flags;
16991+
16992+ if (xen_feature(XENFEAT_writable_page_tables))
16993+ return;
16994+
16995+ /*
16996+ * Allow uninterrupted access to the pgd_list. Also protects
16997+ * __pgd_pin() by disabling preemption.
16998+ * All other CPUs must be at a safe point (e.g., in stop_machine
16999+ * or offlined entirely).
17000+ */
17001+ spin_lock_irqsave(&pgd_lock, flags);
17002+ list_for_each_entry(page, &pgd_list, lru) {
17003+ if (!PagePinned(page))
17004+ __pgd_pin((pgd_t *)page_address(page));
17005+ }
17006+ spin_unlock_irqrestore(&pgd_lock, flags);
17007+}
17008+
17009+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
17010+{
17011+ if (!PagePinned(virt_to_page(mm->pgd)))
17012+ mm_pin(mm);
17013+}
17014+
17015+void arch_exit_mmap(struct mm_struct *mm)
17016+{
17017+ struct task_struct *tsk = current;
17018+
17019+ task_lock(tsk);
17020+
17021+ /*
17022+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
17023+ * *much* faster this way, as no tlb flushes means bigger wrpt batches.
17024+ */
17025+ if (tsk->active_mm == mm) {
17026+ tsk->active_mm = &init_mm;
17027+ atomic_inc(&init_mm.mm_count);
17028+
17029+ switch_mm(mm, &init_mm, tsk);
17030+
17031+ atomic_dec(&mm->mm_count);
17032+ BUG_ON(atomic_read(&mm->mm_count) == 0);
17033+ }
17034+
17035+ task_unlock(tsk);
17036+
17037+ if (PagePinned(virt_to_page(mm->pgd))
17038+ && atomic_read(&mm->mm_count) == 1
17039+ && !mm->context.has_foreign_mappings)
17040+ mm_unpin(mm);
17041+}
17042+
17043+static void _pte_free(struct page *page, unsigned int order)
17044+{
17045+ BUG_ON(order);
17046+ __pte_free(page);
17047+}
17048+
17049+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
17050+{
17051+ struct page *pte;
17052+
17053+#ifdef CONFIG_HIGHPTE
17054+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
17055+#else
17056+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17057+#endif
17058+ if (pte) {
17059+ pgtable_page_ctor(pte);
17060+ SetPageForeign(pte, _pte_free);
17061+ init_page_count(pte);
17062+ }
17063+ return pte;
17064+}
17065+
17066+void __pte_free(pgtable_t pte)
17067+{
17068+ if (!PageHighMem(pte)) {
17069+ unsigned long va = (unsigned long)page_address(pte);
17070+ unsigned int level;
17071+ pte_t *ptep = lookup_address(va, &level);
17072+
17073+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
17074+ if (!pte_write(*ptep)
17075+ && HYPERVISOR_update_va_mapping(va,
17076+ mk_pte(pte, PAGE_KERNEL),
17077+ 0))
17078+ BUG();
17079+ } else
17080+#ifdef CONFIG_HIGHPTE
17081+ ClearPagePinned(pte);
17082+#else
17083+ BUG();
17084+#endif
17085+
17086+ ClearPageForeign(pte);
17087+ init_page_count(pte);
17088+ pgtable_page_dtor(pte);
17089+ __free_page(pte);
17090+}
17091+
17092+#if PAGETABLE_LEVELS >= 3
17093+static void _pmd_free(struct page *page, unsigned int order)
17094+{
17095+ BUG_ON(order);
17096+ __pmd_free(page);
17097+}
17098+
17099+pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
17100+{
17101+ struct page *pmd;
17102+
17103+ pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17104+ if (!pmd)
17105+ return NULL;
17106+ SetPageForeign(pmd, _pmd_free);
17107+ init_page_count(pmd);
17108+ return page_address(pmd);
17109+}
17110+
17111+void __pmd_free(pgtable_t pmd)
17112+{
17113+ unsigned long va = (unsigned long)page_address(pmd);
17114+ unsigned int level;
17115+ pte_t *ptep = lookup_address(va, &level);
17116+
17117+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
17118+ if (!pte_write(*ptep)
17119+ && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
17120+ BUG();
17121+
17122+ ClearPageForeign(pmd);
17123+ init_page_count(pmd);
17124+ __free_page(pmd);
17125+}
17126+#endif
17127+
17128+/* blktap and gntdev need this, as otherwise they would implicitly (and
17129+ * needlessly, as they never use it) reference init_mm. */
17130+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
17131+ unsigned long addr, pte_t *ptep, int full)
17132+{
17133+ return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
17134+}
17135+EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
17136+
17137+/*
17138+ * The current flushing context - we pass it instead of 5 arguments:
17139+ */
17140+struct cpa_data {
17141+ unsigned long vaddr;
17142+ pgprot_t mask_set;
17143+ pgprot_t mask_clr;
17144+ int numpages;
17145+ int flushtlb;
17146+ unsigned long pfn;
17147+};
17148+
17149+#ifdef CONFIG_X86_64
17150+
17151+static inline unsigned long highmap_start_pfn(void)
17152+{
17153+ return __pa(_text) >> PAGE_SHIFT;
17154+}
17155+
17156+static inline unsigned long highmap_end_pfn(void)
17157+{
17158+ return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
17159+}
17160+
17161+#endif
17162+
17163+#ifdef CONFIG_DEBUG_PAGEALLOC
17164+# define debug_pagealloc 1
17165+#else
17166+# define debug_pagealloc 0
17167+#endif
17168+
17169+static inline int
17170+within(unsigned long addr, unsigned long start, unsigned long end)
17171+{
17172+ return addr >= start && addr < end;
17173+}
17174+
17175+/*
17176+ * Flushing functions
17177+ */
17178+
17179+/**
17180+ * clflush_cache_range - flush a cache range with clflush
17181+ * @addr: virtual start address
17182+ * @size: number of bytes to flush
17183+ *
17184+ * clflush is an unordered instruction which needs fencing with mfence
17185+ * to avoid ordering issues.
17186+ */
17187+void clflush_cache_range(void *vaddr, unsigned int size)
17188+{
17189+ void *vend = vaddr + size - 1;
17190+
17191+ mb();
17192+
17193+ for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
17194+ clflush(vaddr);
17195+ /*
17196+ * Flush any possible final partial cacheline:
17197+ */
17198+ clflush(vend);
17199+
17200+ mb();
17201+}
17202+
17203+static void __cpa_flush_all(void *arg)
17204+{
17205+ unsigned long cache = (unsigned long)arg;
17206+
17207+ /*
17208+	 * Flush all to work around errata in early Athlons regarding
17209+ * large page flushing.
17210+ */
17211+ __flush_tlb_all();
17212+
17213+ if (cache && boot_cpu_data.x86_model >= 4)
17214+ wbinvd();
17215+}
17216+
17217+static void cpa_flush_all(unsigned long cache)
17218+{
17219+ BUG_ON(irqs_disabled());
17220+
17221+ on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
17222+}
17223+
17224+static void __cpa_flush_range(void *arg)
17225+{
17226+ /*
17227+ * We could optimize that further and do individual per page
17228+ * tlb invalidates for a low number of pages. Caveat: we must
17229+ * flush the high aliases on 64bit as well.
17230+ */
17231+ __flush_tlb_all();
17232+}
17233+
17234+static void cpa_flush_range(unsigned long start, int numpages, int cache)
17235+{
17236+ unsigned int i, level;
17237+ unsigned long addr;
17238+
17239+ BUG_ON(irqs_disabled());
17240+ WARN_ON(PAGE_ALIGN(start) != start);
17241+
17242+ on_each_cpu(__cpa_flush_range, NULL, 1, 1);
17243+
17244+ if (!cache)
17245+ return;
17246+
17247+ /*
17248+	 * We only need to flush on one CPU;
17249+ * clflush is a MESI-coherent instruction that
17250+ * will cause all other CPUs to flush the same
17251+ * cachelines:
17252+ */
17253+ for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
17254+ pte_t *pte = lookup_address(addr, &level);
17255+
17256+ /*
17257+ * Only flush present addresses:
17258+ */
17259+ if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
17260+ clflush_cache_range((void *) addr, PAGE_SIZE);
17261+ }
17262+}
17263+
17264+/*
17265+ * Certain areas of memory on x86 require very specific protection flags,
17266+ * for example the BIOS area or kernel text. Callers don't always get this
17267+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
17268+ * checks and fixes these known static required protection bits.
17269+ */
17270+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
17271+ unsigned long pfn)
17272+{
17273+ pgprot_t forbidden = __pgprot(0);
17274+
17275+#ifndef CONFIG_XEN
17276+ /*
17277+ * The BIOS area between 640k and 1Mb needs to be executable for
17278+ * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
17279+ */
17280+ if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
17281+ pgprot_val(forbidden) |= _PAGE_NX;
17282+#endif
17283+
17284+ /*
17285+	 * The kernel text needs to be executable for obvious reasons.
17286+	 * It does not cover __inittext since that is gone later on. On
17287+ * 64bit we do not enforce !NX on the low mapping
17288+ */
17289+ if (within(address, (unsigned long)_text, (unsigned long)_etext))
17290+ pgprot_val(forbidden) |= _PAGE_NX;
17291+
17292+ /*
17293+ * The .rodata section needs to be read-only. Using the pfn
17294+ * catches all aliases.
17295+ */
17296+ if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
17297+ __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
17298+ pgprot_val(forbidden) |= _PAGE_RW;
17299+
17300+ prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
17301+
17302+ return prot;
17303+}
17304+
17305+/*
17306+ * Lookup the page table entry for a virtual address. Return a pointer
17307+ * to the entry and the level of the mapping.
17308+ *
17309+ * Note: We return pud and pmd either when the entry is marked large
17310+ * or when the present bit is not set. Otherwise we would return a
17311+ * pointer to a non-existent mapping.
17312+ */
17313+pte_t *lookup_address(unsigned long address, unsigned int *level)
17314+{
17315+ pgd_t *pgd = pgd_offset_k(address);
17316+ pud_t *pud;
17317+ pmd_t *pmd;
17318+
17319+ *level = PG_LEVEL_NONE;
17320+
17321+ if (pgd_none(*pgd))
17322+ return NULL;
17323+
17324+ pud = pud_offset(pgd, address);
17325+ if (pud_none(*pud))
17326+ return NULL;
17327+
17328+ *level = PG_LEVEL_1G;
17329+ if (pud_large(*pud) || !pud_present(*pud))
17330+ return (pte_t *)pud;
17331+
17332+ pmd = pmd_offset(pud, address);
17333+ if (pmd_none(*pmd))
17334+ return NULL;
17335+
17336+ *level = PG_LEVEL_2M;
17337+ if (pmd_large(*pmd) || !pmd_present(*pmd))
17338+ return (pte_t *)pmd;
17339+
17340+ *level = PG_LEVEL_4K;
17341+
17342+ return pte_offset_kernel(pmd, address);
17343+}
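/*
 * A hedged usage sketch of lookup_address(); the helper name and the
 * address it is applied to are assumptions for illustration. It mirrors
 * the pattern __pte_free() above uses: only trust the returned pte when
 * the reported level is PG_LEVEL_4K and the entry is present.
 */
static int example_is_4k_mapped(unsigned long va)
{
	unsigned int level;
	pte_t *pte = lookup_address(va, &level);

	return pte && level == PG_LEVEL_4K && pte_present(*pte);
}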
17344+
17345+/*
17346+ * Set the new pmd in all the pgds we know about:
17347+ */
17348+static void __set_pmd_pte(pte_t *kpte, unsigned long address,
17349+ unsigned int level, pte_t pte)
17350+{
17351+ /* change init_mm */
17352+ switch(level) {
17353+ case PG_LEVEL_2M:
17354+ xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte)));
17355+ break;
17356+#ifdef CONFIG_X86_64
17357+ case PG_LEVEL_1G:
17358+ xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte)));
17359+ break;
17360+#endif
17361+ default:
17362+ BUG();
17363+ }
17364+#ifdef CONFIG_X86_32
17365+ if (!SHARED_KERNEL_PMD) {
17366+ struct page *page;
17367+
17368+ list_for_each_entry(page, &pgd_list, lru) {
17369+ pgd_t *pgd;
17370+ pud_t *pud;
17371+ pmd_t *pmd;
17372+
17373+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
17374+ pud = pud_offset(pgd, address);
17375+ pmd = pmd_offset(pud, address);
17376+ xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte)));
17377+ }
17378+ }
17379+#endif
17380+}
17381+
17382+static int
17383+try_preserve_large_page(pte_t *kpte, unsigned long address,
17384+ struct cpa_data *cpa)
17385+{
17386+ unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
17387+ pte_t new_pte, old_pte, *tmp;
17388+ pgprot_t old_prot, new_prot;
17389+ int i, do_split = 1;
17390+ unsigned int level;
17391+
17392+ spin_lock_irqsave(&pgd_lock, flags);
17393+ /*
17394+ * Check for races, another CPU might have split this page
17395+ * up already:
17396+ */
17397+ tmp = lookup_address(address, &level);
17398+ if (tmp != kpte)
17399+ goto out_unlock;
17400+
17401+ switch (level) {
17402+ case PG_LEVEL_2M:
17403+ psize = PMD_PAGE_SIZE;
17404+ pmask = PMD_PAGE_MASK;
17405+ break;
17406+#ifdef CONFIG_X86_64
17407+ case PG_LEVEL_1G:
17408+ psize = PUD_PAGE_SIZE;
17409+ pmask = PUD_PAGE_MASK;
17410+ break;
17411+#endif
17412+ default:
17413+ do_split = -EINVAL;
17414+ goto out_unlock;
17415+ }
17416+
17417+ /*
17418+ * Calculate the number of pages, which fit into this large
17419+ * page starting at address:
17420+ */
17421+ nextpage_addr = (address + psize) & pmask;
17422+ numpages = (nextpage_addr - address) >> PAGE_SHIFT;
17423+ if (numpages < cpa->numpages)
17424+ cpa->numpages = numpages;
17425+
17426+ /*
17427+ * We are safe now. Check whether the new pgprot is the same:
17428+ */
17429+ old_pte = *kpte;
17430+ old_prot = new_prot = pte_pgprot(old_pte);
17431+
17432+ pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
17433+ pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
17434+
17435+ /*
17436+ * old_pte points to the large page base address. So we need
17437+ * to add the offset of the virtual address:
17438+ */
17439+ pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
17440+ cpa->pfn = pfn;
17441+
17442+ new_prot = static_protections(new_prot, address, pfn);
17443+
17444+ /*
17445+ * We need to check the full range, whether
17446+	 * static_protections() requires a different pgprot for one of
17447+ * the pages in the range we try to preserve:
17448+ */
17449+ if (pfn < max_mapnr) {
17450+ addr = address + PAGE_SIZE;
17451+ for (i = 1; i < cpa->numpages && ++pfn < max_mapnr;
17452+ i++, addr += PAGE_SIZE) {
17453+ pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
17454+
17455+ if (pgprot_val(chk_prot) != pgprot_val(new_prot))
17456+ goto out_unlock;
17457+ }
17458+ }
17459+
17460+ /*
17461+	 * If there are no changes, return. cpa->numpages has been updated
17462+ * above:
17463+ */
17464+ if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
17465+ do_split = 0;
17466+ goto out_unlock;
17467+ }
17468+
17469+ /*
17470+ * We need to change the attributes. Check, whether we can
17471+ * change the large page in one go. We request a split, when
17472+ * the address is not aligned and the number of pages is
17473+ * smaller than the number of pages in the large page. Note
17474+ * that we limited the number of possible pages already to
17475+ * the number of pages in the large page.
17476+ */
17477+ if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
17478+ /*
17479+ * The address is aligned and the number of pages
17480+ * covers the full page.
17481+ */
17482+ new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot));
17483+ __set_pmd_pte(kpte, address, level, new_pte);
17484+ cpa->flushtlb = 1;
17485+ do_split = 0;
17486+ }
17487+
17488+out_unlock:
17489+ spin_unlock_irqrestore(&pgd_lock, flags);
17490+
17491+ return do_split;
17492+}
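/*
 * A small standalone model of the clamping in try_preserve_large_page()
 * above; the concrete address and request size are assumptions chosen
 * only to make the arithmetic visible. A request that starts 16 pages
 * into a 2 MiB mapping can cover at most the 496 pages left before the
 * next 2 MiB boundary, so cpa->numpages is clamped accordingly.
 */
#include <stdio.h>

int main(void)
{
	unsigned long psize = 1UL << 21;	/* 2 MiB large page */
	unsigned long pmask = ~(psize - 1);
	unsigned long address = (3UL << 21) + (16UL << 12);
	unsigned long numpages = 1024;		/* what the caller asked for */
	unsigned long nextpage_addr = (address + psize) & pmask;
	unsigned long fit = (nextpage_addr - address) >> 12;

	if (fit < numpages)
		numpages = fit;
	printf("request clamped to %lu pages\n", numpages);	/* prints 496 */
	return 0;
}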
17493+
17494+static LIST_HEAD(page_pool);
17495+static unsigned long pool_size, pool_pages, pool_low;
17496+static unsigned long pool_used, pool_failed;
17497+
17498+static void cpa_fill_pool(struct page **ret)
17499+{
17500+ gfp_t gfp = GFP_KERNEL;
17501+ unsigned long flags;
17502+ struct page *p;
17503+
17504+ /*
17505+ * Avoid recursion (on debug-pagealloc) and also signal
17506+ * our priority to get to these pagetables:
17507+ */
17508+ if (current->flags & PF_MEMALLOC)
17509+ return;
17510+ current->flags |= PF_MEMALLOC;
17511+
17512+ /*
17513+ * Allocate atomically from atomic contexts:
17514+ */
17515+ if (in_atomic() || irqs_disabled() || debug_pagealloc)
17516+ gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
17517+
17518+ while (pool_pages < pool_size || (ret && !*ret)) {
17519+ p = alloc_pages(gfp, 0);
17520+ if (!p) {
17521+ pool_failed++;
17522+ break;
17523+ }
17524+ /*
17525+ * If the call site needs a page right now, provide it:
17526+ */
17527+ if (ret && !*ret) {
17528+ *ret = p;
17529+ continue;
17530+ }
17531+ spin_lock_irqsave(&pgd_lock, flags);
17532+ list_add(&p->lru, &page_pool);
17533+ pool_pages++;
17534+ spin_unlock_irqrestore(&pgd_lock, flags);
17535+ }
17536+
17537+ current->flags &= ~PF_MEMALLOC;
17538+}
17539+
17540+#define SHIFT_MB (20 - PAGE_SHIFT)
17541+#define ROUND_MB_GB ((1 << 10) - 1)
17542+#define SHIFT_MB_GB 10
17543+#define POOL_PAGES_PER_GB 16
17544+
17545+void __init cpa_init(void)
17546+{
17547+ struct sysinfo si;
17548+ unsigned long gb;
17549+
17550+ si_meminfo(&si);
17551+ /*
17552+ * Calculate the number of pool pages:
17553+ *
17554+ * Convert totalram (nr of pages) to MiB and round to the next
17555+	 * GiB. Shift MiB to GiB and multiply the result by
17556+ * POOL_PAGES_PER_GB:
17557+ */
17558+ if (debug_pagealloc) {
17559+ gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
17560+ pool_size = POOL_PAGES_PER_GB * gb;
17561+ } else {
17562+ pool_size = 1;
17563+ }
17564+ pool_low = pool_size;
17565+
17566+ cpa_fill_pool(NULL);
17567+ printk(KERN_DEBUG
17568+ "CPA: page pool initialized %lu of %lu pages preallocated\n",
17569+ pool_pages, pool_size);
17570+}
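/*
 * A standalone sketch of the pool sizing arithmetic in cpa_init() above,
 * assuming PAGE_SHIFT == 12 and 4 GiB of RAM; both values are assumptions
 * for the example, not taken from this patch. 1048576 pages of 4 KiB
 * shifted by SHIFT_MB (20 - 12 = 8) gives 4096 MiB, which rounds up to
 * 4 GiB, so 16 * 4 = 64 pages are preallocated.
 */
#include <stdio.h>

int main(void)
{
	unsigned long totalram = 1UL << 20;	/* 4 GiB in 4 KiB pages */
	unsigned long gb = ((totalram >> (20 - 12)) + ((1 << 10) - 1)) >> 10;

	printf("pool_size = %lu pages\n", 16 * gb);	/* prints 64 */
	return 0;
}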
17571+
17572+static int split_large_page(pte_t *kpte, unsigned long address)
17573+{
17574+ unsigned long flags, mfn, mfninc = 1;
17575+ unsigned int i, level;
17576+ pte_t *pbase, *tmp;
17577+ pgprot_t ref_prot;
17578+ struct page *base;
17579+
17580+ /*
17581+ * Get a page from the pool. The pool list is protected by the
17582+ * pgd_lock, which we have to take anyway for the split
17583+ * operation:
17584+ */
17585+ spin_lock_irqsave(&pgd_lock, flags);
17586+ if (list_empty(&page_pool)) {
17587+ spin_unlock_irqrestore(&pgd_lock, flags);
17588+ base = NULL;
17589+ cpa_fill_pool(&base);
17590+ if (!base)
17591+ return -ENOMEM;
17592+ spin_lock_irqsave(&pgd_lock, flags);
17593+ } else {
17594+ base = list_first_entry(&page_pool, struct page, lru);
17595+ list_del(&base->lru);
17596+ pool_pages--;
17597+
17598+ if (pool_pages < pool_low)
17599+ pool_low = pool_pages;
17600+ }
17601+
17602+ /*
17603+ * Check for races, another CPU might have split this page
17604+ * up for us already:
17605+ */
17606+ tmp = lookup_address(address, &level);
17607+ if (tmp != kpte)
17608+ goto out_unlock;
17609+
17610+ pbase = (pte_t *)page_address(base);
17611+#ifdef CONFIG_X86_32
17612+ paravirt_alloc_pt(&init_mm, page_to_pfn(base));
17613+#endif
17614+ ref_prot = pte_pgprot(pte_clrhuge(*kpte));
17615+
17616+#ifdef CONFIG_X86_64
17617+ if (level == PG_LEVEL_1G) {
17618+ mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
17619+ pgprot_val(ref_prot) |= _PAGE_PSE;
17620+ }
17621+#endif
17622+
17623+ /*
17624+ * Get the target mfn from the original entry:
17625+ */
17626+ mfn = __pte_mfn(*kpte);
17627+ for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc)
17628+ set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
17629+
17630+ /*
17631+ * Install the new, split up pagetable. Important details here:
17632+ *
17633+ * On Intel the NX bit of all levels must be cleared to make a
17634+	 * page executable (see section 4.13.2 of the Intel 64 and IA-32
17635+	 * Architectures Software Developer's Manual).
17636+ *
17637+ * Mark the entry present. The current mapping might be
17638+ * set to not present, which we preserved above.
17639+ */
17640+ if (HYPERVISOR_update_va_mapping((unsigned long)pbase,
17641+ mk_pte(base, PAGE_KERNEL_RO), 0))
17642+ BUG();
17643+ ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
17644+ pgprot_val(ref_prot) |= _PAGE_PRESENT;
17645+ __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot));
17646+ base = NULL;
17647+
17648+out_unlock:
17649+ /*
17650+ * If we dropped out via the lookup_address check under
17651+ * pgd_lock then stick the page back into the pool:
17652+ */
17653+ if (base) {
17654+ list_add(&base->lru, &page_pool);
17655+ pool_pages++;
17656+ } else
17657+ pool_used++;
17658+ spin_unlock_irqrestore(&pgd_lock, flags);
17659+
17660+ return 0;
17661+}
17662+
17663+static int __change_page_attr(struct cpa_data *cpa, int primary)
17664+{
17665+ unsigned long address = cpa->vaddr;
17666+ int do_split, err;
17667+ unsigned int level;
17668+ pte_t *kpte, old_pte;
17669+
17670+repeat:
17671+ kpte = lookup_address(address, &level);
17672+ if (!kpte)
17673+ return primary ? -EINVAL : 0;
17674+
17675+ old_pte = *kpte;
17676+ if (!__pte_val(old_pte)) {
17677+ if (!primary)
17678+ return 0;
17679+ printk(KERN_WARNING "CPA: called for zero pte. "
17680+ "vaddr = %lx cpa->vaddr = %lx\n", address,
17681+ cpa->vaddr);
17682+ WARN_ON(1);
17683+ return -EINVAL;
17684+ }
17685+
17686+ if (level == PG_LEVEL_4K) {
17687+ pte_t new_pte;
17688+ pgprot_t new_prot = pte_pgprot(old_pte);
17689+ unsigned long mfn = __pte_mfn(old_pte);
17690+
17691+ pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
17692+ pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
17693+
17694+ new_prot = static_protections(new_prot, address,
17695+ mfn_to_local_pfn(mfn));
17696+
17697+ /*
17698+ * We need to keep the mfn from the existing PTE,
17699+	 * after all we're only going to change its attributes,
17700+	 * not the memory it points to.
17701+ */
17702+ new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot));
17703+ cpa->pfn = mfn_to_local_pfn(mfn);
17704+ /*
17705+	 * Do we really change anything?
17706+ */
17707+ if (__pte_val(old_pte) != __pte_val(new_pte)) {
17708+ set_pte_atomic(kpte, new_pte);
17709+ cpa->flushtlb = 1;
17710+ }
17711+ cpa->numpages = 1;
17712+ return 0;
17713+ }
17714+
17715+ /*
17716+ * Check, whether we can keep the large page intact
17717+ * and just change the pte:
17718+ */
17719+ do_split = try_preserve_large_page(kpte, address, cpa);
17720+ /*
17721+ * When the range fits into the existing large page,
17722+	 * return. cpa->numpages and cpa->flushtlb have been updated in
17723+	 * try_preserve_large_page():
17724+ */
17725+ if (do_split <= 0)
17726+ return do_split;
17727+
17728+ /*
17729+ * We have to split the large page:
17730+ */
17731+ err = split_large_page(kpte, address);
17732+ if (!err) {
17733+ cpa->flushtlb = 1;
17734+ goto repeat;
17735+ }
17736+
17737+ return err;
17738+}
17739+
17740+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
17741+
17742+static int cpa_process_alias(struct cpa_data *cpa)
17743+{
17744+ struct cpa_data alias_cpa;
17745+ int ret = 0;
17746+
17747+ if (cpa->pfn > max_pfn_mapped)
17748+ return 0;
17749+
17750+ /*
17751+ * No need to redo, when the primary call touched the direct
17752+ * mapping already:
17753+ */
17754+ if (!within(cpa->vaddr, PAGE_OFFSET,
17755+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
17756+
17757+ alias_cpa = *cpa;
17758+ alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
17759+
17760+ ret = __change_page_attr_set_clr(&alias_cpa, 0);
17761+ }
17762+
17763+#ifdef CONFIG_X86_64
17764+ if (ret)
17765+ return ret;
17766+ /*
17767+ * No need to redo, when the primary call touched the high
17768+ * mapping already:
17769+ */
17770+ if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
17771+ return 0;
17772+
17773+ /*
17774+ * If the physical address is inside the kernel map, we need
17775+ * to touch the high mapped kernel as well:
17776+ */
17777+ if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
17778+ return 0;
17779+
17780+ alias_cpa = *cpa;
17781+ alias_cpa.vaddr =
17782+ (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map;
17783+
17784+ /*
17785+ * The high mapping range is imprecise, so ignore the return value.
17786+ */
17787+ __change_page_attr_set_clr(&alias_cpa, 0);
17788+#endif
17789+ return ret;
17790+}
17791+
17792+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
17793+{
17794+ int ret, numpages = cpa->numpages;
17795+
17796+ while (numpages) {
17797+ /*
17798+ * Store the remaining nr of pages for the large page
17799+ * preservation check.
17800+ */
17801+ cpa->numpages = numpages;
17802+
17803+ ret = __change_page_attr(cpa, checkalias);
17804+ if (ret)
17805+ return ret;
17806+
17807+ if (checkalias) {
17808+ ret = cpa_process_alias(cpa);
17809+ if (ret)
17810+ return ret;
17811+ }
17812+
17813+ /*
17814+ * Adjust the number of pages with the result of the
17815+ * CPA operation. Either a large page has been
17816+ * preserved or a single page update happened.
17817+ */
17818+ BUG_ON(cpa->numpages > numpages);
17819+ numpages -= cpa->numpages;
17820+ cpa->vaddr += cpa->numpages * PAGE_SIZE;
17821+ }
17822+ return 0;
17823+}
17824+
17825+static inline int cache_attr(pgprot_t attr)
17826+{
17827+ return pgprot_val(attr) &
17828+ (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
17829+}
17830+
17831+static int change_page_attr_set_clr(unsigned long addr, int numpages,
17832+ pgprot_t mask_set, pgprot_t mask_clr)
17833+{
17834+ struct cpa_data cpa;
17835+ int ret, cache, checkalias;
17836+
17837+ /*
17838+	 * Check whether we are requested to change a feature that is
17839+	 * not supported:
17840+ */
17841+ mask_set = canon_pgprot(mask_set);
17842+ mask_clr = canon_pgprot(mask_clr);
17843+ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
17844+ return 0;
17845+
17846+ /* Ensure we are PAGE_SIZE aligned */
17847+ if (addr & ~PAGE_MASK) {
17848+ addr &= PAGE_MASK;
17849+ /*
17850+ * People should not be passing in unaligned addresses:
17851+ */
17852+ WARN_ON_ONCE(1);
17853+ }
17854+
17855+ cpa.vaddr = addr;
17856+ cpa.numpages = numpages;
17857+ cpa.mask_set = mask_set;
17858+ cpa.mask_clr = mask_clr;
17859+ cpa.flushtlb = 0;
17860+
17861+ /* No alias checking for _NX bit modifications */
17862+ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
17863+
17864+ ret = __change_page_attr_set_clr(&cpa, checkalias);
17865+
17866+ /*
17867+ * Check whether we really changed something:
17868+ */
17869+ if (!cpa.flushtlb)
17870+ goto out;
17871+
17872+ /*
17873+ * No need to flush, when we did not set any of the caching
17874+ * attributes:
17875+ */
17876+ cache = cache_attr(mask_set);
17877+
17878+ /*
17879+ * On success we use clflush, when the CPU supports it to
17880+ * avoid the wbindv. If the CPU does not support it and in the
17881+ * error case we fall back to cpa_flush_all (which uses
17882+ * wbindv):
17883+ */
17884+ if (!ret && cpu_has_clflush)
17885+ cpa_flush_range(addr, numpages, cache);
17886+ else
17887+ cpa_flush_all(cache);
17888+
17889+out:
17890+ cpa_fill_pool(NULL);
17891+
17892+ return ret;
17893+}
17894+
17895+static inline int change_page_attr_set(unsigned long addr, int numpages,
17896+ pgprot_t mask)
17897+{
17898+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
17899+}
17900+
17901+static inline int change_page_attr_clear(unsigned long addr, int numpages,
17902+ pgprot_t mask)
17903+{
17904+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
17905+}
17906+
17907+int set_memory_uc(unsigned long addr, int numpages)
17908+{
17909+ return change_page_attr_set(addr, numpages,
17910+ __pgprot(_PAGE_PCD));
17911+}
17912+EXPORT_SYMBOL(set_memory_uc);
17913+
17914+int set_memory_wb(unsigned long addr, int numpages)
17915+{
17916+ return change_page_attr_clear(addr, numpages,
17917+ __pgprot(_PAGE_PCD | _PAGE_PWT));
17918+}
17919+EXPORT_SYMBOL(set_memory_wb);
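/*
 * A hedged usage sketch of the set_memory_uc()/set_memory_wb() pair
 * exported above; the buffer, its size, and the helper names are
 * assumptions for illustration. A driver would switch a page-aligned
 * buffer to uncached before handing it to a device and restore
 * write-back caching before freeing it.
 */
static int example_make_uncached(void *buf, int npages)
{
	return set_memory_uc((unsigned long)buf, npages);
}

static void example_restore_cached(void *buf, int npages)
{
	set_memory_wb((unsigned long)buf, npages);
}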
17920+
17921+int set_memory_x(unsigned long addr, int numpages)
17922+{
17923+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
17924+}
17925+EXPORT_SYMBOL(set_memory_x);
17926+
17927+int set_memory_nx(unsigned long addr, int numpages)
17928+{
17929+ return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
17930+}
17931+EXPORT_SYMBOL(set_memory_nx);
17932+
17933+int set_memory_ro(unsigned long addr, int numpages)
17934+{
17935+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
17936+}
17937+
17938+int set_memory_rw(unsigned long addr, int numpages)
17939+{
17940+ return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
17941+}
17942+
17943+int set_memory_np(unsigned long addr, int numpages)
17944+{
17945+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
17946+}
17947+
17948+int set_pages_uc(struct page *page, int numpages)
17949+{
17950+ unsigned long addr = (unsigned long)page_address(page);
17951+
17952+ return set_memory_uc(addr, numpages);
17953+}
17954+EXPORT_SYMBOL(set_pages_uc);
17955+
17956+int set_pages_wb(struct page *page, int numpages)
17957+{
17958+ unsigned long addr = (unsigned long)page_address(page);
17959+
17960+ return set_memory_wb(addr, numpages);
17961+}
17962+EXPORT_SYMBOL(set_pages_wb);
17963+
17964+int set_pages_x(struct page *page, int numpages)
17965+{
17966+ unsigned long addr = (unsigned long)page_address(page);
17967+
17968+ return set_memory_x(addr, numpages);
17969+}
17970+EXPORT_SYMBOL(set_pages_x);
17971+
17972+int set_pages_nx(struct page *page, int numpages)
17973+{
17974+ unsigned long addr = (unsigned long)page_address(page);
17975+
17976+ return set_memory_nx(addr, numpages);
17977+}
17978+EXPORT_SYMBOL(set_pages_nx);
17979+
17980+int set_pages_ro(struct page *page, int numpages)
17981+{
17982+ unsigned long addr = (unsigned long)page_address(page);
17983+
17984+ return set_memory_ro(addr, numpages);
17985+}
17986+
17987+int set_pages_rw(struct page *page, int numpages)
17988+{
17989+ unsigned long addr = (unsigned long)page_address(page);
17990+
17991+ return set_memory_rw(addr, numpages);
17992+}
17993+
17994+#ifdef CONFIG_DEBUG_PAGEALLOC
17995+
17996+static int __set_pages_p(struct page *page, int numpages)
17997+{
17998+ struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
17999+ .numpages = numpages,
18000+ .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
18001+ .mask_clr = __pgprot(0)};
18002+
18003+ return __change_page_attr_set_clr(&cpa, 1);
18004+}
18005+
18006+static int __set_pages_np(struct page *page, int numpages)
18007+{
18008+ struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
18009+ .numpages = numpages,
18010+ .mask_set = __pgprot(0),
18011+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
18012+
18013+ return __change_page_attr_set_clr(&cpa, 1);
18014+}
18015+
18016+void kernel_map_pages(struct page *page, int numpages, int enable)
18017+{
18018+ if (PageHighMem(page))
18019+ return;
18020+ if (!enable) {
18021+ debug_check_no_locks_freed(page_address(page),
18022+ numpages * PAGE_SIZE);
18023+ }
18024+
18025+ /*
18026+ * If page allocator is not up yet then do not call c_p_a():
18027+ */
18028+ if (!debug_pagealloc_enabled)
18029+ return;
18030+
18031+ /*
18032+ * The return value is ignored as the calls cannot fail.
18033+ * Large pages are kept enabled at boot time, and are
18034+ * split up quickly with DEBUG_PAGEALLOC. If a splitup
18035+ * fails here (due to temporary memory shortage) no damage
18036+ * is done because we just keep the largepage intact up
18037+ * to the next attempt when it will likely be split up:
18038+ */
18039+ if (enable)
18040+ __set_pages_p(page, numpages);
18041+ else
18042+ __set_pages_np(page, numpages);
18043+
18044+ /*
18045+ * We should perform an IPI and flush all tlbs,
18046+	 * but that can deadlock, so we flush only the current CPU:
18047+ */
18048+ __flush_tlb_all();
18049+
18050+ /*
18051+ * Try to refill the page pool here. We can do this only after
18052+ * the tlb flush.
18053+ */
18054+ cpa_fill_pool(NULL);
18055+}
18056+
18057+#ifdef CONFIG_HIBERNATION
18058+
18059+bool kernel_page_present(struct page *page)
18060+{
18061+ unsigned int level;
18062+ pte_t *pte;
18063+
18064+ if (PageHighMem(page))
18065+ return false;
18066+
18067+ pte = lookup_address((unsigned long)page_address(page), &level);
18068+ return (__pte_val(*pte) & _PAGE_PRESENT);
18069+}
18070+
18071+#endif /* CONFIG_HIBERNATION */
18072+
18073+#endif /* CONFIG_DEBUG_PAGEALLOC */
18074+
18075+static inline int in_secondary_range(unsigned long va)
18076+{
18077+#ifdef CONFIG_X86_64
18078+ return va >= VMALLOC_START && va < VMALLOC_END;
18079+#else
18080+ return va >= (unsigned long)high_memory;
18081+#endif
18082+}
18083+
18084+static void __make_page_readonly(unsigned long va)
18085+{
18086+ pte_t *pte;
18087+ unsigned int level;
18088+
18089+ pte = lookup_address(va, &level);
18090+ BUG_ON(!pte || level != PG_LEVEL_4K);
18091+ if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0))
18092+ BUG();
18093+ if (in_secondary_range(va)) {
18094+ unsigned long pfn = pte_pfn(*pte);
18095+
18096+#ifdef CONFIG_HIGHMEM
18097+ if (pfn >= highstart_pfn)
18098+ kmap_flush_unused(); /* flush stale writable kmaps */
18099+ else
18100+#endif
18101+ __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT));
18102+ }
18103+}
18104+
18105+static void __make_page_writable(unsigned long va)
18106+{
18107+ pte_t *pte;
18108+ unsigned int level;
18109+
18110+ pte = lookup_address(va, &level);
18111+ BUG_ON(!pte || level != PG_LEVEL_4K);
18112+ if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
18113+ BUG();
18114+ if (in_secondary_range(va)) {
18115+ unsigned long pfn = pte_pfn(*pte);
18116+
18117+#ifdef CONFIG_HIGHMEM
18118+ if (pfn < highstart_pfn)
18119+#endif
18120+ __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT));
18121+ }
18122+}
18123+
18124+void make_page_readonly(void *va, unsigned int feature)
18125+{
18126+ if (!xen_feature(feature))
18127+ __make_page_readonly((unsigned long)va);
18128+}
18129+
18130+void make_page_writable(void *va, unsigned int feature)
18131+{
18132+ if (!xen_feature(feature))
18133+ __make_page_writable((unsigned long)va);
18134+}
18135+
18136+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
18137+{
18138+ unsigned long addr;
18139+
18140+ if (xen_feature(feature))
18141+ return;
18142+
18143+ for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
18144+ __make_page_readonly(addr);
18145+}
18146+
18147+void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
18148+{
18149+ unsigned long addr;
18150+
18151+ if (xen_feature(feature))
18152+ return;
18153+
18154+ for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
18155+ __make_page_writable(addr);
18156+}
18157+
18158+/*
18159+ * The testcases use internal knowledge of the implementation that shouldn't
18160+ * be exposed to the rest of the kernel. Include these directly here.
18161+ */
18162+#ifdef CONFIG_CPA_DEBUG
18163+#include "pageattr-test.c"
18164+#endif
18165--- a/arch/x86/mm/pgtable_32-xen.c
18166+++ b/arch/x86/mm/pgtable_32-xen.c
18167@@ -29,8 +29,6 @@
18168 #include <xen/features.h>
18169 #include <asm/hypervisor.h>
18170
18171-static void pgd_test_and_unpin(pgd_t *pgd);
18172-
18173 void show_mem(void)
18174 {
18175 int total = 0, reserved = 0;
18176@@ -167,53 +165,6 @@ pte_t *pte_alloc_one_kernel(struct mm_st
18177 return pte;
18178 }
18179
18180-static void _pte_free(struct page *page, unsigned int order)
18181-{
18182- BUG_ON(order);
18183- pte_free(page);
18184-}
18185-
18186-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
18187-{
18188- struct page *pte;
18189-
18190-#ifdef CONFIG_HIGHPTE
18191- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
18192-#else
18193- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
18194-#endif
18195- if (pte) {
18196- SetPageForeign(pte, _pte_free);
18197- init_page_count(pte);
18198- }
18199- return pte;
18200-}
18201-
18202-void pte_free(struct page *pte)
18203-{
18204- unsigned long pfn = page_to_pfn(pte);
18205-
18206- if (!PageHighMem(pte)) {
18207- unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
18208-
18209- if (!pte_write(*virt_to_ptep(va)))
18210- if (HYPERVISOR_update_va_mapping(
18211- va, pfn_pte(pfn, PAGE_KERNEL), 0))
18212- BUG();
18213- } else
18214- ClearPagePinned(pte);
18215-
18216- ClearPageForeign(pte);
18217- init_page_count(pte);
18218-
18219- __free_page(pte);
18220-}
18221-
18222-void pmd_ctor(struct kmem_cache *cache, void *pmd)
18223-{
18224- memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18225-}
18226-
18227 /*
18228 * List of all pgd's needed for non-PAE so it can invalidate entries
18229 * in both cached and uncached pgd's; not needed for PAE since the
18230@@ -224,224 +175,191 @@ void pmd_ctor(struct kmem_cache *cache,
18231 * vmalloc faults work because attached pagetables are never freed.
18232 * -- wli
18233 */
18234-DEFINE_SPINLOCK(pgd_lock);
18235-struct page *pgd_list;
18236-
18237 static inline void pgd_list_add(pgd_t *pgd)
18238 {
18239 struct page *page = virt_to_page(pgd);
18240- page->index = (unsigned long)pgd_list;
18241- if (pgd_list)
18242- set_page_private(pgd_list, (unsigned long)&page->index);
18243- pgd_list = page;
18244- set_page_private(page, (unsigned long)&pgd_list);
18245+
18246+ list_add(&page->lru, &pgd_list);
18247 }
18248
18249 static inline void pgd_list_del(pgd_t *pgd)
18250 {
18251- struct page *next, **pprev, *page = virt_to_page(pgd);
18252- next = (struct page *)page->index;
18253- pprev = (struct page **)page_private(page);
18254- *pprev = next;
18255- if (next)
18256- set_page_private(next, (unsigned long)pprev);
18257-}
18258+ struct page *page = virt_to_page(pgd);
18259
18260+ list_del(&page->lru);
18261+}
18262
18263+#define UNSHARED_PTRS_PER_PGD \
18264+ (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18265
18266-#if (PTRS_PER_PMD == 1)
18267-/* Non-PAE pgd constructor */
18268-static void pgd_ctor(void *pgd)
18269+static void pgd_ctor(void *p)
18270 {
18271+ pgd_t *pgd = p;
18272 unsigned long flags;
18273
18274- /* !PAE, no pagetable sharing */
18275+ pgd_test_and_unpin(pgd);
18276+
18277+ /* Clear usermode parts of PGD */
18278 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18279
18280 spin_lock_irqsave(&pgd_lock, flags);
18281
18282- /* must happen under lock */
18283- clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18284- swapper_pg_dir + USER_PTRS_PER_PGD,
18285- KERNEL_PGD_PTRS);
18286-
18287- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18288- __pa(swapper_pg_dir) >> PAGE_SHIFT,
18289- USER_PTRS_PER_PGD,
18290- KERNEL_PGD_PTRS);
18291- pgd_list_add(pgd);
18292- spin_unlock_irqrestore(&pgd_lock, flags);
18293-}
18294-#else /* PTRS_PER_PMD > 1 */
18295-/* PAE pgd constructor */
18296-static void pgd_ctor(void *pgd)
18297-{
18298- /* PAE, kernel PMD may be shared */
18299-
18300- if (SHARED_KERNEL_PMD) {
18301- clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18302+ /* If the pgd points to a shared pagetable level (either the
18303+ ptes in non-PAE, or shared PMD in PAE), then just copy the
18304+ references from swapper_pg_dir. */
18305+ if (PAGETABLE_LEVELS == 2 ||
18306+ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
18307+ clone_pgd_range(pgd + USER_PTRS_PER_PGD,
18308 swapper_pg_dir + USER_PTRS_PER_PGD,
18309 KERNEL_PGD_PTRS);
18310- } else {
18311- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18312+ paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18313+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
18314+ USER_PTRS_PER_PGD,
18315+ KERNEL_PGD_PTRS);
18316 }
18317+
18318+ /* list required to sync kernel mapping updates */
18319+ if (PAGETABLE_LEVELS == 2)
18320+ pgd_list_add(pgd);
18321+
18322+ spin_unlock_irqrestore(&pgd_lock, flags);
18323 }
18324-#endif /* PTRS_PER_PMD */
18325
18326 static void pgd_dtor(void *pgd)
18327 {
18328 unsigned long flags; /* can be called from interrupt context */
18329
18330- if (SHARED_KERNEL_PMD)
18331- return;
18332-
18333- paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
18334- spin_lock_irqsave(&pgd_lock, flags);
18335- pgd_list_del(pgd);
18336- spin_unlock_irqrestore(&pgd_lock, flags);
18337+ if (!SHARED_KERNEL_PMD) {
18338+ spin_lock_irqsave(&pgd_lock, flags);
18339+ pgd_list_del(pgd);
18340+ spin_unlock_irqrestore(&pgd_lock, flags);
18341+ }
18342
18343 pgd_test_and_unpin(pgd);
18344 }
18345
18346-#define UNSHARED_PTRS_PER_PGD \
18347- (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18348-
18349-/* If we allocate a pmd for part of the kernel address space, then
18350- make sure its initialized with the appropriate kernel mappings.
18351- Otherwise use a cached zeroed pmd. */
18352-static pmd_t *pmd_cache_alloc(int idx)
18353+#ifdef CONFIG_X86_PAE
18354+/*
18355+ * Mop up any pmd pages which may still be attached to the pgd.
18356+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
18357+ * preallocate which never got a corresponding vma will need to be
18358+ * freed manually.
18359+ */
18360+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18361 {
18362- pmd_t *pmd;
18363+ int i;
18364
18365- if (idx >= USER_PTRS_PER_PGD) {
18366- pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
18367+ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
18368+ pgd_t pgd = pgdp[i];
18369
18370-#ifndef CONFIG_XEN
18371- if (pmd)
18372- memcpy(pmd,
18373- (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
18374- sizeof(pmd_t) * PTRS_PER_PMD);
18375-#endif
18376- } else
18377- pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18378+ if (__pgd_val(pgd) != 0) {
18379+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
18380
18381- return pmd;
18382-}
18383+ pgdp[i] = xen_make_pgd(0);
18384
18385-static void pmd_cache_free(pmd_t *pmd, int idx)
18386-{
18387- if (idx >= USER_PTRS_PER_PGD) {
18388- make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
18389- memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18390- free_page((unsigned long)pmd);
18391- } else
18392- kmem_cache_free(pmd_cache, pmd);
18393+ paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
18394+ pmd_free(mm, pmd);
18395+ }
18396+ }
18397 }
18398
18399-pgd_t *pgd_alloc(struct mm_struct *mm)
18400+/*
18401+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
18402+ * updating the top-level pagetable entries to guarantee the
18403+ * processor notices the update. Since this is expensive, and
18404+ * all 4 top-level entries are used almost immediately in a
18405+ * new process's life, we just pre-populate them here.
18406+ *
18407+ * Also, if we're in a paravirt environment where the kernel pmd is
18408+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
18409+ * and initialize the kernel pmds here.
18410+ */
18411+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18412 {
18413+ pud_t *pud;
18414+ pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
18415+ unsigned long addr, flags;
18416 int i;
18417- pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
18418- pmd_t **pmds = NULL;
18419- unsigned long flags;
18420-
18421- pgd_test_and_unpin(pgd);
18422-
18423- if (PTRS_PER_PMD == 1 || !pgd)
18424- return pgd;
18425-
18426-#ifdef CONFIG_XEN
18427- if (!SHARED_KERNEL_PMD) {
18428- /*
18429- * We can race save/restore (if we sleep during a GFP_KERNEL memory
18430- * allocation). We therefore store virtual addresses of pmds as they
18431- * do not change across save/restore, and poke the machine addresses
18432- * into the pgdir under the pgd_lock.
18433- */
18434- pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
18435- if (!pmds) {
18436- quicklist_free(0, pgd_dtor, pgd);
18437- return NULL;
18438- }
18439- }
18440-#endif
18441
18442- /* Allocate pmds, remember virtual addresses. */
18443- for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18444- pmd_t *pmd = pmd_cache_alloc(i);
18445-
18446- if (!pmd)
18447+ /*
18448+ * We can race save/restore (if we sleep during a GFP_KERNEL memory
18449+ * allocation). We therefore store virtual addresses of pmds as they
18450+ * do not change across save/restore, and poke the machine addresses
18451+ * into the pgdir under the pgd_lock.
18452+ */
18453+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
18454+ pmds[i] = pmd_alloc_one(mm, addr);
18455+ if (!pmds[i])
18456 goto out_oom;
18457-
18458- paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
18459- if (pmds)
18460- pmds[i] = pmd;
18461- else
18462- set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
18463 }
18464
18465-#ifdef CONFIG_XEN
18466- if (SHARED_KERNEL_PMD)
18467- return pgd;
18468-
18469 spin_lock_irqsave(&pgd_lock, flags);
18470
18471 /* Protect against save/restore: move below 4GB under pgd_lock. */
18472- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
18473- int rc = xen_create_contiguous_region(
18474- (unsigned long)pgd, 0, 32);
18475- if (rc) {
18476- spin_unlock_irqrestore(&pgd_lock, flags);
18477- goto out_oom;
18478- }
18479+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
18480+ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
18481+ spin_unlock_irqrestore(&pgd_lock, flags);
18482+out_oom:
18483+ while (i--)
18484+ pmd_free(mm, pmds[i]);
18485+ return 0;
18486 }
18487
18488 /* Copy kernel pmd contents and write-protect the new pmds. */
18489- for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18490- memcpy(pmds[i],
18491- (void *)pgd_page_vaddr(swapper_pg_dir[i]),
18492- sizeof(pmd_t) * PTRS_PER_PMD);
18493- make_lowmem_page_readonly(
18494- pmds[i], XENFEAT_writable_page_tables);
18495- }
18496+ pud = pud_offset(pgd, 0);
18497+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
18498+ i++, pud++, addr += PUD_SIZE) {
18499+ if (i >= USER_PTRS_PER_PGD) {
18500+ memcpy(pmds[i],
18501+ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
18502+ sizeof(pmd_t) * PTRS_PER_PMD);
18503+ make_lowmem_page_readonly(
18504+ pmds[i], XENFEAT_writable_page_tables);
18505+ }
18506
18507- /* It is safe to poke machine addresses of pmds under the pmd_lock. */
18508- for (i = 0; i < PTRS_PER_PGD; i++)
18509- set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
18510+ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
18511+ pud_populate(mm, pud, pmds[i]);
18512+ }
18513
18514- /* Ensure this pgd gets picked up and pinned on save/restore. */
18515+ /* List required to sync kernel mapping updates and
18516+ * to pin/unpin on save/restore. */
18517 pgd_list_add(pgd);
18518
18519 spin_unlock_irqrestore(&pgd_lock, flags);
18520
18521- kfree(pmds);
18522-#endif
18523+ return 1;
18524+}
18525+#else /* !CONFIG_X86_PAE */
18526+/* No need to prepopulate any pagetable entries in non-PAE modes. */
18527+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18528+{
18529+ return 1;
18530+}
18531
18532- return pgd;
18533+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18534+{
18535+}
18536+#endif /* CONFIG_X86_PAE */
18537
18538-out_oom:
18539- if (!pmds) {
18540- for (i--; i >= 0; i--) {
18541- pgd_t pgdent = pgd[i];
18542- void* pmd = (void *)__va(pgd_val(pgdent)-1);
18543- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18544- pmd_cache_free(pmd, i);
18545- }
18546- } else {
18547- for (i--; i >= 0; i--) {
18548- paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
18549- pmd_cache_free(pmds[i], i);
18550- }
18551- kfree(pmds);
18552+pgd_t *pgd_alloc(struct mm_struct *mm)
18553+{
18554+ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
18555+
18556+ /* so that alloc_pd can use it */
18557+ mm->pgd = pgd;
18558+ if (pgd)
18559+ pgd_ctor(pgd);
18560+
18561+ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
18562+ free_page((unsigned long)pgd);
18563+ pgd = NULL;
18564 }
18565- quicklist_free(0, pgd_dtor, pgd);
18566- return NULL;
18567+
18568+ return pgd;
18569 }
18570
18571-void pgd_free(pgd_t *pgd)
18572+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
18573 {
18574- int i;
18575-
18576 /*
18577 * After this the pgd should not be pinned for the duration of this
18578 * function's execution. We should never sleep and thus never race:
18579@@ -450,39 +368,43 @@ void pgd_free(pgd_t *pgd)
18580 * 2. The machine addresses in PGD entries will not become invalid
18581 * due to a concurrent save/restore.
18582 */
18583- pgd_test_and_unpin(pgd);
18584+ pgd_dtor(pgd);
18585
18586- /* in the PAE case user pgd entries are overwritten before usage */
18587- if (PTRS_PER_PMD > 1) {
18588- for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18589- pgd_t pgdent = pgd[i];
18590- void* pmd = (void *)__va(pgd_val(pgdent)-1);
18591- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18592- pmd_cache_free(pmd, i);
18593- }
18594+ if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
18595+ xen_destroy_contiguous_region((unsigned long)pgd, 0);
18596
18597- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
18598- xen_destroy_contiguous_region((unsigned long)pgd, 0);
18599- }
18600+ pgd_mop_up_pmds(mm, pgd);
18601+ free_page((unsigned long)pgd);
18602+}
18603
18604- /* in the non-PAE case, free_pgtables() clears user pgd entries */
18605- quicklist_free(0, pgd_dtor, pgd);
18606+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
18607+{
18608+ pgtable_page_dtor(pte);
18609+ paravirt_release_pt(page_to_pfn(pte));
18610+ tlb_remove_page(tlb, pte);
18611 }
18612
18613-void check_pgt_cache(void)
18614+#ifdef CONFIG_X86_PAE
18615+
18616+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
18617 {
18618- quicklist_trim(0, pgd_dtor, 25, 16);
18619+ paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18620+ tlb_remove_page(tlb, virt_to_page(pmd));
18621 }
18622
18623+#endif
18624+
18625 void make_lowmem_page_readonly(void *va, unsigned int feature)
18626 {
18627 pte_t *pte;
18628+ unsigned int level;
18629 int rc;
18630
18631 if (xen_feature(feature))
18632 return;
18633
18634- pte = virt_to_ptep(va);
18635+ pte = lookup_address((unsigned long)va, &level);
18636+ BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18637 rc = HYPERVISOR_update_va_mapping(
18638 (unsigned long)va, pte_wrprotect(*pte), 0);
18639 BUG_ON(rc);
18640@@ -491,313 +413,15 @@ void make_lowmem_page_readonly(void *va,
18641 void make_lowmem_page_writable(void *va, unsigned int feature)
18642 {
18643 pte_t *pte;
18644+ unsigned int level;
18645 int rc;
18646
18647 if (xen_feature(feature))
18648 return;
18649
18650- pte = virt_to_ptep(va);
18651+ pte = lookup_address((unsigned long)va, &level);
18652+ BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18653 rc = HYPERVISOR_update_va_mapping(
18654 (unsigned long)va, pte_mkwrite(*pte), 0);
18655 BUG_ON(rc);
18656 }
18657-
18658-void make_page_readonly(void *va, unsigned int feature)
18659-{
18660- pte_t *pte;
18661- int rc;
18662-
18663- if (xen_feature(feature))
18664- return;
18665-
18666- pte = virt_to_ptep(va);
18667- rc = HYPERVISOR_update_va_mapping(
18668- (unsigned long)va, pte_wrprotect(*pte), 0);
18669- if (rc) /* fallback? */
18670- xen_l1_entry_update(pte, pte_wrprotect(*pte));
18671- if ((unsigned long)va >= (unsigned long)high_memory) {
18672- unsigned long pfn = pte_pfn(*pte);
18673-#ifdef CONFIG_HIGHMEM
18674- if (pfn >= highstart_pfn)
18675- kmap_flush_unused(); /* flush stale writable kmaps */
18676- else
18677-#endif
18678- make_lowmem_page_readonly(
18679- phys_to_virt(pfn << PAGE_SHIFT), feature);
18680- }
18681-}
18682-
18683-void make_page_writable(void *va, unsigned int feature)
18684-{
18685- pte_t *pte;
18686- int rc;
18687-
18688- if (xen_feature(feature))
18689- return;
18690-
18691- pte = virt_to_ptep(va);
18692- rc = HYPERVISOR_update_va_mapping(
18693- (unsigned long)va, pte_mkwrite(*pte), 0);
18694- if (rc) /* fallback? */
18695- xen_l1_entry_update(pte, pte_mkwrite(*pte));
18696- if ((unsigned long)va >= (unsigned long)high_memory) {
18697- unsigned long pfn = pte_pfn(*pte);
18698-#ifdef CONFIG_HIGHMEM
18699- if (pfn < highstart_pfn)
18700-#endif
18701- make_lowmem_page_writable(
18702- phys_to_virt(pfn << PAGE_SHIFT), feature);
18703- }
18704-}
18705-
18706-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
18707-{
18708- if (xen_feature(feature))
18709- return;
18710-
18711- while (nr-- != 0) {
18712- make_page_readonly(va, feature);
18713- va = (void *)((unsigned long)va + PAGE_SIZE);
18714- }
18715-}
18716-
18717-void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
18718-{
18719- if (xen_feature(feature))
18720- return;
18721-
18722- while (nr-- != 0) {
18723- make_page_writable(va, feature);
18724- va = (void *)((unsigned long)va + PAGE_SIZE);
18725- }
18726-}
18727-
18728-static void _pin_lock(struct mm_struct *mm, int lock) {
18729- if (lock)
18730- spin_lock(&mm->page_table_lock);
18731-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
18732- /* While mm->page_table_lock protects us against insertions and
18733- * removals of higher level page table pages, it doesn't protect
18734- * against updates of pte-s. Such updates, however, require the
18735- * pte pages to be in consistent state (unpinned+writable or
18736- * pinned+readonly). The pinning and attribute changes, however
18737- * cannot be done atomically, which is why such updates must be
18738- * prevented from happening concurrently.
18739- * Note that no pte lock can ever elsewhere be acquired nesting
18740- * with an already acquired one in the same mm, or with the mm's
18741- * page_table_lock already acquired, as that would break in the
18742- * non-split case (where all these are actually resolving to the
18743- * one page_table_lock). Thus acquiring all of them here is not
18744- * going to result in dead locks, and the order of acquires
18745- * doesn't matter.
18746- */
18747- {
18748- pgd_t *pgd = mm->pgd;
18749- unsigned g;
18750-
18751- for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18752- pud_t *pud;
18753- unsigned u;
18754-
18755- if (pgd_none(*pgd))
18756- continue;
18757- pud = pud_offset(pgd, 0);
18758- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18759- pmd_t *pmd;
18760- unsigned m;
18761-
18762- if (pud_none(*pud))
18763- continue;
18764- pmd = pmd_offset(pud, 0);
18765- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18766- spinlock_t *ptl;
18767-
18768- if (pmd_none(*pmd))
18769- continue;
18770- ptl = pte_lockptr(0, pmd);
18771- if (lock)
18772- spin_lock(ptl);
18773- else
18774- spin_unlock(ptl);
18775- }
18776- }
18777- }
18778- }
18779-#endif
18780- if (!lock)
18781- spin_unlock(&mm->page_table_lock);
18782-}
18783-#define pin_lock(mm) _pin_lock(mm, 1)
18784-#define pin_unlock(mm) _pin_lock(mm, 0)
18785-
18786-#define PIN_BATCH 4
18787-static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
18788-
18789-static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
18790- unsigned int cpu, unsigned seq)
18791-{
18792- unsigned long pfn = page_to_pfn(page);
18793-
18794- if (PageHighMem(page)) {
18795- if (pgprot_val(flags) & _PAGE_RW)
18796- ClearPagePinned(page);
18797- else
18798- SetPagePinned(page);
18799- } else {
18800- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18801- (unsigned long)__va(pfn << PAGE_SHIFT),
18802- pfn_pte(pfn, flags), 0);
18803- if (unlikely(++seq == PIN_BATCH)) {
18804- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18805- PIN_BATCH, NULL)))
18806- BUG();
18807- seq = 0;
18808- }
18809- }
18810-
18811- return seq;
18812-}
18813-
18814-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
18815-{
18816- pgd_t *pgd = pgd_base;
18817- pud_t *pud;
18818- pmd_t *pmd;
18819- int g, u, m;
18820- unsigned int cpu, seq;
18821-
18822- if (xen_feature(XENFEAT_auto_translated_physmap))
18823- return;
18824-
18825- cpu = get_cpu();
18826-
18827- for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18828- if (pgd_none(*pgd))
18829- continue;
18830- pud = pud_offset(pgd, 0);
18831- if (PTRS_PER_PUD > 1) /* not folded */
18832- seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
18833- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18834- if (pud_none(*pud))
18835- continue;
18836- pmd = pmd_offset(pud, 0);
18837- if (PTRS_PER_PMD > 1) /* not folded */
18838- seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
18839- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18840- if (pmd_none(*pmd))
18841- continue;
18842- seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
18843- }
18844- }
18845- }
18846-
18847- if (likely(seq != 0)) {
18848- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18849- (unsigned long)pgd_base,
18850- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18851- UVMF_TLB_FLUSH);
18852- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18853- seq + 1, NULL)))
18854- BUG();
18855- } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
18856- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18857- UVMF_TLB_FLUSH))
18858- BUG();
18859-
18860- put_cpu();
18861-}
18862-
18863-static void __pgd_pin(pgd_t *pgd)
18864-{
18865- pgd_walk(pgd, PAGE_KERNEL_RO);
18866- kmap_flush_unused();
18867- xen_pgd_pin(__pa(pgd));
18868- SetPagePinned(virt_to_page(pgd));
18869-}
18870-
18871-static void __pgd_unpin(pgd_t *pgd)
18872-{
18873- xen_pgd_unpin(__pa(pgd));
18874- pgd_walk(pgd, PAGE_KERNEL);
18875- ClearPagePinned(virt_to_page(pgd));
18876-}
18877-
18878-static void pgd_test_and_unpin(pgd_t *pgd)
18879-{
18880- if (PagePinned(virt_to_page(pgd)))
18881- __pgd_unpin(pgd);
18882-}
18883-
18884-void mm_pin(struct mm_struct *mm)
18885-{
18886- if (xen_feature(XENFEAT_writable_page_tables))
18887- return;
18888- pin_lock(mm);
18889- __pgd_pin(mm->pgd);
18890- pin_unlock(mm);
18891-}
18892-
18893-void mm_unpin(struct mm_struct *mm)
18894-{
18895- if (xen_feature(XENFEAT_writable_page_tables))
18896- return;
18897- pin_lock(mm);
18898- __pgd_unpin(mm->pgd);
18899- pin_unlock(mm);
18900-}
18901-
18902-void mm_pin_all(void)
18903-{
18904- struct page *page;
18905- unsigned long flags;
18906-
18907- if (xen_feature(XENFEAT_writable_page_tables))
18908- return;
18909-
18910- /*
18911- * Allow uninterrupted access to the pgd_list. Also protects
18912- * __pgd_pin() by disabling preemption.
18913- * All other CPUs must be at a safe point (e.g., in stop_machine
18914- * or offlined entirely).
18915- */
18916- spin_lock_irqsave(&pgd_lock, flags);
18917- for (page = pgd_list; page; page = (struct page *)page->index) {
18918- if (!PagePinned(page))
18919- __pgd_pin((pgd_t *)page_address(page));
18920- }
18921- spin_unlock_irqrestore(&pgd_lock, flags);
18922-}
18923-
18924-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
18925-{
18926- if (!PagePinned(virt_to_page(mm->pgd)))
18927- mm_pin(mm);
18928-}
18929-
18930-void arch_exit_mmap(struct mm_struct *mm)
18931-{
18932- struct task_struct *tsk = current;
18933-
18934- task_lock(tsk);
18935-
18936- /*
18937- * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
18938- * *much* faster this way, as no tlb flushes means bigger wrpt batches.
18939- */
18940- if (tsk->active_mm == mm) {
18941- tsk->active_mm = &init_mm;
18942- atomic_inc(&init_mm.mm_count);
18943-
18944- switch_mm(mm, &init_mm, tsk);
18945-
18946- atomic_dec(&mm->mm_count);
18947- BUG_ON(atomic_read(&mm->mm_count) == 0);
18948- }
18949-
18950- task_unlock(tsk);
18951-
18952- if (PagePinned(virt_to_page(mm->pgd)) &&
18953- (atomic_read(&mm->mm_count) == 1) &&
18954- !mm->context.has_foreign_mappings)
18955- mm_unpin(mm);
18956-}
18957--- a/arch/x86/pci/irq-xen.c
18958+++ b/arch/x86/pci/irq-xen.c
18959@@ -204,6 +204,7 @@ static int pirq_ali_get(struct pci_dev *
18960 {
18961 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
18962
18963+ WARN_ON_ONCE(pirq >= 16);
18964 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
18965 }
18966
18967@@ -211,7 +212,8 @@ static int pirq_ali_set(struct pci_dev *
18968 {
18969 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
18970 unsigned int val = irqmap[irq];
18971-
18972+
18973+ WARN_ON_ONCE(pirq >= 16);
18974 if (val) {
18975 write_config_nybble(router, 0x48, pirq-1, val);
18976 return 1;
18977@@ -261,12 +263,16 @@ static int pirq_via_set(struct pci_dev *
18978 static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18979 {
18980 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18981+
18982+ WARN_ON_ONCE(pirq >= 5);
18983 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
18984 }
18985
18986 static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18987 {
18988 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18989+
18990+ WARN_ON_ONCE(pirq >= 5);
18991 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
18992 return 1;
18993 }
18994@@ -279,12 +285,16 @@ static int pirq_via586_set(struct pci_de
18995 static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18996 {
18997 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18998+
18999+ WARN_ON_ONCE(pirq >= 4);
19000 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
19001 }
19002
19003 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19004 {
19005 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
19006+
19007+ WARN_ON_ONCE(pirq >= 4);
19008 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
19009 return 1;
19010 }
19011@@ -423,6 +433,7 @@ static int pirq_sis_set(struct pci_dev *
19012
19013 static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19014 {
19015+ WARN_ON_ONCE(pirq >= 9);
19016 if (pirq > 8) {
19017 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
19018 return 0;
19019@@ -432,6 +443,7 @@ static int pirq_vlsi_get(struct pci_dev
19020
19021 static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19022 {
19023+ WARN_ON_ONCE(pirq >= 9);
19024 if (pirq > 8) {
19025 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
19026 return 0;
19027@@ -453,14 +465,14 @@ static int pirq_vlsi_set(struct pci_dev
19028 */
19029 static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19030 {
19031- outb_p(pirq, 0xc00);
19032+ outb(pirq, 0xc00);
19033 return inb(0xc01) & 0xf;
19034 }
19035
19036 static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19037 {
19038- outb_p(pirq, 0xc00);
19039- outb_p(irq, 0xc01);
19040+ outb(pirq, 0xc00);
19041+ outb(irq, 0xc01);
19042 return 1;
19043 }
19044
19045@@ -575,6 +587,10 @@ static __init int intel_router_probe(str
19046 case PCI_DEVICE_ID_INTEL_ICH9_4:
19047 case PCI_DEVICE_ID_INTEL_ICH9_5:
19048 case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
19049+ case PCI_DEVICE_ID_INTEL_ICH10_0:
19050+ case PCI_DEVICE_ID_INTEL_ICH10_1:
19051+ case PCI_DEVICE_ID_INTEL_ICH10_2:
19052+ case PCI_DEVICE_ID_INTEL_ICH10_3:
19053 r->name = "PIIX/ICH";
19054 r->get = pirq_piix_get;
19055 r->set = pirq_piix_set;
19056--- a/arch/x86/vdso/Makefile
19057+++ b/arch/x86/vdso/Makefile
19058@@ -66,6 +66,7 @@ vdso32.so-$(VDSO32-y) += int80
19059 vdso32.so-$(CONFIG_COMPAT) += syscall
19060 vdso32.so-$(VDSO32-y) += sysenter
19061 xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
19062+xen-vdso32-$(CONFIG_X86_32) += syscall
19063 vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
19064
19065 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
19066--- a/arch/x86/vdso/vdso32.S
19067+++ b/arch/x86/vdso/vdso32.S
19068@@ -19,4 +19,16 @@ vdso32_sysenter_start:
19069 .incbin "arch/x86/vdso/vdso32-sysenter.so"
19070 vdso32_sysenter_end:
19071
19072+#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
19073+ .globl vdso32_int80_start, vdso32_int80_end
19074+vdso32_int80_start:
19075+ .incbin "arch/x86/vdso/vdso32-int80.so"
19076+vdso32_int80_end:
19077+#elif defined(CONFIG_X86_XEN)
19078+ .globl vdso32_syscall_start, vdso32_syscall_end
19079+vdso32_syscall_start:
19080+ .incbin "arch/x86/vdso/vdso32-syscall.so"
19081+vdso32_syscall_end:
19082+#endif
19083+
19084 __FINIT
19085--- a/arch/x86/vdso/vdso32-setup.c
19086+++ b/arch/x86/vdso/vdso32-setup.c
19087@@ -26,10 +26,6 @@
19088 #include <asm/vdso.h>
19089 #include <asm/proto.h>
19090
19091-#ifdef CONFIG_XEN
19092-#include <xen/interface/callback.h>
19093-#endif
19094-
19095 enum {
19096 VDSO_DISABLED = 0,
19097 VDSO_ENABLED = 1,
19098@@ -229,7 +225,6 @@ static inline void map_compat_vdso(int m
19099
19100 void enable_sep_cpu(void)
19101 {
19102-#ifndef CONFIG_XEN
19103 int cpu = get_cpu();
19104 struct tss_struct *tss = &per_cpu(init_tss, cpu);
19105
19106@@ -244,35 +239,6 @@ void enable_sep_cpu(void)
19107 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
19108 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
19109 put_cpu();
19110-#else
19111- extern asmlinkage void ia32pv_sysenter_target(void);
19112- static struct callback_register sysenter = {
19113- .type = CALLBACKTYPE_sysenter,
19114- .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
19115- };
19116-
19117- if (!boot_cpu_has(X86_FEATURE_SEP))
19118- return;
19119-
19120- get_cpu();
19121-
19122- if (xen_feature(XENFEAT_supervisor_mode_kernel))
19123- sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19124-
19125- switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19126- case 0:
19127- break;
19128-#if CONFIG_XEN_COMPAT < 0x030200
19129- case -ENOSYS:
19130- sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19131- if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19132- break;
19133-#endif
19134- default:
19135- clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
19136- break;
19137- }
19138-#endif
19139 }
19140
19141 static struct vm_area_struct gate_vma;
19142--- /dev/null
19143+++ b/arch/x86/vdso/vdso32-setup-xen.c
19144@@ -0,0 +1,506 @@
19145+/*
19146+ * (C) Copyright 2002 Linus Torvalds
19147+ * Portions based on the vdso-randomization code from exec-shield:
19148+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
19149+ *
19150+ * This file contains the needed initializations to support sysenter.
19151+ */
19152+
19153+#include <linux/init.h>
19154+#include <linux/smp.h>
19155+#include <linux/thread_info.h>
19156+#include <linux/sched.h>
19157+#include <linux/gfp.h>
19158+#include <linux/string.h>
19159+#include <linux/elf.h>
19160+#include <linux/mm.h>
19161+#include <linux/err.h>
19162+#include <linux/module.h>
19163+
19164+#include <asm/cpufeature.h>
19165+#include <asm/msr.h>
19166+#include <asm/pgtable.h>
19167+#include <asm/unistd.h>
19168+#include <asm/elf.h>
19169+#include <asm/tlbflush.h>
19170+#include <asm/vdso.h>
19171+#include <asm/proto.h>
19172+
19173+#include <xen/interface/callback.h>
19174+
19175+enum {
19176+ VDSO_DISABLED = 0,
19177+ VDSO_ENABLED = 1,
19178+ VDSO_COMPAT = 2,
19179+};
19180+
19181+#ifdef CONFIG_COMPAT_VDSO
19182+#define VDSO_DEFAULT VDSO_COMPAT
19183+#else
19184+#define VDSO_DEFAULT VDSO_ENABLED
19185+#endif
19186+
19187+#ifdef CONFIG_X86_64
19188+#define vdso_enabled sysctl_vsyscall32
19189+#define arch_setup_additional_pages syscall32_setup_pages
19190+#endif
19191+
19192+/*
19193+ * This is the difference between the prelinked addresses in the vDSO images
19194+ * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
19195+ * in the user address space.
19196+ */
19197+#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
19198+
19199+/*
19200+ * Should the kernel map a VDSO page into processes and pass its
19201+ * address down to glibc upon exec()?
19202+ */
19203+unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
19204+
19205+static int __init vdso_setup(char *s)
19206+{
19207+ vdso_enabled = simple_strtoul(s, NULL, 0);
19208+
19209+ return 1;
19210+}
19211+
19212+/*
19213+ * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
19214+ * behavior on both 64-bit and 32-bit kernels.
19215+ * On 32-bit kernels, vdso=[012] means the same thing.
19216+ */
19217+__setup("vdso32=", vdso_setup);
19218+
19219+#ifdef CONFIG_X86_32
19220+__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
19221+
19222+EXPORT_SYMBOL_GPL(vdso_enabled);
19223+#endif
19224+
19225+static __init void reloc_symtab(Elf32_Ehdr *ehdr,
19226+ unsigned offset, unsigned size)
19227+{
19228+ Elf32_Sym *sym = (void *)ehdr + offset;
19229+ unsigned nsym = size / sizeof(*sym);
19230+ unsigned i;
19231+
19232+ for(i = 0; i < nsym; i++, sym++) {
19233+ if (sym->st_shndx == SHN_UNDEF ||
19234+ sym->st_shndx == SHN_ABS)
19235+ continue; /* skip */
19236+
19237+ if (sym->st_shndx > SHN_LORESERVE) {
19238+ printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
19239+ sym->st_shndx);
19240+ continue;
19241+ }
19242+
19243+ switch(ELF_ST_TYPE(sym->st_info)) {
19244+ case STT_OBJECT:
19245+ case STT_FUNC:
19246+ case STT_SECTION:
19247+ case STT_FILE:
19248+ sym->st_value += VDSO_ADDR_ADJUST;
19249+ }
19250+ }
19251+}
19252+
19253+static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
19254+{
19255+ Elf32_Dyn *dyn = (void *)ehdr + offset;
19256+
19257+ for(; dyn->d_tag != DT_NULL; dyn++)
19258+ switch(dyn->d_tag) {
19259+ case DT_PLTGOT:
19260+ case DT_HASH:
19261+ case DT_STRTAB:
19262+ case DT_SYMTAB:
19263+ case DT_RELA:
19264+ case DT_INIT:
19265+ case DT_FINI:
19266+ case DT_REL:
19267+ case DT_DEBUG:
19268+ case DT_JMPREL:
19269+ case DT_VERSYM:
19270+ case DT_VERDEF:
19271+ case DT_VERNEED:
19272+ case DT_ADDRRNGLO ... DT_ADDRRNGHI:
19273+ /* definitely pointers needing relocation */
19274+ dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19275+ break;
19276+
19277+ case DT_ENCODING ... OLD_DT_LOOS-1:
19278+ case DT_LOOS ... DT_HIOS-1:
19279+ /* Tags above DT_ENCODING are pointers if
19280+ they're even */
19281+ if (dyn->d_tag >= DT_ENCODING &&
19282+ (dyn->d_tag & 1) == 0)
19283+ dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19284+ break;
19285+
19286+ case DT_VERDEFNUM:
19287+ case DT_VERNEEDNUM:
19288+ case DT_FLAGS_1:
19289+ case DT_RELACOUNT:
19290+ case DT_RELCOUNT:
19291+ case DT_VALRNGLO ... DT_VALRNGHI:
19292+ /* definitely not pointers */
19293+ break;
19294+
19295+ case OLD_DT_LOOS ... DT_LOOS-1:
19296+ case DT_HIOS ... DT_VALRNGLO-1:
19297+ default:
19298+ if (dyn->d_tag > DT_ENCODING)
19299+ printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
19300+ dyn->d_tag);
19301+ break;
19302+ }
19303+}
19304+
19305+static __init void relocate_vdso(Elf32_Ehdr *ehdr)
19306+{
19307+ Elf32_Phdr *phdr;
19308+ Elf32_Shdr *shdr;
19309+ int i;
19310+
19311+ BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
19312+ !elf_check_arch_ia32(ehdr) ||
19313+ ehdr->e_type != ET_DYN);
19314+
19315+ ehdr->e_entry += VDSO_ADDR_ADJUST;
19316+
19317+ /* rebase phdrs */
19318+ phdr = (void *)ehdr + ehdr->e_phoff;
19319+ for (i = 0; i < ehdr->e_phnum; i++) {
19320+ phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
19321+
19322+ /* relocate dynamic stuff */
19323+ if (phdr[i].p_type == PT_DYNAMIC)
19324+ reloc_dyn(ehdr, phdr[i].p_offset);
19325+ }
19326+
19327+ /* rebase sections */
19328+ shdr = (void *)ehdr + ehdr->e_shoff;
19329+ for(i = 0; i < ehdr->e_shnum; i++) {
19330+ if (!(shdr[i].sh_flags & SHF_ALLOC))
19331+ continue;
19332+
19333+ shdr[i].sh_addr += VDSO_ADDR_ADJUST;
19334+
19335+ if (shdr[i].sh_type == SHT_SYMTAB ||
19336+ shdr[i].sh_type == SHT_DYNSYM)
19337+ reloc_symtab(ehdr, shdr[i].sh_offset,
19338+ shdr[i].sh_size);
19339+ }
19340+}
19341+
19342+/*
19343+ * These symbols are defined by vdso32.S to mark the bounds
19344+ * of the ELF DSO images included therein.
19345+ */
19346+extern const char vdso32_default_start, vdso32_default_end;
19347+extern const char vdso32_sysenter_start, vdso32_sysenter_end;
19348+static struct page *vdso32_pages[1];
19349+
19350+#ifdef CONFIG_X86_64
19351+
19352+#if CONFIG_XEN_COMPAT < 0x030200
19353+static int use_int80 = 1;
19354+#endif
19355+static int use_sysenter __read_mostly = -1;
19356+
19357+#define vdso32_sysenter() (use_sysenter > 0)
19358+
19359+/* May not be __init: called during resume */
19360+void syscall32_cpu_init(void)
19361+{
19362+ static const struct callback_register cstar = {
19363+ .type = CALLBACKTYPE_syscall32,
19364+ .address = (unsigned long)ia32_cstar_target
19365+ };
19366+ static const struct callback_register sysenter = {
19367+ .type = CALLBACKTYPE_sysenter,
19368+ .address = (unsigned long)ia32_sysenter_target
19369+ };
19370+
19371+ if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
19372+ (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
19373+#if CONFIG_XEN_COMPAT < 0x030200
19374+ return;
19375+ use_int80 = 0;
19376+#else
19377+ BUG();
19378+#endif
19379+
19380+ if (use_sysenter < 0)
19381+ use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
19382+}
19383+
19384+#define compat_uses_vma 1
19385+
19386+static inline void map_compat_vdso(int map)
19387+{
19388+}
19389+
19390+#else /* CONFIG_X86_32 */
19391+
19392+#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
19393+
19394+extern asmlinkage void ia32pv_cstar_target(void);
19395+static /*const*/ struct callback_register __cpuinitdata cstar = {
19396+ .type = CALLBACKTYPE_syscall32,
19397+ .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target },
19398+};
19399+
19400+void __cpuinit enable_sep_cpu(void)
19401+{
19402+ extern asmlinkage void ia32pv_sysenter_target(void);
19403+ static struct callback_register __cpuinitdata sysenter = {
19404+ .type = CALLBACKTYPE_sysenter,
19405+ .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
19406+ };
19407+
19408+ if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19409+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
19410+ BUG();
19411+ return;
19412+ }
19413+
19414+ if (!boot_cpu_has(X86_FEATURE_SEP))
19415+ return;
19416+
19417+ if (xen_feature(XENFEAT_supervisor_mode_kernel))
19418+ sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19419+
19420+ switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19421+ case 0:
19422+ break;
19423+#if CONFIG_XEN_COMPAT < 0x030200
19424+ case -ENOSYS:
19425+ sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19426+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19427+ break;
19428+#endif
19429+ default:
19430+ setup_clear_cpu_cap(X86_FEATURE_SEP);
19431+ break;
19432+ }
19433+}
19434+
19435+static struct vm_area_struct gate_vma;
19436+
19437+static int __init gate_vma_init(void)
19438+{
19439+ gate_vma.vm_mm = NULL;
19440+ gate_vma.vm_start = FIXADDR_USER_START;
19441+ gate_vma.vm_end = FIXADDR_USER_END;
19442+ gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
19443+ gate_vma.vm_page_prot = __P101;
19444+ /*
19445+ * Make sure the vDSO gets into every core dump.
19446+ * Dumping its contents makes post-mortem fully interpretable later
19447+ * without matching up the same kernel and hardware config to see
19448+ * what PC values meant.
19449+ */
19450+ gate_vma.vm_flags |= VM_ALWAYSDUMP;
19451+ return 0;
19452+}
19453+
19454+#define compat_uses_vma 0
19455+
19456+static void map_compat_vdso(int map)
19457+{
19458+ static int vdso_mapped;
19459+
19460+ if (map == vdso_mapped)
19461+ return;
19462+
19463+ vdso_mapped = map;
19464+
19465+ __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
19466+ map ? PAGE_READONLY_EXEC : PAGE_NONE);
19467+
19468+ /* flush stray tlbs */
19469+ flush_tlb_all();
19470+}
19471+
19472+#endif /* CONFIG_X86_64 */
19473+
19474+int __init sysenter_setup(void)
19475+{
19476+ void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
19477+ const void *vsyscall;
19478+ size_t vsyscall_len;
19479+
19480+ vdso32_pages[0] = virt_to_page(syscall_page);
19481+
19482+#ifdef CONFIG_X86_32
19483+ gate_vma_init();
19484+
19485+ printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
19486+#endif
19487+
19488+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
19489+ if (use_int80) {
19490+ extern const char vdso32_int80_start, vdso32_int80_end;
19491+
19492+ vsyscall = &vdso32_int80_start;
19493+ vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
19494+ } else
19495+#elif defined(CONFIG_X86_32)
19496+ if (boot_cpu_has(X86_FEATURE_SYSCALL)
19497+ && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
19498+ || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
19499+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
19500+ barrier(); /* until clear_bit()'s constraints are correct ... */
19501+ if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19502+ extern const char vdso32_syscall_start, vdso32_syscall_end;
19503+
19504+ vsyscall = &vdso32_syscall_start;
19505+ vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
19506+ } else
19507+#endif
19508+ if (!vdso32_sysenter()) {
19509+ vsyscall = &vdso32_default_start;
19510+ vsyscall_len = &vdso32_default_end - &vdso32_default_start;
19511+ } else {
19512+ vsyscall = &vdso32_sysenter_start;
19513+ vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
19514+ }
19515+
19516+ memcpy(syscall_page, vsyscall, vsyscall_len);
19517+ relocate_vdso(syscall_page);
19518+
19519+ return 0;
19520+}
19521+
19522+/* Setup a VMA at program startup for the vsyscall page */
19523+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
19524+{
19525+ struct mm_struct *mm = current->mm;
19526+ unsigned long addr;
19527+ int ret = 0;
19528+ bool compat;
19529+
19530+ down_write(&mm->mmap_sem);
19531+
19532+ /* Test compat mode once here, in case someone
19533+ changes it via sysctl */
19534+ compat = (vdso_enabled == VDSO_COMPAT);
19535+
19536+ map_compat_vdso(compat);
19537+
19538+ if (compat)
19539+ addr = VDSO_HIGH_BASE;
19540+ else {
19541+ addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
19542+ if (IS_ERR_VALUE(addr)) {
19543+ ret = addr;
19544+ goto up_fail;
19545+ }
19546+ }
19547+
19548+ if (compat_uses_vma || !compat) {
19549+ /*
19550+ * MAYWRITE to allow gdb to COW and set breakpoints
19551+ *
19552+ * Make sure the vDSO gets into every core dump.
19553+ * Dumping its contents makes post-mortem fully
19554+ * interpretable later without matching up the same
19555+ * kernel and hardware config to see what PC values
19556+ * meant.
19557+ */
19558+ ret = install_special_mapping(mm, addr, PAGE_SIZE,
19559+ VM_READ|VM_EXEC|
19560+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
19561+ VM_ALWAYSDUMP,
19562+ vdso32_pages);
19563+
19564+ if (ret)
19565+ goto up_fail;
19566+ }
19567+
19568+ current->mm->context.vdso = (void *)addr;
19569+ current_thread_info()->sysenter_return =
19570+ VDSO32_SYMBOL(addr, SYSENTER_RETURN);
19571+
19572+ up_fail:
19573+ up_write(&mm->mmap_sem);
19574+
19575+ return ret;
19576+}
19577+
19578+#ifdef CONFIG_X86_64
19579+
19580+/*
19581+ * This must be done early in case we have an initrd containing 32-bit
19582+ * binaries (e.g., hotplug). This could be pushed upstream.
19583+ */
19584+core_initcall(sysenter_setup);
19585+
19586+#ifdef CONFIG_SYSCTL
19587+/* Register vsyscall32 into the ABI table */
19588+#include <linux/sysctl.h>
19589+
19590+static ctl_table abi_table2[] = {
19591+ {
19592+ .procname = "vsyscall32",
19593+ .data = &sysctl_vsyscall32,
19594+ .maxlen = sizeof(int),
19595+ .mode = 0644,
19596+ .proc_handler = proc_dointvec
19597+ },
19598+ {}
19599+};
19600+
19601+static ctl_table abi_root_table2[] = {
19602+ {
19603+ .ctl_name = CTL_ABI,
19604+ .procname = "abi",
19605+ .mode = 0555,
19606+ .child = abi_table2
19607+ },
19608+ {}
19609+};
19610+
19611+static __init int ia32_binfmt_init(void)
19612+{
19613+ register_sysctl_table(abi_root_table2);
19614+ return 0;
19615+}
19616+__initcall(ia32_binfmt_init);
19617+#endif
19618+
19619+#else /* CONFIG_X86_32 */
19620+
19621+const char *arch_vma_name(struct vm_area_struct *vma)
19622+{
19623+ if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
19624+ return "[vdso]";
19625+ return NULL;
19626+}
19627+
19628+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
19629+{
19630+ struct mm_struct *mm = tsk->mm;
19631+
19632+ /* Check to see if this task was created in compat vdso mode */
19633+ if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
19634+ return &gate_vma;
19635+ return NULL;
19636+}
19637+
19638+int in_gate_area(struct task_struct *task, unsigned long addr)
19639+{
19640+ const struct vm_area_struct *vma = get_gate_vma(task);
19641+
19642+ return vma && addr >= vma->vm_start && addr < vma->vm_end;
19643+}
19644+
19645+int in_gate_area_no_task(unsigned long addr)
19646+{
19647+ return 0;
19648+}
19649+
19650+#endif /* CONFIG_X86_64 */
19651--- a/arch/x86/vdso/vdso32/syscall.S
19652+++ b/arch/x86/vdso/vdso32/syscall.S
19653@@ -19,8 +19,10 @@ __kernel_vsyscall:
19654 .Lpush_ebp:
19655 movl %ecx, %ebp
19656 syscall
19657+#ifndef CONFIG_XEN
19658 movl $__USER32_DS, %ecx
19659 movl %ecx, %ss
19660+#endif
19661 movl %ebp, %ecx
19662 popl %ebp
19663 .Lpop_ebp:
19664--- a/drivers/pci/msi-xen.c
19665+++ b/drivers/pci/msi-xen.c
19666@@ -43,6 +43,53 @@ struct msi_pirq_entry {
19667 int entry_nr;
19668 };
19669
19670+/* Arch hooks */
19671+
19672+int __attribute__ ((weak))
19673+arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
19674+{
19675+ return 0;
19676+}
19677+
19678+#ifndef CONFIG_XEN
19679+int __attribute__ ((weak))
19680+arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19681+{
19682+ return 0;
19683+}
19684+
19685+int __attribute__ ((weak))
19686+arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19687+{
19688+ struct msi_desc *entry;
19689+ int ret;
19690+
19691+ list_for_each_entry(entry, &dev->msi_list, list) {
19692+ ret = arch_setup_msi_irq(dev, entry);
19693+ if (ret)
19694+ return ret;
19695+ }
19696+
19697+ return 0;
19698+}
19699+
19700+void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19701+{
19702+ return;
19703+}
19704+
19705+void __attribute__ ((weak))
19706+arch_teardown_msi_irqs(struct pci_dev *dev)
19707+{
19708+ struct msi_desc *entry;
19709+
19710+ list_for_each_entry(entry, &dev->msi_list, list) {
19711+ if (entry->irq != 0)
19712+ arch_teardown_msi_irq(entry->irq);
19713+ }
19714+}
19715+#endif
19716+
19717 static void msi_set_enable(struct pci_dev *dev, int enable)
19718 {
19719 int pos;
19720@@ -270,7 +317,6 @@ static void pci_intx_for_msi(struct pci_
19721 pci_intx(dev, enable);
19722 }
19723
19724-#ifdef CONFIG_PM
19725 static void __pci_restore_msi_state(struct pci_dev *dev)
19726 {
19727 int pirq;
19728@@ -328,7 +374,7 @@ void pci_restore_msi_state(struct pci_de
19729 __pci_restore_msi_state(dev);
19730 __pci_restore_msix_state(dev);
19731 }
19732-#endif /* CONFIG_PM */
19733+EXPORT_SYMBOL_GPL(pci_restore_msi_state);
19734
19735 /**
19736 * msi_capability_init - configure device's MSI capability structure
19737@@ -760,51 +806,3 @@ void pci_msi_init_pci_dev(struct pci_dev
19738 INIT_LIST_HEAD(&dev->msi_list);
19739 #endif
19740 }
19741-
19742-
19743-/* Arch hooks */
19744-
19745-int __attribute__ ((weak))
19746-arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
19747-{
19748- return 0;
19749-}
19750-
19751-#ifndef CONFIG_XEN
19752-int __attribute__ ((weak))
19753-arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19754-{
19755- return 0;
19756-}
19757-
19758-int __attribute__ ((weak))
19759-arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19760-{
19761- struct msi_desc *entry;
19762- int ret;
19763-
19764- list_for_each_entry(entry, &dev->msi_list, list) {
19765- ret = arch_setup_msi_irq(dev, entry);
19766- if (ret)
19767- return ret;
19768- }
19769-
19770- return 0;
19771-}
19772-
19773-void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19774-{
19775- return;
19776-}
19777-
19778-void __attribute__ ((weak))
19779-arch_teardown_msi_irqs(struct pci_dev *dev)
19780-{
19781- struct msi_desc *entry;
19782-
19783- list_for_each_entry(entry, &dev->msi_list, list) {
19784- if (entry->irq != 0)
19785- arch_teardown_msi_irq(entry->irq);
19786- }
19787-}
19788-#endif
19789--- a/drivers/pci/pci.c
19790+++ b/drivers/pci/pci.c
19791@@ -353,7 +353,12 @@ pci_find_parent_resource(const struct pc
19792 * Restore the BAR values for a given device, so as to make it
19793 * accessible by its driver.
19794 */
19795+#ifndef CONFIG_XEN
19796 static void
19797+#else
19798+EXPORT_SYMBOL_GPL(pci_restore_bars);
19799+void
19800+#endif
19801 pci_restore_bars(struct pci_dev *dev)
19802 {
19803 int i, numres;
19804--- a/drivers/xen/balloon/sysfs.c
19805+++ b/drivers/xen/balloon/sysfs.c
19806@@ -108,7 +108,7 @@ static struct attribute_group balloon_in
19807 };
19808
19809 static struct sysdev_class balloon_sysdev_class = {
19810- set_kset_name(BALLOON_CLASS_NAME),
19811+ .name = BALLOON_CLASS_NAME,
19812 };
19813
19814 static struct sys_device balloon_sysdev;
19815--- a/drivers/xen/blkback/blkback.c
19816+++ b/drivers/xen/blkback/blkback.c
19817@@ -148,7 +148,7 @@ static void unplug_queue(blkif_t *blkif)
19818 return;
19819 if (blkif->plug->unplug_fn)
19820 blkif->plug->unplug_fn(blkif->plug);
19821- blk_put_queue(blkif->plug);
19822+ kobject_put(&blkif->plug->kobj);
19823 blkif->plug = NULL;
19824 }
19825
19826@@ -159,7 +159,8 @@ static void plug_queue(blkif_t *blkif, s
19827 if (q == blkif->plug)
19828 return;
19829 unplug_queue(blkif);
19830- blk_get_queue(q);
19831+ WARN_ON(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags));
19832+ kobject_get(&q->kobj);
19833 blkif->plug = q;
19834 }
19835
19836--- a/drivers/xen/blkfront/blkfront.c
19837+++ b/drivers/xen/blkfront/blkfront.c
19838@@ -716,7 +716,6 @@ static irqreturn_t blkif_int(int irq, vo
19839 RING_IDX i, rp;
19840 unsigned long flags;
19841 struct blkfront_info *info = (struct blkfront_info *)dev_id;
19842- int uptodate;
19843
19844 spin_lock_irqsave(&blkif_io_lock, flags);
19845
19846@@ -741,13 +740,13 @@ static irqreturn_t blkif_int(int irq, vo
19847
19848 ADD_ID_TO_FREELIST(info, id);
19849
19850- uptodate = (bret->status == BLKIF_RSP_OKAY);
19851+ ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
19852 switch (bret->operation) {
19853 case BLKIF_OP_WRITE_BARRIER:
19854 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
19855 printk("blkfront: %s: write barrier op failed\n",
19856 info->gd->disk_name);
19857- uptodate = -EOPNOTSUPP;
19858+ ret = -EOPNOTSUPP;
19859 info->feature_barrier = 0;
19860 xlvbd_barrier(info);
19861 }
19862@@ -758,10 +757,8 @@ static irqreturn_t blkif_int(int irq, vo
19863 DPRINTK("Bad return from blkdev data "
19864 "request: %x\n", bret->status);
19865
19866- ret = end_that_request_first(req, uptodate,
19867- req->hard_nr_sectors);
19868+ ret = __blk_end_request(req, ret, blk_rq_bytes(req));
19869 BUG_ON(ret);
19870- end_that_request_last(req, uptodate);
19871 break;
19872 default:
19873 BUG();
19874--- a/drivers/xen/blktap/blktap.c
19875+++ b/drivers/xen/blktap/blktap.c
19876@@ -327,8 +327,8 @@ static pte_t blktap_clear_pte(struct vm_
19877 * if vm_file is NULL (meaning mmap failed and we have nothing to do)
19878 */
19879 if (uvaddr < uvstart || vma->vm_file == NULL)
19880- return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
19881- ptep, is_fullmm);
19882+ return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19883+ is_fullmm);
19884
19885 info = vma->vm_file->private_data;
19886 map = vma->vm_private_data;
19887@@ -375,8 +375,8 @@ static pte_t blktap_clear_pte(struct vm_
19888 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
19889
19890 /* USING SHADOW PAGE TABLES. */
19891- copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
19892- is_fullmm);
19893+ copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19894+ is_fullmm);
19895 }
19896
19897 if (count) {
19898--- a/drivers/xen/core/evtchn.c
19899+++ b/drivers/xen/core/evtchn.c
19900@@ -193,7 +193,7 @@ static inline unsigned int cpu_from_evtc
19901
19902 /* Upcall to generic IRQ layer. */
19903 #ifdef CONFIG_X86
19904-extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
19905+extern unsigned int do_IRQ(struct pt_regs *regs);
19906 void __init xen_init_IRQ(void);
19907 void __init init_IRQ(void)
19908 {
19909@@ -202,13 +202,11 @@ void __init init_IRQ(void)
19910 }
19911 #if defined (__i386__)
19912 static inline void exit_idle(void) {}
19913-#define IRQ_REG orig_eax
19914 #elif defined (__x86_64__)
19915 #include <asm/idle.h>
19916-#define IRQ_REG orig_rax
19917 #endif
19918 #define do_IRQ(irq, regs) do { \
19919- (regs)->IRQ_REG = ~(irq); \
19920+ (regs)->orig_ax = ~(irq); \
19921 do_IRQ((regs)); \
19922 } while (0)
19923 #endif
19924@@ -669,13 +667,12 @@ static void set_affinity_irq(unsigned in
19925 int resend_irq_on_evtchn(unsigned int irq)
19926 {
19927 int masked, evtchn = evtchn_from_irq(irq);
19928- shared_info_t *s = HYPERVISOR_shared_info;
19929
19930 if (!VALID_EVTCHN(evtchn))
19931 return 1;
19932
19933 masked = test_and_set_evtchn_mask(evtchn);
19934- synch_set_bit(evtchn, s->evtchn_pending);
19935+ set_evtchn(evtchn);
19936 if (!masked)
19937 unmask_evtchn(evtchn);
19938
19939@@ -968,6 +965,43 @@ void disable_all_local_evtchn(void)
19940 synch_set_bit(i, &s->evtchn_mask[0]);
19941 }
19942
19943+/* Clear an irq's pending state, in preparation for polling on it. */
19944+void xen_clear_irq_pending(int irq)
19945+{
19946+ int evtchn = evtchn_from_irq(irq);
19947+
19948+ if (VALID_EVTCHN(evtchn))
19949+ clear_evtchn(evtchn);
19950+}
19951+
19952+/* Set an irq's pending state, to avoid blocking on it. */
19953+void xen_set_irq_pending(int irq)
19954+{
19955+ int evtchn = evtchn_from_irq(irq);
19956+
19957+ if (VALID_EVTCHN(evtchn))
19958+ set_evtchn(evtchn);
19959+}
19960+
19961+/* Test an irq's pending state. */
19962+int xen_test_irq_pending(int irq)
19963+{
19964+ int evtchn = evtchn_from_irq(irq);
19965+
19966+ return VALID_EVTCHN(evtchn) && test_evtchn(evtchn);
19967+}
19968+
19969+/* Poll waiting for an irq to become pending. In the usual case, the
19970+ irq will be disabled so it won't deliver an interrupt. */
19971+void xen_poll_irq(int irq)
19972+{
19973+ evtchn_port_t evtchn = evtchn_from_irq(irq);
19974+
19975+ if (VALID_EVTCHN(evtchn)
19976+ && HYPERVISOR_poll_no_timeout(&evtchn, 1))
19977+ BUG();
19978+}
19979+
19980 static void restore_cpu_virqs(unsigned int cpu)
19981 {
19982 struct evtchn_bind_virq bind_virq;
19983--- a/drivers/xen/core/hypervisor_sysfs.c
19984+++ b/drivers/xen/core/hypervisor_sysfs.c
19985@@ -50,7 +50,7 @@ static int __init hypervisor_subsys_init
19986 if (!is_running_on_xen())
19987 return -ENODEV;
19988
19989- hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
19990+ hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
19991 return 0;
19992 }
19993
19994--- a/drivers/xen/core/Makefile
19995+++ b/drivers/xen/core/Makefile
19996@@ -10,5 +10,6 @@ obj-$(CONFIG_SYS_HYPERVISOR) += hypervis
19997 obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
19998 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
19999 obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
20000+obj-$(CONFIG_X86_SMP) += spinlock.o
20001 obj-$(CONFIG_KEXEC) += machine_kexec.o
20002 obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
20003--- a/drivers/xen/core/smpboot.c
20004+++ b/drivers/xen/core/smpboot.c
20005@@ -139,6 +139,10 @@ static int __cpuinit xen_smp_intr_init(u
20006 goto fail;
20007 per_cpu(callfunc_irq, cpu) = rc;
20008
20009+ rc = xen_spinlock_init(cpu);
20010+ if (rc < 0)
20011+ goto fail;
20012+
20013 if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
20014 goto fail;
20015
20016@@ -149,6 +153,7 @@ static int __cpuinit xen_smp_intr_init(u
20017 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
20018 if (per_cpu(callfunc_irq, cpu) >= 0)
20019 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
20020+ xen_spinlock_cleanup(cpu);
20021 return rc;
20022 }
20023
20024@@ -160,6 +165,7 @@ static void xen_smp_intr_exit(unsigned i
20025
20026 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
20027 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
20028+ xen_spinlock_cleanup(cpu);
20029 }
20030 #endif
20031
20032@@ -212,36 +218,25 @@ static void __cpuinit cpu_initialize_con
20033 smp_trap_init(ctxt.trap_ctxt);
20034
20035 ctxt.ldt_ents = 0;
20036- ctxt.gdt_ents = GDT_SIZE / 8;
20037-
20038-#ifdef __i386__
20039 ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
20040+ ctxt.gdt_ents = GDT_SIZE / 8;
20041
20042 ctxt.user_regs.cs = __KERNEL_CS;
20043- ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
20044+ ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
20045
20046 ctxt.kernel_ss = __KERNEL_DS;
20047- ctxt.kernel_sp = idle->thread.esp0;
20048+ ctxt.kernel_sp = idle->thread.sp0;
20049
20050- ctxt.event_callback_cs = __KERNEL_CS;
20051 ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
20052- ctxt.failsafe_callback_cs = __KERNEL_CS;
20053 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
20054+#ifdef __i386__
20055+ ctxt.event_callback_cs = __KERNEL_CS;
20056+ ctxt.failsafe_callback_cs = __KERNEL_CS;
20057
20058 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
20059
20060 ctxt.user_regs.fs = __KERNEL_PERCPU;
20061 #else /* __x86_64__ */
20062- ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
20063-
20064- ctxt.user_regs.cs = __KERNEL_CS;
20065- ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
20066-
20067- ctxt.kernel_ss = __KERNEL_DS;
20068- ctxt.kernel_sp = idle->thread.rsp0;
20069-
20070- ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
20071- ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
20072 ctxt.syscall_callback_eip = (unsigned long)system_call;
20073
20074 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
20075--- /dev/null
20076+++ b/drivers/xen/core/spinlock.c
20077@@ -0,0 +1,161 @@
20078+/*
20079+ * Xen spinlock functions
20080+ *
20081+ * See arch/x86/xen/smp.c for copyright and credits for derived
20082+ * portions of this file.
20083+ */
20084+
20085+#include <linux/init.h>
20086+#include <linux/irq.h>
20087+#include <linux/kernel.h>
20088+#include <linux/kernel_stat.h>
20089+#include <linux/module.h>
20090+#include <xen/evtchn.h>
20091+
20092+extern irqreturn_t smp_reschedule_interrupt(int, void *);
20093+
20094+static DEFINE_PER_CPU(int, spinlock_irq) = -1;
20095+static char spinlock_name[NR_CPUS][15];
20096+
20097+struct spinning {
20098+ raw_spinlock_t *lock;
20099+ unsigned int ticket;
20100+ struct spinning *prev;
20101+};
20102+static DEFINE_PER_CPU(struct spinning *, spinning);
20103+/*
20104+ * Protect removal of objects: Addition can be done lockless, and even
20105+ * removal itself doesn't need protection - what needs to be prevented is
20106+ * removed objects going out of scope (as they're allocated on the stack).
20107+ */
20108+static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED;
20109+
20110+int __cpuinit xen_spinlock_init(unsigned int cpu)
20111+{
20112+ int rc;
20113+
20114+ sprintf(spinlock_name[cpu], "spinlock%u", cpu);
20115+ rc = bind_ipi_to_irqhandler(SPIN_UNLOCK_VECTOR,
20116+ cpu,
20117+ smp_reschedule_interrupt,
20118+ IRQF_DISABLED|IRQF_NOBALANCING,
20119+ spinlock_name[cpu],
20120+ NULL);
20121+ if (rc < 0)
20122+ return rc;
20123+
20124+ disable_irq(rc); /* make sure it's never delivered */
20125+ per_cpu(spinlock_irq, cpu) = rc;
20126+
20127+ return 0;
20128+}
20129+
20130+void __cpuinit xen_spinlock_cleanup(unsigned int cpu)
20131+{
20132+ if (per_cpu(spinlock_irq, cpu) >= 0)
20133+ unbind_from_irqhandler(per_cpu(spinlock_irq, cpu), NULL);
20134+ per_cpu(spinlock_irq, cpu) = -1;
20135+}
20136+
20137+int xen_spin_wait(raw_spinlock_t *lock, unsigned int token)
20138+{
20139+ int rc = 0, irq = __get_cpu_var(spinlock_irq);
20140+ raw_rwlock_t *rm_lock;
20141+ unsigned long flags;
20142+ struct spinning spinning;
20143+
20144+ /* If kicker interrupt not initialized yet, just spin. */
20145+ if (unlikely(irq < 0))
20146+ return 0;
20147+
20148+ token >>= TICKET_SHIFT;
20149+
20150+ /* announce we're spinning */
20151+ spinning.ticket = token;
20152+ spinning.lock = lock;
20153+ spinning.prev = __get_cpu_var(spinning);
20154+ smp_wmb();
20155+ __get_cpu_var(spinning) = &spinning;
20156+
20157+ /* clear pending */
20158+ xen_clear_irq_pending(irq);
20159+
20160+ do {
20161+ /* Check again to make sure it didn't become free while
20162+ * we weren't looking. */
20163+ if ((lock->slock & ((1U << TICKET_SHIFT) - 1)) == token) {
20164+ /* If we interrupted another spinlock while it was
20165+ * blocking, make sure it doesn't block (again)
20166+ * without rechecking the lock. */
20167+ if (spinning.prev)
20168+ xen_set_irq_pending(irq);
20169+ rc = 1;
20170+ break;
20171+ }
20172+
20173+ /* block until irq becomes pending */
20174+ xen_poll_irq(irq);
20175+ } while (!xen_test_irq_pending(irq));
20176+
20177+ /* Leave the irq pending so that any interrupted blocker will
20178+ * re-check. */
20179+ kstat_this_cpu.irqs[irq] += !rc;
20180+
20181+ /* announce we're done */
20182+ __get_cpu_var(spinning) = spinning.prev;
20183+ rm_lock = &__get_cpu_var(spinning_rm_lock);
20184+ raw_local_irq_save(flags);
20185+ __raw_write_lock(rm_lock);
20186+ __raw_write_unlock(rm_lock);
20187+ raw_local_irq_restore(flags);
20188+
20189+ return rc;
20190+}
20191+EXPORT_SYMBOL(xen_spin_wait);
20192+
20193+unsigned int xen_spin_adjust(raw_spinlock_t *lock, unsigned int token)
20194+{
20195+	return token; /* TODO */
20196+}
20197+EXPORT_SYMBOL(xen_spin_adjust);
20198+
20199+int xen_spin_wait_flags(raw_spinlock_t *lock, unsigned int *token,
20200+ unsigned int flags)
20201+{
20202+	return xen_spin_wait(lock, *token); /* TODO */
20203+}
20204+EXPORT_SYMBOL(xen_spin_wait_flags);
20205+
20206+void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
20207+{
20208+ unsigned int cpu;
20209+
20210+ token &= (1U << TICKET_SHIFT) - 1;
20211+ for_each_online_cpu(cpu) {
20212+ raw_rwlock_t *rm_lock;
20213+ unsigned long flags;
20214+ struct spinning *spinning;
20215+
20216+ if (cpu == raw_smp_processor_id())
20217+ continue;
20218+
20219+ rm_lock = &per_cpu(spinning_rm_lock, cpu);
20220+ raw_local_irq_save(flags);
20221+ __raw_read_lock(rm_lock);
20222+
20223+ spinning = per_cpu(spinning, cpu);
20224+ smp_rmb();
20225+ if (spinning
20226+ && (spinning->lock != lock || spinning->ticket != token))
20227+ spinning = NULL;
20228+
20229+ __raw_read_unlock(rm_lock);
20230+ raw_local_irq_restore(flags);
20231+
20232+ if (unlikely(spinning)) {
20233+ notify_remote_via_irq(per_cpu(spinlock_irq, cpu));
20234+ return;
20235+ }
20236+ }
20237+}
20238+EXPORT_SYMBOL(xen_spin_kick);
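The per-CPU spinning_rm_lock above is only ever taken for writing and released immediately: as the comment explains, additions and removals are lockless, and the write-lock/unlock pair at the end of xen_spin_wait() merely waits until every remote xen_spin_kick() reader has finished looking at the stack-allocated struct spinning before it goes out of scope. Below is a minimal user-space sketch of that reader-drain idiom using POSIX rwlocks instead of the kernel's raw_rwlock_t; the names (shared_item, reader) are illustrative only and the lockless publish/unpublish is deliberately simplified.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* Readers (the xen_spin_kick() side) hold the read lock while they look at
 * a stack-allocated object published by another thread; the owner "drains"
 * them by taking and immediately dropping the write lock, like the
 * __raw_write_lock()/__raw_write_unlock() pair in xen_spin_wait(). */
static pthread_rwlock_t rm_lock = PTHREAD_RWLOCK_INITIALIZER;
static _Atomic(int *) shared_item;        /* lockless publish/unpublish */

static void *reader(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&rm_lock);
	int *p = atomic_load(&shared_item);
	if (p)
		printf("reader sees %d\n", *p);
	usleep(2000);                     /* still inside the read-side section */
	pthread_rwlock_unlock(&rm_lock);
	return NULL;
}

int main(void)
{
	int on_stack = 42;                /* plays the role of struct spinning */
	pthread_t t;

	atomic_store(&shared_item, &on_stack);    /* lockless addition */
	pthread_create(&t, NULL, reader, NULL);
	usleep(500);

	atomic_store(&shared_item, NULL);         /* lockless removal  */

	/* Drain: once the write lock has been acquired and released, no
	 * reader can still be dereferencing &on_stack, so the stack object
	 * may safely go out of scope. */
	pthread_rwlock_wrlock(&rm_lock);
	pthread_rwlock_unlock(&rm_lock);

	pthread_join(t, NULL);
	return 0;
}

Build with -pthread; the drain blocks only for as long as a reader is actually inside its read-side section.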
20239--- a/drivers/xen/core/xen_sysfs.c
20240+++ b/drivers/xen/core/xen_sysfs.c
20241@@ -29,12 +29,12 @@ HYPERVISOR_ATTR_RO(type);
20242
20243 static int __init xen_sysfs_type_init(void)
20244 {
20245- return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
20246+ return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
20247 }
20248
20249 static void xen_sysfs_type_destroy(void)
20250 {
20251- sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
20252+ sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
20253 }
20254
20255 /* xen version attributes */
20256@@ -90,13 +90,12 @@ static struct attribute_group version_gr
20257
20258 static int __init xen_sysfs_version_init(void)
20259 {
20260- return sysfs_create_group(&hypervisor_subsys.kobj,
20261- &version_group);
20262+ return sysfs_create_group(hypervisor_kobj, &version_group);
20263 }
20264
20265 static void xen_sysfs_version_destroy(void)
20266 {
20267- sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
20268+ sysfs_remove_group(hypervisor_kobj, &version_group);
20269 }
20270
20271 /* UUID */
20272@@ -126,12 +125,12 @@ HYPERVISOR_ATTR_RO(uuid);
20273
20274 static int __init xen_sysfs_uuid_init(void)
20275 {
20276- return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20277+ return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
20278 }
20279
20280 static void xen_sysfs_uuid_destroy(void)
20281 {
20282- sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20283+ sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
20284 }
20285
20286 /* xen compilation attributes */
20287@@ -204,14 +203,12 @@ static struct attribute_group xen_compil
20288
20289 int __init static xen_compilation_init(void)
20290 {
20291- return sysfs_create_group(&hypervisor_subsys.kobj,
20292- &xen_compilation_group);
20293+ return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
20294 }
20295
20296 static void xen_compilation_destroy(void)
20297 {
20298- sysfs_remove_group(&hypervisor_subsys.kobj,
20299- &xen_compilation_group);
20300+ sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
20301 }
20302
20303 /* xen properties info */
20304@@ -325,14 +322,12 @@ static struct attribute_group xen_proper
20305
20306 static int __init xen_properties_init(void)
20307 {
20308- return sysfs_create_group(&hypervisor_subsys.kobj,
20309- &xen_properties_group);
20310+ return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
20311 }
20312
20313 static void xen_properties_destroy(void)
20314 {
20315- sysfs_remove_group(&hypervisor_subsys.kobj,
20316- &xen_properties_group);
20317+ sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
20318 }
20319
20320 #ifdef CONFIG_KEXEC
20321@@ -350,13 +345,12 @@ HYPERVISOR_ATTR_RO(vmcoreinfo);
20322
20323 static int __init xen_sysfs_vmcoreinfo_init(void)
20324 {
20325- return sysfs_create_file(&hypervisor_subsys.kobj,
20326- &vmcoreinfo_attr.attr);
20327+ return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20328 }
20329
20330 static void xen_sysfs_vmcoreinfo_destroy(void)
20331 {
20332- sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr);
20333+ sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20334 }
20335
20336 #endif
20337--- a/drivers/xen/gntdev/gntdev.c
20338+++ b/drivers/xen/gntdev/gntdev.c
20339@@ -782,7 +782,7 @@ static pte_t gntdev_clear_pte(struct vm_
20340 op.status);
20341 } else {
20342 /* USING SHADOW PAGE TABLES. */
20343- copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20344+ copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20345 }
20346
20347 /* Finally, we unmap the grant from kernel space. */
20348@@ -810,7 +810,7 @@ static pte_t gntdev_clear_pte(struct vm_
20349 >> PAGE_SHIFT, INVALID_P2M_ENTRY);
20350
20351 } else {
20352- copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20353+ copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20354 }
20355
20356 return copy;
20357--- a/drivers/xen/scsifront/scsifront.c
20358+++ b/drivers/xen/scsifront/scsifront.c
20359@@ -260,19 +260,19 @@ static int map_data_for_request(struct v
20360 return -ENOMEM;
20361 }
20362
20363- if (sc->use_sg) {
20364+ if (scsi_bufflen(sc)) {
20365 /* quoted scsi_lib.c/scsi_req_map_sg . */
20366- struct scatterlist *sg, *sgl = (struct scatterlist *)sc->request_buffer;
20367- unsigned int data_len = sc->request_bufflen;
20368+ struct scatterlist *sg, *sgl = scsi_sglist(sc);
20369+ unsigned int data_len = scsi_bufflen(sc);
20370
20371- nr_pages = (sc->request_bufflen + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20372+ nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20373 if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20374 printk(KERN_ERR "scsifront: Unable to map request_buffer for command!\n");
20375 ref_cnt = (-E2BIG);
20376 goto big_to_sg;
20377 }
20378
20379- for_each_sg (sgl, sg, sc->use_sg, i) {
20380+ for_each_sg (sgl, sg, scsi_sg_count(sc), i) {
20381 page = sg_page(sg);
20382 off = sg->offset;
20383 len = sg->length;
20384@@ -306,45 +306,6 @@ static int map_data_for_request(struct v
20385 ref_cnt++;
20386 }
20387 }
20388- } else if (sc->request_bufflen) {
20389- unsigned long end = ((unsigned long)sc->request_buffer
20390- + sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
20391- unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT;
20392-
20393- page = virt_to_page(sc->request_buffer);
20394- nr_pages = end - start;
20395- len = sc->request_bufflen;
20396-
20397- if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20398- ref_cnt = (-E2BIG);
20399- goto big_to_sg;
20400- }
20401-
20402- buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
20403-
20404- off = offset_in_page((unsigned long)sc->request_buffer);
20405- for (i = 0; i < nr_pages; i++) {
20406- bytes = PAGE_SIZE - off;
20407-
20408- if (bytes > len)
20409- bytes = len;
20410-
20411- ref = gnttab_claim_grant_reference(&gref_head);
20412- BUG_ON(ref == -ENOSPC);
20413-
20414- gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
20415- buffer_pfn, write);
20416-
20417- info->shadow[id].gref[i] = ref;
20418- ring_req->seg[i].gref = ref;
20419- ring_req->seg[i].offset = (uint16_t)off;
20420- ring_req->seg[i].length = (uint16_t)bytes;
20421-
20422- buffer_pfn++;
20423- len -= bytes;
20424- off = 0;
20425- ref_cnt++;
20426- }
20427 }
20428
20429 big_to_sg:
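The rewritten map_data_for_request() above sizes the grant table for a scatter-gather request as (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT, i.e. the payload length plus its offset into the first page, rounded up to whole pages, and rejects anything above VSCSIIF_SG_TABLESIZE. A stand-alone check of that rounding (a sketch with PAGE_SIZE/PAGE_SHIFT fixed to the usual 4 KiB x86 values; span_pages() is a made-up helper name):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Pages spanned by a buffer of 'len' bytes that starts 'offset' bytes
 * into its first page - same formula as map_data_for_request(). */
static unsigned long span_pages(unsigned long len, unsigned long offset)
{
	return (len + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	/* 4096 bytes starting at page offset 0 -> exactly one page */
	printf("%lu\n", span_pages(4096, 0));    /* 1 */
	/* 4096 bytes starting 1 byte into a page -> spills into a second page */
	printf("%lu\n", span_pages(4096, 1));    /* 2 */
	/* 100 bytes near the end of a page -> still two pages */
	printf("%lu\n", span_pages(100, 4090));  /* 2 */
	return 0;
}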
20430--- a/drivers/xen/xenoprof/xenoprofile.c
20431+++ b/drivers/xen/xenoprof/xenoprofile.c
20432@@ -79,7 +79,7 @@ static int xenoprof_resume(struct sys_de
20433
20434
20435 static struct sysdev_class oprofile_sysclass = {
20436- set_kset_name("oprofile"),
20437+ .name = "oprofile",
20438 .resume = xenoprof_resume,
20439 .suspend = xenoprof_suspend
20440 };
20441--- a/include/asm-x86/mach-xen/asm/agp.h
20442+++ b/include/asm-x86/mach-xen/asm/agp.h
20443@@ -13,18 +13,13 @@
20444 * page. This avoids data corruption on some CPUs.
20445 */
20446
20447-/*
20448- * Caller's responsibility to call global_flush_tlb() for performance
20449- * reasons
20450- */
20451 #define map_page_into_agp(page) ( \
20452 xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
20453- ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
20454+ ?: set_pages_uc(page, 1))
20455 #define unmap_page_from_agp(page) ( \
20456 xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
20457 /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
20458- change_page_attr(page, 1, PAGE_KERNEL))
20459-#define flush_agp_mappings() global_flush_tlb()
20460+ set_pages_wb(page, 1))
20461
20462 /*
20463 * Could use CLFLUSH here if the cpu supports it. But then it would
20464--- a/include/asm-x86/mach-xen/asm/desc_32.h
20465+++ /dev/null
20466@@ -1,262 +0,0 @@
20467-#ifndef __ARCH_DESC_H
20468-#define __ARCH_DESC_H
20469-
20470-#include <asm/ldt.h>
20471-#include <asm/segment.h>
20472-
20473-#ifndef __ASSEMBLY__
20474-
20475-#include <linux/preempt.h>
20476-#include <linux/smp.h>
20477-
20478-#include <asm/mmu.h>
20479-
20480-struct Xgt_desc_struct {
20481- unsigned short size;
20482- unsigned long address __attribute__((packed));
20483- unsigned short pad;
20484-} __attribute__ ((packed));
20485-
20486-struct gdt_page
20487-{
20488- struct desc_struct gdt[GDT_ENTRIES];
20489-} __attribute__((aligned(PAGE_SIZE)));
20490-DECLARE_PER_CPU(struct gdt_page, gdt_page);
20491-
20492-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
20493-{
20494- return per_cpu(gdt_page, cpu).gdt;
20495-}
20496-
20497-extern struct Xgt_desc_struct idt_descr;
20498-extern struct desc_struct idt_table[];
20499-extern void set_intr_gate(unsigned int irq, void * addr);
20500-
20501-static inline void pack_descriptor(__u32 *a, __u32 *b,
20502- unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
20503-{
20504- *a = ((base & 0xffff) << 16) | (limit & 0xffff);
20505- *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
20506- (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
20507-}
20508-
20509-static inline void pack_gate(__u32 *a, __u32 *b,
20510- unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
20511-{
20512- *a = (seg << 16) | (base & 0xffff);
20513- *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
20514-}
20515-
20516-#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
20517-#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
20518-#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
20519-#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
20520-#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
20521-#define DESCTYPE_DPL3 0x60 /* DPL-3 */
20522-#define DESCTYPE_S 0x10 /* !system */
20523-
20524-#ifndef CONFIG_XEN
20525-#define load_TR_desc() native_load_tr_desc()
20526-#define load_gdt(dtr) native_load_gdt(dtr)
20527-#define load_idt(dtr) native_load_idt(dtr)
20528-#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
20529-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
20530-
20531-#define store_gdt(dtr) native_store_gdt(dtr)
20532-#define store_idt(dtr) native_store_idt(dtr)
20533-#define store_tr(tr) (tr = native_store_tr())
20534-#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
20535-
20536-#define load_TLS(t, cpu) native_load_tls(t, cpu)
20537-#define set_ldt native_set_ldt
20538-
20539-#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20540-#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20541-#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20542-
20543-static inline void write_dt_entry(struct desc_struct *dt,
20544- int entry, u32 entry_low, u32 entry_high)
20545-{
20546- dt[entry].a = entry_low;
20547- dt[entry].b = entry_high;
20548-}
20549-
20550-static inline void native_set_ldt(const void *addr, unsigned int entries)
20551-{
20552- if (likely(entries == 0))
20553- __asm__ __volatile__("lldt %w0"::"q" (0));
20554- else {
20555- unsigned cpu = smp_processor_id();
20556- __u32 a, b;
20557-
20558- pack_descriptor(&a, &b, (unsigned long)addr,
20559- entries * sizeof(struct desc_struct) - 1,
20560- DESCTYPE_LDT, 0);
20561- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
20562- __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
20563- }
20564-}
20565-
20566-
20567-static inline void native_load_tr_desc(void)
20568-{
20569- asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
20570-}
20571-
20572-static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
20573-{
20574- asm volatile("lgdt %0"::"m" (*dtr));
20575-}
20576-
20577-static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
20578-{
20579- asm volatile("lidt %0"::"m" (*dtr));
20580-}
20581-
20582-static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
20583-{
20584- asm ("sgdt %0":"=m" (*dtr));
20585-}
20586-
20587-static inline void native_store_idt(struct Xgt_desc_struct *dtr)
20588-{
20589- asm ("sidt %0":"=m" (*dtr));
20590-}
20591-
20592-static inline unsigned long native_store_tr(void)
20593-{
20594- unsigned long tr;
20595- asm ("str %0":"=r" (tr));
20596- return tr;
20597-}
20598-
20599-static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
20600-{
20601- unsigned int i;
20602- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
20603-
20604- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20605- gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
20606-}
20607-#else
20608-#define load_TLS(t, cpu) xen_load_tls(t, cpu)
20609-#define set_ldt xen_set_ldt
20610-
20611-extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
20612-extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
20613-
20614-static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
20615-{
20616- unsigned int i;
20617- struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
20618-
20619- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20620- if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20621- *(u64 *)&t->tls_array[i]))
20622- BUG();
20623-}
20624-#endif
20625-
20626-#ifndef CONFIG_X86_NO_IDT
20627-static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
20628-{
20629- __u32 a, b;
20630- pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
20631- write_idt_entry(idt_table, gate, a, b);
20632-}
20633-#endif
20634-
20635-#ifndef CONFIG_X86_NO_TSS
20636-static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
20637-{
20638- __u32 a, b;
20639- pack_descriptor(&a, &b, (unsigned long)addr,
20640- offsetof(struct tss_struct, __cacheline_filler) - 1,
20641- DESCTYPE_TSS, 0);
20642- write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
20643-}
20644-#endif
20645-
20646-
20647-#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
20648-
20649-#define LDT_entry_a(info) \
20650- ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
20651-
20652-#define LDT_entry_b(info) \
20653- (((info)->base_addr & 0xff000000) | \
20654- (((info)->base_addr & 0x00ff0000) >> 16) | \
20655- ((info)->limit & 0xf0000) | \
20656- (((info)->read_exec_only ^ 1) << 9) | \
20657- ((info)->contents << 10) | \
20658- (((info)->seg_not_present ^ 1) << 15) | \
20659- ((info)->seg_32bit << 22) | \
20660- ((info)->limit_in_pages << 23) | \
20661- ((info)->useable << 20) | \
20662- 0x7000)
20663-
20664-#define LDT_empty(info) (\
20665- (info)->base_addr == 0 && \
20666- (info)->limit == 0 && \
20667- (info)->contents == 0 && \
20668- (info)->read_exec_only == 1 && \
20669- (info)->seg_32bit == 0 && \
20670- (info)->limit_in_pages == 0 && \
20671- (info)->seg_not_present == 1 && \
20672- (info)->useable == 0 )
20673-
20674-static inline void clear_LDT(void)
20675-{
20676- set_ldt(NULL, 0);
20677-}
20678-
20679-/*
20680- * load one particular LDT into the current CPU
20681- */
20682-static inline void load_LDT_nolock(mm_context_t *pc)
20683-{
20684- set_ldt(pc->ldt, pc->size);
20685-}
20686-
20687-static inline void load_LDT(mm_context_t *pc)
20688-{
20689- preempt_disable();
20690- load_LDT_nolock(pc);
20691- preempt_enable();
20692-}
20693-
20694-static inline unsigned long get_desc_base(unsigned long *desc)
20695-{
20696- unsigned long base;
20697- base = ((desc[0] >> 16) & 0x0000ffff) |
20698- ((desc[1] << 16) & 0x00ff0000) |
20699- (desc[1] & 0xff000000);
20700- return base;
20701-}
20702-
20703-#else /* __ASSEMBLY__ */
20704-
20705-/*
20706- * GET_DESC_BASE reads the descriptor base of the specified segment.
20707- *
20708- * Args:
20709- * idx - descriptor index
20710- * gdt - GDT pointer
20711- * base - 32bit register to which the base will be written
20712- * lo_w - lo word of the "base" register
20713- * lo_b - lo byte of the "base" register
20714- * hi_b - hi byte of the low word of the "base" register
20715- *
20716- * Example:
20717- * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
20718- * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
20719- */
20720-#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
20721- movb idx*8+4(gdt), lo_b; \
20722- movb idx*8+7(gdt), hi_b; \
20723- shll $16, base; \
20724- movw idx*8+2(gdt), lo_w;
20725-
20726-#endif /* !__ASSEMBLY__ */
20727-
20728-#endif
20729--- a/include/asm-x86/mach-xen/asm/desc_64.h
20730+++ /dev/null
20731@@ -1,228 +0,0 @@
20732-/* Written 2000 by Andi Kleen */
20733-#ifndef __ARCH_DESC_H
20734-#define __ARCH_DESC_H
20735-
20736-#include <linux/threads.h>
20737-#include <asm/ldt.h>
20738-
20739-#ifndef __ASSEMBLY__
20740-
20741-#include <linux/string.h>
20742-#include <linux/smp.h>
20743-#include <asm/desc_defs.h>
20744-
20745-#include <asm/segment.h>
20746-#include <asm/mmu.h>
20747-
20748-extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
20749-
20750-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
20751-
20752-#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
20753-#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
20754-
20755-static inline void clear_LDT(void)
20756-{
20757- int cpu = get_cpu();
20758-
20759- /*
20760- * NB. We load the default_ldt for lcall7/27 handling on demand, as
20761- * it slows down context switching. Noone uses it anyway.
20762- */
20763- cpu = cpu; /* XXX avoid compiler warning */
20764- xen_set_ldt(NULL, 0);
20765- put_cpu();
20766-}
20767-
20768-#ifndef CONFIG_X86_NO_TSS
20769-static inline unsigned long __store_tr(void)
20770-{
20771- unsigned long tr;
20772-
20773- asm volatile ("str %w0":"=r" (tr));
20774- return tr;
20775-}
20776-
20777-#define store_tr(tr) (tr) = __store_tr()
20778-#endif
20779-
20780-/*
20781- * This is the ldt that every process will get unless we need
20782- * something other than this.
20783- */
20784-extern struct desc_struct default_ldt[];
20785-#ifndef CONFIG_X86_NO_IDT
20786-extern struct gate_struct idt_table[];
20787-#endif
20788-extern struct desc_ptr cpu_gdt_descr[];
20789-
20790-/* the cpu gdt accessor */
20791-#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
20792-
20793-#ifndef CONFIG_XEN
20794-static inline void load_gdt(const struct desc_ptr *ptr)
20795-{
20796- asm volatile("lgdt %w0"::"m" (*ptr));
20797-}
20798-
20799-static inline void store_gdt(struct desc_ptr *ptr)
20800-{
20801- asm("sgdt %w0":"=m" (*ptr));
20802-}
20803-#endif
20804-
20805-static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
20806-{
20807- struct gate_struct s;
20808- s.offset_low = PTR_LOW(func);
20809- s.segment = __KERNEL_CS;
20810- s.ist = ist;
20811- s.p = 1;
20812- s.dpl = dpl;
20813- s.zero0 = 0;
20814- s.zero1 = 0;
20815- s.type = type;
20816- s.offset_middle = PTR_MIDDLE(func);
20817- s.offset_high = PTR_HIGH(func);
20818- /* does not need to be atomic because it is only done once at setup time */
20819- memcpy(adr, &s, 16);
20820-}
20821-
20822-#ifndef CONFIG_X86_NO_IDT
20823-static inline void set_intr_gate(int nr, void *func)
20824-{
20825- BUG_ON((unsigned)nr > 0xFF);
20826- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
20827-}
20828-
20829-static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
20830-{
20831- BUG_ON((unsigned)nr > 0xFF);
20832- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
20833-}
20834-
20835-static inline void set_system_gate(int nr, void *func)
20836-{
20837- BUG_ON((unsigned)nr > 0xFF);
20838- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
20839-}
20840-
20841-static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
20842-{
20843- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
20844-}
20845-
20846-static inline void load_idt(const struct desc_ptr *ptr)
20847-{
20848- asm volatile("lidt %w0"::"m" (*ptr));
20849-}
20850-
20851-static inline void store_idt(struct desc_ptr *dtr)
20852-{
20853- asm("sidt %w0":"=m" (*dtr));
20854-}
20855-#endif
20856-
20857-static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
20858- unsigned size)
20859-{
20860- struct ldttss_desc d;
20861- memset(&d,0,sizeof(d));
20862- d.limit0 = size & 0xFFFF;
20863- d.base0 = PTR_LOW(tss);
20864- d.base1 = PTR_MIDDLE(tss) & 0xFF;
20865- d.type = type;
20866- d.p = 1;
20867- d.limit1 = (size >> 16) & 0xF;
20868- d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
20869- d.base3 = PTR_HIGH(tss);
20870- memcpy(ptr, &d, 16);
20871-}
20872-
20873-#ifndef CONFIG_X86_NO_TSS
20874-static inline void set_tss_desc(unsigned cpu, void *addr)
20875-{
20876- /*
20877- * sizeof(unsigned long) coming from an extra "long" at the end
20878- * of the iobitmap. See tss_struct definition in processor.h
20879- *
20880- * -1? seg base+limit should be pointing to the address of the
20881- * last valid byte
20882- */
20883- set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
20884- (unsigned long)addr, DESC_TSS,
20885- IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
20886-}
20887-#endif
20888-
20889-static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
20890-{
20891- set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
20892- DESC_LDT, size * 8 - 1);
20893-}
20894-
20895-#define LDT_entry_a(info) \
20896- ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
20897-/* Don't allow setting of the lm bit. It is useless anyways because
20898- 64bit system calls require __USER_CS. */
20899-#define LDT_entry_b(info) \
20900- (((info)->base_addr & 0xff000000) | \
20901- (((info)->base_addr & 0x00ff0000) >> 16) | \
20902- ((info)->limit & 0xf0000) | \
20903- (((info)->read_exec_only ^ 1) << 9) | \
20904- ((info)->contents << 10) | \
20905- (((info)->seg_not_present ^ 1) << 15) | \
20906- ((info)->seg_32bit << 22) | \
20907- ((info)->limit_in_pages << 23) | \
20908- ((info)->useable << 20) | \
20909- /* ((info)->lm << 21) | */ \
20910- 0x7000)
20911-
20912-#define LDT_empty(info) (\
20913- (info)->base_addr == 0 && \
20914- (info)->limit == 0 && \
20915- (info)->contents == 0 && \
20916- (info)->read_exec_only == 1 && \
20917- (info)->seg_32bit == 0 && \
20918- (info)->limit_in_pages == 0 && \
20919- (info)->seg_not_present == 1 && \
20920- (info)->useable == 0 && \
20921- (info)->lm == 0)
20922-
20923-static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
20924-{
20925- unsigned int i;
20926- u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
20927-
20928- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20929- if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20930- t->tls_array[i]))
20931- BUG();
20932-}
20933-
20934-/*
20935- * load one particular LDT into the current CPU
20936- */
20937-static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
20938-{
20939- void *segments = pc->ldt;
20940- int count = pc->size;
20941-
20942- if (likely(!count))
20943- segments = NULL;
20944-
20945- xen_set_ldt(segments, count);
20946-}
20947-
20948-static inline void load_LDT(mm_context_t *pc)
20949-{
20950- int cpu = get_cpu();
20951- load_LDT_nolock(pc, cpu);
20952- put_cpu();
20953-}
20954-
20955-extern struct desc_ptr idt_descr;
20956-
20957-#endif /* !__ASSEMBLY__ */
20958-
20959-#endif
20960--- a/include/asm-x86/mach-xen/asm/desc.h
20961+++ b/include/asm-x86/mach-xen/asm/desc.h
20962@@ -1,5 +1,404 @@
20963+#ifndef _ASM_DESC_H_
20964+#define _ASM_DESC_H_
20965+
20966+#ifndef __ASSEMBLY__
20967+#include <asm/desc_defs.h>
20968+#include <asm/ldt.h>
20969+#include <asm/mmu.h>
20970+#include <linux/smp.h>
20971+
20972+static inline void fill_ldt(struct desc_struct *desc,
20973+ const struct user_desc *info)
20974+{
20975+ desc->limit0 = info->limit & 0x0ffff;
20976+ desc->base0 = info->base_addr & 0x0000ffff;
20977+
20978+ desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
20979+ desc->type = (info->read_exec_only ^ 1) << 1;
20980+ desc->type |= info->contents << 2;
20981+ desc->s = 1;
20982+ desc->dpl = 0x3;
20983+ desc->p = info->seg_not_present ^ 1;
20984+ desc->limit = (info->limit & 0xf0000) >> 16;
20985+ desc->avl = info->useable;
20986+ desc->d = info->seg_32bit;
20987+ desc->g = info->limit_in_pages;
20988+ desc->base2 = (info->base_addr & 0xff000000) >> 24;
20989+}
20990+
20991+#ifndef CONFIG_X86_NO_IDT
20992+extern struct desc_ptr idt_descr;
20993+extern gate_desc idt_table[];
20994+#endif
20995+
20996+#ifdef CONFIG_X86_64
20997+extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
20998+extern struct desc_ptr cpu_gdt_descr[];
20999+/* the cpu gdt accessor */
21000+#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
21001+
21002+static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
21003+ unsigned dpl, unsigned ist, unsigned seg)
21004+{
21005+ gate->offset_low = PTR_LOW(func);
21006+ gate->segment = __KERNEL_CS;
21007+ gate->ist = ist;
21008+ gate->p = 1;
21009+ gate->dpl = dpl;
21010+ gate->zero0 = 0;
21011+ gate->zero1 = 0;
21012+ gate->type = type;
21013+ gate->offset_middle = PTR_MIDDLE(func);
21014+ gate->offset_high = PTR_HIGH(func);
21015+}
21016+
21017+#else
21018+struct gdt_page {
21019+ struct desc_struct gdt[GDT_ENTRIES];
21020+} __attribute__((aligned(PAGE_SIZE)));
21021+DECLARE_PER_CPU(struct gdt_page, gdt_page);
21022+
21023+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
21024+{
21025+ return per_cpu(gdt_page, cpu).gdt;
21026+}
21027+
21028+static inline void pack_gate(gate_desc *gate, unsigned char type,
21029+ unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
21030+
21031+{
21032+ gate->a = (seg << 16) | (base & 0xffff);
21033+ gate->b = (base & 0xffff0000) |
21034+ (((0x80 | type | (dpl << 5)) & 0xff) << 8);
21035+}
21036+
21037+#endif
21038+
21039+static inline int desc_empty(const void *ptr)
21040+{
21041+ const u32 *desc = ptr;
21042+ return !(desc[0] | desc[1]);
21043+}
21044+
21045+#ifndef CONFIG_XEN
21046+#define load_TR_desc() native_load_tr_desc()
21047+#define load_gdt(dtr) native_load_gdt(dtr)
21048+#define load_idt(dtr) native_load_idt(dtr)
21049+#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
21050+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
21051+
21052+#define store_gdt(dtr) native_store_gdt(dtr)
21053+#define store_idt(dtr) native_store_idt(dtr)
21054+#define store_tr(tr) (tr = native_store_tr())
21055+#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
21056+
21057+#define load_TLS(t, cpu) native_load_tls(t, cpu)
21058+#define set_ldt native_set_ldt
21059+
21060+#define write_ldt_entry(dt, entry, desc) \
21061+ native_write_ldt_entry(dt, entry, desc)
21062+#define write_gdt_entry(dt, entry, desc, type) \
21063+ native_write_gdt_entry(dt, entry, desc, type)
21064+#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
21065+
21066+static inline void native_write_idt_entry(gate_desc *idt, int entry,
21067+ const gate_desc *gate)
21068+{
21069+ memcpy(&idt[entry], gate, sizeof(*gate));
21070+}
21071+
21072+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
21073+ const void *desc)
21074+{
21075+ memcpy(&ldt[entry], desc, 8);
21076+}
21077+
21078+static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
21079+ const void *desc, int type)
21080+{
21081+ unsigned int size;
21082+ switch (type) {
21083+ case DESC_TSS:
21084+ size = sizeof(tss_desc);
21085+ break;
21086+ case DESC_LDT:
21087+ size = sizeof(ldt_desc);
21088+ break;
21089+ default:
21090+ size = sizeof(struct desc_struct);
21091+ break;
21092+ }
21093+ memcpy(&gdt[entry], desc, size);
21094+}
21095+#endif
21096+
21097+static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
21098+ unsigned long limit, unsigned char type,
21099+ unsigned char flags)
21100+{
21101+ desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
21102+ desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
21103+ (limit & 0x000f0000) | ((type & 0xff) << 8) |
21104+ ((flags & 0xf) << 20);
21105+ desc->p = 1;
21106+}
21107+
21108+
21109+#ifndef CONFIG_XEN
21110+static inline void set_tssldt_descriptor(void *d, unsigned long addr,
21111+ unsigned type, unsigned size)
21112+{
21113+#ifdef CONFIG_X86_64
21114+ struct ldttss_desc64 *desc = d;
21115+ memset(desc, 0, sizeof(*desc));
21116+ desc->limit0 = size & 0xFFFF;
21117+ desc->base0 = PTR_LOW(addr);
21118+ desc->base1 = PTR_MIDDLE(addr) & 0xFF;
21119+ desc->type = type;
21120+ desc->p = 1;
21121+ desc->limit1 = (size >> 16) & 0xF;
21122+ desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
21123+ desc->base3 = PTR_HIGH(addr);
21124+#else
21125+
21126+ pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
21127+#endif
21128+}
21129+
21130+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
21131+{
21132+ struct desc_struct *d = get_cpu_gdt_table(cpu);
21133+ tss_desc tss;
21134+
21135+ /*
21136+ * sizeof(unsigned long) coming from an extra "long" at the end
21137+ * of the iobitmap. See tss_struct definition in processor.h
21138+ *
21139+ * -1? seg base+limit should be pointing to the address of the
21140+ * last valid byte
21141+ */
21142+ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
21143+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
21144+ write_gdt_entry(d, entry, &tss, DESC_TSS);
21145+}
21146+
21147+#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
21148+
21149+static inline void native_set_ldt(const void *addr, unsigned int entries)
21150+{
21151+ if (likely(entries == 0))
21152+ __asm__ __volatile__("lldt %w0"::"q" (0));
21153+ else {
21154+ unsigned cpu = smp_processor_id();
21155+ ldt_desc ldt;
21156+
21157+ set_tssldt_descriptor(&ldt, (unsigned long)addr,
21158+ DESC_LDT, entries * sizeof(ldt) - 1);
21159+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
21160+ &ldt, DESC_LDT);
21161+ __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
21162+ }
21163+}
21164+
21165+static inline void native_load_tr_desc(void)
21166+{
21167+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
21168+}
21169+
21170+static inline void native_load_gdt(const struct desc_ptr *dtr)
21171+{
21172+ asm volatile("lgdt %0"::"m" (*dtr));
21173+}
21174+
21175+static inline void native_load_idt(const struct desc_ptr *dtr)
21176+{
21177+ asm volatile("lidt %0"::"m" (*dtr));
21178+}
21179+
21180+static inline void native_store_gdt(struct desc_ptr *dtr)
21181+{
21182+ asm volatile("sgdt %0":"=m" (*dtr));
21183+}
21184+
21185+static inline void native_store_idt(struct desc_ptr *dtr)
21186+{
21187+ asm volatile("sidt %0":"=m" (*dtr));
21188+}
21189+
21190+static inline unsigned long native_store_tr(void)
21191+{
21192+ unsigned long tr;
21193+ asm volatile("str %0":"=r" (tr));
21194+ return tr;
21195+}
21196+
21197+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
21198+{
21199+ unsigned int i;
21200+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
21201+
21202+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
21203+ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
21204+}
21205+#else
21206+#define load_TLS(t, cpu) xen_load_tls(t, cpu)
21207+#define set_ldt xen_set_ldt
21208+
21209+extern int write_ldt_entry(struct desc_struct *ldt, int entry,
21210+ const void *desc);
21211+extern int write_gdt_entry(struct desc_struct *gdt, int entry,
21212+ const void *desc, int type);
21213+
21214+static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
21215+{
21216+ unsigned int i;
21217+ struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
21218+
21219+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
21220+ if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
21221+ *(u64 *)&t->tls_array[i]))
21222+ BUG();
21223+}
21224+#endif
21225+
21226+#define _LDT_empty(info) (\
21227+ (info)->base_addr == 0 && \
21228+ (info)->limit == 0 && \
21229+ (info)->contents == 0 && \
21230+ (info)->read_exec_only == 1 && \
21231+ (info)->seg_32bit == 0 && \
21232+ (info)->limit_in_pages == 0 && \
21233+ (info)->seg_not_present == 1 && \
21234+ (info)->useable == 0)
21235+
21236+#ifdef CONFIG_X86_64
21237+#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
21238+#else
21239+#define LDT_empty(info) (_LDT_empty(info))
21240+#endif
21241+
21242+static inline void clear_LDT(void)
21243+{
21244+ set_ldt(NULL, 0);
21245+}
21246+
21247+/*
21248+ * load one particular LDT into the current CPU
21249+ */
21250+static inline void load_LDT_nolock(mm_context_t *pc)
21251+{
21252+ set_ldt(pc->ldt, pc->size);
21253+}
21254+
21255+static inline void load_LDT(mm_context_t *pc)
21256+{
21257+ preempt_disable();
21258+ load_LDT_nolock(pc);
21259+ preempt_enable();
21260+}
21261+
21262+static inline unsigned long get_desc_base(const struct desc_struct *desc)
21263+{
21264+ return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
21265+}
21266+
21267+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
21268+{
21269+ return desc->limit0 | (desc->limit << 16);
21270+}
21271+
21272+#ifndef CONFIG_X86_NO_IDT
21273+static inline void _set_gate(int gate, unsigned type, void *addr,
21274+ unsigned dpl, unsigned ist, unsigned seg)
21275+{
21276+ gate_desc s;
21277+ pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
21278+ /*
21279+ * does not need to be atomic because it is only done once at
21280+ * setup time
21281+ */
21282+ write_idt_entry(idt_table, gate, &s);
21283+}
21284+
21285+/*
21286+ * This needs to use 'idt_table' rather than 'idt', and
21287+ * thus use the _nonmapped_ version of the IDT, as the
21288+ * Pentium F0 0F bugfix can have resulted in the mapped
21289+ * IDT being write-protected.
21290+ */
21291+static inline void set_intr_gate(unsigned int n, void *addr)
21292+{
21293+ BUG_ON((unsigned)n > 0xFF);
21294+ _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
21295+}
21296+
21297+/*
21298+ * This routine sets up an interrupt gate at directory privilege level 3.
21299+ */
21300+static inline void set_system_intr_gate(unsigned int n, void *addr)
21301+{
21302+ BUG_ON((unsigned)n > 0xFF);
21303+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
21304+}
21305+
21306+static inline void set_trap_gate(unsigned int n, void *addr)
21307+{
21308+ BUG_ON((unsigned)n > 0xFF);
21309+ _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
21310+}
21311+
21312+static inline void set_system_gate(unsigned int n, void *addr)
21313+{
21314+ BUG_ON((unsigned)n > 0xFF);
21315 #ifdef CONFIG_X86_32
21316-# include "desc_32.h"
21317+ _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
21318+#else
21319+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
21320+#endif
21321+}
21322+
21323+static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
21324+{
21325+ BUG_ON((unsigned)n > 0xFF);
21326+ _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
21327+}
21328+
21329+static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
21330+{
21331+ BUG_ON((unsigned)n > 0xFF);
21332+ _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
21333+}
21334+
21335+static inline void set_system_gate_ist(int n, void *addr, unsigned ist)
21336+{
21337+ BUG_ON((unsigned)n > 0xFF);
21338+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
21339+}
21340+#endif
21341+
21342 #else
21343-# include "desc_64.h"
21344+/*
21345+ * GET_DESC_BASE reads the descriptor base of the specified segment.
21346+ *
21347+ * Args:
21348+ * idx - descriptor index
21349+ * gdt - GDT pointer
21350+ * base - 32bit register to which the base will be written
21351+ * lo_w - lo word of the "base" register
21352+ * lo_b - lo byte of the "base" register
21353+ * hi_b - hi byte of the low word of the "base" register
21354+ *
21355+ * Example:
21356+ * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
21357+ * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
21358+ */
21359+#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
21360+ movb idx*8+4(gdt), lo_b; \
21361+ movb idx*8+7(gdt), hi_b; \
21362+ shll $16, base; \
21363+ movw idx*8+2(gdt), lo_w;
21364+
21365+
21366+#endif /* __ASSEMBLY__ */
21367+
21368 #endif
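The consolidated desc.h above packs a user_desc into the split base/limit fields of a descriptor (fill_ldt()) and recovers them with get_desc_base()/get_desc_limit(). The following stand-alone sketch round-trips a base and limit through the same field split; demo_desc and pack_segment are invented names, and the bitfield declaration simply mirrors the kernel's desc_struct (the round trip itself does not depend on the exact bit packing, since it only reads back the fields it set):

#include <stdio.h>
#include <stdint.h>

/* Field layout as in struct desc_struct: limit0/base0 in the low dword,
 * base1/base2 plus the flag bits in the high dword. */
struct demo_desc {
	uint16_t limit0;
	uint16_t base0;
	unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
	unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
};

/* Same base/limit splitting as fill_ldt(), minus the user_desc plumbing. */
static void pack_segment(struct demo_desc *d, uint32_t base, uint32_t limit)
{
	d->limit0 = limit & 0x0ffff;
	d->base0  = base & 0x0000ffff;
	d->base1  = (base & 0x00ff0000) >> 16;
	d->limit  = (limit & 0xf0000) >> 16;
	d->base2  = (base & 0xff000000) >> 24;
	d->type = 0x2;               /* data, writable */
	d->s = 1; d->dpl = 3; d->p = 1;
	d->avl = 0; d->l = 0; d->d = 1; d->g = 0;
}

/* Mirrors get_desc_base()/get_desc_limit() from the new desc.h. */
static uint32_t desc_base(const struct demo_desc *d)
{
	return d->base0 | (d->base1 << 16) | ((uint32_t)d->base2 << 24);
}

static uint32_t desc_limit(const struct demo_desc *d)
{
	return d->limit0 | (d->limit << 16);
}

int main(void)
{
	struct demo_desc d = { 0 };

	pack_segment(&d, 0x12345678u, 0xabcdeu);
	printf("base  = %#x\n", (unsigned)desc_base(&d));   /* 0x12345678 */
	printf("limit = %#x\n", (unsigned)desc_limit(&d));  /* 0xabcde    */
	return 0;
}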
21369--- a/include/asm-x86/mach-xen/asm/dma-mapping_32.h
21370+++ b/include/asm-x86/mach-xen/asm/dma-mapping_32.h
21371@@ -84,23 +84,13 @@ dma_sync_single_range_for_device(struct
21372 dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
21373 }
21374
21375-static inline void
21376+extern void
21377 dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
21378- enum dma_data_direction direction)
21379-{
21380- if (swiotlb)
21381- swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
21382- flush_write_buffers();
21383-}
21384+ enum dma_data_direction direction);
21385
21386-static inline void
21387+extern void
21388 dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
21389- enum dma_data_direction direction)
21390-{
21391- if (swiotlb)
21392- swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
21393- flush_write_buffers();
21394-}
21395+ enum dma_data_direction direction);
21396
21397 extern int
21398 dma_mapping_error(dma_addr_t dma_addr);
21399--- a/include/asm-x86/mach-xen/asm/fixmap_32.h
21400+++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
21401@@ -64,7 +64,7 @@ enum fixed_addresses {
21402 #endif
21403 #ifdef CONFIG_X86_VISWS_APIC
21404 FIX_CO_CPU, /* Cobalt timer */
21405- FIX_CO_APIC, /* Cobalt APIC Redirection Table */
21406+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
21407 FIX_LI_PCIA, /* Lithium PCI Bridge A */
21408 FIX_LI_PCIB, /* Lithium PCI Bridge B */
21409 #endif
21410@@ -73,7 +73,7 @@ enum fixed_addresses {
21411 #endif
21412 #ifdef CONFIG_X86_CYCLONE_TIMER
21413 FIX_CYCLONE_TIMER, /*cyclone timer register*/
21414-#endif
21415+#endif
21416 #ifdef CONFIG_HIGHMEM
21417 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
21418 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
21419@@ -93,11 +93,23 @@ enum fixed_addresses {
21420 FIX_ISAMAP_END,
21421 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21422 __end_of_permanent_fixed_addresses,
21423- /* temporary boot-time mappings, used before ioremap() is functional */
21424-#define NR_FIX_BTMAPS 16
21425- FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21426- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21427+ /*
21428+ * 256 temporary boot-time mappings, used by early_ioremap(),
21429+ * before ioremap() is functional.
21430+ *
21431+ * We round it up to the next 512 pages boundary so that we
21432+ * can have a single pgd entry and a single pte table:
21433+ */
21434+#define NR_FIX_BTMAPS 64
21435+#define FIX_BTMAPS_NESTING 4
21436+ FIX_BTMAP_END =
21437+ __end_of_permanent_fixed_addresses + 512 -
21438+ (__end_of_permanent_fixed_addresses & 511),
21439+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21440 FIX_WP_TEST,
21441+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21442+ FIX_OHCI1394_BASE,
21443+#endif
21444 __end_of_fixed_addresses
21445 };
21446
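The new fixmap layout above reserves NR_FIX_BTMAPS * FIX_BTMAPS_NESTING = 256 early_ioremap() slots and, as the comment says, rounds FIX_BTMAP_END up to the next multiple of 512 fixmap entries so the whole range is covered by a single pgd entry and pte table. The arithmetic can be checked in isolation (btmap_end() is an illustrative helper, not part of the patch):

#include <stdio.h>

#define NR_FIX_BTMAPS      64
#define FIX_BTMAPS_NESTING 4

/* Same expression as the FIX_BTMAP_END definition above: bump the index
 * up to the next multiple of 512. */
static unsigned int btmap_end(unsigned int end_of_permanent)
{
	return end_of_permanent + 512 - (end_of_permanent & 511);
}

int main(void)
{
	unsigned int ends[] = { 37, 511, 512, 1000 };

	for (unsigned int i = 0; i < sizeof(ends) / sizeof(ends[0]); i++) {
		unsigned int end = btmap_end(ends[i]);
		unsigned int begin = end + NR_FIX_BTMAPS * FIX_BTMAPS_NESTING - 1;

		printf("end_of_permanent=%4u -> FIX_BTMAP_END=%4u, FIX_BTMAP_BEGIN=%4u\n",
		       ends[i], end, begin);
	}
	return 0;
}

For example, an end index of 37 yields FIX_BTMAP_END = 512 and FIX_BTMAP_BEGIN = 767, i.e. all 256 slots sit inside one 512-entry pte table.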
21447--- a/include/asm-x86/mach-xen/asm/fixmap_64.h
21448+++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
21449@@ -15,6 +15,7 @@
21450 #include <asm/apicdef.h>
21451 #include <asm/page.h>
21452 #include <asm/vsyscall.h>
21453+#include <asm/efi.h>
21454 #include <asm/acpi.h>
21455
21456 /*
21457@@ -46,6 +47,10 @@ enum fixed_addresses {
21458 FIX_IO_APIC_BASE_0,
21459 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
21460 #endif
21461+#ifdef CONFIG_EFI
21462+ FIX_EFI_IO_MAP_LAST_PAGE,
21463+ FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
21464+#endif
21465 #ifdef CONFIG_ACPI
21466 FIX_ACPI_BEGIN,
21467 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
21468@@ -55,10 +60,22 @@ enum fixed_addresses {
21469 FIX_ISAMAP_END,
21470 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21471 __end_of_permanent_fixed_addresses,
21472- /* temporary boot-time mappings, used before ioremap() is functional */
21473-#define NR_FIX_BTMAPS 16
21474- FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21475- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21476+ /*
21477+ * 256 temporary boot-time mappings, used by early_ioremap(),
21478+ * before ioremap() is functional.
21479+ *
21480+ * We round it up to the next 512 pages boundary so that we
21481+ * can have a single pgd entry and a single pte table:
21482+ */
21483+#define NR_FIX_BTMAPS 64
21484+#define FIX_BTMAPS_NESTING 4
21485+ FIX_BTMAP_END =
21486+ __end_of_permanent_fixed_addresses + 512 -
21487+ (__end_of_permanent_fixed_addresses & 511),
21488+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21489+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21490+ FIX_OHCI1394_BASE,
21491+#endif
21492 __end_of_fixed_addresses
21493 };
21494
21495--- a/include/asm-x86/mach-xen/asm/highmem.h
21496+++ b/include/asm-x86/mach-xen/asm/highmem.h
21497@@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table;
21498 * easily, subsequent pte tables have to be allocated in one physical
21499 * chunk of RAM.
21500 */
21501-#ifdef CONFIG_X86_PAE
21502-#define LAST_PKMAP 512
21503-#else
21504-#define LAST_PKMAP 1024
21505-#endif
21506 /*
21507 * Ordering is:
21508 *
21509@@ -57,13 +52,12 @@ extern pte_t *pkmap_page_table;
21510 * VMALLOC_START
21511 * high_memory
21512 */
21513-#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
21514 #define LAST_PKMAP_MASK (LAST_PKMAP-1)
21515 #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
21516 #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
21517
21518-extern void * FASTCALL(kmap_high(struct page *page));
21519-extern void FASTCALL(kunmap_high(struct page *page));
21520+extern void *kmap_high(struct page *page);
21521+extern void kunmap_high(struct page *page);
21522
21523 void *kmap(struct page *page);
21524 void kunmap(struct page *page);
21525--- a/include/asm-x86/mach-xen/asm/hypervisor.h
21526+++ b/include/asm-x86/mach-xen/asm/hypervisor.h
21527@@ -264,6 +264,25 @@ HYPERVISOR_poll(
21528 return rc;
21529 }
21530
21531+static inline int __must_check
21532+HYPERVISOR_poll_no_timeout(
21533+ evtchn_port_t *ports, unsigned int nr_ports)
21534+{
21535+ int rc;
21536+ struct sched_poll sched_poll = {
21537+ .nr_ports = nr_ports
21538+ };
21539+ set_xen_guest_handle(sched_poll.ports, ports);
21540+
21541+ rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
21542+#if CONFIG_XEN_COMPAT <= 0x030002
21543+ if (rc == -ENOSYS)
21544+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
21545+#endif
21546+
21547+ return rc;
21548+}
21549+
21550 #ifdef CONFIG_XEN
21551
21552 static inline void
21553--- a/include/asm-x86/mach-xen/asm/io_32.h
21554+++ b/include/asm-x86/mach-xen/asm/io_32.h
21555@@ -113,8 +113,6 @@ static inline void * phys_to_virt(unsign
21556 ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
21557 bvec_to_pseudophys((vec2))))
21558
21559-extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
21560-
21561 /**
21562 * ioremap - map bus memory into CPU space
21563 * @offset: bus address of the memory
21564@@ -124,32 +122,39 @@ extern void __iomem * __ioremap(unsigned
21565 * make bus memory CPU accessible via the readb/readw/readl/writeb/
21566 * writew/writel functions and the other mmio helpers. The returned
21567 * address is not guaranteed to be usable directly as a virtual
21568- * address.
21569+ * address.
21570 *
21571 * If the area you are trying to map is a PCI BAR you should have a
21572 * look at pci_iomap().
21573 */
21574+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
21575+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
21576
21577-static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
21578+/*
21579+ * The default ioremap() behavior is non-cached:
21580+ */
21581+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
21582 {
21583- return __ioremap(offset, size, 0);
21584+ return ioremap_nocache(offset, size);
21585 }
21586
21587-extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
21588 extern void iounmap(volatile void __iomem *addr);
21589
21590 /*
21591- * bt_ioremap() and bt_iounmap() are for temporary early boot-time
21592+ * early_ioremap() and early_iounmap() are for temporary early boot-time
21593 * mappings, before the real ioremap() is functional.
21594 * A boot-time mapping is currently limited to at most 16 pages.
21595 */
21596-extern void *bt_ioremap(unsigned long offset, unsigned long size);
21597-extern void bt_iounmap(void *addr, unsigned long size);
21598+extern void early_ioremap_init(void);
21599+extern void early_ioremap_clear(void);
21600+extern void early_ioremap_reset(void);
21601+extern void *early_ioremap(unsigned long offset, unsigned long size);
21602+extern void early_iounmap(void *addr, unsigned long size);
21603 extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
21604
21605 /* Use early IO mappings for DMI because it's initialized early */
21606-#define dmi_ioremap bt_ioremap
21607-#define dmi_iounmap bt_iounmap
21608+#define dmi_ioremap early_ioremap
21609+#define dmi_iounmap early_iounmap
21610 #define dmi_alloc alloc_bootmem
21611
21612 /*
21613@@ -263,43 +268,21 @@ static inline void flush_write_buffers(v
21614
21615 #endif /* __KERNEL__ */
21616
21617-static inline void xen_io_delay(void)
21618-{
21619- asm volatile("outb %%al,$0x80" : : : "memory");
21620-}
21621+extern void xen_io_delay(void);
21622+#define native_io_delay xen_io_delay
21623+
21624+extern int io_delay_type;
21625+extern void io_delay_init(void);
21626
21627 static inline void slow_down_io(void) {
21628- xen_io_delay();
21629+ native_io_delay();
21630 #ifdef REALLY_SLOW_IO
21631- xen_io_delay();
21632- xen_io_delay();
21633- xen_io_delay();
21634+ native_io_delay();
21635+ native_io_delay();
21636+ native_io_delay();
21637 #endif
21638 }
21639
21640-#ifdef CONFIG_X86_NUMAQ
21641-extern void *xquad_portio; /* Where the IO area was mapped */
21642-#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
21643-#define __BUILDIO(bwl,bw,type) \
21644-static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
21645- if (xquad_portio) \
21646- write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
21647- else \
21648- out##bwl##_local(value, port); \
21649-} \
21650-static inline void out##bwl(unsigned type value, int port) { \
21651- out##bwl##_quad(value, port, 0); \
21652-} \
21653-static inline unsigned type in##bwl##_quad(int port, int quad) { \
21654- if (xquad_portio) \
21655- return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
21656- else \
21657- return in##bwl##_local(port); \
21658-} \
21659-static inline unsigned type in##bwl(int port) { \
21660- return in##bwl##_quad(port, 0); \
21661-}
21662-#else
21663 #define __BUILDIO(bwl,bw,type) \
21664 static inline void out##bwl(unsigned type value, int port) { \
21665 out##bwl##_local(value, port); \
21666@@ -307,8 +290,6 @@ static inline void out##bwl(unsigned typ
21667 static inline unsigned type in##bwl(int port) { \
21668 return in##bwl##_local(port); \
21669 }
21670-#endif
21671-
21672
21673 #define BUILDIO(bwl,bw,type) \
21674 static inline void out##bwl##_local(unsigned type value, int port) { \
21675--- a/include/asm-x86/mach-xen/asm/io_64.h
21676+++ b/include/asm-x86/mach-xen/asm/io_64.h
21677@@ -36,13 +36,21 @@
21678 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
21679 */
21680
21681-#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
21682+extern void xen_io_delay(void);
21683+#define native_io_delay xen_io_delay
21684
21685+extern int io_delay_type;
21686+extern void io_delay_init(void);
21687+
21688+static inline void slow_down_io(void)
21689+{
21690+ native_io_delay();
21691 #ifdef REALLY_SLOW_IO
21692-#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
21693-#else
21694-#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
21695+ native_io_delay();
21696+ native_io_delay();
21697+ native_io_delay();
21698 #endif
21699+}
21700
21701 /*
21702 * Talk about misusing macros..
21703@@ -53,9 +61,15 @@ static inline void out##s(unsigned x val
21704 #define __OUT2(s,s1,s2) \
21705 __asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
21706
21707+#ifndef REALLY_SLOW_IO
21708+#define REALLY_SLOW_IO
21709+#define UNSET_REALLY_SLOW_IO
21710+#endif
21711+
21712 #define __OUT(s,s1,x) \
21713 __OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
21714-__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
21715+__OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
21716+ slow_down_io(); }
21717
21718 #define __IN1(s) \
21719 static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
21720@@ -64,8 +78,13 @@ static inline RETURN_TYPE in##s(unsigned
21721 __asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
21722
21723 #define __IN(s,s1,i...) \
21724-__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
21725-__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
21726+__IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); return _v; } \
21727+__IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
21728+ slow_down_io(); return _v; }
21729+
21730+#ifdef UNSET_REALLY_SLOW_IO
21731+#undef REALLY_SLOW_IO
21732+#endif
21733
21734 #define __INS(s) \
21735 static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
21736@@ -143,25 +162,30 @@ static inline void * phys_to_virt(unsign
21737
21738 #include <asm-generic/iomap.h>
21739
21740-extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
21741-
21742-static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
21743-{
21744- return __ioremap(offset, size, 0);
21745-}
21746-
21747-extern void *bt_ioremap(unsigned long addr, unsigned long size);
21748-extern void bt_iounmap(void *addr, unsigned long size);
21749-#define early_ioremap bt_ioremap
21750-#define early_iounmap bt_iounmap
21751+extern void early_ioremap_init(void);
21752+extern void early_ioremap_clear(void);
21753+extern void early_ioremap_reset(void);
21754+extern void *early_ioremap(unsigned long addr, unsigned long size);
21755+extern void early_iounmap(void *addr, unsigned long size);
21756
21757 /*
21758 * This one maps high address device memory and turns off caching for that area.
21759 * it's useful if some control registers are in such an area and write combining
21760 * or read caching is not desirable:
21761 */
21762-extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
21763+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
21764+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
21765+
21766+/*
21767+ * The default ioremap() behavior is non-cached:
21768+ */
21769+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
21770+{
21771+ return ioremap_nocache(offset, size);
21772+}
21773+
21774 extern void iounmap(volatile void __iomem *addr);
21775+
21776 extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
21777
21778 /*
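The io_64.h hunk above replaces the old __SLOW_DOWN_IO string pasting with slow_down_io() and wraps the __OUT/__IN definitions in a temporary REALLY_SLOW_IO define: if the macro was not already set, it is defined for that region and UNSET_REALLY_SLOW_IO records that it must be undefined again afterwards, so the forced setting does not leak to code that includes the header. A small stand-alone demonstration of that save-and-restore preprocessor guard (FEATURE_X is a hypothetical stand-in for REALLY_SLOW_IO):

#include <stdio.h>

/* Force FEATURE_X on for a region, but only #undef it afterwards if we
 * were the ones who defined it - the same pattern as the
 * REALLY_SLOW_IO/UNSET_REALLY_SLOW_IO pair above. */
#ifndef FEATURE_X
#define FEATURE_X
#define UNSET_FEATURE_X
#endif

static const char *guarded_region(void)
{
#ifdef FEATURE_X
	return "FEATURE_X visible inside the guarded region";
#else
	return "FEATURE_X not visible inside the guarded region";
#endif
}

#ifdef UNSET_FEATURE_X
#undef FEATURE_X
#endif

static const char *after_region(void)
{
#ifdef FEATURE_X
	return "FEATURE_X was already defined by the includer and is left alone";
#else
	return "FEATURE_X restored to its original (undefined) state";
#endif
}

int main(void)
{
	puts(guarded_region());
	puts(after_region());
	return 0;
}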
21779--- a/include/asm-x86/mach-xen/asm/irqflags_32.h
21780+++ /dev/null
21781@@ -1,212 +0,0 @@
21782-/*
21783- * include/asm-i386/irqflags.h
21784- *
21785- * IRQ flags handling
21786- *
21787- * This file gets included from lowlevel asm headers too, to provide
21788- * wrapped versions of the local_irq_*() APIs, based on the
21789- * raw_local_irq_*() functions from the lowlevel headers.
21790- */
21791-#ifndef _ASM_IRQFLAGS_H
21792-#define _ASM_IRQFLAGS_H
21793-
21794-#ifndef __ASSEMBLY__
21795-#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
21796-
21797-#define xen_restore_fl(f) \
21798-do { \
21799- vcpu_info_t *_vcpu; \
21800- barrier(); \
21801- _vcpu = current_vcpu_info(); \
21802- if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
21803- barrier(); /* unmask then check (avoid races) */\
21804- if (unlikely(_vcpu->evtchn_upcall_pending)) \
21805- force_evtchn_callback(); \
21806- } \
21807-} while (0)
21808-
21809-#define xen_irq_disable() \
21810-do { \
21811- current_vcpu_info()->evtchn_upcall_mask = 1; \
21812- barrier(); \
21813-} while (0)
21814-
21815-#define xen_irq_enable() \
21816-do { \
21817- vcpu_info_t *_vcpu; \
21818- barrier(); \
21819- _vcpu = current_vcpu_info(); \
21820- _vcpu->evtchn_upcall_mask = 0; \
21821- barrier(); /* unmask then check (avoid races) */ \
21822- if (unlikely(_vcpu->evtchn_upcall_pending)) \
21823- force_evtchn_callback(); \
21824-} while (0)
21825-
21826-void xen_safe_halt(void);
21827-
21828-void xen_halt(void);
21829-
21830-/*
21831- * The use of 'barrier' in the following reflects their use as local-lock
21832- * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21833- * critical operations are executed. All critical operations must complete
21834- * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21835- * includes these barriers, for example.
21836- */
21837-
21838-#define __raw_local_save_flags() xen_save_fl()
21839-
21840-#define raw_local_irq_restore(flags) xen_restore_fl(flags)
21841-
21842-#define raw_local_irq_disable() xen_irq_disable()
21843-
21844-#define raw_local_irq_enable() xen_irq_enable()
21845-
21846-/*
21847- * Used in the idle loop; sti takes one instruction cycle
21848- * to complete:
21849- */
21850-static inline void raw_safe_halt(void)
21851-{
21852- xen_safe_halt();
21853-}
21854-
21855-/*
21856- * Used when interrupts are already enabled or to
21857- * shutdown the processor:
21858- */
21859-static inline void halt(void)
21860-{
21861- xen_halt();
21862-}
21863-
21864-/*
21865- * For spinlocks, etc:
21866- */
21867-#define __raw_local_irq_save() \
21868-({ \
21869- unsigned long flags = __raw_local_save_flags(); \
21870- \
21871- raw_local_irq_disable(); \
21872- \
21873- flags; \
21874-})
21875-
21876-#else
21877-/* Offsets into shared_info_t. */
21878-#define evtchn_upcall_pending /* 0 */
21879-#define evtchn_upcall_mask 1
21880-
21881-#define sizeof_vcpu_shift 6
21882-
21883-#ifdef CONFIG_SMP
21884-#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
21885- shl $sizeof_vcpu_shift,%esi ; \
21886- addl HYPERVISOR_shared_info,%esi
21887-#else
21888-#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
21889-#endif
21890-
21891-#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
21892-#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
21893-#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
21894-#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21895- __DISABLE_INTERRUPTS
21896-#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21897- __ENABLE_INTERRUPTS
21898-#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
21899-sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
21900- __TEST_PENDING ; \
21901- jnz 14f /* process more events if necessary... */ ; \
21902- movl PT_ESI(%esp), %esi ; \
21903- sysexit ; \
21904-14: __DISABLE_INTERRUPTS ; \
21905- TRACE_IRQS_OFF ; \
21906-sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
21907- push %esp ; \
21908- call evtchn_do_upcall ; \
21909- add $4,%esp ; \
21910- jmp ret_from_intr
21911-#define INTERRUPT_RETURN iret
21912-#endif /* __ASSEMBLY__ */
21913-
21914-#ifndef __ASSEMBLY__
21915-#define raw_local_save_flags(flags) \
21916- do { (flags) = __raw_local_save_flags(); } while (0)
21917-
21918-#define raw_local_irq_save(flags) \
21919- do { (flags) = __raw_local_irq_save(); } while (0)
21920-
21921-static inline int raw_irqs_disabled_flags(unsigned long flags)
21922-{
21923- return (flags != 0);
21924-}
21925-
21926-#define raw_irqs_disabled() \
21927-({ \
21928- unsigned long flags = __raw_local_save_flags(); \
21929- \
21930- raw_irqs_disabled_flags(flags); \
21931-})
21932-
21933-/*
21934- * makes the traced hardirq state match with the machine state
21935- *
21936- * should be a rarely used function, only in places where its
21937- * otherwise impossible to know the irq state, like in traps.
21938- */
21939-static inline void trace_hardirqs_fixup_flags(unsigned long flags)
21940-{
21941- if (raw_irqs_disabled_flags(flags))
21942- trace_hardirqs_off();
21943- else
21944- trace_hardirqs_on();
21945-}
21946-
21947-#define trace_hardirqs_fixup() \
21948- trace_hardirqs_fixup_flags(__raw_local_save_flags())
21949-#endif /* __ASSEMBLY__ */
21950-
21951-/*
21952- * Do the CPU's IRQ-state tracing from assembly code. We call a
21953- * C function, so save all the C-clobbered registers:
21954- */
21955-#ifdef CONFIG_TRACE_IRQFLAGS
21956-
21957-# define TRACE_IRQS_ON \
21958- pushl %eax; \
21959- pushl %ecx; \
21960- pushl %edx; \
21961- call trace_hardirqs_on; \
21962- popl %edx; \
21963- popl %ecx; \
21964- popl %eax;
21965-
21966-# define TRACE_IRQS_OFF \
21967- pushl %eax; \
21968- pushl %ecx; \
21969- pushl %edx; \
21970- call trace_hardirqs_off; \
21971- popl %edx; \
21972- popl %ecx; \
21973- popl %eax;
21974-
21975-#else
21976-# define TRACE_IRQS_ON
21977-# define TRACE_IRQS_OFF
21978-#endif
21979-
21980-#ifdef CONFIG_DEBUG_LOCK_ALLOC
21981-# define LOCKDEP_SYS_EXIT \
21982- pushl %eax; \
21983- pushl %ecx; \
21984- pushl %edx; \
21985- call lockdep_sys_exit; \
21986- popl %edx; \
21987- popl %ecx; \
21988- popl %eax;
21989-#else
21990-# define LOCKDEP_SYS_EXIT
21991-#endif
21992-
21993-#endif
21994--- a/include/asm-x86/mach-xen/asm/irqflags_64.h
21995+++ /dev/null
21996@@ -1,178 +0,0 @@
21997-/*
21998- * include/asm-x86_64/irqflags.h
21999- *
22000- * IRQ flags handling
22001- *
22002- * This file gets included from lowlevel asm headers too, to provide
22003- * wrapped versions of the local_irq_*() APIs, based on the
22004- * raw_local_irq_*() functions from the lowlevel headers.
22005- */
22006-#ifndef _ASM_IRQFLAGS_H
22007-#define _ASM_IRQFLAGS_H
22008-#include <asm/processor-flags.h>
22009-
22010-#ifndef __ASSEMBLY__
22011-/*
22012- * Interrupt control:
22013- */
22014-
22015-/*
22016- * The use of 'barrier' in the following reflects their use as local-lock
22017- * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
22018- * critical operations are executed. All critical operations must complete
22019- * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
22020- * includes these barriers, for example.
22021- */
22022-
22023-#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
22024-
22025-#define raw_local_save_flags(flags) \
22026- do { (flags) = __raw_local_save_flags(); } while (0)
22027-
22028-#define raw_local_irq_restore(x) \
22029-do { \
22030- vcpu_info_t *_vcpu; \
22031- barrier(); \
22032- _vcpu = current_vcpu_info(); \
22033- if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
22034- barrier(); /* unmask then check (avoid races) */ \
22035- if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
22036- force_evtchn_callback(); \
22037- } \
22038-} while (0)
22039-
22040-#ifdef CONFIG_X86_VSMP
22041-
22042-/*
22043- * Interrupt control for the VSMP architecture:
22044- */
22045-
22046-static inline void raw_local_irq_disable(void)
22047-{
22048- unsigned long flags = __raw_local_save_flags();
22049-
22050- raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
22051-}
22052-
22053-static inline void raw_local_irq_enable(void)
22054-{
22055- unsigned long flags = __raw_local_save_flags();
22056-
22057- raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
22058-}
22059-
22060-static inline int raw_irqs_disabled_flags(unsigned long flags)
22061-{
22062- return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
22063-}
22064-
22065-#else /* CONFIG_X86_VSMP */
22066-
22067-#define raw_local_irq_disable() \
22068-do { \
22069- current_vcpu_info()->evtchn_upcall_mask = 1; \
22070- barrier(); \
22071-} while (0)
22072-
22073-#define raw_local_irq_enable() \
22074-do { \
22075- vcpu_info_t *_vcpu; \
22076- barrier(); \
22077- _vcpu = current_vcpu_info(); \
22078- _vcpu->evtchn_upcall_mask = 0; \
22079- barrier(); /* unmask then check (avoid races) */ \
22080- if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
22081- force_evtchn_callback(); \
22082-} while (0)
22083-
22084-static inline int raw_irqs_disabled_flags(unsigned long flags)
22085-{
22086- return (flags != 0);
22087-}
22088-
22089-#endif
22090-
22091-/*
22092- * For spinlocks, etc.:
22093- */
22094-
22095-#define __raw_local_irq_save() \
22096-({ \
22097- unsigned long flags = __raw_local_save_flags(); \
22098- \
22099- raw_local_irq_disable(); \
22100- \
22101- flags; \
22102-})
22103-
22104-#define raw_local_irq_save(flags) \
22105- do { (flags) = __raw_local_irq_save(); } while (0)
22106-
22107-#define raw_irqs_disabled() \
22108-({ \
22109- unsigned long flags = __raw_local_save_flags(); \
22110- \
22111- raw_irqs_disabled_flags(flags); \
22112-})
22113-
22114-/*
22115- * makes the traced hardirq state match with the machine state
22116- *
22117- * should be a rarely used function, only in places where its
22118- * otherwise impossible to know the irq state, like in traps.
22119- */
22120-static inline void trace_hardirqs_fixup_flags(unsigned long flags)
22121-{
22122- if (raw_irqs_disabled_flags(flags))
22123- trace_hardirqs_off();
22124- else
22125- trace_hardirqs_on();
22126-}
22127-
22128-#define trace_hardirqs_fixup() \
22129- trace_hardirqs_fixup_flags(__raw_local_save_flags())
22130-/*
22131- * Used in the idle loop; sti takes one instruction cycle
22132- * to complete:
22133- */
22134-void xen_safe_halt(void);
22135-static inline void raw_safe_halt(void)
22136-{
22137- xen_safe_halt();
22138-}
22139-
22140-/*
22141- * Used when interrupts are already enabled or to
22142- * shutdown the processor:
22143- */
22144-void xen_halt(void);
22145-static inline void halt(void)
22146-{
22147- xen_halt();
22148-}
22149-
22150-#else /* __ASSEMBLY__: */
22151-# ifdef CONFIG_TRACE_IRQFLAGS
22152-# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
22153-# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk
22154-# else
22155-# define TRACE_IRQS_ON
22156-# define TRACE_IRQS_OFF
22157-# endif
22158-# ifdef CONFIG_DEBUG_LOCK_ALLOC
22159-# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
22160-# define LOCKDEP_SYS_EXIT_IRQ \
22161- TRACE_IRQS_ON; \
22162- sti; \
22163- SAVE_REST; \
22164- LOCKDEP_SYS_EXIT; \
22165- RESTORE_REST; \
22166- cli; \
22167- TRACE_IRQS_OFF;
22168-# else
22169-# define LOCKDEP_SYS_EXIT
22170-# define LOCKDEP_SYS_EXIT_IRQ
22171-# endif
22172-#endif
22173-
22174-#endif
22175--- a/include/asm-x86/mach-xen/asm/irqflags.h
22176+++ b/include/asm-x86/mach-xen/asm/irqflags.h
22177@@ -1,5 +1,247 @@
22178-#ifdef CONFIG_X86_32
22179-# include "irqflags_32.h"
22180+#ifndef _X86_IRQFLAGS_H_
22181+#define _X86_IRQFLAGS_H_
22182+
22183+#include <asm/processor-flags.h>
22184+
22185+#ifndef __ASSEMBLY__
22186+/*
22187+ * The use of 'barrier' in the following reflects their use as local-lock
22188+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
22189+ * critical operations are executed. All critical operations must complete
22190+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
22191+ * includes these barriers, for example.
22192+ */
22193+
22194+#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
22195+
22196+#define xen_restore_fl(f) \
22197+do { \
22198+ vcpu_info_t *_vcpu; \
22199+ barrier(); \
22200+ _vcpu = current_vcpu_info(); \
22201+ if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
22202+ barrier(); /* unmask then check (avoid races) */\
22203+ if (unlikely(_vcpu->evtchn_upcall_pending)) \
22204+ force_evtchn_callback(); \
22205+ } \
22206+} while (0)
22207+
22208+#define xen_irq_disable() \
22209+do { \
22210+ current_vcpu_info()->evtchn_upcall_mask = 1; \
22211+ barrier(); \
22212+} while (0)
22213+
22214+#define xen_irq_enable() \
22215+do { \
22216+ vcpu_info_t *_vcpu; \
22217+ barrier(); \
22218+ _vcpu = current_vcpu_info(); \
22219+ _vcpu->evtchn_upcall_mask = 0; \
22220+ barrier(); /* unmask then check (avoid races) */ \
22221+ if (unlikely(_vcpu->evtchn_upcall_pending)) \
22222+ force_evtchn_callback(); \
22223+} while (0)
22224+
22225+void xen_safe_halt(void);
22226+
22227+void xen_halt(void);
22228+
22229+#define __raw_local_save_flags() xen_save_fl()
22230+
22231+#define raw_local_irq_restore(flags) xen_restore_fl(flags)
22232+
22233+#define raw_local_irq_disable() xen_irq_disable()
22234+
22235+#define raw_local_irq_enable() xen_irq_enable()
22236+
22237+/*
22238+ * Used in the idle loop; sti takes one instruction cycle
22239+ * to complete:
22240+ */
22241+static inline void raw_safe_halt(void)
22242+{
22243+ xen_safe_halt();
22244+}
22245+
22246+/*
22247+ * Used when interrupts are already enabled or to
22248+ * shutdown the processor:
22249+ */
22250+static inline void halt(void)
22251+{
22252+ xen_halt();
22253+}
22254+
22255+/*
22256+ * For spinlocks, etc:
22257+ */
22258+#define __raw_local_irq_save() \
22259+({ \
22260+ unsigned long flags = __raw_local_save_flags(); \
22261+ \
22262+ raw_local_irq_disable(); \
22263+ \
22264+ flags; \
22265+})
22266 #else
22267-# include "irqflags_64.h"
22268+
22269+/* Offsets into shared_info_t. */
22270+#define evtchn_upcall_pending /* 0 */
22271+#define evtchn_upcall_mask 1
22272+
22273+#define sizeof_vcpu_shift 6
22274+
22275+#ifdef CONFIG_X86_64
22276+# define __REG_si %rsi
22277+# define __CPU_num %gs:pda_cpunumber
22278+#else
22279+# define __REG_si %esi
22280+# define __CPU_num TI_cpu(%ebp)
22281+#endif
22282+
22283+#ifdef CONFIG_SMP
22284+#define GET_VCPU_INFO movl __CPU_num,%esi ; \
22285+ shl $sizeof_vcpu_shift,%esi ; \
22286+ add HYPERVISOR_shared_info,__REG_si
22287+#else
22288+#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si
22289+#endif
22290+
22291+#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si)
22292+#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si)
22293+#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si)
22294+#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
22295+ __DISABLE_INTERRUPTS
22296+#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
22297+ __ENABLE_INTERRUPTS
22298+
22299+#ifndef CONFIG_X86_64
22300+#define INTERRUPT_RETURN iret
22301+#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
22302+sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
22303+ __TEST_PENDING ; \
22304+ jnz 14f /* process more events if necessary... */ ; \
22305+ movl PT_ESI(%esp), %esi ; \
22306+ sysexit ; \
22307+14: __DISABLE_INTERRUPTS ; \
22308+ TRACE_IRQS_OFF ; \
22309+sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
22310+ push %esp ; \
22311+ call evtchn_do_upcall ; \
22312+ add $4,%esp ; \
22313+ jmp ret_from_intr
22314+#endif
22315+
22316+
22317+#endif /* __ASSEMBLY__ */
22318+
22319+#ifndef __ASSEMBLY__
22320+#define raw_local_save_flags(flags) \
22321+ do { (flags) = __raw_local_save_flags(); } while (0)
22322+
22323+#define raw_local_irq_save(flags) \
22324+ do { (flags) = __raw_local_irq_save(); } while (0)
22325+
22326+static inline int raw_irqs_disabled_flags(unsigned long flags)
22327+{
22328+ return (flags != 0);
22329+}
22330+
22331+#define raw_irqs_disabled() \
22332+({ \
22333+ unsigned long flags = __raw_local_save_flags(); \
22334+ \
22335+ raw_irqs_disabled_flags(flags); \
22336+})
22337+
22338+/*
22339+ * makes the traced hardirq state match with the machine state
22340+ *
22341+ * should be a rarely used function, only in places where its
22342+ * otherwise impossible to know the irq state, like in traps.
22343+ */
22344+static inline void trace_hardirqs_fixup_flags(unsigned long flags)
22345+{
22346+ if (raw_irqs_disabled_flags(flags))
22347+ trace_hardirqs_off();
22348+ else
22349+ trace_hardirqs_on();
22350+}
22351+
22352+#define trace_hardirqs_fixup() \
22353+ trace_hardirqs_fixup_flags(__raw_local_save_flags())
22354+
22355+#else
22356+
22357+#ifdef CONFIG_X86_64
22358+/*
22359+ * Currently paravirt can't handle swapgs nicely when we
22360+ * don't have a stack we can rely on (such as a user space
22361+ * stack). So we either find a way around these or just fault
22362+ * and emulate if a guest tries to call swapgs directly.
22363+ *
22364+ * Either way, this is a good way to document that we don't
22365+ * have a reliable stack. x86_64 only.
22366+ */
22367+#define SWAPGS_UNSAFE_STACK swapgs
22368+#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
22369+#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
22370+#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
22371+#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
22372+ TRACE_IRQS_ON; \
22373+ ENABLE_INTERRUPTS(CLBR_NONE); \
22374+ SAVE_REST; \
22375+ LOCKDEP_SYS_EXIT; \
22376+ RESTORE_REST; \
22377+ __DISABLE_INTERRUPTS; \
22378+ TRACE_IRQS_OFF;
22379+
22380+#else
22381+#define ARCH_TRACE_IRQS_ON \
22382+ pushl %eax; \
22383+ pushl %ecx; \
22384+ pushl %edx; \
22385+ call trace_hardirqs_on; \
22386+ popl %edx; \
22387+ popl %ecx; \
22388+ popl %eax;
22389+
22390+#define ARCH_TRACE_IRQS_OFF \
22391+ pushl %eax; \
22392+ pushl %ecx; \
22393+ pushl %edx; \
22394+ call trace_hardirqs_off; \
22395+ popl %edx; \
22396+ popl %ecx; \
22397+ popl %eax;
22398+
22399+#define ARCH_LOCKDEP_SYS_EXIT \
22400+ pushl %eax; \
22401+ pushl %ecx; \
22402+ pushl %edx; \
22403+ call lockdep_sys_exit; \
22404+ popl %edx; \
22405+ popl %ecx; \
22406+ popl %eax;
22407+
22408+#define ARCH_LOCKDEP_SYS_EXIT_IRQ
22409+#endif
22410+
22411+#ifdef CONFIG_TRACE_IRQFLAGS
22412+# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
22413+# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
22414+#else
22415+# define TRACE_IRQS_ON
22416+# define TRACE_IRQS_OFF
22417+#endif
22418+#ifdef CONFIG_DEBUG_LOCK_ALLOC
22419+# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
22420+# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
22421+# else
22422+# define LOCKDEP_SYS_EXIT
22423+# define LOCKDEP_SYS_EXIT_IRQ
22424+# endif
22425+
22426+#endif /* __ASSEMBLY__ */
22427 #endif
22428--- a/include/asm-x86/mach-xen/asm/maddr_32.h
22429+++ b/include/asm-x86/mach-xen/asm/maddr_32.h
22430@@ -1,6 +1,7 @@
22431 #ifndef _I386_MADDR_H
22432 #define _I386_MADDR_H
22433
22434+#include <asm/bug.h>
22435 #include <xen/features.h>
22436 #include <xen/interface/xen.h>
22437
22438@@ -151,25 +152,9 @@ static inline paddr_t pte_machine_to_phy
22439 phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
22440 return phys;
22441 }
22442-#endif
22443-
22444-#ifdef CONFIG_X86_PAE
22445-#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } )
22446-extern unsigned long long __supported_pte_mask;
22447-static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
22448-{
22449- pte_t pte;
22450-
22451- pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
22452- (pgprot_val(pgprot) >> 32);
22453- pte.pte_high &= (__supported_pte_mask >> 32);
22454- pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
22455- __supported_pte_mask;
22456- return pte;
22457-}
22458 #else
22459-#define __pte_ma(x) ((pte_t) { (x) } )
22460-#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
22461+#define pte_phys_to_machine phys_to_machine
22462+#define pte_machine_to_phys machine_to_phys
22463 #endif
22464
22465 #else /* !CONFIG_XEN */
22466--- a/include/asm-x86/mach-xen/asm/maddr_64.h
22467+++ b/include/asm-x86/mach-xen/asm/maddr_64.h
22468@@ -1,6 +1,7 @@
22469 #ifndef _X86_64_MADDR_H
22470 #define _X86_64_MADDR_H
22471
22472+#include <asm/bug.h>
22473 #include <xen/features.h>
22474 #include <xen/interface/xen.h>
22475
22476@@ -16,6 +17,7 @@ typedef unsigned long maddr_t;
22477 #ifdef CONFIG_XEN
22478
22479 extern unsigned long *phys_to_machine_mapping;
22480+extern unsigned long max_mapnr;
22481
22482 #undef machine_to_phys_mapping
22483 extern unsigned long *machine_to_phys_mapping;
22484@@ -25,7 +27,7 @@ static inline unsigned long pfn_to_mfn(u
22485 {
22486 if (xen_feature(XENFEAT_auto_translated_physmap))
22487 return pfn;
22488- BUG_ON(end_pfn && pfn >= end_pfn);
22489+ BUG_ON(max_mapnr && pfn >= max_mapnr);
22490 return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
22491 }
22492
22493@@ -33,7 +35,7 @@ static inline int phys_to_machine_mappin
22494 {
22495 if (xen_feature(XENFEAT_auto_translated_physmap))
22496 return 1;
22497- BUG_ON(end_pfn && pfn >= end_pfn);
22498+ BUG_ON(max_mapnr && pfn >= max_mapnr);
22499 return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
22500 }
22501
22502@@ -45,7 +47,7 @@ static inline unsigned long mfn_to_pfn(u
22503 return mfn;
22504
22505 if (unlikely((mfn >> machine_to_phys_order) != 0))
22506- return end_pfn;
22507+ return max_mapnr;
22508
22509 /* The array access can fail (e.g., device space beyond end of RAM). */
22510 asm (
22511@@ -60,7 +62,7 @@ static inline unsigned long mfn_to_pfn(u
22512 " .quad 1b,3b\n"
22513 ".previous"
22514 : "=r" (pfn)
22515- : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
22516+ : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
22517
22518 return pfn;
22519 }
22520@@ -88,16 +90,16 @@ static inline unsigned long mfn_to_pfn(u
22521 static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
22522 {
22523 unsigned long pfn = mfn_to_pfn(mfn);
22524- if ((pfn < end_pfn)
22525+ if ((pfn < max_mapnr)
22526 && !xen_feature(XENFEAT_auto_translated_physmap)
22527 && (phys_to_machine_mapping[pfn] != mfn))
22528- return end_pfn; /* force !pfn_valid() */
22529+ return max_mapnr; /* force !pfn_valid() */
22530 return pfn;
22531 }
22532
22533 static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
22534 {
22535- BUG_ON(end_pfn && pfn >= end_pfn);
22536+ BUG_ON(max_mapnr && pfn >= max_mapnr);
22537 if (xen_feature(XENFEAT_auto_translated_physmap)) {
22538 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
22539 return;
22540@@ -135,9 +137,6 @@ static inline paddr_t pte_machine_to_phy
22541 return phys;
22542 }
22543
22544-#define __pte_ma(x) ((pte_t) { (x) } )
22545-#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
22546-
22547 #else /* !CONFIG_XEN */
22548
22549 #define pfn_to_mfn(pfn) (pfn)
22550--- a/include/asm-x86/mach-xen/asm/mmu_context_32.h
22551+++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h
22552@@ -51,8 +51,6 @@ static inline void __prepare_arch_switch
22553 : : "r" (0) );
22554 }
22555
22556-void leave_mm(unsigned long cpu);
22557-
22558 static inline void switch_mm(struct mm_struct *prev,
22559 struct mm_struct *next,
22560 struct task_struct *tsk)
22561--- a/include/asm-x86/mach-xen/asm/mmu_context_64.h
22562+++ b/include/asm-x86/mach-xen/asm/mmu_context_64.h
22563@@ -62,12 +62,6 @@ extern void mm_pin(struct mm_struct *mm)
22564 extern void mm_unpin(struct mm_struct *mm);
22565 void mm_pin_all(void);
22566
22567-static inline void load_cr3(pgd_t *pgd)
22568-{
22569- asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
22570- "memory");
22571-}
22572-
22573 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
22574 struct task_struct *tsk)
22575 {
22576@@ -97,7 +91,7 @@ static inline void switch_mm(struct mm_s
22577 op++;
22578
22579 if (unlikely(next->context.ldt != prev->context.ldt)) {
22580- /* load_LDT_nolock(&next->context, cpu) */
22581+ /* load_LDT_nolock(&next->context) */
22582 op->cmd = MMUEXT_SET_LDT;
22583 op->arg1.linear_addr = (unsigned long)next->context.ldt;
22584 op->arg2.nr_ents = next->context.size;
22585@@ -110,7 +104,7 @@ static inline void switch_mm(struct mm_s
22586 else {
22587 write_pda(mmu_state, TLBSTATE_OK);
22588 if (read_pda(active_mm) != next)
22589- out_of_line_bug();
22590+ BUG();
22591 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
22592 /* We were in lazy tlb mode and leave_mm disabled
22593 * tlb flush IPI delivery. We must reload CR3
22594@@ -118,7 +112,7 @@ static inline void switch_mm(struct mm_s
22595 */
22596 load_cr3(next->pgd);
22597 xen_new_user_pt(__pa(__user_pgd(next->pgd)));
22598- load_LDT_nolock(&next->context, cpu);
22599+ load_LDT_nolock(&next->context);
22600 }
22601 }
22602 #endif
22603--- a/include/asm-x86/mach-xen/asm/page_64.h
22604+++ b/include/asm-x86/mach-xen/asm/page_64.h
22605@@ -1,37 +1,9 @@
22606 #ifndef _X86_64_PAGE_H
22607 #define _X86_64_PAGE_H
22608
22609-/* #include <linux/string.h> */
22610-#ifndef __ASSEMBLY__
22611-#include <linux/kernel.h>
22612-#include <linux/types.h>
22613-#include <asm/bug.h>
22614-#endif
22615-#include <linux/const.h>
22616-#include <xen/interface/xen.h>
22617-
22618-/*
22619- * Need to repeat this here in order to not include pgtable.h (which in turn
22620- * depends on definitions made here), but to be able to use the symbolic
22621- * below. The preprocessor will warn if the two definitions aren't identical.
22622- */
22623-#define _PAGE_PRESENT 0x001
22624-#define _PAGE_IO 0x200
22625-
22626-/* PAGE_SHIFT determines the page size */
22627-#define PAGE_SHIFT 12
22628-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
22629-#define PAGE_MASK (~(PAGE_SIZE-1))
22630-
22631-/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22632-#define __PHYSICAL_MASK_SHIFT 46
22633-#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
22634-#define __VIRTUAL_MASK_SHIFT 48
22635-#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22636-
22637-#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
22638+#define PAGETABLE_LEVELS 4
22639
22640-#define THREAD_ORDER 1
22641+#define THREAD_ORDER 1
22642 #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
22643 #define CURRENT_MASK (~(THREAD_SIZE-1))
22644
22645@@ -51,106 +23,10 @@
22646 #define MCE_STACK 5
22647 #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
22648
22649-#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
22650-#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
22651-
22652-#define HPAGE_SHIFT PMD_SHIFT
22653-#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
22654-#define HPAGE_MASK (~(HPAGE_SIZE - 1))
22655-#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
22656-
22657-#ifdef __KERNEL__
22658-#ifndef __ASSEMBLY__
22659-
22660-extern unsigned long end_pfn;
22661-
22662-#include <asm/maddr.h>
22663-
22664-void clear_page(void *);
22665-void copy_page(void *, void *);
22666-
22667-#define clear_user_page(page, vaddr, pg) clear_page(page)
22668-#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
22669-
22670-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22671- alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22672-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22673-
22674-/*
22675- * These are used to make use of C type-checking..
22676- */
22677-typedef struct { unsigned long pte; } pte_t;
22678-typedef struct { unsigned long pmd; } pmd_t;
22679-typedef struct { unsigned long pud; } pud_t;
22680-typedef struct { unsigned long pgd; } pgd_t;
22681-#define PTE_MASK PHYSICAL_PAGE_MASK
22682-
22683-typedef struct { unsigned long pgprot; } pgprot_t;
22684-
22685-#define __pte_val(x) ((x).pte)
22686-#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \
22687- == _PAGE_PRESENT ? \
22688- pte_machine_to_phys(__pte_val(x)) : \
22689- __pte_val(x))
22690-
22691-#define __pmd_val(x) ((x).pmd)
22692-static inline unsigned long pmd_val(pmd_t x)
22693-{
22694- unsigned long ret = __pmd_val(x);
22695-#if CONFIG_XEN_COMPAT <= 0x030002
22696- if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
22697-#else
22698- if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22699-#endif
22700- return ret;
22701-}
22702-
22703-#define __pud_val(x) ((x).pud)
22704-static inline unsigned long pud_val(pud_t x)
22705-{
22706- unsigned long ret = __pud_val(x);
22707- if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22708- return ret;
22709-}
22710-
22711-#define __pgd_val(x) ((x).pgd)
22712-static inline unsigned long pgd_val(pgd_t x)
22713-{
22714- unsigned long ret = __pgd_val(x);
22715- if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22716- return ret;
22717-}
22718-
22719-#define pgprot_val(x) ((x).pgprot)
22720-
22721-static inline pte_t __pte(unsigned long x)
22722-{
22723- if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22724- x = pte_phys_to_machine(x);
22725- return ((pte_t) { (x) });
22726-}
22727-
22728-static inline pmd_t __pmd(unsigned long x)
22729-{
22730- if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22731- return ((pmd_t) { (x) });
22732-}
22733-
22734-static inline pud_t __pud(unsigned long x)
22735-{
22736- if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22737- return ((pud_t) { (x) });
22738-}
22739-
22740-static inline pgd_t __pgd(unsigned long x)
22741-{
22742- if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22743- return ((pgd_t) { (x) });
22744-}
22745-
22746-#define __pgprot(x) ((pgprot_t) { (x) } )
22747+#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
22748+#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
22749
22750-#endif /* !__ASSEMBLY__ */
22751+#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
22752
22753 #define __PHYSICAL_START CONFIG_PHYSICAL_START
22754 #define __KERNEL_ALIGN 0x200000
22755@@ -166,52 +42,58 @@ static inline pgd_t __pgd(unsigned long
22756
22757 #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
22758 #define __START_KERNEL_map _AC(0xffffffff80000000, UL)
22759-#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
22760
22761 #if CONFIG_XEN_COMPAT <= 0x030002
22762 #undef LOAD_OFFSET
22763 #define LOAD_OFFSET 0
22764 #endif
22765
22766-/* to align the pointer to the (next) page boundary */
22767-#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22768-
22769-#define KERNEL_TEXT_SIZE (40*1024*1024)
22770-#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
22771+/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22772+#define __PHYSICAL_MASK_SHIFT 46
22773+#define __VIRTUAL_MASK_SHIFT 48
22774
22775-#define PAGE_OFFSET __PAGE_OFFSET
22776+/*
22777+ * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
22778+ * arch/x86/kernel/head_64.S), and it is mapped here:
22779+ */
22780+#define KERNEL_IMAGE_SIZE (128*1024*1024)
22781+#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
22782
22783 #ifndef __ASSEMBLY__
22784+void clear_page(void *page);
22785+void copy_page(void *to, void *from);
22786+
22787+extern unsigned long end_pfn;
22788+extern unsigned long end_pfn_map;
22789+
22790 static inline unsigned long __phys_addr(unsigned long x)
22791 {
22792- return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
22793+ return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : __PAGE_OFFSET);
22794 }
22795-#endif
22796
22797-#define __pa(x) __phys_addr((unsigned long)(x))
22798-#define __pa_symbol(x) __phys_addr((unsigned long)(x))
22799+#define __phys_reloc_hide(x) (x)
22800
22801-#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
22802-#define __boot_va(x) __va(x)
22803-#define __boot_pa(x) __pa(x)
22804-#ifdef CONFIG_FLATMEM
22805-#define pfn_valid(pfn) ((pfn) < end_pfn)
22806-#endif
22807+/*
22808+ * These are used to make use of C type-checking..
22809+ */
22810+typedef unsigned long pteval_t;
22811+typedef unsigned long pmdval_t;
22812+typedef unsigned long pudval_t;
22813+typedef unsigned long pgdval_t;
22814+typedef unsigned long pgprotval_t;
22815+typedef unsigned long phys_addr_t;
22816
22817-#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
22818-#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
22819-#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
22820-
22821-#define VM_DATA_DEFAULT_FLAGS \
22822- (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22823- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22824+typedef struct page *pgtable_t;
22825+
22826+typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
22827
22828-#define __HAVE_ARCH_GATE_AREA 1
22829 #define vmemmap ((struct page *)VMEMMAP_START)
22830
22831-#include <asm-generic/memory_model.h>
22832-#include <asm-generic/page.h>
22833+#endif /* !__ASSEMBLY__ */
22834+
22835+#ifdef CONFIG_FLATMEM
22836+#define pfn_valid(pfn) ((pfn) < max_mapnr)
22837+#endif
22838
22839-#endif /* __KERNEL__ */
22840
22841 #endif /* _X86_64_PAGE_H */
22842--- a/include/asm-x86/mach-xen/asm/page.h
22843+++ b/include/asm-x86/mach-xen/asm/page.h
22844@@ -1,13 +1,231 @@
22845+#ifndef _ASM_X86_PAGE_H
22846+#define _ASM_X86_PAGE_H
22847+
22848+#include <linux/const.h>
22849+
22850+/* PAGE_SHIFT determines the page size */
22851+#define PAGE_SHIFT 12
22852+#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
22853+#define PAGE_MASK (~(PAGE_SIZE-1))
22854+
22855 #ifdef __KERNEL__
22856-# ifdef CONFIG_X86_32
22857-# include "page_32.h"
22858-# else
22859-# include "page_64.h"
22860-# endif
22861+
22862+/*
22863+ * Need to repeat this here in order to not include pgtable.h (which in turn
22864+ * depends on definitions made here), but to be able to use the symbolics
22865+ * below. The preprocessor will warn if the two definitions aren't identical.
22866+ */
22867+#define _PAGE_BIT_PRESENT 0
22868+#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
22869+#define _PAGE_BIT_IO 9
22870+#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
22871+
22872+#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
22873+#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK)
22874+
22875+#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
22876+#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
22877+
22878+#define HPAGE_SHIFT PMD_SHIFT
22879+#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
22880+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
22881+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
22882+
22883+/* to align the pointer to the (next) page boundary */
22884+#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22885+
22886+#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
22887+#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22888+
22889+#ifndef __ASSEMBLY__
22890+#include <linux/types.h>
22891+#endif
22892+
22893+#ifdef CONFIG_X86_64
22894+#include <asm/page_64.h>
22895+#define max_pfn_mapped end_pfn_map
22896+#else
22897+#include <asm/page_32.h>
22898+#define max_pfn_mapped max_low_pfn
22899+#endif /* CONFIG_X86_64 */
22900+
22901+#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
22902+
22903+#define VM_DATA_DEFAULT_FLAGS \
22904+ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22905+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22906+
22907+
22908+#ifndef __ASSEMBLY__
22909+
22910+extern int page_is_ram(unsigned long pagenr);
22911+
22912+struct page;
22913+
22914+static inline void clear_user_page(void *page, unsigned long vaddr,
22915+ struct page *pg)
22916+{
22917+ clear_page(page);
22918+}
22919+
22920+static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
22921+ struct page *topage)
22922+{
22923+ copy_page(to, from);
22924+}
22925+
22926+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22927+ alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22928+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22929+
22930+typedef struct { pgprotval_t pgprot; } pgprot_t;
22931+
22932+#define pgprot_val(x) ((x).pgprot)
22933+#define __pgprot(x) ((pgprot_t) { (x) } )
22934+
22935+#include <asm/maddr.h>
22936+
22937+typedef struct { pgdval_t pgd; } pgd_t;
22938+
22939+#define __pgd_ma(x) ((pgd_t) { (x) } )
22940+static inline pgd_t xen_make_pgd(pgdval_t val)
22941+{
22942+ if (val & _PAGE_PRESENT)
22943+ val = pte_phys_to_machine(val);
22944+ return (pgd_t) { val };
22945+}
22946+
22947+#define __pgd_val(x) ((x).pgd)
22948+static inline pgdval_t xen_pgd_val(pgd_t pgd)
22949+{
22950+ pgdval_t ret = __pgd_val(pgd);
22951+#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
22952+ if (ret)
22953+ ret = machine_to_phys(ret) | _PAGE_PRESENT;
22954+#else
22955+ if (ret & _PAGE_PRESENT)
22956+ ret = pte_machine_to_phys(ret);
22957+#endif
22958+ return ret;
22959+}
22960+
22961+#if PAGETABLE_LEVELS >= 3
22962+#if PAGETABLE_LEVELS == 4
22963+typedef struct { pudval_t pud; } pud_t;
22964+
22965+#define __pud_ma(x) ((pud_t) { (x) } )
22966+static inline pud_t xen_make_pud(pudval_t val)
22967+{
22968+ if (val & _PAGE_PRESENT)
22969+ val = pte_phys_to_machine(val);
22970+ return (pud_t) { val };
22971+}
22972+
22973+#define __pud_val(x) ((x).pud)
22974+static inline pudval_t xen_pud_val(pud_t pud)
22975+{
22976+ pudval_t ret = __pud_val(pud);
22977+ if (ret & _PAGE_PRESENT)
22978+ ret = pte_machine_to_phys(ret);
22979+ return ret;
22980+}
22981+#else /* PAGETABLE_LEVELS == 3 */
22982+#include <asm-generic/pgtable-nopud.h>
22983+
22984+#define __pud_val(x) __pgd_val((x).pgd)
22985+static inline pudval_t xen_pud_val(pud_t pud)
22986+{
22987+ return xen_pgd_val(pud.pgd);
22988+}
22989+#endif /* PAGETABLE_LEVELS == 4 */
22990+
22991+typedef struct { pmdval_t pmd; } pmd_t;
22992+
22993+#define __pmd_ma(x) ((pmd_t) { (x) } )
22994+static inline pmd_t xen_make_pmd(pmdval_t val)
22995+{
22996+ if (val & _PAGE_PRESENT)
22997+ val = pte_phys_to_machine(val);
22998+ return (pmd_t) { val };
22999+}
23000+
23001+#define __pmd_val(x) ((x).pmd)
23002+static inline pmdval_t xen_pmd_val(pmd_t pmd)
23003+{
23004+ pmdval_t ret = __pmd_val(pmd);
23005+#if CONFIG_XEN_COMPAT <= 0x030002
23006+ if (ret)
23007+ ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
23008 #else
23009-# ifdef __i386__
23010-# include "page_32.h"
23011-# else
23012-# include "page_64.h"
23013-# endif
23014+ if (ret & _PAGE_PRESENT)
23015+ ret = pte_machine_to_phys(ret);
23016+#endif
23017+ return ret;
23018+}
23019+#else /* PAGETABLE_LEVELS == 2 */
23020+#include <asm-generic/pgtable-nopmd.h>
23021+
23022+#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
23023+#define __pmd_val(x) __pgd_val((x).pud.pgd)
23024+static inline pmdval_t xen_pmd_val(pmd_t pmd)
23025+{
23026+ return xen_pgd_val(pmd.pud.pgd);
23027+}
23028+#endif /* PAGETABLE_LEVELS >= 3 */
23029+
23030+#define __pte_ma(x) ((pte_t) { .pte = (x) } )
23031+static inline pte_t xen_make_pte(pteval_t val)
23032+{
23033+ if ((val & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
23034+ val = pte_phys_to_machine(val);
23035+ return (pte_t) { .pte = val };
23036+}
23037+
23038+#define __pte_val(x) ((x).pte)
23039+static inline pteval_t xen_pte_val(pte_t pte)
23040+{
23041+ pteval_t ret = __pte_val(pte);
23042+ if ((pte.pte_low & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
23043+ ret = pte_machine_to_phys(ret);
23044+ return ret;
23045+}
23046+
23047+#define pgd_val(x) xen_pgd_val(x)
23048+#define __pgd(x) xen_make_pgd(x)
23049+
23050+#ifndef __PAGETABLE_PUD_FOLDED
23051+#define pud_val(x) xen_pud_val(x)
23052+#define __pud(x) xen_make_pud(x)
23053+#endif
23054+
23055+#ifndef __PAGETABLE_PMD_FOLDED
23056+#define pmd_val(x) xen_pmd_val(x)
23057+#define __pmd(x) xen_make_pmd(x)
23058 #endif
23059+
23060+#define pte_val(x) xen_pte_val(x)
23061+#define __pte(x) xen_make_pte(x)
23062+
23063+#define __pa(x) __phys_addr((unsigned long)(x))
23064+/* __pa_symbol should be used for C visible symbols.
23065+ This seems to be the official gcc blessed way to do such arithmetic. */
23066+#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
23067+
23068+#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
23069+
23070+#define __boot_va(x) __va(x)
23071+#define __boot_pa(x) __pa(x)
23072+
23073+#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
23074+#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
23075+#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
23076+
23077+#endif /* __ASSEMBLY__ */
23078+
23079+#include <asm-generic/memory_model.h>
23080+#include <asm-generic/page.h>
23081+
23082+#define __HAVE_ARCH_GATE_AREA 1
23083+
23084+#endif /* __KERNEL__ */
23085+#endif /* _ASM_X86_PAGE_H */
23086--- a/include/asm-x86/mach-xen/asm/pci_64.h
23087+++ b/include/asm-x86/mach-xen/asm/pci_64.h
23088@@ -26,7 +26,6 @@ extern int (*pci_config_write)(int seg,
23089
23090
23091 extern void pci_iommu_alloc(void);
23092-extern int iommu_setup(char *opt);
23093
23094 /* The PCI address space does equal the physical memory
23095 * address space. The networking and block device layers use
23096--- a/include/asm-x86/mach-xen/asm/pci.h
23097+++ b/include/asm-x86/mach-xen/asm/pci.h
23098@@ -71,6 +71,7 @@ extern int pci_mmap_page_range(struct pc
23099
23100
23101 #ifdef CONFIG_PCI
23102+extern void early_quirks(void);
23103 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
23104 enum pci_dma_burst_strategy *strat,
23105 unsigned long *strategy_parameter)
23106@@ -78,9 +79,10 @@ static inline void pci_dma_burst_advice(
23107 *strat = PCI_DMA_BURST_INFINITY;
23108 *strategy_parameter = ~0UL;
23109 }
23110+#else
23111+static inline void early_quirks(void) { }
23112 #endif
23113
23114-
23115 #endif /* __KERNEL__ */
23116
23117 #ifdef CONFIG_X86_32
23118@@ -95,6 +97,19 @@ static inline void pci_dma_burst_advice(
23119 /* generic pci stuff */
23120 #include <asm-generic/pci.h>
23121
23122+#ifdef CONFIG_NUMA
23123+/* Returns the node based on pci bus */
23124+static inline int __pcibus_to_node(struct pci_bus *bus)
23125+{
23126+ struct pci_sysdata *sd = bus->sysdata;
23127+
23128+ return sd->node;
23129+}
23130
23131+static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
23132+{
23133+ return node_to_cpumask(__pcibus_to_node(bus));
23134+}
23135+#endif
23136
23137 #endif
23138--- a/include/asm-x86/mach-xen/asm/pgalloc_32.h
23139+++ b/include/asm-x86/mach-xen/asm/pgalloc_32.h
23140@@ -3,69 +3,109 @@
23141
23142 #include <linux/threads.h>
23143 #include <linux/mm.h> /* for struct page */
23144+#include <linux/pagemap.h>
23145+#include <asm/tlb.h>
23146+#include <asm-generic/tlb.h>
23147 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
23148
23149 #define paravirt_alloc_pt(mm, pfn) do { } while (0)
23150-#define paravirt_alloc_pd(pfn) do { } while (0)
23151-#define paravirt_alloc_pd(pfn) do { } while (0)
23152+#define paravirt_alloc_pd(mm, pfn) do { } while (0)
23153 #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
23154 #define paravirt_release_pt(pfn) do { } while (0)
23155 #define paravirt_release_pd(pfn) do { } while (0)
23156
23157-#define pmd_populate_kernel(mm, pmd, pte) \
23158-do { \
23159- paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); \
23160- set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
23161-} while (0)
23162-
23163-#define pmd_populate(mm, pmd, pte) \
23164-do { \
23165- unsigned long pfn = page_to_pfn(pte); \
23166- paravirt_alloc_pt(mm, pfn); \
23167- if (PagePinned(virt_to_page((mm)->pgd))) { \
23168- if (!PageHighMem(pte)) \
23169- BUG_ON(HYPERVISOR_update_va_mapping( \
23170- (unsigned long)__va(pfn << PAGE_SHIFT), \
23171- pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \
23172- else if (!test_and_set_bit(PG_pinned, &pte->flags)) \
23173- kmap_flush_unused(); \
23174- set_pmd(pmd, \
23175- __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
23176- } else \
23177- *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
23178-} while (0)
23179+static inline void pmd_populate_kernel(struct mm_struct *mm,
23180+ pmd_t *pmd, pte_t *pte)
23181+{
23182+ paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
23183+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
23184+}
23185+
23186+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
23187+{
23188+ unsigned long pfn = page_to_pfn(pte);
23189+
23190+ paravirt_alloc_pt(mm, pfn);
23191+ if (PagePinned(virt_to_page(mm->pgd))) {
23192+ if (!PageHighMem(pte))
23193+ BUG_ON(HYPERVISOR_update_va_mapping(
23194+ (unsigned long)__va(pfn << PAGE_SHIFT),
23195+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
23196+ else if (!test_and_set_bit(PG_pinned, &pte->flags))
23197+ kmap_flush_unused();
23198+ set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
23199+ } else
23200+ *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
23201+}
23202+#define pmd_pgtable(pmd) pmd_page(pmd)
23203
23204 /*
23205 * Allocate and free page tables.
23206 */
23207+extern void pgd_test_and_unpin(pgd_t *);
23208 extern pgd_t *pgd_alloc(struct mm_struct *);
23209-extern void pgd_free(pgd_t *pgd);
23210+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
23211
23212 extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
23213-extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
23214+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
23215
23216-static inline void pte_free_kernel(pte_t *pte)
23217+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
23218 {
23219 make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
23220 free_page((unsigned long)pte);
23221 }
23222
23223-extern void pte_free(struct page *pte);
23224+extern void __pte_free(pgtable_t);
23225+static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
23226+{
23227+ __pte_free(pte);
23228+}
23229+
23230
23231-#define __pte_free_tlb(tlb,pte) \
23232-do { \
23233- paravirt_release_pt(page_to_pfn(pte)); \
23234- tlb_remove_page((tlb),(pte)); \
23235-} while (0)
23236+extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
23237
23238 #ifdef CONFIG_X86_PAE
23239 /*
23240 * In the PAE case we free the pmds as part of the pgd.
23241 */
23242-#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
23243-#define pmd_free(x) do { } while (0)
23244-#define __pmd_free_tlb(tlb,x) do { } while (0)
23245-#define pud_populate(mm, pmd, pte) BUG()
23246-#endif
23247+extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
23248+
23249+extern void __pmd_free(pgtable_t);
23250+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
23251+{
23252+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
23253+ __pmd_free(virt_to_page(pmd));
23254+}
23255+
23256+extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
23257+
23258+static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
23259+{
23260+ struct page *page = virt_to_page(pmd);
23261+ unsigned long pfn = page_to_pfn(page);
23262+
23263+ paravirt_alloc_pd(mm, pfn);
23264+
23265+ /* Note: almost everything apart from _PAGE_PRESENT is
23266+ reserved at the pmd (PDPT) level. */
23267+ if (PagePinned(virt_to_page(mm->pgd))) {
23268+ BUG_ON(PageHighMem(page));
23269+ BUG_ON(HYPERVISOR_update_va_mapping(
23270+ (unsigned long)__va(pfn << PAGE_SHIFT),
23271+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
23272+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
23273+ } else
23274+ *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
23275+
23276+ /*
23277+ * According to Intel App note "TLBs, Paging-Structure Caches,
23278+ * and Their Invalidation", April 2007, document 317080-001,
23279+ * section 8.1: in PAE mode we explicitly have to flush the
23280+ * TLB via cr3 if the top-level pgd is changed...
23281+ */
23282+ if (mm == current->active_mm)
23283+ xen_tlb_flush();
23284+}
23285+#endif /* CONFIG_X86_PAE */
23286
23287 #endif /* _I386_PGALLOC_H */
23288--- a/include/asm-x86/mach-xen/asm/pgalloc_64.h
23289+++ b/include/asm-x86/mach-xen/asm/pgalloc_64.h
23290@@ -6,30 +6,13 @@
23291 #include <linux/mm.h>
23292 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
23293
23294-#include <xen/features.h>
23295-void make_page_readonly(void *va, unsigned int feature);
23296-void make_page_writable(void *va, unsigned int feature);
23297-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
23298-void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
23299+pmd_t *early_get_pmd(unsigned long va);
23300+void early_make_page_readonly(void *va, unsigned int feature);
23301
23302 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
23303
23304-static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
23305-{
23306- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
23307-}
23308-
23309-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
23310-{
23311- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
23312- BUG_ON(HYPERVISOR_update_va_mapping(
23313- (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
23314- pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
23315- set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
23316- } else {
23317- *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
23318- }
23319-}
23320+#define pmd_populate_kernel(mm, pmd, pte) \
23321+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
23322
23323 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
23324 {
23325@@ -63,53 +46,58 @@ static inline void pgd_populate(struct m
23326 }
23327 }
23328
23329-extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
23330-extern void pte_free(struct page *pte);
23331+#define pmd_pgtable(pmd) pmd_page(pmd)
23332
23333-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
23334+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
23335 {
23336- struct page *pg;
23337-
23338- pg = pte_alloc_one(mm, addr);
23339- return pg ? page_address(pg) : NULL;
23340+ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
23341+ BUG_ON(HYPERVISOR_update_va_mapping(
23342+ (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
23343+ pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
23344+ set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
23345+ } else {
23346+ *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
23347+ }
23348 }
23349
23350-static inline void pmd_free(pmd_t *pmd)
23351+extern void __pmd_free(pgtable_t);
23352+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
23353 {
23354 BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
23355- pte_free(virt_to_page(pmd));
23356+ __pmd_free(virt_to_page(pmd));
23357 }
23358
23359+extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
23360+
23361 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
23362 {
23363- struct page *pg;
23364-
23365- pg = pte_alloc_one(mm, addr);
23366- return pg ? page_address(pg) : NULL;
23367+ return (pud_t *)pmd_alloc_one(mm, addr);
23368 }
23369
23370-static inline void pud_free(pud_t *pud)
23371+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
23372 {
23373 BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
23374- pte_free(virt_to_page(pud));
23375+ __pmd_free(virt_to_page(pud));
23376 }
23377
23378 static inline void pgd_list_add(pgd_t *pgd)
23379 {
23380 struct page *page = virt_to_page(pgd);
23381+ unsigned long flags;
23382
23383- spin_lock(&pgd_lock);
23384+ spin_lock_irqsave(&pgd_lock, flags);
23385 list_add(&page->lru, &pgd_list);
23386- spin_unlock(&pgd_lock);
23387+ spin_unlock_irqrestore(&pgd_lock, flags);
23388 }
23389
23390 static inline void pgd_list_del(pgd_t *pgd)
23391 {
23392 struct page *page = virt_to_page(pgd);
23393+ unsigned long flags;
23394
23395- spin_lock(&pgd_lock);
23396+ spin_lock_irqsave(&pgd_lock, flags);
23397 list_del(&page->lru);
23398- spin_unlock(&pgd_lock);
23399+ spin_unlock_irqrestore(&pgd_lock, flags);
23400 }
23401
23402 extern void pgd_test_and_unpin(pgd_t *);
23403@@ -145,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm
23404 return pgd;
23405 }
23406
23407-static inline void pgd_free(pgd_t *pgd)
23408+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
23409 {
23410 pgd_test_and_unpin(pgd);
23411 pgd_list_del(pgd);
23412@@ -161,17 +149,30 @@ static inline pte_t *pte_alloc_one_kerne
23413 return pte;
23414 }
23415
23416+extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
23417+
23418 /* Should really implement gc for free page table pages. This could be
23419 done with a reference count in struct page. */
23420
23421-static inline void pte_free_kernel(pte_t *pte)
23422+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
23423 {
23424 BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
23425 make_page_writable(pte, XENFEAT_writable_page_tables);
23426 free_page((unsigned long)pte);
23427 }
23428
23429-#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
23430+extern void __pte_free(pgtable_t);
23431+static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
23432+{
23433+ __pte_free(pte);
23434+}
23435+
23436+#define __pte_free_tlb(tlb,pte) \
23437+do { \
23438+ pgtable_page_dtor((pte)); \
23439+ tlb_remove_page((tlb), (pte)); \
23440+} while (0)
23441+
23442 #define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
23443 #define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
23444
23445--- a/include/asm-x86/mach-xen/asm/pgtable_32.h
23446+++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
23447@@ -1,8 +1,6 @@
23448 #ifndef _I386_PGTABLE_H
23449 #define _I386_PGTABLE_H
23450
23451-#include <asm/hypervisor.h>
23452-
23453 /*
23454 * The Linux memory management assumes a three-level page table setup. On
23455 * the i386, we use that, but "fold" the mid level into the top-level page
23456@@ -25,20 +23,10 @@
23457
23458 struct vm_area_struct;
23459
23460-/*
23461- * ZERO_PAGE is a global shared page that is always zero: used
23462- * for zero-mapped memory areas etc..
23463- */
23464-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
23465-extern unsigned long empty_zero_page[1024];
23466 extern pgd_t *swapper_pg_dir;
23467-extern struct kmem_cache *pmd_cache;
23468-extern spinlock_t pgd_lock;
23469-extern struct page *pgd_list;
23470-void check_pgt_cache(void);
23471
23472-void pmd_ctor(struct kmem_cache *, void *);
23473-void pgtable_cache_init(void);
23474+static inline void pgtable_cache_init(void) { }
23475+static inline void check_pgt_cache(void) { }
23476 void paging_init(void);
23477
23478
23479@@ -58,16 +46,9 @@ void paging_init(void);
23480 #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
23481 #define PGDIR_MASK (~(PGDIR_SIZE-1))
23482
23483-#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
23484-#define FIRST_USER_ADDRESS 0
23485-
23486 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
23487 #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
23488
23489-#define TWOLEVEL_PGDIR_SHIFT 22
23490-#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
23491-#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
23492-
23493 /* Just any arbitrary offset to the start of the vmalloc VM area: the
23494 * current 8MB value just means that there will be a 8MB "hole" after the
23495 * physical memory until the kernel virtual memory starts. That means that
23496@@ -78,121 +59,19 @@ void paging_init(void);
23497 #define VMALLOC_OFFSET (8*1024*1024)
23498 #define VMALLOC_START (((unsigned long) high_memory + \
23499 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
23500-#ifdef CONFIG_HIGHMEM
23501-# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
23502-#else
23503-# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
23504-#endif
23505-
23506-/*
23507- * _PAGE_PSE set in the page directory entry just means that
23508- * the page directory entry points directly to a 4MB-aligned block of
23509- * memory.
23510- */
23511-#define _PAGE_BIT_PRESENT 0
23512-#define _PAGE_BIT_RW 1
23513-#define _PAGE_BIT_USER 2
23514-#define _PAGE_BIT_PWT 3
23515-#define _PAGE_BIT_PCD 4
23516-#define _PAGE_BIT_ACCESSED 5
23517-#define _PAGE_BIT_DIRTY 6
23518-#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
23519-#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
23520-/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */
23521-#define _PAGE_BIT_UNUSED2 10
23522-#define _PAGE_BIT_UNUSED3 11
23523-#define _PAGE_BIT_NX 63
23524-
23525-#define _PAGE_PRESENT 0x001
23526-#define _PAGE_RW 0x002
23527-#define _PAGE_USER 0x004
23528-#define _PAGE_PWT 0x008
23529-#define _PAGE_PCD 0x010
23530-#define _PAGE_ACCESSED 0x020
23531-#define _PAGE_DIRTY 0x040
23532-#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
23533-#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
23534-/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */
23535-#define _PAGE_UNUSED2 0x400
23536-#define _PAGE_UNUSED3 0x800
23537-
23538-/* If _PAGE_PRESENT is clear, we use these: */
23539-#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
23540-#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
23541- pte_present gives true */
23542 #ifdef CONFIG_X86_PAE
23543-#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
23544+#define LAST_PKMAP 512
23545 #else
23546-#define _PAGE_NX 0
23547+#define LAST_PKMAP 1024
23548 #endif
23549
23550-/* Mapped page is I/O or foreign and has no associated page struct. */
23551-#define _PAGE_IO 0x200
23552+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
23553
23554-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
23555-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
23556-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
23557-
23558-#define PAGE_NONE \
23559- __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
23560-#define PAGE_SHARED \
23561- __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23562-
23563-#define PAGE_SHARED_EXEC \
23564- __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23565-#define PAGE_COPY_NOEXEC \
23566- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23567-#define PAGE_COPY_EXEC \
23568- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23569-#define PAGE_COPY \
23570- PAGE_COPY_NOEXEC
23571-#define PAGE_READONLY \
23572- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23573-#define PAGE_READONLY_EXEC \
23574- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23575-
23576-#define _PAGE_KERNEL \
23577- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
23578-#define _PAGE_KERNEL_EXEC \
23579- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
23580-
23581-extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
23582-#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
23583-#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
23584-#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
23585-#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
23586-#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
23587-
23588-#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
23589-#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
23590-#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
23591-#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
23592-#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
23593-#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
23594-#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
23595-
23596-/*
23597- * The i386 can't do page protection for execute, and considers that
23598- * the same are read. Also, write permissions imply read permissions.
23599- * This is the closest we can get..
23600- */
23601-#define __P000 PAGE_NONE
23602-#define __P001 PAGE_READONLY
23603-#define __P010 PAGE_COPY
23604-#define __P011 PAGE_COPY
23605-#define __P100 PAGE_READONLY_EXEC
23606-#define __P101 PAGE_READONLY_EXEC
23607-#define __P110 PAGE_COPY_EXEC
23608-#define __P111 PAGE_COPY_EXEC
23609-
23610-#define __S000 PAGE_NONE
23611-#define __S001 PAGE_READONLY
23612-#define __S010 PAGE_SHARED
23613-#define __S011 PAGE_SHARED
23614-#define __S100 PAGE_READONLY_EXEC
23615-#define __S101 PAGE_READONLY_EXEC
23616-#define __S110 PAGE_SHARED_EXEC
23617-#define __S111 PAGE_SHARED_EXEC
23618+#ifdef CONFIG_HIGHMEM
23619+# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
23620+#else
23621+# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
23622+#endif
23623
23624 /*
23625 * Define this if things work differently on an i386 and an i486:
23626@@ -221,28 +100,6 @@ extern unsigned long pg0[];
23627
23628 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
23629
23630-/*
23631- * The following only work if pte_present() is true.
23632- * Undefined behaviour if not..
23633- */
23634-static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
23635-static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
23636-static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
23637-static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
23638-
23639-/*
23640- * The following only works if pte_present() is not true.
23641- */
23642-static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
23643-
23644-static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
23645-static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
23646-static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
23647-static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
23648-static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
23649-static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
23650-static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
23651-
23652 #ifdef CONFIG_X86_PAE
23653 # include <asm/pgtable-3level.h>
23654 #else
23655@@ -250,111 +107,6 @@ static inline pte_t pte_mkhuge(pte_t pte
23656 #endif
23657
23658 /*
23659- * Rules for using pte_update - it must be called after any PTE update which
23660- * has not been done using the set_pte / clear_pte interfaces. It is used by
23661- * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
23662- * updates should either be sets, clears, or set_pte_atomic for P->P
23663- * transitions, which means this hook should only be called for user PTEs.
23664- * This hook implies a P->P protection or access change has taken place, which
23665- * requires a subsequent TLB flush. The notification can optionally be delayed
23666- * until the TLB flush event by using the pte_update_defer form of the
23667- * interface, but care must be taken to assure that the flush happens while
23668- * still holding the same page table lock so that the shadow and primary pages
23669- * do not become out of sync on SMP.
23670- */
23671-#define pte_update(mm, addr, ptep) do { } while (0)
23672-#define pte_update_defer(mm, addr, ptep) do { } while (0)
23673-
23674-/* local pte updates need not use xchg for locking */
23675-static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
23676-{
23677- xen_set_pte(ptep, __pte(0));
23678- return res;
23679-}
23680-
23681-/*
23682- * We only update the dirty/accessed state if we set
23683- * the dirty bit by hand in the kernel, since the hardware
23684- * will do the accessed bit for us, and we don't want to
23685- * race with other CPU's that might be updating the dirty
23686- * bit at the same time.
23687- */
23688-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
23689-#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
23690-({ \
23691- int __changed = !pte_same(*(ptep), entry); \
23692- if (__changed && (dirty)) { \
23693- if ( likely((vma)->vm_mm == current->mm) ) { \
23694- BUG_ON(HYPERVISOR_update_va_mapping(address, \
23695- entry, \
23696- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23697- UVMF_INVLPG|UVMF_MULTI)); \
23698- } else { \
23699- xen_l1_entry_update(ptep, entry); \
23700- flush_tlb_page(vma, address); \
23701- } \
23702- } \
23703- __changed; \
23704-})
23705-
23706-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
23707-#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
23708- int __ret = 0; \
23709- if (pte_young(*(ptep))) \
23710- __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
23711- &(ptep)->pte_low); \
23712- if (__ret) \
23713- pte_update((vma)->vm_mm, addr, ptep); \
23714- __ret; \
23715-})
23716-
23717-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
23718-#define ptep_clear_flush_young(vma, address, ptep) \
23719-({ \
23720- pte_t __pte = *(ptep); \
23721- int __young = pte_young(__pte); \
23722- __pte = pte_mkold(__pte); \
23723- if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
23724- (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
23725- else if (__young) \
23726- (ptep)->pte_low = __pte.pte_low; \
23727- __young; \
23728-})
23729-
23730-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
23731-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23732-{
23733- pte_t pte = *ptep;
23734- if (!pte_none(pte)
23735- && (mm != &init_mm
23736- || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
23737- pte = xen_ptep_get_and_clear(ptep, pte);
23738- pte_update(mm, addr, ptep);
23739- }
23740- return pte;
23741-}
23742-
23743-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
23744-#define ptep_get_and_clear_full(mm, addr, ptep, full) \
23745- ((full) ? ({ \
23746- pte_t __res = *(ptep); \
23747- if (PagePinned(virt_to_page((mm)->pgd))) \
23748- xen_l1_entry_update(ptep, __pte(0)); \
23749- else \
23750- *(ptep) = __pte(0); \
23751- __res; \
23752- }) : \
23753- ptep_get_and_clear(mm, addr, ptep))
23754-
23755-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
23756-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23757-{
23758- pte_t pte = *ptep;
23759- if (pte_write(pte))
23760- set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
23761-}
23762-
23763-/*
23764 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
23765 *
23766 * dst - pointer to pgd range anwhere on a pgd page
23767@@ -383,26 +135,6 @@ static inline void clone_pgd_range(pgd_t
23768
23769 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
23770
23771-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
23772-{
23773- /*
23774- * Since this might change the present bit (which controls whether
23775- * a pte_t object has undergone p2m translation), we must use
23776- * pte_val() on the input pte and __pte() for the return value.
23777- */
23778- paddr_t pteval = pte_val(pte);
23779-
23780- pteval &= _PAGE_CHG_MASK;
23781- pteval |= pgprot_val(newprot);
23782-#ifdef CONFIG_X86_PAE
23783- pteval &= __supported_pte_mask;
23784-#endif
23785- return __pte(pteval);
23786-}
23787-
23788-#define pmd_large(pmd) \
23789-((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
23790-
23791 /*
23792 * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
23793 *
23794@@ -424,6 +156,8 @@ static inline pte_t pte_modify(pte_t pte
23795 */
23796 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
23797
23798+static inline int pud_large(pud_t pud) { return 0; }
23799+
23800 /*
23801 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
23802 *
23803@@ -449,26 +183,6 @@ static inline pte_t pte_modify(pte_t pte
23804 #define pmd_page_vaddr(pmd) \
23805 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
23806
23807-/*
23808- * Helper function that returns the kernel pagetable entry controlling
23809- * the virtual address 'address'. NULL means no pagetable entry present.
23810- * NOTE: the return type is pte_t but if the pmd is PSE then we return it
23811- * as a pte too.
23812- */
23813-extern pte_t *lookup_address(unsigned long address);
23814-
23815-/*
23816- * Make a given kernel text page executable/non-executable.
23817- * Returns the previous executability setting of that page (which
23818- * is used to restore the previous state). Used by the SMP bootup code.
23819- * NOTE: this is an __init function for security reasons.
23820- */
23821-#ifdef CONFIG_X86_PAE
23822- extern int set_kernel_exec(unsigned long vaddr, int enable);
23823-#else
23824- static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
23825-#endif
23826-
23827 #if defined(CONFIG_HIGHPTE)
23828 #define pte_offset_map(dir, address) \
23829 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
23830@@ -496,59 +210,22 @@ extern pte_t *lookup_address(unsigned lo
23831 */
23832 #define update_mmu_cache(vma,address,pte) do { } while (0)
23833
23834-#include <xen/features.h>
23835 void make_lowmem_page_readonly(void *va, unsigned int feature);
23836 void make_lowmem_page_writable(void *va, unsigned int feature);
23837-void make_page_readonly(void *va, unsigned int feature);
23838-void make_page_writable(void *va, unsigned int feature);
23839-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
23840-void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
23841-
23842-#define virt_to_ptep(va) \
23843-({ \
23844- pte_t *__ptep = lookup_address((unsigned long)(va)); \
23845- BUG_ON(!__ptep || !pte_present(*__ptep)); \
23846- __ptep; \
23847-})
23848-
23849-#define arbitrary_virt_to_machine(va) \
23850- (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
23851- | ((unsigned long)(va) & (PAGE_SIZE - 1)))
23852
23853 #endif /* !__ASSEMBLY__ */
23854
23855+/*
23856+ * kern_addr_valid() is (1) for FLATMEM and (0) for
23857+ * SPARSEMEM and DISCONTIGMEM
23858+ */
23859 #ifdef CONFIG_FLATMEM
23860 #define kern_addr_valid(addr) (1)
23861-#endif /* CONFIG_FLATMEM */
23862-
23863-int direct_remap_pfn_range(struct vm_area_struct *vma,
23864- unsigned long address,
23865- unsigned long mfn,
23866- unsigned long size,
23867- pgprot_t prot,
23868- domid_t domid);
23869-int direct_kernel_remap_pfn_range(unsigned long address,
23870- unsigned long mfn,
23871- unsigned long size,
23872- pgprot_t prot,
23873- domid_t domid);
23874-int create_lookup_pte_addr(struct mm_struct *mm,
23875- unsigned long address,
23876- uint64_t *ptep);
23877-int touch_pte_range(struct mm_struct *mm,
23878- unsigned long address,
23879- unsigned long size);
23880-
23881-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
23882- unsigned long addr, unsigned long end, pgprot_t newprot,
23883- int dirty_accountable);
23884-
23885-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
23886- xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
23887+#else
23888+#define kern_addr_valid(kaddr) (0)
23889+#endif
23890
23891 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
23892 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
23893
23894-#include <asm-generic/pgtable.h>
23895-
23896 #endif /* _I386_PGTABLE_H */
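The pgtable_32.h hunks above move the LAST_PKMAP / PKMAP_BASE definitions into this header and derive PKMAP_BASE by stepping down from the fixmap area and rounding to a pmd boundary, then cap the vmalloc area two guard pages below it when CONFIG_HIGHMEM is set. The stand-alone sketch below reproduces that arithmetic in user space; the FIXADDR_BOOT_START value and the 2 MB PMD_SHIFT are illustrative assumptions for the demo, not the kernel's real constants.

/*
 * Sketch of the PKMAP_BASE / VMALLOC_END arithmetic added above.
 * FIXADDR_BOOT_START below is a made-up address for illustration only.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SHIFT	21			/* assume PAE-style 2 MB pmd entries */
#define PMD_MASK	(~((1UL << PMD_SHIFT) - 1))

#define FIXADDR_BOOT_START 0xfffd2000UL		/* hypothetical, for the demo */

static unsigned long pkmap_base(int last_pkmap)
{
	/* Same expression as the patch: leave room for the kmap window
	 * plus one guard page, then round down to a pmd boundary. */
	return (FIXADDR_BOOT_START - PAGE_SIZE * (last_pkmap + 1)) & PMD_MASK;
}

int main(void)
{
	/* PAE kmap window is 512 pages, non-PAE is 1024 pages. */
	printf("PAE    PKMAP_BASE = %#lx\n", pkmap_base(512));
	printf("no-PAE PKMAP_BASE = %#lx\n", pkmap_base(1024));
	printf("VMALLOC_END (HIGHMEM, PAE) = %#lx\n",
	       pkmap_base(512) - 2 * PAGE_SIZE);
	return 0;
}

With CONFIG_HIGHMEM the vmalloc area therefore ends two pages below PKMAP_BASE, exactly as the VMALLOC_END definition in the hunk above expresses.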
23897--- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
23898+++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
23899@@ -18,16 +18,18 @@
23900 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
23901 &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
23902
23903-#define pud_none(pud) 0
23904-#define pud_bad(pud) 0
23905-#define pud_present(pud) 1
23906
23907-/*
23908- * All present pages with !NX bit are kernel-executable:
23909- */
23910-static inline int pte_exec_kernel(pte_t pte)
23911+static inline int pud_none(pud_t pud)
23912+{
23913+ return __pud_val(pud) == 0;
23914+}
23915+static inline int pud_bad(pud_t pud)
23916 {
23917- return !(__pte_val(pte) & _PAGE_NX);
23918+ return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
23919+}
23920+static inline int pud_present(pud_t pud)
23921+{
23922+ return __pud_val(pud) & _PAGE_PRESENT;
23923 }
23924
23925 /* Rules for using set_pte: the pte being assigned *must* be
23926@@ -44,14 +46,6 @@ static inline void xen_set_pte(pte_t *pt
23927 ptep->pte_low = pte.pte_low;
23928 }
23929
23930-static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23931- pte_t *ptep , pte_t pte)
23932-{
23933- if ((mm != current->mm && mm != &init_mm) ||
23934- HYPERVISOR_update_va_mapping(addr, pte, 0))
23935- xen_set_pte(ptep, pte);
23936-}
23937-
23938 static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
23939 {
23940 set_64bit((unsigned long long *)(ptep),__pte_val(pte));
23941@@ -70,14 +64,11 @@ static inline void xen_set_pud(pud_t *pu
23942 * entry, so clear the bottom half first and enforce ordering with a compiler
23943 * barrier.
23944 */
23945-static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23946+static inline void __xen_pte_clear(pte_t *ptep)
23947 {
23948- if ((mm != current->mm && mm != &init_mm)
23949- || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
23950- ptep->pte_low = 0;
23951- smp_wmb();
23952- ptep->pte_high = 0;
23953- }
23954+ ptep->pte_low = 0;
23955+ smp_wmb();
23956+ ptep->pte_high = 0;
23957 }
23958
23959 static inline void xen_pmd_clear(pmd_t *pmd)
23960@@ -85,21 +76,25 @@ static inline void xen_pmd_clear(pmd_t *
23961 xen_l2_entry_update(pmd, __pmd(0));
23962 }
23963
23964-#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
23965-#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
23966-#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte)
23967-#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
23968-#define set_pud(pudp, pud) xen_set_pud(pudp, pud)
23969-#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
23970-#define pmd_clear(pmd) xen_pmd_clear(pmd)
23971+static inline void pud_clear(pud_t *pudp)
23972+{
23973+ pgdval_t pgd;
23974+
23975+ set_pud(pudp, __pud(0));
23976
23977-/*
23978- * Pentium-II erratum A13: in PAE mode we explicitly have to flush
23979- * the TLB via cr3 if the top-level pgd is changed...
23980- * We do not let the generic code free and clear pgd entries due to
23981- * this erratum.
23982- */
23983-static inline void pud_clear (pud_t * pud) { }
23984+ /*
23985+ * According to Intel App note "TLBs, Paging-Structure Caches,
23986+ * and Their Invalidation", April 2007, document 317080-001,
23987+ * section 8.1: in PAE mode we explicitly have to flush the
23988+ * TLB via cr3 if the top-level pgd is changed...
23989+ *
23990+ * Make sure the pud entry we're updating is within the
23991+ * current pgd to avoid unnecessary TLB flushes.
23992+ */
23993+ pgd = read_cr3();
23994+ if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
23995+ xen_tlb_flush();
23996+}
23997
23998 #define pud_page(pud) \
23999 ((struct page *) __va(pud_val(pud) & PAGE_MASK))
24000@@ -128,24 +123,6 @@ static inline pte_t xen_ptep_get_and_cle
24001 #define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
24002 #endif
24003
24004-#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24005-#define ptep_clear_flush(vma, addr, ptep) \
24006-({ \
24007- pte_t *__ptep = (ptep); \
24008- pte_t __res = *__ptep; \
24009- if (!pte_none(__res) && \
24010- ((vma)->vm_mm != current->mm || \
24011- HYPERVISOR_update_va_mapping(addr, __pte(0), \
24012- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24013- UVMF_INVLPG|UVMF_MULTI))) { \
24014- __ptep->pte_low = 0; \
24015- smp_wmb(); \
24016- __ptep->pte_high = 0; \
24017- flush_tlb_page(vma, addr); \
24018- } \
24019- __res; \
24020-})
24021-
24022 #define __HAVE_ARCH_PTE_SAME
24023 static inline int pte_same(pte_t a, pte_t b)
24024 {
24025@@ -168,26 +145,12 @@ static inline int pte_none(pte_t pte)
24026 mfn_to_local_pfn(__pte_mfn(_pte)) : \
24027 __pte_mfn(_pte))
24028
24029-extern unsigned long long __supported_pte_mask;
24030-
24031-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24032-{
24033- return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
24034- pgprot_val(pgprot)) & __supported_pte_mask);
24035-}
24036-
24037-static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
24038-{
24039- return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
24040- pgprot_val(pgprot)) & __supported_pte_mask);
24041-}
24042-
24043 /*
24044 * Bits 0, 6 and 7 are taken in the low part of the pte,
24045 * put the 32 bits of offset into the high part.
24046 */
24047 #define pte_to_pgoff(pte) ((pte).pte_high)
24048-#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
24049+#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
24050 #define PTE_FILE_MAX_BITS 32
24051
24052 /* Encode and de-code a swap entry */
24053@@ -195,8 +158,6 @@ static inline pmd_t pfn_pmd(unsigned lon
24054 #define __swp_offset(x) ((x).val >> 5)
24055 #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
24056 #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
24057-#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
24058-
24059-#define __pmd_free_tlb(tlb, x) do { } while (0)
24060+#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
24061
24062 #endif /* _I386_PGTABLE_3LEVEL_H */
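The rewritten pud_clear() above replaces the old no-op (which leaned on the Pentium-II A13 erratum note) with an explicit TLB flush via cr3, but only when the cleared pud entry actually lies inside the pgd page that cr3 currently points at. A minimal user-space model of that containment check follows; read_cr3(), __pa() and the flush itself are stubbed out purely for illustration.

/*
 * Model of the "is this pud inside the live pgd?" test used by the new
 * PAE pud_clear() above.  The fake __pa() treats virtual == physical.
 */
#include <stdio.h>
#include <stdint.h>

#define PTRS_PER_PGD	4		/* i386 PAE: 4 top-level entries */
typedef uint64_t pgd_t;

static uintptr_t current_cr3;		/* physical base of the live pgd */

static uintptr_t __pa(const void *va)
{
	return (uintptr_t)va;		/* identity "physical" mapping */
}

static int pud_needs_cr3_flush(const void *pudp)
{
	uintptr_t pgd = current_cr3;

	/* Same range check as the patch: flush only if the updated entry
	 * belongs to the page table that is currently loaded. */
	return __pa(pudp) >= pgd &&
	       __pa(pudp) <  pgd + sizeof(pgd_t) * PTRS_PER_PGD;
}

int main(void)
{
	pgd_t live_pgd[PTRS_PER_PGD]  = { 0 };
	pgd_t other_pgd[PTRS_PER_PGD] = { 0 };

	current_cr3 = __pa(live_pgd);

	printf("clearing entry of live pgd  -> flush = %d\n",
	       pud_needs_cr3_flush(&live_pgd[2]));
	printf("clearing entry of other pgd -> flush = %d\n",
	       pud_needs_cr3_flush(&other_pgd[2]));
	return 0;
}

In the kernel the comparison is done on physical addresses because cr3 holds the physical base of the active pgd page, so clearing an entry of some other process's page table never triggers the flush.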
24063--- a/include/asm-x86/mach-xen/asm/pgtable_64.h
24064+++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
24065@@ -13,47 +13,26 @@
24066 #include <linux/threads.h>
24067 #include <linux/sched.h>
24068 #include <asm/pda.h>
24069-#ifdef CONFIG_XEN
24070-#include <asm/hypervisor.h>
24071
24072+#ifdef CONFIG_XEN
24073 extern pud_t level3_user_pgt[512];
24074
24075 extern void xen_init_pt(void);
24076-
24077-extern pte_t *lookup_address(unsigned long address);
24078-
24079-#define virt_to_ptep(va) \
24080-({ \
24081- pte_t *__ptep = lookup_address((unsigned long)(va)); \
24082- BUG_ON(!__ptep || !pte_present(*__ptep)); \
24083- __ptep; \
24084-})
24085-
24086-#define arbitrary_virt_to_machine(va) \
24087- (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
24088- | ((unsigned long)(va) & (PAGE_SIZE - 1)))
24089 #endif
24090
24091 extern pud_t level3_kernel_pgt[512];
24092 extern pud_t level3_ident_pgt[512];
24093 extern pmd_t level2_kernel_pgt[512];
24094 extern pgd_t init_level4_pgt[];
24095-extern unsigned long __supported_pte_mask;
24096
24097 #define swapper_pg_dir init_level4_pgt
24098
24099 extern void paging_init(void);
24100-extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
24101-
24102-/*
24103- * ZERO_PAGE is a global shared page that is always zero: used
24104- * for zero-mapped memory areas etc..
24105- */
24106-extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
24107-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
24108
24109 #endif /* !__ASSEMBLY__ */
24110
24111+#define SHARED_KERNEL_PMD 1
24112+
24113 /*
24114 * PGDIR_SHIFT determines what a top-level page table entry can map
24115 */
24116@@ -96,31 +75,63 @@ extern unsigned long empty_zero_page[PAG
24117 #define pgd_none(x) (!__pgd_val(x))
24118 #define pud_none(x) (!__pud_val(x))
24119
24120-static inline void set_pte(pte_t *dst, pte_t val)
24121+struct mm_struct;
24122+
24123+#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
24124+
24125+static inline void xen_set_pte(pte_t *ptep, pte_t pte)
24126+{
24127+ *ptep = pte;
24128+}
24129+
24130+static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
24131+{
24132+ xen_set_pte(ptep, pte);
24133+}
24134+
24135+#ifdef CONFIG_SMP
24136+static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
24137+{
24138+ return __pte_ma(xchg(&xp->pte, 0));
24139+}
24140+#else
24141+#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
24142+#endif
24143+
24144+static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
24145 {
24146- *dst = val;
24147+ xen_l2_entry_update(pmdp, pmd);
24148 }
24149
24150-#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
24151-#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
24152-#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
24153+static inline void xen_pmd_clear(pmd_t *pmd)
24154+{
24155+ xen_set_pmd(pmd, xen_make_pmd(0));
24156+}
24157+
24158+static inline void xen_set_pud(pud_t *pudp, pud_t pud)
24159+{
24160+ xen_l3_entry_update(pudp, pud);
24161+}
24162
24163-static inline void pud_clear (pud_t * pud)
24164+static inline void xen_pud_clear(pud_t *pud)
24165 {
24166- set_pud(pud, __pud(0));
24167+ xen_set_pud(pud, xen_make_pud(0));
24168 }
24169
24170 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
24171
24172-static inline void pgd_clear (pgd_t * pgd)
24173+static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
24174 {
24175- set_pgd(pgd, __pgd(0));
24176- set_pgd(__user_pgd(pgd), __pgd(0));
24177+ xen_l4_entry_update(pgdp, pgd);
24178 }
24179
24180-#define pte_same(a, b) ((a).pte == (b).pte)
24181+static inline void xen_pgd_clear(pgd_t * pgd)
24182+{
24183+ xen_set_pgd(pgd, xen_make_pgd(0));
24184+ xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
24185+}
24186
24187-#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
24188+#define pte_same(a, b) ((a).pte == (b).pte)
24189
24190 #endif /* !__ASSEMBLY__ */
24191
24192@@ -131,8 +142,6 @@ static inline void pgd_clear (pgd_t * pg
24193 #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
24194 #define PGDIR_MASK (~(PGDIR_SIZE-1))
24195
24196-#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
24197-#define FIRST_USER_ADDRESS 0
24198
24199 #define MAXMEM _AC(0x3fffffffffff, UL)
24200 #define VMALLOC_START _AC(0xffffc20000000000, UL)
24201@@ -142,105 +151,6 @@ static inline void pgd_clear (pgd_t * pg
24202 #define MODULES_END _AC(0xfffffffffff00000, UL)
24203 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
24204
24205-#define _PAGE_BIT_PRESENT 0
24206-#define _PAGE_BIT_RW 1
24207-#define _PAGE_BIT_USER 2
24208-#define _PAGE_BIT_PWT 3
24209-#define _PAGE_BIT_PCD 4
24210-#define _PAGE_BIT_ACCESSED 5
24211-#define _PAGE_BIT_DIRTY 6
24212-#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
24213-#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
24214-#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
24215-
24216-#define _PAGE_PRESENT 0x001
24217-#define _PAGE_RW 0x002
24218-#define _PAGE_USER 0x004
24219-#define _PAGE_PWT 0x008
24220-#define _PAGE_PCD 0x010
24221-#define _PAGE_ACCESSED 0x020
24222-#define _PAGE_DIRTY 0x040
24223-#define _PAGE_PSE 0x080 /* 2MB page */
24224-#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
24225-#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
24226-
24227-#define _PAGE_PROTNONE 0x080 /* If not present */
24228-#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX)
24229-
24230-/* Mapped page is I/O or foreign and has no associated page struct. */
24231-#define _PAGE_IO 0x200
24232-
24233-#ifndef __ASSEMBLY__
24234-#if CONFIG_XEN_COMPAT <= 0x030002
24235-extern unsigned int __kernel_page_user;
24236-#else
24237-#define __kernel_page_user 0
24238-#endif
24239-#endif
24240-
24241-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
24242-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
24243-
24244-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
24245-
24246-#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
24247-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24248-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
24249-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24250-#define PAGE_COPY PAGE_COPY_NOEXEC
24251-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24252-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24253-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24254-#define __PAGE_KERNEL \
24255- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24256-#define __PAGE_KERNEL_EXEC \
24257- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
24258-#define __PAGE_KERNEL_NOCACHE \
24259- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24260-#define __PAGE_KERNEL_RO \
24261- (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24262-#define __PAGE_KERNEL_VSYSCALL \
24263- (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24264-#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
24265- (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
24266-#define __PAGE_KERNEL_LARGE \
24267- (__PAGE_KERNEL | _PAGE_PSE)
24268-#define __PAGE_KERNEL_LARGE_EXEC \
24269- (__PAGE_KERNEL_EXEC | _PAGE_PSE)
24270-
24271-/*
24272- * We don't support GLOBAL page in xenolinux64
24273- */
24274-#define MAKE_GLOBAL(x) __pgprot((x))
24275-
24276-#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
24277-#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
24278-#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
24279-#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
24280-#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
24281-#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
24282-#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
24283-#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
24284-
24285-/* xwr */
24286-#define __P000 PAGE_NONE
24287-#define __P001 PAGE_READONLY
24288-#define __P010 PAGE_COPY
24289-#define __P011 PAGE_COPY
24290-#define __P100 PAGE_READONLY_EXEC
24291-#define __P101 PAGE_READONLY_EXEC
24292-#define __P110 PAGE_COPY_EXEC
24293-#define __P111 PAGE_COPY_EXEC
24294-
24295-#define __S000 PAGE_NONE
24296-#define __S001 PAGE_READONLY
24297-#define __S010 PAGE_SHARED
24298-#define __S011 PAGE_SHARED
24299-#define __S100 PAGE_READONLY_EXEC
24300-#define __S101 PAGE_READONLY_EXEC
24301-#define __S110 PAGE_SHARED_EXEC
24302-#define __S111 PAGE_SHARED_EXEC
24303-
24304 #ifndef __ASSEMBLY__
24305
24306 static inline unsigned long pgd_bad(pgd_t pgd)
24307@@ -258,119 +168,26 @@ static inline unsigned long pmd_bad(pmd_
24308 return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
24309 }
24310
24311-#define set_pte_at(_mm,addr,ptep,pteval) do { \
24312- if (((_mm) != current->mm && (_mm) != &init_mm) || \
24313- HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
24314- set_pte((ptep), (pteval)); \
24315-} while (0)
24316-
24317 #define pte_none(x) (!(x).pte)
24318 #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
24319-#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
24320
24321-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
24322+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
24323
24324 #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
24325 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
24326 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
24327-#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \
24328+#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
24329 (_pte).pte & _PAGE_PRESENT ? \
24330 mfn_to_local_pfn(__pte_mfn(_pte)) : \
24331 __pte_mfn(_pte))
24332
24333 #define pte_page(x) pfn_to_page(pte_pfn(x))
24334
24335-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24336-{
24337- unsigned long pte = page_nr << PAGE_SHIFT;
24338- pte |= pgprot_val(pgprot);
24339- pte &= __supported_pte_mask;
24340- return __pte(pte);
24341-}
24342-
24343-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24344-{
24345- pte_t pte = *ptep;
24346- if (!pte_none(pte)) {
24347- if ((mm != &init_mm) ||
24348- HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
24349- pte = __pte_ma(xchg(&ptep->pte, 0));
24350- }
24351- return pte;
24352-}
24353-
24354-static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
24355-{
24356- if (full) {
24357- pte_t pte = *ptep;
24358- if (PagePinned(virt_to_page(mm->pgd)))
24359- xen_l1_entry_update(ptep, __pte(0));
24360- else
24361- *ptep = __pte(0);
24362- return pte;
24363- }
24364- return ptep_get_and_clear(mm, addr, ptep);
24365-}
24366-
24367-#define ptep_clear_flush(vma, addr, ptep) \
24368-({ \
24369- pte_t *__ptep = (ptep); \
24370- pte_t __res = *__ptep; \
24371- if (!pte_none(__res) && \
24372- ((vma)->vm_mm != current->mm || \
24373- HYPERVISOR_update_va_mapping(addr, __pte(0), \
24374- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24375- UVMF_INVLPG|UVMF_MULTI))) { \
24376- __ptep->pte = 0; \
24377- flush_tlb_page(vma, addr); \
24378- } \
24379- __res; \
24380-})
24381-
24382-/*
24383- * The following only work if pte_present() is true.
24384- * Undefined behaviour if not..
24385- */
24386-#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
24387-static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
24388-static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
24389-static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
24390-static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
24391-static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
24392-
24393-static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
24394-static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
24395-static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
24396-static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; }
24397-static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
24398-static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
24399-static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
24400-static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
24401-static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
24402-
24403-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
24404-{
24405- if (!pte_young(*ptep))
24406- return 0;
24407- return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
24408-}
24409-
24410-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24411-{
24412- pte_t pte = *ptep;
24413- if (pte_write(pte))
24414- set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
24415-}
24416-
24417 /*
24418 * Macro to mark a page protection value as "uncacheable".
24419 */
24420 #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
24421
24422-static inline int pmd_large(pmd_t pte) {
24423- return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
24424-}
24425-
24426
24427 /*
24428 * Conversion functions: convert a page and protection to a page entry,
24429@@ -386,6 +203,7 @@ static inline int pmd_large(pmd_t pte) {
24430 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
24431 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
24432 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
24433+static inline int pgd_large(pgd_t pgd) { return 0; }
24434 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
24435
24436 /* PUD - Level3 access */
24437@@ -396,6 +214,12 @@ static inline int pmd_large(pmd_t pte) {
24438 #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
24439 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
24440
24441+static inline int pud_large(pud_t pte)
24442+{
24443+ return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
24444+ (_PAGE_PSE|_PAGE_PRESENT);
24445+}
24446+
24447 /* PMD - Level 2 access */
24448 #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
24449 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
24450@@ -411,36 +235,18 @@ static inline int pmd_large(pmd_t pte) {
24451 #else
24452 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
24453 #endif
24454-#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
24455 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
24456 #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
24457
24458 #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
24459-#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
24460+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
24461 #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
24462
24463 /* PTE - Level 1 access. */
24464
24465 /* page, protection -> pte */
24466 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
24467-#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
24468
24469-/* Change flags of a PTE */
24470-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24471-{
24472- /*
24473- * Since this might change the present bit (which controls whether
24474- * a pte_t object has undergone p2m translation), we must use
24475- * pte_val() on the input pte and __pte() for the return value.
24476- */
24477- unsigned long pteval = pte_val(pte);
24478-
24479- pteval &= _PAGE_CHG_MASK;
24480- pteval |= pgprot_val(newprot);
24481- pteval &= __supported_pte_mask;
24482- return __pte(pteval);
24483-}
24484-
24485 #define pte_index(address) \
24486 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
24487 #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
24488@@ -454,101 +260,21 @@ static inline pte_t pte_modify(pte_t pte
24489
24490 #define update_mmu_cache(vma,address,pte) do { } while (0)
24491
24492-/*
24493- * Rules for using ptep_establish: the pte MUST be a user pte, and
24494- * must be a present->present transition.
24495- */
24496-#define __HAVE_ARCH_PTEP_ESTABLISH
24497-#define ptep_establish(vma, address, ptep, pteval) \
24498- do { \
24499- if ( likely((vma)->vm_mm == current->mm) ) { \
24500- BUG_ON(HYPERVISOR_update_va_mapping(address, \
24501- pteval, \
24502- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24503- UVMF_INVLPG|UVMF_MULTI)); \
24504- } else { \
24505- xen_l1_entry_update(ptep, pteval); \
24506- flush_tlb_page(vma, address); \
24507- } \
24508- } while (0)
24509-
24510-/* We only update the dirty/accessed state if we set
24511- * the dirty bit by hand in the kernel, since the hardware
24512- * will do the accessed bit for us, and we don't want to
24513- * race with other CPU's that might be updating the dirty
24514- * bit at the same time. */
24515-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
24516-#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
24517-({ \
24518- int __changed = !pte_same(*(ptep), entry); \
24519- if (__changed && (dirty)) \
24520- ptep_establish(vma, address, ptep, entry); \
24521- __changed; \
24522-})
24523-
24524-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24525-#define ptep_clear_flush_young(vma, address, ptep) \
24526-({ \
24527- pte_t __pte = *(ptep); \
24528- int __young = pte_young(__pte); \
24529- __pte = pte_mkold(__pte); \
24530- if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
24531- (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24532- else if (__young) \
24533- set_pte(ptep, __pte); \
24534- __young; \
24535-})
24536-
24537 /* Encode and de-code a swap entry */
24538 #define __swp_type(x) (((x).val >> 1) & 0x3f)
24539 #define __swp_offset(x) ((x).val >> 8)
24540 #define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
24541 #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
24542-#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
24543-
24544-extern spinlock_t pgd_lock;
24545-extern struct list_head pgd_list;
24546+#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
24547
24548 extern int kern_addr_valid(unsigned long addr);
24549-
24550-#define DOMID_LOCAL (0xFFFFU)
24551-
24552-struct vm_area_struct;
24553-
24554-int direct_remap_pfn_range(struct vm_area_struct *vma,
24555- unsigned long address,
24556- unsigned long mfn,
24557- unsigned long size,
24558- pgprot_t prot,
24559- domid_t domid);
24560-
24561-int direct_kernel_remap_pfn_range(unsigned long address,
24562- unsigned long mfn,
24563- unsigned long size,
24564- pgprot_t prot,
24565- domid_t domid);
24566-
24567-int create_lookup_pte_addr(struct mm_struct *mm,
24568- unsigned long address,
24569- uint64_t *ptep);
24570-
24571-int touch_pte_range(struct mm_struct *mm,
24572- unsigned long address,
24573- unsigned long size);
24574-
24575-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
24576- unsigned long addr, unsigned long end, pgprot_t newprot,
24577- int dirty_accountable);
24578-
24579-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
24580- xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
24581-
24582-pte_t *lookup_address(unsigned long addr);
24583+extern void cleanup_highmap(void);
24584
24585 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
24586 direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
24587
24588 #define HAVE_ARCH_UNMAPPED_AREA
24589+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
24590
24591 #define pgtable_cache_init() do { } while (0)
24592 #define check_pgt_cache() do { } while (0)
24593@@ -561,13 +287,7 @@ pte_t *lookup_address(unsigned long addr
24594 #define kc_offset_to_vaddr(o) \
24595 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
24596
24597-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
24598-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24599-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24600-#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24601-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
24602 #define __HAVE_ARCH_PTE_SAME
24603-#include <asm-generic/pgtable.h>
24604 #endif /* !__ASSEMBLY__ */
24605
24606 #endif /* _X86_64_PGTABLE_H */
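In the 64-bit header above, the SMP build of xen_ptep_get_and_clear() uses xchg() so that reading the old pte and clearing it is one atomic operation (a concurrent hardware accessed/dirty-bit update on another CPU cannot slip in between), while the UP build falls back to xen_local_ptep_get_and_clear(), where a plain read followed by a clear is sufficient. A rough stand-alone model of the two paths, with a GCC __atomic builtin standing in for the kernel's xchg() and pte_t reduced to a bare 64-bit value:

/*
 * Illustration of the SMP vs UP split above; not the kernel implementation.
 */
#include <stdio.h>
#include <stdint.h>

typedef struct { uint64_t pte; } pte_t;

/* SMP: swap the entry for zero atomically so no concurrent update is lost. */
static pte_t ptep_get_and_clear_smp(pte_t *ptep)
{
	pte_t old = { __atomic_exchange_n(&ptep->pte, 0, __ATOMIC_SEQ_CST) };
	return old;
}

/* UP: nothing can race with us, so read-then-store is enough
 * (the role of the xen_local_ptep_get_and_clear() fallback). */
static pte_t ptep_get_and_clear_up(pte_t *ptep)
{
	pte_t old = *ptep;
	ptep->pte = 0;
	return old;
}

int main(void)
{
	pte_t pte = { 0x1234000ull | 0x27 };	/* some pfn plus low flag bits */

	printf("smp: old=%#llx now=%#llx\n",
	       (unsigned long long)ptep_get_and_clear_smp(&pte).pte,
	       (unsigned long long)pte.pte);

	pte.pte = 0x5678000ull | 0x63;
	printf("up:  old=%#llx now=%#llx\n",
	       (unsigned long long)ptep_get_and_clear_up(&pte).pte,
	       (unsigned long long)pte.pte);
	return 0;
}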
24607--- a/include/asm-x86/mach-xen/asm/pgtable.h
24608+++ b/include/asm-x86/mach-xen/asm/pgtable.h
24609@@ -1,5 +1,454 @@
24610+#ifndef _ASM_X86_PGTABLE_H
24611+#define _ASM_X86_PGTABLE_H
24612+
24613+#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
24614+#define FIRST_USER_ADDRESS 0
24615+
24616+#define _PAGE_BIT_PRESENT 0
24617+#define _PAGE_BIT_RW 1
24618+#define _PAGE_BIT_USER 2
24619+#define _PAGE_BIT_PWT 3
24620+#define _PAGE_BIT_PCD 4
24621+#define _PAGE_BIT_ACCESSED 5
24622+#define _PAGE_BIT_DIRTY 6
24623+#define _PAGE_BIT_FILE 6
24624+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
24625+#define _PAGE_BIT_PAT 7 /* on 4KB pages */
24626+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
24627+#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
24628+ * has no associated page struct. */
24629+#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
24630+#define _PAGE_BIT_UNUSED3 11
24631+#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
24632+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
24633+
24634+/*
24635+ * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
24636+ * sign-extended value on 32-bit with all 1's in the upper word,
24637+ * which preserves the upper pte values on 64-bit ptes:
24638+ */
24639+#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
24640+#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
24641+#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
24642+#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
24643+#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
24644+#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
24645+#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
24646+#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
24647+#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
24648+#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
24649+#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
24650+#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
24651+#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
24652+#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
24653+
24654+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
24655+#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
24656+#else
24657+#define _PAGE_NX 0
24658+#endif
24659+
24660+/* If _PAGE_PRESENT is clear, we use these: */
24661+#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
24662+#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
24663+ pte_present gives true */
24664+
24665+#ifndef __ASSEMBLY__
24666+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
24667+extern unsigned int __kernel_page_user;
24668+#else
24669+#define __kernel_page_user 0
24670+#endif
24671+#endif
24672+
24673+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
24674+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
24675+
24676+#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
24677+
24678+#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
24679+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24680+
24681+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
24682+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24683+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24684+#define PAGE_COPY PAGE_COPY_NOEXEC
24685+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24686+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24687+
24688+#ifdef CONFIG_X86_32
24689+#define _PAGE_KERNEL_EXEC \
24690+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
24691+#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
24692+
24693+#ifndef __ASSEMBLY__
24694+extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
24695+#endif /* __ASSEMBLY__ */
24696+#else
24697+#define __PAGE_KERNEL_EXEC \
24698+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
24699+#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
24700+#endif
24701+
24702+#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
24703+#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
24704+#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
24705+#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
24706+#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
24707+#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
24708+#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
24709+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
24710+#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
24711+
24712+/*
24713+ * We don't support GLOBAL page in xenolinux64
24714+ */
24715+#define MAKE_GLOBAL(x) __pgprot((x))
24716+
24717+#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
24718+#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
24719+#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
24720+#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
24721+#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
24722+#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
24723+#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
24724+#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
24725+#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
24726+#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
24727+#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
24728+
24729+/* xwr */
24730+#define __P000 PAGE_NONE
24731+#define __P001 PAGE_READONLY
24732+#define __P010 PAGE_COPY
24733+#define __P011 PAGE_COPY
24734+#define __P100 PAGE_READONLY_EXEC
24735+#define __P101 PAGE_READONLY_EXEC
24736+#define __P110 PAGE_COPY_EXEC
24737+#define __P111 PAGE_COPY_EXEC
24738+
24739+#define __S000 PAGE_NONE
24740+#define __S001 PAGE_READONLY
24741+#define __S010 PAGE_SHARED
24742+#define __S011 PAGE_SHARED
24743+#define __S100 PAGE_READONLY_EXEC
24744+#define __S101 PAGE_READONLY_EXEC
24745+#define __S110 PAGE_SHARED_EXEC
24746+#define __S111 PAGE_SHARED_EXEC
24747+
24748+#ifndef __ASSEMBLY__
24749+
24750+/*
24751+ * ZERO_PAGE is a global shared page that is always zero: used
24752+ * for zero-mapped memory areas etc..
24753+ */
24754+extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
24755+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
24756+
24757+extern spinlock_t pgd_lock;
24758+extern struct list_head pgd_list;
24759+
24760+/*
24761+ * The following only work if pte_present() is true.
24762+ * Undefined behaviour if not..
24763+ */
24764+static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
24765+static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
24766+static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
24767+static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
24768+static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
24769+static inline int pte_global(pte_t pte) { return 0; }
24770+static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
24771+
24772+static inline int pmd_large(pmd_t pte) {
24773+ return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
24774+ (_PAGE_PSE|_PAGE_PRESENT);
24775+}
24776+
24777+static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
24778+static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
24779+static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
24780+static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
24781+static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
24782+static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
24783+static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
24784+static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
24785+static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
24786+static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
24787+static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
24788+
24789+extern pteval_t __supported_pte_mask;
24790+
24791+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24792+{
24793+ return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
24794+ pgprot_val(pgprot)) & __supported_pte_mask);
24795+}
24796+
24797+static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
24798+{
24799+ return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
24800+ pgprot_val(pgprot)) & __supported_pte_mask);
24801+}
24802+
24803+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
24804+{
24805+ return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
24806+ pgprot_val(pgprot)) & __supported_pte_mask);
24807+}
24808+
24809+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24810+{
24811+ pteval_t val = pte_val(pte);
24812+
24813+ val &= _PAGE_CHG_MASK;
24814+ val |= pgprot_val(newprot) & __supported_pte_mask;
24815+
24816+ return __pte(val);
24817+}
24818+
24819+#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
24820+
24821+#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
24822+
24823+#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
24824+#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
24825+
24826+#define set_pte_atomic(ptep, pte) \
24827+ xen_set_pte_atomic(ptep, pte)
24828+
24829+#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
24830+
24831+#ifndef __PAGETABLE_PUD_FOLDED
24832+#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
24833+#define pgd_clear(pgd) xen_pgd_clear(pgd)
24834+#endif
24835+
24836+#ifndef set_pud
24837+# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
24838+#endif
24839+
24840+#ifndef __PAGETABLE_PMD_FOLDED
24841+#define pud_clear(pud) xen_pud_clear(pud)
24842+#endif
24843+
24844+#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
24845+#define pmd_clear(pmd) xen_pmd_clear(pmd)
24846+
24847+#define pte_update(mm, addr, ptep) do { } while (0)
24848+#define pte_update_defer(mm, addr, ptep) do { } while (0)
24849+
24850+#endif /* __ASSEMBLY__ */
24851+
24852 #ifdef CONFIG_X86_32
24853 # include "pgtable_32.h"
24854 #else
24855 # include "pgtable_64.h"
24856 #endif
24857+
24858+#ifndef __ASSEMBLY__
24859+
24860+enum {
24861+ PG_LEVEL_NONE,
24862+ PG_LEVEL_4K,
24863+ PG_LEVEL_2M,
24864+ PG_LEVEL_1G,
24865+};
24866+
24867+/*
24868+ * Helper function that returns the kernel pagetable entry controlling
24869+ * the virtual address 'address'. NULL means no pagetable entry present.
24870+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
24871+ * as a pte too.
24872+ */
24873+extern pte_t *lookup_address(unsigned long address, unsigned int *level);
24874+
24875+/* local pte updates need not use xchg for locking */
24876+static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
24877+{
24878+ xen_set_pte(ptep, __pte(0));
24879+ return res;
24880+}
24881+
24882+static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
24883+ pte_t *ptep , pte_t pte)
24884+{
24885+ if ((mm != current->mm && mm != &init_mm) ||
24886+ HYPERVISOR_update_va_mapping(addr, pte, 0))
24887+ xen_set_pte(ptep, pte);
24888+}
24889+
24890+static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
24891+ pte_t *ptep)
24892+{
24893+ if ((mm != current->mm && mm != &init_mm)
24894+ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
24895+ __xen_pte_clear(ptep);
24896+}
24897+
24898+#ifndef CONFIG_PARAVIRT
24899+/*
24900+ * Rules for using pte_update - it must be called after any PTE update which
24901+ * has not been done using the set_pte / clear_pte interfaces. It is used by
24902+ * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
24903+ * updates should either be sets, clears, or set_pte_atomic for P->P
24904+ * transitions, which means this hook should only be called for user PTEs.
24905+ * This hook implies a P->P protection or access change has taken place, which
24906+ * requires a subsequent TLB flush. The notification can optionally be delayed
24907+ * until the TLB flush event by using the pte_update_defer form of the
24908+ * interface, but care must be taken to assure that the flush happens while
24909+ * still holding the same page table lock so that the shadow and primary pages
24910+ * do not become out of sync on SMP.
24911+ */
24912+#define pte_update(mm, addr, ptep) do { } while (0)
24913+#define pte_update_defer(mm, addr, ptep) do { } while (0)
24914+#endif
24915+
24916+/*
24917+ * We only update the dirty/accessed state if we set
24918+ * the dirty bit by hand in the kernel, since the hardware
24919+ * will do the accessed bit for us, and we don't want to
24920+ * race with other CPU's that might be updating the dirty
24921+ * bit at the same time.
24922+ */
24923+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
24924+#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
24925+({ \
24926+ int __changed = !pte_same(*(ptep), entry); \
24927+ if (__changed && (dirty)) { \
24928+ if ( likely((vma)->vm_mm == current->mm) ) { \
24929+ BUG_ON(HYPERVISOR_update_va_mapping(address, \
24930+ entry, \
24931+ (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24932+ UVMF_INVLPG|UVMF_MULTI)); \
24933+ } else { \
24934+ xen_l1_entry_update(ptep, entry); \
24935+ flush_tlb_page(vma, address); \
24936+ } \
24937+ } \
24938+ __changed; \
24939+})
24940+
24941+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
24942+#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
24943+ int __ret = 0; \
24944+ if (pte_young(*(ptep))) \
24945+ __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
24946+ &(ptep)->pte); \
24947+ if (__ret) \
24948+ pte_update((vma)->vm_mm, addr, ptep); \
24949+ __ret; \
24950+})
24951+
24952+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24953+#define ptep_clear_flush_young(vma, address, ptep) \
24954+({ \
24955+ pte_t __pte = *(ptep); \
24956+ int __young = pte_young(__pte); \
24957+ __pte = pte_mkold(__pte); \
24958+ if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
24959+ (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24960+ else if (__young) \
24961+ (ptep)->pte_low = __pte.pte_low; \
24962+ __young; \
24963+})
24964+
24965+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24966+#define ptep_clear_flush(vma, addr, ptep) \
24967+({ \
24968+ pte_t *__ptep = (ptep); \
24969+ pte_t __res = *__ptep; \
24970+ if (!pte_none(__res) && \
24971+ ((vma)->vm_mm != current->mm || \
24972+ HYPERVISOR_update_va_mapping(addr, __pte(0), \
24973+ (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24974+ UVMF_INVLPG|UVMF_MULTI))) { \
24975+ __xen_pte_clear(__ptep); \
24976+ flush_tlb_page(vma, addr); \
24977+ } \
24978+ __res; \
24979+})
24980+
24981+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24982+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24983+{
24984+ pte_t pte = *ptep;
24985+ if (!pte_none(pte)
24986+ && (mm != &init_mm
24987+ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
24988+ pte = xen_ptep_get_and_clear(ptep, pte);
24989+ pte_update(mm, addr, ptep);
24990+ }
24991+ return pte;
24992+}
24993+
24994+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24995+#define ptep_get_and_clear_full(mm, addr, ptep, full) \
24996+ ((full) ? ({ \
24997+ pte_t *__ptep = (ptep); \
24998+ pte_t __res = *__ptep; \
24999+ if (!PagePinned(virt_to_page((mm)->pgd))) \
25000+ __xen_pte_clear(__ptep); \
25001+ else if (!pte_none(__res)) \
25002+ xen_l1_entry_update(__ptep, __pte(0)); \
25003+ __res; \
25004+ }) : \
25005+ ptep_get_and_clear(mm, addr, ptep))
25006+
25007+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
25008+
25009+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
25010+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
25011+{
25012+ pte_t pte = *ptep;
25013+ if (pte_write(pte))
25014+ set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
25015+}
25016+
25017+#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
25018+ xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
25019+
25020+#define arbitrary_virt_to_machine(va) \
25021+({ \
25022+ unsigned int __lvl; \
25023+ pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \
25024+ BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
25025+ (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \
25026+ | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
25027+})
25028+
25029+#include <asm-generic/pgtable.h>
25030+
25031+#include <xen/features.h>
25032+void make_page_readonly(void *va, unsigned int feature);
25033+void make_page_writable(void *va, unsigned int feature);
25034+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
25035+void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
25036+
25037+struct vm_area_struct;
25038+
25039+int direct_remap_pfn_range(struct vm_area_struct *vma,
25040+ unsigned long address,
25041+ unsigned long mfn,
25042+ unsigned long size,
25043+ pgprot_t prot,
25044+ domid_t domid);
25045+int direct_kernel_remap_pfn_range(unsigned long address,
25046+ unsigned long mfn,
25047+ unsigned long size,
25048+ pgprot_t prot,
25049+ domid_t domid);
25050+int create_lookup_pte_addr(struct mm_struct *mm,
25051+ unsigned long address,
25052+ uint64_t *ptep);
25053+int touch_pte_range(struct mm_struct *mm,
25054+ unsigned long address,
25055+ unsigned long size);
25056+
25057+int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
25058+ unsigned long addr, unsigned long end, pgprot_t newprot,
25059+ int dirty_accountable);
25060+
25061+#endif /* __ASSEMBLY__ */
25062+
25063+#endif /* _ASM_X86_PGTABLE_H */
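A minimal stand-alone sketch of the address arithmetic behind arbitrary_virt_to_machine() in the hunk above: the machine address is the PTE's machine frame number shifted up by PAGE_SHIFT, OR'ed with the offset inside the page. The mfn value and the sketch_* names below are illustrative assumptions only; the real macro reads the MFN from the live page table via pte_mfn() rather than taking it as a parameter.

/* Sketch only: MFN-plus-offset arithmetic, as in arbitrary_virt_to_machine(). */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SHIFT 12			/* assuming 4 KiB pages */
#define SKETCH_PAGE_SIZE  (1UL << SKETCH_PAGE_SHIFT)

typedef uint64_t maddr_t;

static maddr_t sketch_virt_to_machine(uint64_t mfn, uintptr_t va)
{
	/* machine frame base | offset within the page */
	return ((maddr_t)mfn << SKETCH_PAGE_SHIFT) | (va & (SKETCH_PAGE_SIZE - 1));
}

int main(void)
{
	uintptr_t va  = 0xc01234abUL;	/* arbitrary kernel virtual address */
	uint64_t  mfn = 0x5f3d2UL;	/* pretend pte_mfn() returned this */

	printf("maddr = %#llx\n",
	       (unsigned long long)sketch_virt_to_machine(mfn, va));
	return 0;
}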
25064--- a/include/asm-x86/mach-xen/asm/processor_32.h
25065+++ /dev/null
25066@@ -1,751 +0,0 @@
25067-/*
25068- * include/asm-i386/processor.h
25069- *
25070- * Copyright (C) 1994 Linus Torvalds
25071- */
25072-
25073-#ifndef __ASM_I386_PROCESSOR_H
25074-#define __ASM_I386_PROCESSOR_H
25075-
25076-#include <asm/vm86.h>
25077-#include <asm/math_emu.h>
25078-#include <asm/segment.h>
25079-#include <asm/page.h>
25080-#include <asm/types.h>
25081-#include <asm/sigcontext.h>
25082-#include <asm/cpufeature.h>
25083-#include <asm/msr.h>
25084-#include <asm/system.h>
25085-#include <linux/cache.h>
25086-#include <linux/threads.h>
25087-#include <asm/percpu.h>
25088-#include <linux/cpumask.h>
25089-#include <linux/init.h>
25090-#include <asm/processor-flags.h>
25091-#include <xen/interface/physdev.h>
25092-
25093-/* flag for disabling the tsc */
25094-#define tsc_disable 0
25095-
25096-struct desc_struct {
25097- unsigned long a,b;
25098-};
25099-
25100-#define desc_empty(desc) \
25101- (!((desc)->a | (desc)->b))
25102-
25103-#define desc_equal(desc1, desc2) \
25104- (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
25105-/*
25106- * Default implementation of macro that returns current
25107- * instruction pointer ("program counter").
25108- */
25109-#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
25110-
25111-/*
25112- * CPU type and hardware bug flags. Kept separately for each CPU.
25113- * Members of this structure are referenced in head.S, so think twice
25114- * before touching them. [mj]
25115- */
25116-
25117-struct cpuinfo_x86 {
25118- __u8 x86; /* CPU family */
25119- __u8 x86_vendor; /* CPU vendor */
25120- __u8 x86_model;
25121- __u8 x86_mask;
25122- char wp_works_ok; /* It doesn't on 386's */
25123- char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
25124- char hard_math;
25125- char rfu;
25126- int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
25127- unsigned long x86_capability[NCAPINTS];
25128- char x86_vendor_id[16];
25129- char x86_model_id[64];
25130- int x86_cache_size; /* in KB - valid for CPUS which support this
25131- call */
25132- int x86_cache_alignment; /* In bytes */
25133- char fdiv_bug;
25134- char f00f_bug;
25135- char coma_bug;
25136- char pad0;
25137- int x86_power;
25138- unsigned long loops_per_jiffy;
25139-#ifdef CONFIG_SMP
25140- cpumask_t llc_shared_map; /* cpus sharing the last level cache */
25141-#endif
25142- unsigned char x86_max_cores; /* cpuid returned max cores value */
25143- unsigned char apicid;
25144- unsigned short x86_clflush_size;
25145-#ifdef CONFIG_SMP
25146- unsigned char booted_cores; /* number of cores as seen by OS */
25147- __u8 phys_proc_id; /* Physical processor id. */
25148- __u8 cpu_core_id; /* Core id */
25149- __u8 cpu_index; /* index into per_cpu list */
25150-#endif
25151-} __attribute__((__aligned__(SMP_CACHE_BYTES)));
25152-
25153-#define X86_VENDOR_INTEL 0
25154-#define X86_VENDOR_CYRIX 1
25155-#define X86_VENDOR_AMD 2
25156-#define X86_VENDOR_UMC 3
25157-#define X86_VENDOR_NEXGEN 4
25158-#define X86_VENDOR_CENTAUR 5
25159-#define X86_VENDOR_TRANSMETA 7
25160-#define X86_VENDOR_NSC 8
25161-#define X86_VENDOR_NUM 9
25162-#define X86_VENDOR_UNKNOWN 0xff
25163-
25164-/*
25165- * capabilities of CPUs
25166- */
25167-
25168-extern struct cpuinfo_x86 boot_cpu_data;
25169-extern struct cpuinfo_x86 new_cpu_data;
25170-#ifndef CONFIG_X86_NO_TSS
25171-extern struct tss_struct doublefault_tss;
25172-DECLARE_PER_CPU(struct tss_struct, init_tss);
25173-#endif
25174-
25175-#ifdef CONFIG_SMP
25176-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25177-#define cpu_data(cpu) per_cpu(cpu_info, cpu)
25178-#define current_cpu_data cpu_data(smp_processor_id())
25179-#else
25180-#define cpu_data(cpu) boot_cpu_data
25181-#define current_cpu_data boot_cpu_data
25182-#endif
25183-
25184-/*
25185- * the following now lives in the per cpu area:
25186- * extern int cpu_llc_id[NR_CPUS];
25187- */
25188-DECLARE_PER_CPU(u8, cpu_llc_id);
25189-extern char ignore_fpu_irq;
25190-
25191-void __init cpu_detect(struct cpuinfo_x86 *c);
25192-
25193-extern void identify_boot_cpu(void);
25194-extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25195-extern void print_cpu_info(struct cpuinfo_x86 *);
25196-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25197-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25198-extern unsigned short num_cache_leaves;
25199-
25200-#ifdef CONFIG_X86_HT
25201-extern void detect_ht(struct cpuinfo_x86 *c);
25202-#else
25203-static inline void detect_ht(struct cpuinfo_x86 *c) {}
25204-#endif
25205-
25206-static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
25207- unsigned int *ecx, unsigned int *edx)
25208-{
25209- /* ecx is often an input as well as an output. */
25210- __asm__(XEN_CPUID
25211- : "=a" (*eax),
25212- "=b" (*ebx),
25213- "=c" (*ecx),
25214- "=d" (*edx)
25215- : "0" (*eax), "2" (*ecx));
25216-}
25217-
25218-#define load_cr3(pgdir) write_cr3(__pa(pgdir))
25219-
25220-/*
25221- * Save the cr4 feature set we're using (ie
25222- * Pentium 4MB enable and PPro Global page
25223- * enable), so that any CPU's that boot up
25224- * after us can get the correct flags.
25225- */
25226-extern unsigned long mmu_cr4_features;
25227-
25228-static inline void set_in_cr4 (unsigned long mask)
25229-{
25230- unsigned cr4;
25231- mmu_cr4_features |= mask;
25232- cr4 = read_cr4();
25233- cr4 |= mask;
25234- write_cr4(cr4);
25235-}
25236-
25237-static inline void clear_in_cr4 (unsigned long mask)
25238-{
25239- unsigned cr4;
25240- mmu_cr4_features &= ~mask;
25241- cr4 = read_cr4();
25242- cr4 &= ~mask;
25243- write_cr4(cr4);
25244-}
25245-
25246-/* Stop speculative execution */
25247-static inline void sync_core(void)
25248-{
25249- int tmp;
25250- asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
25251-}
25252-
25253-static inline void __monitor(const void *eax, unsigned long ecx,
25254- unsigned long edx)
25255-{
25256- /* "monitor %eax,%ecx,%edx;" */
25257- asm volatile(
25258- ".byte 0x0f,0x01,0xc8;"
25259- : :"a" (eax), "c" (ecx), "d"(edx));
25260-}
25261-
25262-static inline void __mwait(unsigned long eax, unsigned long ecx)
25263-{
25264- /* "mwait %eax,%ecx;" */
25265- asm volatile(
25266- ".byte 0x0f,0x01,0xc9;"
25267- : :"a" (eax), "c" (ecx));
25268-}
25269-
25270-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25271-
25272-/* from system description table in BIOS. Mostly for MCA use, but
25273-others may find it useful. */
25274-extern unsigned int machine_id;
25275-extern unsigned int machine_submodel_id;
25276-extern unsigned int BIOS_revision;
25277-extern unsigned int mca_pentium_flag;
25278-
25279-/* Boot loader type from the setup header */
25280-extern int bootloader_type;
25281-
25282-/*
25283- * User space process size: 3GB (default).
25284- */
25285-#define TASK_SIZE (PAGE_OFFSET)
25286-
25287-/* This decides where the kernel will search for a free chunk of vm
25288- * space during mmap's.
25289- */
25290-#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
25291-
25292-#define HAVE_ARCH_PICK_MMAP_LAYOUT
25293-
25294-extern void hard_disable_TSC(void);
25295-extern void disable_TSC(void);
25296-extern void hard_enable_TSC(void);
25297-
25298-/*
25299- * Size of io_bitmap.
25300- */
25301-#define IO_BITMAP_BITS 65536
25302-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
25303-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
25304-#ifndef CONFIG_X86_NO_TSS
25305-#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
25306-#endif
25307-#define INVALID_IO_BITMAP_OFFSET 0x8000
25308-#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
25309-
25310-struct i387_fsave_struct {
25311- long cwd;
25312- long swd;
25313- long twd;
25314- long fip;
25315- long fcs;
25316- long foo;
25317- long fos;
25318- long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25319- long status; /* software status information */
25320-};
25321-
25322-struct i387_fxsave_struct {
25323- unsigned short cwd;
25324- unsigned short swd;
25325- unsigned short twd;
25326- unsigned short fop;
25327- long fip;
25328- long fcs;
25329- long foo;
25330- long fos;
25331- long mxcsr;
25332- long mxcsr_mask;
25333- long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
25334- long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
25335- long padding[56];
25336-} __attribute__ ((aligned (16)));
25337-
25338-struct i387_soft_struct {
25339- long cwd;
25340- long swd;
25341- long twd;
25342- long fip;
25343- long fcs;
25344- long foo;
25345- long fos;
25346- long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25347- unsigned char ftop, changed, lookahead, no_update, rm, alimit;
25348- struct info *info;
25349- unsigned long entry_eip;
25350-};
25351-
25352-union i387_union {
25353- struct i387_fsave_struct fsave;
25354- struct i387_fxsave_struct fxsave;
25355- struct i387_soft_struct soft;
25356-};
25357-
25358-typedef struct {
25359- unsigned long seg;
25360-} mm_segment_t;
25361-
25362-struct thread_struct;
25363-
25364-#ifndef CONFIG_X86_NO_TSS
25365-/* This is the TSS defined by the hardware. */
25366-struct i386_hw_tss {
25367- unsigned short back_link,__blh;
25368- unsigned long esp0;
25369- unsigned short ss0,__ss0h;
25370- unsigned long esp1;
25371- unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
25372- unsigned long esp2;
25373- unsigned short ss2,__ss2h;
25374- unsigned long __cr3;
25375- unsigned long eip;
25376- unsigned long eflags;
25377- unsigned long eax,ecx,edx,ebx;
25378- unsigned long esp;
25379- unsigned long ebp;
25380- unsigned long esi;
25381- unsigned long edi;
25382- unsigned short es, __esh;
25383- unsigned short cs, __csh;
25384- unsigned short ss, __ssh;
25385- unsigned short ds, __dsh;
25386- unsigned short fs, __fsh;
25387- unsigned short gs, __gsh;
25388- unsigned short ldt, __ldth;
25389- unsigned short trace, io_bitmap_base;
25390-} __attribute__((packed));
25391-
25392-struct tss_struct {
25393- struct i386_hw_tss x86_tss;
25394-
25395- /*
25396- * The extra 1 is there because the CPU will access an
25397- * additional byte beyond the end of the IO permission
25398- * bitmap. The extra byte must be all 1 bits, and must
25399- * be within the limit.
25400- */
25401- unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
25402- /*
25403- * Cache the current maximum and the last task that used the bitmap:
25404- */
25405- unsigned long io_bitmap_max;
25406- struct thread_struct *io_bitmap_owner;
25407- /*
25408- * pads the TSS to be cacheline-aligned (size is 0x100)
25409- */
25410- unsigned long __cacheline_filler[35];
25411- /*
25412- * .. and then another 0x100 bytes for emergency kernel stack
25413- */
25414- unsigned long stack[64];
25415-} __attribute__((packed));
25416-#endif
25417-
25418-#define ARCH_MIN_TASKALIGN 16
25419-
25420-struct thread_struct {
25421-/* cached TLS descriptors. */
25422- struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
25423- unsigned long esp0;
25424- unsigned long sysenter_cs;
25425- unsigned long eip;
25426- unsigned long esp;
25427- unsigned long fs;
25428- unsigned long gs;
25429-/* Hardware debugging registers */
25430- unsigned long debugreg[8]; /* %%db0-7 debug registers */
25431-/* fault info */
25432- unsigned long cr2, trap_no, error_code;
25433-/* floating point info */
25434- union i387_union i387;
25435-/* virtual 86 mode info */
25436- struct vm86_struct __user * vm86_info;
25437- unsigned long screen_bitmap;
25438- unsigned long v86flags, v86mask, saved_esp0;
25439- unsigned int saved_fs, saved_gs;
25440-/* IO permissions */
25441- unsigned long *io_bitmap_ptr;
25442- unsigned long iopl;
25443-/* max allowed port in the bitmap, in bytes: */
25444- unsigned long io_bitmap_max;
25445-};
25446-
25447-#define INIT_THREAD { \
25448- .esp0 = sizeof(init_stack) + (long)&init_stack, \
25449- .vm86_info = NULL, \
25450- .sysenter_cs = __KERNEL_CS, \
25451- .io_bitmap_ptr = NULL, \
25452- .fs = __KERNEL_PERCPU, \
25453-}
25454-
25455-/*
25456- * Note that the .io_bitmap member must be extra-big. This is because
25457- * the CPU will access an additional byte beyond the end of the IO
25458- * permission bitmap. The extra byte must be all 1 bits, and must
25459- * be within the limit.
25460- */
25461-#define INIT_TSS { \
25462- .x86_tss = { \
25463- .esp0 = sizeof(init_stack) + (long)&init_stack, \
25464- .ss0 = __KERNEL_DS, \
25465- .ss1 = __KERNEL_CS, \
25466- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
25467- }, \
25468- .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
25469-}
25470-
25471-#define start_thread(regs, new_eip, new_esp) do { \
25472- __asm__("movl %0,%%gs": :"r" (0)); \
25473- regs->xfs = 0; \
25474- set_fs(USER_DS); \
25475- regs->xds = __USER_DS; \
25476- regs->xes = __USER_DS; \
25477- regs->xss = __USER_DS; \
25478- regs->xcs = __USER_CS; \
25479- regs->eip = new_eip; \
25480- regs->esp = new_esp; \
25481-} while (0)
25482-
25483-/* Forward declaration, a strange C thing */
25484-struct task_struct;
25485-struct mm_struct;
25486-
25487-/* Free all resources held by a thread. */
25488-extern void release_thread(struct task_struct *);
25489-
25490-/* Prepare to copy thread state - unlazy all lazy status */
25491-extern void prepare_to_copy(struct task_struct *tsk);
25492-
25493-/*
25494- * create a kernel thread without removing it from tasklists
25495- */
25496-extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
25497-
25498-extern unsigned long thread_saved_pc(struct task_struct *tsk);
25499-void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
25500-
25501-unsigned long get_wchan(struct task_struct *p);
25502-
25503-#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
25504-#define KSTK_TOP(info) \
25505-({ \
25506- unsigned long *__ptr = (unsigned long *)(info); \
25507- (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
25508-})
25509-
25510-/*
25511- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
25512- * This is necessary to guarantee that the entire "struct pt_regs"
25513- * is accessable even if the CPU haven't stored the SS/ESP registers
25514- * on the stack (interrupt gate does not save these registers
25515- * when switching to the same priv ring).
25516- * Therefore beware: accessing the xss/esp fields of the
25517- * "struct pt_regs" is possible, but they may contain the
25518- * completely wrong values.
25519- */
25520-#define task_pt_regs(task) \
25521-({ \
25522- struct pt_regs *__regs__; \
25523- __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
25524- __regs__ - 1; \
25525-})
25526-
25527-#define KSTK_EIP(task) (task_pt_regs(task)->eip)
25528-#define KSTK_ESP(task) (task_pt_regs(task)->esp)
25529-
25530-
25531-struct microcode_header {
25532- unsigned int hdrver;
25533- unsigned int rev;
25534- unsigned int date;
25535- unsigned int sig;
25536- unsigned int cksum;
25537- unsigned int ldrver;
25538- unsigned int pf;
25539- unsigned int datasize;
25540- unsigned int totalsize;
25541- unsigned int reserved[3];
25542-};
25543-
25544-struct microcode {
25545- struct microcode_header hdr;
25546- unsigned int bits[0];
25547-};
25548-
25549-typedef struct microcode microcode_t;
25550-typedef struct microcode_header microcode_header_t;
25551-
25552-/* microcode format is extended from prescott processors */
25553-struct extended_signature {
25554- unsigned int sig;
25555- unsigned int pf;
25556- unsigned int cksum;
25557-};
25558-
25559-struct extended_sigtable {
25560- unsigned int count;
25561- unsigned int cksum;
25562- unsigned int reserved[3];
25563- struct extended_signature sigs[0];
25564-};
25565-
25566-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
25567-static inline void rep_nop(void)
25568-{
25569- __asm__ __volatile__("rep;nop": : :"memory");
25570-}
25571-
25572-#define cpu_relax() rep_nop()
25573-
25574-#ifndef CONFIG_X86_NO_TSS
25575-static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
25576-{
25577- tss->x86_tss.esp0 = thread->esp0;
25578- /* This can only happen when SEP is enabled, no need to test "SEP"arately */
25579- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
25580- tss->x86_tss.ss1 = thread->sysenter_cs;
25581- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
25582- }
25583-}
25584-#else
25585-#define xen_load_esp0(tss, thread) do { \
25586- if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
25587- BUG(); \
25588-} while (0)
25589-#endif
25590-
25591-
25592-static inline unsigned long xen_get_debugreg(int regno)
25593-{
25594- return HYPERVISOR_get_debugreg(regno);
25595-}
25596-
25597-static inline void xen_set_debugreg(int regno, unsigned long value)
25598-{
25599- WARN_ON(HYPERVISOR_set_debugreg(regno, value));
25600-}
25601-
25602-/*
25603- * Set IOPL bits in EFLAGS from given mask
25604- */
25605-static inline void xen_set_iopl_mask(unsigned mask)
25606-{
25607- struct physdev_set_iopl set_iopl;
25608-
25609- /* Force the change at ring 0. */
25610- set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
25611- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
25612-}
25613-
25614-
25615-#define paravirt_enabled() 0
25616-#define __cpuid xen_cpuid
25617-
25618-#define load_esp0 xen_load_esp0
25619-
25620-/*
25621- * These special macros can be used to get or set a debugging register
25622- */
25623-#define get_debugreg(var, register) \
25624- (var) = xen_get_debugreg(register)
25625-#define set_debugreg(value, register) \
25626- xen_set_debugreg(register, value)
25627-
25628-#define set_iopl_mask xen_set_iopl_mask
25629-
25630-/*
25631- * Generic CPUID function
25632- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
25633- * resulting in stale register contents being returned.
25634- */
25635-static inline void cpuid(unsigned int op,
25636- unsigned int *eax, unsigned int *ebx,
25637- unsigned int *ecx, unsigned int *edx)
25638-{
25639- *eax = op;
25640- *ecx = 0;
25641- __cpuid(eax, ebx, ecx, edx);
25642-}
25643-
25644-/* Some CPUID calls want 'count' to be placed in ecx */
25645-static inline void cpuid_count(unsigned int op, int count,
25646- unsigned int *eax, unsigned int *ebx,
25647- unsigned int *ecx, unsigned int *edx)
25648-{
25649- *eax = op;
25650- *ecx = count;
25651- __cpuid(eax, ebx, ecx, edx);
25652-}
25653-
25654-/*
25655- * CPUID functions returning a single datum
25656- */
25657-static inline unsigned int cpuid_eax(unsigned int op)
25658-{
25659- unsigned int eax, ebx, ecx, edx;
25660-
25661- cpuid(op, &eax, &ebx, &ecx, &edx);
25662- return eax;
25663-}
25664-static inline unsigned int cpuid_ebx(unsigned int op)
25665-{
25666- unsigned int eax, ebx, ecx, edx;
25667-
25668- cpuid(op, &eax, &ebx, &ecx, &edx);
25669- return ebx;
25670-}
25671-static inline unsigned int cpuid_ecx(unsigned int op)
25672-{
25673- unsigned int eax, ebx, ecx, edx;
25674-
25675- cpuid(op, &eax, &ebx, &ecx, &edx);
25676- return ecx;
25677-}
25678-static inline unsigned int cpuid_edx(unsigned int op)
25679-{
25680- unsigned int eax, ebx, ecx, edx;
25681-
25682- cpuid(op, &eax, &ebx, &ecx, &edx);
25683- return edx;
25684-}
25685-
25686-/* generic versions from gas */
25687-#define GENERIC_NOP1 ".byte 0x90\n"
25688-#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
25689-#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
25690-#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
25691-#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
25692-#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
25693-#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
25694-#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
25695-
25696-/* Opteron nops */
25697-#define K8_NOP1 GENERIC_NOP1
25698-#define K8_NOP2 ".byte 0x66,0x90\n"
25699-#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
25700-#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
25701-#define K8_NOP5 K8_NOP3 K8_NOP2
25702-#define K8_NOP6 K8_NOP3 K8_NOP3
25703-#define K8_NOP7 K8_NOP4 K8_NOP3
25704-#define K8_NOP8 K8_NOP4 K8_NOP4
25705-
25706-/* K7 nops */
25707-/* uses eax dependencies (arbitary choice) */
25708-#define K7_NOP1 GENERIC_NOP1
25709-#define K7_NOP2 ".byte 0x8b,0xc0\n"
25710-#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
25711-#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
25712-#define K7_NOP5 K7_NOP4 ASM_NOP1
25713-#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
25714-#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
25715-#define K7_NOP8 K7_NOP7 ASM_NOP1
25716-
25717-/* P6 nops */
25718-/* uses eax dependencies (Intel-recommended choice) */
25719-#define P6_NOP1 GENERIC_NOP1
25720-#define P6_NOP2 ".byte 0x66,0x90\n"
25721-#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
25722-#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
25723-#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
25724-#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
25725-#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
25726-#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
25727-
25728-#ifdef CONFIG_MK8
25729-#define ASM_NOP1 K8_NOP1
25730-#define ASM_NOP2 K8_NOP2
25731-#define ASM_NOP3 K8_NOP3
25732-#define ASM_NOP4 K8_NOP4
25733-#define ASM_NOP5 K8_NOP5
25734-#define ASM_NOP6 K8_NOP6
25735-#define ASM_NOP7 K8_NOP7
25736-#define ASM_NOP8 K8_NOP8
25737-#elif defined(CONFIG_MK7)
25738-#define ASM_NOP1 K7_NOP1
25739-#define ASM_NOP2 K7_NOP2
25740-#define ASM_NOP3 K7_NOP3
25741-#define ASM_NOP4 K7_NOP4
25742-#define ASM_NOP5 K7_NOP5
25743-#define ASM_NOP6 K7_NOP6
25744-#define ASM_NOP7 K7_NOP7
25745-#define ASM_NOP8 K7_NOP8
25746-#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
25747- defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
25748- defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
25749-#define ASM_NOP1 P6_NOP1
25750-#define ASM_NOP2 P6_NOP2
25751-#define ASM_NOP3 P6_NOP3
25752-#define ASM_NOP4 P6_NOP4
25753-#define ASM_NOP5 P6_NOP5
25754-#define ASM_NOP6 P6_NOP6
25755-#define ASM_NOP7 P6_NOP7
25756-#define ASM_NOP8 P6_NOP8
25757-#else
25758-#define ASM_NOP1 GENERIC_NOP1
25759-#define ASM_NOP2 GENERIC_NOP2
25760-#define ASM_NOP3 GENERIC_NOP3
25761-#define ASM_NOP4 GENERIC_NOP4
25762-#define ASM_NOP5 GENERIC_NOP5
25763-#define ASM_NOP6 GENERIC_NOP6
25764-#define ASM_NOP7 GENERIC_NOP7
25765-#define ASM_NOP8 GENERIC_NOP8
25766-#endif
25767-
25768-#define ASM_NOP_MAX 8
25769-
25770-/* Prefetch instructions for Pentium III and AMD Athlon */
25771-/* It's not worth to care about 3dnow! prefetches for the K6
25772- because they are microcoded there and very slow.
25773- However we don't do prefetches for pre XP Athlons currently
25774- That should be fixed. */
25775-#define ARCH_HAS_PREFETCH
25776-static inline void prefetch(const void *x)
25777-{
25778- alternative_input(ASM_NOP4,
25779- "prefetchnta (%1)",
25780- X86_FEATURE_XMM,
25781- "r" (x));
25782-}
25783-
25784-#define ARCH_HAS_PREFETCH
25785-#define ARCH_HAS_PREFETCHW
25786-#define ARCH_HAS_SPINLOCK_PREFETCH
25787-
25788-/* 3dnow! prefetch to get an exclusive cache line. Useful for
25789- spinlocks to avoid one state transition in the cache coherency protocol. */
25790-static inline void prefetchw(const void *x)
25791-{
25792- alternative_input(ASM_NOP4,
25793- "prefetchw (%1)",
25794- X86_FEATURE_3DNOW,
25795- "r" (x));
25796-}
25797-#define spin_lock_prefetch(x) prefetchw(x)
25798-
25799-extern void select_idle_routine(const struct cpuinfo_x86 *c);
25800-
25801-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
25802-
25803-extern unsigned long boot_option_idle_override;
25804-extern void enable_sep_cpu(void);
25805-extern int sysenter_setup(void);
25806-
25807-/* Defined in head.S */
25808-extern struct Xgt_desc_struct early_gdt_descr;
25809-
25810-extern void cpu_set_gdt(int);
25811-extern void switch_to_new_gdt(void);
25812-extern void cpu_init(void);
25813-extern void init_gdt(int cpu);
25814-
25815-extern int force_mwait;
25816-
25817-#endif /* __ASM_I386_PROCESSOR_H */
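The io_bitmap comments in the header deleted above (struct tss_struct and INIT_TSS) rest on one piece of arithmetic: 65536 ports need 8192 bytes of bitmap, and because the CPU reads one byte beyond the byte covering the last checked port, one extra all-ones long is appended as a sentinel so that read stays inside the TSS limit and denies access. A stand-alone sketch of that sizing follows, with sketch_* stand-ins for the IO_BITMAP_* macros; the range initializer is the same GNU extension INIT_TSS itself uses.

/* Sketch only: io_bitmap sizing plus the all-ones sentinel long. */
#include <stddef.h>
#include <stdio.h>

#define SKETCH_IO_BITMAP_BITS  65536
#define SKETCH_IO_BITMAP_BYTES (SKETCH_IO_BITMAP_BITS / 8)
#define SKETCH_IO_BITMAP_LONGS (SKETCH_IO_BITMAP_BYTES / sizeof(long))

/* One long more than the bitmap itself; the trailing ~0UL is what the
 * CPU sees when it reads "one byte beyond the end" (1 bits = denied). */
static unsigned long sketch_io_bitmap[SKETCH_IO_BITMAP_LONGS + 1] =
	{ [0 ... SKETCH_IO_BITMAP_LONGS] = ~0UL };

int main(void)
{
	printf("bytes=%d longs=%zu sentinel=%#lx\n",
	       SKETCH_IO_BITMAP_BYTES, SKETCH_IO_BITMAP_LONGS,
	       sketch_io_bitmap[SKETCH_IO_BITMAP_LONGS]);
	return 0;
}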
25818--- a/include/asm-x86/mach-xen/asm/processor_64.h
25819+++ /dev/null
25820@@ -1,461 +0,0 @@
25821-/*
25822- * include/asm-x86_64/processor.h
25823- *
25824- * Copyright (C) 1994 Linus Torvalds
25825- */
25826-
25827-#ifndef __ASM_X86_64_PROCESSOR_H
25828-#define __ASM_X86_64_PROCESSOR_H
25829-
25830-#include <asm/segment.h>
25831-#include <asm/page.h>
25832-#include <asm/types.h>
25833-#include <asm/sigcontext.h>
25834-#include <asm/cpufeature.h>
25835-#include <linux/threads.h>
25836-#include <asm/msr.h>
25837-#include <asm/current.h>
25838-#include <asm/system.h>
25839-#include <asm/mmsegment.h>
25840-#include <asm/percpu.h>
25841-#include <linux/personality.h>
25842-#include <linux/cpumask.h>
25843-#include <asm/processor-flags.h>
25844-
25845-#define TF_MASK 0x00000100
25846-#define IF_MASK 0x00000200
25847-#define IOPL_MASK 0x00003000
25848-#define NT_MASK 0x00004000
25849-#define VM_MASK 0x00020000
25850-#define AC_MASK 0x00040000
25851-#define VIF_MASK 0x00080000 /* virtual interrupt flag */
25852-#define VIP_MASK 0x00100000 /* virtual interrupt pending */
25853-#define ID_MASK 0x00200000
25854-
25855-#define desc_empty(desc) \
25856- (!((desc)->a | (desc)->b))
25857-
25858-#define desc_equal(desc1, desc2) \
25859- (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
25860-
25861-/*
25862- * Default implementation of macro that returns current
25863- * instruction pointer ("program counter").
25864- */
25865-#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
25866-
25867-/*
25868- * CPU type and hardware bug flags. Kept separately for each CPU.
25869- */
25870-
25871-struct cpuinfo_x86 {
25872- __u8 x86; /* CPU family */
25873- __u8 x86_vendor; /* CPU vendor */
25874- __u8 x86_model;
25875- __u8 x86_mask;
25876- int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
25877- __u32 x86_capability[NCAPINTS];
25878- char x86_vendor_id[16];
25879- char x86_model_id[64];
25880- int x86_cache_size; /* in KB */
25881- int x86_clflush_size;
25882- int x86_cache_alignment;
25883- int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/
25884- __u8 x86_virt_bits, x86_phys_bits;
25885- __u8 x86_max_cores; /* cpuid returned max cores value */
25886- __u32 x86_power;
25887- __u32 extended_cpuid_level; /* Max extended CPUID function supported */
25888- unsigned long loops_per_jiffy;
25889-#ifdef CONFIG_SMP
25890- cpumask_t llc_shared_map; /* cpus sharing the last level cache */
25891-#endif
25892- __u8 apicid;
25893-#ifdef CONFIG_SMP
25894- __u8 booted_cores; /* number of cores as seen by OS */
25895- __u8 phys_proc_id; /* Physical Processor id. */
25896- __u8 cpu_core_id; /* Core id. */
25897- __u8 cpu_index; /* index into per_cpu list */
25898-#endif
25899-} ____cacheline_aligned;
25900-
25901-#define X86_VENDOR_INTEL 0
25902-#define X86_VENDOR_CYRIX 1
25903-#define X86_VENDOR_AMD 2
25904-#define X86_VENDOR_UMC 3
25905-#define X86_VENDOR_NEXGEN 4
25906-#define X86_VENDOR_CENTAUR 5
25907-#define X86_VENDOR_TRANSMETA 7
25908-#define X86_VENDOR_NUM 8
25909-#define X86_VENDOR_UNKNOWN 0xff
25910-
25911-#ifdef CONFIG_SMP
25912-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25913-#define cpu_data(cpu) per_cpu(cpu_info, cpu)
25914-#define current_cpu_data cpu_data(smp_processor_id())
25915-#else
25916-#define cpu_data(cpu) boot_cpu_data
25917-#define current_cpu_data boot_cpu_data
25918-#endif
25919-
25920-extern char ignore_irq13;
25921-
25922-extern void identify_cpu(struct cpuinfo_x86 *);
25923-extern void print_cpu_info(struct cpuinfo_x86 *);
25924-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25925-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25926-extern unsigned short num_cache_leaves;
25927-
25928-/*
25929- * Save the cr4 feature set we're using (ie
25930- * Pentium 4MB enable and PPro Global page
25931- * enable), so that any CPU's that boot up
25932- * after us can get the correct flags.
25933- */
25934-extern unsigned long mmu_cr4_features;
25935-
25936-static inline void set_in_cr4 (unsigned long mask)
25937-{
25938- mmu_cr4_features |= mask;
25939- __asm__("movq %%cr4,%%rax\n\t"
25940- "orq %0,%%rax\n\t"
25941- "movq %%rax,%%cr4\n"
25942- : : "irg" (mask)
25943- :"ax");
25944-}
25945-
25946-static inline void clear_in_cr4 (unsigned long mask)
25947-{
25948- mmu_cr4_features &= ~mask;
25949- __asm__("movq %%cr4,%%rax\n\t"
25950- "andq %0,%%rax\n\t"
25951- "movq %%rax,%%cr4\n"
25952- : : "irg" (~mask)
25953- :"ax");
25954-}
25955-
25956-
25957-/*
25958- * User space process size. 47bits minus one guard page.
25959- */
25960-#define TASK_SIZE64 (0x800000000000UL - 4096)
25961-
25962-/* This decides where the kernel will search for a free chunk of vm
25963- * space during mmap's.
25964- */
25965-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
25966-
25967-#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
25968-#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
25969-
25970-#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
25971-
25972-/*
25973- * Size of io_bitmap.
25974- */
25975-#define IO_BITMAP_BITS 65536
25976-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
25977-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
25978-#ifndef CONFIG_X86_NO_TSS
25979-#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
25980-#endif
25981-#define INVALID_IO_BITMAP_OFFSET 0x8000
25982-
25983-struct i387_fxsave_struct {
25984- u16 cwd;
25985- u16 swd;
25986- u16 twd;
25987- u16 fop;
25988- u64 rip;
25989- u64 rdp;
25990- u32 mxcsr;
25991- u32 mxcsr_mask;
25992- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
25993- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
25994- u32 padding[24];
25995-} __attribute__ ((aligned (16)));
25996-
25997-union i387_union {
25998- struct i387_fxsave_struct fxsave;
25999-};
26000-
26001-#ifndef CONFIG_X86_NO_TSS
26002-struct tss_struct {
26003- u32 reserved1;
26004- u64 rsp0;
26005- u64 rsp1;
26006- u64 rsp2;
26007- u64 reserved2;
26008- u64 ist[7];
26009- u32 reserved3;
26010- u32 reserved4;
26011- u16 reserved5;
26012- u16 io_bitmap_base;
26013- /*
26014- * The extra 1 is there because the CPU will access an
26015- * additional byte beyond the end of the IO permission
26016- * bitmap. The extra byte must be all 1 bits, and must
26017- * be within the limit. Thus we have:
26018- *
26019- * 128 bytes, the bitmap itself, for ports 0..0x3ff
26020- * 8 bytes, for an extra "long" of ~0UL
26021- */
26022- unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
26023-} __attribute__((packed)) ____cacheline_aligned;
26024-
26025-DECLARE_PER_CPU(struct tss_struct,init_tss);
26026-#endif
26027-
26028-
26029-extern struct cpuinfo_x86 boot_cpu_data;
26030-#ifndef CONFIG_X86_NO_TSS
26031-/* Save the original ist values for checking stack pointers during debugging */
26032-struct orig_ist {
26033- unsigned long ist[7];
26034-};
26035-DECLARE_PER_CPU(struct orig_ist, orig_ist);
26036-#endif
26037-
26038-#ifdef CONFIG_X86_VSMP
26039-#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
26040-#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
26041-#else
26042-#define ARCH_MIN_TASKALIGN 16
26043-#define ARCH_MIN_MMSTRUCT_ALIGN 0
26044-#endif
26045-
26046-struct thread_struct {
26047- unsigned long rsp0;
26048- unsigned long rsp;
26049- unsigned long userrsp; /* Copy from PDA */
26050- unsigned long fs;
26051- unsigned long gs;
26052- unsigned short es, ds, fsindex, gsindex;
26053-/* Hardware debugging registers */
26054- unsigned long debugreg0;
26055- unsigned long debugreg1;
26056- unsigned long debugreg2;
26057- unsigned long debugreg3;
26058- unsigned long debugreg6;
26059- unsigned long debugreg7;
26060-/* fault info */
26061- unsigned long cr2, trap_no, error_code;
26062-/* floating point info */
26063- union i387_union i387 __attribute__((aligned(16)));
26064-/* IO permissions. the bitmap could be moved into the GDT, that would make
26065- switch faster for a limited number of ioperm using tasks. -AK */
26066- int ioperm;
26067- unsigned long *io_bitmap_ptr;
26068- unsigned io_bitmap_max;
26069-/* cached TLS descriptors. */
26070- u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
26071- unsigned int iopl;
26072-} __attribute__((aligned(16)));
26073-
26074-#define INIT_THREAD { \
26075- .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26076-}
26077-
26078-#ifndef CONFIG_X86_NO_TSS
26079-#define INIT_TSS { \
26080- .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26081-}
26082-#endif
26083-
26084-#define INIT_MMAP \
26085-{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
26086-
26087-#define start_thread(regs,new_rip,new_rsp) do { \
26088- asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
26089- load_gs_index(0); \
26090- (regs)->rip = (new_rip); \
26091- (regs)->rsp = (new_rsp); \
26092- write_pda(oldrsp, (new_rsp)); \
26093- (regs)->cs = __USER_CS; \
26094- (regs)->ss = __USER_DS; \
26095- (regs)->eflags = 0x200; \
26096- set_fs(USER_DS); \
26097-} while(0)
26098-
26099-#define get_debugreg(var, register) \
26100- var = HYPERVISOR_get_debugreg(register)
26101-#define set_debugreg(value, register) do { \
26102- if (HYPERVISOR_set_debugreg(register, value)) \
26103- BUG(); \
26104-} while (0)
26105-
26106-struct task_struct;
26107-struct mm_struct;
26108-
26109-/* Free all resources held by a thread. */
26110-extern void release_thread(struct task_struct *);
26111-
26112-/* Prepare to copy thread state - unlazy all lazy status */
26113-extern void prepare_to_copy(struct task_struct *tsk);
26114-
26115-/*
26116- * create a kernel thread without removing it from tasklists
26117- */
26118-extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
26119-
26120-/*
26121- * Return saved PC of a blocked thread.
26122- * What is this good for? it will be always the scheduler or ret_from_fork.
26123- */
26124-#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
26125-
26126-extern unsigned long get_wchan(struct task_struct *p);
26127-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
26128-#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
26129-#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
26130-
26131-
26132-struct microcode_header {
26133- unsigned int hdrver;
26134- unsigned int rev;
26135- unsigned int date;
26136- unsigned int sig;
26137- unsigned int cksum;
26138- unsigned int ldrver;
26139- unsigned int pf;
26140- unsigned int datasize;
26141- unsigned int totalsize;
26142- unsigned int reserved[3];
26143-};
26144-
26145-struct microcode {
26146- struct microcode_header hdr;
26147- unsigned int bits[0];
26148-};
26149-
26150-typedef struct microcode microcode_t;
26151-typedef struct microcode_header microcode_header_t;
26152-
26153-/* microcode format is extended from prescott processors */
26154-struct extended_signature {
26155- unsigned int sig;
26156- unsigned int pf;
26157- unsigned int cksum;
26158-};
26159-
26160-struct extended_sigtable {
26161- unsigned int count;
26162- unsigned int cksum;
26163- unsigned int reserved[3];
26164- struct extended_signature sigs[0];
26165-};
26166-
26167-
26168-#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2)
26169-#define ASM_NOP1 P6_NOP1
26170-#define ASM_NOP2 P6_NOP2
26171-#define ASM_NOP3 P6_NOP3
26172-#define ASM_NOP4 P6_NOP4
26173-#define ASM_NOP5 P6_NOP5
26174-#define ASM_NOP6 P6_NOP6
26175-#define ASM_NOP7 P6_NOP7
26176-#define ASM_NOP8 P6_NOP8
26177-#else
26178-#define ASM_NOP1 K8_NOP1
26179-#define ASM_NOP2 K8_NOP2
26180-#define ASM_NOP3 K8_NOP3
26181-#define ASM_NOP4 K8_NOP4
26182-#define ASM_NOP5 K8_NOP5
26183-#define ASM_NOP6 K8_NOP6
26184-#define ASM_NOP7 K8_NOP7
26185-#define ASM_NOP8 K8_NOP8
26186-#endif
26187-
26188-/* Opteron nops */
26189-#define K8_NOP1 ".byte 0x90\n"
26190-#define K8_NOP2 ".byte 0x66,0x90\n"
26191-#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
26192-#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
26193-#define K8_NOP5 K8_NOP3 K8_NOP2
26194-#define K8_NOP6 K8_NOP3 K8_NOP3
26195-#define K8_NOP7 K8_NOP4 K8_NOP3
26196-#define K8_NOP8 K8_NOP4 K8_NOP4
26197-
26198-/* P6 nops */
26199-/* uses eax dependencies (Intel-recommended choice) */
26200-#define P6_NOP1 ".byte 0x90\n"
26201-#define P6_NOP2 ".byte 0x66,0x90\n"
26202-#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
26203-#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
26204-#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
26205-#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
26206-#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
26207-#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
26208-
26209-#define ASM_NOP_MAX 8
26210-
26211-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26212-static inline void rep_nop(void)
26213-{
26214- __asm__ __volatile__("rep;nop": : :"memory");
26215-}
26216-
26217-/* Stop speculative execution */
26218-static inline void sync_core(void)
26219-{
26220- int tmp;
26221- asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
26222-}
26223-
26224-#define ARCH_HAS_PREFETCHW 1
26225-static inline void prefetchw(void *x)
26226-{
26227- alternative_input("prefetcht0 (%1)",
26228- "prefetchw (%1)",
26229- X86_FEATURE_3DNOW,
26230- "r" (x));
26231-}
26232-
26233-#define ARCH_HAS_SPINLOCK_PREFETCH 1
26234-
26235-#define spin_lock_prefetch(x) prefetchw(x)
26236-
26237-#define cpu_relax() rep_nop()
26238-
26239-static inline void __monitor(const void *eax, unsigned long ecx,
26240- unsigned long edx)
26241-{
26242- /* "monitor %eax,%ecx,%edx;" */
26243- asm volatile(
26244- ".byte 0x0f,0x01,0xc8;"
26245- : :"a" (eax), "c" (ecx), "d"(edx));
26246-}
26247-
26248-static inline void __mwait(unsigned long eax, unsigned long ecx)
26249-{
26250- /* "mwait %eax,%ecx;" */
26251- asm volatile(
26252- ".byte 0x0f,0x01,0xc9;"
26253- : :"a" (eax), "c" (ecx));
26254-}
26255-
26256-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
26257-{
26258- /* "mwait %eax,%ecx;" */
26259- asm volatile(
26260- "sti; .byte 0x0f,0x01,0xc9;"
26261- : :"a" (eax), "c" (ecx));
26262-}
26263-
26264-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
26265-
26266-#define stack_current() \
26267-({ \
26268- struct thread_info *ti; \
26269- asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
26270- ti->task; \
26271-})
26272-
26273-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26274-
26275-extern unsigned long boot_option_idle_override;
26276-/* Boot loader type from the setup header */
26277-extern int bootloader_type;
26278-
26279-#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
26280-
26281-#endif /* __ASM_X86_64_PROCESSOR_H */
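The cpuid_eax()/cpuid_ebx()/cpuid_ecx()/cpuid_edx() helpers deleted here (and re-added in the unified processor.h that follows) all share one pattern: seed EAX with the leaf, clear ECX, execute CPUID, return a single register. A stand-alone approximation is sketched below; it uses the __get_cpuid() helper from GCC/Clang's <cpuid.h> instead of the kernel's __cpuid/xen_cpuid wrappers, so it runs in user space on any x86 build.

/* Sketch only: the "one leaf in, one register out" CPUID pattern. */
#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
	char vendor[13];

	if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx)) {
		fprintf(stderr, "CPUID leaf 0 not supported\n");
		return 1;
	}

	/* Leaf 0: max basic leaf in EAX, vendor string in EBX, EDX, ECX. */
	memcpy(vendor + 0, &ebx, 4);
	memcpy(vendor + 4, &edx, 4);
	memcpy(vendor + 8, &ecx, 4);
	vendor[12] = '\0';

	printf("max basic leaf: %u, vendor: %s\n", eax, vendor);
	return 0;
}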
26282--- a/include/asm-x86/mach-xen/asm/processor.h
26283+++ b/include/asm-x86/mach-xen/asm/processor.h
26284@@ -1,5 +1,793 @@
26285+#ifndef __ASM_X86_PROCESSOR_H
26286+#define __ASM_X86_PROCESSOR_H
26287+
26288+#include <asm/processor-flags.h>
26289+
26290+/* migration helpers, for KVM - will be removed in 2.6.25: */
26291+#include <asm/vm86.h>
26292+#define Xgt_desc_struct desc_ptr
26293+
26294+/* Forward declaration, a strange C thing */
26295+struct task_struct;
26296+struct mm_struct;
26297+
26298+#include <asm/vm86.h>
26299+#include <asm/math_emu.h>
26300+#include <asm/segment.h>
26301+#include <asm/types.h>
26302+#include <asm/sigcontext.h>
26303+#include <asm/current.h>
26304+#include <asm/cpufeature.h>
26305+#include <asm/system.h>
26306+#include <asm/page.h>
26307+#include <asm/percpu.h>
26308+#include <asm/msr.h>
26309+#include <asm/desc_defs.h>
26310+#include <asm/nops.h>
26311+#include <linux/personality.h>
26312+#include <linux/cpumask.h>
26313+#include <linux/cache.h>
26314+#include <linux/threads.h>
26315+#include <linux/init.h>
26316+#include <xen/interface/physdev.h>
26317+
26318+/*
26319+ * Default implementation of macro that returns current
26320+ * instruction pointer ("program counter").
26321+ */
26322+static inline void *current_text_addr(void)
26323+{
26324+ void *pc;
26325+ asm volatile("mov $1f,%0\n1:":"=r" (pc));
26326+ return pc;
26327+}
26328+
26329+#ifdef CONFIG_X86_VSMP
26330+#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
26331+#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
26332+#else
26333+#define ARCH_MIN_TASKALIGN 16
26334+#define ARCH_MIN_MMSTRUCT_ALIGN 0
26335+#endif
26336+
26337+/*
26338+ * CPU type and hardware bug flags. Kept separately for each CPU.
26339+ * Members of this structure are referenced in head.S, so think twice
26340+ * before touching them. [mj]
26341+ */
26342+
26343+struct cpuinfo_x86 {
26344+ __u8 x86; /* CPU family */
26345+ __u8 x86_vendor; /* CPU vendor */
26346+ __u8 x86_model;
26347+ __u8 x86_mask;
26348+#ifdef CONFIG_X86_32
26349+ char wp_works_ok; /* It doesn't on 386's */
26350+ char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
26351+ char hard_math;
26352+ char rfu;
26353+ char fdiv_bug;
26354+ char f00f_bug;
26355+ char coma_bug;
26356+ char pad0;
26357+#else
26358+ /* number of 4K pages in DTLB/ITLB combined(in pages)*/
26359+ int x86_tlbsize;
26360+ __u8 x86_virt_bits, x86_phys_bits;
26361+ /* cpuid returned core id bits */
26362+ __u8 x86_coreid_bits;
26363+ /* Max extended CPUID function supported */
26364+ __u32 extended_cpuid_level;
26365+#endif
26366+ int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
26367+ __u32 x86_capability[NCAPINTS];
26368+ char x86_vendor_id[16];
26369+ char x86_model_id[64];
26370+ int x86_cache_size; /* in KB - valid for CPUS which support this
26371+ call */
26372+ int x86_cache_alignment; /* In bytes */
26373+ int x86_power;
26374+ unsigned long loops_per_jiffy;
26375+#ifdef CONFIG_SMP
26376+ cpumask_t llc_shared_map; /* cpus sharing the last level cache */
26377+#endif
26378+ u16 x86_max_cores; /* cpuid returned max cores value */
26379+ u16 apicid;
26380+ u16 x86_clflush_size;
26381+#ifdef CONFIG_SMP
26382+ u16 booted_cores; /* number of cores as seen by OS */
26383+ u16 phys_proc_id; /* Physical processor id. */
26384+ u16 cpu_core_id; /* Core id */
26385+ u16 cpu_index; /* index into per_cpu list */
26386+#endif
26387+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
26388+
26389+#define X86_VENDOR_INTEL 0
26390+#define X86_VENDOR_CYRIX 1
26391+#define X86_VENDOR_AMD 2
26392+#define X86_VENDOR_UMC 3
26393+#define X86_VENDOR_NEXGEN 4
26394+#define X86_VENDOR_CENTAUR 5
26395+#define X86_VENDOR_TRANSMETA 7
26396+#define X86_VENDOR_NSC 8
26397+#define X86_VENDOR_NUM 9
26398+#define X86_VENDOR_UNKNOWN 0xff
26399+
26400+/*
26401+ * capabilities of CPUs
26402+ */
26403+extern struct cpuinfo_x86 boot_cpu_data;
26404+extern struct cpuinfo_x86 new_cpu_data;
26405+extern __u32 cleared_cpu_caps[NCAPINTS];
26406+
26407+#ifdef CONFIG_SMP
26408+DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
26409+#define cpu_data(cpu) per_cpu(cpu_info, cpu)
26410+#define current_cpu_data cpu_data(smp_processor_id())
26411+#else
26412+#define cpu_data(cpu) boot_cpu_data
26413+#define current_cpu_data boot_cpu_data
26414+#endif
26415+
26416+void cpu_detect(struct cpuinfo_x86 *c);
26417+
26418+extern void identify_cpu(struct cpuinfo_x86 *);
26419+extern void identify_boot_cpu(void);
26420+extern void identify_secondary_cpu(struct cpuinfo_x86 *);
26421+extern void print_cpu_info(struct cpuinfo_x86 *);
26422+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
26423+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
26424+extern unsigned short num_cache_leaves;
26425+
26426+#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
26427+extern void detect_ht(struct cpuinfo_x86 *c);
26428+#else
26429+static inline void detect_ht(struct cpuinfo_x86 *c) {}
26430+#endif
26431+
26432+static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
26433+ unsigned int *ecx, unsigned int *edx)
26434+{
26435+ /* ecx is often an input as well as an output. */
26436+ __asm__(XEN_CPUID
26437+ : "=a" (*eax),
26438+ "=b" (*ebx),
26439+ "=c" (*ecx),
26440+ "=d" (*edx)
26441+ : "0" (*eax), "2" (*ecx));
26442+}
26443+
26444+static inline void load_cr3(pgd_t *pgdir)
26445+{
26446+ write_cr3(__pa(pgdir));
26447+}
26448+
26449+#ifndef CONFIG_X86_NO_TSS
26450+#ifdef CONFIG_X86_32
26451+/* This is the TSS defined by the hardware. */
26452+struct x86_hw_tss {
26453+ unsigned short back_link, __blh;
26454+ unsigned long sp0;
26455+ unsigned short ss0, __ss0h;
26456+ unsigned long sp1;
26457+ unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
26458+ unsigned long sp2;
26459+ unsigned short ss2, __ss2h;
26460+ unsigned long __cr3;
26461+ unsigned long ip;
26462+ unsigned long flags;
26463+ unsigned long ax, cx, dx, bx;
26464+ unsigned long sp, bp, si, di;
26465+ unsigned short es, __esh;
26466+ unsigned short cs, __csh;
26467+ unsigned short ss, __ssh;
26468+ unsigned short ds, __dsh;
26469+ unsigned short fs, __fsh;
26470+ unsigned short gs, __gsh;
26471+ unsigned short ldt, __ldth;
26472+ unsigned short trace, io_bitmap_base;
26473+} __attribute__((packed));
26474+extern struct tss_struct doublefault_tss;
26475+#else
26476+struct x86_hw_tss {
26477+ u32 reserved1;
26478+ u64 sp0;
26479+ u64 sp1;
26480+ u64 sp2;
26481+ u64 reserved2;
26482+ u64 ist[7];
26483+ u32 reserved3;
26484+ u32 reserved4;
26485+ u16 reserved5;
26486+ u16 io_bitmap_base;
26487+} __attribute__((packed)) ____cacheline_aligned;
26488+#endif
26489+#endif /* CONFIG_X86_NO_TSS */
26490+
26491+/*
26492+ * Size of io_bitmap.
26493+ */
26494+#define IO_BITMAP_BITS 65536
26495+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
26496+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
26497+#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
26498+#define INVALID_IO_BITMAP_OFFSET 0x8000
26499+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
26500+
26501+#ifndef CONFIG_X86_NO_TSS
26502+struct tss_struct {
26503+ struct x86_hw_tss x86_tss;
26504+
26505+ /*
26506+ * The extra 1 is there because the CPU will access an
26507+ * additional byte beyond the end of the IO permission
26508+ * bitmap. The extra byte must be all 1 bits, and must
26509+ * be within the limit.
26510+ */
26511+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
26512+ /*
26513+ * Cache the current maximum and the last task that used the bitmap:
26514+ */
26515+ unsigned long io_bitmap_max;
26516+ struct thread_struct *io_bitmap_owner;
26517+ /*
26518+ * pads the TSS to be cacheline-aligned (size is 0x100)
26519+ */
26520+ unsigned long __cacheline_filler[35];
26521+ /*
26522+ * .. and then another 0x100 bytes for emergency kernel stack
26523+ */
26524+ unsigned long stack[64];
26525+} __attribute__((packed));
26526+
26527+DECLARE_PER_CPU(struct tss_struct, init_tss);
26528+
26529+/* Save the original ist values for checking stack pointers during debugging */
26530+struct orig_ist {
26531+ unsigned long ist[7];
26532+};
26533+#endif /* CONFIG_X86_NO_TSS */
26534+
26535+#define MXCSR_DEFAULT 0x1f80
26536+
26537+struct i387_fsave_struct {
26538+ u32 cwd;
26539+ u32 swd;
26540+ u32 twd;
26541+ u32 fip;
26542+ u32 fcs;
26543+ u32 foo;
26544+ u32 fos;
26545+ u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
26546+ u32 status; /* software status information */
26547+};
26548+
26549+struct i387_fxsave_struct {
26550+ u16 cwd;
26551+ u16 swd;
26552+ u16 twd;
26553+ u16 fop;
26554+ union {
26555+ struct {
26556+ u64 rip;
26557+ u64 rdp;
26558+ };
26559+ struct {
26560+ u32 fip;
26561+ u32 fcs;
26562+ u32 foo;
26563+ u32 fos;
26564+ };
26565+ };
26566+ u32 mxcsr;
26567+ u32 mxcsr_mask;
26568+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
26569+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
26570+ u32 padding[24];
26571+} __attribute__((aligned(16)));
26572+
26573+struct i387_soft_struct {
26574+ u32 cwd;
26575+ u32 swd;
26576+ u32 twd;
26577+ u32 fip;
26578+ u32 fcs;
26579+ u32 foo;
26580+ u32 fos;
26581+ u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
26582+ u8 ftop, changed, lookahead, no_update, rm, alimit;
26583+ struct info *info;
26584+ u32 entry_eip;
26585+};
26586+
26587+union i387_union {
26588+ struct i387_fsave_struct fsave;
26589+ struct i387_fxsave_struct fxsave;
26590+ struct i387_soft_struct soft;
26591+};
26592+
26593+#ifdef CONFIG_X86_32
26594+DECLARE_PER_CPU(u8, cpu_llc_id);
26595+#elif !defined(CONFIG_X86_NO_TSS)
26596+DECLARE_PER_CPU(struct orig_ist, orig_ist);
26597+#endif
26598+
26599+extern void print_cpu_info(struct cpuinfo_x86 *);
26600+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
26601+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
26602+extern unsigned short num_cache_leaves;
26603+
26604+struct thread_struct {
26605+/* cached TLS descriptors. */
26606+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
26607+ unsigned long sp0;
26608+ unsigned long sp;
26609+#ifdef CONFIG_X86_32
26610+ unsigned long sysenter_cs;
26611+#else
26612+ unsigned long usersp; /* Copy from PDA */
26613+ unsigned short es, ds, fsindex, gsindex;
26614+#endif
26615+ unsigned long ip;
26616+ unsigned long fs;
26617+ unsigned long gs;
26618+/* Hardware debugging registers */
26619+ unsigned long debugreg0;
26620+ unsigned long debugreg1;
26621+ unsigned long debugreg2;
26622+ unsigned long debugreg3;
26623+ unsigned long debugreg6;
26624+ unsigned long debugreg7;
26625+/* fault info */
26626+ unsigned long cr2, trap_no, error_code;
26627+/* floating point info */
26628+ union i387_union i387 __attribute__((aligned(16)));;
26629+#ifdef CONFIG_X86_32
26630+/* virtual 86 mode info */
26631+ struct vm86_struct __user *vm86_info;
26632+ unsigned long screen_bitmap;
26633+ unsigned long v86flags, v86mask, saved_sp0;
26634+ unsigned int saved_fs, saved_gs;
26635+#endif
26636+/* IO permissions */
26637+ unsigned long *io_bitmap_ptr;
26638+ unsigned long iopl;
26639+/* max allowed port in the bitmap, in bytes: */
26640+ unsigned io_bitmap_max;
26641+/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
26642+ unsigned long debugctlmsr;
26643+/* Debug Store - if not 0 points to a DS Save Area configuration;
26644+ * goes into MSR_IA32_DS_AREA */
26645+ unsigned long ds_area_msr;
26646+};
26647+
26648+static inline unsigned long xen_get_debugreg(int regno)
26649+{
26650+ return HYPERVISOR_get_debugreg(regno);
26651+}
26652+
26653+static inline void xen_set_debugreg(int regno, unsigned long value)
26654+{
26655+ WARN_ON(HYPERVISOR_set_debugreg(regno, value));
26656+}
26657+
26658+/*
26659+ * Set IOPL bits in EFLAGS from given mask
26660+ */
26661+static inline void xen_set_iopl_mask(unsigned mask)
26662+{
26663+ struct physdev_set_iopl set_iopl;
26664+
26665+ /* Force the change at ring 0. */
26666+ set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
26667+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
26668+}
26669+
26670+#ifndef CONFIG_X86_NO_TSS
26671+static inline void native_load_sp0(struct tss_struct *tss,
26672+ struct thread_struct *thread)
26673+{
26674+ tss->x86_tss.sp0 = thread->sp0;
26675+#ifdef CONFIG_X86_32
26676+ /* Only happens when SEP is enabled, no need to test "SEP"arately */
26677+ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
26678+ tss->x86_tss.ss1 = thread->sysenter_cs;
26679+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
26680+ }
26681+#endif
26682+}
26683+#else
26684+#define xen_load_sp0(tss, thread) do { \
26685+ if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
26686+ BUG(); \
26687+} while (0)
26688+#endif
26689+
26690+#define __cpuid xen_cpuid
26691+#define paravirt_enabled() 0
26692+
26693+/*
26694+ * These special macros can be used to get or set a debugging register
26695+ */
26696+#define get_debugreg(var, register) \
26697+ (var) = xen_get_debugreg(register)
26698+#define set_debugreg(value, register) \
26699+ xen_set_debugreg(register, value)
26700+
26701+#define load_sp0 xen_load_sp0
26702+
26703+#define set_iopl_mask xen_set_iopl_mask
26704+
26705+/*
26706+ * Save the cr4 feature set we're using (ie
26707+ * Pentium 4MB enable and PPro Global page
26708+ * enable), so that any CPU's that boot up
26709+ * after us can get the correct flags.
26710+ */
26711+extern unsigned long mmu_cr4_features;
26712+
26713+static inline void set_in_cr4(unsigned long mask)
26714+{
26715+ unsigned cr4;
26716+ mmu_cr4_features |= mask;
26717+ cr4 = read_cr4();
26718+ cr4 |= mask;
26719+ write_cr4(cr4);
26720+}
26721+
26722+static inline void clear_in_cr4(unsigned long mask)
26723+{
26724+ unsigned cr4;
26725+ mmu_cr4_features &= ~mask;
26726+ cr4 = read_cr4();
26727+ cr4 &= ~mask;
26728+ write_cr4(cr4);
26729+}
26730+
26731+struct microcode_header {
26732+ unsigned int hdrver;
26733+ unsigned int rev;
26734+ unsigned int date;
26735+ unsigned int sig;
26736+ unsigned int cksum;
26737+ unsigned int ldrver;
26738+ unsigned int pf;
26739+ unsigned int datasize;
26740+ unsigned int totalsize;
26741+ unsigned int reserved[3];
26742+};
26743+
26744+struct microcode {
26745+ struct microcode_header hdr;
26746+ unsigned int bits[0];
26747+};
26748+
26749+typedef struct microcode microcode_t;
26750+typedef struct microcode_header microcode_header_t;
26751+
26752+/* microcode format is extended from prescott processors */
26753+struct extended_signature {
26754+ unsigned int sig;
26755+ unsigned int pf;
26756+ unsigned int cksum;
26757+};
26758+
26759+struct extended_sigtable {
26760+ unsigned int count;
26761+ unsigned int cksum;
26762+ unsigned int reserved[3];
26763+ struct extended_signature sigs[0];
26764+};
26765+
26766+typedef struct {
26767+ unsigned long seg;
26768+} mm_segment_t;
26769+
26770+
26771+/*
26772+ * create a kernel thread without removing it from tasklists
26773+ */
26774+extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
26775+
26776+/* Free all resources held by a thread. */
26777+extern void release_thread(struct task_struct *);
26778+
26779+/* Prepare to copy thread state - unlazy all lazy status */
26780+extern void prepare_to_copy(struct task_struct *tsk);
26781+
26782+unsigned long get_wchan(struct task_struct *p);
26783+
26784+/*
26785+ * Generic CPUID function
26786+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
26787+ * resulting in stale register contents being returned.
26788+ */
26789+static inline void cpuid(unsigned int op,
26790+ unsigned int *eax, unsigned int *ebx,
26791+ unsigned int *ecx, unsigned int *edx)
26792+{
26793+ *eax = op;
26794+ *ecx = 0;
26795+ __cpuid(eax, ebx, ecx, edx);
26796+}
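[Editorial aside, not from the patch: CPUID leaf 0 returns the vendor string in EBX, EDX, ECX, which makes a compact illustration of the helper above; the snippet assumes memcpy from <string.h> and the cpuid() helper just defined.]

/* Hypothetical caller of the cpuid() helper defined above. */
char vendor[13];
unsigned int eax, ebx, ecx, edx;

cpuid(0, &eax, &ebx, &ecx, &edx);
memcpy(vendor + 0, &ebx, 4);	/* "Genu" */
memcpy(vendor + 4, &edx, 4);	/* "ineI" */
memcpy(vendor + 8, &ecx, 4);	/* "ntel" (or the AMD equivalent) */
vendor[12] = '\0';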
26797+
26798+/* Some CPUID calls want 'count' to be placed in ecx */
26799+static inline void cpuid_count(unsigned int op, int count,
26800+ unsigned int *eax, unsigned int *ebx,
26801+ unsigned int *ecx, unsigned int *edx)
26802+{
26803+ *eax = op;
26804+ *ecx = count;
26805+ __cpuid(eax, ebx, ecx, edx);
26806+}
26807+
26808+/*
26809+ * CPUID functions returning a single datum
26810+ */
26811+static inline unsigned int cpuid_eax(unsigned int op)
26812+{
26813+ unsigned int eax, ebx, ecx, edx;
26814+
26815+ cpuid(op, &eax, &ebx, &ecx, &edx);
26816+ return eax;
26817+}
26818+static inline unsigned int cpuid_ebx(unsigned int op)
26819+{
26820+ unsigned int eax, ebx, ecx, edx;
26821+
26822+ cpuid(op, &eax, &ebx, &ecx, &edx);
26823+ return ebx;
26824+}
26825+static inline unsigned int cpuid_ecx(unsigned int op)
26826+{
26827+ unsigned int eax, ebx, ecx, edx;
26828+
26829+ cpuid(op, &eax, &ebx, &ecx, &edx);
26830+ return ecx;
26831+}
26832+static inline unsigned int cpuid_edx(unsigned int op)
26833+{
26834+ unsigned int eax, ebx, ecx, edx;
26835+
26836+ cpuid(op, &eax, &ebx, &ecx, &edx);
26837+ return edx;
26838+}
26839+
26840+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26841+static inline void rep_nop(void)
26842+{
26843+ __asm__ __volatile__("rep;nop": : :"memory");
26844+}
26845+
26846+/* Stop speculative execution */
26847+static inline void sync_core(void)
26848+{
26849+ int tmp;
26850+ asm volatile("cpuid" : "=a" (tmp) : "0" (1)
26851+ : "ebx", "ecx", "edx", "memory");
26852+}
26853+
26854+#define cpu_relax() rep_nop()
26855+
26856+static inline void __monitor(const void *eax, unsigned long ecx,
26857+ unsigned long edx)
26858+{
26859+ /* "monitor %eax,%ecx,%edx;" */
26860+ asm volatile(
26861+ ".byte 0x0f,0x01,0xc8;"
26862+ : :"a" (eax), "c" (ecx), "d"(edx));
26863+}
26864+
26865+static inline void __mwait(unsigned long eax, unsigned long ecx)
26866+{
26867+ /* "mwait %eax,%ecx;" */
26868+ asm volatile(
26869+ ".byte 0x0f,0x01,0xc9;"
26870+ : :"a" (eax), "c" (ecx));
26871+}
26872+
26873+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
26874+{
26875+ /* "mwait %eax,%ecx;" */
26876+ asm volatile(
26877+ "sti; .byte 0x0f,0x01,0xc9;"
26878+ : :"a" (eax), "c" (ecx));
26879+}
26880+
26881+extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
26882+
26883+extern int force_mwait;
26884+
26885+extern void select_idle_routine(const struct cpuinfo_x86 *c);
26886+
26887+extern unsigned long boot_option_idle_override;
26888+
26889+extern void enable_sep_cpu(void);
26890+extern int sysenter_setup(void);
26891+
26892+/* Defined in head.S */
26893+extern struct desc_ptr early_gdt_descr;
26894+
26895+extern void cpu_set_gdt(int);
26896+extern void switch_to_new_gdt(void);
26897+extern void cpu_init(void);
26898+extern void init_gdt(int cpu);
26899+
26900+/* from system description table in BIOS. Mostly for MCA use, but
26901+ * others may find it useful. */
26902+extern unsigned int machine_id;
26903+extern unsigned int machine_submodel_id;
26904+extern unsigned int BIOS_revision;
26905+
26906+/* Boot loader type from the setup header */
26907+extern int bootloader_type;
26908+
26909+extern char ignore_fpu_irq;
26910+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26911+
26912+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
26913+#define ARCH_HAS_PREFETCHW
26914+#define ARCH_HAS_SPINLOCK_PREFETCH
26915+
26916+#ifdef CONFIG_X86_32
26917+#define BASE_PREFETCH ASM_NOP4
26918+#define ARCH_HAS_PREFETCH
26919+#else
26920+#define BASE_PREFETCH "prefetcht0 (%1)"
26921+#endif
26922+
26923+/* Prefetch instructions for Pentium III and AMD Athlon */
26924+/* It's not worth caring about 3dnow! prefetches for the K6
26925+ because they are microcoded there and very slow.
26926+ However, we don't currently do prefetches for pre-XP Athlons;
26927+ that should be fixed. */
26928+static inline void prefetch(const void *x)
26929+{
26930+ alternative_input(BASE_PREFETCH,
26931+ "prefetchnta (%1)",
26932+ X86_FEATURE_XMM,
26933+ "r" (x));
26934+}
26935+
26936+/* 3dnow! prefetch to get an exclusive cache line. Useful for
26937+ spinlocks to avoid one state transition in the cache coherency protocol. */
26938+static inline void prefetchw(const void *x)
26939+{
26940+ alternative_input(BASE_PREFETCH,
26941+ "prefetchw (%1)",
26942+ X86_FEATURE_3DNOW,
26943+ "r" (x));
26944+}
26945+
26946+#define spin_lock_prefetch(x) prefetchw(x)
26947 #ifdef CONFIG_X86_32
26948-# include "processor_32.h"
26949+/*
26950+ * User space process size: 3GB (default).
26951+ */
26952+#define TASK_SIZE (PAGE_OFFSET)
26953+#define STACK_TOP TASK_SIZE
26954+#define STACK_TOP_MAX STACK_TOP
26955+
26956+#define INIT_THREAD { \
26957+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
26958+ .vm86_info = NULL, \
26959+ .sysenter_cs = __KERNEL_CS, \
26960+ .io_bitmap_ptr = NULL, \
26961+ .fs = __KERNEL_PERCPU, \
26962+}
26963+
26964+/*
26965+ * Note that the .io_bitmap member must be extra-big. This is because
26966+ * the CPU will access an additional byte beyond the end of the IO
26967+ * permission bitmap. The extra byte must be all 1 bits, and must
26968+ * be within the limit.
26969+ */
26970+#define INIT_TSS { \
26971+ .x86_tss = { \
26972+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
26973+ .ss0 = __KERNEL_DS, \
26974+ .ss1 = __KERNEL_CS, \
26975+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
26976+ }, \
26977+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
26978+}
26979+
26980+#define start_thread(regs, new_eip, new_esp) do { \
26981+ __asm__("movl %0,%%gs": :"r" (0)); \
26982+ regs->fs = 0; \
26983+ set_fs(USER_DS); \
26984+ regs->ds = __USER_DS; \
26985+ regs->es = __USER_DS; \
26986+ regs->ss = __USER_DS; \
26987+ regs->cs = __USER_CS; \
26988+ regs->ip = new_eip; \
26989+ regs->sp = new_esp; \
26990+} while (0)
26991+
26992+
26993+extern unsigned long thread_saved_pc(struct task_struct *tsk);
26994+
26995+#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
26996+#define KSTK_TOP(info) \
26997+({ \
26998+ unsigned long *__ptr = (unsigned long *)(info); \
26999+ (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
27000+})
27001+
27002+/*
27003+ * The -8 below is to reserve 8 bytes on top of the ring0 stack.
27004+ * This is necessary to guarantee that the entire "struct pt_regs"
27005+ * is accessible even if the CPU hasn't stored the SS/ESP registers
27006+ * on the stack (interrupt gate does not save these registers
27007+ * when switching to the same priv ring).
27008+ * Therefore beware: accessing the ss/esp fields of the
27009+ * "struct pt_regs" is possible, but they may contain the
27010+ * completely wrong values.
27011+ */
27012+#define task_pt_regs(task) \
27013+({ \
27014+ struct pt_regs *__regs__; \
27015+ __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
27016+ __regs__ - 1; \
27017+})
27018+
27019+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
27020+
27021 #else
27022-# include "processor_64.h"
27023+/*
27024+ * User space process size. 47bits minus one guard page.
27025+ */
27026+#define TASK_SIZE64 (0x800000000000UL - 4096)
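[Editorial aside, a quick arithmetic check of the constant above: 2^47 = 0x800000000000, so removing one 4 KiB guard page leaves user space ending at 0x7ffffffff000.]

/* Standalone sanity check of the TASK_SIZE64 arithmetic (illustration only). */
#include <assert.h>

int main(void)
{
	assert((1ULL << 47) - 4096 == 0x7ffffffff000ULL);
	return 0;
}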
27027+
27028+/* This decides where the kernel will search for a free chunk of vm
27029+ * space during mmap's.
27030+ */
27031+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
27032+ 0xc0000000 : 0xFFFFe000)
27033+
27034+#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
27035+ IA32_PAGE_OFFSET : TASK_SIZE64)
27036+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
27037+ IA32_PAGE_OFFSET : TASK_SIZE64)
27038+
27039+#define STACK_TOP TASK_SIZE
27040+#define STACK_TOP_MAX TASK_SIZE64
27041+
27042+#define INIT_THREAD { \
27043+ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
27044+}
27045+
27046+#define INIT_TSS { \
27047+ .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
27048+}
27049+
27050+#define start_thread(regs, new_rip, new_rsp) do { \
27051+ asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
27052+ load_gs_index(0); \
27053+ (regs)->ip = (new_rip); \
27054+ (regs)->sp = (new_rsp); \
27055+ write_pda(oldrsp, (new_rsp)); \
27056+ (regs)->cs = __USER_CS; \
27057+ (regs)->ss = __USER_DS; \
27058+ (regs)->flags = 0x200; \
27059+ set_fs(USER_DS); \
27060+} while (0)
27061+
27062+/*
27063+ * Return saved PC of a blocked thread.
27064+ * What is this good for? It will always be the scheduler or ret_from_fork.
27065+ */
27066+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
27067+
27068+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
27069+#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
27070+#endif /* CONFIG_X86_64 */
27071+
27072+/* This decides where the kernel will search for a free chunk of vm
27073+ * space during mmap's.
27074+ */
27075+#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
27076+
27077+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
27078+
27079 #endif
27080--- a/include/asm-x86/mach-xen/asm/segment_32.h
27081+++ /dev/null
27082@@ -1,150 +0,0 @@
27083-#ifndef _ASM_SEGMENT_H
27084-#define _ASM_SEGMENT_H
27085-
27086-/*
27087- * The layout of the per-CPU GDT under Linux:
27088- *
27089- * 0 - null
27090- * 1 - reserved
27091- * 2 - reserved
27092- * 3 - reserved
27093- *
27094- * 4 - unused <==== new cacheline
27095- * 5 - unused
27096- *
27097- * ------- start of TLS (Thread-Local Storage) segments:
27098- *
27099- * 6 - TLS segment #1 [ glibc's TLS segment ]
27100- * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
27101- * 8 - TLS segment #3
27102- * 9 - reserved
27103- * 10 - reserved
27104- * 11 - reserved
27105- *
27106- * ------- start of kernel segments:
27107- *
27108- * 12 - kernel code segment <==== new cacheline
27109- * 13 - kernel data segment
27110- * 14 - default user CS
27111- * 15 - default user DS
27112- * 16 - TSS
27113- * 17 - LDT
27114- * 18 - PNPBIOS support (16->32 gate)
27115- * 19 - PNPBIOS support
27116- * 20 - PNPBIOS support
27117- * 21 - PNPBIOS support
27118- * 22 - PNPBIOS support
27119- * 23 - APM BIOS support
27120- * 24 - APM BIOS support
27121- * 25 - APM BIOS support
27122- *
27123- * 26 - ESPFIX small SS
27124- * 27 - per-cpu [ offset to per-cpu data area ]
27125- * 28 - unused
27126- * 29 - unused
27127- * 30 - unused
27128- * 31 - TSS for double fault handler
27129- */
27130-#define GDT_ENTRY_TLS_ENTRIES 3
27131-#define GDT_ENTRY_TLS_MIN 6
27132-#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
27133-
27134-#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
27135-
27136-#define GDT_ENTRY_DEFAULT_USER_CS 14
27137-#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
27138-
27139-#define GDT_ENTRY_DEFAULT_USER_DS 15
27140-#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
27141-
27142-#define GDT_ENTRY_KERNEL_BASE 12
27143-
27144-#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
27145-#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
27146-
27147-#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
27148-#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
27149-
27150-#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
27151-#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
27152-
27153-#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
27154-#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
27155-
27156-#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
27157-#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
27158-
27159-#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
27160-#ifdef CONFIG_SMP
27161-#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
27162-#else
27163-#define __KERNEL_PERCPU 0
27164-#endif
27165-
27166-#define GDT_ENTRY_DOUBLEFAULT_TSS 31
27167-
27168-/*
27169- * The GDT has 32 entries
27170- */
27171-#define GDT_ENTRIES 32
27172-#define GDT_SIZE (GDT_ENTRIES * 8)
27173-
27174-/* Simple and small GDT entries for booting only */
27175-
27176-#define GDT_ENTRY_BOOT_CS 2
27177-#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
27178-
27179-#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
27180-#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
27181-
27182-/* The PnP BIOS entries in the GDT */
27183-#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
27184-#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
27185-#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
27186-#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
27187-#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
27188-
27189-/* The PnP BIOS selectors */
27190-#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
27191-#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
27192-#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
27193-#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
27194-#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
27195-
27196-/*
27197- * The interrupt descriptor table has room for 256 idt's,
27198- * the global descriptor table is dependent on the number
27199- * of tasks we can have..
27200- */
27201-#define IDT_ENTRIES 256
27202-
27203-/* Bottom two bits of selector give the ring privilege level */
27204-#define SEGMENT_RPL_MASK 0x3
27205-/* Bit 2 is table indicator (LDT/GDT) */
27206-#define SEGMENT_TI_MASK 0x4
27207-
27208-/* User mode is privilege level 3 */
27209-#define USER_RPL 0x3
27210-/* LDT segment has TI set, GDT has it cleared */
27211-#define SEGMENT_LDT 0x4
27212-#define SEGMENT_GDT 0x0
27213-
27214-#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
27215-
27216-/*
27217- * Matching rules for certain types of segments.
27218- */
27219-
27220-/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
27221-#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
27222- || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
27223-
27224-/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
27225-#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
27226- || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
27227- || ((x) & ~3) == (FLAT_USER_CS & ~3))
27228-
27229-/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
27230-#define SEGMENT_IS_PNP_CODE(x) (((x) & ~0x0b) == GDT_ENTRY_PNPBIOS_BASE * 8)
27231-
27232-#endif
27233--- a/include/asm-x86/mach-xen/asm/segment.h
27234+++ b/include/asm-x86/mach-xen/asm/segment.h
27235@@ -1,5 +1,204 @@
27236+#ifndef _ASM_X86_SEGMENT_H_
27237+#define _ASM_X86_SEGMENT_H_
27238+
27239+/* Simple and small GDT entries for booting only */
27240+
27241+#define GDT_ENTRY_BOOT_CS 2
27242+#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
27243+
27244+#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
27245+#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
27246+
27247+#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
27248+#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
27249+
27250 #ifdef CONFIG_X86_32
27251-# include "segment_32.h"
27252+/*
27253+ * The layout of the per-CPU GDT under Linux:
27254+ *
27255+ * 0 - null
27256+ * 1 - reserved
27257+ * 2 - reserved
27258+ * 3 - reserved
27259+ *
27260+ * 4 - unused <==== new cacheline
27261+ * 5 - unused
27262+ *
27263+ * ------- start of TLS (Thread-Local Storage) segments:
27264+ *
27265+ * 6 - TLS segment #1 [ glibc's TLS segment ]
27266+ * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
27267+ * 8 - TLS segment #3
27268+ * 9 - reserved
27269+ * 10 - reserved
27270+ * 11 - reserved
27271+ *
27272+ * ------- start of kernel segments:
27273+ *
27274+ * 12 - kernel code segment <==== new cacheline
27275+ * 13 - kernel data segment
27276+ * 14 - default user CS
27277+ * 15 - default user DS
27278+ * 16 - TSS
27279+ * 17 - LDT
27280+ * 18 - PNPBIOS support (16->32 gate)
27281+ * 19 - PNPBIOS support
27282+ * 20 - PNPBIOS support
27283+ * 21 - PNPBIOS support
27284+ * 22 - PNPBIOS support
27285+ * 23 - APM BIOS support
27286+ * 24 - APM BIOS support
27287+ * 25 - APM BIOS support
27288+ *
27289+ * 26 - ESPFIX small SS
27290+ * 27 - per-cpu [ offset to per-cpu data area ]
27291+ * 28 - unused
27292+ * 29 - unused
27293+ * 30 - unused
27294+ * 31 - TSS for double fault handler
27295+ */
27296+#define GDT_ENTRY_TLS_MIN 6
27297+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
27298+
27299+#define GDT_ENTRY_DEFAULT_USER_CS 14
27300+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
27301+
27302+#define GDT_ENTRY_DEFAULT_USER_DS 15
27303+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
27304+
27305+#define GDT_ENTRY_KERNEL_BASE 12
27306+
27307+#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
27308+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
27309+
27310+#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
27311+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
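[Editorial aside, a worked example rather than patch text: a selector is the GDT index times 8 with the RPL in the low two bits, so the layout above yields the values printed below.]

/* Illustrative selector arithmetic for the 32-bit layout above. */
#include <stdio.h>

int main(void)
{
	printf("__KERNEL_CS = 0x%x\n", 12 * 8);		/* 0x60 */
	printf("__KERNEL_DS = 0x%x\n", 13 * 8);		/* 0x68 */
	printf("__USER_CS   = 0x%x\n", 14 * 8 + 3);	/* 0x73, RPL 3 */
	printf("__USER_DS   = 0x%x\n", 15 * 8 + 3);	/* 0x7b, RPL 3 */
	return 0;
}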
27312+
27313+#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
27314+#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
27315+
27316+#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
27317+#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
27318+
27319+#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
27320+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
27321+
27322+#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
27323+#ifdef CONFIG_SMP
27324+#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
27325 #else
27326-# include "../../segment_64.h"
27327+#define __KERNEL_PERCPU 0
27328+#endif
27329+
27330+#define GDT_ENTRY_DOUBLEFAULT_TSS 31
27331+
27332+/*
27333+ * The GDT has 32 entries
27334+ */
27335+#define GDT_ENTRIES 32
27336+
27337+/* The PnP BIOS entries in the GDT */
27338+#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
27339+#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
27340+#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
27341+#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
27342+#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
27343+
27344+/* The PnP BIOS selectors */
27345+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
27346+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
27347+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
27348+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
27349+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
27350+
27351+/* Bottom two bits of selector give the ring privilege level */
27352+#define SEGMENT_RPL_MASK 0x3
27353+/* Bit 2 is table indicator (LDT/GDT) */
27354+#define SEGMENT_TI_MASK 0x4
27355+
27356+/* User mode is privilege level 3 */
27357+#define USER_RPL 0x3
27358+/* LDT segment has TI set, GDT has it cleared */
27359+#define SEGMENT_LDT 0x4
27360+#define SEGMENT_GDT 0x0
27361+
27362+/*
27363+ * Matching rules for certain types of segments.
27364+ */
27365+
27366+/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
27367+#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
27368+ || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
27369+
27370+/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
27371+#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
27372+ || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
27373+ || ((x) & ~3) == (FLAT_USER_CS & ~3))
27374+
27375+/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
27376+#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
27377+
27378+#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
27379+
27380+#else
27381+#include <asm/cache.h>
27382+
27383+#define __KERNEL_CS 0x10
27384+#define __KERNEL_DS 0x18
27385+
27386+#define __KERNEL32_CS 0x08
27387+
27388+/*
27389+ * we cannot use the same code segment descriptor for user and kernel
27390+ * -- not even in the long flat mode, because of different DPL /kkeil
27391+ * The segment offset needs to contain a RPL. Grr. -AK
27392+ * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
27393+ */
27394+
27395+#define __USER32_CS 0x23 /* 4*8+3 */
27396+#define __USER_DS 0x2b /* 5*8+3 */
27397+#define __USER_CS 0x33 /* 6*8+3 */
27398+#define __USER32_DS __USER_DS
27399+
27400+#define GDT_ENTRY_TSS 8 /* needs two entries */
27401+#define GDT_ENTRY_LDT 10 /* needs two entries */
27402+#define GDT_ENTRY_TLS_MIN 12
27403+#define GDT_ENTRY_TLS_MAX 14
27404+
27405+#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
27406+#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
27407+
27408+/* TLS indexes for 64bit - hardcoded in arch_prctl */
27409+#define FS_TLS 0
27410+#define GS_TLS 1
27411+
27412+#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
27413+#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
27414+
27415+#define GDT_ENTRIES 16
27416+
27417+#endif
27418+
27419+/* User mode is privilege level 3 */
27420+#define USER_RPL 0x3
27421+/* LDT segment has TI set, GDT has it cleared */
27422+#define SEGMENT_LDT 0x4
27423+#define SEGMENT_GDT 0x0
27424+
27425+/* Bottom two bits of selector give the ring privilege level */
27426+#define SEGMENT_RPL_MASK 0x3
27427+/* Bit 2 is table indicator (LDT/GDT) */
27428+#define SEGMENT_TI_MASK 0x4
27429+
27430+#define IDT_ENTRIES 256
27431+#define GDT_SIZE (GDT_ENTRIES * 8)
27432+#define GDT_ENTRY_TLS_ENTRIES 3
27433+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
27434+
27435+#ifdef __KERNEL__
27436+#ifndef __ASSEMBLY__
27437+extern const char early_idt_handlers[IDT_ENTRIES][10];
27438+#endif
27439+#endif
27440+
27441 #endif
27442--- a/include/asm-x86/mach-xen/asm/smp_32.h
27443+++ b/include/asm-x86/mach-xen/asm/smp_32.h
27444@@ -1,56 +1,51 @@
27445 #ifndef __ASM_SMP_H
27446 #define __ASM_SMP_H
27447
27448+#ifndef __ASSEMBLY__
27449+#include <linux/cpumask.h>
27450+#include <linux/init.h>
27451+
27452 /*
27453 * We need the APIC definitions automatically as part of 'smp.h'
27454 */
27455-#ifndef __ASSEMBLY__
27456-#include <linux/kernel.h>
27457-#include <linux/threads.h>
27458-#include <linux/cpumask.h>
27459+#ifdef CONFIG_X86_LOCAL_APIC
27460+# include <asm/mpspec.h>
27461+# include <asm/apic.h>
27462+# ifdef CONFIG_X86_IO_APIC
27463+# include <asm/io_apic.h>
27464+# endif
27465 #endif
27466
27467-#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
27468-#include <linux/bitops.h>
27469-#include <asm/mpspec.h>
27470-#include <asm/apic.h>
27471-#ifdef CONFIG_X86_IO_APIC
27472-#include <asm/io_apic.h>
27473-#endif
27474-#endif
27475+#define cpu_callout_map cpu_possible_map
27476+#define cpu_callin_map cpu_possible_map
27477
27478-#define BAD_APICID 0xFFu
27479-#ifdef CONFIG_SMP
27480-#ifndef __ASSEMBLY__
27481+extern int smp_num_siblings;
27482+extern unsigned int num_processors;
27483
27484-/*
27485- * Private routines/data
27486- */
27487-
27488 extern void smp_alloc_memory(void);
27489-extern int pic_mode;
27490-extern int smp_num_siblings;
27491-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27492-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27493+extern void lock_ipi_call_lock(void);
27494+extern void unlock_ipi_call_lock(void);
27495
27496 extern void (*mtrr_hook) (void);
27497 extern void zap_low_mappings (void);
27498-extern void lock_ipi_call_lock(void);
27499-extern void unlock_ipi_call_lock(void);
27500
27501-#define MAX_APICID 256
27502-extern u8 __initdata x86_cpu_to_apicid_init[];
27503-extern void *x86_cpu_to_apicid_ptr;
27504+DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27505+DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27506+DECLARE_PER_CPU(u8, cpu_llc_id);
27507 DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
27508
27509-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27510-
27511 #ifdef CONFIG_HOTPLUG_CPU
27512 extern void cpu_exit_clear(void);
27513 extern void cpu_uninit(void);
27514 #endif
27515
27516+#ifdef CONFIG_SMP
27517+
27518 #ifndef CONFIG_XEN
27519+
27520+/* Globals due to paravirt */
27521+extern void set_cpu_sibling_map(int cpu);
27522+
27523 struct smp_ops
27524 {
27525 void (*smp_prepare_boot_cpu)(void);
27526@@ -104,11 +99,11 @@ void native_smp_prepare_cpus(unsigned in
27527 int native_cpu_up(unsigned int cpunum);
27528 void native_smp_cpus_done(unsigned int max_cpus);
27529
27530-#define startup_ipi_hook(phys_apicid, start_eip, start_esp) \
27531-do { } while (0)
27532-
27533-#else
27534+#ifndef CONFIG_PARAVIRT
27535+#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
27536+#endif
27537
27538+#else /* CONFIG_XEN */
27539
27540 void xen_smp_send_stop(void);
27541 void xen_smp_send_reschedule(int cpu);
27542@@ -120,7 +115,12 @@ int xen_smp_call_function_mask(cpumask_t
27543 #define smp_send_reschedule xen_smp_send_reschedule
27544 #define smp_call_function_mask xen_smp_call_function_mask
27545
27546-#endif
27547+extern void prefill_possible_map(void);
27548+
27549+#endif /* CONFIG_XEN */
27550+
27551+extern int __cpu_disable(void);
27552+extern void __cpu_die(unsigned int cpu);
27553
27554 /*
27555 * This function is needed by all SMP systems. It must _always_ be valid
27556@@ -130,64 +130,49 @@ int xen_smp_call_function_mask(cpumask_t
27557 DECLARE_PER_CPU(int, cpu_number);
27558 #define raw_smp_processor_id() (x86_read_percpu(cpu_number))
27559
27560-extern cpumask_t cpu_possible_map;
27561-#define cpu_callin_map cpu_possible_map
27562+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27563+
27564+#define safe_smp_processor_id() smp_processor_id()
27565
27566 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
27567 static inline int num_booting_cpus(void)
27568 {
27569- return cpus_weight(cpu_possible_map);
27570+ return cpus_weight(cpu_callout_map);
27571 }
27572
27573-#define safe_smp_processor_id() smp_processor_id()
27574-extern int __cpu_disable(void);
27575-extern void __cpu_die(unsigned int cpu);
27576-extern void prefill_possible_map(void);
27577-extern unsigned int num_processors;
27578-
27579-#endif /* !__ASSEMBLY__ */
27580-
27581 #else /* CONFIG_SMP */
27582
27583 #define safe_smp_processor_id() 0
27584 #define cpu_physical_id(cpu) boot_cpu_physical_apicid
27585
27586-#define NO_PROC_ID 0xFF /* No processor magic marker */
27587-
27588-#endif /* CONFIG_SMP */
27589-
27590-#ifndef __ASSEMBLY__
27591+#endif /* !CONFIG_SMP */
27592
27593 #ifdef CONFIG_X86_LOCAL_APIC
27594
27595-#ifdef APIC_DEFINITION
27596+static __inline int logical_smp_processor_id(void)
27597+{
27598+ /* we don't want to mark this access volatile - bad code generation */
27599+ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27600+}
27601+
27602+# ifdef APIC_DEFINITION
27603 extern int hard_smp_processor_id(void);
27604-#else
27605-#include <mach_apicdef.h>
27606+# else
27607+# include <mach_apicdef.h>
27608 static inline int hard_smp_processor_id(void)
27609 {
27610 /* we don't want to mark this access volatile - bad code generation */
27611- return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
27612+ return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27613 }
27614-#endif /* APIC_DEFINITION */
27615+# endif /* APIC_DEFINITION */
27616
27617 #else /* CONFIG_X86_LOCAL_APIC */
27618
27619-#ifndef CONFIG_SMP
27620-#define hard_smp_processor_id() 0
27621-#endif
27622+# ifndef CONFIG_SMP
27623+# define hard_smp_processor_id() 0
27624+# endif
27625
27626 #endif /* CONFIG_X86_LOCAL_APIC */
27627
27628-extern u8 apicid_2_node[];
27629-
27630-#ifdef CONFIG_X86_LOCAL_APIC
27631-static __inline int logical_smp_processor_id(void)
27632-{
27633- /* we don't want to mark this access volatile - bad code generation */
27634- return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27635-}
27636-#endif
27637-#endif
27638-
27639+#endif /* !ASSEMBLY */
27640 #endif
27641--- a/include/asm-x86/mach-xen/asm/smp_64.h
27642+++ b/include/asm-x86/mach-xen/asm/smp_64.h
27643@@ -1,139 +1,103 @@
27644 #ifndef __ASM_SMP_H
27645 #define __ASM_SMP_H
27646
27647-/*
27648- * We need the APIC definitions automatically as part of 'smp.h'
27649- */
27650-#include <linux/threads.h>
27651 #include <linux/cpumask.h>
27652-#include <linux/bitops.h>
27653 #include <linux/init.h>
27654-extern int disable_apic;
27655
27656 #ifdef CONFIG_X86_LOCAL_APIC
27657-#include <asm/mpspec.h>
27658+/*
27659+ * We need the APIC definitions automatically as part of 'smp.h'
27660+ */
27661 #include <asm/apic.h>
27662 #ifdef CONFIG_X86_IO_APIC
27663 #include <asm/io_apic.h>
27664 #endif
27665-#include <asm/thread_info.h>
27666+#include <asm/mpspec.h>
27667 #endif
27668-
27669-#ifdef CONFIG_SMP
27670-
27671 #include <asm/pda.h>
27672+#include <asm/thread_info.h>
27673
27674-struct pt_regs;
27675-
27676-extern cpumask_t cpu_present_mask;
27677-extern cpumask_t cpu_possible_map;
27678-extern cpumask_t cpu_online_map;
27679 extern cpumask_t cpu_initialized;
27680
27681-/*
27682- * Private routines/data
27683- */
27684-
27685+extern int smp_num_siblings;
27686+extern unsigned int num_processors;
27687+
27688 extern void smp_alloc_memory(void);
27689-extern volatile unsigned long smp_invalidate_needed;
27690 extern void lock_ipi_call_lock(void);
27691 extern void unlock_ipi_call_lock(void);
27692-extern int smp_num_siblings;
27693-extern void smp_send_reschedule(int cpu);
27694+
27695 extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
27696 void *info, int wait);
27697
27698-/*
27699- * cpu_sibling_map and cpu_core_map now live
27700- * in the per cpu area
27701- *
27702- * extern cpumask_t cpu_sibling_map[NR_CPUS];
27703- * extern cpumask_t cpu_core_map[NR_CPUS];
27704- */
27705 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27706 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27707-DECLARE_PER_CPU(u8, cpu_llc_id);
27708-
27709-#define SMP_TRAMPOLINE_BASE 0x6000
27710+DECLARE_PER_CPU(u16, cpu_llc_id);
27711+DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
27712+DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
27713
27714-/*
27715- * On x86 all CPUs are mapped 1:1 to the APIC space.
27716- * This simplifies scheduling and IPI sending and
27717- * compresses data structures.
27718- */
27719-
27720-static inline int num_booting_cpus(void)
27721+#ifdef CONFIG_X86_LOCAL_APIC
27722+static inline int cpu_present_to_apicid(int mps_cpu)
27723 {
27724- return cpus_weight(cpu_possible_map);
27725+ if (cpu_present(mps_cpu))
27726+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
27727+ else
27728+ return BAD_APICID;
27729 }
27730+#endif
27731
27732-#define raw_smp_processor_id() read_pda(cpunumber)
27733+#ifdef CONFIG_SMP
27734+
27735+#define SMP_TRAMPOLINE_BASE 0x6000
27736
27737 extern int __cpu_disable(void);
27738 extern void __cpu_die(unsigned int cpu);
27739 extern void prefill_possible_map(void);
27740-extern unsigned num_processors;
27741 extern unsigned __cpuinitdata disabled_cpus;
27742
27743-#define NO_PROC_ID 0xFF /* No processor magic marker */
27744-
27745-#endif /* CONFIG_SMP */
27746+#define raw_smp_processor_id() read_pda(cpunumber)
27747+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27748
27749-#define safe_smp_processor_id() smp_processor_id()
27750-
27751-#ifdef CONFIG_X86_LOCAL_APIC
27752-static inline int hard_smp_processor_id(void)
27753-{
27754- /* we don't want to mark this access volatile - bad code generation */
27755- return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
27756-}
27757-#endif
27758+#define stack_smp_processor_id() \
27759+ ({ \
27760+ struct thread_info *ti; \
27761+ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
27762+ ti->cpu; \
27763+})
27764
27765 /*
27766- * Some lowlevel functions might want to know about
27767- * the real APIC ID <-> CPU # mapping.
27768+ * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
27769+ * scheduling and IPI sending and compresses data structures.
27770 */
27771-extern u8 __initdata x86_cpu_to_apicid_init[];
27772-extern void *x86_cpu_to_apicid_ptr;
27773-DECLARE_PER_CPU(u8, x86_cpu_to_apicid); /* physical ID */
27774-extern u8 bios_cpu_apicid[];
27775-
27776-#ifdef CONFIG_X86_LOCAL_APIC
27777-static inline int cpu_present_to_apicid(int mps_cpu)
27778+static inline int num_booting_cpus(void)
27779 {
27780- if (mps_cpu < NR_CPUS)
27781- return (int)bios_cpu_apicid[mps_cpu];
27782- else
27783- return BAD_APICID;
27784+ return cpus_weight(cpu_possible_map);
27785 }
27786-#endif
27787
27788-#ifndef CONFIG_SMP
27789+extern void smp_send_reschedule(int cpu);
27790+
27791+#else /* CONFIG_SMP */
27792+
27793+extern unsigned int boot_cpu_id;
27794+#define cpu_physical_id(cpu) boot_cpu_id
27795 #define stack_smp_processor_id() 0
27796-#define cpu_logical_map(x) (x)
27797-#else
27798-#include <asm/thread_info.h>
27799-#define stack_smp_processor_id() \
27800-({ \
27801- struct thread_info *ti; \
27802- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
27803- ti->cpu; \
27804-})
27805-#endif
27806+
27807+#endif /* !CONFIG_SMP */
27808+
27809+#define safe_smp_processor_id() smp_processor_id()
27810
27811 #ifdef CONFIG_X86_LOCAL_APIC
27812 static __inline int logical_smp_processor_id(void)
27813 {
27814 /* we don't want to mark this access volatile - bad code generation */
27815- return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27816+ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27817+}
27818+
27819+static inline int hard_smp_processor_id(void)
27820+{
27821+ /* we don't want to mark this access volatile - bad code generation */
27822+ return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27823 }
27824 #endif
27825
27826-#ifdef CONFIG_SMP
27827-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27828-#else
27829-extern unsigned int boot_cpu_id;
27830-#define cpu_physical_id(cpu) boot_cpu_id
27831-#endif /* !CONFIG_SMP */
27832 #endif
27833
27834--- /dev/null
27835+++ b/include/asm-x86/mach-xen/asm/spinlock.h
27836@@ -0,0 +1,333 @@
27837+#ifndef _X86_SPINLOCK_H_
27838+#define _X86_SPINLOCK_H_
27839+
27840+#include <asm/atomic.h>
27841+#include <asm/rwlock.h>
27842+#include <asm/page.h>
27843+#include <asm/processor.h>
27844+#include <linux/compiler.h>
27845+
27846+/*
27847+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
27848+ *
27849+ * Simple spin lock operations. There are two variants, one clears IRQ's
27850+ * on the local processor, one does not.
27851+ *
27852+ * These are fair FIFO ticket locks, which are currently limited to 256
27853+ * CPUs.
27854+ *
27855+ * (the type definitions are in asm/spinlock_types.h)
27856+ */
27857+
27858+#ifdef CONFIG_X86_32
27859+# define LOCK_PTR_REG "a"
27860+# define REG_PTR_MODE "k"
27861+#else
27862+# define LOCK_PTR_REG "D"
27863+# define REG_PTR_MODE "q"
27864+#endif
27865+
27866+#if defined(CONFIG_X86_32) && \
27867+ (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
27868+/*
27869+ * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
27870+ * (PPro errata 66, 92)
27871+ */
27872+# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
27873+#else
27874+# define UNLOCK_LOCK_PREFIX
27875+#endif
27876+
27877+int xen_spinlock_init(unsigned int cpu);
27878+void xen_spinlock_cleanup(unsigned int cpu);
27879+extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
27880+extern int xen_spin_wait_flags(raw_spinlock_t *, unsigned int *token,
27881+ unsigned int flags);
27882+extern unsigned int xen_spin_adjust(raw_spinlock_t *, unsigned int token);
27883+extern void xen_spin_kick(raw_spinlock_t *, unsigned int token);
27884+
27885+/*
27886+ * Ticket locks are conceptually two parts, one indicating the current head of
27887+ * the queue, and the other indicating the current tail. The lock is acquired
27888+ * by atomically noting the tail and incrementing it by one (thus adding
27889+ * ourself to the queue and noting our position), then waiting until the head
27890+ * becomes equal to the initial value of the tail.
27891+ *
27892+ * We use an xadd covering *both* parts of the lock, to increment the tail and
27893+ * also load the position of the head, which takes care of memory ordering
27894+ * issues and should be optimal for the uncontended case. Note the tail must be
27895+ * in the high part, because a wide xadd increment of the low part would carry
27896+ * up and contaminate the high part.
27897+ *
27898+ * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
27899+ * save some instructions and make the code more elegant. There really isn't
27900+ * much between them in performance though, especially as locks are out of line.
27901+ */
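[Editorial aside: the scheme described in the comment above can be modelled in portable C with GCC atomic builtins; this sketch deliberately omits the Xen-specific parking via xen_spin_wait()/xen_spin_kick() that the real code below adds, and it only covers the NR_CPUS < 256 byte-wide layout.]

/* Compact model of the byte-wide ticket lock: low byte = head (owner),
 * high byte = tail (next ticket). Little-endian x86 layout assumed. */
#include <stdint.h>

typedef struct {
	uint16_t slock;
} ticket_lock_t;

static void ticket_lock(ticket_lock_t *lk)
{
	/* One xadd grabs our ticket (old tail) and reads the head in the same access. */
	uint16_t old = __atomic_fetch_add(&lk->slock, 0x0100, __ATOMIC_ACQUIRE);
	uint8_t ticket = old >> 8;

	while ((uint8_t)old != ticket) {
		__asm__ __volatile__("rep; nop" ::: "memory");	/* cpu_relax() */
		old = __atomic_load_n(&lk->slock, __ATOMIC_ACQUIRE);
	}
}

static void ticket_unlock(ticket_lock_t *lk)
{
	/* Advance the head byte (lowest-addressed byte of slock on x86). */
	__atomic_fetch_add((uint8_t *)&lk->slock, 1, __ATOMIC_RELEASE);
}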
27902+#if (NR_CPUS < 256)
27903+#define TICKET_SHIFT 8
27904+#define __raw_spin_lock_preamble \
27905+ asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
27906+ "cmpb %h0, %b0\n\t" \
27907+ "sete %1" \
27908+ : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
27909+ : "0" (0x0100) \
27910+ : "memory", "cc")
27911+#define __raw_spin_lock_body \
27912+ asm("1:\t" \
27913+ "cmpb %h0, %b0\n\t" \
27914+ "je 2f\n\t" \
27915+ "decl %1\n\t" \
27916+ "jz 2f\n\t" \
27917+ "rep ; nop\n\t" \
27918+ "movb %2, %b0\n\t" \
27919+ /* don't need lfence here, because loads are in-order */ \
27920+ "jmp 1b\n" \
27921+ "2:" \
27922+ : "+Q" (token), "+g" (count) \
27923+ : "m" (lock->slock) \
27924+ : "memory", "cc")
27925+
27926+
27927+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27928+{
27929+ int tmp, new;
27930+
27931+ asm("movzwl %2, %0\n\t"
27932+ "cmpb %h0, %b0\n\t"
27933+ "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
27934+ "jne 1f\n\t"
27935+ LOCK_PREFIX "cmpxchgw %w1, %2\n\t"
27936+ "1:\t"
27937+ "sete %b1\n\t"
27938+ "movzbl %b1, %0\n\t"
27939+ : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
27940+ :
27941+ : "memory", "cc");
27942+
27943+ return tmp;
27944+}
27945+
27946+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
27947+{
27948+ unsigned int token;
27949+ unsigned char kick;
27950+
27951+ asm(UNLOCK_LOCK_PREFIX "incb %2\n\t"
27952+ "movzwl %2, %0\n\t"
27953+ "cmpb %h0, %b0\n\t"
27954+ "setne %1"
27955+ : "=&Q" (token), "=qm" (kick), "+m" (lock->slock)
27956+ :
27957+ : "memory", "cc");
27958+ if (kick)
27959+ xen_spin_kick(lock, token);
27960+}
27961+#else
27962+#define TICKET_SHIFT 16
27963+#define __raw_spin_lock_preamble \
27964+ do { \
27965+ unsigned int tmp; \
27966+ asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
27967+ "shldl $16, %0, %3\n\t" \
27968+ "cmpw %w3, %w0\n\t" \
27969+ "sete %1"
27970+ : "=&r" (token), "=qm" (free), "+m" (lock->slock), \
27971+ "=&g" (tmp) \
27972+ : "0" (0x00010000) \
27973+ : "memory", "cc"); \
27974+ } while (0)
27975+#define __raw_spin_lock_body \
27976+ do { \
27977+ unsigned int tmp; \
27978+ asm("shldl $16, %0, %2\n" \
27979+ "1:\t" \
27980+ "cmpw %w2, %w0\n\t" \
27981+ "je 2f\n\t" \
27982+ "decl %1\n\t" \
27983+ "jz 2f\n\t" \
27984+ "rep ; nop\n\t" \
27985+ "movw %3, %w0\n\t" \
27986+ /* don't need lfence here, because loads are in-order */ \
27987+ "jmp 1b\n" \
27988+ "2:" \
27989+ : "+r" (token), "+g" (count), "=&g" (tmp) \
27990+ : "m" (lock->slock) \
27991+ : "memory", "cc"); \
27992+ } while (0)
27993+
27994+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27995+{
27996+ int tmp;
27997+ int new;
27998+
27999+ asm("movl %2, %0\n\t"
28000+ "movl %0, %1\n\t"
28001+ "roll $16, %0\n\t"
28002+ "cmpl %0, %1\n\t"
28003+ "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
28004+ "jne 1f\n\t"
28005+ LOCK_PREFIX "cmpxchgl %1, %2\n"
28006+ "1:\t"
28007+ "sete %b1\n\t"
28008+ "movzbl %b1, %0\n\t"
28009+ : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
28010+ :
28011+ : "memory", "cc");
28012+
28013+ return tmp;
28014+}
28015+
28016+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
28017+{
28018+ unsigned int token, tmp;
28019+ bool kick;
28020+
28021+ asm(UNLOCK_LOCK_PREFIX "incw %2\n\t"
28022+ "movl %2, %0\n\t"
28023+ "shldl $16, %0, %3\n\t"
28024+ "cmpw %w3, %w0\n\t"
28025+ "setne %1"
28026+ : "=&r" (token), "=qm" (kick), "+m" (lock->slock), "=&r" (tmp)
28027+ :
28028+ : "memory", "cc");
28029+ if (kick)
28030+ xen_spin_kick(lock, token);
28031+}
28032+#endif
28033+
28034+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
28035+{
28036+ int tmp = *(volatile signed int *)(&(lock)->slock);
28037+
28038+ return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
28039+}
28040+
28041+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
28042+{
28043+ int tmp = *(volatile signed int *)(&(lock)->slock);
28044+
28045+ return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
28046+}
28047+
28048+static inline void __raw_spin_lock(raw_spinlock_t *lock)
28049+{
28050+ unsigned int token, count;
28051+ bool free;
28052+
28053+ __raw_spin_lock_preamble;
28054+ if (unlikely(!free))
28055+ token = xen_spin_adjust(lock, token);
28056+ do {
28057+ count = 1 << 10;
28058+ __raw_spin_lock_body;
28059+ } while (unlikely(!count) && !xen_spin_wait(lock, token));
28060+}
28061+
28062+static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
28063+ unsigned long flags)
28064+{
28065+ unsigned int token, count;
28066+ bool free;
28067+
28068+ __raw_spin_lock_preamble;
28069+ if (unlikely(!free))
28070+ token = xen_spin_adjust(lock, token);
28071+ do {
28072+ count = 1 << 10;
28073+ __raw_spin_lock_body;
28074+ } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
28075+}
28076+
28077+static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
28078+{
28079+ while (__raw_spin_is_locked(lock))
28080+ cpu_relax();
28081+}
28082+
28083+/*
28084+ * Read-write spinlocks, allowing multiple readers
28085+ * but only one writer.
28086+ *
28087+ * NOTE! it is quite common to have readers in interrupts
28088+ * but no interrupt writers. For those circumstances we
28089+ * can "mix" irq-safe locks - any writer needs to get a
28090+ * irq-safe write-lock, but readers can get non-irqsafe
28091+ * read-locks.
28092+ *
28093+ * On x86, we implement read-write locks as a 32-bit counter
28094+ * with the high bit (sign) being the "contended" bit.
28095+ */
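[Editorial aside, a toy C11 model of the biased counter; RW_LOCK_BIAS is taken as 0x01000000 here, an assumption based on <asm/rwlock.h> of this kernel era. Each reader subtracts 1, a writer subtracts the whole bias, so the count stays positive while only readers are in and reaches zero only for a lone writer.]

/* Toy model of the biased rwlock counter used by the code below. */
#include <stdatomic.h>

#define RW_LOCK_BIAS 0x01000000		/* assumed value, see asm/rwlock.h */

static atomic_int rw_counter = RW_LOCK_BIAS;

static int model_read_trylock(void)
{
	if (atomic_fetch_sub(&rw_counter, 1) - 1 >= 0)
		return 1;				/* still non-negative: only readers hold it */
	atomic_fetch_add(&rw_counter, 1);		/* a writer is in: back out */
	return 0;
}

static int model_write_trylock(void)
{
	if (atomic_fetch_sub(&rw_counter, RW_LOCK_BIAS) - RW_LOCK_BIAS == 0)
		return 1;				/* counter was exactly BIAS: lock was free */
	atomic_fetch_add(&rw_counter, RW_LOCK_BIAS);	/* readers or a writer present: back out */
	return 0;
}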
28096+
28097+/**
28098+ * read_can_lock - would read_trylock() succeed?
28099+ * @lock: the rwlock in question.
28100+ */
28101+static inline int __raw_read_can_lock(raw_rwlock_t *lock)
28102+{
28103+ return (int)(lock)->lock > 0;
28104+}
28105+
28106+/**
28107+ * write_can_lock - would write_trylock() succeed?
28108+ * @lock: the rwlock in question.
28109+ */
28110+static inline int __raw_write_can_lock(raw_rwlock_t *lock)
28111+{
28112+ return (lock)->lock == RW_LOCK_BIAS;
28113+}
28114+
28115+static inline void __raw_read_lock(raw_rwlock_t *rw)
28116+{
28117+ asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
28118+ "jns 1f\n"
28119+ "call __read_lock_failed\n\t"
28120+ "1:\n"
28121+ ::LOCK_PTR_REG (rw) : "memory");
28122+}
28123+
28124+static inline void __raw_write_lock(raw_rwlock_t *rw)
28125+{
28126+ asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
28127+ "jz 1f\n"
28128+ "call __write_lock_failed\n\t"
28129+ "1:\n"
28130+ ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
28131+}
28132+
28133+static inline int __raw_read_trylock(raw_rwlock_t *lock)
28134+{
28135+ atomic_t *count = (atomic_t *)lock;
28136+
28137+ atomic_dec(count);
28138+ if (atomic_read(count) >= 0)
28139+ return 1;
28140+ atomic_inc(count);
28141+ return 0;
28142+}
28143+
28144+static inline int __raw_write_trylock(raw_rwlock_t *lock)
28145+{
28146+ atomic_t *count = (atomic_t *)lock;
28147+
28148+ if (atomic_sub_and_test(RW_LOCK_BIAS, count))
28149+ return 1;
28150+ atomic_add(RW_LOCK_BIAS, count);
28151+ return 0;
28152+}
28153+
28154+static inline void __raw_read_unlock(raw_rwlock_t *rw)
28155+{
28156+ asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
28157+}
28158+
28159+static inline void __raw_write_unlock(raw_rwlock_t *rw)
28160+{
28161+ asm volatile(LOCK_PREFIX "addl %1, %0"
28162+ : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
28163+}
28164+
28165+#define _raw_spin_relax(lock) cpu_relax()
28166+#define _raw_read_relax(lock) cpu_relax()
28167+#define _raw_write_relax(lock) cpu_relax()
28168+
28169+#endif
28170--- a/include/asm-x86/mach-xen/asm/system_32.h
28171+++ /dev/null
28172@@ -1,312 +0,0 @@
28173-#ifndef __ASM_SYSTEM_H
28174-#define __ASM_SYSTEM_H
28175-
28176-#include <linux/kernel.h>
28177-#include <asm/segment.h>
28178-#include <asm/cpufeature.h>
28179-#include <asm/cmpxchg.h>
28180-#include <asm/synch_bitops.h>
28181-#include <asm/hypervisor.h>
28182-
28183-#ifdef __KERNEL__
28184-#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
28185-
28186-struct task_struct; /* one of the stranger aspects of C forward declarations.. */
28187-extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
28188-
28189-/*
28190- * Saving eflags is important. It switches not only IOPL between tasks,
28191- * it also protects other tasks from NT leaking through sysenter etc.
28192- */
28193-#define switch_to(prev,next,last) do { \
28194- unsigned long esi,edi; \
28195- asm volatile("pushfl\n\t" /* Save flags */ \
28196- "pushl %%ebp\n\t" \
28197- "movl %%esp,%0\n\t" /* save ESP */ \
28198- "movl %5,%%esp\n\t" /* restore ESP */ \
28199- "movl $1f,%1\n\t" /* save EIP */ \
28200- "pushl %6\n\t" /* restore EIP */ \
28201- "jmp __switch_to\n" \
28202- "1:\t" \
28203- "popl %%ebp\n\t" \
28204- "popfl" \
28205- :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
28206- "=a" (last),"=S" (esi),"=D" (edi) \
28207- :"m" (next->thread.esp),"m" (next->thread.eip), \
28208- "2" (prev), "d" (next)); \
28209-} while (0)
28210-
28211-#define _set_base(addr,base) do { unsigned long __pr; \
28212-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28213- "rorl $16,%%edx\n\t" \
28214- "movb %%dl,%2\n\t" \
28215- "movb %%dh,%3" \
28216- :"=&d" (__pr) \
28217- :"m" (*((addr)+2)), \
28218- "m" (*((addr)+4)), \
28219- "m" (*((addr)+7)), \
28220- "0" (base) \
28221- ); } while(0)
28222-
28223-#define _set_limit(addr,limit) do { unsigned long __lr; \
28224-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28225- "rorl $16,%%edx\n\t" \
28226- "movb %2,%%dh\n\t" \
28227- "andb $0xf0,%%dh\n\t" \
28228- "orb %%dh,%%dl\n\t" \
28229- "movb %%dl,%2" \
28230- :"=&d" (__lr) \
28231- :"m" (*(addr)), \
28232- "m" (*((addr)+6)), \
28233- "0" (limit) \
28234- ); } while(0)
28235-
28236-#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
28237-#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
28238-
28239-/*
28240- * Load a segment. Fall back on loading the zero
28241- * segment if something goes wrong..
28242- */
28243-#define loadsegment(seg,value) \
28244- asm volatile("\n" \
28245- "1:\t" \
28246- "mov %0,%%" #seg "\n" \
28247- "2:\n" \
28248- ".section .fixup,\"ax\"\n" \
28249- "3:\t" \
28250- "pushl $0\n\t" \
28251- "popl %%" #seg "\n\t" \
28252- "jmp 2b\n" \
28253- ".previous\n" \
28254- ".section __ex_table,\"a\"\n\t" \
28255- ".align 4\n\t" \
28256- ".long 1b,3b\n" \
28257- ".previous" \
28258- : :"rm" (value))
28259-
28260-/*
28261- * Save a segment register away
28262- */
28263-#define savesegment(seg, value) \
28264- asm volatile("mov %%" #seg ",%0":"=rm" (value))
28265-
28266-static inline void xen_clts(void)
28267-{
28268- HYPERVISOR_fpu_taskswitch(0);
28269-}
28270-
28271-static inline unsigned long xen_read_cr0(void)
28272-{
28273- unsigned long val;
28274- asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
28275- return val;
28276-}
28277-
28278-static inline void xen_write_cr0(unsigned long val)
28279-{
28280- asm volatile("movl %0,%%cr0": :"r" (val));
28281-}
28282-
28283-#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28284-
28285-static inline void xen_write_cr2(unsigned long val)
28286-{
28287- asm volatile("movl %0,%%cr2": :"r" (val));
28288-}
28289-
28290-static inline unsigned long xen_read_cr3(void)
28291-{
28292- unsigned long val;
28293- asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
28294- return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28295-}
28296-
28297-static inline void xen_write_cr3(unsigned long val)
28298-{
28299- val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28300- asm volatile("movl %0,%%cr3": :"r" (val));
28301-}
28302-
28303-static inline unsigned long xen_read_cr4(void)
28304-{
28305- unsigned long val;
28306- asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
28307- return val;
28308-}
28309-
28310-static inline unsigned long xen_read_cr4_safe(void)
28311-{
28312- unsigned long val;
28313- /* This could fault if %cr4 does not exist */
28314- asm volatile("1: movl %%cr4, %0 \n"
28315- "2: \n"
28316- ".section __ex_table,\"a\" \n"
28317- ".long 1b,2b \n"
28318- ".previous \n"
28319- : "=r" (val): "0" (0));
28320- return val;
28321-}
28322-
28323-static inline void xen_write_cr4(unsigned long val)
28324-{
28325- asm volatile("movl %0,%%cr4": :"r" (val));
28326-}
28327-
28328-static inline void xen_wbinvd(void)
28329-{
28330- asm volatile("wbinvd": : :"memory");
28331-}
28332-
28333-static inline void clflush(volatile void *__p)
28334-{
28335- asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28336-}
28337-
28338-#define read_cr0() (xen_read_cr0())
28339-#define write_cr0(x) (xen_write_cr0(x))
28340-#define read_cr2() (xen_read_cr2())
28341-#define write_cr2(x) (xen_write_cr2(x))
28342-#define read_cr3() (xen_read_cr3())
28343-#define write_cr3(x) (xen_write_cr3(x))
28344-#define read_cr4() (xen_read_cr4())
28345-#define read_cr4_safe() (xen_read_cr4_safe())
28346-#define write_cr4(x) (xen_write_cr4(x))
28347-#define wbinvd() (xen_wbinvd())
28348-
28349-/* Clear the 'TS' bit */
28350-#define clts() (xen_clts())
28351-
28352-/* Set the 'TS' bit */
28353-#define stts() (HYPERVISOR_fpu_taskswitch(1))
28354-
28355-#endif /* __KERNEL__ */
28356-
28357-static inline unsigned long get_limit(unsigned long segment)
28358-{
28359- unsigned long __limit;
28360- __asm__("lsll %1,%0"
28361- :"=r" (__limit):"r" (segment));
28362- return __limit+1;
28363-}
28364-
28365-#define nop() __asm__ __volatile__ ("nop")
28366-
28367-/*
28368- * Force strict CPU ordering.
28369- * And yes, this is required on UP too when we're talking
28370- * to devices.
28371- *
28372- * For now, "wmb()" doesn't actually do anything, as all
28373- * Intel CPU's follow what Intel calls a *Processor Order*,
28374- * in which all writes are seen in the program order even
28375- * outside the CPU.
28376- *
28377- * I expect future Intel CPU's to have a weaker ordering,
28378- * but I'd also expect them to finally get their act together
28379- * and add some real memory barriers if so.
28380- *
28381- * Some non intel clones support out of order store. wmb() ceases to be a
28382- * nop for these.
28383- */
28384-
28385-
28386-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28387-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28388-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28389-
28390-/**
28391- * read_barrier_depends - Flush all pending reads that subsequents reads
28392- * depend on.
28393- *
28394- * No data-dependent reads from memory-like regions are ever reordered
28395- * over this barrier. All reads preceding this primitive are guaranteed
28396- * to access memory (but not necessarily other CPUs' caches) before any
28397- * reads following this primitive that depend on the data return by
28398- * any of the preceding reads. This primitive is much lighter weight than
28399- * rmb() on most CPUs, and is never heavier weight than is
28400- * rmb().
28401- *
28402- * These ordering constraints are respected by both the local CPU
28403- * and the compiler.
28404- *
28405- * Ordering is not guaranteed by anything other than these primitives,
28406- * not even by data dependencies. See the documentation for
28407- * memory_barrier() for examples and URLs to more information.
28408- *
28409- * For example, the following code would force ordering (the initial
28410- * value of "a" is zero, "b" is one, and "p" is "&a"):
28411- *
28412- * <programlisting>
28413- * CPU 0 CPU 1
28414- *
28415- * b = 2;
28416- * memory_barrier();
28417- * p = &b; q = p;
28418- * read_barrier_depends();
28419- * d = *q;
28420- * </programlisting>
28421- *
28422- * because the read of "*q" depends on the read of "p" and these
28423- * two reads are separated by a read_barrier_depends(). However,
28424- * the following code, with the same initial values for "a" and "b":
28425- *
28426- * <programlisting>
28427- * CPU 0 CPU 1
28428- *
28429- * a = 2;
28430- * memory_barrier();
28431- * b = 3; y = b;
28432- * read_barrier_depends();
28433- * x = a;
28434- * </programlisting>
28435- *
28436- * does not enforce ordering, since there is no data dependency between
28437- * the read of "a" and the read of "b". Therefore, on some CPUs, such
28438- * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
28439- * in cases like this where there are no data dependencies.
28440- **/
28441-
28442-#define read_barrier_depends() do { } while(0)
28443-
28444-#ifdef CONFIG_SMP
28445-#define smp_mb() mb()
28446-#ifdef CONFIG_X86_PPRO_FENCE
28447-# define smp_rmb() rmb()
28448-#else
28449-# define smp_rmb() barrier()
28450-#endif
28451-#ifdef CONFIG_X86_OOSTORE
28452-# define smp_wmb() wmb()
28453-#else
28454-# define smp_wmb() barrier()
28455-#endif
28456-#define smp_read_barrier_depends() read_barrier_depends()
28457-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28458-#else
28459-#define smp_mb() barrier()
28460-#define smp_rmb() barrier()
28461-#define smp_wmb() barrier()
28462-#define smp_read_barrier_depends() do { } while(0)
28463-#define set_mb(var, value) do { var = value; barrier(); } while (0)
28464-#endif
28465-
28466-#include <linux/irqflags.h>
28467-
28468-/*
28469- * disable hlt during certain critical i/o operations
28470- */
28471-#define HAVE_DISABLE_HLT
28472-void disable_hlt(void);
28473-void enable_hlt(void);
28474-
28475-extern int es7000_plat;
28476-void cpu_idle_wait(void);
28477-
28478-extern unsigned long arch_align_stack(unsigned long sp);
28479-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28480-
28481-void default_idle(void);
28482-void __show_registers(struct pt_regs *, int all);
28483-
28484-#endif
28485--- a/include/asm-x86/mach-xen/asm/system_64.h
28486+++ b/include/asm-x86/mach-xen/asm/system_64.h
28487@@ -1,122 +1,9 @@
28488 #ifndef __ASM_SYSTEM_H
28489 #define __ASM_SYSTEM_H
28490
28491-#include <linux/kernel.h>
28492 #include <asm/segment.h>
28493 #include <asm/cmpxchg.h>
28494
28495-#include <asm/synch_bitops.h>
28496-#include <asm/hypervisor.h>
28497-#include <xen/interface/arch-x86_64.h>
28498-
28499-#ifdef __KERNEL__
28500-
28501-/* entries in ARCH_DLINFO: */
28502-#ifdef CONFIG_IA32_EMULATION
28503-# define AT_VECTOR_SIZE_ARCH 2
28504-#else
28505-# define AT_VECTOR_SIZE_ARCH 1
28506-#endif
28507-
28508-#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
28509-#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
28510-
28511-/* frame pointer must be last for get_wchan */
28512-#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
28513-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
28514-
28515-#define __EXTRA_CLOBBER \
28516- ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
28517-
28518-/* Save restore flags to clear handle leaking NT */
28519-#define switch_to(prev,next,last) \
28520- asm volatile(SAVE_CONTEXT \
28521- "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
28522- "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
28523- "call __switch_to\n\t" \
28524- ".globl thread_return\n" \
28525- "thread_return:\n\t" \
28526- "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
28527- "movq %P[thread_info](%%rsi),%%r8\n\t" \
28528- LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
28529- "movq %%rax,%%rdi\n\t" \
28530- "jc ret_from_fork\n\t" \
28531- RESTORE_CONTEXT \
28532- : "=a" (last) \
28533- : [next] "S" (next), [prev] "D" (prev), \
28534- [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
28535- [ti_flags] "i" (offsetof(struct thread_info, flags)),\
28536- [tif_fork] "i" (TIF_FORK), \
28537- [thread_info] "i" (offsetof(struct task_struct, stack)), \
28538- [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
28539- : "memory", "cc" __EXTRA_CLOBBER)
28540-
28541-extern void load_gs_index(unsigned);
28542-
28543-/*
28544- * Load a segment. Fall back on loading the zero
28545- * segment if something goes wrong..
28546- */
28547-#define loadsegment(seg,value) \
28548- asm volatile("\n" \
28549- "1:\t" \
28550- "movl %k0,%%" #seg "\n" \
28551- "2:\n" \
28552- ".section .fixup,\"ax\"\n" \
28553- "3:\t" \
28554- "movl %1,%%" #seg "\n\t" \
28555- "jmp 2b\n" \
28556- ".previous\n" \
28557- ".section __ex_table,\"a\"\n\t" \
28558- ".align 8\n\t" \
28559- ".quad 1b,3b\n" \
28560- ".previous" \
28561- : :"r" (value), "r" (0))
28562-
28563-/*
28564- * Clear and set 'TS' bit respectively
28565- */
28566-#define clts() (HYPERVISOR_fpu_taskswitch(0))
28567-
28568-static inline unsigned long read_cr0(void)
28569-{
28570- unsigned long cr0;
28571- asm volatile("movq %%cr0,%0" : "=r" (cr0));
28572- return cr0;
28573-}
28574-
28575-static inline void write_cr0(unsigned long val)
28576-{
28577- asm volatile("movq %0,%%cr0" :: "r" (val));
28578-}
28579-
28580-#define read_cr2() current_vcpu_info()->arch.cr2
28581-
28582-#define write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28583-
28584-#define read_cr3() ({ \
28585- unsigned long __dummy; \
28586- asm volatile("movq %%cr3,%0" : "=r" (__dummy)); \
28587- machine_to_phys(__dummy); \
28588-})
28589-
28590-static inline void write_cr3(unsigned long val)
28591-{
28592- val = phys_to_machine(val);
28593- asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
28594-}
28595-
28596-static inline unsigned long read_cr4(void)
28597-{
28598- unsigned long cr4;
28599- asm volatile("movq %%cr4,%0" : "=r" (cr4));
28600- return cr4;
28601-}
28602-
28603-static inline void write_cr4(unsigned long val)
28604-{
28605- asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
28606-}
28607
28608 static inline unsigned long read_cr8(void)
28609 {
28610@@ -128,52 +15,6 @@ static inline void write_cr8(unsigned lo
28611 BUG_ON(val);
28612 }
28613
28614-#define stts() (HYPERVISOR_fpu_taskswitch(1))
28615-
28616-#define wbinvd() \
28617- __asm__ __volatile__ ("wbinvd": : :"memory")
28618-
28619-#endif /* __KERNEL__ */
28620-
28621-static inline void clflush(volatile void *__p)
28622-{
28623- asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28624-}
28625-
28626-#define nop() __asm__ __volatile__ ("nop")
28627-
28628-#ifdef CONFIG_SMP
28629-#define smp_mb() mb()
28630-#define smp_rmb() barrier()
28631-#define smp_wmb() barrier()
28632-#define smp_read_barrier_depends() do {} while(0)
28633-#else
28634-#define smp_mb() barrier()
28635-#define smp_rmb() barrier()
28636-#define smp_wmb() barrier()
28637-#define smp_read_barrier_depends() do {} while(0)
28638-#endif
28639-
28640-
28641-/*
28642- * Force strict CPU ordering.
28643- * And yes, this is required on UP too when we're talking
28644- * to devices.
28645- */
28646-#define mb() asm volatile("mfence":::"memory")
28647-#define rmb() asm volatile("lfence":::"memory")
28648-#define wmb() asm volatile("sfence" ::: "memory")
28649-
28650-#define read_barrier_depends() do {} while(0)
28651-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28652-
28653-#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
28654-
28655 #include <linux/irqflags.h>
28656
28657-void cpu_idle_wait(void);
28658-
28659-extern unsigned long arch_align_stack(unsigned long sp);
28660-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28661-
28662 #endif
28663--- a/include/asm-x86/mach-xen/asm/system.h
28664+++ b/include/asm-x86/mach-xen/asm/system.h
28665@@ -1,5 +1,393 @@
28666+#ifndef _ASM_X86_SYSTEM_H_
28667+#define _ASM_X86_SYSTEM_H_
28668+
28669+#include <asm/asm.h>
28670+#include <asm/segment.h>
28671+#include <asm/cpufeature.h>
28672+#include <asm/cmpxchg.h>
28673+#include <asm/nops.h>
28674+#include <asm/hypervisor.h>
28675+
28676+#include <linux/kernel.h>
28677+#include <linux/irqflags.h>
28678+
28679+/* entries in ARCH_DLINFO: */
28680+#ifdef CONFIG_IA32_EMULATION
28681+# define AT_VECTOR_SIZE_ARCH 2
28682+#else
28683+# define AT_VECTOR_SIZE_ARCH 1
28684+#endif
28685+
28686+#ifdef CONFIG_X86_32
28687+
28688+struct task_struct; /* one of the stranger aspects of C forward declarations */
28689+struct task_struct *__switch_to(struct task_struct *prev,
28690+ struct task_struct *next);
28691+
28692+/*
28693+ * Saving eflags is important. It switches not only IOPL between tasks,
28694+ * it also protects other tasks from NT leaking through sysenter etc.
28695+ */
28696+#define switch_to(prev, next, last) do { \
28697+ unsigned long esi, edi; \
28698+ asm volatile("pushfl\n\t" /* Save flags */ \
28699+ "pushl %%ebp\n\t" \
28700+ "movl %%esp,%0\n\t" /* save ESP */ \
28701+ "movl %5,%%esp\n\t" /* restore ESP */ \
28702+ "movl $1f,%1\n\t" /* save EIP */ \
28703+ "pushl %6\n\t" /* restore EIP */ \
28704+ "jmp __switch_to\n" \
28705+ "1:\t" \
28706+ "popl %%ebp\n\t" \
28707+ "popfl" \
28708+ :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \
28709+ "=a" (last), "=S" (esi), "=D" (edi) \
28710+ :"m" (next->thread.sp), "m" (next->thread.ip), \
28711+ "2" (prev), "d" (next)); \
28712+} while (0)
28713+
28714+/*
28715+ * disable hlt during certain critical i/o operations
28716+ */
28717+#define HAVE_DISABLE_HLT
28718+#else
28719+#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
28720+#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
28721+
28722+/* frame pointer must be last for get_wchan */
28723+#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
28724+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
28725+
28726+#define __EXTRA_CLOBBER \
28727+ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
28728+ "r12", "r13", "r14", "r15"
28729+
28730+/* Save/restore flags to clear/handle a leaking NT flag */
28731+#define switch_to(prev, next, last) \
28732+ asm volatile(SAVE_CONTEXT \
28733+ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
28734+ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
28735+ "call __switch_to\n\t" \
28736+ ".globl thread_return\n" \
28737+ "thread_return:\n\t" \
28738+ "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
28739+ "movq %P[thread_info](%%rsi),%%r8\n\t" \
28740+ LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
28741+ "movq %%rax,%%rdi\n\t" \
28742+ "jc ret_from_fork\n\t" \
28743+ RESTORE_CONTEXT \
28744+ : "=a" (last) \
28745+ : [next] "S" (next), [prev] "D" (prev), \
28746+ [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
28747+ [ti_flags] "i" (offsetof(struct thread_info, flags)), \
28748+ [tif_fork] "i" (TIF_FORK), \
28749+ [thread_info] "i" (offsetof(struct task_struct, stack)), \
28750+ [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
28751+ : "memory", "cc" __EXTRA_CLOBBER)
28752+#endif
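/*
 * Annotation (not part of the original patch): a minimal sketch of the call
 * site the switch_to() variants above are written for. In kernel/sched.c of
 * this era, context_switch() ends with roughly:
 *
 *	switch_to(prev, next, prev);
 *	barrier();
 *
 * The third argument is rewritten with the task this CPU actually switched
 * away from, which can differ from the original 'prev' by the time the new
 * task's stack resumes execution; barrier() keeps the compiler from caching
 * the stale value across the switch.
 */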
28753+
28754+#ifdef __KERNEL__
28755+#define _set_base(addr, base) do { unsigned long __pr; \
28756+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28757+ "rorl $16,%%edx\n\t" \
28758+ "movb %%dl,%2\n\t" \
28759+ "movb %%dh,%3" \
28760+ :"=&d" (__pr) \
28761+ :"m" (*((addr)+2)), \
28762+ "m" (*((addr)+4)), \
28763+ "m" (*((addr)+7)), \
28764+ "0" (base) \
28765+ ); } while (0)
28766+
28767+#define _set_limit(addr, limit) do { unsigned long __lr; \
28768+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28769+ "rorl $16,%%edx\n\t" \
28770+ "movb %2,%%dh\n\t" \
28771+ "andb $0xf0,%%dh\n\t" \
28772+ "orb %%dh,%%dl\n\t" \
28773+ "movb %%dl,%2" \
28774+ :"=&d" (__lr) \
28775+ :"m" (*(addr)), \
28776+ "m" (*((addr)+6)), \
28777+ "0" (limit) \
28778+ ); } while (0)
28779+
28780+#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
28781+#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
28782+
28783+extern void load_gs_index(unsigned);
28784+
28785+/*
28786+ * Load a segment. Fall back on loading the zero
28787+ * segment if something goes wrong..
28788+ */
28789+#define loadsegment(seg, value) \
28790+ asm volatile("\n" \
28791+ "1:\t" \
28792+ "movl %k0,%%" #seg "\n" \
28793+ "2:\n" \
28794+ ".section .fixup,\"ax\"\n" \
28795+ "3:\t" \
28796+ "movl %k1, %%" #seg "\n\t" \
28797+ "jmp 2b\n" \
28798+ ".previous\n" \
28799+ _ASM_EXTABLE(1b,3b) \
28800+ : :"r" (value), "r" (0))
28801+
28802+
28803+/*
28804+ * Save a segment register away
28805+ */
28806+#define savesegment(seg, value) \
28807+ asm volatile("mov %%" #seg ",%0":"=rm" (value))
28808+
28809+static inline unsigned long get_limit(unsigned long segment)
28810+{
28811+ unsigned long __limit;
28812+ __asm__("lsll %1,%0"
28813+ :"=r" (__limit):"r" (segment));
28814+ return __limit+1;
28815+}
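/*
 * Annotation (not part of the original patch): a sketch of how the
 * savesegment()/loadsegment() helpers above are typically paired when a
 * segment register must be parked on the null selector around some work.
 * The function name is illustrative only.
 */
static inline void segment_save_restore_sketch(void)
{
	unsigned int sel;

	savesegment(gs, sel);		/* remember the current %gs selector */
	loadsegment(gs, 0);		/* park %gs on the null selector */
	/* ... work that must not see the old user %gs ... */
	loadsegment(gs, sel);		/* put the saved selector back; the
					 * fixup path falls back to 0 if the
					 * selector has gone stale */
}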
28816+
28817+static inline void xen_clts(void)
28818+{
28819+ HYPERVISOR_fpu_taskswitch(0);
28820+}
28821+
28822+static inline void xen_stts(void)
28823+{
28824+ HYPERVISOR_fpu_taskswitch(1);
28825+}
28826+
28827+/*
28828+ * Volatile isn't enough to prevent the compiler from reordering the
28829+ * read/write functions for the control registers and messing everything up.
28830+ * A memory clobber would solve the problem, but would prevent reordering of
28831+ * all loads and stores around it, which can hurt performance. The solution is to
28832+ * use a variable and mimic reads and writes to it to enforce serialization
28833+ */
28834+static unsigned long __force_order;
28835+
28836+static inline unsigned long xen_read_cr0(void)
28837+{
28838+ unsigned long val;
28839+ asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
28840+ return val;
28841+}
28842+
28843+static inline void xen_write_cr0(unsigned long val)
28844+{
28845+ asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
28846+}
28847+
28848+#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28849+#define xen_write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28850+
28851+static inline unsigned long xen_read_cr3(void)
28852+{
28853+ unsigned long val;
28854+ asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
28855+#ifdef CONFIG_X86_32
28856+ return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28857+#else
28858+ return machine_to_phys(val);
28859+#endif
28860+}
28861+
28862+static inline void xen_write_cr3(unsigned long val)
28863+{
28864+#ifdef CONFIG_X86_32
28865+ val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28866+#else
28867+ val = phys_to_machine(val);
28868+#endif
28869+ asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
28870+}
28871+
28872+static inline unsigned long xen_read_cr4(void)
28873+{
28874+ unsigned long val;
28875+ asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
28876+ return val;
28877+}
28878+
28879+#define xen_read_cr4_safe() xen_read_cr4()
28880+
28881+static inline void xen_write_cr4(unsigned long val)
28882+{
28883+ asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
28884+}
28885+
28886+#ifdef CONFIG_X86_64
28887+static inline unsigned long xen_read_cr8(void)
28888+{
28889+ return 0;
28890+}
28891+
28892+static inline void xen_write_cr8(unsigned long val)
28893+{
28894+ BUG_ON(val);
28895+}
28896+#endif
28897+
28898+static inline void xen_wbinvd(void)
28899+{
28900+ asm volatile("wbinvd": : :"memory");
28901+}
28902+#define read_cr0() (xen_read_cr0())
28903+#define write_cr0(x) (xen_write_cr0(x))
28904+#define read_cr2() (xen_read_cr2())
28905+#define write_cr2(x) (xen_write_cr2(x))
28906+#define read_cr3() (xen_read_cr3())
28907+#define write_cr3(x) (xen_write_cr3(x))
28908+#define read_cr4() (xen_read_cr4())
28909+#define read_cr4_safe() (xen_read_cr4_safe())
28910+#define write_cr4(x) (xen_write_cr4(x))
28911+#define wbinvd() (xen_wbinvd())
28912+#ifdef CONFIG_X86_64
28913+#define read_cr8() (xen_read_cr8())
28914+#define write_cr8(x) (xen_write_cr8(x))
28915+#endif
28916+
28917+/* Clear the 'TS' bit */
28918+#define clts() (xen_clts())
28919+#define stts() (xen_stts())
28920+
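/*
 * Annotation (not part of the original patch): the wrappers above keep the
 * native-looking control-register API while routing through the Xen-aware
 * xen_read_crN()/xen_write_crN() helpers. One pattern they have to preserve
 * is the CR3 reload used as a full non-global TLB flush: xen_read_cr3()
 * hands back a pseudo-physical address and xen_write_cr3() converts it back
 * to a machine address, so the round trip below stays well defined. Sketch
 * only; the function name is illustrative.
 */
static inline void reload_cr3_sketch(void)
{
	unsigned long cr3 = read_cr3();	/* pseudo-physical under Xen */

	write_cr3(cr3);			/* reload CR3: flushes non-global
					 * TLB entries on native hardware */
}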
28921+#endif /* __KERNEL__ */
28922+
28923+static inline void clflush(volatile void *__p)
28924+{
28925+ asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
28926+}
28927+
28928+#define nop() __asm__ __volatile__ ("nop")
28929+
28930+void disable_hlt(void);
28931+void enable_hlt(void);
28932+
28933+extern int es7000_plat;
28934+void cpu_idle_wait(void);
28935+
28936+extern unsigned long arch_align_stack(unsigned long sp);
28937+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28938+
28939+void default_idle(void);
28940+
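/*
 * Annotation (not part of the original patch): HAVE_DISABLE_HLT advertises
 * the disable_hlt()/enable_hlt() pair declared above. A driver doing
 * timing-critical polled I/O (historically the floppy driver) brackets the
 * critical region so the idle loop will not sit in HLT meanwhile. Sketch:
 */
static inline void no_hlt_region_sketch(void)
{
	disable_hlt();		/* idle loop must spin instead of halting */
	/* ... timing-critical polled I/O ... */
	enable_hlt();		/* HLT is allowed in the idle loop again */
}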
28941+/*
28942+ * Force strict CPU ordering.
28943+ * And yes, this is required on UP too when we're talking
28944+ * to devices.
28945+ */
28946 #ifdef CONFIG_X86_32
28947-# include "system_32.h"
28948+/*
28949+ * For now, "wmb()" doesn't actually do anything, as all
28950+ * Intel CPUs follow what Intel calls a *Processor Order*,
28951+ * in which all writes are seen in the program order even
28952+ * outside the CPU.
28953+ *
28954+ * I expect future Intel CPUs to have a weaker ordering,
28955+ * but I'd also expect them to finally get their act together
28956+ * and add some real memory barriers if so.
28957+ *
28958+ * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
28959+ * nop for these.
28960+ */
28961+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28962+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28963+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28964+#else
28965+#define mb() asm volatile("mfence":::"memory")
28966+#define rmb() asm volatile("lfence":::"memory")
28967+#define wmb() asm volatile("sfence" ::: "memory")
28968+#endif
28969+
28970+/**
28971+ * read_barrier_depends - Flush all pending reads that subsequent reads
28972+ * depend on.
28973+ *
28974+ * No data-dependent reads from memory-like regions are ever reordered
28975+ * over this barrier. All reads preceding this primitive are guaranteed
28976+ * to access memory (but not necessarily other CPUs' caches) before any
28977+ * reads following this primitive that depend on the data returned by
28978+ * any of the preceding reads. This primitive is much lighter weight than
28979+ * rmb() on most CPUs, and is never heavier weight than is
28980+ * rmb().
28981+ *
28982+ * These ordering constraints are respected by both the local CPU
28983+ * and the compiler.
28984+ *
28985+ * Ordering is not guaranteed by anything other than these primitives,
28986+ * not even by data dependencies. See the documentation for
28987+ * memory_barrier() for examples and URLs to more information.
28988+ *
28989+ * For example, the following code would force ordering (the initial
28990+ * value of "a" is zero, "b" is one, and "p" is "&a"):
28991+ *
28992+ * <programlisting>
28993+ * CPU 0 CPU 1
28994+ *
28995+ * b = 2;
28996+ * memory_barrier();
28997+ * p = &b; q = p;
28998+ * read_barrier_depends();
28999+ * d = *q;
29000+ * </programlisting>
29001+ *
29002+ * because the read of "*q" depends on the read of "p" and these
29003+ * two reads are separated by a read_barrier_depends(). However,
29004+ * the following code, with the same initial values for "a" and "b":
29005+ *
29006+ * <programlisting>
29007+ * CPU 0 CPU 1
29008+ *
29009+ * a = 2;
29010+ * memory_barrier();
29011+ * b = 3; y = b;
29012+ * read_barrier_depends();
29013+ * x = a;
29014+ * </programlisting>
29015+ *
29016+ * does not enforce ordering, since there is no data dependency between
29017+ * the read of "a" and the read of "b". Therefore, on some CPUs, such
29018+ * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
29019+ * in cases like this where there are no data dependencies.
29020+ **/
29021+
29022+#define read_barrier_depends() do { } while (0)
29023+
29024+#ifdef CONFIG_SMP
29025+#define smp_mb() mb()
29026+#ifdef CONFIG_X86_PPRO_FENCE
29027+# define smp_rmb() rmb()
29028 #else
29029-# include "system_64.h"
29030+# define smp_rmb() barrier()
29031+#endif
29032+#ifdef CONFIG_X86_OOSTORE
29033+# define smp_wmb() wmb()
29034+#else
29035+# define smp_wmb() barrier()
29036+#endif
29037+#define smp_read_barrier_depends() read_barrier_depends()
29038+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
29039+#else
29040+#define smp_mb() barrier()
29041+#define smp_rmb() barrier()
29042+#define smp_wmb() barrier()
29043+#define smp_read_barrier_depends() do { } while (0)
29044+#define set_mb(var, value) do { var = value; barrier(); } while (0)
29045+#endif
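/*
 * Annotation (not part of the original patch): the publish/consume pattern
 * that the read_barrier_depends() documentation above walks through, spelled
 * out with the SMP variants. All names here are illustrative only.
 */
struct rbd_payload_sketch {
	int data;
};

static struct rbd_payload_sketch rbd_slot_sketch;
static struct rbd_payload_sketch *rbd_ptr_sketch;

static inline void rbd_publish_sketch(void)	/* runs on CPU 0 */
{
	rbd_slot_sketch.data = 42;	/* initialise the payload ...          */
	smp_wmb();			/* ... and order it before the publish */
	rbd_ptr_sketch = &rbd_slot_sketch;
}

static inline int rbd_consume_sketch(void)	/* runs on CPU 1 */
{
	struct rbd_payload_sketch *p = rbd_ptr_sketch;

	if (!p)
		return -1;
	smp_read_barrier_depends();	/* pairs with the smp_wmb() above */
	return p->data;			/* data dependency: *p is read after p */
}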
29046+
29047+/*
29048+ * Stop RDTSC speculation. This is needed when you need to use RDTSC
29049+ * (or get_cycles or vread that possibly accesses the TSC) in a defined
29050+ * code region.
29051+ *
29052+ * (Could use an alternative three way for this if there was one.)
29053+ */
29054+static inline void rdtsc_barrier(void)
29055+{
29056+ alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
29057+ alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
29058+}
29059+
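/*
 * Annotation (not part of the original patch): how rdtsc_barrier() above is
 * intended to be used, fencing a TSC read on both sides of a measured region
 * so speculation cannot move the read. get_cycles() is the usual TSC
 * accessor from <asm/tsc.h>; the measured body is a stand-in.
 */
static inline unsigned long long tsc_timed_region_sketch(void)
{
	unsigned long long t0, t1;

	rdtsc_barrier();
	t0 = get_cycles();	/* TSC read pinned to region entry */
	/* ... code being measured ... */
	rdtsc_barrier();
	t1 = get_cycles();	/* TSC read pinned to region exit */
	return t1 - t0;
}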
29060 #endif
29061--- a/include/asm-x86/mach-xen/asm/tlbflush_32.h
29062+++ /dev/null
29063@@ -1,99 +0,0 @@
29064-#ifndef _I386_TLBFLUSH_H
29065-#define _I386_TLBFLUSH_H
29066-
29067-#include <linux/mm.h>
29068-#include <asm/processor.h>
29069-
29070-#define __flush_tlb() xen_tlb_flush()
29071-#define __flush_tlb_global() xen_tlb_flush()
29072-#define __flush_tlb_all() xen_tlb_flush()
29073-
29074-#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
29075-
29076-#define __flush_tlb_single(addr) xen_invlpg(addr)
29077-
29078-#define __flush_tlb_one(addr) __flush_tlb_single(addr)
29079-
29080-/*
29081- * TLB flushing:
29082- *
29083- * - flush_tlb() flushes the current mm struct TLBs
29084- * - flush_tlb_all() flushes all processes TLBs
29085- * - flush_tlb_mm(mm) flushes the specified mm context TLB's
29086- * - flush_tlb_page(vma, vmaddr) flushes one page
29087- * - flush_tlb_range(vma, start, end) flushes a range of pages
29088- * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
29089- *
29090- * ..but the i386 has somewhat limited tlb flushing capabilities,
29091- * and page-granular flushes are available only on i486 and up.
29092- */
29093-
29094-#define TLB_FLUSH_ALL 0xffffffff
29095-
29096-
29097-#ifndef CONFIG_SMP
29098-
29099-#include <linux/sched.h>
29100-
29101-#define flush_tlb() __flush_tlb()
29102-#define flush_tlb_all() __flush_tlb_all()
29103-#define local_flush_tlb() __flush_tlb()
29104-
29105-static inline void flush_tlb_mm(struct mm_struct *mm)
29106-{
29107- if (mm == current->active_mm)
29108- __flush_tlb();
29109-}
29110-
29111-static inline void flush_tlb_page(struct vm_area_struct *vma,
29112- unsigned long addr)
29113-{
29114- if (vma->vm_mm == current->active_mm)
29115- __flush_tlb_one(addr);
29116-}
29117-
29118-static inline void flush_tlb_range(struct vm_area_struct *vma,
29119- unsigned long start, unsigned long end)
29120-{
29121- if (vma->vm_mm == current->active_mm)
29122- __flush_tlb();
29123-}
29124-
29125-#else /* SMP */
29126-
29127-#include <asm/smp.h>
29128-
29129-#define local_flush_tlb() \
29130- __flush_tlb()
29131-
29132-#define flush_tlb_all xen_tlb_flush_all
29133-#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
29134-#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29135-#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29136-
29137-#define flush_tlb() flush_tlb_current_task()
29138-
29139-static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
29140-{
29141- flush_tlb_mm(vma->vm_mm);
29142-}
29143-
29144-#define TLBSTATE_OK 1
29145-#define TLBSTATE_LAZY 2
29146-
29147-struct tlb_state
29148-{
29149- struct mm_struct *active_mm;
29150- int state;
29151- char __cacheline_padding[L1_CACHE_BYTES-8];
29152-};
29153-DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
29154-#endif /* SMP */
29155-
29156-static inline void flush_tlb_kernel_range(unsigned long start,
29157- unsigned long end)
29158-{
29159- flush_tlb_all();
29160-}
29161-
29162-#endif /* _I386_TLBFLUSH_H */
29163--- a/include/asm-x86/mach-xen/asm/tlbflush_64.h
29164+++ /dev/null
29165@@ -1,97 +0,0 @@
29166-#ifndef _X8664_TLBFLUSH_H
29167-#define _X8664_TLBFLUSH_H
29168-
29169-#include <linux/mm.h>
29170-#include <linux/sched.h>
29171-#include <asm/processor.h>
29172-#include <asm/system.h>
29173-
29174-#define __flush_tlb() xen_tlb_flush()
29175-
29176-/*
29177- * Global pages have to be flushed a bit differently. Not a real
29178- * performance problem because this does not happen often.
29179- */
29180-#define __flush_tlb_global() xen_tlb_flush()
29181-
29182-#define __flush_tlb_all() __flush_tlb_global()
29183-
29184-#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
29185-
29186-
29187-/*
29188- * TLB flushing:
29189- *
29190- * - flush_tlb() flushes the current mm struct TLBs
29191- * - flush_tlb_all() flushes all processes TLBs
29192- * - flush_tlb_mm(mm) flushes the specified mm context TLB's
29193- * - flush_tlb_page(vma, vmaddr) flushes one page
29194- * - flush_tlb_range(vma, start, end) flushes a range of pages
29195- * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
29196- *
29197- * x86-64 can only flush individual pages or full VMs. For a range flush
29198- * we always do the full VM. Might be worth trying if for a small
29199- * range a few INVLPGs in a row are a win.
29200- */
29201-
29202-#ifndef CONFIG_SMP
29203-
29204-#define flush_tlb() __flush_tlb()
29205-#define flush_tlb_all() __flush_tlb_all()
29206-#define local_flush_tlb() __flush_tlb()
29207-
29208-static inline void flush_tlb_mm(struct mm_struct *mm)
29209-{
29210- if (mm == current->active_mm)
29211- __flush_tlb();
29212-}
29213-
29214-static inline void flush_tlb_page(struct vm_area_struct *vma,
29215- unsigned long addr)
29216-{
29217- if (vma->vm_mm == current->active_mm)
29218- __flush_tlb_one(addr);
29219-}
29220-
29221-static inline void flush_tlb_range(struct vm_area_struct *vma,
29222- unsigned long start, unsigned long end)
29223-{
29224- if (vma->vm_mm == current->active_mm)
29225- __flush_tlb();
29226-}
29227-
29228-#else
29229-
29230-#include <asm/smp.h>
29231-
29232-#define local_flush_tlb() \
29233- __flush_tlb()
29234-
29235-#define flush_tlb_all xen_tlb_flush_all
29236-#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
29237-#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29238-#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29239-
29240-#define flush_tlb() flush_tlb_current_task()
29241-
29242-static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
29243-{
29244- flush_tlb_mm(vma->vm_mm);
29245-}
29246-
29247-#define TLBSTATE_OK 1
29248-#define TLBSTATE_LAZY 2
29249-
29250-/* Roughly an IPI every 20MB with 4k pages for freeing page table
29251- ranges. Cost is about 42k of memory for each CPU. */
29252-#define ARCH_FREE_PTE_NR 5350
29253-
29254-#endif
29255-
29256-static inline void flush_tlb_kernel_range(unsigned long start,
29257- unsigned long end)
29258-{
29259- flush_tlb_all();
29260-}
29261-
29262-#endif /* _X8664_TLBFLUSH_H */
29263--- a/include/asm-x86/mach-xen/asm/tlbflush.h
29264+++ b/include/asm-x86/mach-xen/asm/tlbflush.h
29265@@ -1,5 +1,106 @@
29266+#ifndef _ASM_X86_TLBFLUSH_H
29267+#define _ASM_X86_TLBFLUSH_H
29268+
29269+#include <linux/mm.h>
29270+#include <linux/sched.h>
29271+
29272+#include <asm/processor.h>
29273+#include <asm/system.h>
29274+
29275+#define __flush_tlb() xen_tlb_flush()
29276+#define __flush_tlb_global() xen_tlb_flush()
29277+#define __flush_tlb_single(addr) xen_invlpg(addr)
29278+#define __flush_tlb_all() xen_tlb_flush()
29279+#define __flush_tlb_one(addr) xen_invlpg(addr)
29280+
29281 #ifdef CONFIG_X86_32
29282-# include "tlbflush_32.h"
29283+# define TLB_FLUSH_ALL 0xffffffff
29284 #else
29285-# include "tlbflush_64.h"
29286+# define TLB_FLUSH_ALL -1ULL
29287 #endif
29288+
29289+/*
29290+ * TLB flushing:
29291+ *
29292+ * - flush_tlb() flushes the current mm struct TLBs
29293+ * - flush_tlb_all() flushes all processes TLBs
29294+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
29295+ * - flush_tlb_page(vma, vmaddr) flushes one page
29296+ * - flush_tlb_range(vma, start, end) flushes a range of pages
29297+ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
29298+ *
29299+ * ..but the i386 has somewhat limited tlb flushing capabilities,
29300+ * and page-granular flushes are available only on i486 and up.
29301+ *
29302+ * x86-64 can only flush individual pages or full VMs. For a range flush
29303+ * we always do the full VM. Might be worth trying if for a small
29304+ * range a few INVLPGs in a row are a win.
29305+ */
29306+
29307+#ifndef CONFIG_SMP
29308+
29309+#define flush_tlb() __flush_tlb()
29310+#define flush_tlb_all() __flush_tlb_all()
29311+#define local_flush_tlb() __flush_tlb()
29312+
29313+static inline void flush_tlb_mm(struct mm_struct *mm)
29314+{
29315+ if (mm == current->active_mm)
29316+ __flush_tlb();
29317+}
29318+
29319+static inline void flush_tlb_page(struct vm_area_struct *vma,
29320+ unsigned long addr)
29321+{
29322+ if (vma->vm_mm == current->active_mm)
29323+ __flush_tlb_one(addr);
29324+}
29325+
29326+static inline void flush_tlb_range(struct vm_area_struct *vma,
29327+ unsigned long start, unsigned long end)
29328+{
29329+ if (vma->vm_mm == current->active_mm)
29330+ __flush_tlb();
29331+}
29332+
29333+#else /* SMP */
29334+
29335+#include <asm/smp.h>
29336+
29337+#define local_flush_tlb() __flush_tlb()
29338+
29339+#define flush_tlb_all xen_tlb_flush_all
29340+#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
29341+#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29342+#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29343+
29344+#define flush_tlb() flush_tlb_current_task()
29345+
29346+static inline void flush_tlb_range(struct vm_area_struct *vma,
29347+ unsigned long start, unsigned long end)
29348+{
29349+ flush_tlb_mm(vma->vm_mm);
29350+}
29351+
29352+#define TLBSTATE_OK 1
29353+#define TLBSTATE_LAZY 2
29354+
29355+#ifdef CONFIG_X86_32
29356+struct tlb_state
29357+{
29358+ struct mm_struct *active_mm;
29359+ int state;
29360+ char __cacheline_padding[L1_CACHE_BYTES-8];
29361+};
29362+DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
29363+#endif
29364+
29365+#endif /* SMP */
29366+
29367+static inline void flush_tlb_kernel_range(unsigned long start,
29368+ unsigned long end)
29369+{
29370+ flush_tlb_all();
29371+}
29372+
29373+#endif /* _ASM_X86_TLBFLUSH_H */
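The unified tlbflush.h above is what generic mm code ends up calling after it
rewrites page-table entries; under Xen the per-page and per-mm flushes become
xen_invlpg_mask()/xen_tlb_flush_mask() operations over the mm's cpu_vm_mask.
A minimal sketch of a caller, assuming a kernel-internal helper that has just
updated the PTE mapping one user address (the helper name is illustrative and
not part of this patch):

static inline void pte_update_flush_sketch(struct vm_area_struct *vma,
					   unsigned long addr)
{
	/* ... the PTE mapping 'addr' has just been modified ... */
	flush_tlb_page(vma, addr);	/* one-page flush; on SMP this expands
					 * to xen_invlpg_mask() over the mm's
					 * cpu_vm_mask */
}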
29374--- a/include/asm-x86/mach-xen/irq_vectors.h
29375+++ b/include/asm-x86/mach-xen/irq_vectors.h
29376@@ -82,7 +82,8 @@
29377
29378 #define RESCHEDULE_VECTOR 0
29379 #define CALL_FUNCTION_VECTOR 1
29380-#define NR_IPIS 2
29381+#define SPIN_UNLOCK_VECTOR 2
29382+#define NR_IPIS 3
29383
29384 /*
29385 * The maximum number of vectors supported by i386 processors
29386--- a/include/asm-x86/mmu.h
29387+++ b/include/asm-x86/mmu.h
29388@@ -23,7 +23,7 @@ typedef struct {
29389 void *vdso;
29390 } mm_context_t;
29391
29392-#ifdef CONFIG_SMP
29393+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
29394 void leave_mm(int cpu);
29395 #else
29396 static inline void leave_mm(int cpu)
29397--- a/include/asm-x86/ptrace.h
29398+++ b/include/asm-x86/ptrace.h
29399@@ -249,7 +249,9 @@ extern void user_enable_single_step(stru
29400 extern void user_disable_single_step(struct task_struct *);
29401
29402 extern void user_enable_block_step(struct task_struct *);
29403-#ifdef CONFIG_X86_DEBUGCTLMSR
29404+#if defined(CONFIG_XEN)
29405+#define arch_has_block_step() (0)
29406+#elif defined(CONFIG_X86_DEBUGCTLMSR)
29407 #define arch_has_block_step() (1)
29408 #else
29409 #define arch_has_block_step() (boot_cpu_data.x86 >= 6)
29410--- a/include/asm-x86/thread_info.h
29411+++ b/include/asm-x86/thread_info.h
29412@@ -94,6 +94,9 @@ struct thread_info {
29413 #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
29414 #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
29415 #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
29416+#ifdef CONFIG_X86_XEN
29417+#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */
29418+#endif
29419
29420 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
29421 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
29422@@ -118,6 +121,7 @@ struct thread_info {
29423 #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
29424 #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
29425 #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
29426+#define _TIF_CSTAR (1 << TIF_CSTAR)
29427
29428 /* work to do in syscall_trace_enter() */
29429 #define _TIF_WORK_SYSCALL_ENTRY \
29430@@ -147,12 +151,12 @@ struct thread_info {
29431 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
29432 _TIF_NOTSC|_TIF_PERFMON_CTXSW)
29433
29434-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29435-#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29436 #else
29437-#define _TIF_WORK_CTXSW_NEXT (_TIF_NOTSC | _TIF_DEBUG)
29438-#define _TIF_WORK_CTXSW_PREV (_TIF_NOTSC)
29439+#define _TIF_WORK_CTXSW (_TIF_NOTSC \
29440+ /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/)
29441 #endif
29442+#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29443+#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29444
29445 #define PREEMPT_ACTIVE 0x10000000
29446
29447--- a/include/asm-x86/time.h
29448+++ b/include/asm-x86/time.h
29449@@ -58,4 +58,10 @@ static inline int native_set_wallclock(u
29450
29451 extern unsigned long __init calibrate_cpu(void);
29452
29453+#ifdef CONFIG_XEN
29454+extern int xen_independent_wallclock(void);
29455+extern unsigned long xen_read_persistent_clock(void);
29456+extern int xen_update_persistent_clock(void);
29457+#endif
29458+
29459 #endif
29460--- a/include/linux/page-flags.h
29461+++ b/include/linux/page-flags.h
29462@@ -101,8 +101,8 @@ enum pageflags {
29463 PG_foreign, /* Page is owned by foreign allocator. */
29464 PG_pinned, /* Cannot alias with PG_owner_priv_1 since
29465 * bad_page() checks include this bit.
29466- * Also cannot use PG_arch_1 since that now
29467- * has a different purpose on x86. */
29468+ * Should not use PG_arch_1 as that may have
29469+ * a different purpose elsewhere. */
29470 #endif
29471 __NR_PAGEFLAGS,
29472
29473--- a/include/linux/pci.h
29474+++ b/include/linux/pci.h
29475@@ -644,6 +644,9 @@ int pcie_set_readrq(struct pci_dev *dev,
29476 void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
29477 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
29478 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
29479+#ifdef CONFIG_XEN
29480+void pci_restore_bars(struct pci_dev *);
29481+#endif
29482
29483 /* ROM control related routines */
29484 void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
29485--- a/include/xen/evtchn.h
29486+++ b/include/xen/evtchn.h
29487@@ -130,12 +130,37 @@ static inline void clear_evtchn(int port
29488 synch_clear_bit(port, s->evtchn_pending);
29489 }
29490
29491+static inline void set_evtchn(int port)
29492+{
29493+ shared_info_t *s = HYPERVISOR_shared_info;
29494+ synch_set_bit(port, s->evtchn_pending);
29495+}
29496+
29497+static inline int test_evtchn(int port)
29498+{
29499+ shared_info_t *s = HYPERVISOR_shared_info;
29500+ return synch_test_bit(port, s->evtchn_pending);
29501+}
29502+
29503 static inline void notify_remote_via_evtchn(int port)
29504 {
29505 struct evtchn_send send = { .port = port };
29506 VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
29507 }
29508
29509+/* Clear an irq's pending state, in preparation for polling on it. */
29510+void xen_clear_irq_pending(int irq);
29511+
29512+/* Set an irq's pending state, to avoid blocking on it. */
29513+void xen_set_irq_pending(int irq);
29514+
29515+/* Test an irq's pending state. */
29516+int xen_test_irq_pending(int irq);
29517+
29518+/* Poll waiting for an irq to become pending. In the usual case, the
29519+ irq will be disabled so it won't deliver an interrupt. */
29520+void xen_poll_irq(int irq);
29521+
29522 /*
29523 * Use these to access the event channel underlying the IRQ handle returned
29524 * by bind_*_to_irqhandler().
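The irq_vectors.h hunk earlier adds a SPIN_UNLOCK_VECTOR IPI, and the
evtchn.h additions above expose clear/set/test/poll operations on an IRQ's
pending state. Together they support a park-and-kick wait: a CPU blocks in
Xen on an event channel until another CPU kicks it. A hedged sketch of the
waiting side, with the polled IRQ passed in as a placeholder (this is not
code from the patch):

static void poll_until_kicked_sketch(int poll_irq)
{
	xen_clear_irq_pending(poll_irq);	/* forget any stale kick */

	/* re-check the wake-up condition here before blocking ... */

	xen_poll_irq(poll_irq);			/* block in Xen until the
						 * event channel fires */

	if (!xen_test_irq_pending(poll_irq))
		return;				/* woken without a kick:
						 * caller should loop */
	/* kicked: the resource we were waiting for should now be free */
}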
29525--- a/kernel/sysctl_check.c
29526+++ b/kernel/sysctl_check.c
29527@@ -899,7 +899,7 @@ static const struct trans_ctl_table tran
29528 };
29529
29530 #ifdef CONFIG_XEN
29531-static struct trans_ctl_table trans_xen_table[] = {
29532+static const struct trans_ctl_table trans_xen_table[] = {
29533 { CTL_XEN_INDEPENDENT_WALLCLOCK, "independent_wallclock" },
29534 { CTL_XEN_PERMITTED_CLOCK_JITTER, "permitted_clock_jitter" },
29535 {}
29536--- a/lib/swiotlb-xen.c
29537+++ b/lib/swiotlb-xen.c
29538@@ -30,7 +30,6 @@
29539 #include <asm/gnttab_dma.h>
29540
29541 int swiotlb;
29542-EXPORT_SYMBOL(swiotlb);
29543
29544 #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
29545
29546@@ -289,6 +288,15 @@ __sync_single(struct phys_addr buffer, c
29547 }
29548 }
29549
29550+static inline unsigned int is_span_boundary(unsigned int index,
29551+ unsigned int nslots,
29552+ unsigned long offset_slots,
29553+ unsigned long max_slots)
29554+{
29555+ unsigned long offset = (offset_slots + index) & (max_slots - 1);
29556+ return offset + nslots > max_slots;
29557+}
29558+
29559 /*
29560 * Allocates bounce buffer and returns its kernel virtual address.
29561 */
29562@@ -300,6 +308,15 @@ map_single(struct device *hwdev, struct
29563 unsigned int nslots, stride, index, wrap;
29564 struct phys_addr slot_buf;
29565 int i;
29566+ unsigned long mask;
29567+ unsigned long offset_slots;
29568+ unsigned long max_slots;
29569+
29570+ mask = dma_get_seg_boundary(hwdev);
29571+ offset_slots = -IO_TLB_SEGSIZE;
29572+ max_slots = mask + 1
29573+ ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
29574+ : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
29575
29576 /*
29577 * For mappings greater than a page, we limit the stride (and
29578@@ -319,12 +336,21 @@ map_single(struct device *hwdev, struct
29579 */
29580 spin_lock_irqsave(&io_tlb_lock, flags);
29581 {
29582- wrap = index = ALIGN(io_tlb_index, stride);
29583-
29584+ index = ALIGN(io_tlb_index, stride);
29585 if (index >= iotlb_nslabs)
29586- wrap = index = 0;
29587+ index = 0;
29588+ wrap = index;
29589
29590 do {
29591+ while (is_span_boundary(index, nslots, offset_slots,
29592+ max_slots)) {
29593+ index += stride;
29594+ if (index >= iotlb_nslabs)
29595+ index = 0;
29596+ if (index == wrap)
29597+ goto not_found;
29598+ }
29599+
29600 /*
29601 * If we find a slot that indicates we have 'nslots'
29602 * number of contiguous buffers, we allocate the
29603@@ -359,6 +385,7 @@ map_single(struct device *hwdev, struct
29604 index = 0;
29605 } while (index != wrap);
29606
29607+ not_found:
29608 spin_unlock_irqrestore(&io_tlb_lock, flags);
29609 return NULL;
29610 }
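The map_single() changes above make the bounce-buffer allocator skip slot
runs that would cross the device's DMA segment boundary (taken from
dma_get_seg_boundary()); the test reduces to modular arithmetic on slot
indices. A small, self-contained demonstration of that arithmetic with
made-up numbers (a 64 KiB boundary and 2 KiB slots, so 32 slots per segment):

#include <stdio.h>

/* Mirrors the is_span_boundary() helper added above: does a run of
 * 'nslots' slots starting at 'index' cross a 'max_slots' boundary? */
static unsigned int is_span_boundary(unsigned int index, unsigned int nslots,
				     unsigned long offset_slots,
				     unsigned long max_slots)
{
	unsigned long offset = (offset_slots + index) & (max_slots - 1);
	return offset + nslots > max_slots;
}

int main(void)
{
	unsigned long max_slots = 32;	/* 64 KiB boundary / 2 KiB slots */

	printf("%u\n", is_span_boundary(30, 4, 0, max_slots));	/* 1: crosses */
	printf("%u\n", is_span_boundary(28, 4, 0, max_slots));	/* 0: fits    */
	return 0;
}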