From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: [PATCH] Linux: Update to 2.6.27
Patch-mainline: 2.6.27

 This patch contains the differences between Linux 2.6.26 and 2.6.27.

Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.27" by xen-port-patches.py

10Index: head-2008-12-01/arch/x86/Kconfig
11===================================================================
12--- head-2008-12-01.orig/arch/x86/Kconfig 2008-12-01 11:44:55.000000000 +0100
13+++ head-2008-12-01/arch/x86/Kconfig 2008-12-01 11:49:07.000000000 +0100
14@@ -590,7 +590,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
15 config AMD_IOMMU
16 bool "AMD IOMMU support"
17 select SWIOTLB
18- depends on X86_64 && PCI && ACPI
19+ depends on X86_64 && PCI && ACPI && !X86_64_XEN
20 help
21 With this option you can enable support for AMD IOMMU hardware in
22 your system. An IOMMU is a hardware component which provides
23@@ -625,8 +625,10 @@ config MAXSMP
24
25 config NR_CPUS
26 int "Maximum number of CPUs (2-512)" if !MAXSMP
27+ range 2 32 if XEN
28 range 2 512
29 depends on SMP
30+ default "32" if MAXSMP && XEN
31 default "4096" if MAXSMP
32 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
33 default "16" if X86_64_XEN
34@@ -1223,7 +1225,7 @@ config MTRR
35 config MTRR_SANITIZER
36 bool
37 prompt "MTRR cleanup support"
38- depends on MTRR
39+ depends on MTRR && !XEN
40 help
41 Convert MTRR layout from continuous to discrete, so X drivers can
42 add writeback entries.
43Index: head-2008-12-01/arch/x86/Kconfig.debug
44===================================================================
45--- head-2008-12-01.orig/arch/x86/Kconfig.debug 2008-12-01 11:37:10.000000000 +0100
46+++ head-2008-12-01/arch/x86/Kconfig.debug 2008-12-01 11:49:07.000000000 +0100
47@@ -25,6 +25,7 @@ config STRICT_DEVMEM
48 config X86_VERBOSE_BOOTUP
49 bool "Enable verbose x86 bootup info messages"
50 default y
51+ depends on !XEN
52 help
53 Enables the informational output from the decompression stage
54 (e.g. bzImage) of the boot. If you disable this you will still
55@@ -166,7 +167,7 @@ config MMIOTRACE_HOOKS
56
57 config MMIOTRACE
58 bool "Memory mapped IO tracing"
59- depends on DEBUG_KERNEL && PCI
60+ depends on DEBUG_KERNEL && PCI && !XEN
61 select TRACING
62 select MMIOTRACE_HOOKS
63 help
64Index: head-2008-12-01/arch/x86/Makefile
65===================================================================
66--- head-2008-12-01.orig/arch/x86/Makefile 2008-12-01 11:36:55.000000000 +0100
67+++ head-2008-12-01/arch/x86/Makefile 2008-12-01 11:49:07.000000000 +0100
68@@ -116,8 +116,8 @@ mflags-$(CONFIG_X86_VOYAGER) := -Iinclud
69 mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
70
71 # Xen subarch support
72-mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen
73-mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/
74+mflags-$(CONFIG_XEN) := -Iinclude/asm-x86/mach-xen
75+mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
76
77 # generic subarchitecture
78 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
79@@ -128,8 +128,10 @@ mcore-$(CONFIG_X86_GENERICARCH) := arch/
80 mflags-y += -Iinclude/asm-x86/mach-default
81
82 # 64 bit does not support subarch support - clear sub arch variables
83+ifneq ($(CONFIG_XEN),y)
84 fcore-$(CONFIG_X86_64) :=
85 mcore-$(CONFIG_X86_64) :=
86+endif
87
88 KBUILD_CFLAGS += $(mflags-y)
89 KBUILD_AFLAGS += $(mflags-y)
90Index: head-2008-12-01/arch/x86/ia32/ia32entry-xen.S
91===================================================================
92--- head-2008-12-01.orig/arch/x86/ia32/ia32entry-xen.S 2008-12-01 11:44:55.000000000 +0100
93+++ head-2008-12-01/arch/x86/ia32/ia32entry-xen.S 2008-12-01 11:49:07.000000000 +0100
94@@ -15,6 +15,16 @@
95 #include <asm/irqflags.h>
96 #include <linux/linkage.h>
97
98+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
99+#include <linux/elf-em.h>
100+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
101+#define __AUDIT_ARCH_LE 0x40000000
102+
103+#ifndef CONFIG_AUDITSYSCALL
104+#define sysexit_audit int_ret_from_sys_call
105+#define sysretl_audit int_ret_from_sys_call
106+#endif
107+
108 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
109
110 .macro IA32_ARG_FIXUP noebp=0
111@@ -37,6 +47,11 @@
112 movq %rax,R8(%rsp)
113 .endm
114
115+ /*
116+ * Reload arg registers from stack in case ptrace changed them.
117+ * We don't reload %eax because syscall_trace_enter() returned
118+ * the value it wants us to use in the table lookup.
119+ */
120 .macro LOAD_ARGS32 offset
121 movl \offset(%rsp),%r11d
122 movl \offset+8(%rsp),%r10d
123@@ -46,7 +61,6 @@
124 movl \offset+48(%rsp),%edx
125 movl \offset+56(%rsp),%esi
126 movl \offset+64(%rsp),%edi
127- movl \offset+72(%rsp),%eax
128 .endm
129
130 .macro CFI_STARTPROC32 simple
131@@ -61,6 +75,19 @@
132 CFI_UNDEFINED r15
133 .endm
134
135+#ifdef CONFIG_PARAVIRT
136+ENTRY(native_usergs_sysret32)
137+ swapgs
138+ sysretl
139+ENDPROC(native_usergs_sysret32)
140+
141+ENTRY(native_irq_enable_sysexit)
142+ swapgs
143+ sti
144+ sysexit
145+ENDPROC(native_irq_enable_sysexit)
146+#endif
147+
148 /*
149 * 32bit SYSENTER instruction entry.
150 *
151@@ -98,7 +125,7 @@ ENTRY(ia32_sysenter_target)
152 CFI_RESTORE rcx
153 movl %ebp,%ebp /* zero extension */
154 movl %eax,%eax
155- movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
156+ movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
157 movl $__USER32_DS,40(%rsp)
158 movq %rbp,32(%rsp)
159 movl $__USER32_CS,16(%rsp)
160@@ -113,19 +140,79 @@ ENTRY(ia32_sysenter_target)
161 .quad 1b,ia32_badarg
162 .previous
163 GET_THREAD_INFO(%r10)
164- orl $TS_COMPAT,threadinfo_status(%r10)
165- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
166+ orl $TS_COMPAT,TI_status(%r10)
167+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
168 jnz sysenter_tracesys
169-sysenter_do_call:
170 cmpl $(IA32_NR_syscalls-1),%eax
171 ja ia32_badsys
172+sysenter_do_call:
173 IA32_ARG_FIXUP 1
174+sysenter_dispatch:
175 call *ia32_sys_call_table(,%rax,8)
176 movq %rax,RAX-ARGOFFSET(%rsp)
177+ GET_THREAD_INFO(%r10)
178+ DISABLE_INTERRUPTS(CLBR_NONE)
179+ TRACE_IRQS_OFF
180+ testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
181+ jnz sysexit_audit
182+ jmp int_ret_from_sys_call
183+
184+#ifdef CONFIG_AUDITSYSCALL
185+ .macro auditsys_entry_common
186+ movl %esi,%r9d /* 6th arg: 4th syscall arg */
187+ movl %edx,%r8d /* 5th arg: 3rd syscall arg */
188+ /* (already in %ecx) 4th arg: 2nd syscall arg */
189+ movl %ebx,%edx /* 3rd arg: 1st syscall arg */
190+ movl %eax,%esi /* 2nd arg: syscall number */
191+ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
192+ call audit_syscall_entry
193+ movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
194+ cmpl $(IA32_NR_syscalls-1),%eax
195+ ja ia32_badsys
196+ movl %ebx,%edi /* reload 1st syscall arg */
197+ movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
198+ movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
199+ movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
200+ movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
201+ .endm
202+
203+ .macro auditsys_exit exit,ebpsave=RBP
204+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
205+ jnz int_ret_from_sys_call
206+ TRACE_IRQS_ON
207+ ENABLE_INTERRUPTS(CLBR_NONE)
208+ movl %eax,%esi /* second arg, syscall return value */
209+ cmpl $0,%eax /* is it < 0? */
210+ setl %al /* 1 if so, 0 if not */
211+ movzbl %al,%edi /* zero-extend that into %edi */
212+ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
213+ call audit_syscall_exit
214+ GET_THREAD_INFO(%r10)
215+ movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
216+ movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
217+ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
218+ DISABLE_INTERRUPTS(CLBR_NONE)
219+ TRACE_IRQS_OFF
220+ testl %edi,TI_flags(%r10)
221+ jnz int_with_check
222 jmp int_ret_from_sys_call
223+ .endm
224+
225+sysenter_auditsys:
226+ auditsys_entry_common
227+ movl %ebp,%r9d /* reload 6th syscall arg */
228+ jmp sysenter_dispatch
229+
230+sysexit_audit:
231+ auditsys_exit sysexit_from_sys_call
232+#endif
233
234 sysenter_tracesys:
235 xchgl %r9d,%ebp
236+#ifdef CONFIG_AUDITSYSCALL
237+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
238+ jz sysenter_auditsys
239+#endif
240 SAVE_REST
241 CLEAR_RREGS
242 movq %r9,R9(%rsp)
243@@ -186,18 +273,38 @@ ENTRY(ia32_cstar_target)
244 .quad 1b,ia32_badarg
245 .previous
246 GET_THREAD_INFO(%r10)
247- orl $TS_COMPAT,threadinfo_status(%r10)
248- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
249+ orl $TS_COMPAT,TI_status(%r10)
250+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
251 jnz cstar_tracesys
252 cstar_do_call:
253 cmpl $IA32_NR_syscalls-1,%eax
254 ja ia32_badsys
255 IA32_ARG_FIXUP 1
256+cstar_dispatch:
257 call *ia32_sys_call_table(,%rax,8)
258 movq %rax,RAX-ARGOFFSET(%rsp)
259+ GET_THREAD_INFO(%r10)
260+ DISABLE_INTERRUPTS(CLBR_NONE)
261+ testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
262+ jnz sysretl_audit
263 jmp int_ret_from_sys_call
264
265-cstar_tracesys:
266+#ifdef CONFIG_AUDITSYSCALL
267+cstar_auditsys:
268+ movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
269+ auditsys_entry_common
270+ movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
271+ jmp cstar_dispatch
272+
273+sysretl_audit:
274+ auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
275+#endif
276+
277+cstar_tracesys:
278+#ifdef CONFIG_AUDITSYSCALL
279+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
280+ jz cstar_auditsys
281+#endif
282 xchgl %r9d,%ebp
283 SAVE_REST
284 CLEAR_RREGS
285@@ -263,8 +370,8 @@ ENTRY(ia32_syscall)
286 this could be a problem. */
287 SAVE_ARGS 0,0,1
288 GET_THREAD_INFO(%r10)
289- orl $TS_COMPAT,threadinfo_status(%r10)
290- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
291+ orl $TS_COMPAT,TI_status(%r10)
292+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
293 jnz ia32_tracesys
294 ia32_do_syscall:
295 cmpl $(IA32_NR_syscalls-1),%eax
296@@ -309,13 +416,11 @@ quiet_ni_syscall:
297 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
298 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
299 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
300- PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
301 PTREGSCALL stub32_execve, sys32_execve, %rcx
302 PTREGSCALL stub32_fork, sys_fork, %rdi
303 PTREGSCALL stub32_clone, sys32_clone, %rdx
304 PTREGSCALL stub32_vfork, sys_vfork, %rdi
305 PTREGSCALL stub32_iopl, sys_iopl, %rsi
306- PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
307
308 ENTRY(ia32_ptregs_common)
309 popq %r11
310@@ -415,7 +520,7 @@ ia32_sys_call_table:
311 .quad sys_ssetmask
312 .quad sys_setreuid16 /* 70 */
313 .quad sys_setregid16
314- .quad stub32_sigsuspend
315+ .quad sys32_sigsuspend
316 .quad compat_sys_sigpending
317 .quad sys_sethostname
318 .quad compat_sys_setrlimit /* 75 */
319@@ -522,7 +627,7 @@ ia32_sys_call_table:
320 .quad sys32_rt_sigpending
321 .quad compat_sys_rt_sigtimedwait
322 .quad sys32_rt_sigqueueinfo
323- .quad stub32_rt_sigsuspend
324+ .quad sys_rt_sigsuspend
325 .quad sys32_pread /* 180 */
326 .quad sys32_pwrite
327 .quad sys_chown16
328@@ -670,4 +775,10 @@ ia32_sys_call_table:
329 .quad sys32_fallocate
330 .quad compat_sys_timerfd_settime /* 325 */
331 .quad compat_sys_timerfd_gettime
332+ .quad compat_sys_signalfd4
333+ .quad sys_eventfd2
334+ .quad sys_epoll_create1
335+ .quad sys_dup3 /* 330 */
336+ .quad sys_pipe2
337+ .quad sys_inotify_init1
338 ia32_syscall_end:
339Index: head-2008-12-01/arch/x86/kernel/Makefile
340===================================================================
341--- head-2008-12-01.orig/arch/x86/kernel/Makefile 2008-12-01 11:44:55.000000000 +0100
342+++ head-2008-12-01/arch/x86/kernel/Makefile 2008-12-01 11:49:07.000000000 +0100
343@@ -120,9 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
344
345 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
346
347- obj-$(CONFIG_XEN) += nmi_64.o
348+ obj-$(CONFIG_XEN) += nmi.o
349 time_64-$(CONFIG_XEN) += time_32.o
350 endif
351
352-disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
353- pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
354+disabled-obj-$(CONFIG_XEN) := bios_uv.o early-quirks.o hpet.o i8253.o \
355+ i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \
356+ tlb_$(BITS).o tsc.o tsc_sync.o vsmp_64.o
357Index: head-2008-12-01/arch/x86/kernel/acpi/boot.c
358===================================================================
359--- head-2008-12-01.orig/arch/x86/kernel/acpi/boot.c 2008-12-01 11:44:55.000000000 +0100
360+++ head-2008-12-01/arch/x86/kernel/acpi/boot.c 2008-12-01 11:49:07.000000000 +0100
361@@ -951,7 +951,9 @@ void __init mp_register_ioapic(int id, u
362 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
363 mp_ioapics[idx].mp_apicaddr = address;
364
365+#ifndef CONFIG_XEN
366 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
367+#endif
368 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
369 #ifdef CONFIG_X86_32
370 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
371@@ -1108,7 +1110,7 @@ int mp_register_gsi(u32 gsi, int trigger
372 {
373 int ioapic;
374 int ioapic_pin;
375-#ifdef CONFIG_X86_32
376+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
377 #define MAX_GSI_NUM 4096
378 #define IRQ_COMPRESSION_START 64
379
380@@ -1156,7 +1158,7 @@ int mp_register_gsi(u32 gsi, int trigger
381 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
382 pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
383 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
384-#ifdef CONFIG_X86_32
385+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
386 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
387 #else
388 return gsi;
389@@ -1164,7 +1166,7 @@ int mp_register_gsi(u32 gsi, int trigger
390 }
391
392 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
393-#ifdef CONFIG_X86_32
394+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
395 /*
396 * For GSI >= 64, use IRQ compression
397 */
398Index: head-2008-12-01/arch/x86/kernel/acpi/sleep-xen.c
399===================================================================
400--- head-2008-12-01.orig/arch/x86/kernel/acpi/sleep-xen.c 2008-12-01 11:44:55.000000000 +0100
401+++ head-2008-12-01/arch/x86/kernel/acpi/sleep-xen.c 2008-12-01 11:49:07.000000000 +0100
402@@ -9,6 +9,7 @@
403 #include <linux/bootmem.h>
404 #include <linux/dmi.h>
405 #include <linux/cpumask.h>
406+#include <asm/segment.h>
407
408 #include "realmode/wakeup.h"
409 #include "sleep.h"
410@@ -20,7 +21,7 @@ unsigned long acpi_realmode_flags;
411 /* address in low memory of the wakeup routine. */
412 static unsigned long acpi_realmode;
413
414-#ifdef CONFIG_64BIT
415+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
416 static char temp_stack[10240];
417 #endif
418 #endif
419@@ -54,18 +55,27 @@ int acpi_save_state_mem(void)
420 header->video_mode = saved_video_mode;
421
422 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
423+
424+ /*
425+ * Set up the wakeup GDT. We set these up as Big Real Mode,
426+ * that is, with limits set to 4 GB. At least the Lenovo
427+ * Thinkpad X61 is known to need this for the video BIOS
428+ * initialization quirk to work; this is likely to also
429+ * be the case for other laptops or integrated video devices.
430+ */
431+
432 /* GDT[0]: GDT self-pointer */
433 header->wakeup_gdt[0] =
434 (u64)(sizeof(header->wakeup_gdt) - 1) +
435 ((u64)(acpi_wakeup_address +
436 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
437 << 16);
438- /* GDT[1]: real-mode-like code segment */
439- header->wakeup_gdt[1] = (0x009bULL << 40) +
440- ((u64)acpi_wakeup_address << 16) + 0xffff;
441- /* GDT[2]: real-mode-like data segment */
442- header->wakeup_gdt[2] = (0x0093ULL << 40) +
443- ((u64)acpi_wakeup_address << 16) + 0xffff;
444+ /* GDT[1]: big real mode-like code segment */
445+ header->wakeup_gdt[1] =
446+ GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
447+ /* GDT[2]: big real mode-like data segment */
448+ header->wakeup_gdt[2] =
449+ GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
450
451 #ifndef CONFIG_64BIT
452 store_gdt((struct desc_ptr *)&header->pmode_gdt);
453@@ -79,7 +89,7 @@ int acpi_save_state_mem(void)
454 #endif /* !CONFIG_64BIT */
455
456 header->pmode_cr0 = read_cr0();
457- header->pmode_cr4 = read_cr4();
458+ header->pmode_cr4 = read_cr4_safe();
459 header->realmode_flags = acpi_realmode_flags;
460 header->real_magic = 0x12345678;
461
462@@ -89,7 +99,9 @@ int acpi_save_state_mem(void)
463 saved_magic = 0x12345678;
464 #else /* CONFIG_64BIT */
465 header->trampoline_segment = setup_trampoline() >> 4;
466- init_rsp = (unsigned long)temp_stack + 4096;
467+#ifdef CONFIG_SMP
468+ stack_start.sp = temp_stack + 4096;
469+#endif
470 initial_code = (unsigned long)wakeup_long64;
471 saved_magic = 0x123456789abcdef0;
472 #endif /* CONFIG_64BIT */
473@@ -145,6 +157,12 @@ static int __init acpi_sleep_setup(char
474 acpi_realmode_flags |= 2;
475 if (strncmp(str, "s3_beep", 7) == 0)
476 acpi_realmode_flags |= 4;
477+#ifdef CONFIG_HIBERNATION
478+ if (strncmp(str, "s4_nohwsig", 10) == 0)
479+ acpi_no_s4_hw_signature();
480+#endif
481+ if (strncmp(str, "old_ordering", 12) == 0)
482+ acpi_old_suspend_ordering();
483 str = strchr(str, ',');
484 if (str != NULL)
485 str += strspn(str, ", \t");
486Index: head-2008-12-01/arch/x86/kernel/apic_32-xen.c
487===================================================================
488--- head-2008-12-01.orig/arch/x86/kernel/apic_32-xen.c 2008-12-01 11:37:10.000000000 +0100
489+++ head-2008-12-01/arch/x86/kernel/apic_32-xen.c 2008-12-01 11:49:07.000000000 +0100
490@@ -59,7 +59,10 @@ static cpumask_t timer_bcast_ipi;
491 /*
492 * Debug level, exported for io_apic.c
493 */
494-int apic_verbosity;
495+unsigned int apic_verbosity;
496+
497+/* Have we found an MP table */
498+int smp_found_config;
499
500 #ifndef CONFIG_XEN
501 static int modern_apic(void)
502Index: head-2008-12-01/arch/x86/kernel/apic_64-xen.c
503===================================================================
504--- head-2008-12-01.orig/arch/x86/kernel/apic_64-xen.c 2008-12-01 11:37:10.000000000 +0100
505+++ head-2008-12-01/arch/x86/kernel/apic_64-xen.c 2008-12-01 11:49:07.000000000 +0100
506@@ -39,7 +39,10 @@ int disable_apic;
507 /*
508 * Debug level, exported for io_apic.c
509 */
510-int apic_verbosity;
511+unsigned int apic_verbosity;
512+
513+/* Have we found an MP table */
514+int smp_found_config;
515
516 /*
517 * The guts of the apic timer interrupt
518Index: head-2008-12-01/arch/x86/kernel/asm-offsets_64.c
519===================================================================
520--- head-2008-12-01.orig/arch/x86/kernel/asm-offsets_64.c 2008-12-03 15:48:43.000000000 +0100
521+++ head-2008-12-01/arch/x86/kernel/asm-offsets_64.c 2008-12-01 11:49:07.000000000 +0100
522@@ -138,7 +138,7 @@ int main(void)
523
524 BLANK();
525 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
526-#ifdef CONFIG_XEN
527+#ifdef CONFIG_PARAVIRT_XEN
528 BLANK();
529 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
530 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
531Index: head-2008-12-01/arch/x86/kernel/cpu/amd_64.c
532===================================================================
533--- head-2008-12-01.orig/arch/x86/kernel/cpu/amd_64.c 2008-12-03 15:48:43.000000000 +0100
534+++ head-2008-12-01/arch/x86/kernel/cpu/amd_64.c 2008-12-01 11:49:07.000000000 +0100
535@@ -193,6 +193,7 @@ static void __cpuinit init_amd(struct cp
536 fam10h_check_enable_mmcfg();
537 }
538
539+#ifndef CONFIG_XEN
540 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
541 unsigned long long tseg;
542
543@@ -211,6 +212,7 @@ static void __cpuinit init_amd(struct cp
544 set_memory_4k((unsigned long)__va(tseg), 1);
545 }
546 }
547+#endif
548 }
549
550 static struct cpu_dev amd_cpu_dev __cpuinitdata = {
551Index: head-2008-12-01/arch/x86/kernel/cpu/bugs_64.c
552===================================================================
553--- head-2008-12-01.orig/arch/x86/kernel/cpu/bugs_64.c 2008-12-03 15:48:43.000000000 +0100
554+++ head-2008-12-01/arch/x86/kernel/cpu/bugs_64.c 2008-12-01 11:49:07.000000000 +0100
555@@ -20,6 +20,7 @@ void __init check_bugs(void)
556 #endif
557 alternative_instructions();
558
559+#ifndef CONFIG_XEN
560 /*
561 * Make sure the first 2MB area is not mapped by huge pages
562 * There are typically fixed size MTRRs in there and overlapping
563@@ -30,4 +31,5 @@ void __init check_bugs(void)
564 */
565 if (!direct_gbpages)
566 set_memory_4k((unsigned long)__va(0), 1);
567+#endif
568 }
569Index: head-2008-12-01/arch/x86/kernel/cpu/common-xen.c
570===================================================================
571--- head-2008-12-01.orig/arch/x86/kernel/cpu/common-xen.c 2008-12-01 11:44:55.000000000 +0100
572+++ head-2008-12-01/arch/x86/kernel/cpu/common-xen.c 2008-12-01 11:49:07.000000000 +0100
573@@ -13,6 +13,7 @@
574 #include <asm/mtrr.h>
575 #include <asm/mce.h>
576 #include <asm/pat.h>
577+#include <asm/asm.h>
578 #ifdef CONFIG_X86_LOCAL_APIC
579 #include <asm/mpspec.h>
580 #include <asm/apic.h>
581@@ -341,11 +342,24 @@ static void __init early_cpu_detect(void
582
583 get_cpu_vendor(c, 1);
584
585+ early_get_cap(c);
586+
587 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
588 cpu_devs[c->x86_vendor]->c_early_init)
589 cpu_devs[c->x86_vendor]->c_early_init(c);
590+}
591
592- early_get_cap(c);
593+/*
594+ * The NOPL instruction is supposed to exist on all CPUs with
595+ * family >= 6; unfortunately, that's not true in practice because
596+ * of early VIA chips and (more importantly) broken virtualizers that
597+ * are not easy to detect. In the latter case it doesn't even *fail*
598+ * reliably, so probing for it doesn't even work. Disable it completely
599+ * unless we can find a reliable way to detect all the broken cases.
600+ */
601+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
602+{
603+ clear_cpu_cap(c, X86_FEATURE_NOPL);
604 }
605
606 static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
607@@ -402,8 +416,8 @@ static void __cpuinit generic_identify(s
608 }
609
610 init_scattered_cpuid_features(c);
611+ detect_nopl(c);
612 }
613-
614 }
615
616 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
617@@ -434,7 +448,7 @@ __setup("serialnumber", x86_serial_nr_se
618 /*
619 * This does the hard work of actually picking apart the CPU stuff...
620 */
621-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
622+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
623 {
624 int i;
625
626@@ -448,6 +462,8 @@ void __cpuinit identify_cpu(struct cpuin
627 c->x86_max_cores = 1;
628 c->x86_clflush_size = 32;
629 memset(&c->x86_capability, 0, sizeof c->x86_capability);
630+ if (boot_cpu_has(X86_FEATURE_SYSCALL32))
631+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
632
633 if (!have_cpuid_p()) {
634 /*
635Index: head-2008-12-01/arch/x86/kernel/cpu/common_64-xen.c
636===================================================================
637--- /dev/null 1970-01-01 00:00:00.000000000 +0000
638+++ head-2008-12-01/arch/x86/kernel/cpu/common_64-xen.c 2008-12-01 11:49:07.000000000 +0100
639@@ -0,0 +1,771 @@
640+#include <linux/init.h>
641+#include <linux/kernel.h>
642+#include <linux/sched.h>
643+#include <linux/string.h>
644+#include <linux/bootmem.h>
645+#include <linux/bitops.h>
646+#include <linux/module.h>
647+#include <linux/kgdb.h>
648+#include <linux/topology.h>
649+#include <linux/delay.h>
650+#include <linux/smp.h>
651+#include <linux/percpu.h>
652+#include <asm/i387.h>
653+#include <asm/msr.h>
654+#include <asm/io.h>
655+#include <asm/linkage.h>
656+#include <asm/mmu_context.h>
657+#include <asm/mtrr.h>
658+#include <asm/mce.h>
659+#include <asm/pat.h>
660+#include <asm/asm.h>
661+#include <asm/numa.h>
662+#ifdef CONFIG_X86_LOCAL_APIC
663+#include <asm/mpspec.h>
664+#include <asm/apic.h>
665+#include <mach_apic.h>
666+#elif defined(CONFIG_XEN)
667+#include <mach_apic.h>
668+#endif
669+#include <asm/pda.h>
670+#include <asm/pgtable.h>
671+#include <asm/processor.h>
672+#include <asm/desc.h>
673+#include <asm/atomic.h>
674+#include <asm/proto.h>
675+#include <asm/sections.h>
676+#include <asm/setup.h>
677+#include <asm/genapic.h>
678+
679+#include "cpu.h"
680+
681+/* We need valid kernel segments for data and code in long mode too
682+ * IRET will check the segment types kkeil 2000/10/28
683+ * Also sysret mandates a special GDT layout
684+ */
685+/* The TLS descriptors are currently at a different place compared to i386.
686+ Hopefully nobody expects them at a fixed place (Wine?) */
687+DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
688+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
689+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
690+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
691+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
692+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
693+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
694+} };
695+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
696+
697+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
698+
699+/* Current gdt points %fs at the "master" per-cpu area: after this,
700+ * it's on the real one. */
701+void switch_to_new_gdt(void)
702+{
703+#ifndef CONFIG_XEN
704+ struct desc_ptr gdt_descr;
705+
706+ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
707+ gdt_descr.size = GDT_SIZE - 1;
708+ load_gdt(&gdt_descr);
709+#else
710+ void *va, *gdt_addr = get_cpu_gdt_table(smp_processor_id());
711+ unsigned long frames[16];
712+ unsigned int f = 0;
713+
714+ for (va = gdt_addr; va < gdt_addr + GDT_SIZE; va += PAGE_SIZE) {
715+ frames[f++] = virt_to_mfn(va);
716+ make_page_readonly(va, XENFEAT_writable_descriptor_tables);
717+ }
718+ if (HYPERVISOR_set_gdt(frames, GDT_SIZE / sizeof(struct desc_struct)))
719+ BUG();
720+#endif
721+}
722+
723+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
724+
725+static void __cpuinit default_init(struct cpuinfo_x86 *c)
726+{
727+ display_cacheinfo(c);
728+}
729+
730+static struct cpu_dev __cpuinitdata default_cpu = {
731+ .c_init = default_init,
732+ .c_vendor = "Unknown",
733+};
734+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
735+
736+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
737+{
738+ unsigned int *v;
739+
740+ if (c->extended_cpuid_level < 0x80000004)
741+ return 0;
742+
743+ v = (unsigned int *) c->x86_model_id;
744+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
745+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
746+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
747+ c->x86_model_id[48] = 0;
748+ return 1;
749+}
750+
751+
752+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
753+{
754+ unsigned int n, dummy, ebx, ecx, edx;
755+
756+ n = c->extended_cpuid_level;
757+
758+ if (n >= 0x80000005) {
759+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
760+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
761+ "D cache %dK (%d bytes/line)\n",
762+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
763+ c->x86_cache_size = (ecx>>24) + (edx>>24);
764+ /* On K8 L1 TLB is inclusive, so don't count it */
765+ c->x86_tlbsize = 0;
766+ }
767+
768+ if (n >= 0x80000006) {
769+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
770+ ecx = cpuid_ecx(0x80000006);
771+ c->x86_cache_size = ecx >> 16;
772+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
773+
774+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
775+ c->x86_cache_size, ecx & 0xFF);
776+ }
777+}
778+
779+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
780+{
781+#ifdef CONFIG_SMP
782+ u32 eax, ebx, ecx, edx;
783+ int index_msb, core_bits;
784+
785+ cpuid(1, &eax, &ebx, &ecx, &edx);
786+
787+
788+ if (!cpu_has(c, X86_FEATURE_HT))
789+ return;
790+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
791+ goto out;
792+
793+ smp_num_siblings = (ebx & 0xff0000) >> 16;
794+
795+ if (smp_num_siblings == 1) {
796+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
797+ } else if (smp_num_siblings > 1) {
798+
799+ if (smp_num_siblings > NR_CPUS) {
800+ printk(KERN_WARNING "CPU: Unsupported number of "
801+ "siblings %d", smp_num_siblings);
802+ smp_num_siblings = 1;
803+ return;
804+ }
805+
806+ index_msb = get_count_order(smp_num_siblings);
807+ c->phys_proc_id = phys_pkg_id(index_msb);
808+
809+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
810+
811+ index_msb = get_count_order(smp_num_siblings);
812+
813+ core_bits = get_count_order(c->x86_max_cores);
814+
815+ c->cpu_core_id = phys_pkg_id(index_msb) &
816+ ((1 << core_bits) - 1);
817+ }
818+out:
819+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
820+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
821+ c->phys_proc_id);
822+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
823+ c->cpu_core_id);
824+ }
825+
826+#endif
827+}
828+
829+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
830+{
831+ char *v = c->x86_vendor_id;
832+ int i;
833+ static int printed;
834+
835+ for (i = 0; i < X86_VENDOR_NUM; i++) {
836+ if (cpu_devs[i]) {
837+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
838+ (cpu_devs[i]->c_ident[1] &&
839+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
840+ c->x86_vendor = i;
841+ this_cpu = cpu_devs[i];
842+ return;
843+ }
844+ }
845+ }
846+ if (!printed) {
847+ printed++;
848+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
849+ printk(KERN_ERR "CPU: Your system may be unstable.\n");
850+ }
851+ c->x86_vendor = X86_VENDOR_UNKNOWN;
852+}
853+
854+static void __init early_cpu_support_print(void)
855+{
856+ int i,j;
857+ struct cpu_dev *cpu_devx;
858+
859+ printk("KERNEL supported cpus:\n");
860+ for (i = 0; i < X86_VENDOR_NUM; i++) {
861+ cpu_devx = cpu_devs[i];
862+ if (!cpu_devx)
863+ continue;
864+ for (j = 0; j < 2; j++) {
865+ if (!cpu_devx->c_ident[j])
866+ continue;
867+ printk(" %s %s\n", cpu_devx->c_vendor,
868+ cpu_devx->c_ident[j]);
869+ }
870+ }
871+}
872+
873+/*
874+ * The NOPL instruction is supposed to exist on all CPUs with
875+ * family >= 6, unfortunately, that's not true in practice because
876+ * of early VIA chips and (more importantly) broken virtualizers that
877+ * are not easy to detect. Hence, probe for it based on first
878+ * principles.
879+ *
880+ * Note: no 64-bit chip is known to lack these, but put the code here
881+ * for consistency with 32 bits, and to make it utterly trivial to
882+ * diagnose the problem should it ever surface.
883+ */
884+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
885+{
886+ const u32 nopl_signature = 0x888c53b1; /* Random number */
887+ u32 has_nopl = nopl_signature;
888+
889+ clear_cpu_cap(c, X86_FEATURE_NOPL);
890+ if (c->x86 >= 6) {
891+ asm volatile("\n"
892+ "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
893+ "2:\n"
894+ " .section .fixup,\"ax\"\n"
895+ "3: xor %0,%0\n"
896+ " jmp 2b\n"
897+ " .previous\n"
898+ _ASM_EXTABLE(1b,3b)
899+ : "+a" (has_nopl));
900+
901+ if (has_nopl == nopl_signature)
902+ set_cpu_cap(c, X86_FEATURE_NOPL);
903+ }
904+}
905+
906+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
907+
908+void __init early_cpu_init(void)
909+{
910+ struct cpu_vendor_dev *cvdev;
911+
912+ for (cvdev = __x86cpuvendor_start ;
913+ cvdev < __x86cpuvendor_end ;
914+ cvdev++)
915+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
916+ early_cpu_support_print();
917+ early_identify_cpu(&boot_cpu_data);
918+}
919+
920+/* Do some early cpuid on the boot CPU to get some parameter that are
921+ needed before check_bugs. Everything advanced is in identify_cpu
922+ below. */
923+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
924+{
925+ u32 tfms, xlvl;
926+
927+ c->loops_per_jiffy = loops_per_jiffy;
928+ c->x86_cache_size = -1;
929+ c->x86_vendor = X86_VENDOR_UNKNOWN;
930+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
931+ c->x86_vendor_id[0] = '\0'; /* Unset */
932+ c->x86_model_id[0] = '\0'; /* Unset */
933+ c->x86_clflush_size = 64;
934+ c->x86_cache_alignment = c->x86_clflush_size;
935+ c->x86_max_cores = 1;
936+ c->x86_coreid_bits = 0;
937+ c->extended_cpuid_level = 0;
938+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
939+
940+ /* Get vendor name */
941+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
942+ (unsigned int *)&c->x86_vendor_id[0],
943+ (unsigned int *)&c->x86_vendor_id[8],
944+ (unsigned int *)&c->x86_vendor_id[4]);
945+
946+ get_cpu_vendor(c);
947+
948+ /* Initialize the standard set of capabilities */
949+ /* Note that the vendor-specific code below might override */
950+
951+ /* Intel-defined flags: level 0x00000001 */
952+ if (c->cpuid_level >= 0x00000001) {
953+ __u32 misc;
954+ cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
955+ &c->x86_capability[0]);
956+ c->x86 = (tfms >> 8) & 0xf;
957+ c->x86_model = (tfms >> 4) & 0xf;
958+ c->x86_mask = tfms & 0xf;
959+ if (c->x86 == 0xf)
960+ c->x86 += (tfms >> 20) & 0xff;
961+ if (c->x86 >= 0x6)
962+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
963+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
964+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
965+ } else {
966+ /* Have CPUID level 0 only - unheard of */
967+ c->x86 = 4;
968+ }
969+
970+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
971+#ifdef CONFIG_SMP
972+ c->phys_proc_id = c->initial_apicid;
973+#endif
974+ /* AMD-defined flags: level 0x80000001 */
975+ xlvl = cpuid_eax(0x80000000);
976+ c->extended_cpuid_level = xlvl;
977+ if ((xlvl & 0xffff0000) == 0x80000000) {
978+ if (xlvl >= 0x80000001) {
979+ c->x86_capability[1] = cpuid_edx(0x80000001);
980+ c->x86_capability[6] = cpuid_ecx(0x80000001);
981+ }
982+ if (xlvl >= 0x80000004)
983+ get_model_name(c); /* Default name */
984+ }
985+
986+ /* Transmeta-defined flags: level 0x80860001 */
987+ xlvl = cpuid_eax(0x80860000);
988+ if ((xlvl & 0xffff0000) == 0x80860000) {
989+ /* Don't set x86_cpuid_level here for now to not confuse. */
990+ if (xlvl >= 0x80860001)
991+ c->x86_capability[2] = cpuid_edx(0x80860001);
992+ }
993+
994+ if (c->extended_cpuid_level >= 0x80000007)
995+ c->x86_power = cpuid_edx(0x80000007);
996+
997+ if (c->extended_cpuid_level >= 0x80000008) {
998+ u32 eax = cpuid_eax(0x80000008);
999+
1000+ c->x86_virt_bits = (eax >> 8) & 0xff;
1001+ c->x86_phys_bits = eax & 0xff;
1002+ }
1003+
1004+ detect_nopl(c);
1005+
1006+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
1007+ cpu_devs[c->x86_vendor]->c_early_init)
1008+ cpu_devs[c->x86_vendor]->c_early_init(c);
1009+
1010+ validate_pat_support(c);
1011+}
1012+
1013+/*
1014+ * This does the hard work of actually picking apart the CPU stuff...
1015+ */
1016+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
1017+{
1018+ int i;
1019+
1020+ early_identify_cpu(c);
1021+
1022+ init_scattered_cpuid_features(c);
1023+
1024+ c->apicid = phys_pkg_id(0);
1025+
1026+ /*
1027+ * Vendor-specific initialization. In this section we
1028+ * canonicalize the feature flags, meaning if there are
1029+ * features a certain CPU supports which CPUID doesn't
1030+ * tell us, CPUID claiming incorrect flags, or other bugs,
1031+ * we handle them here.
1032+ *
1033+ * At the end of this section, c->x86_capability better
1034+ * indicate the features this CPU genuinely supports!
1035+ */
1036+ if (this_cpu->c_init)
1037+ this_cpu->c_init(c);
1038+
1039+ detect_ht(c);
1040+
1041+ /*
1042+ * On SMP, boot_cpu_data holds the common feature set between
1043+ * all CPUs; so make sure that we indicate which features are
1044+ * common between the CPUs. The first time this routine gets
1045+ * executed, c == &boot_cpu_data.
1046+ */
1047+ if (c != &boot_cpu_data) {
1048+ /* AND the already accumulated flags with these */
1049+ for (i = 0; i < NCAPINTS; i++)
1050+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1051+ }
1052+
1053+ /* Clear all flags overriden by options */
1054+ for (i = 0; i < NCAPINTS; i++)
1055+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
1056+
1057+#ifdef CONFIG_X86_MCE
1058+ mcheck_init(c);
1059+#endif
1060+ select_idle_routine(c);
1061+
1062+#ifdef CONFIG_NUMA
1063+ numa_add_cpu(smp_processor_id());
1064+#endif
1065+
1066+}
1067+
1068+void __cpuinit identify_boot_cpu(void)
1069+{
1070+ identify_cpu(&boot_cpu_data);
1071+}
1072+
1073+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1074+{
1075+ BUG_ON(c == &boot_cpu_data);
1076+ identify_cpu(c);
1077+ mtrr_ap_init();
1078+}
1079+
1080+static __init int setup_noclflush(char *arg)
1081+{
1082+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1083+ return 1;
1084+}
1085+__setup("noclflush", setup_noclflush);
1086+
1087+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1088+{
1089+ if (c->x86_model_id[0])
1090+ printk(KERN_CONT "%s", c->x86_model_id);
1091+
1092+ if (c->x86_mask || c->cpuid_level >= 0)
1093+ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
1094+ else
1095+ printk(KERN_CONT "\n");
1096+}
1097+
1098+static __init int setup_disablecpuid(char *arg)
1099+{
1100+ int bit;
1101+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1102+ setup_clear_cpu_cap(bit);
1103+ else
1104+ return 0;
1105+ return 1;
1106+}
1107+__setup("clearcpuid=", setup_disablecpuid);
1108+
1109+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1110+
1111+struct x8664_pda **_cpu_pda __read_mostly;
1112+EXPORT_SYMBOL(_cpu_pda);
1113+
1114+#ifndef CONFIG_X86_NO_IDT
1115+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
1116+#endif
1117+
1118+char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
1119+
1120+unsigned long __supported_pte_mask __read_mostly = ~0UL;
1121+EXPORT_SYMBOL_GPL(__supported_pte_mask);
1122+
1123+static int do_not_nx __cpuinitdata;
1124+
1125+/* noexec=on|off
1126+Control non executable mappings for 64bit processes.
1127+
1128+on Enable(default)
1129+off Disable
1130+*/
1131+static int __init nonx_setup(char *str)
1132+{
1133+ if (!str)
1134+ return -EINVAL;
1135+ if (!strncmp(str, "on", 2)) {
1136+ __supported_pte_mask |= _PAGE_NX;
1137+ do_not_nx = 0;
1138+ } else if (!strncmp(str, "off", 3)) {
1139+ do_not_nx = 1;
1140+ __supported_pte_mask &= ~_PAGE_NX;
1141+ }
1142+ return 0;
1143+}
1144+early_param("noexec", nonx_setup);
1145+
1146+int force_personality32;
1147+
1148+/* noexec32=on|off
1149+Control non executable heap for 32bit processes.
1150+To control the stack too use noexec=off
1151+
1152+on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
1153+off PROT_READ implies PROT_EXEC
1154+*/
1155+static int __init nonx32_setup(char *str)
1156+{
1157+ if (!strcmp(str, "on"))
1158+ force_personality32 &= ~READ_IMPLIES_EXEC;
1159+ else if (!strcmp(str, "off"))
1160+ force_personality32 |= READ_IMPLIES_EXEC;
1161+ return 1;
1162+}
1163+__setup("noexec32=", nonx32_setup);
1164+
1165+static void __init_refok switch_pt(int cpu)
1166+{
1167+#ifdef CONFIG_XEN
1168+ if (cpu == 0)
1169+ xen_init_pt();
1170+ xen_pt_switch(__pa_symbol(init_level4_pgt));
1171+ xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
1172+#endif
1173+}
1174+
1175+void pda_init(int cpu)
1176+{
1177+ struct x8664_pda *pda = cpu_pda(cpu);
1178+
1179+ /* Setup up data that may be needed in __get_free_pages early */
1180+ loadsegment(fs, 0);
1181+ loadsegment(gs, 0);
1182+#ifndef CONFIG_XEN
1183+ /* Memory clobbers used to order PDA accessed */
1184+ mb();
1185+ wrmsrl(MSR_GS_BASE, pda);
1186+ mb();
1187+#else
1188+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
1189+ (unsigned long)pda))
1190+ BUG();
1191+#endif
1192+
1193+ pda->cpunumber = cpu;
1194+ pda->irqcount = -1;
1195+ pda->kernelstack = (unsigned long)stack_thread_info() -
1196+ PDA_STACKOFFSET + THREAD_SIZE;
1197+ pda->active_mm = &init_mm;
1198+ pda->mmu_state = 0;
1199+
1200+ if (cpu == 0) {
1201+ /* others are initialized in smpboot.c */
1202+ pda->pcurrent = &init_task;
1203+ pda->irqstackptr = boot_cpu_stack;
1204+ pda->irqstackptr += IRQSTACKSIZE - 64;
1205+ } else {
1206+ if (!pda->irqstackptr) {
1207+ pda->irqstackptr = (char *)
1208+ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
1209+ if (!pda->irqstackptr)
1210+ panic("cannot allocate irqstack for cpu %d",
1211+ cpu);
1212+ pda->irqstackptr += IRQSTACKSIZE - 64;
1213+ }
1214+
1215+ if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
1216+ pda->nodenumber = cpu_to_node(cpu);
1217+ }
1218+
1219+ switch_pt(cpu);
1220+}
1221+
1222+#ifndef CONFIG_X86_NO_TSS
1223+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
1224+ DEBUG_STKSZ] __page_aligned_bss;
1225+#endif
1226+
1227+extern asmlinkage void ignore_sysret(void);
1228+
1229+void __cpuinit syscall_init(void)
1230+{
1231+#ifndef CONFIG_XEN
1232+ /*
1233+ * LSTAR and STAR live in a bit strange symbiosis.
1234+ * They both write to the same internal register. STAR allows to
1235+ * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
1236+ */
1237+ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
1238+ wrmsrl(MSR_LSTAR, system_call);
1239+ wrmsrl(MSR_CSTAR, ignore_sysret);
1240+
1241+ /* Flags to clear on syscall */
1242+ wrmsrl(MSR_SYSCALL_MASK,
1243+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1244+#endif
1245+#ifdef CONFIG_IA32_EMULATION
1246+ syscall32_cpu_init();
1247+#else
1248+ static /*const*/ struct callback_register __cpuinitdata cstar = {
1249+ .type = CALLBACKTYPE_syscall32,
1250+ .address = (unsigned long)ignore_sysret
1251+ };
1252+
1253+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
1254+ printk(KERN_WARN "Unable to register CSTAR callback\n");
1255+#endif
1256+}
1257+
1258+void __cpuinit check_efer(void)
1259+{
1260+ unsigned long efer;
1261+
1262+ rdmsrl(MSR_EFER, efer);
1263+ if (!(efer & EFER_NX) || do_not_nx)
1264+ __supported_pte_mask &= ~_PAGE_NX;
1265+}
1266+
1267+unsigned long kernel_eflags;
1268+
1269+#ifndef CONFIG_X86_NO_TSS
1270+/*
1271+ * Copies of the original ist values from the tss are only accessed during
1272+ * debugging, no special alignment required.
1273+ */
1274+DEFINE_PER_CPU(struct orig_ist, orig_ist);
1275+#endif
1276+
1277+/*
1278+ * cpu_init() initializes state that is per-CPU. Some data is already
1279+ * initialized (naturally) in the bootstrap process, such as the GDT
1280+ * and IDT. We reload them nevertheless, this function acts as a
1281+ * 'CPU state barrier', nothing should get across.
1282+ * A lot of state is already set up in PDA init.
1283+ */
1284+void __cpuinit cpu_init(void)
1285+{
1286+ int cpu = stack_smp_processor_id();
1287+#ifndef CONFIG_X86_NO_TSS
1288+ struct tss_struct *t = &per_cpu(init_tss, cpu);
1289+ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
1290+ unsigned long v;
1291+ char *estacks = NULL;
1292+ int i;
1293+#endif
1294+ struct task_struct *me;
1295+
1296+ /* CPU 0 is initialised in head64.c */
1297+ if (cpu != 0)
1298+ pda_init(cpu);
1299+#ifndef CONFIG_X86_NO_TSS
1300+ else
1301+ estacks = boot_exception_stacks;
1302+#endif
1303+
1304+ me = current;
1305+
1306+ if (cpu_test_and_set(cpu, cpu_initialized))
1307+ panic("CPU#%d already initialized!\n", cpu);
1308+
1309+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1310+
1311+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1312+
1313+ /*
1314+ * Initialize the per-CPU GDT with the boot GDT,
1315+ * and set up the GDT descriptor:
1316+ */
1317+
1318+ switch_to_new_gdt();
1319+#ifndef CONFIG_X86_NO_IDT
1320+ load_idt((const struct desc_ptr *)&idt_descr);
1321+#endif
1322+
1323+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
1324+ syscall_init();
1325+
1326+ wrmsrl(MSR_FS_BASE, 0);
1327+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
1328+ barrier();
1329+
1330+ check_efer();
1331+
1332+#ifndef CONFIG_X86_NO_TSS
1333+ /*
1334+ * set up and load the per-CPU TSS
1335+ */
1336+ if (!orig_ist->ist[0]) {
1337+ static const unsigned int order[N_EXCEPTION_STACKS] = {
1338+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1339+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1340+ };
1341+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1342+ if (cpu) {
1343+ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1344+ if (!estacks)
1345+ panic("Cannot allocate exception "
1346+ "stack %ld %d\n", v, cpu);
1347+ }
1348+ estacks += PAGE_SIZE << order[v];
1349+ orig_ist->ist[v] = t->x86_tss.ist[v] =
1350+ (unsigned long)estacks;
1351+ }
1352+ }
1353+
1354+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1355+ /*
1356+ * <= is required because the CPU will access up to
1357+ * 8 bits beyond the end of the IO permission bitmap.
1358+ */
1359+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
1360+ t->io_bitmap[i] = ~0UL;
1361+#endif
1362+
1363+ atomic_inc(&init_mm.mm_count);
1364+ me->active_mm = &init_mm;
1365+ if (me->mm)
1366+ BUG();
1367+ enter_lazy_tlb(&init_mm, me);
1368+
1369+ load_sp0(t, &current->thread);
1370+#ifndef CONFIG_X86_NO_TSS
1371+ set_tss_desc(cpu, t);
1372+ load_TR_desc();
1373+#endif
1374+ load_LDT(&init_mm.context);
1375+
1376+#ifdef CONFIG_KGDB
1377+ /*
1378+ * If the kgdb is connected no debug regs should be altered. This
1379+ * is only applicable when KGDB and a KGDB I/O module are built
1380+ * into the kernel and you are using early debugging with
1381+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1382+ */
1383+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1384+ arch_kgdb_ops.correct_hw_break();
1385+ else {
1386+#endif
1387+ /*
1388+ * Clear all 6 debug registers:
1389+ */
1390+
1391+ set_debugreg(0UL, 0);
1392+ set_debugreg(0UL, 1);
1393+ set_debugreg(0UL, 2);
1394+ set_debugreg(0UL, 3);
1395+ set_debugreg(0UL, 6);
1396+ set_debugreg(0UL, 7);
1397+#ifdef CONFIG_KGDB
1398+ /* If the kgdb is connected no debug regs should be altered. */
1399+ }
1400+#endif
1401+
1402+ fpu_init();
1403+
1404+ asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
1405+ if (raw_irqs_disabled())
1406+ kernel_eflags &= ~X86_EFLAGS_IF;
1407+
1408+ if (is_uv_system())
1409+ uv_cpu_init();
1410+}
1411Index: head-2008-12-01/arch/x86/kernel/e820-xen.c
1412===================================================================
1413--- /dev/null 1970-01-01 00:00:00.000000000 +0000
1414+++ head-2008-12-01/arch/x86/kernel/e820-xen.c 2008-12-01 11:49:07.000000000 +0100
1415@@ -0,0 +1,1470 @@
1416+/*
1417+ * Handle the memory map.
1418+ * The functions here do the job until bootmem takes over.
1419+ *
1420+ * Getting sanitize_e820_map() in sync with i386 version by applying change:
1421+ * - Provisions for empty E820 memory regions (reported by certain BIOSes).
1422+ * Alex Achenbach <xela@slit.de>, December 2002.
1423+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
1424+ *
1425+ */
1426+#include <linux/kernel.h>
1427+#include <linux/types.h>
1428+#include <linux/init.h>
1429+#include <linux/bootmem.h>
1430+#include <linux/ioport.h>
1431+#include <linux/string.h>
1432+#include <linux/kexec.h>
1433+#include <linux/module.h>
1434+#include <linux/mm.h>
1435+#include <linux/pfn.h>
1436+#include <linux/suspend.h>
1437+#include <linux/firmware-map.h>
1438+
1439+#include <asm/pgtable.h>
1440+#include <asm/page.h>
1441+#include <asm/e820.h>
1442+#include <asm/proto.h>
1443+#include <asm/setup.h>
1444+#include <xen/interface/memory.h>
1445+
1446+/*
1447+ * The e820 map is the map that gets modified e.g. with command line parameters
1448+ * and that is also registered with modifications in the kernel resource tree
1449+ * with the iomem_resource as parent.
1450+ *
1451+ * The e820_saved is directly saved after the BIOS-provided memory map is
1452+ * copied. It doesn't get modified afterwards. It's registered for the
1453+ * /sys/firmware/memmap interface.
1454+ *
1455+ * That memory map is not modified and is used as base for kexec. The kexec'd
1456+ * kernel should get the same memory map as the firmware provides. Then the
1457+ * user can e.g. boot the original kernel with mem=1G while still booting the
1458+ * next kernel with full memory.
1459+ */
1460+struct e820map e820;
1461+struct e820map e820_saved;
1462+#ifdef CONFIG_XEN
1463+static struct e820map machine_e820;
1464+#endif
1465+
1466+/* For PCI or other memory-mapped resources */
1467+unsigned long pci_mem_start = 0xaeedbabe;
1468+#ifdef CONFIG_PCI
1469+EXPORT_SYMBOL(pci_mem_start);
1470+#endif
1471+
1472+/*
1473+ * This function checks if any part of the range <start,end> is mapped
1474+ * with type.
1475+ */
1476+int
1477+e820_any_mapped(u64 start, u64 end, unsigned type)
1478+{
1479+ int i;
1480+
1481+#ifndef CONFIG_XEN
1482+ for (i = 0; i < e820.nr_map; i++) {
1483+ struct e820entry *ei = &e820.map[i];
1484+#else
1485+ if (!is_initial_xendomain())
1486+ return 0;
1487+ for (i = 0; i < machine_e820.nr_map; ++i) {
1488+ const struct e820entry *ei = &machine_e820.map[i];
1489+#endif
1490+
1491+ if (type && ei->type != type)
1492+ continue;
1493+ if (ei->addr >= end || ei->addr + ei->size <= start)
1494+ continue;
1495+ return 1;
1496+ }
1497+ return 0;
1498+}
1499+EXPORT_SYMBOL_GPL(e820_any_mapped);
1500+
1501+/*
1502+ * This function checks if the entire range <start,end> is mapped with type.
1503+ *
1504+ * Note: this function only works correct if the e820 table is sorted and
1505+ * not-overlapping, which is the case
1506+ */
1507+int __init e820_all_mapped(u64 start, u64 end, unsigned type)
1508+{
1509+ int i;
1510+
1511+#ifndef CONFIG_XEN
1512+ for (i = 0; i < e820.nr_map; i++) {
1513+ struct e820entry *ei = &e820.map[i];
1514+#else
1515+ if (!is_initial_xendomain())
1516+ return 0;
1517+ for (i = 0; i < machine_e820.nr_map; ++i) {
1518+ const struct e820entry *ei = &machine_e820.map[i];
1519+#endif
1520+
1521+ if (type && ei->type != type)
1522+ continue;
1523+ /* is the region (part) in overlap with the current region ?*/
1524+ if (ei->addr >= end || ei->addr + ei->size <= start)
1525+ continue;
1526+
1527+ /* if the region is at the beginning of <start,end> we move
1528+ * start to the end of the region since it's ok until there
1529+ */
1530+ if (ei->addr <= start)
1531+ start = ei->addr + ei->size;
1532+ /*
1533+ * if start is now at or beyond end, we're done, full
1534+ * coverage
1535+ */
1536+ if (start >= end)
1537+ return 1;
1538+ }
1539+ return 0;
1540+}
1541+
1542+/*
1543+ * Add a memory region to the kernel e820 map.
1544+ */
1545+void __init e820_add_region(u64 start, u64 size, int type)
1546+{
1547+ int x = e820.nr_map;
1548+
1549+ if (x == ARRAY_SIZE(e820.map)) {
1550+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1551+ return;
1552+ }
1553+
1554+ e820.map[x].addr = start;
1555+ e820.map[x].size = size;
1556+ e820.map[x].type = type;
1557+ e820.nr_map++;
1558+}
1559+
1560+void __init e820_print_map(char *who)
1561+{
1562+ int i;
1563+
1564+ for (i = 0; i < e820.nr_map; i++) {
1565+ printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1566+ (unsigned long long) e820.map[i].addr,
1567+ (unsigned long long)
1568+ (e820.map[i].addr + e820.map[i].size));
1569+ switch (e820.map[i].type) {
1570+ case E820_RAM:
1571+ case E820_RESERVED_KERN:
1572+ printk(KERN_CONT "(usable)\n");
1573+ break;
1574+ case E820_RESERVED:
1575+ printk(KERN_CONT "(reserved)\n");
1576+ break;
1577+ case E820_ACPI:
1578+ printk(KERN_CONT "(ACPI data)\n");
1579+ break;
1580+ case E820_NVS:
1581+ printk(KERN_CONT "(ACPI NVS)\n");
1582+ break;
1583+ default:
1584+ printk(KERN_CONT "type %u\n", e820.map[i].type);
1585+ break;
1586+ }
1587+ }
1588+}
1589+
1590+/*
1591+ * Sanitize the BIOS e820 map.
1592+ *
1593+ * Some e820 responses include overlapping entries. The following
1594+ * replaces the original e820 map with a new one, removing overlaps,
1595+ * and resolving conflicting memory types in favor of highest
1596+ * numbered type.
1597+ *
1598+ * The input parameter biosmap points to an array of 'struct
1599+ * e820entry' which on entry has elements in the range [0, *pnr_map)
1600+ * valid, and which has space for up to max_nr_map entries.
1601+ * On return, the resulting sanitized e820 map entries will be in
1602+ * overwritten in the same location, starting at biosmap.
1603+ *
1604+ * The integer pointed to by pnr_map must be valid on entry (the
1605+ * current number of valid entries located at biosmap) and will
1606+ * be updated on return, with the new number of valid entries
1607+ * (something no more than max_nr_map.)
1608+ *
1609+ * The return value from sanitize_e820_map() is zero if it
1610+ * successfully 'sanitized' the map entries passed in, and is -1
1611+ * if it did nothing, which can happen if either of (1) it was
1612+ * only passed one map entry, or (2) any of the input map entries
1613+ * were invalid (start + size < start, meaning that the size was
1614+ * so big the described memory range wrapped around through zero.)
1615+ *
1616+ * Visually we're performing the following
1617+ * (1,2,3,4 = memory types)...
1618+ *
1619+ * Sample memory map (w/overlaps):
1620+ * ____22__________________
1621+ * ______________________4_
1622+ * ____1111________________
1623+ * _44_____________________
1624+ * 11111111________________
1625+ * ____________________33__
1626+ * ___________44___________
1627+ * __________33333_________
1628+ * ______________22________
1629+ * ___________________2222_
1630+ * _________111111111______
1631+ * _____________________11_
1632+ * _________________4______
1633+ *
1634+ * Sanitized equivalent (no overlap):
1635+ * 1_______________________
1636+ * _44_____________________
1637+ * ___1____________________
1638+ * ____22__________________
1639+ * ______11________________
1640+ * _________1______________
1641+ * __________3_____________
1642+ * ___________44___________
1643+ * _____________33_________
1644+ * _______________2________
1645+ * ________________1_______
1646+ * _________________4______
1647+ * ___________________2____
1648+ * ____________________33__
1649+ * ______________________4_
1650+ */
1651+
1652+int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
1653+ int *pnr_map)
1654+{
1655+ struct change_member {
1656+ struct e820entry *pbios; /* pointer to original bios entry */
1657+ unsigned long long addr; /* address for this change point */
1658+ };
1659+ static struct change_member change_point_list[2*E820_X_MAX] __initdata;
1660+ static struct change_member *change_point[2*E820_X_MAX] __initdata;
1661+ static struct e820entry *overlap_list[E820_X_MAX] __initdata;
1662+ static struct e820entry new_bios[E820_X_MAX] __initdata;
1663+ struct change_member *change_tmp;
1664+ unsigned long current_type, last_type;
1665+ unsigned long long last_addr;
1666+ int chgidx, still_changing;
1667+ int overlap_entries;
1668+ int new_bios_entry;
1669+ int old_nr, new_nr, chg_nr;
1670+ int i;
1671+
1672+ /* if there's only one memory region, don't bother */
1673+#ifdef CONFIG_XEN
1674+ if (*pnr_map == 1)
1675+ return 0;
1676+#endif
1677+ if (*pnr_map < 2)
1678+ return -1;
1679+
1680+ old_nr = *pnr_map;
1681+ BUG_ON(old_nr > max_nr_map);
1682+
1683+ /* bail out if we find any unreasonable addresses in bios map */
1684+ for (i = 0; i < old_nr; i++)
1685+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
1686+ return -1;
1687+
1688+ /* create pointers for initial change-point information (for sorting) */
1689+ for (i = 0; i < 2 * old_nr; i++)
1690+ change_point[i] = &change_point_list[i];
1691+
1692+ /* record all known change-points (starting and ending addresses),
1693+ omitting those that are for empty memory regions */
1694+ chgidx = 0;
1695+ for (i = 0; i < old_nr; i++) {
1696+ if (biosmap[i].size != 0) {
1697+ change_point[chgidx]->addr = biosmap[i].addr;
1698+ change_point[chgidx++]->pbios = &biosmap[i];
1699+ change_point[chgidx]->addr = biosmap[i].addr +
1700+ biosmap[i].size;
1701+ change_point[chgidx++]->pbios = &biosmap[i];
1702+ }
1703+ }
1704+ chg_nr = chgidx;
1705+
1706+ /* sort change-point list by memory addresses (low -> high) */
1707+ still_changing = 1;
1708+ while (still_changing) {
1709+ still_changing = 0;
1710+ for (i = 1; i < chg_nr; i++) {
1711+ unsigned long long curaddr, lastaddr;
1712+ unsigned long long curpbaddr, lastpbaddr;
1713+
1714+ curaddr = change_point[i]->addr;
1715+ lastaddr = change_point[i - 1]->addr;
1716+ curpbaddr = change_point[i]->pbios->addr;
1717+ lastpbaddr = change_point[i - 1]->pbios->addr;
1718+
1719+ /*
1720+ * swap entries, when:
1721+ *
1722+ * curaddr > lastaddr or
1723+ * curaddr == lastaddr and curaddr == curpbaddr and
1724+ * lastaddr != lastpbaddr
1725+ */
1726+ if (curaddr < lastaddr ||
1727+ (curaddr == lastaddr && curaddr == curpbaddr &&
1728+ lastaddr != lastpbaddr)) {
1729+ change_tmp = change_point[i];
1730+ change_point[i] = change_point[i-1];
1731+ change_point[i-1] = change_tmp;
1732+ still_changing = 1;
1733+ }
1734+ }
1735+ }
1736+
1737+ /* create a new bios memory map, removing overlaps */
1738+ overlap_entries = 0; /* number of entries in the overlap table */
1739+ new_bios_entry = 0; /* index for creating new bios map entries */
1740+ last_type = 0; /* start with undefined memory type */
1741+ last_addr = 0; /* start with 0 as last starting address */
1742+
1743+	/* loop through change-points, determining effect on the new bios map */
1744+ for (chgidx = 0; chgidx < chg_nr; chgidx++) {
1745+ /* keep track of all overlapping bios entries */
1746+ if (change_point[chgidx]->addr ==
1747+ change_point[chgidx]->pbios->addr) {
1748+ /*
1749+ * add map entry to overlap list (> 1 entry
1750+ * implies an overlap)
1751+ */
1752+ overlap_list[overlap_entries++] =
1753+ change_point[chgidx]->pbios;
1754+ } else {
1755+ /*
1756+ * remove entry from list (order independent,
1757+ * so swap with last)
1758+ */
1759+ for (i = 0; i < overlap_entries; i++) {
1760+ if (overlap_list[i] ==
1761+ change_point[chgidx]->pbios)
1762+ overlap_list[i] =
1763+ overlap_list[overlap_entries-1];
1764+ }
1765+ overlap_entries--;
1766+ }
1767+ /*
1768+ * if there are overlapping entries, decide which
1769+ * "type" to use (larger value takes precedence --
1770+ * 1=usable, 2,3,4,4+=unusable)
1771+ */
1772+ current_type = 0;
1773+ for (i = 0; i < overlap_entries; i++)
1774+ if (overlap_list[i]->type > current_type)
1775+ current_type = overlap_list[i]->type;
1776+ /*
1777+ * continue building up new bios map based on this
1778+ * information
1779+ */
1780+ if (current_type != last_type) {
1781+ if (last_type != 0) {
1782+ new_bios[new_bios_entry].size =
1783+ change_point[chgidx]->addr - last_addr;
1784+ /*
1785+ * move forward only if the new size
1786+ * was non-zero
1787+ */
1788+ if (new_bios[new_bios_entry].size != 0)
1789+ /*
1790+ * no more space left for new
1791+ * bios entries ?
1792+ */
1793+ if (++new_bios_entry >= max_nr_map)
1794+ break;
1795+ }
1796+ if (current_type != 0) {
1797+ new_bios[new_bios_entry].addr =
1798+ change_point[chgidx]->addr;
1799+ new_bios[new_bios_entry].type = current_type;
1800+ last_addr = change_point[chgidx]->addr;
1801+ }
1802+ last_type = current_type;
1803+ }
1804+ }
1805+ /* retain count for new bios entries */
1806+ new_nr = new_bios_entry;
1807+
1808+ /* copy new bios mapping into original location */
1809+ memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
1810+ *pnr_map = new_nr;
1811+
1812+ return 0;
1813+}
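[Editor's illustration, not part of the patch: inside any overlap the loop above keeps the entry with the larger type value (1 = usable, higher values = unusable). A minimal user-space sketch of that precedence rule, with invented entry values:]

/* demo_overlap.c -- illustrative only, not kernel code */
#include <stdio.h>

struct demo_entry { unsigned long long addr, size; unsigned type; };

/* Within an overlap, the entry with the larger type value wins. */
static unsigned winning_type(const struct demo_entry *a, const struct demo_entry *b)
{
	return a->type > b->type ? a->type : b->type;
}

int main(void)
{
	struct demo_entry ram      = { 0x00000, 0xa0000, 1 };	/* usable RAM */
	struct demo_entry reserved = { 0x9f000, 0x01000, 2 };	/* reserved   */

	/* The 4K overlap 0x9f000-0xa0000 comes out as type 2 in the sanitized map. */
	printf("overlap type = %u\n", winning_type(&ram, &reserved));
	return 0;
}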
1814+
1815+static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
1816+{
1817+ while (nr_map) {
1818+ u64 start = biosmap->addr;
1819+ u64 size = biosmap->size;
1820+ u64 end = start + size;
1821+ u32 type = biosmap->type;
1822+
1823+ /* Overflow in 64 bits? Ignore the memory map. */
1824+ if (start > end)
1825+ return -1;
1826+
1827+ e820_add_region(start, size, type);
1828+
1829+ biosmap++;
1830+ nr_map--;
1831+ }
1832+ return 0;
1833+}
1834+
1835+/*
1836+ * Copy the BIOS e820 map into a safe place.
1837+ *
1838+ * Sanity-check it while we're at it..
1839+ *
1840+ * If we're lucky and live on a modern system, the setup code
1841+ * will have given us a memory map that we can use to properly
1842+ * set up memory. If we aren't, we'll fake a memory map.
1843+ */
1844+static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
1845+{
1846+#ifndef CONFIG_XEN
1847+ /* Only one memory region (or negative)? Ignore it */
1848+ if (nr_map < 2)
1849+ return -1;
1850+#else
1851+ BUG_ON(nr_map < 1);
1852+#endif
1853+
1854+ return __append_e820_map(biosmap, nr_map);
1855+}
1856+
1857+static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
1858+ u64 size, unsigned old_type,
1859+ unsigned new_type)
1860+{
1861+ int i;
1862+ u64 real_updated_size = 0;
1863+
1864+ BUG_ON(old_type == new_type);
1865+
1866+ if (size > (ULLONG_MAX - start))
1867+ size = ULLONG_MAX - start;
1868+
1869+ for (i = 0; i < e820.nr_map; i++) {
1870+ struct e820entry *ei = &e820x->map[i];
1871+ u64 final_start, final_end;
1872+ if (ei->type != old_type)
1873+ continue;
1874+ /* totally covered? */
1875+ if (ei->addr >= start &&
1876+ (ei->addr + ei->size) <= (start + size)) {
1877+ ei->type = new_type;
1878+ real_updated_size += ei->size;
1879+ continue;
1880+ }
1881+ /* partially covered */
1882+ final_start = max(start, ei->addr);
1883+ final_end = min(start + size, ei->addr + ei->size);
1884+ if (final_start >= final_end)
1885+ continue;
1886+ e820_add_region(final_start, final_end - final_start,
1887+ new_type);
1888+ real_updated_size += final_end - final_start;
1889+
1890+ ei->size -= final_end - final_start;
1891+ if (ei->addr < final_start)
1892+ continue;
1893+ ei->addr = final_end;
1894+ }
1895+ return real_updated_size;
1896+}
1897+
1898+u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
1899+ unsigned new_type)
1900+{
1901+ return e820_update_range_map(&e820, start, size, old_type, new_type);
1902+}
1903+
1904+static u64 __init e820_update_range_saved(u64 start, u64 size,
1905+ unsigned old_type, unsigned new_type)
1906+{
1907+ return e820_update_range_map(&e820_saved, start, size, old_type,
1908+ new_type);
1909+}
1910+
1911+/* make e820 not cover the range */
1912+u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
1913+ int checktype)
1914+{
1915+ int i;
1916+ u64 real_removed_size = 0;
1917+
1918+ if (size > (ULLONG_MAX - start))
1919+ size = ULLONG_MAX - start;
1920+
1921+ for (i = 0; i < e820.nr_map; i++) {
1922+ struct e820entry *ei = &e820.map[i];
1923+ u64 final_start, final_end;
1924+
1925+ if (checktype && ei->type != old_type)
1926+ continue;
1927+ /* totally covered? */
1928+ if (ei->addr >= start &&
1929+ (ei->addr + ei->size) <= (start + size)) {
1930+ real_removed_size += ei->size;
1931+ memset(ei, 0, sizeof(struct e820entry));
1932+ continue;
1933+ }
1934+ /* partially covered */
1935+ final_start = max(start, ei->addr);
1936+ final_end = min(start + size, ei->addr + ei->size);
1937+ if (final_start >= final_end)
1938+ continue;
1939+ real_removed_size += final_end - final_start;
1940+
1941+ ei->size -= final_end - final_start;
1942+ if (ei->addr < final_start)
1943+ continue;
1944+ ei->addr = final_end;
1945+ }
1946+ return real_removed_size;
1947+}
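[Editor's sketch, not part of the patch: the clamping above computes the portion of an e820 entry that falls inside the removal request [start, start + size). A standalone example with invented numbers:]

/* demo_clamp.c -- illustrative only */
#include <stdio.h>

static unsigned long long max_u64(unsigned long long a, unsigned long long b) { return a > b ? a : b; }
static unsigned long long min_u64(unsigned long long a, unsigned long long b) { return a < b ? a : b; }

int main(void)
{
	/* Entry 0x100000-0x500000; removal request 0x400000-0x800000. */
	unsigned long long ei_addr = 0x100000, ei_size = 0x400000;
	unsigned long long start = 0x400000, size = 0x400000;

	unsigned long long final_start = max_u64(start, ei_addr);
	unsigned long long final_end   = min_u64(start + size, ei_addr + ei_size);

	/* Prints 0x400000-0x500000: only the top 1MB of the entry is removed. */
	printf("removed %#llx-%#llx\n", final_start, final_end);
	return 0;
}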
1948+
1949+void __init update_e820(void)
1950+{
1951+ int nr_map;
1952+
1953+ nr_map = e820.nr_map;
1954+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
1955+ return;
1956+ e820.nr_map = nr_map;
1957+ printk(KERN_INFO "modified physical RAM map:\n");
1958+ e820_print_map("modified");
1959+}
1960+static void __init update_e820_saved(void)
1961+{
1962+ int nr_map;
1963+
1964+ nr_map = e820_saved.nr_map;
1965+ if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
1966+ return;
1967+ e820_saved.nr_map = nr_map;
1968+}
1969+
1970+#ifdef CONFIG_XEN
1971+#define e820 machine_e820
1972+#endif
1973+
1974+#define MAX_GAP_END 0x100000000ull
1975+/*
1976+ * Search for a gap in the e820 memory space from start_addr to end_addr.
1977+ */
1978+__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
1979+ unsigned long start_addr, unsigned long long end_addr)
1980+{
1981+ unsigned long long last;
1982+ int i = e820.nr_map;
1983+ int found = 0;
1984+
1985+ last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
1986+#ifdef CONFIG_X86_64
1987+ if (start_addr >= MAX_GAP_END)
1988+ last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
1989+#endif
1990+
1991+ while (--i >= 0) {
1992+ unsigned long long start = e820.map[i].addr;
1993+ unsigned long long end = start + e820.map[i].size;
1994+
1995+ if (end < start_addr)
1996+ continue;
1997+
1998+ /*
1999+ * Since "last" is at most 4GB, we know we'll
2000+ * fit in 32 bits if this condition is true
2001+ */
2002+ if (last > end) {
2003+ unsigned long gap = last - end;
2004+
2005+ if (gap >= *gapsize) {
2006+ *gapsize = gap;
2007+ *gapstart = end;
2008+ found = 1;
2009+ }
2010+ }
2011+ if (start < last)
2012+ last = start;
2013+ }
2014+ return found;
2015+}
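[Editor's sketch, not part of the patch: the same downward scan as above, run over a tiny, already-sorted two-entry map, finds the largest hole below 4GB:]

/* demo_gap.c -- illustrative only */
#include <stdio.h>

struct demo_entry { unsigned long long addr, size; };

int main(void)
{
	/* Two RAM blocks with a hole between 0xc0000000 and 0xe0000000. */
	struct demo_entry map[] = {
		{ 0x00000000ULL, 0xc0000000ULL },
		{ 0xe0000000ULL, 0x10000000ULL },
	};
	unsigned long long last = 0x100000000ULL;	/* MAX_GAP_END */
	unsigned long long gapstart = 0, gapsize = 0x400000;
	int i = 2;

	while (--i >= 0) {
		unsigned long long start = map[i].addr;
		unsigned long long end = start + map[i].size;

		if (last > end && last - end >= gapsize) {
			gapsize = last - end;
			gapstart = end;
		}
		if (start < last)
			last = start;
	}

	/* Prints: gap at 0xc0000000, size 0x20000000 (the hole between the blocks). */
	printf("gap at %#llx, size %#llx\n", gapstart, gapsize);
	return 0;
}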
2016+
2017+/*
2018+ * Search for the biggest gap in the low 32 bits of the e820
2019+ * memory space. We pass this space to PCI for assigning MMIO resources
2020+ * to hotplug or unconfigured devices.
2021+ * Hopefully the BIOS left enough space.
2022+ */
2023+__init void e820_setup_gap(void)
2024+{
2025+ unsigned long gapstart, gapsize, round;
2026+ int found;
2027+
2028+ gapstart = 0x10000000;
2029+ gapsize = 0x400000;
2030+ found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
2031+
2032+#ifdef CONFIG_X86_64
2033+ if (!found) {
2034+ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2035+ "address range\n"
2036+ KERN_ERR "PCI: Unassigned devices with 32bit resource "
2037+ "registers may break!\n");
2038+ found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
2039+ BUG_ON(!found);
2040+ }
2041+#endif
2042+
2043+ /*
2044+ * See how much we want to round up: start off with
2045+ * rounding to the next 1MB area.
2046+ */
2047+ round = 0x100000;
2048+ while ((gapsize >> 4) > round)
2049+ round += round;
2050+ /* Fun with two's complement */
2051+ pci_mem_start = (gapstart + round) & -round;
2052+
2053+ printk(KERN_INFO
2054+ "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2055+ pci_mem_start, gapstart, gapsize);
2056+}
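[Editor's sketch, not part of the patch: the "fun with two's complement" line rounds gapstart up into the next block that is a multiple of the power-of-two 'round', because -round has all bits set above the alignment:]

/* demo_round.c -- illustrative only */
#include <stdio.h>

int main(void)
{
	unsigned long gapstart = 0xc0000001UL;	/* example values only */
	unsigned long round    = 0x100000UL;	/* 1MB, a power of two */

	/* -round == ~(round - 1) for powers of two, so the AND clears the
	 * low bits after gapstart has been bumped into the next block. */
	unsigned long pci_mem_start = (gapstart + round) & -round;

	printf("%#lx\n", pci_mem_start);	/* 0xc0100000 */
	return 0;
}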
2057+
2058+#undef e820
2059+
2060+#ifndef CONFIG_XEN
2061+/**
2062+ * Because of the size limitation of struct boot_params, only the first
2063+ * 128 E820 memory entries are passed to the kernel via
2064+ * boot_params.e820_map; the rest are passed via the SETUP_E820_EXT node
2065+ * of the linked list of struct setup_data, which is parsed here.
2066+ */
2067+void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
2068+{
2069+ u32 map_len;
2070+ int entries;
2071+ struct e820entry *extmap;
2072+
2073+ entries = sdata->len / sizeof(struct e820entry);
2074+ map_len = sdata->len + sizeof(struct setup_data);
2075+ if (map_len > PAGE_SIZE)
2076+ sdata = early_ioremap(pa_data, map_len);
2077+ extmap = (struct e820entry *)(sdata->data);
2078+ __append_e820_map(extmap, entries);
2079+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
2080+ if (map_len > PAGE_SIZE)
2081+ early_iounmap(sdata, map_len);
2082+ printk(KERN_INFO "extended physical RAM map:\n");
2083+ e820_print_map("extended");
2084+}
2085+
2086+#if defined(CONFIG_X86_64) || \
2087+ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
2088+/**
2089+ * Find the ranges of physical addresses that do not correspond to
2090+ * e820 RAM areas and mark the corresponding pages as nosave for
2091+ * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
2092+ *
2093+ * This function requires the e820 map to be sorted and without any
2094+ * overlapping entries and assumes the first e820 area to be RAM.
2095+ */
2096+void __init e820_mark_nosave_regions(unsigned long limit_pfn)
2097+{
2098+ int i;
2099+ unsigned long pfn;
2100+
2101+ pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
2102+ for (i = 1; i < e820.nr_map; i++) {
2103+ struct e820entry *ei = &e820.map[i];
2104+
2105+ if (pfn < PFN_UP(ei->addr))
2106+ register_nosave_region(pfn, PFN_UP(ei->addr));
2107+
2108+ pfn = PFN_DOWN(ei->addr + ei->size);
2109+ if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
2110+ register_nosave_region(PFN_UP(ei->addr), pfn);
2111+
2112+ if (pfn >= limit_pfn)
2113+ break;
2114+ }
2115+}
2116+#endif
2117+#endif
2118+
2119+/*
2120+ * Early reserved memory areas.
2121+ */
2122+#define MAX_EARLY_RES 20
2123+
2124+struct early_res {
2125+ u64 start, end;
2126+ char name[16];
2127+ char overlap_ok;
2128+};
2129+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
2130+#ifndef CONFIG_XEN
2131+ { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
2132+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
2133+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
2134+#endif
2135+#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
2136+ /*
2137+ * But first pinch a few for the stack/trampoline stuff
2138+ * FIXME: Don't need the extra page at 4K, but need to fix
2139+ * trampoline before removing it. (see the GDT stuff)
2140+ */
2141+ { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
2142+ /*
2143+ * Has to be in very low memory so we can execute
2144+ * real-mode AP code.
2145+ */
2146+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
2147+#endif
2148+#endif
2149+ {}
2150+};
2151+
2152+static int __init find_overlapped_early(u64 start, u64 end)
2153+{
2154+ int i;
2155+ struct early_res *r;
2156+
2157+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2158+ r = &early_res[i];
2159+ if (end > r->start && start < r->end)
2160+ break;
2161+ }
2162+
2163+ return i;
2164+}
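[Editor's sketch, not part of the patch: the overlap test above treats every reservation as a half-open range [start, end), so ranges that merely touch at a boundary do not count as overlapping:]

/* demo_overlap_test.c -- illustrative only */
#include <stdio.h>

static int overlaps(unsigned long long s1, unsigned long long e1,
		    unsigned long long s2, unsigned long long e2)
{
	return e1 > s2 && s1 < e2;	/* same condition as the loop above */
}

int main(void)
{
	printf("%d\n", overlaps(0x0, 0x1000, 0x1000, 0x2000));	/* 0: adjacent only */
	printf("%d\n", overlaps(0x0, 0x1000, 0x0800, 0x2000));	/* 1: true overlap  */
	return 0;
}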
2165+
2166+/*
2167+ * Drop the i-th range from the early reservation map,
2168+ * by copying any higher ranges down one over it, and
2169+ * clearing what had been the last slot.
2170+ */
2171+static void __init drop_range(int i)
2172+{
2173+ int j;
2174+
2175+ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
2176+ ;
2177+
2178+ memmove(&early_res[i], &early_res[i + 1],
2179+ (j - 1 - i) * sizeof(struct early_res));
2180+
2181+ early_res[j - 1].end = 0;
2182+}
2183+
2184+/*
2185+ * Split any existing ranges that:
2186+ * 1) are marked 'overlap_ok', and
2187+ * 2) overlap with the stated range [start, end)
2188+ * into whatever portion (if any) of the existing range is entirely
2189+ * below or entirely above the stated range. Drop the portion
2190+ * of the existing range that overlaps with the stated range,
2191+ * which will allow the caller of this routine to then add that
2192+ * stated range without conflicting with any existing range.
2193+ */
2194+static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
2195+{
2196+ int i;
2197+ struct early_res *r;
2198+ u64 lower_start, lower_end;
2199+ u64 upper_start, upper_end;
2200+ char name[16];
2201+
2202+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2203+ r = &early_res[i];
2204+
2205+ /* Continue past non-overlapping ranges */
2206+ if (end <= r->start || start >= r->end)
2207+ continue;
2208+
2209+ /*
2210+ * Leave non-ok overlaps as is; let caller
2211+ * panic "Overlapping early reservations"
2212+ * when it hits this overlap.
2213+ */
2214+ if (!r->overlap_ok)
2215+ return;
2216+
2217+ /*
2218+ * We have an ok overlap. We will drop it from the early
2219+ * reservation map, and add back in any non-overlapping
2220+ * portions (lower or upper) as separate, overlap_ok,
2221+ * non-overlapping ranges.
2222+ */
2223+
2224+ /* 1. Note any non-overlapping (lower or upper) ranges. */
2225+ strncpy(name, r->name, sizeof(name) - 1);
2226+
2227+ lower_start = lower_end = 0;
2228+ upper_start = upper_end = 0;
2229+ if (r->start < start) {
2230+ lower_start = r->start;
2231+ lower_end = start;
2232+ }
2233+ if (r->end > end) {
2234+ upper_start = end;
2235+ upper_end = r->end;
2236+ }
2237+
2238+ /* 2. Drop the original ok overlapping range */
2239+ drop_range(i);
2240+
2241+ i--; /* resume for-loop on copied down entry */
2242+
2243+ /* 3. Add back in any non-overlapping ranges. */
2244+ if (lower_end)
2245+ reserve_early_overlap_ok(lower_start, lower_end, name);
2246+ if (upper_end)
2247+ reserve_early_overlap_ok(upper_start, upper_end, name);
2248+ }
2249+}
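[Editor's sketch, not part of the patch: the lower/upper split performed above, shown on invented numbers -- an existing overlap_ok range is cut around an incoming reservation and only the non-overlapping ends survive:]

/* demo_split.c -- illustrative only */
#include <stdio.h>

int main(void)
{
	/* Existing overlap_ok range and an incoming reservation. */
	unsigned long long r_start = 0x1000, r_end = 0x9000;
	unsigned long long start   = 0x3000, end   = 0x6000;

	unsigned long long lower_start = 0, lower_end = 0;
	unsigned long long upper_start = 0, upper_end = 0;

	if (r_start < start) { lower_start = r_start; lower_end = start; }
	if (r_end   > end)   { upper_start = end;     upper_end = r_end; }

	/* Prints: lower 0x1000-0x3000, upper 0x6000-0x9000 */
	printf("lower %#llx-%#llx, upper %#llx-%#llx\n",
	       lower_start, lower_end, upper_start, upper_end);
	return 0;
}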
2250+
2251+static void __init __reserve_early(u64 start, u64 end, char *name,
2252+ int overlap_ok)
2253+{
2254+ int i;
2255+ struct early_res *r;
2256+
2257+ i = find_overlapped_early(start, end);
2258+ if (i >= MAX_EARLY_RES)
2259+ panic("Too many early reservations");
2260+ r = &early_res[i];
2261+ if (r->end)
2262+ panic("Overlapping early reservations "
2263+ "%llx-%llx %s to %llx-%llx %s\n",
2264+ start, end - 1, name?name:"", r->start,
2265+ r->end - 1, r->name);
2266+ r->start = start;
2267+ r->end = end;
2268+ r->overlap_ok = overlap_ok;
2269+ if (name)
2270+ strncpy(r->name, name, sizeof(r->name) - 1);
2271+}
2272+
2273+/*
2274+ * A few early reservations come here.
2275+ *
2276+ * The 'overlap_ok' in the name of this routine does -not- mean it
2277+ * is ok for these reservations to overlap an earlier reservation.
2278+ * Rather it means that it is ok for subsequent reservations to
2279+ * overlap this one.
2280+ *
2281+ * Use this entry point to reserve early ranges when you are doing
2282+ * so out of "Paranoia", reserving perhaps more memory than you need,
2283+ * just in case, and don't mind a subsequent overlapping reservation
2284+ * that is known to be needed.
2285+ *
2286+ * The drop_overlaps_that_are_ok() call here isn't really needed.
2287+ * It would be needed if we had two colliding 'overlap_ok'
2288+ * reservations, so that the second such would not panic on the
2289+ * overlap with the first. We don't have any such as of this
2290+ * writing, but might as well tolerate such if it happens in
2291+ * the future.
2292+ */
2293+void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
2294+{
2295+ drop_overlaps_that_are_ok(start, end);
2296+ __reserve_early(start, end, name, 1);
2297+}
2298+
2299+/*
2300+ * Most early reservations come here.
2301+ *
2302+ * We first have drop_overlaps_that_are_ok() drop any pre-existing
2303+ * 'overlap_ok' ranges, so that we can then reserve this memory
2304+ * range without risk of panic'ing on an overlapping overlap_ok
2305+ * early reservation.
2306+ */
2307+void __init reserve_early(u64 start, u64 end, char *name)
2308+{
2309+ drop_overlaps_that_are_ok(start, end);
2310+ __reserve_early(start, end, name, 0);
2311+}
2312+
2313+void __init free_early(u64 start, u64 end)
2314+{
2315+ struct early_res *r;
2316+ int i;
2317+
2318+ i = find_overlapped_early(start, end);
2319+ r = &early_res[i];
2320+ if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
2321+ panic("free_early on not reserved area: %llx-%llx!",
2322+ start, end - 1);
2323+
2324+ drop_range(i);
2325+}
2326+
2327+void __init early_res_to_bootmem(u64 start, u64 end)
2328+{
2329+ int i, count;
2330+ u64 final_start, final_end;
2331+
2332+ count = 0;
2333+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
2334+ count++;
2335+
2336+ printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
2337+ count, start, end);
2338+ for (i = 0; i < count; i++) {
2339+ struct early_res *r = &early_res[i];
2340+ printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
2341+ r->start, r->end, r->name);
2342+ final_start = max(start, r->start);
2343+ final_end = min(end, r->end);
2344+ if (final_start >= final_end) {
2345+ printk(KERN_CONT "\n");
2346+ continue;
2347+ }
2348+ printk(KERN_CONT " ==> [%010llx - %010llx]\n",
2349+ final_start, final_end);
2350+ reserve_bootmem_generic(final_start, final_end - final_start,
2351+ BOOTMEM_DEFAULT);
2352+ }
2353+}
2354+
2355+/* Check for already reserved areas */
2356+static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
2357+{
2358+ int i;
2359+ u64 addr = *addrp;
2360+ int changed = 0;
2361+ struct early_res *r;
2362+again:
2363+ i = find_overlapped_early(addr, addr + size);
2364+ r = &early_res[i];
2365+ if (i < MAX_EARLY_RES && r->end) {
2366+ *addrp = addr = round_up(r->end, align);
2367+ changed = 1;
2368+ goto again;
2369+ }
2370+ return changed;
2371+}
2372+
2373+/* Check for already reserved areas */
2374+static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
2375+{
2376+ int i;
2377+ u64 addr = *addrp, last;
2378+ u64 size = *sizep;
2379+ int changed = 0;
2380+again:
2381+ last = addr + size;
2382+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2383+ struct early_res *r = &early_res[i];
2384+ if (last > r->start && addr < r->start) {
2385+ size = r->start - addr;
2386+ changed = 1;
2387+ goto again;
2388+ }
2389+ if (last > r->end && addr < r->end) {
2390+ addr = round_up(r->end, align);
2391+ size = last - addr;
2392+ changed = 1;
2393+ goto again;
2394+ }
2395+ if (last <= r->end && addr >= r->start) {
2396+ (*sizep)++;
2397+ return 0;
2398+ }
2399+ }
2400+ if (changed) {
2401+ *addrp = addr;
2402+ *sizep = size;
2403+ }
2404+ return changed;
2405+}
2406+
2407+/*
2408+ * Find a free area with specified alignment in a specific range.
2409+ */
2410+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
2411+{
2412+ int i;
2413+
2414+ for (i = 0; i < e820.nr_map; i++) {
2415+ struct e820entry *ei = &e820.map[i];
2416+ u64 addr, last;
2417+ u64 ei_last;
2418+
2419+ if (ei->type != E820_RAM)
2420+ continue;
2421+ addr = round_up(ei->addr, align);
2422+ ei_last = ei->addr + ei->size;
2423+ if (addr < start)
2424+ addr = round_up(start, align);
2425+ if (addr >= ei_last)
2426+ continue;
2427+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
2428+ ;
2429+ last = addr + size;
2430+ if (last > ei_last)
2431+ continue;
2432+ if (last > end)
2433+ continue;
2434+ return addr;
2435+ }
2436+ return -1ULL;
2437+}
2438+
2439+/*
2440+ * Find next free range after *start
2441+ */
2442+u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
2443+{
2444+ int i;
2445+
2446+ for (i = 0; i < e820.nr_map; i++) {
2447+ struct e820entry *ei = &e820.map[i];
2448+ u64 addr, last;
2449+ u64 ei_last;
2450+
2451+ if (ei->type != E820_RAM)
2452+ continue;
2453+ addr = round_up(ei->addr, align);
2454+ ei_last = ei->addr + ei->size;
2455+ if (addr < start)
2456+ addr = round_up(start, align);
2457+ if (addr >= ei_last)
2458+ continue;
2459+ *sizep = ei_last - addr;
2460+ while (bad_addr_size(&addr, sizep, align) &&
2461+ addr + *sizep <= ei_last)
2462+ ;
2463+ last = addr + *sizep;
2464+ if (last > ei_last)
2465+ continue;
2466+ return addr;
2467+ }
2468+ return -1UL;
2469+
2470+}
2471+
2472+/*
2473+ * Pre-allocate 4K and reserve it in the e820 map.
2474+ */
2475+u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
2476+{
2477+ u64 size = 0;
2478+ u64 addr;
2479+ u64 start;
2480+
2481+ start = startt;
2482+ while (size < sizet)
2483+ start = find_e820_area_size(start, &size, align);
2484+
2485+ if (size < sizet)
2486+ return 0;
2487+
2488+ addr = round_down(start + size - sizet, align);
2489+ e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
2490+ e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
2491+ printk(KERN_INFO "update e820 for early_reserve_e820\n");
2492+ update_e820();
2493+ update_e820_saved();
2494+
2495+ return addr;
2496+}
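[Editor's sketch, not part of the patch: the placement math above takes the highest 'sizet' bytes of the free range that was found and rounds the start down to 'align'. With invented numbers:]

/* demo_place.c -- illustrative only */
#include <stdio.h>

#define ROUND_DOWN(x, a)	((x) & ~((a) - 1))	/* 'a' must be a power of two */

int main(void)
{
	unsigned long long start = 0x9d000, size = 0x3000;	/* free range found    */
	unsigned long long sizet = 0x1000, align = 0x1000;	/* want 4K, 4K-aligned */

	unsigned long long addr = ROUND_DOWN(start + size - sizet, align);

	printf("%#llx\n", addr);	/* 0x9f000: the top 4K of the range */
	return 0;
}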
2497+
2498+#ifdef CONFIG_X86_32
2499+# ifdef CONFIG_X86_PAE
2500+# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT))
2501+# else
2502+# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
2503+# endif
2504+#else /* CONFIG_X86_32 */
2505+# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
2506+#endif
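[Editor's sketch, not part of the patch: what the two 32-bit limits above work out to with 4K pages, i.e. PAGE_SHIFT == 12:]

/* demo_max_pfn.c -- illustrative only */
#include <stdio.h>

int main(void)
{
	unsigned long long pfn_nopae = 1ULL << (32 - 12);	/* 0x100000 page frames   */
	unsigned long long pfn_pae   = 1ULL << (40 - 12);	/* 0x10000000 page frames */

	printf("no PAE: %llu MB addressable\n", (pfn_nopae << 12) >> 20);	/* 4096    */
	printf("PAE:    %llu MB addressable\n", (pfn_pae   << 12) >> 20);	/* 1048576 */
	return 0;
}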
2507+
2508+/*
2509+ * Find the highest page frame number we have available
2510+ */
2511+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
2512+{
2513+ int i;
2514+ unsigned long last_pfn = 0;
2515+ unsigned long max_arch_pfn = MAX_ARCH_PFN;
2516+
2517+ for (i = 0; i < e820.nr_map; i++) {
2518+ struct e820entry *ei = &e820.map[i];
2519+ unsigned long start_pfn;
2520+ unsigned long end_pfn;
2521+
2522+ if (ei->type != type)
2523+ continue;
2524+
2525+ start_pfn = ei->addr >> PAGE_SHIFT;
2526+ end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
2527+
2528+ if (start_pfn >= limit_pfn)
2529+ continue;
2530+ if (end_pfn > limit_pfn) {
2531+ last_pfn = limit_pfn;
2532+ break;
2533+ }
2534+ if (end_pfn > last_pfn)
2535+ last_pfn = end_pfn;
2536+ }
2537+
2538+ if (last_pfn > max_arch_pfn)
2539+ last_pfn = max_arch_pfn;
2540+
2541+ printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
2542+ last_pfn, max_arch_pfn);
2543+ return last_pfn;
2544+}
2545+unsigned long __init e820_end_of_ram_pfn(void)
2546+{
2547+ return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
2548+}
2549+
2550+unsigned long __init e820_end_of_low_ram_pfn(void)
2551+{
2552+ return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
2553+}
2554+/*
2555+ * Finds an active region in the address range from start_pfn to last_pfn and
2556+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
2557+ */
2558+int __init e820_find_active_region(const struct e820entry *ei,
2559+ unsigned long start_pfn,
2560+ unsigned long last_pfn,
2561+ unsigned long *ei_startpfn,
2562+ unsigned long *ei_endpfn)
2563+{
2564+ u64 align = PAGE_SIZE;
2565+
2566+ *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
2567+ *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
2568+
2569+ /* Skip map entries smaller than a page */
2570+ if (*ei_startpfn >= *ei_endpfn)
2571+ return 0;
2572+
2573+ /* Skip if map is outside the node */
2574+ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
2575+ *ei_startpfn >= last_pfn)
2576+ return 0;
2577+
2578+ /* Check for overlaps */
2579+ if (*ei_startpfn < start_pfn)
2580+ *ei_startpfn = start_pfn;
2581+ if (*ei_endpfn > last_pfn)
2582+ *ei_endpfn = last_pfn;
2583+
2584+ return 1;
2585+}
2586+
2587+/* Walk the e820 map and register active regions within a node */
2588+void __init e820_register_active_regions(int nid, unsigned long start_pfn,
2589+ unsigned long last_pfn)
2590+{
2591+ unsigned long ei_startpfn;
2592+ unsigned long ei_endpfn;
2593+ int i;
2594+
2595+ for (i = 0; i < e820.nr_map; i++)
2596+ if (e820_find_active_region(&e820.map[i],
2597+ start_pfn, last_pfn,
2598+ &ei_startpfn, &ei_endpfn))
2599+ add_active_range(nid, ei_startpfn, ei_endpfn);
2600+}
2601+
2602+/*
2603+ * Find the hole size (in bytes) in the memory range.
2604+ * @start: starting address of the memory range to scan
2605+ * @end: ending address of the memory range to scan
2606+ */
2607+u64 __init e820_hole_size(u64 start, u64 end)
2608+{
2609+ unsigned long start_pfn = start >> PAGE_SHIFT;
2610+ unsigned long last_pfn = end >> PAGE_SHIFT;
2611+ unsigned long ei_startpfn, ei_endpfn, ram = 0;
2612+ int i;
2613+
2614+ for (i = 0; i < e820.nr_map; i++) {
2615+ if (e820_find_active_region(&e820.map[i],
2616+ start_pfn, last_pfn,
2617+ &ei_startpfn, &ei_endpfn))
2618+ ram += ei_endpfn - ei_startpfn;
2619+ }
2620+ return end - start - ((u64)ram << PAGE_SHIFT);
2621+}
2622+
2623+static void early_panic(char *msg)
2624+{
2625+ early_printk(msg);
2626+ panic(msg);
2627+}
2628+
2629+static int userdef __initdata;
2630+
2631+/* "mem=nopentium" disables the 4MB page tables. */
2632+static int __init parse_memopt(char *p)
2633+{
2634+ u64 mem_size, current_end;
2635+ unsigned int i;
2636+
2637+ if (!p)
2638+ return -EINVAL;
2639+
2640+#ifdef CONFIG_X86_32
2641+ if (!strcmp(p, "nopentium")) {
2642+ setup_clear_cpu_cap(X86_FEATURE_PSE);
2643+ return 0;
2644+ }
2645+#endif
2646+
2647+ userdef = 1;
2648+ mem_size = memparse(p, &p);
2649+ e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2650+
2651+ i = e820.nr_map - 1;
2652+ current_end = e820.map[i].addr + e820.map[i].size;
2653+ if (current_end < mem_size) {
2654+ /*
2655+ * The e820 map ends before our requested size so
2656+ * extend the final entry to the requested address.
2657+ */
2658+ if (e820.map[i].type == E820_RAM)
2659+ e820.map[i].size = mem_size - e820.map[i].addr;
2660+ else
2661+ e820_add_region(current_end, mem_size - current_end, E820_RAM);
2662+ }
2663+
2664+ return 0;
2665+}
2666+early_param("mem", parse_memopt);
2667+
2668+#ifndef CONFIG_XEN
2669+static int __init parse_memmap_opt(char *p)
2670+{
2671+ char *oldp;
2672+ u64 start_at, mem_size;
2673+
2674+ if (!p)
2675+ return -EINVAL;
2676+
2677+ if (!strncmp(p, "exactmap", 8)) {
2678+#ifdef CONFIG_CRASH_DUMP
2679+ /*
2680+ * If we are doing a crash dump, we still need to know
2681+ * the real mem size before original memory map is
2682+ * reset.
2683+ */
2684+ saved_max_pfn = e820_end_of_ram_pfn();
2685+#endif
2686+ e820.nr_map = 0;
2687+ userdef = 1;
2688+ return 0;
2689+ }
2690+
2691+ oldp = p;
2692+ mem_size = memparse(p, &p);
2693+ if (p == oldp)
2694+ return -EINVAL;
2695+
2696+ userdef = 1;
2697+ if (*p == '@') {
2698+ start_at = memparse(p+1, &p);
2699+ e820_add_region(start_at, mem_size, E820_RAM);
2700+ } else if (*p == '#') {
2701+ start_at = memparse(p+1, &p);
2702+ e820_add_region(start_at, mem_size, E820_ACPI);
2703+ } else if (*p == '$') {
2704+ start_at = memparse(p+1, &p);
2705+ e820_add_region(start_at, mem_size, E820_RESERVED);
2706+ } else
2707+ e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2708+
2709+ return *p == '\0' ? 0 : -EINVAL;
2710+}
2711+early_param("memmap", parse_memmap_opt);
2712+
2713+void __init finish_e820_parsing(void)
2714+{
2715+ if (userdef) {
2716+ int nr = e820.nr_map;
2717+
2718+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
2719+ early_panic("Invalid user supplied memory map");
2720+ e820.nr_map = nr;
2721+
2722+ printk(KERN_INFO "user-defined physical RAM map:\n");
2723+ e820_print_map("user");
2724+ }
2725+}
2726+#endif
2727+
2728+static inline const char *e820_type_to_string(int e820_type)
2729+{
2730+ switch (e820_type) {
2731+ case E820_RESERVED_KERN:
2732+ case E820_RAM: return "System RAM";
2733+ case E820_ACPI: return "ACPI Tables";
2734+ case E820_NVS: return "ACPI Non-volatile Storage";
2735+ default: return "reserved";
2736+ }
2737+}
2738+
2739+#ifdef CONFIG_XEN
2740+#define e820 machine_e820
2741+#endif
2742+
2743+/*
2744+ * Mark e820 reserved areas as busy for the resource manager.
2745+ */
2746+void __init e820_reserve_resources(void)
2747+{
2748+ int i;
2749+ struct resource *res;
2750+ u64 end;
2751+
2752+ res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
2753+ for (i = 0; i < e820.nr_map; i++) {
2754+ end = e820.map[i].addr + e820.map[i].size - 1;
2755+#ifndef CONFIG_RESOURCES_64BIT
2756+ if (end > 0x100000000ULL) {
2757+ res++;
2758+ continue;
2759+ }
2760+#endif
2761+ res->name = e820_type_to_string(e820.map[i].type);
2762+ res->start = e820.map[i].addr;
2763+ res->end = end;
2764+
2765+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
2766+ insert_resource(&iomem_resource, res);
2767+ res++;
2768+ }
2769+
2770+ for (i = 0; i < e820_saved.nr_map; i++) {
2771+ struct e820entry *entry = &e820_saved.map[i];
2772+ firmware_map_add_early(entry->addr,
2773+ entry->addr + entry->size - 1,
2774+ e820_type_to_string(entry->type));
2775+ }
2776+}
2777+
2778+#undef e820
2779+
2780+#ifndef CONFIG_XEN
2781+char *__init default_machine_specific_memory_setup(void)
2782+{
2783+ char *who = "BIOS-e820";
2784+ int new_nr;
2785+ /*
2786+ * Try to copy the BIOS-supplied E820-map.
2787+ *
2788+ * Otherwise fake a memory map; one section from 0k->640k,
2789+ * the next section from 1mb->appropriate_mem_k
2790+ */
2791+ new_nr = boot_params.e820_entries;
2792+ sanitize_e820_map(boot_params.e820_map,
2793+ ARRAY_SIZE(boot_params.e820_map),
2794+ &new_nr);
2795+ boot_params.e820_entries = new_nr;
2796+ if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
2797+ < 0) {
2798+ u64 mem_size;
2799+
2800+ /* compare results from other methods and take the greater */
2801+ if (boot_params.alt_mem_k
2802+ < boot_params.screen_info.ext_mem_k) {
2803+ mem_size = boot_params.screen_info.ext_mem_k;
2804+ who = "BIOS-88";
2805+ } else {
2806+ mem_size = boot_params.alt_mem_k;
2807+ who = "BIOS-e801";
2808+ }
2809+
2810+ e820.nr_map = 0;
2811+ e820_add_region(0, LOWMEMSIZE(), E820_RAM);
2812+ e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
2813+ }
2814+
2815+ /* In case someone cares... */
2816+ return who;
2817+}
2818+
2819+char *__init __attribute__((weak)) machine_specific_memory_setup(void)
2820+{
2821+ if (x86_quirks->arch_memory_setup) {
2822+ char *who = x86_quirks->arch_memory_setup();
2823+
2824+ if (who)
2825+ return who;
2826+ }
2827+ return default_machine_specific_memory_setup();
2828+}
2829+#endif
2830+
2831+char * __init memory_setup(void)
2832+{
2833+ int rc, nr_map;
2834+ struct xen_memory_map memmap;
2835+ /*
2836+ * This is rather large for a stack variable but this early in
2837+	 * the boot process we know we have plenty of slack space.
2838+ */
2839+ struct e820entry map[E820MAX];
2840+
2841+ memmap.nr_entries = E820MAX;
2842+ set_xen_guest_handle(memmap.buffer, map);
2843+
2844+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
2845+ if (rc == -ENOSYS) {
2846+ memmap.nr_entries = 1;
2847+ map[0].addr = 0ULL;
2848+ map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
2849+ /* 8MB slack (to balance backend allocations). */
2850+ map[0].size += 8ULL << 20;
2851+ map[0].type = E820_RAM;
2852+ rc = 0;
2853+ }
2854+ BUG_ON(rc);
2855+
2856+ nr_map = memmap.nr_entries;
2857+ sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
2858+
2859+ if (append_e820_map(map, nr_map) < 0)
2860+ BUG();
2861+
2862+#ifdef CONFIG_XEN
2863+ if (is_initial_xendomain()) {
2864+ memmap.nr_entries = E820MAX;
2865+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
2866+
2867+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
2868+ BUG();
2869+ machine_e820.nr_map = memmap.nr_entries;
2870+ } else
2871+ machine_e820 = e820;
2872+#endif
2873+
2874+ return "Xen";
2875+}
2876+
2877+void __init setup_memory_map(void)
2878+{
2879+ char *who;
2880+
2881+ who = memory_setup();
2882+ memcpy(&e820_saved, &e820, sizeof(struct e820map));
2883+ printk(KERN_INFO "Xen-provided physical RAM map:\n");
2884+ e820_print_map(who);
2885+}
2886Index: head-2008-12-01/arch/x86/kernel/e820_32-xen.c
2887===================================================================
2888--- head-2008-12-01.orig/arch/x86/kernel/e820_32-xen.c 2008-12-01 11:44:55.000000000 +0100
2889+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
2890@@ -1,873 +0,0 @@
2891-#include <linux/kernel.h>
2892-#include <linux/types.h>
2893-#include <linux/init.h>
2894-#include <linux/bootmem.h>
2895-#include <linux/ioport.h>
2896-#include <linux/string.h>
2897-#include <linux/kexec.h>
2898-#include <linux/module.h>
2899-#include <linux/mm.h>
2900-#include <linux/pfn.h>
2901-#include <linux/uaccess.h>
2902-#include <linux/suspend.h>
2903-
2904-#include <asm/pgtable.h>
2905-#include <asm/page.h>
2906-#include <asm/e820.h>
2907-#include <asm/setup.h>
2908-#include <xen/interface/memory.h>
2909-
2910-struct e820map e820;
2911-struct change_member {
2912- struct e820entry *pbios; /* pointer to original bios entry */
2913- unsigned long long addr; /* address for this change point */
2914-};
2915-static struct change_member change_point_list[2*E820MAX] __initdata;
2916-static struct change_member *change_point[2*E820MAX] __initdata;
2917-static struct e820entry *overlap_list[E820MAX] __initdata;
2918-static struct e820entry new_bios[E820MAX] __initdata;
2919-/* For PCI or other memory-mapped resources */
2920-unsigned long pci_mem_start = 0x10000000;
2921-#ifdef CONFIG_PCI
2922-EXPORT_SYMBOL(pci_mem_start);
2923-#endif
2924-extern int user_defined_memmap;
2925-
2926-static struct resource system_rom_resource = {
2927- .name = "System ROM",
2928- .start = 0xf0000,
2929- .end = 0xfffff,
2930- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2931-};
2932-
2933-static struct resource extension_rom_resource = {
2934- .name = "Extension ROM",
2935- .start = 0xe0000,
2936- .end = 0xeffff,
2937- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2938-};
2939-
2940-static struct resource adapter_rom_resources[] = { {
2941- .name = "Adapter ROM",
2942- .start = 0xc8000,
2943- .end = 0,
2944- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2945-}, {
2946- .name = "Adapter ROM",
2947- .start = 0,
2948- .end = 0,
2949- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2950-}, {
2951- .name = "Adapter ROM",
2952- .start = 0,
2953- .end = 0,
2954- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2955-}, {
2956- .name = "Adapter ROM",
2957- .start = 0,
2958- .end = 0,
2959- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2960-}, {
2961- .name = "Adapter ROM",
2962- .start = 0,
2963- .end = 0,
2964- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2965-}, {
2966- .name = "Adapter ROM",
2967- .start = 0,
2968- .end = 0,
2969- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2970-} };
2971-
2972-static struct resource video_rom_resource = {
2973- .name = "Video ROM",
2974- .start = 0xc0000,
2975- .end = 0xc7fff,
2976- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2977-};
2978-
2979-#define ROMSIGNATURE 0xaa55
2980-
2981-static int __init romsignature(const unsigned char *rom)
2982-{
2983- const unsigned short * const ptr = (const unsigned short *)rom;
2984- unsigned short sig;
2985-
2986- return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
2987-}
2988-
2989-static int __init romchecksum(const unsigned char *rom, unsigned long length)
2990-{
2991- unsigned char sum, c;
2992-
2993- for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
2994- sum += c;
2995- return !length && !sum;
2996-}
2997-
2998-static void __init probe_roms(void)
2999-{
3000- const unsigned char *rom;
3001- unsigned long start, length, upper;
3002- unsigned char c;
3003- int i;
3004-
3005-#ifdef CONFIG_XEN
3006- /* Nothing to do if not running in dom0. */
3007- if (!is_initial_xendomain())
3008- return;
3009-#endif
3010-
3011- /* video rom */
3012- upper = adapter_rom_resources[0].start;
3013- for (start = video_rom_resource.start; start < upper; start += 2048) {
3014- rom = isa_bus_to_virt(start);
3015- if (!romsignature(rom))
3016- continue;
3017-
3018- video_rom_resource.start = start;
3019-
3020- if (probe_kernel_address(rom + 2, c) != 0)
3021- continue;
3022-
3023- /* 0 < length <= 0x7f * 512, historically */
3024- length = c * 512;
3025-
3026- /* if checksum okay, trust length byte */
3027- if (length && romchecksum(rom, length))
3028- video_rom_resource.end = start + length - 1;
3029-
3030- request_resource(&iomem_resource, &video_rom_resource);
3031- break;
3032- }
3033-
3034- start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
3035- if (start < upper)
3036- start = upper;
3037-
3038- /* system rom */
3039- request_resource(&iomem_resource, &system_rom_resource);
3040- upper = system_rom_resource.start;
3041-
3042- /* check for extension rom (ignore length byte!) */
3043- rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
3044- if (romsignature(rom)) {
3045- length = extension_rom_resource.end - extension_rom_resource.start + 1;
3046- if (romchecksum(rom, length)) {
3047- request_resource(&iomem_resource, &extension_rom_resource);
3048- upper = extension_rom_resource.start;
3049- }
3050- }
3051-
3052- /* check for adapter roms on 2k boundaries */
3053- for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
3054- rom = isa_bus_to_virt(start);
3055- if (!romsignature(rom))
3056- continue;
3057-
3058- if (probe_kernel_address(rom + 2, c) != 0)
3059- continue;
3060-
3061- /* 0 < length <= 0x7f * 512, historically */
3062- length = c * 512;
3063-
3064- /* but accept any length that fits if checksum okay */
3065- if (!length || start + length > upper || !romchecksum(rom, length))
3066- continue;
3067-
3068- adapter_rom_resources[i].start = start;
3069- adapter_rom_resources[i].end = start + length - 1;
3070- request_resource(&iomem_resource, &adapter_rom_resources[i]);
3071-
3072- start = adapter_rom_resources[i++].end & ~2047UL;
3073- }
3074-}
3075-
3076-#ifdef CONFIG_XEN
3077-static struct e820map machine_e820;
3078-#define e820 machine_e820
3079-#endif
3080-
3081-/*
3082- * Request address space for all standard RAM and ROM resources
3083- * and also for regions reported as reserved by the e820.
3084- */
3085-void __init init_iomem_resources(struct resource *code_resource,
3086- struct resource *data_resource,
3087- struct resource *bss_resource)
3088-{
3089- int i;
3090-
3091- probe_roms();
3092- for (i = 0; i < e820.nr_map; i++) {
3093- struct resource *res;
3094-#ifndef CONFIG_RESOURCES_64BIT
3095- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
3096- continue;
3097-#endif
3098- res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
3099- switch (e820.map[i].type) {
3100- case E820_RAM: res->name = "System RAM"; break;
3101- case E820_ACPI: res->name = "ACPI Tables"; break;
3102- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
3103- default: res->name = "reserved";
3104- }
3105- res->start = e820.map[i].addr;
3106- res->end = res->start + e820.map[i].size - 1;
3107- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3108- if (request_resource(&iomem_resource, res)) {
3109- kfree(res);
3110- continue;
3111- }
3112- if (e820.map[i].type == E820_RAM) {
3113- /*
3114- * We don't know which RAM region contains kernel data,
3115- * so we try it repeatedly and let the resource manager
3116- * test it.
3117- */
3118-#ifndef CONFIG_XEN
3119- request_resource(res, code_resource);
3120- request_resource(res, data_resource);
3121- request_resource(res, bss_resource);
3122-#endif
3123-#ifdef CONFIG_KEXEC
3124- if (crashk_res.start != crashk_res.end)
3125- request_resource(res, &crashk_res);
3126-#ifdef CONFIG_XEN
3127- xen_machine_kexec_register_resources(res);
3128-#endif
3129-#endif
3130- }
3131- }
3132-}
3133-
3134-#undef e820
3135-
3136-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
3137-/**
3138- * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
3139- * correspond to e820 RAM areas and mark the corresponding pages as nosave for
3140- * hibernation.
3141- *
3142- * This function requires the e820 map to be sorted and without any
3143- * overlapping entries and assumes the first e820 area to be RAM.
3144- */
3145-void __init e820_mark_nosave_regions(void)
3146-{
3147- int i;
3148- unsigned long pfn;
3149-
3150- pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
3151- for (i = 1; i < e820.nr_map; i++) {
3152- struct e820entry *ei = &e820.map[i];
3153-
3154- if (pfn < PFN_UP(ei->addr))
3155- register_nosave_region(pfn, PFN_UP(ei->addr));
3156-
3157- pfn = PFN_DOWN(ei->addr + ei->size);
3158- if (ei->type != E820_RAM)
3159- register_nosave_region(PFN_UP(ei->addr), pfn);
3160-
3161- if (pfn >= max_low_pfn)
3162- break;
3163- }
3164-}
3165-#endif
3166-
3167-void __init add_memory_region(unsigned long long start,
3168- unsigned long long size, int type)
3169-{
3170- int x;
3171-
3172- x = e820.nr_map;
3173-
3174- if (x == E820MAX) {
3175- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
3176- return;
3177- }
3178-
3179- e820.map[x].addr = start;
3180- e820.map[x].size = size;
3181- e820.map[x].type = type;
3182- e820.nr_map++;
3183-} /* add_memory_region */
3184-
3185-/*
3186- * Sanitize the BIOS e820 map.
3187- *
3188- * Some e820 responses include overlapping entries. The following
3189- * replaces the original e820 map with a new one, removing overlaps.
3190- *
3191- */
3192-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
3193-{
3194- struct change_member *change_tmp;
3195- unsigned long current_type, last_type;
3196- unsigned long long last_addr;
3197- int chgidx, still_changing;
3198- int overlap_entries;
3199- int new_bios_entry;
3200- int old_nr, new_nr, chg_nr;
3201- int i;
3202-
3203- /*
3204- Visually we're performing the following (1,2,3,4 = memory types)...
3205-
3206- Sample memory map (w/overlaps):
3207- ____22__________________
3208- ______________________4_
3209- ____1111________________
3210- _44_____________________
3211- 11111111________________
3212- ____________________33__
3213- ___________44___________
3214- __________33333_________
3215- ______________22________
3216- ___________________2222_
3217- _________111111111______
3218- _____________________11_
3219- _________________4______
3220-
3221- Sanitized equivalent (no overlap):
3222- 1_______________________
3223- _44_____________________
3224- ___1____________________
3225- ____22__________________
3226- ______11________________
3227- _________1______________
3228- __________3_____________
3229- ___________44___________
3230- _____________33_________
3231- _______________2________
3232- ________________1_______
3233- _________________4______
3234- ___________________2____
3235- ____________________33__
3236- ______________________4_
3237- */
3238- /* if there's only one memory region, don't bother */
3239- if (*pnr_map < 2) {
3240- return -1;
3241- }
3242-
3243- old_nr = *pnr_map;
3244-
3245- /* bail out if we find any unreasonable addresses in bios map */
3246- for (i=0; i<old_nr; i++)
3247- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
3248- return -1;
3249- }
3250-
3251- /* create pointers for initial change-point information (for sorting) */
3252- for (i=0; i < 2*old_nr; i++)
3253- change_point[i] = &change_point_list[i];
3254-
3255- /* record all known change-points (starting and ending addresses),
3256- omitting those that are for empty memory regions */
3257- chgidx = 0;
3258- for (i=0; i < old_nr; i++) {
3259- if (biosmap[i].size != 0) {
3260- change_point[chgidx]->addr = biosmap[i].addr;
3261- change_point[chgidx++]->pbios = &biosmap[i];
3262- change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
3263- change_point[chgidx++]->pbios = &biosmap[i];
3264- }
3265- }
3266- chg_nr = chgidx; /* true number of change-points */
3267-
3268- /* sort change-point list by memory addresses (low -> high) */
3269- still_changing = 1;
3270- while (still_changing) {
3271- still_changing = 0;
3272- for (i=1; i < chg_nr; i++) {
3273- /* if <current_addr> > <last_addr>, swap */
3274- /* or, if current=<start_addr> & last=<end_addr>, swap */
3275- if ((change_point[i]->addr < change_point[i-1]->addr) ||
3276- ((change_point[i]->addr == change_point[i-1]->addr) &&
3277- (change_point[i]->addr == change_point[i]->pbios->addr) &&
3278- (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
3279- )
3280- {
3281- change_tmp = change_point[i];
3282- change_point[i] = change_point[i-1];
3283- change_point[i-1] = change_tmp;
3284- still_changing=1;
3285- }
3286- }
3287- }
3288-
3289- /* create a new bios memory map, removing overlaps */
3290- overlap_entries=0; /* number of entries in the overlap table */
3291- new_bios_entry=0; /* index for creating new bios map entries */
3292- last_type = 0; /* start with undefined memory type */
3293- last_addr = 0; /* start with 0 as last starting address */
3294- /* loop through change-points, determining affect on the new bios map */
3295- for (chgidx=0; chgidx < chg_nr; chgidx++)
3296- {
3297- /* keep track of all overlapping bios entries */
3298- if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
3299- {
3300- /* add map entry to overlap list (> 1 entry implies an overlap) */
3301- overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
3302- }
3303- else
3304- {
3305- /* remove entry from list (order independent, so swap with last) */
3306- for (i=0; i<overlap_entries; i++)
3307- {
3308- if (overlap_list[i] == change_point[chgidx]->pbios)
3309- overlap_list[i] = overlap_list[overlap_entries-1];
3310- }
3311- overlap_entries--;
3312- }
3313- /* if there are overlapping entries, decide which "type" to use */
3314- /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
3315- current_type = 0;
3316- for (i=0; i<overlap_entries; i++)
3317- if (overlap_list[i]->type > current_type)
3318- current_type = overlap_list[i]->type;
3319- /* continue building up new bios map based on this information */
3320- if (current_type != last_type) {
3321- if (last_type != 0) {
3322- new_bios[new_bios_entry].size =
3323- change_point[chgidx]->addr - last_addr;
3324- /* move forward only if the new size was non-zero */
3325- if (new_bios[new_bios_entry].size != 0)
3326- if (++new_bios_entry >= E820MAX)
3327- break; /* no more space left for new bios entries */
3328- }
3329- if (current_type != 0) {
3330- new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
3331- new_bios[new_bios_entry].type = current_type;
3332- last_addr=change_point[chgidx]->addr;
3333- }
3334- last_type = current_type;
3335- }
3336- }
3337- new_nr = new_bios_entry; /* retain count for new bios entries */
3338-
3339- /* copy new bios mapping into original location */
3340- memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
3341- *pnr_map = new_nr;
3342-
3343- return 0;
3344-}
3345-
3346-/*
3347- * Copy the BIOS e820 map into a safe place.
3348- *
3349- * Sanity-check it while we're at it..
3350- *
3351- * If we're lucky and live on a modern system, the setup code
3352- * will have given us a memory map that we can use to properly
3353- * set up memory. If we aren't, we'll fake a memory map.
3354- *
3355- * We check to see that the memory map contains at least 2 elements
3356- * before we'll use it, because the detection code in setup.S may
3357- * not be perfect and most every PC known to man has two memory
3358- * regions: one from 0 to 640k, and one from 1mb up. (The IBM
3359- * thinkpad 560x, for example, does not cooperate with the memory
3360- * detection code.)
3361- */
3362-int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
3363-{
3364-#ifndef CONFIG_XEN
3365- /* Only one memory region (or negative)? Ignore it */
3366- if (nr_map < 2)
3367- return -1;
3368-#else
3369- BUG_ON(nr_map < 1);
3370-#endif
3371-
3372- do {
3373- u64 start = biosmap->addr;
3374- u64 size = biosmap->size;
3375- u64 end = start + size;
3376- u32 type = biosmap->type;
3377-
3378- /* Overflow in 64 bits? Ignore the memory map. */
3379- if (start > end)
3380- return -1;
3381-
3382- add_memory_region(start, size, type);
3383- } while (biosmap++, --nr_map);
3384-
3385-#ifdef CONFIG_XEN
3386- if (is_initial_xendomain()) {
3387- struct xen_memory_map memmap;
3388-
3389- memmap.nr_entries = E820MAX;
3390- set_xen_guest_handle(memmap.buffer, machine_e820.map);
3391-
3392- if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3393- BUG();
3394- machine_e820.nr_map = memmap.nr_entries;
3395- } else
3396- machine_e820 = e820;
3397-#endif
3398-
3399- return 0;
3400-}
3401-
3402-/*
3403- * Find the highest page frame number we have available
3404- */
3405-void __init propagate_e820_map(void)
3406-{
3407- int i;
3408-
3409- max_pfn = 0;
3410-
3411- for (i = 0; i < e820.nr_map; i++) {
3412- unsigned long start, end;
3413- /* RAM? */
3414- if (e820.map[i].type != E820_RAM)
3415- continue;
3416- start = PFN_UP(e820.map[i].addr);
3417- end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3418- if (start >= end)
3419- continue;
3420- if (end > max_pfn)
3421- max_pfn = end;
3422- memory_present(0, start, end);
3423- }
3424-}
3425-
3426-/*
3427- * Register fully available low RAM pages with the bootmem allocator.
3428- */
3429-void __init register_bootmem_low_pages(unsigned long max_low_pfn)
3430-{
3431- int i;
3432-
3433- for (i = 0; i < e820.nr_map; i++) {
3434- unsigned long curr_pfn, last_pfn, size;
3435- /*
3436- * Reserve usable low memory
3437- */
3438- if (e820.map[i].type != E820_RAM)
3439- continue;
3440- /*
3441- * We are rounding up the start address of usable memory:
3442- */
3443- curr_pfn = PFN_UP(e820.map[i].addr);
3444- if (curr_pfn >= max_low_pfn)
3445- continue;
3446- /*
3447- * ... and at the end of the usable range downwards:
3448- */
3449- last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3450-
3451-#ifdef CONFIG_XEN
3452- /*
3453- * Truncate to the number of actual pages currently
3454- * present.
3455- */
3456- if (last_pfn > xen_start_info->nr_pages)
3457- last_pfn = xen_start_info->nr_pages;
3458-#endif
3459-
3460- if (last_pfn > max_low_pfn)
3461- last_pfn = max_low_pfn;
3462-
3463- /*
3464- * .. finally, did all the rounding and playing
3465- * around just make the area go away?
3466- */
3467- if (last_pfn <= curr_pfn)
3468- continue;
3469-
3470- size = last_pfn - curr_pfn;
3471- free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
3472- }
3473-}
3474-
3475-void __init e820_register_memory(void)
3476-{
3477- unsigned long gapstart, gapsize, round;
3478- unsigned long long last;
3479- int i;
3480-
3481-#ifdef CONFIG_XEN
3482- if (is_initial_xendomain()) {
3483- struct xen_memory_map memmap;
3484-
3485- memmap.nr_entries = E820MAX;
3486- set_xen_guest_handle(memmap.buffer, machine_e820.map);
3487-
3488- if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3489- BUG();
3490- machine_e820.nr_map = memmap.nr_entries;
3491- }
3492- else
3493- machine_e820 = e820;
3494-#define e820 machine_e820
3495-#endif
3496-
3497- /*
3498- * Search for the biggest gap in the low 32 bits of the e820
3499- * memory space.
3500- */
3501- last = 0x100000000ull;
3502- gapstart = 0x10000000;
3503- gapsize = 0x400000;
3504- i = e820.nr_map;
3505- while (--i >= 0) {
3506- unsigned long long start = e820.map[i].addr;
3507- unsigned long long end = start + e820.map[i].size;
3508-
3509- /*
3510- * Since "last" is at most 4GB, we know we'll
3511- * fit in 32 bits if this condition is true
3512- */
3513- if (last > end) {
3514- unsigned long gap = last - end;
3515-
3516- if (gap > gapsize) {
3517- gapsize = gap;
3518- gapstart = end;
3519- }
3520- }
3521- if (start < last)
3522- last = start;
3523- }
3524-#undef e820
3525-
3526- /*
3527- * See how much we want to round up: start off with
3528- * rounding to the next 1MB area.
3529- */
3530- round = 0x100000;
3531- while ((gapsize >> 4) > round)
3532- round += round;
3533- /* Fun with two's complement */
3534- pci_mem_start = (gapstart + round) & -round;
3535-
3536- printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
3537- pci_mem_start, gapstart, gapsize);
3538-}
3539-
3540-void __init print_memory_map(char *who)
3541-{
3542- int i;
3543-
3544- for (i = 0; i < e820.nr_map; i++) {
3545- printk(" %s: %016Lx - %016Lx ", who,
3546- e820.map[i].addr,
3547- e820.map[i].addr + e820.map[i].size);
3548- switch (e820.map[i].type) {
3549- case E820_RAM: printk("(usable)\n");
3550- break;
3551- case E820_RESERVED:
3552- printk("(reserved)\n");
3553- break;
3554- case E820_ACPI:
3555- printk("(ACPI data)\n");
3556- break;
3557- case E820_NVS:
3558- printk("(ACPI NVS)\n");
3559- break;
3560- default: printk("type %u\n", e820.map[i].type);
3561- break;
3562- }
3563- }
3564-}
3565-
3566-void __init limit_regions(unsigned long long size)
3567-{
3568- unsigned long long current_addr = 0;
3569- int i;
3570-
3571- print_memory_map("limit_regions start");
3572- for (i = 0; i < e820.nr_map; i++) {
3573- current_addr = e820.map[i].addr + e820.map[i].size;
3574- if (current_addr < size)
3575- continue;
3576-
3577- if (e820.map[i].type != E820_RAM)
3578- continue;
3579-
3580- if (e820.map[i].addr >= size) {
3581- /*
3582- * This region starts past the end of the
3583- * requested size, skip it completely.
3584- */
3585- e820.nr_map = i;
3586- } else {
3587- e820.nr_map = i + 1;
3588- e820.map[i].size -= current_addr - size;
3589- }
3590- print_memory_map("limit_regions endfor");
3591- return;
3592- }
3593-#ifdef CONFIG_XEN
3594- if (current_addr < size) {
3595- /*
3596- * The e820 map finished before our requested size so
3597- * extend the final entry to the requested address.
3598- */
3599- --i;
3600- if (e820.map[i].type == E820_RAM)
3601- e820.map[i].size -= current_addr - size;
3602- else
3603- add_memory_region(current_addr, size - current_addr, E820_RAM);
3604- }
3605-#endif
3606- print_memory_map("limit_regions endfunc");
3607-}
3608-
3609-/*
3610- * This function checks if any part of the range <start,end> is mapped
3611- * with type.
3612- */
3613-int
3614-e820_any_mapped(u64 start, u64 end, unsigned type)
3615-{
3616- int i;
3617-
3618-#ifndef CONFIG_XEN
3619- for (i = 0; i < e820.nr_map; i++) {
3620- const struct e820entry *ei = &e820.map[i];
3621-#else
3622- if (!is_initial_xendomain())
3623- return 0;
3624- for (i = 0; i < machine_e820.nr_map; ++i) {
3625- const struct e820entry *ei = &machine_e820.map[i];
3626-#endif
3627-
3628- if (type && ei->type != type)
3629- continue;
3630- if (ei->addr >= end || ei->addr + ei->size <= start)
3631- continue;
3632- return 1;
3633- }
3634- return 0;
3635-}
3636-EXPORT_SYMBOL_GPL(e820_any_mapped);
3637-
3638- /*
3639- * This function checks if the entire range <start,end> is mapped with type.
3640- *
3641- * Note: this function only works correct if the e820 table is sorted and
3642- * not-overlapping, which is the case
3643- */
3644-int __init
3645-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
3646-{
3647- u64 start = s;
3648- u64 end = e;
3649- int i;
3650-
3651-#ifndef CONFIG_XEN
3652- for (i = 0; i < e820.nr_map; i++) {
3653- struct e820entry *ei = &e820.map[i];
3654-#else
3655- if (!is_initial_xendomain())
3656- return 0;
3657- for (i = 0; i < machine_e820.nr_map; ++i) {
3658- const struct e820entry *ei = &machine_e820.map[i];
3659-#endif
3660-
3661- if (type && ei->type != type)
3662- continue;
3663- /* is the region (part) in overlap with the current region ?*/
3664- if (ei->addr >= end || ei->addr + ei->size <= start)
3665- continue;
3666- /* if the region covers the beginning of <start,end>, move start
3667- * to the end of the region, since everything up to there is covered
3668- */
3669- if (ei->addr <= start)
3670- start = ei->addr + ei->size;
3671- /* if start is now at or beyond end, we're done, full
3672- * coverage */
3673- if (start >= end)
3674- return 1; /* we're done */
3675- }
3676- return 0;
3677-}
3678-
3679-static int __init parse_memmap(char *arg)
3680-{
3681- if (!arg)
3682- return -EINVAL;
3683-
3684- if (strcmp(arg, "exactmap") == 0) {
3685-#ifdef CONFIG_CRASH_DUMP
3686- /* If we are doing a crash dump, we
3687- * still need to know the real mem
3688- * size before the original memory map is
3689- * reset.
3690- */
3691- propagate_e820_map();
3692- saved_max_pfn = max_pfn;
3693-#endif
3694- e820.nr_map = 0;
3695- user_defined_memmap = 1;
3696- } else {
3697- /* If the user specifies memory size, we
3698- * limit the BIOS-provided memory map to
3699- * that size. exactmap can be used to specify
3700- * the exact map. mem=number can be used to
3701- * trim the existing memory map.
3702- */
3703- unsigned long long start_at, mem_size;
3704-
3705- mem_size = memparse(arg, &arg);
3706- if (*arg == '@') {
3707- start_at = memparse(arg+1, &arg);
3708- add_memory_region(start_at, mem_size, E820_RAM);
3709- } else if (*arg == '#') {
3710- start_at = memparse(arg+1, &arg);
3711- add_memory_region(start_at, mem_size, E820_ACPI);
3712- } else if (*arg == '$') {
3713- start_at = memparse(arg+1, &arg);
3714- add_memory_region(start_at, mem_size, E820_RESERVED);
3715- } else {
3716- limit_regions(mem_size);
3717- user_defined_memmap = 1;
3718- }
3719- }
3720- return 0;
3721-}
3722-early_param("memmap", parse_memmap);
3723-
3724-#ifndef CONFIG_XEN
3725-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
3726- unsigned new_type)
3727-{
3728- int i;
3729-
3730- BUG_ON(old_type == new_type);
3731-
3732- for (i = 0; i < e820.nr_map; i++) {
3733- struct e820entry *ei = &e820.map[i];
3734- u64 final_start, final_end;
3735- if (ei->type != old_type)
3736- continue;
3737- /* totally covered? */
3738- if (ei->addr >= start && ei->size <= size) {
3739- ei->type = new_type;
3740- continue;
3741- }
3742- /* partially covered */
3743- final_start = max(start, ei->addr);
3744- final_end = min(start + size, ei->addr + ei->size);
3745- if (final_start >= final_end)
3746- continue;
3747- add_memory_region(final_start, final_end - final_start,
3748- new_type);
3749- }
3750-}
3751-
3752-void __init update_e820(void)
3753-{
3754- u8 nr_map;
3755-
3756- nr_map = e820.nr_map;
3757- if (sanitize_e820_map(e820.map, &nr_map))
3758- return;
3759- e820.nr_map = nr_map;
3760- printk(KERN_INFO "modified physical RAM map:\n");
3761- print_memory_map("modified");
3762-}
3763-#endif
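
The e820_any_mapped()/e820_all_mapped() helpers removed above share a single
interval test: an entry is disjoint from <start,end> when ei->addr >= end or
ei->addr + ei->size <= start. A minimal user-space sketch of that check,
where struct region and any_overlap() are simplified stand-ins for the
kernel's struct e820entry and the loop above rather than actual kernel code:

#include <stdint.h>
#include <stdio.h>

struct region { uint64_t addr, size; };

/* Returns 1 if [start, end) overlaps any region, mirroring the
 * "ei->addr >= end || ei->addr + ei->size <= start" check above. */
static int any_overlap(const struct region *map, int nr,
		       uint64_t start, uint64_t end)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (map[i].addr >= end || map[i].addr + map[i].size <= start)
			continue;	/* entirely after or entirely before */
		return 1;
	}
	return 0;
}

int main(void)
{
	struct region map[] = { { 0x0, 0xa0000 }, { 0x100000, 0x7ff00000 } };

	printf("%d\n", any_overlap(map, 2, 0x9f000, 0xa1000));  /* 1: hits entry 0 */
	printf("%d\n", any_overlap(map, 2, 0xa0000, 0x100000)); /* 0: falls in the hole */
	return 0;
}
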
3764Index: head-2008-12-01/arch/x86/kernel/e820_64-xen.c
3765===================================================================
3766--- head-2008-12-01.orig/arch/x86/kernel/e820_64-xen.c 2008-12-01 11:44:55.000000000 +0100
3767+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3768@@ -1,1045 +0,0 @@
3769-/*
3770- * Handle the memory map.
3771- * The functions here do the job until bootmem takes over.
3772- *
3773- * Getting sanitize_e820_map() in sync with i386 version by applying change:
3774- * - Provisions for empty E820 memory regions (reported by certain BIOSes).
3775- * Alex Achenbach <xela@slit.de>, December 2002.
3776- * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
3777- *
3778- */
3779-#include <linux/kernel.h>
3780-#include <linux/types.h>
3781-#include <linux/init.h>
3782-#include <linux/bootmem.h>
3783-#include <linux/ioport.h>
3784-#include <linux/string.h>
3785-#include <linux/kexec.h>
3786-#include <linux/module.h>
3787-#include <linux/mm.h>
3788-#include <linux/suspend.h>
3789-#include <linux/pfn.h>
3790-
3791-#include <asm/pgtable.h>
3792-#include <asm/page.h>
3793-#include <asm/e820.h>
3794-#include <asm/proto.h>
3795-#include <asm/setup.h>
3796-#include <asm/sections.h>
3797-#include <asm/kdebug.h>
3798-#include <xen/interface/memory.h>
3799-
3800-struct e820map e820 __initdata;
3801-#ifdef CONFIG_XEN
3802-struct e820map machine_e820;
3803-#endif
3804-
3805-/*
3806- * PFN of last memory page.
3807- */
3808-unsigned long end_pfn;
3809-
3810-/*
3811- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
3812- * The direct mapping extends to max_pfn_mapped, so that we can directly access
3813- * apertures, ACPI and other tables without having to play with fixmaps.
3814- */
3815-unsigned long max_pfn_mapped;
3816-
3817-/*
3818- * Last pfn which the user wants to use.
3819- */
3820-static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
3821-
3822-/*
3823- * Early reserved memory areas.
3824- */
3825-#define MAX_EARLY_RES 20
3826-
3827-struct early_res {
3828- unsigned long start, end;
3829- char name[16];
3830-};
3831-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
3832-#ifndef CONFIG_XEN
3833- { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
3834-#ifdef CONFIG_X86_TRAMPOLINE
3835- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
3836-#endif
3837-#endif
3838- {}
3839-};
3840-
3841-void __init reserve_early(unsigned long start, unsigned long end, char *name)
3842-{
3843- int i;
3844- struct early_res *r;
3845- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3846- r = &early_res[i];
3847- if (end > r->start && start < r->end)
3848- panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
3849- start, end - 1, name?name:"", r->start, r->end - 1, r->name);
3850- }
3851- if (i >= MAX_EARLY_RES)
3852- panic("Too many early reservations");
3853- r = &early_res[i];
3854- r->start = start;
3855- r->end = end;
3856- if (name)
3857- strncpy(r->name, name, sizeof(r->name) - 1);
3858-}
3859-
3860-void __init free_early(unsigned long start, unsigned long end)
3861-{
3862- struct early_res *r;
3863- int i, j;
3864-
3865- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3866- r = &early_res[i];
3867- if (start == r->start && end == r->end)
3868- break;
3869- }
3870- if (i >= MAX_EARLY_RES || !early_res[i].end)
3871- panic("free_early on not reserved area: %lx-%lx!", start, end);
3872-
3873- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
3874- ;
3875-
3876- memmove(&early_res[i], &early_res[i + 1],
3877- (j - 1 - i) * sizeof(struct early_res));
3878-
3879- early_res[j - 1].end = 0;
3880-}
3881-
3882-void __init early_res_to_bootmem(unsigned long start, unsigned long end)
3883-{
3884- int i;
3885- unsigned long final_start, final_end;
3886- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3887- struct early_res *r = &early_res[i];
3888- final_start = max(start, r->start);
3889- final_end = min(end, r->end);
3890- if (final_start >= final_end)
3891- continue;
3892- printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
3893- final_start, final_end - 1, r->name);
3894- reserve_bootmem_generic(final_start, final_end - final_start);
3895- }
3896-}
3897-
3898-/* Check for already reserved areas */
3899-static inline int __init
3900-bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
3901-{
3902- int i;
3903- unsigned long addr = *addrp, last;
3904- int changed = 0;
3905-again:
3906- last = addr + size;
3907- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3908- struct early_res *r = &early_res[i];
3909- if (last >= r->start && addr < r->end) {
3910- *addrp = addr = round_up(r->end, align);
3911- changed = 1;
3912- goto again;
3913- }
3914- }
3915- return changed;
3916-}
3917-
3918-/* Check for already reserved areas */
3919-static inline int __init
3920-bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
3921-{
3922- int i;
3923- unsigned long addr = *addrp, last;
3924- unsigned long size = *sizep;
3925- int changed = 0;
3926-again:
3927- last = addr + size;
3928- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3929- struct early_res *r = &early_res[i];
3930- if (last > r->start && addr < r->start) {
3931- size = r->start - addr;
3932- changed = 1;
3933- goto again;
3934- }
3935- if (last > r->end && addr < r->end) {
3936- addr = round_up(r->end, align);
3937- size = last - addr;
3938- changed = 1;
3939- goto again;
3940- }
3941- if (last <= r->end && addr >= r->start) {
3942- (*sizep)++;
3943- return 0;
3944- }
3945- }
3946- if (changed) {
3947- *addrp = addr;
3948- *sizep = size;
3949- }
3950- return changed;
3951-}
3952-/*
3953- * This function checks if any part of the range <start,end> is mapped
3954- * with type.
3955- */
3956-int
3957-e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
3958-{
3959- int i;
3960-
3961-#ifndef CONFIG_XEN
3962- for (i = 0; i < e820.nr_map; i++) {
3963- struct e820entry *ei = &e820.map[i];
3964-#else
3965- if (!is_initial_xendomain())
3966- return 0;
3967- for (i = 0; i < machine_e820.nr_map; i++) {
3968- const struct e820entry *ei = &machine_e820.map[i];
3969-#endif
3970-
3971- if (type && ei->type != type)
3972- continue;
3973- if (ei->addr >= end || ei->addr + ei->size <= start)
3974- continue;
3975- return 1;
3976- }
3977- return 0;
3978-}
3979-EXPORT_SYMBOL_GPL(e820_any_mapped);
3980-
3981-/*
3982- * This function checks if the entire range <start,end> is mapped with type.
3983- *
3984- * Note: this function only works correctly if the e820 table is sorted and
3985- * non-overlapping, which is the case
3986- */
3987-int __init e820_all_mapped(unsigned long start, unsigned long end,
3988- unsigned type)
3989-{
3990- int i;
3991-
3992-#ifndef CONFIG_XEN
3993- for (i = 0; i < e820.nr_map; i++) {
3994- struct e820entry *ei = &e820.map[i];
3995-#else
3996- if (!is_initial_xendomain())
3997- return 0;
3998- for (i = 0; i < machine_e820.nr_map; i++) {
3999- const struct e820entry *ei = &machine_e820.map[i];
4000-#endif
4001-
4002- if (type && ei->type != type)
4003- continue;
4004- /* does the region (at least partly) overlap with <start,end>? */
4005- if (ei->addr >= end || ei->addr + ei->size <= start)
4006- continue;
4007-
4008- /* if the region covers the beginning of <start,end>, move start
4009- * to the end of the region, since everything up to there is covered
4010- */
4011- if (ei->addr <= start)
4012- start = ei->addr + ei->size;
4013- /*
4014- * if start is now at or beyond end, we're done, full
4015- * coverage
4016- */
4017- if (start >= end)
4018- return 1;
4019- }
4020- return 0;
4021-}
4022-
4023-/*
4024- * Find a free area with specified alignment in a specific range.
4025- */
4026-unsigned long __init find_e820_area(unsigned long start, unsigned long end,
4027- unsigned long size, unsigned long align)
4028-{
4029- int i;
4030-
4031- for (i = 0; i < e820.nr_map; i++) {
4032- struct e820entry *ei = &e820.map[i];
4033- unsigned long addr, last;
4034- unsigned long ei_last;
4035-
4036- if (ei->type != E820_RAM)
4037- continue;
4038- addr = round_up(ei->addr, align);
4039- ei_last = ei->addr + ei->size;
4040- if (addr < start)
4041- addr = round_up(start, align);
4042- if (addr >= ei_last)
4043- continue;
4044- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
4045- ;
4046- last = addr + size;
4047- if (last > ei_last)
4048- continue;
4049- if (last > end)
4050- continue;
4051- return addr;
4052- }
4053- return -1UL;
4054-}
4055-
4056-/*
4057- * Find next free range after *start
4058- */
4059-unsigned long __init find_e820_area_size(unsigned long start,
4060- unsigned long *sizep,
4061- unsigned long align)
4062-{
4063- int i;
4064-
4065- for (i = 0; i < e820.nr_map; i++) {
4066- struct e820entry *ei = &e820.map[i];
4067- unsigned long addr, last;
4068- unsigned long ei_last;
4069-
4070- if (ei->type != E820_RAM)
4071- continue;
4072- addr = round_up(ei->addr, align);
4073- ei_last = ei->addr + ei->size;
4074- if (addr < start)
4075- addr = round_up(start, align);
4076- if (addr >= ei_last)
4077- continue;
4078- *sizep = ei_last - addr;
4079- while (bad_addr_size(&addr, sizep, align) &&
4080- addr + *sizep <= ei_last)
4081- ;
4082- last = addr + *sizep;
4083- if (last > ei_last)
4084- continue;
4085- return addr;
4086- }
4087- return -1UL;
4088-
4089-}
4090-/*
4091- * Find the highest page frame number we have available
4092- */
4093-unsigned long __init e820_end_of_ram(void)
4094-{
4095- unsigned long end_pfn;
4096-
4097- end_pfn = find_max_pfn_with_active_regions();
4098-
4099- if (end_pfn > max_pfn_mapped)
4100- max_pfn_mapped = end_pfn;
4101- if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
4102- max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
4103- if (end_pfn > end_user_pfn)
4104- end_pfn = end_user_pfn;
4105- if (end_pfn > max_pfn_mapped)
4106- end_pfn = max_pfn_mapped;
4107-
4108- printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
4109- return end_pfn;
4110-}
4111-
4112-/*
4113- * Mark e820 reserved areas as busy for the resource manager.
4114- */
4115-void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
4116-{
4117- int i;
4118- struct resource *res;
4119-
4120- res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
4121- for (i = 0; i < nr_map; i++) {
4122- switch (e820[i].type) {
4123- case E820_RAM: res->name = "System RAM"; break;
4124- case E820_ACPI: res->name = "ACPI Tables"; break;
4125- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
4126- default: res->name = "reserved";
4127- }
4128- res->start = e820[i].addr;
4129- res->end = res->start + e820[i].size - 1;
4130- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4131- insert_resource(&iomem_resource, res);
4132- res++;
4133- }
4134-}
4135-
4136-#ifndef CONFIG_XEN
4137-/*
4138- * Find the ranges of physical addresses that do not correspond to
4139- * e820 RAM areas and mark the corresponding pages as nosave for software
4140- * suspend and suspend to RAM.
4141- *
4142- * This function requires the e820 map to be sorted and without any
4143- * overlapping entries and assumes the first e820 area to be RAM.
4144- */
4145-void __init e820_mark_nosave_regions(void)
4146-{
4147- int i;
4148- unsigned long paddr;
4149-
4150- paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
4151- for (i = 1; i < e820.nr_map; i++) {
4152- struct e820entry *ei = &e820.map[i];
4153-
4154- if (paddr < ei->addr)
4155- register_nosave_region(PFN_DOWN(paddr),
4156- PFN_UP(ei->addr));
4157-
4158- paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
4159- if (ei->type != E820_RAM)
4160- register_nosave_region(PFN_UP(ei->addr),
4161- PFN_DOWN(paddr));
4162-
4163- if (paddr >= (end_pfn << PAGE_SHIFT))
4164- break;
4165- }
4166-}
4167-#endif
4168-
4169-/*
4170- * Finds an active region in the address range from start_pfn to end_pfn and
4171- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
4172- */
4173-static int __init e820_find_active_region(const struct e820entry *ei,
4174- unsigned long start_pfn,
4175- unsigned long end_pfn,
4176- unsigned long *ei_startpfn,
4177- unsigned long *ei_endpfn)
4178-{
4179- *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
4180- *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
4181-
4182- /* Skip map entries smaller than a page */
4183- if (*ei_startpfn >= *ei_endpfn)
4184- return 0;
4185-
4186- /* Check if max_pfn_mapped should be updated */
4187- if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
4188- max_pfn_mapped = *ei_endpfn;
4189-
4190- /* Skip if map is outside the node */
4191- if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
4192- *ei_startpfn >= end_pfn)
4193- return 0;
4194-
4195- /* Check for overlaps */
4196- if (*ei_startpfn < start_pfn)
4197- *ei_startpfn = start_pfn;
4198- if (*ei_endpfn > end_pfn)
4199- *ei_endpfn = end_pfn;
4200-
4201- /* Obey end_user_pfn to save on memmap */
4202- if (*ei_startpfn >= end_user_pfn)
4203- return 0;
4204- if (*ei_endpfn > end_user_pfn)
4205- *ei_endpfn = end_user_pfn;
4206-
4207- return 1;
4208-}
4209-
4210-/* Walk the e820 map and register active regions within a node */
4211-void __init
4212-e820_register_active_regions(int nid, unsigned long start_pfn,
4213- unsigned long end_pfn)
4214-{
4215- unsigned long ei_startpfn;
4216- unsigned long ei_endpfn;
4217- int i;
4218-
4219- for (i = 0; i < e820.nr_map; i++)
4220- if (e820_find_active_region(&e820.map[i],
4221- start_pfn, end_pfn,
4222- &ei_startpfn, &ei_endpfn))
4223- add_active_range(nid, ei_startpfn, ei_endpfn);
4224-}
4225-
4226-/*
4227- * Add a memory region to the kernel e820 map.
4228- */
4229-void __init add_memory_region(unsigned long start, unsigned long size, int type)
4230-{
4231- int x = e820.nr_map;
4232-
4233- if (x == E820MAX) {
4234- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
4235- return;
4236- }
4237-
4238- e820.map[x].addr = start;
4239- e820.map[x].size = size;
4240- e820.map[x].type = type;
4241- e820.nr_map++;
4242-}
4243-
4244-/*
4245- * Find the hole size (in bytes) in the memory range.
4246- * @start: starting address of the memory range to scan
4247- * @end: ending address of the memory range to scan
4248- */
4249-unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
4250-{
4251- unsigned long start_pfn = start >> PAGE_SHIFT;
4252- unsigned long end_pfn = end >> PAGE_SHIFT;
4253- unsigned long ei_startpfn, ei_endpfn, ram = 0;
4254- int i;
4255-
4256- for (i = 0; i < e820.nr_map; i++) {
4257- if (e820_find_active_region(&e820.map[i],
4258- start_pfn, end_pfn,
4259- &ei_startpfn, &ei_endpfn))
4260- ram += ei_endpfn - ei_startpfn;
4261- }
4262- return end - start - (ram << PAGE_SHIFT);
4263-}
4264-
4265-static void __init e820_print_map(char *who)
4266-{
4267- int i;
4268-
4269- for (i = 0; i < e820.nr_map; i++) {
4270- printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
4271- (unsigned long long) e820.map[i].addr,
4272- (unsigned long long)
4273- (e820.map[i].addr + e820.map[i].size));
4274- switch (e820.map[i].type) {
4275- case E820_RAM:
4276- printk(KERN_CONT "(usable)\n");
4277- break;
4278- case E820_RESERVED:
4279- printk(KERN_CONT "(reserved)\n");
4280- break;
4281- case E820_ACPI:
4282- printk(KERN_CONT "(ACPI data)\n");
4283- break;
4284- case E820_NVS:
4285- printk(KERN_CONT "(ACPI NVS)\n");
4286- break;
4287- default:
4288- printk(KERN_CONT "type %u\n", e820.map[i].type);
4289- break;
4290- }
4291- }
4292-}
4293-
4294-/*
4295- * Sanitize the BIOS e820 map.
4296- *
4297- * Some e820 responses include overlapping entries. The following
4298- * replaces the original e820 map with a new one, removing overlaps.
4299- *
4300- */
4301-static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
4302-{
4303- struct change_member {
4304- struct e820entry *pbios; /* pointer to original bios entry */
4305- unsigned long long addr; /* address for this change point */
4306- };
4307- static struct change_member change_point_list[2*E820MAX] __initdata;
4308- static struct change_member *change_point[2*E820MAX] __initdata;
4309- static struct e820entry *overlap_list[E820MAX] __initdata;
4310- static struct e820entry new_bios[E820MAX] __initdata;
4311- struct change_member *change_tmp;
4312- unsigned long current_type, last_type;
4313- unsigned long long last_addr;
4314- int chgidx, still_changing;
4315- int overlap_entries;
4316- int new_bios_entry;
4317- int old_nr, new_nr, chg_nr;
4318- int i;
4319-
4320- /*
4321- Visually we're performing the following
4322- (1,2,3,4 = memory types)...
4323-
4324- Sample memory map (w/overlaps):
4325- ____22__________________
4326- ______________________4_
4327- ____1111________________
4328- _44_____________________
4329- 11111111________________
4330- ____________________33__
4331- ___________44___________
4332- __________33333_________
4333- ______________22________
4334- ___________________2222_
4335- _________111111111______
4336- _____________________11_
4337- _________________4______
4338-
4339- Sanitized equivalent (no overlap):
4340- 1_______________________
4341- _44_____________________
4342- ___1____________________
4343- ____22__________________
4344- ______11________________
4345- _________1______________
4346- __________3_____________
4347- ___________44___________
4348- _____________33_________
4349- _______________2________
4350- ________________1_______
4351- _________________4______
4352- ___________________2____
4353- ____________________33__
4354- ______________________4_
4355- */
4356-
4357- /* if there's only one memory region, don't bother */
4358- if (*pnr_map < 2)
4359- return -1;
4360-
4361- old_nr = *pnr_map;
4362-
4363- /* bail out if we find any unreasonable addresses in bios map */
4364- for (i = 0; i < old_nr; i++)
4365- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
4366- return -1;
4367-
4368- /* create pointers for initial change-point information (for sorting) */
4369- for (i = 0; i < 2 * old_nr; i++)
4370- change_point[i] = &change_point_list[i];
4371-
4372- /* record all known change-points (starting and ending addresses),
4373- omitting those that are for empty memory regions */
4374- chgidx = 0;
4375- for (i = 0; i < old_nr; i++) {
4376- if (biosmap[i].size != 0) {
4377- change_point[chgidx]->addr = biosmap[i].addr;
4378- change_point[chgidx++]->pbios = &biosmap[i];
4379- change_point[chgidx]->addr = biosmap[i].addr +
4380- biosmap[i].size;
4381- change_point[chgidx++]->pbios = &biosmap[i];
4382- }
4383- }
4384- chg_nr = chgidx;
4385-
4386- /* sort change-point list by memory addresses (low -> high) */
4387- still_changing = 1;
4388- while (still_changing) {
4389- still_changing = 0;
4390- for (i = 1; i < chg_nr; i++) {
4391- unsigned long long curaddr, lastaddr;
4392- unsigned long long curpbaddr, lastpbaddr;
4393-
4394- curaddr = change_point[i]->addr;
4395- lastaddr = change_point[i - 1]->addr;
4396- curpbaddr = change_point[i]->pbios->addr;
4397- lastpbaddr = change_point[i - 1]->pbios->addr;
4398-
4399- /*
4400- * swap entries, when:
4401- *
4402- * curaddr > lastaddr or
4403- * curaddr == lastaddr and curaddr == curpbaddr and
4404- * lastaddr != lastpbaddr
4405- */
4406- if (curaddr < lastaddr ||
4407- (curaddr == lastaddr && curaddr == curpbaddr &&
4408- lastaddr != lastpbaddr)) {
4409- change_tmp = change_point[i];
4410- change_point[i] = change_point[i-1];
4411- change_point[i-1] = change_tmp;
4412- still_changing = 1;
4413- }
4414- }
4415- }
4416-
4417- /* create a new bios memory map, removing overlaps */
4418- overlap_entries = 0; /* number of entries in the overlap table */
4419- new_bios_entry = 0; /* index for creating new bios map entries */
4420- last_type = 0; /* start with undefined memory type */
4421- last_addr = 0; /* start with 0 as last starting address */
4422-
4423- /* loop through change-points, determining effect on the new bios map */
4424- for (chgidx = 0; chgidx < chg_nr; chgidx++) {
4425- /* keep track of all overlapping bios entries */
4426- if (change_point[chgidx]->addr ==
4427- change_point[chgidx]->pbios->addr) {
4428- /*
4429- * add map entry to overlap list (> 1 entry
4430- * implies an overlap)
4431- */
4432- overlap_list[overlap_entries++] =
4433- change_point[chgidx]->pbios;
4434- } else {
4435- /*
4436- * remove entry from list (order independent,
4437- * so swap with last)
4438- */
4439- for (i = 0; i < overlap_entries; i++) {
4440- if (overlap_list[i] ==
4441- change_point[chgidx]->pbios)
4442- overlap_list[i] =
4443- overlap_list[overlap_entries-1];
4444- }
4445- overlap_entries--;
4446- }
4447- /*
4448- * if there are overlapping entries, decide which
4449- * "type" to use (larger value takes precedence --
4450- * 1=usable, 2,3,4,4+=unusable)
4451- */
4452- current_type = 0;
4453- for (i = 0; i < overlap_entries; i++)
4454- if (overlap_list[i]->type > current_type)
4455- current_type = overlap_list[i]->type;
4456- /*
4457- * continue building up new bios map based on this
4458- * information
4459- */
4460- if (current_type != last_type) {
4461- if (last_type != 0) {
4462- new_bios[new_bios_entry].size =
4463- change_point[chgidx]->addr - last_addr;
4464- /*
4465- * move forward only if the new size
4466- * was non-zero
4467- */
4468- if (new_bios[new_bios_entry].size != 0)
4469- /*
4470- * no more space left for new
4471- * bios entries ?
4472- */
4473- if (++new_bios_entry >= E820MAX)
4474- break;
4475- }
4476- if (current_type != 0) {
4477- new_bios[new_bios_entry].addr =
4478- change_point[chgidx]->addr;
4479- new_bios[new_bios_entry].type = current_type;
4480- last_addr = change_point[chgidx]->addr;
4481- }
4482- last_type = current_type;
4483- }
4484- }
4485- /* retain count for new bios entries */
4486- new_nr = new_bios_entry;
4487-
4488- /* copy new bios mapping into original location */
4489- memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
4490- *pnr_map = new_nr;
4491-
4492- return 0;
4493-}
4494-
4495-/*
4496- * Copy the BIOS e820 map into a safe place.
4497- *
4498- * Sanity-check it while we're at it..
4499- *
4500- * If we're lucky and live on a modern system, the setup code
4501- * will have given us a memory map that we can use to properly
4502- * set up memory. If we aren't, we'll fake a memory map.
4503- */
4504-static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
4505-{
4506-#ifndef CONFIG_XEN
4507- /* Only one memory region (or negative)? Ignore it */
4508- if (nr_map < 2)
4509- return -1;
4510-#else
4511- BUG_ON(nr_map < 1);
4512-#endif
4513-
4514- do {
4515- u64 start = biosmap->addr;
4516- u64 size = biosmap->size;
4517- u64 end = start + size;
4518- u32 type = biosmap->type;
4519-
4520- /* Overflow in 64 bits? Ignore the memory map. */
4521- if (start > end)
4522- return -1;
4523-
4524- add_memory_region(start, size, type);
4525- } while (biosmap++, --nr_map);
4526-
4527-#ifdef CONFIG_XEN
4528- if (is_initial_xendomain()) {
4529- struct xen_memory_map memmap;
4530-
4531- memmap.nr_entries = E820MAX;
4532- set_xen_guest_handle(memmap.buffer, machine_e820.map);
4533-
4534- if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
4535- BUG();
4536- machine_e820.nr_map = memmap.nr_entries;
4537- } else
4538- machine_e820 = e820;
4539-#endif
4540-
4541- return 0;
4542-}
4543-
4544-static void early_panic(char *msg)
4545-{
4546- early_printk(msg);
4547- panic(msg);
4548-}
4549-
4550-/* The non-void return type exists only for x86 32-bit compat */
4551-char * __init machine_specific_memory_setup(void)
4552-{
4553-#ifndef CONFIG_XEN
4554- char *who = "BIOS-e820";
4555- /*
4556- * Try to copy the BIOS-supplied E820-map.
4557- *
4558- * Otherwise fake a memory map; one section from 0k->640k,
4559- * the next section from 1mb->appropriate_mem_k
4560- */
4561- sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
4562- if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
4563- early_panic("Cannot find a valid memory map");
4564-#else /* CONFIG_XEN */
4565- char *who = "Xen";
4566- int rc;
4567- struct xen_memory_map memmap;
4568- /*
4569- * This is rather large for a stack variable but this early in
4570- * the boot process we know we have plenty of slack space.
4571- */
4572- struct e820entry map[E820MAX];
4573-
4574- memmap.nr_entries = E820MAX;
4575- set_xen_guest_handle(memmap.buffer, map);
4576-
4577- rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
4578- if ( rc == -ENOSYS ) {
4579- memmap.nr_entries = 1;
4580- map[0].addr = 0ULL;
4581- map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
4582- /* 8MB slack (to balance backend allocations). */
4583- map[0].size += 8 << 20;
4584- map[0].type = E820_RAM;
4585- rc = 0;
4586- }
4587- BUG_ON(rc);
4588-
4589- sanitize_e820_map(map, (char *)&memmap.nr_entries);
4590-
4591- if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
4592- early_panic("Cannot find a valid memory map");
4593-#endif
4594- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
4595- e820_print_map(who);
4596-
4597- /* In case someone cares... */
4598- return who;
4599-}
4600-
4601-static int __init parse_memopt(char *p)
4602-{
4603- int i;
4604- unsigned long current_end;
4605- unsigned long end;
4606-
4607- if (!p)
4608- return -EINVAL;
4609- end_user_pfn = memparse(p, &p);
4610- end_user_pfn >>= PAGE_SHIFT;
4611-
4612- end = end_user_pfn<<PAGE_SHIFT;
4613- i = e820.nr_map-1;
4614- current_end = e820.map[i].addr + e820.map[i].size;
4615-
4616- if (current_end < end) {
4617- /*
4618- * The e820 map ends before our requested size so
4619- * extend the final entry to the requested address.
4620- */
4621- if (e820.map[i].type == E820_RAM)
4622- e820.map[i].size = end - e820.map[i].addr;
4623- else
4624- add_memory_region(current_end, end - current_end, E820_RAM);
4625- }
4626-
4627- return 0;
4628-}
4629-early_param("mem", parse_memopt);
4630-
4631-static int userdef __initdata;
4632-
4633-static int __init parse_memmap_opt(char *p)
4634-{
4635- char *oldp;
4636- unsigned long long start_at, mem_size;
4637-
4638- if (!strcmp(p, "exactmap")) {
4639-#ifdef CONFIG_CRASH_DUMP
4640- /*
4641- * If we are doing a crash dump, we still need to know
4642- * the real mem size before the original memory map is
4643- * reset.
4644- */
4645- e820_register_active_regions(0, 0, -1UL);
4646- saved_max_pfn = e820_end_of_ram();
4647- remove_all_active_ranges();
4648-#endif
4649- max_pfn_mapped = 0;
4650- e820.nr_map = 0;
4651- userdef = 1;
4652- return 0;
4653- }
4654-
4655- oldp = p;
4656- mem_size = memparse(p, &p);
4657- if (p == oldp)
4658- return -EINVAL;
4659-
4660- userdef = 1;
4661- if (*p == '@') {
4662- start_at = memparse(p+1, &p);
4663- add_memory_region(start_at, mem_size, E820_RAM);
4664- } else if (*p == '#') {
4665- start_at = memparse(p+1, &p);
4666- add_memory_region(start_at, mem_size, E820_ACPI);
4667- } else if (*p == '$') {
4668- start_at = memparse(p+1, &p);
4669- add_memory_region(start_at, mem_size, E820_RESERVED);
4670- } else {
4671- end_user_pfn = (mem_size >> PAGE_SHIFT);
4672- }
4673- return *p == '\0' ? 0 : -EINVAL;
4674-}
4675-early_param("memmap", parse_memmap_opt);
4676-
4677-void __init finish_e820_parsing(void)
4678-{
4679- if (userdef) {
4680- char nr = e820.nr_map;
4681-
4682- if (sanitize_e820_map(e820.map, &nr) < 0)
4683- early_panic("Invalid user supplied memory map");
4684- e820.nr_map = nr;
4685-
4686- printk(KERN_INFO "user-defined physical RAM map:\n");
4687- e820_print_map("user");
4688- }
4689-}
4690-
4691-#ifndef CONFIG_XEN
4692-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
4693- unsigned new_type)
4694-{
4695- int i;
4696-
4697- BUG_ON(old_type == new_type);
4698-
4699- for (i = 0; i < e820.nr_map; i++) {
4700- struct e820entry *ei = &e820.map[i];
4701- u64 final_start, final_end;
4702- if (ei->type != old_type)
4703- continue;
4704- /* totally covered? */
4705- if (ei->addr >= start && ei->size <= size) {
4706- ei->type = new_type;
4707- continue;
4708- }
4709- /* partially covered */
4710- final_start = max(start, ei->addr);
4711- final_end = min(start + size, ei->addr + ei->size);
4712- if (final_start >= final_end)
4713- continue;
4714- add_memory_region(final_start, final_end - final_start,
4715- new_type);
4716- }
4717-}
4718-
4719-void __init update_e820(void)
4720-{
4721- u8 nr_map;
4722-
4723- nr_map = e820.nr_map;
4724- if (sanitize_e820_map(e820.map, &nr_map))
4725- return;
4726- e820.nr_map = nr_map;
4727- printk(KERN_INFO "modified physical RAM map:\n");
4728- e820_print_map("modified");
4729-}
4730-#endif
4731-
4732-unsigned long pci_mem_start = 0xaeedbabe;
4733-EXPORT_SYMBOL(pci_mem_start);
4734-
4735-/*
4736- * Search for the biggest gap in the low 32 bits of the e820
4737- * memory space. We pass this space to PCI to assign MMIO resources
4738- * for hotplug or unconfigured devices in.
4739- * Hopefully the BIOS left enough space.
4740- */
4741-__init void e820_setup_gap(struct e820entry *e820, int nr_map)
4742-{
4743- unsigned long gapstart, gapsize, round;
4744- unsigned long last;
4745- int i;
4746- int found = 0;
4747-
4748- last = 0x100000000ull;
4749- gapstart = 0x10000000;
4750- gapsize = 0x400000;
4751- i = nr_map;
4752- while (--i >= 0) {
4753- unsigned long long start = e820[i].addr;
4754- unsigned long long end = start + e820[i].size;
4755-
4756- /*
4757- * Since "last" is at most 4GB, we know we'll
4758- * fit in 32 bits if this condition is true
4759- */
4760- if (last > end) {
4761- unsigned long gap = last - end;
4762-
4763- if (gap > gapsize) {
4764- gapsize = gap;
4765- gapstart = end;
4766- found = 1;
4767- }
4768- }
4769- if (start < last)
4770- last = start;
4771- }
4772-
4773- if (!found) {
4774- gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
4775- printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
4776- "address range\n"
4777- KERN_ERR "PCI: Unassigned devices with 32bit resource "
4778- "registers may break!\n");
4779- }
4780-
4781- /*
4782- * See how much we want to round up: start off with
4783- * rounding to the next 1MB area.
4784- */
4785- round = 0x100000;
4786- while ((gapsize >> 4) > round)
4787- round += round;
4788- /* Fun with two's complement */
4789- pci_mem_start = (gapstart + round) & -round;
4790-
4791- printk(KERN_INFO
4792- "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
4793- pci_mem_start, gapstart, gapsize);
4794-}
4795-
4796-int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
4797-{
4798- int i;
4799-
4800- if (slot < 0 || slot >= e820.nr_map)
4801- return -1;
4802- for (i = slot; i < e820.nr_map; i++) {
4803- if (e820.map[i].type != E820_RAM)
4804- continue;
4805- break;
4806- }
4807- if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
4808- return -1;
4809- *addr = e820.map[i].addr;
4810- *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
4811- max_pfn << PAGE_SHIFT) - *addr;
4812- return i + 1;
4813-}
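
Both removed e820 files parse the memmap= boot option the same way: the size
is read with the kernel's memparse() (optional K/M/G suffix), and the next
character selects the region type ('@' usable RAM, '#' ACPI data,
'$' reserved), while a bare value behaves like a mem= limit. A rough
user-space sketch of that dispatch; parse_size() is a simplified stand-in for
memparse(), not its actual implementation:

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for memparse(): number with optional K/M/G suffix. */
static unsigned long long parse_size(const char *s, const char **retp)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': v <<= 10; /* fall through */
	case 'M': case 'm': v <<= 10; /* fall through */
	case 'K': case 'k': v <<= 10; end++; break;
	}
	*retp = end;
	return v;
}

int main(int argc, char **argv)
{
	const char *p = argc > 1 ? argv[1] : "64M@16M";
	const char *rest;
	unsigned long long size = parse_size(p, &rest);

	if (*rest == '@')
		printf("RAM      %llu bytes at %llu\n", size, parse_size(rest + 1, &rest));
	else if (*rest == '#')
		printf("ACPI     %llu bytes at %llu\n", size, parse_size(rest + 1, &rest));
	else if (*rest == '$')
		printf("reserved %llu bytes at %llu\n", size, parse_size(rest + 1, &rest));
	else
		printf("limit memory to %llu bytes\n", size);
	return 0;
}
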
4814Index: head-2008-12-01/arch/x86/kernel/early_printk-xen.c
4815===================================================================
4816--- head-2008-12-01.orig/arch/x86/kernel/early_printk-xen.c 2008-12-01 11:44:55.000000000 +0100
4817+++ head-2008-12-01/arch/x86/kernel/early_printk-xen.c 2008-12-01 11:49:07.000000000 +0100
4818@@ -225,7 +225,7 @@ static struct console simnow_console = {
4819 static struct console *early_console = &early_vga_console;
4820 static int early_console_initialized;
4821
4822-void early_printk(const char *fmt, ...)
4823+asmlinkage void early_printk(const char *fmt, ...)
4824 {
4825 char buf[512];
4826 int n;
4827Index: head-2008-12-01/arch/x86/kernel/entry_32-xen.S
4828===================================================================
4829--- head-2008-12-01.orig/arch/x86/kernel/entry_32-xen.S 2008-12-01 11:44:55.000000000 +0100
4830+++ head-2008-12-01/arch/x86/kernel/entry_32-xen.S 2008-12-01 11:49:07.000000000 +0100
4831@@ -51,15 +51,26 @@
4832 #include <asm/percpu.h>
4833 #include <asm/dwarf2.h>
4834 #include <asm/processor-flags.h>
4835-#include "irq_vectors.h"
4836+#include <asm/ftrace.h>
4837+#include <asm/irq_vectors.h>
4838 #include <xen/interface/xen.h>
4839
4840+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
4841+#include <linux/elf-em.h>
4842+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
4843+#define __AUDIT_ARCH_LE 0x40000000
4844+
4845+#ifndef CONFIG_AUDITSYSCALL
4846+#define sysenter_audit syscall_trace_entry
4847+#define sysexit_audit syscall_exit_work
4848+#endif
4849+
4850 /*
4851 * We use macros for low-level operations which need to be overridden
4852 * for paravirtualization. The following will never clobber any registers:
4853 * INTERRUPT_RETURN (aka. "iret")
4854 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
4855- * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
4856+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
4857 *
4858 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
4859 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
4860@@ -277,11 +288,6 @@ END(resume_kernel)
4861 #endif
4862 CFI_ENDPROC
4863
4864- .macro test_tif ti_reg # system call tracing in operation / emulation
4865- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
4866- testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
4867- .endm
4868-
4869 /* SYSENTER_RETURN points to after the "sysenter" instruction in
4870 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
4871
4872@@ -338,8 +344,9 @@ sysenter_past_esp:
4873 .previous
4874
4875 GET_THREAD_INFO(%ebp)
4876- test_tif %ebp
4877- jnz syscall_trace_entry
4878+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4879+ jnz sysenter_audit
4880+sysenter_do_call:
4881 cmpl $(nr_syscalls), %eax
4882 jae syscall_badsys
4883 call *sys_call_table(,%eax,4)
4884@@ -349,14 +356,54 @@ sysenter_past_esp:
4885 TRACE_IRQS_OFF
4886 movl TI_flags(%ebp), %ecx
4887 testw $_TIF_ALLWORK_MASK, %cx
4888- jne syscall_exit_work
4889+ jne sysexit_audit
4890+sysenter_exit:
4891 /* if something modifies registers it must also disable sysexit */
4892 movl PT_EIP(%esp), %edx
4893 movl PT_OLDESP(%esp), %ecx
4894 xorl %ebp,%ebp
4895 TRACE_IRQS_ON
4896 1: mov PT_FS(%esp), %fs
4897- ENABLE_INTERRUPTS_SYSCALL_RET
4898+ ENABLE_INTERRUPTS_SYSEXIT
4899+
4900+#ifdef CONFIG_AUDITSYSCALL
4901+sysenter_audit:
4902+ testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
4903+ jnz syscall_trace_entry
4904+ addl $4,%esp
4905+ CFI_ADJUST_CFA_OFFSET -4
4906+ /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
4907+ /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
4908+ /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
4909+ movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
4910+ movl %eax,%edx /* 2nd arg: syscall number */
4911+ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
4912+ call audit_syscall_entry
4913+ pushl %ebx
4914+ CFI_ADJUST_CFA_OFFSET 4
4915+ movl PT_EAX(%esp),%eax /* reload syscall number */
4916+ jmp sysenter_do_call
4917+
4918+sysexit_audit:
4919+ testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4920+ jne syscall_exit_work
4921+ TRACE_IRQS_ON
4922+ ENABLE_INTERRUPTS(CLBR_ANY)
4923+ movl %eax,%edx /* second arg, syscall return value */
4924+ cmpl $0,%eax /* is it < 0? */
4925+ setl %al /* 1 if so, 0 if not */
4926+ movzbl %al,%eax /* zero-extend that */
4927+ inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
4928+ call audit_syscall_exit
4929+ DISABLE_INTERRUPTS(CLBR_ANY)
4930+ TRACE_IRQS_OFF
4931+ movl TI_flags(%ebp), %ecx
4932+ testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4933+ jne syscall_exit_work
4934+ movl PT_EAX(%esp),%eax /* reload syscall return value */
4935+ jmp sysenter_exit
4936+#endif
4937+
4938 CFI_ENDPROC
4939 .pushsection .fixup,"ax"
4940 2: movl $0,PT_FS(%esp)
4941@@ -400,7 +447,7 @@ ENTRY(system_call)
4942 CFI_ADJUST_CFA_OFFSET 4
4943 SAVE_ALL
4944 GET_THREAD_INFO(%ebp)
4945- test_tif %ebp
4946+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4947 jnz syscall_trace_entry
4948 cmpl $(nr_syscalls), %eax
4949 jae syscall_badsys
4950@@ -413,10 +460,6 @@ syscall_exit:
4951 # setting need_resched or sigpending
4952 # between sampling and the iret
4953 TRACE_IRQS_OFF
4954- testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
4955- jz no_singlestep
4956- orl $_TIF_SINGLESTEP,TI_flags(%ebp)
4957-no_singlestep:
4958 movl TI_flags(%ebp), %ecx
4959 testw $_TIF_ALLWORK_MASK, %cx # current->work
4960 jne syscall_exit_work
4961@@ -588,12 +631,8 @@ END(work_pending)
4962 syscall_trace_entry:
4963 movl $-ENOSYS,PT_EAX(%esp)
4964 movl %esp, %eax
4965- xorl %edx,%edx
4966- call do_syscall_trace
4967- cmpl $0, %eax
4968- jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
4969- # so must skip actual syscall
4970- movl PT_ORIG_EAX(%esp), %eax
4971+ call syscall_trace_enter
4972+ /* What it returned is what we'll actually use. */
4973 cmpl $(nr_syscalls), %eax
4974 jnae syscall_call
4975 jmp syscall_exit
4976@@ -602,14 +641,13 @@ END(syscall_trace_entry)
4977 # perform syscall exit tracing
4978 ALIGN
4979 syscall_exit_work:
4980- testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
4981+ testb $_TIF_WORK_SYSCALL_EXIT, %cl
4982 jz work_pending
4983 TRACE_IRQS_ON
4984- ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
4985+ ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
4986 # schedule() instead
4987 movl %esp, %eax
4988- movl $1, %edx
4989- call do_syscall_trace
4990+ call syscall_trace_leave
4991 jmp resume_userspace
4992 END(syscall_exit_work)
4993 CFI_ENDPROC
4994@@ -1109,10 +1147,10 @@ ENTRY(native_iret)
4995 .previous
4996 END(native_iret)
4997
4998-ENTRY(native_irq_enable_syscall_ret)
4999+ENTRY(native_irq_enable_sysexit)
5000 sti
5001 sysexit
5002-END(native_irq_enable_syscall_ret)
5003+END(native_irq_enable_sysexit)
5004 #endif
5005
5006 KPROBE_ENTRY(int3)
5007@@ -1261,6 +1299,77 @@ ENTRY(kernel_thread_helper)
5008 CFI_ENDPROC
5009 ENDPROC(kernel_thread_helper)
5010
5011+#ifdef CONFIG_FTRACE
5012+#ifdef CONFIG_DYNAMIC_FTRACE
5013+
5014+ENTRY(mcount)
5015+ pushl %eax
5016+ pushl %ecx
5017+ pushl %edx
5018+ movl 0xc(%esp), %eax
5019+ subl $MCOUNT_INSN_SIZE, %eax
5020+
5021+.globl mcount_call
5022+mcount_call:
5023+ call ftrace_stub
5024+
5025+ popl %edx
5026+ popl %ecx
5027+ popl %eax
5028+
5029+ ret
5030+END(mcount)
5031+
5032+ENTRY(ftrace_caller)
5033+ pushl %eax
5034+ pushl %ecx
5035+ pushl %edx
5036+ movl 0xc(%esp), %eax
5037+ movl 0x4(%ebp), %edx
5038+ subl $MCOUNT_INSN_SIZE, %eax
5039+
5040+.globl ftrace_call
5041+ftrace_call:
5042+ call ftrace_stub
5043+
5044+ popl %edx
5045+ popl %ecx
5046+ popl %eax
5047+
5048+.globl ftrace_stub
5049+ftrace_stub:
5050+ ret
5051+END(ftrace_caller)
5052+
5053+#else /* ! CONFIG_DYNAMIC_FTRACE */
5054+
5055+ENTRY(mcount)
5056+ cmpl $ftrace_stub, ftrace_trace_function
5057+ jnz trace
5058+.globl ftrace_stub
5059+ftrace_stub:
5060+ ret
5061+
5062+ /* taken from glibc */
5063+trace:
5064+ pushl %eax
5065+ pushl %ecx
5066+ pushl %edx
5067+ movl 0xc(%esp), %eax
5068+ movl 0x4(%ebp), %edx
5069+ subl $MCOUNT_INSN_SIZE, %eax
5070+
5071+ call *ftrace_trace_function
5072+
5073+ popl %edx
5074+ popl %ecx
5075+ popl %eax
5076+
5077+ jmp ftrace_stub
5078+END(mcount)
5079+#endif /* CONFIG_DYNAMIC_FTRACE */
5080+#endif /* CONFIG_FTRACE */
5081+
5082 #include <asm/alternative-asm.h>
5083
5084 # pv syscall call handler stub
5085@@ -1286,7 +1395,7 @@ ENTRY(ia32pv_cstar_target)
5086 .previous
5087 SAVE_ALL
5088 GET_THREAD_INFO(%ebp)
5089- test_tif %ebp
5090+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
5091 jnz cstar_trace_entry
5092 cmpl $nr_syscalls,%eax
5093 jae cstar_badsys
5094@@ -1320,29 +1429,21 @@ cstar_trace_entry:
5095 btl %eax,cstar_special
5096 jc .Lcstar_trace_special
5097 1: movl %esp,%eax
5098- xorl %edx,%edx
5099 LOCK_PREFIX
5100 orl $_TIF_CSTAR,TI_flags(%ebp)
5101- call do_syscall_trace
5102+ call syscall_trace_enter
5103 LOCK_PREFIX
5104 andl $~_TIF_CSTAR,TI_flags(%ebp)
5105- testl %eax,%eax
5106- jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
5107- # so must skip actual syscall
5108- movl PT_ORIG_EAX(%esp),%eax
5109+ /* What it returned is what we'll actually use. */
5110 cmpl $nr_syscalls,%eax
5111 jb .Lcstar_call
5112 jmp .Lcstar_exit
5113 .Lcstar_trace_special:
5114 movl PT_ECX(%esp),%ecx
5115 movl %esp,%eax
5116- xorl %edx,%edx
5117 movl %ecx,PT_EBP(%esp) # put user EBP back in place
5118- call do_syscall_trace
5119- testl %eax,%eax
5120- jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5121- # so must skip actual syscall
5122- movl PT_ORIG_EAX(%esp),%eax
5123+ call syscall_trace_enter
5124+ /* What it returned is what we'll actually use. */
5125 cmpl $nr_syscalls,%eax
5126 jb syscall_call
5127 jmp syscall_exit
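
The sysexit_audit path added above turns the syscall return value into an
audit result with cmp/setl/movzbl/inc: a negative value becomes 2 and
anything else becomes 1, matching the "0->1(AUDITSC_SUCCESS),
1->2(AUDITSC_FAILURE)" note in the patch. The same mapping in C, as a
standalone sketch with the constant values taken from that comment rather
than from the kernel headers:

#include <stdio.h>

#define AUDITSC_SUCCESS 1	/* values per the inline comment above */
#define AUDITSC_FAILURE 2

/* Mirrors "cmpl $0,%eax; setl %al; movzbl %al,%eax; inc %eax". */
static int audit_result(long retval)
{
	int is_error = retval < 0;	/* setl: 1 if negative, else 0 */
	return is_error + 1;		/* inc: 0 -> 1 (success), 1 -> 2 (failure) */
}

int main(void)
{
	printf("%d\n", audit_result(0));	/* 1 = AUDITSC_SUCCESS */
	printf("%d\n", audit_result(-14));	/* 2 = AUDITSC_FAILURE (e.g. -EFAULT) */
	return 0;
}
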
5128Index: head-2008-12-01/arch/x86/kernel/entry_64.S
5129===================================================================
5130--- head-2008-12-01.orig/arch/x86/kernel/entry_64.S 2008-12-03 15:48:43.000000000 +0100
5131+++ head-2008-12-01/arch/x86/kernel/entry_64.S 2008-12-01 11:49:07.000000000 +0100
5132@@ -1409,7 +1409,7 @@ ENTRY(arch_unwind_init_running)
5133 ENDPROC(arch_unwind_init_running)
5134 #endif
5135
5136-#ifdef CONFIG_XEN
5137+#ifdef CONFIG_PARAVIRT_XEN
5138 ENTRY(xen_hypervisor_callback)
5139 zeroentry xen_do_hypervisor_callback
5140 END(xen_hypervisor_callback)
5141@@ -1507,4 +1507,4 @@ ENTRY(xen_failsafe_callback)
5142 CFI_ENDPROC
5143 END(xen_failsafe_callback)
5144
5145-#endif /* CONFIG_XEN */
5146+#endif /* CONFIG_PARAVIRT_XEN */
5147Index: head-2008-12-01/arch/x86/kernel/entry_64-xen.S
5148===================================================================
5149--- head-2008-12-01.orig/arch/x86/kernel/entry_64-xen.S 2008-12-01 11:44:55.000000000 +0100
5150+++ head-2008-12-01/arch/x86/kernel/entry_64-xen.S 2008-12-01 11:49:07.000000000 +0100
5151@@ -53,19 +53,130 @@
5152 #include <asm/hw_irq.h>
5153 #include <asm/page.h>
5154 #include <asm/irqflags.h>
5155+#include <asm/ftrace.h>
5156 #include <asm/errno.h>
5157 #include <xen/interface/xen.h>
5158 #include <xen/interface/features.h>
5159
5160+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
5161+#include <linux/elf-em.h>
5162+#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
5163+#define __AUDIT_ARCH_64BIT 0x80000000
5164+#define __AUDIT_ARCH_LE 0x40000000
5165+
5166 .code64
5167
5168+#ifdef CONFIG_FTRACE
5169+#ifdef CONFIG_DYNAMIC_FTRACE
5170+ENTRY(mcount)
5171+
5172+ subq $0x38, %rsp
5173+ movq %rax, (%rsp)
5174+ movq %rcx, 8(%rsp)
5175+ movq %rdx, 16(%rsp)
5176+ movq %rsi, 24(%rsp)
5177+ movq %rdi, 32(%rsp)
5178+ movq %r8, 40(%rsp)
5179+ movq %r9, 48(%rsp)
5180+
5181+ movq 0x38(%rsp), %rdi
5182+ subq $MCOUNT_INSN_SIZE, %rdi
5183+
5184+.globl mcount_call
5185+mcount_call:
5186+ call ftrace_stub
5187+
5188+ movq 48(%rsp), %r9
5189+ movq 40(%rsp), %r8
5190+ movq 32(%rsp), %rdi
5191+ movq 24(%rsp), %rsi
5192+ movq 16(%rsp), %rdx
5193+ movq 8(%rsp), %rcx
5194+ movq (%rsp), %rax
5195+ addq $0x38, %rsp
5196+
5197+ retq
5198+END(mcount)
5199+
5200+ENTRY(ftrace_caller)
5201+
5202+ /* taken from glibc */
5203+ subq $0x38, %rsp
5204+ movq %rax, (%rsp)
5205+ movq %rcx, 8(%rsp)
5206+ movq %rdx, 16(%rsp)
5207+ movq %rsi, 24(%rsp)
5208+ movq %rdi, 32(%rsp)
5209+ movq %r8, 40(%rsp)
5210+ movq %r9, 48(%rsp)
5211+
5212+ movq 0x38(%rsp), %rdi
5213+ movq 8(%rbp), %rsi
5214+ subq $MCOUNT_INSN_SIZE, %rdi
5215+
5216+.globl ftrace_call
5217+ftrace_call:
5218+ call ftrace_stub
5219+
5220+ movq 48(%rsp), %r9
5221+ movq 40(%rsp), %r8
5222+ movq 32(%rsp), %rdi
5223+ movq 24(%rsp), %rsi
5224+ movq 16(%rsp), %rdx
5225+ movq 8(%rsp), %rcx
5226+ movq (%rsp), %rax
5227+ addq $0x38, %rsp
5228+
5229+.globl ftrace_stub
5230+ftrace_stub:
5231+ retq
5232+END(ftrace_caller)
5233+
5234+#else /* ! CONFIG_DYNAMIC_FTRACE */
5235+ENTRY(mcount)
5236+ cmpq $ftrace_stub, ftrace_trace_function
5237+ jnz trace
5238+.globl ftrace_stub
5239+ftrace_stub:
5240+ retq
5241+
5242+trace:
5243+ /* taken from glibc */
5244+ subq $0x38, %rsp
5245+ movq %rax, (%rsp)
5246+ movq %rcx, 8(%rsp)
5247+ movq %rdx, 16(%rsp)
5248+ movq %rsi, 24(%rsp)
5249+ movq %rdi, 32(%rsp)
5250+ movq %r8, 40(%rsp)
5251+ movq %r9, 48(%rsp)
5252+
5253+ movq 0x38(%rsp), %rdi
5254+ movq 8(%rbp), %rsi
5255+ subq $MCOUNT_INSN_SIZE, %rdi
5256+
5257+ call *ftrace_trace_function
5258+
5259+ movq 48(%rsp), %r9
5260+ movq 40(%rsp), %r8
5261+ movq 32(%rsp), %rdi
5262+ movq 24(%rsp), %rsi
5263+ movq 16(%rsp), %rdx
5264+ movq 8(%rsp), %rcx
5265+ movq (%rsp), %rax
5266+ addq $0x38, %rsp
5267+
5268+ jmp ftrace_stub
5269+END(mcount)
5270+#endif /* CONFIG_DYNAMIC_FTRACE */
5271+#endif /* CONFIG_FTRACE */
5272+
5273 #ifndef CONFIG_PREEMPT
5274 #define retint_kernel retint_restore_args
5275 #endif
5276
5277 #ifdef CONFIG_PARAVIRT
5278-ENTRY(native_irq_enable_syscall_ret)
5279- movq %gs:pda_oldrsp,%rsp
5280+ENTRY(native_usergs_sysret64)
5281 swapgs
5282 sysretq
5283 #endif /* CONFIG_PARAVIRT */
5284@@ -102,7 +213,7 @@ NMI_MASK = 0x80000000
5285 .macro FAKE_STACK_FRAME child_rip
5286 /* push in order ss, rsp, eflags, cs, rip */
5287 xorl %eax, %eax
5288- pushq %rax /* ss */
5289+ pushq $__KERNEL_DS /* ss */
5290 CFI_ADJUST_CFA_OFFSET 8
5291 /*CFI_REL_OFFSET ss,0*/
5292 pushq %rax /* rsp */
5293@@ -197,13 +308,13 @@ ENTRY(ret_from_fork)
5294 CFI_ADJUST_CFA_OFFSET -4
5295 call schedule_tail
5296 GET_THREAD_INFO(%rcx)
5297- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
5298+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5299 jnz rff_trace
5300 rff_action:
5301 RESTORE_REST
5302 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
5303 je int_ret_from_sys_call
5304- testl $_TIF_IA32,threadinfo_flags(%rcx)
5305+ testl $_TIF_IA32,TI_flags(%rcx)
5306 jnz int_ret_from_sys_call
5307 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
5308 jmp ret_from_sys_call
5309@@ -265,8 +376,9 @@ ENTRY(system_call)
5310 SAVE_ARGS -8,0
5311 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
5312 GET_THREAD_INFO(%rcx)
5313- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
5314+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
5315 jnz tracesys
5316+system_call_fastpath:
5317 cmpq $__NR_syscall_max,%rax
5318 ja badsys
5319 movq %r10,%rcx
5320@@ -284,7 +396,7 @@ sysret_check:
5321 GET_THREAD_INFO(%rcx)
5322 DISABLE_INTERRUPTS(CLBR_NONE)
5323 TRACE_IRQS_OFF
5324- movl threadinfo_flags(%rcx),%edx
5325+ movl TI_flags(%rcx),%edx
5326 andl %edi,%edx
5327 jnz sysret_careful
5328 CFI_REMEMBER_STATE
5329@@ -315,16 +427,16 @@ sysret_careful:
5330 sysret_signal:
5331 TRACE_IRQS_ON
5332 ENABLE_INTERRUPTS(CLBR_NONE)
5333- testl $_TIF_DO_NOTIFY_MASK,%edx
5334- jz 1f
5335-
5336- /* Really a signal */
5337+#ifdef CONFIG_AUDITSYSCALL
5338+ bt $TIF_SYSCALL_AUDIT,%edx
5339+ jc sysret_audit
5340+#endif
5341 /* edx: work flags (arg3) */
5342 leaq do_notify_resume(%rip),%rax
5343 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
5344 xorl %esi,%esi # oldset -> arg2
5345 call ptregscall_common
5346-1: movl $_TIF_NEED_RESCHED,%edi
5347+ movl $_TIF_WORK_MASK,%edi
5348 /* Use IRET because user could have changed frame. This
5349 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
5350 DISABLE_INTERRUPTS(CLBR_NONE)
5351@@ -335,14 +447,56 @@ badsys:
5352 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
5353 jmp ret_from_sys_call
5354
5355+#ifdef CONFIG_AUDITSYSCALL
5356+ /*
5357+ * Fast path for syscall audit without full syscall trace.
5358+ * We just call audit_syscall_entry() directly, and then
5359+ * jump back to the normal fast path.
5360+ */
5361+auditsys:
5362+ movq %r10,%r9 /* 6th arg: 4th syscall arg */
5363+ movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
5364+ movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
5365+ movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
5366+ movq %rax,%rsi /* 2nd arg: syscall number */
5367+ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
5368+ call audit_syscall_entry
5369+ LOAD_ARGS 0 /* reload call-clobbered registers */
5370+ jmp system_call_fastpath
5371+
5372+ /*
5373+ * Return fast path for syscall audit. Call audit_syscall_exit()
5374+ * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
5375+ * masked off.
5376+ */
5377+sysret_audit:
5378+ movq %rax,%rsi /* second arg, syscall return value */
5379+ cmpq $0,%rax /* is it < 0? */
5380+ setl %al /* 1 if so, 0 if not */
5381+ movzbl %al,%edi /* zero-extend that into %edi */
5382+ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
5383+ call audit_syscall_exit
5384+ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
5385+ jmp sysret_check
5386+#endif /* CONFIG_AUDITSYSCALL */
5387+
5388 /* Do syscall tracing */
5389 tracesys:
5390+#ifdef CONFIG_AUDITSYSCALL
5391+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5392+ jz auditsys
5393+#endif
5394 SAVE_REST
5395 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
5396 FIXUP_TOP_OF_STACK %rdi
5397 movq %rsp,%rdi
5398 call syscall_trace_enter
5399- LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
5400+ /*
5401+ * Reload arg registers from stack in case ptrace changed them.
5402+ * We don't reload %rax because syscall_trace_enter() returned
5403+ * the value it wants us to use in the table lookup.
5404+ */
5405+ LOAD_ARGS ARGOFFSET, 1
5406 RESTORE_REST
5407 cmpq $__NR_syscall_max,%rax
5408 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
5409@@ -356,6 +510,7 @@ tracesys:
5410 * Has correct top of stack, but partial stack frame.
5411 */
5412 .globl int_ret_from_sys_call
5413+ .globl int_with_check
5414 int_ret_from_sys_call:
5415 DISABLE_INTERRUPTS(CLBR_NONE)
5416 TRACE_IRQS_OFF
5417@@ -370,10 +525,10 @@ int_ret_from_sys_call:
5418 int_with_check:
5419 LOCKDEP_SYS_EXIT_IRQ
5420 GET_THREAD_INFO(%rcx)
5421- movl threadinfo_flags(%rcx),%edx
5422+ movl TI_flags(%rcx),%edx
5423 andl %edi,%edx
5424 jnz int_careful
5425- andl $~TS_COMPAT,threadinfo_status(%rcx)
5426+ andl $~TS_COMPAT,TI_status(%rcx)
5427 jmp retint_restore_args
5428
5429 /* Either reschedule or signal or syscall exit tracking needed. */
5430@@ -399,7 +554,7 @@ int_very_careful:
5431 ENABLE_INTERRUPTS(CLBR_NONE)
5432 SAVE_REST
5433 /* Check for syscall exit trace */
5434- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
5435+ testl $_TIF_WORK_SYSCALL_EXIT,%edx
5436 jz int_signal
5437 pushq %rdi
5438 CFI_ADJUST_CFA_OFFSET 8
5439@@ -407,7 +562,7 @@ int_very_careful:
5440 call syscall_trace_leave
5441 popq %rdi
5442 CFI_ADJUST_CFA_OFFSET -8
5443- andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
5444+ andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
5445 jmp int_restore_rest
5446
5447 int_signal:
5448@@ -416,7 +571,7 @@ int_signal:
5449 movq %rsp,%rdi # &ptregs -> arg1
5450 xorl %esi,%esi # oldset -> arg2
5451 call do_notify_resume
5452-1: movl $_TIF_NEED_RESCHED,%edi
5453+1: movl $_TIF_WORK_MASK,%edi
5454 int_restore_rest:
5455 RESTORE_REST
5456 DISABLE_INTERRUPTS(CLBR_NONE)
5457@@ -443,7 +598,6 @@ END(\label)
5458 PTREGSCALL stub_clone, sys_clone, %r8
5459 PTREGSCALL stub_fork, sys_fork, %rdi
5460 PTREGSCALL stub_vfork, sys_vfork, %rdi
5461- PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
5462 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
5463 PTREGSCALL stub_iopl, sys_iopl, %rsi
5464
5465@@ -517,10 +671,12 @@ END(stub_rt_sigreturn)
5466 *
5467 */
5468
5469-retint_check:
5470+retint_with_reschedule:
5471 CFI_DEFAULT_STACK adj=1
5472+ movl $_TIF_WORK_MASK,%edi
5473+retint_check:
5474 LOCKDEP_SYS_EXIT_IRQ
5475- movl threadinfo_flags(%rcx),%edx
5476+ movl TI_flags(%rcx),%edx
5477 andl %edi,%edx
5478 CFI_REMEMBER_STATE
5479 jnz retint_careful
5480@@ -565,17 +721,16 @@ retint_signal:
5481 RESTORE_REST
5482 DISABLE_INTERRUPTS(CLBR_NONE)
5483 TRACE_IRQS_OFF
5484- movl $_TIF_NEED_RESCHED,%edi
5485 GET_THREAD_INFO(%rcx)
5486- jmp retint_check
5487+ jmp retint_with_reschedule
5488
5489 #ifdef CONFIG_PREEMPT
5490 /* Returning to kernel space. Check if we need preemption */
5491 /* rcx: threadinfo. interrupts off. */
5492 ENTRY(retint_kernel)
5493- cmpl $0,threadinfo_preempt_count(%rcx)
5494+ cmpl $0,TI_preempt_count(%rcx)
5495 jnz retint_restore_args
5496- bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
5497+ bt $TIF_NEED_RESCHED,TI_flags(%rcx)
5498 jnc retint_restore_args
5499 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
5500 jnc retint_restore_args
5501@@ -630,6 +785,9 @@ END(invalidate_interrupt\num)
5502 ENTRY(call_function_interrupt)
5503 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
5504 END(call_function_interrupt)
5505+ENTRY(call_function_single_interrupt)
5506+ apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
5507+END(call_function_single_interrupt)
5508 ENTRY(irq_move_cleanup_interrupt)
5509 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
5510 END(irq_move_cleanup_interrupt)
5511@@ -639,6 +797,10 @@ ENTRY(apic_timer_interrupt)
5512 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
5513 END(apic_timer_interrupt)
5514
5515+ENTRY(uv_bau_message_intr1)
5516+ apicinterrupt 220,uv_bau_message_interrupt
5517+END(uv_bau_message_intr1)
5518+
5519 ENTRY(error_interrupt)
5520 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
5521 END(error_interrupt)
5522@@ -752,7 +914,7 @@ paranoid_restore\trace:
5523 jmp irq_return
5524 paranoid_userspace\trace:
5525 GET_THREAD_INFO(%rcx)
5526- movl threadinfo_flags(%rcx),%ebx
5527+ movl TI_flags(%rcx),%ebx
5528 andl $_TIF_WORK_MASK,%ebx
5529 jz paranoid_swapgs\trace
5530 movq %rsp,%rdi /* &pt_regs */
5531@@ -849,7 +1011,7 @@ error_exit:
5532 testb $3,CS-ARGOFFSET(%rsp)
5533 jz retint_kernel
5534 LOCKDEP_SYS_EXIT_IRQ
5535- movl threadinfo_flags(%rcx),%edx
5536+ movl TI_flags(%rcx),%edx
5537 movl $_TIF_WORK_MASK,%edi
5538 andl %edi,%edx
5539 jnz retint_careful
5540@@ -871,11 +1033,11 @@ error_kernelspace:
5541 B stepping K8s sometimes report a truncated RIP for IRET
5542 B stepping K8s sometimes report an truncated RIP for IRET
5543 exceptions returning to compat mode. Check for these here too. */
5544- leaq irq_return(%rip),%rbp
5545- cmpq %rbp,RIP(%rsp)
5546+ leaq irq_return(%rip),%rcx
5547+ cmpq %rcx,RIP(%rsp)
5548 je error_swapgs
5549- movl %ebp,%ebp /* zero extend */
5550- cmpq %rbp,RIP(%rsp)
5551+ movl %ecx,%ecx /* zero extend */
5552+ cmpq %rcx,RIP(%rsp)
5553 je error_swapgs
5554 cmpq $gs_change,RIP(%rsp)
5555 je error_swapgs
5556@@ -1121,6 +1283,7 @@ END(device_not_available)
5557 /* runs on exception stack */
5558 KPROBE_ENTRY(debug)
5559 /* INTR_FRAME
5560+ PARAVIRT_ADJUST_EXCEPTION_FRAME
5561 pushq $0
5562 CFI_ADJUST_CFA_OFFSET 8 */
5563 zeroentry do_debug
5564@@ -1148,6 +1311,7 @@ END(do_nmi_callback)
5565
5566 KPROBE_ENTRY(int3)
5567 /* INTR_FRAME
5568+ PARAVIRT_ADJUST_EXCEPTION_FRAME
5569 pushq $0
5570 CFI_ADJUST_CFA_OFFSET 8 */
5571 zeroentry do_int3
5572@@ -1171,14 +1335,11 @@ ENTRY(coprocessor_segment_overrun)
5573 zeroentry do_coprocessor_segment_overrun
5574 END(coprocessor_segment_overrun)
5575
5576-ENTRY(reserved)
5577- zeroentry do_reserved
5578-END(reserved)
5579-
5580 #if 0
5581 /* runs on exception stack */
5582 ENTRY(double_fault)
5583 XCPT_FRAME
5584+ PARAVIRT_ADJUST_EXCEPTION_FRAME
5585 paranoidentry do_double_fault
5586 jmp paranoid_exit1
5587 CFI_ENDPROC
5588@@ -1196,6 +1357,7 @@ END(segment_not_present)
5589 /* runs on exception stack */
5590 ENTRY(stack_segment)
5591 /* XCPT_FRAME
5592+ PARAVIRT_ADJUST_EXCEPTION_FRAME
5593 paranoidentry do_stack_segment */
5594 errorentry do_stack_segment
5595 /* jmp paranoid_exit1
5596@@ -1222,6 +1384,7 @@ END(spurious_interrupt_bug)
5597 /* runs on exception stack */
5598 ENTRY(machine_check)
5599 INTR_FRAME
5600+ PARAVIRT_ADJUST_EXCEPTION_FRAME
5601 pushq $0
5602 CFI_ADJUST_CFA_OFFSET 8
5603 paranoidentry do_machine_check
5604Index: head-2008-12-01/arch/x86/kernel/genapic_64-xen.c
5605===================================================================
5606--- head-2008-12-01.orig/arch/x86/kernel/genapic_64-xen.c 2008-12-01 11:44:55.000000000 +0100
5607+++ head-2008-12-01/arch/x86/kernel/genapic_64-xen.c 2008-12-01 11:49:07.000000000 +0100
5608@@ -58,7 +58,7 @@ void __init setup_apic_routing(void)
5609 else
5610 #endif
5611
5612- if (num_possible_cpus() <= 8)
5613+ if (max_physical_apicid < 8)
5614 genapic = &apic_flat;
5615 else
5616 genapic = &apic_physflat;
5617@@ -121,4 +121,5 @@ int is_uv_system(void)
5618 {
5619 return uv_system_type != UV_NONE;
5620 }
5621+EXPORT_SYMBOL_GPL(is_uv_system);
5622 #endif
5623Index: head-2008-12-01/arch/x86/kernel/genapic_xen_64.c
5624===================================================================
5625--- head-2008-12-01.orig/arch/x86/kernel/genapic_xen_64.c 2008-12-01 11:44:55.000000000 +0100
5626+++ head-2008-12-01/arch/x86/kernel/genapic_xen_64.c 2008-12-01 11:49:07.000000000 +0100
5627@@ -43,7 +43,7 @@ void xen_send_IPI_shortcut(unsigned int
5628 __send_IPI_one(smp_processor_id(), vector);
5629 break;
5630 case APIC_DEST_ALLBUT:
5631- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5632+ for_each_possible_cpu(cpu) {
5633 if (cpu == smp_processor_id())
5634 continue;
5635 if (cpu_isset(cpu, cpu_online_map)) {
5636@@ -52,7 +52,7 @@ void xen_send_IPI_shortcut(unsigned int
5637 }
5638 break;
5639 case APIC_DEST_ALLINC:
5640- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5641+ for_each_possible_cpu(cpu) {
5642 if (cpu_isset(cpu, cpu_online_map)) {
5643 __send_IPI_one(cpu, vector);
5644 }
5645@@ -81,8 +81,6 @@ static cpumask_t xen_vector_allocation_d
5646 */
5647 static void xen_init_apic_ldr(void)
5648 {
5649- Dprintk("%s\n", __FUNCTION__);
5650- return;
5651 }
5652
5653 static void xen_send_IPI_allbutself(int vector)
5654@@ -92,14 +90,12 @@ static void xen_send_IPI_allbutself(int
5655 * we get an APIC send error if we try to broadcast.
5656 * thus we have to avoid sending IPIs in this case.
5657 */
5658- Dprintk("%s\n", __FUNCTION__);
5659 if (num_online_cpus() > 1)
5660 xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
5661 }
5662
5663 static void xen_send_IPI_all(int vector)
5664 {
5665- Dprintk("%s\n", __FUNCTION__);
5666 xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
5667 }
5668
5669@@ -109,11 +105,10 @@ static void xen_send_IPI_mask(cpumask_t
5670 unsigned int cpu;
5671 unsigned long flags;
5672
5673- Dprintk("%s\n", __FUNCTION__);
5674 local_irq_save(flags);
5675 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
5676
5677- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5678+ for_each_possible_cpu(cpu) {
5679 if (cpu_isset(cpu, cpumask)) {
5680 __send_IPI_one(cpu, vector);
5681 }
5682@@ -125,14 +120,12 @@ static void xen_send_IPI_mask(cpumask_t
5683 static int xen_apic_id_registered(void)
5684 {
5685 /* better be set */
5686- Dprintk("%s\n", __FUNCTION__);
5687 return physid_isset(smp_processor_id(), phys_cpu_present_map);
5688 }
5689 #endif
5690
5691 static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
5692 {
5693- Dprintk("%s\n", __FUNCTION__);
5694 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
5695 }
5696
5697@@ -140,7 +133,6 @@ static unsigned int phys_pkg_id(int inde
5698 {
5699 u32 ebx;
5700
5701- Dprintk("%s\n", __FUNCTION__);
5702 ebx = cpuid_ebx(1);
5703 return ((ebx >> 24) & 0xFF) >> index_msb;
5704 }
5705Index: head-2008-12-01/arch/x86/kernel/head-xen.c
5706===================================================================
5707--- /dev/null 1970-01-01 00:00:00.000000000 +0000
5708+++ head-2008-12-01/arch/x86/kernel/head-xen.c 2008-12-01 11:49:07.000000000 +0100
5709@@ -0,0 +1,57 @@
5710+#include <linux/kernel.h>
5711+#include <linux/init.h>
5712+
5713+#include <asm/setup.h>
5714+#include <asm/bios_ebda.h>
5715+
5716+#define BIOS_LOWMEM_KILOBYTES 0x413
5717+
5718+/*
5719+ * The BIOS places the EBDA/XBDA at the top of conventional
5720+ * memory, and usually decreases the reported amount of
5721+ * conventional memory (int 0x12) too. This also contains a
5722+ * workaround for Dell systems that neglect to reserve EBDA.
5723+ * The same workaround also avoids a problem with the AMD768MPX
5724+ * chipset: reserve a page before VGA to prevent PCI prefetch
5725+ * into it (errata #56). Usually the page is reserved anyways,
5726+ * unless you have no PS/2 mouse plugged in.
5727+ */
5728+void __init reserve_ebda_region(void)
5729+{
5730+#ifndef CONFIG_XEN
5731+ unsigned int lowmem, ebda_addr;
5732+
5733+ /* To determine the position of the EBDA and the */
5734+ /* end of conventional memory, we need to look at */
5735+ /* the BIOS data area. In a paravirtual environment */
5736+ /* that area is absent. We'll just have to assume */
5737+ /* that the paravirt case can handle memory setup */
5738+ /* correctly, without our help. */
5739+ if (paravirt_enabled())
5740+ return;
5741+
5742+ /* end of low (conventional) memory */
5743+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5744+ lowmem <<= 10;
5745+
5746+ /* start of EBDA area */
5747+ ebda_addr = get_bios_ebda();
5748+
5749+ /* Fixup: bios puts an EBDA in the top 64K segment */
5750+ /* of conventional memory, but does not adjust lowmem. */
5751+ if ((lowmem - ebda_addr) <= 0x10000)
5752+ lowmem = ebda_addr;
5753+
5754+ /* Fixup: bios does not report an EBDA at all. */
5755+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5756+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5757+ lowmem = 0x9f000;
5758+
5759+ /* Paranoia: should never happen, but... */
5760+ if ((lowmem == 0) || (lowmem >= 0x100000))
5761+ lowmem = 0x9f000;
5762+
5763+ /* reserve all memory between lowmem and the 1MB mark */
5764+ reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
5765+#endif
5766+}
5767Index: head-2008-12-01/arch/x86/kernel/head32-xen.c
5768===================================================================
5769--- /dev/null 1970-01-01 00:00:00.000000000 +0000
5770+++ head-2008-12-01/arch/x86/kernel/head32-xen.c 2008-12-01 11:49:07.000000000 +0100
5771@@ -0,0 +1,57 @@
5772+/*
5773+ * linux/arch/i386/kernel/head32.c -- prepare to run common code
5774+ *
5775+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5776+ * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
5777+ */
5778+
5779+#include <linux/init.h>
5780+#include <linux/start_kernel.h>
5781+
5782+#include <asm/setup.h>
5783+#include <asm/sections.h>
5784+#include <asm/e820.h>
5785+#include <asm/bios_ebda.h>
5786+
5787+void __init i386_start_kernel(void)
5788+{
5789+ reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
5790+
5791+#ifndef CONFIG_XEN
5792+#ifdef CONFIG_BLK_DEV_INITRD
5793+ /* Reserve INITRD */
5794+ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
5795+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
5796+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
5797+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
5798+ reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
5799+ }
5800+#endif
5801+ reserve_early(init_pg_tables_start, init_pg_tables_end,
5802+ "INIT_PG_TABLE");
5803+#else
5804+ reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
5805+ __pa(xen_start_info->pt_base)
5806+ + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
5807+ "Xen provided");
5808+
5809+ {
5810+ int max_cmdline;
5811+
5812+ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
5813+ max_cmdline = COMMAND_LINE_SIZE;
5814+ memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
5815+ boot_command_line[max_cmdline-1] = '\0';
5816+ }
5817+#endif
5818+
5819+ reserve_ebda_region();
5820+
5821+ /*
5822+ * At this point everything still needed from the boot loader
5823+ * or BIOS or kernel text should be early reserved or marked not
5824+ * RAM in e820. All other memory is free game.
5825+ */
5826+
5827+ start_kernel();
5828+}
5829Index: head-2008-12-01/arch/x86/kernel/head64-xen.c
5830===================================================================
5831--- head-2008-12-01.orig/arch/x86/kernel/head64-xen.c 2008-12-01 11:44:55.000000000 +0100
5832+++ head-2008-12-01/arch/x86/kernel/head64-xen.c 2008-12-01 11:49:07.000000000 +0100
5833@@ -32,7 +32,26 @@
5834 #include <asm/e820.h>
5835 #include <asm/bios_ebda.h>
5836
5837-unsigned long start_pfn;
5838+/* boot cpu pda */
5839+static struct x8664_pda _boot_cpu_pda __read_mostly;
5840+
5841+#ifdef CONFIG_SMP
5842+/*
5843+ * We install an empty cpu_pda pointer table to indicate to early users
5844+ * (numa_set_node) that the cpu_pda pointer table for cpus other than
5845+ * the boot cpu is not yet setup.
5846+ */
5847+static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
5848+#else
5849+static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
5850+#endif
5851+
5852+void __init x86_64_init_pda(void)
5853+{
5854+ _cpu_pda = __cpu_pda;
5855+ cpu_pda(0) = &_boot_cpu_pda;
5856+ pda_init(0);
5857+}
5858
5859 #ifndef CONFIG_XEN
5860 static void __init zap_identity_mappings(void)
5861@@ -77,83 +96,10 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
5862 unsigned int machine_to_phys_order;
5863 EXPORT_SYMBOL(machine_to_phys_order);
5864
5865-#define BIOS_LOWMEM_KILOBYTES 0x413
5866-
5867-/*
5868- * The BIOS places the EBDA/XBDA at the top of conventional
5869- * memory, and usually decreases the reported amount of
5870- * conventional memory (int 0x12) too. This also contains a
5871- * workaround for Dell systems that neglect to reserve EBDA.
5872- * The same workaround also avoids a problem with the AMD768MPX
5873- * chipset: reserve a page before VGA to prevent PCI prefetch
5874- * into it (errata #56). Usually the page is reserved anyways,
5875- * unless you have no PS/2 mouse plugged in.
5876- */
5877-static void __init reserve_ebda_region(void)
5878-{
5879-#ifndef CONFIG_XEN
5880- unsigned int lowmem, ebda_addr;
5881-
5882- /* To determine the position of the EBDA and the */
5883- /* end of conventional memory, we need to look at */
5884- /* the BIOS data area. In a paravirtual environment */
5885- /* that area is absent. We'll just have to assume */
5886- /* that the paravirt case can handle memory setup */
5887- /* correctly, without our help. */
5888- if (paravirt_enabled())
5889- return;
5890-
5891- /* end of low (conventional) memory */
5892- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5893- lowmem <<= 10;
5894-
5895- /* start of EBDA area */
5896- ebda_addr = get_bios_ebda();
5897-
5898- /* Fixup: bios puts an EBDA in the top 64K segment */
5899- /* of conventional memory, but does not adjust lowmem. */
5900- if ((lowmem - ebda_addr) <= 0x10000)
5901- lowmem = ebda_addr;
5902-
5903- /* Fixup: bios does not report an EBDA at all. */
5904- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5905- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5906- lowmem = 0x9f000;
5907-
5908- /* Paranoia: should never happen, but... */
5909- if ((lowmem == 0) || (lowmem >= 0x100000))
5910- lowmem = 0x9f000;
5911-
5912- /* reserve all memory between lowmem and the 1MB mark */
5913- reserve_early(lowmem, 0x100000, "BIOS reserved");
5914-#endif
5915-}
5916-
5917-static void __init reserve_setup_data(void)
5918-{
5919-#ifndef CONFIG_XEN
5920- struct setup_data *data;
5921- unsigned long pa_data;
5922- char buf[32];
5923-
5924- if (boot_params.hdr.version < 0x0209)
5925- return;
5926- pa_data = boot_params.hdr.setup_data;
5927- while (pa_data) {
5928- data = early_ioremap(pa_data, sizeof(*data));
5929- sprintf(buf, "setup data %x", data->type);
5930- reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
5931- pa_data = data->next;
5932- early_iounmap(data, sizeof(*data));
5933- }
5934-#endif
5935-}
5936-
5937 void __init x86_64_start_kernel(char * real_mode_data)
5938 {
5939 struct xen_machphys_mapping mapping;
5940 unsigned long machine_to_phys_nr_ents;
5941- int i;
5942
5943 /*
5944 * Build-time sanity checks on the kernel image and module
5945@@ -167,6 +113,7 @@ void __init x86_64_start_kernel(char * r
5946 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
5947 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
5948 (__START_KERNEL & PGDIR_MASK)));
5949+ BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
5950
5951 xen_setup_features();
5952
5953@@ -174,8 +121,6 @@ void __init x86_64_start_kernel(char * r
5954 if (!xen_feature(XENFEAT_auto_translated_physmap))
5955 phys_to_machine_mapping =
5956 (unsigned long *)xen_start_info->mfn_list;
5957- start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
5958- xen_start_info->nr_pt_frames;
5959
5960 machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
5961 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
5962@@ -208,19 +153,23 @@ void __init x86_64_start_kernel(char * r
5963
5964 early_printk("Kernel alive\n");
5965
5966- for (i = 0; i < NR_CPUS; i++)
5967- cpu_pda(i) = &boot_cpu_pda[i];
5968+ x86_64_init_pda();
5969
5970- pda_init(0);
5971+ early_printk("Kernel really alive\n");
5972+
5973+ x86_64_start_reservations(real_mode_data);
5974+}
5975+
5976+void __init x86_64_start_reservations(char *real_mode_data)
5977+{
5978 copy_bootdata(__va(real_mode_data));
5979
5980 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
5981
5982 reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
5983- start_pfn << PAGE_SHIFT, "Xen provided");
5984-
5985- reserve_ebda_region();
5986- reserve_setup_data();
5987+ __pa(xen_start_info->pt_base)
5988+ + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
5989+ "Xen provided");
5990
5991 /*
5992 * At this point everything still needed from the boot loader
5993Index: head-2008-12-01/arch/x86/kernel/head_64-xen.S
5994===================================================================
5995--- head-2008-12-01.orig/arch/x86/kernel/head_64-xen.S 2008-12-01 11:36:47.000000000 +0100
5996+++ head-2008-12-01/arch/x86/kernel/head_64-xen.S 2008-12-01 11:49:07.000000000 +0100
5997@@ -95,53 +95,6 @@ NEXT_PAGE(hypercall_page)
5998
5999 #undef NEXT_PAGE
6000
6001- .data
6002-
6003- .align 16
6004- .globl cpu_gdt_descr
6005-cpu_gdt_descr:
6006- .word gdt_end-cpu_gdt_table-1
6007-gdt:
6008- .quad cpu_gdt_table
6009-#ifdef CONFIG_SMP
6010- .rept NR_CPUS-1
6011- .word 0
6012- .quad 0
6013- .endr
6014-#endif
6015-
6016-/* We need valid kernel segments for data and code in long mode too
6017- * IRET will check the segment types kkeil 2000/10/28
6018- * Also sysret mandates a special GDT layout
6019- */
6020-
6021- .section .data.page_aligned, "aw"
6022- .align PAGE_SIZE
6023-
6024-/* The TLS descriptors are currently at a different place compared to i386.
6025- Hopefully nobody expects them at a fixed place (Wine?) */
6026-
6027-ENTRY(cpu_gdt_table)
6028- .quad 0x0000000000000000 /* NULL descriptor */
6029- .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
6030- .quad 0x00af9b000000ffff /* __KERNEL_CS */
6031- .quad 0x00cf93000000ffff /* __KERNEL_DS */
6032- .quad 0x00cffb000000ffff /* __USER32_CS */
6033- .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
6034- .quad 0x00affb000000ffff /* __USER_CS */
6035- .quad 0x0 /* unused */
6036- .quad 0,0 /* TSS */
6037- .quad 0,0 /* LDT */
6038- .quad 0,0,0 /* three TLS descriptors */
6039- .quad 0x0000f40000000000 /* node/CPU stored in limit */
6040-gdt_end:
6041- /* asm/segment.h:GDT_ENTRIES must match this */
6042- /* This should be a multiple of the cache line size */
6043- /* GDTs of other CPUs are now dynamically allocated */
6044-
6045- /* zero the remaining page */
6046- .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
6047-
6048 .section .bss.page_aligned, "aw", @nobits
6049 .align PAGE_SIZE
6050 ENTRY(empty_zero_page)
6051Index: head-2008-12-01/arch/x86/kernel/io_apic_32-xen.c
6052===================================================================
6053--- head-2008-12-01.orig/arch/x86/kernel/io_apic_32-xen.c 2008-12-01 11:44:55.000000000 +0100
6054+++ head-2008-12-01/arch/x86/kernel/io_apic_32-xen.c 2008-12-01 11:49:07.000000000 +0100
6055@@ -25,6 +25,7 @@
6056 #include <linux/init.h>
6057 #include <linux/delay.h>
6058 #include <linux/sched.h>
6059+#include <linux/bootmem.h>
6060 #include <linux/mc146818rtc.h>
6061 #include <linux/compiler.h>
6062 #include <linux/acpi.h>
6063@@ -75,7 +76,7 @@ static struct { int pin, apic; } ioapic_
6064 static DEFINE_SPINLOCK(ioapic_lock);
6065 static DEFINE_SPINLOCK(vector_lock);
6066
6067-int timer_over_8254 __initdata = 1;
6068+int timer_through_8259 __initdata;
6069
6070 /*
6071 * Is the SiS APIC rmw bug present ?
6072@@ -89,15 +90,21 @@ int sis_apic_bug = -1;
6073 int nr_ioapic_registers[MAX_IO_APICS];
6074
6075 /* I/O APIC entries */
6076-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6077+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
6078 int nr_ioapics;
6079
6080 /* MP IRQ source entries */
6081-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6082+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6083
6084 /* # of MP IRQ source entries */
6085 int mp_irq_entries;
6086
6087+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
6088+int mp_bus_id_to_type[MAX_MP_BUSSES];
6089+#endif
6090+
6091+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
6092+
6093 static int disable_timer_pin_1 __initdata;
6094
6095 /*
6096@@ -128,7 +135,7 @@ struct io_apic {
6097 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
6098 {
6099 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
6100- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
6101+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
6102 }
6103 #endif
6104
6105@@ -142,7 +149,7 @@ static inline unsigned int io_apic_read(
6106 struct physdev_apic apic_op;
6107 int ret;
6108
6109- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6110+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6111 apic_op.reg = reg;
6112 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
6113 if (ret)
6114@@ -160,7 +167,7 @@ static inline void io_apic_write(unsigne
6115 #else
6116 struct physdev_apic apic_op;
6117
6118- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6119+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6120 apic_op.reg = reg;
6121 apic_op.value = value;
6122 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
6123@@ -288,7 +295,7 @@ static void __init replace_pin_at_irq(un
6124 }
6125 }
6126
6127-static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
6128+static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
6129 {
6130 struct irq_pin_list *entry = irq_2_pin + irq;
6131 unsigned int pin, reg;
6132@@ -308,30 +315,32 @@ static void __modify_IO_APIC_irq (unsign
6133 }
6134
6135 /* mask = 1 */
6136-static void __mask_IO_APIC_irq (unsigned int irq)
6137+static void __mask_IO_APIC_irq(unsigned int irq)
6138 {
6139- __modify_IO_APIC_irq(irq, 0x00010000, 0);
6140+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
6141 }
6142
6143 /* mask = 0 */
6144-static void __unmask_IO_APIC_irq (unsigned int irq)
6145+static void __unmask_IO_APIC_irq(unsigned int irq)
6146 {
6147- __modify_IO_APIC_irq(irq, 0, 0x00010000);
6148+ __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
6149 }
6150
6151 /* mask = 1, trigger = 0 */
6152-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
6153+static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
6154 {
6155- __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
6156+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
6157+ IO_APIC_REDIR_LEVEL_TRIGGER);
6158 }
6159
6160 /* mask = 0, trigger = 1 */
6161-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
6162+static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
6163 {
6164- __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
6165+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
6166+ IO_APIC_REDIR_MASKED);
6167 }
6168
6169-static void mask_IO_APIC_irq (unsigned int irq)
6170+static void mask_IO_APIC_irq(unsigned int irq)
6171 {
6172 unsigned long flags;
6173
6174@@ -340,7 +349,7 @@ static void mask_IO_APIC_irq (unsigned i
6175 spin_unlock_irqrestore(&ioapic_lock, flags);
6176 }
6177
6178-static void unmask_IO_APIC_irq (unsigned int irq)
6179+static void unmask_IO_APIC_irq(unsigned int irq)
6180 {
6181 unsigned long flags;
6182
6183@@ -352,7 +361,7 @@ static void unmask_IO_APIC_irq (unsigned
6184 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
6185 {
6186 struct IO_APIC_route_entry entry;
6187-
6188+
6189 /* Check delivery_mode to be sure we're not clearing an SMI pin */
6190 entry = ioapic_read_entry(apic, pin);
6191 if (entry.delivery_mode == dest_SMI)
6192@@ -364,7 +373,7 @@ static void clear_IO_APIC_pin(unsigned i
6193 ioapic_mask_entry(apic, pin);
6194 }
6195
6196-static void clear_IO_APIC (void)
6197+static void clear_IO_APIC(void)
6198 {
6199 int apic, pin;
6200
6201@@ -381,7 +390,7 @@ static void set_ioapic_affinity_irq(unsi
6202 struct irq_pin_list *entry = irq_2_pin + irq;
6203 unsigned int apicid_value;
6204 cpumask_t tmp;
6205-
6206+
6207 cpus_and(tmp, cpumask, cpu_online_map);
6208 if (cpus_empty(tmp))
6209 tmp = TARGET_CPUS;
6210@@ -410,7 +419,7 @@ static void set_ioapic_affinity_irq(unsi
6211 # include <linux/kernel_stat.h> /* kstat */
6212 # include <linux/slab.h> /* kmalloc() */
6213 # include <linux/timer.h>
6214-
6215+
6216 #define IRQBALANCE_CHECK_ARCH -999
6217 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
6218 #define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
6219@@ -422,14 +431,14 @@ static int physical_balance __read_mostl
6220 static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
6221
6222 static struct irq_cpu_info {
6223- unsigned long * last_irq;
6224- unsigned long * irq_delta;
6225+ unsigned long *last_irq;
6226+ unsigned long *irq_delta;
6227 unsigned long irq;
6228 } irq_cpu_data[NR_CPUS];
6229
6230 #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
6231-#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
6232-#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
6233+#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
6234+#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
6235
6236 #define IDLE_ENOUGH(cpu,now) \
6237 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
6238@@ -468,8 +477,8 @@ inside:
6239 if (cpu == -1)
6240 cpu = NR_CPUS-1;
6241 }
6242- } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
6243- (search_idle && !IDLE_ENOUGH(cpu,now)));
6244+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
6245+ (search_idle && !IDLE_ENOUGH(cpu, now)));
6246
6247 return cpu;
6248 }
6249@@ -479,15 +488,14 @@ static inline void balance_irq(int cpu,
6250 unsigned long now = jiffies;
6251 cpumask_t allowed_mask;
6252 unsigned int new_cpu;
6253-
6254+
6255 if (irqbalance_disabled)
6256- return;
6257+ return;
6258
6259 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
6260 new_cpu = move(cpu, allowed_mask, now, 1);
6261- if (cpu != new_cpu) {
6262+ if (cpu != new_cpu)
6263 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
6264- }
6265 }
6266
6267 static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
6268@@ -499,14 +507,14 @@ static inline void rotate_irqs_among_cpu
6269 if (!irq_desc[j].action)
6270 continue;
6271 /* Is it a significant load ? */
6272- if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
6273+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
6274 useful_load_threshold)
6275 continue;
6276 balance_irq(i, j);
6277 }
6278 }
6279 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6280- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6281+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6282 return;
6283 }
6284
6285@@ -535,22 +543,22 @@ static void do_irq_balance(void)
6286 /* Is this an active IRQ or balancing disabled ? */
6287 if (!irq_desc[j].action || irq_balancing_disabled(j))
6288 continue;
6289- if ( package_index == i )
6290- IRQ_DELTA(package_index,j) = 0;
6291+ if (package_index == i)
6292+ IRQ_DELTA(package_index, j) = 0;
6293 /* Determine the total count per processor per IRQ */
6294 value_now = (unsigned long) kstat_cpu(i).irqs[j];
6295
6296 /* Determine the activity per processor per IRQ */
6297- delta = value_now - LAST_CPU_IRQ(i,j);
6298+ delta = value_now - LAST_CPU_IRQ(i, j);
6299
6300 /* Update last_cpu_irq[][] for the next time */
6301- LAST_CPU_IRQ(i,j) = value_now;
6302+ LAST_CPU_IRQ(i, j) = value_now;
6303
6304 /* Ignore IRQs whose rate is less than the clock */
6305 if (delta < useful_load_threshold)
6306 continue;
6307 /* update the load for the processor or package total */
6308- IRQ_DELTA(package_index,j) += delta;
6309+ IRQ_DELTA(package_index, j) += delta;
6310
6311 /* Keep track of the higher numbered sibling as well */
6312 if (i != package_index)
6313@@ -576,7 +584,8 @@ static void do_irq_balance(void)
6314 max_cpu_irq = ULONG_MAX;
6315
6316 tryanothercpu:
6317- /* Look for heaviest loaded processor.
6318+ /*
6319+ * Look for heaviest loaded processor.
6320 * We may come back to get the next heaviest loaded processor.
6321 * Skip processors with trivial loads.
6322 */
6323@@ -585,7 +594,7 @@ tryanothercpu:
6324 for_each_online_cpu(i) {
6325 if (i != CPU_TO_PACKAGEINDEX(i))
6326 continue;
6327- if (max_cpu_irq <= CPU_IRQ(i))
6328+ if (max_cpu_irq <= CPU_IRQ(i))
6329 continue;
6330 if (tmp_cpu_irq < CPU_IRQ(i)) {
6331 tmp_cpu_irq = CPU_IRQ(i);
6332@@ -594,8 +603,9 @@ tryanothercpu:
6333 }
6334
6335 if (tmp_loaded == -1) {
6336- /* In the case of small number of heavy interrupt sources,
6337- * loading some of the cpus too much. We use Ingo's original
6338+ /*
6339+ * In the case of small number of heavy interrupt sources,
6340+ * loading some of the cpus too much. We use Ingo's original
6341 * approach to rotate them around.
6342 */
6343 if (!first_attempt && imbalance >= useful_load_threshold) {
6344@@ -604,13 +614,14 @@ tryanothercpu:
6345 }
6346 goto not_worth_the_effort;
6347 }
6348-
6349+
6350 first_attempt = 0; /* heaviest search */
6351 max_cpu_irq = tmp_cpu_irq; /* load */
6352 max_loaded = tmp_loaded; /* processor */
6353 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
6354-
6355- /* if imbalance is less than approx 10% of max load, then
6356+
6357+ /*
6358+ * if imbalance is less than approx 10% of max load, then
6359 * observe diminishing returns action. - quit
6360 */
6361 if (imbalance < (max_cpu_irq >> 3))
6362@@ -626,26 +637,25 @@ tryanotherirq:
6363 /* Is this an active IRQ? */
6364 if (!irq_desc[j].action)
6365 continue;
6366- if (imbalance <= IRQ_DELTA(max_loaded,j))
6367+ if (imbalance <= IRQ_DELTA(max_loaded, j))
6368 continue;
6369 /* Try to find the IRQ that is closest to the imbalance
6370 * without going over.
6371 */
6372- if (move_this_load < IRQ_DELTA(max_loaded,j)) {
6373- move_this_load = IRQ_DELTA(max_loaded,j);
6374+ if (move_this_load < IRQ_DELTA(max_loaded, j)) {
6375+ move_this_load = IRQ_DELTA(max_loaded, j);
6376 selected_irq = j;
6377 }
6378 }
6379- if (selected_irq == -1) {
6380+ if (selected_irq == -1)
6381 goto tryanothercpu;
6382- }
6383
6384 imbalance = move_this_load;
6385-
6386+
6387 /* For physical_balance case, we accumulated both load
6388 * values in the one of the siblings cpu_irq[],
6389 * to use the same code for physical and logical processors
6390- * as much as possible.
6391+ * as much as possible.
6392 *
6393 * NOTE: the cpu_irq[] array holds the sum of the load for
6394 * sibling A and sibling B in the slot for the lowest numbered
6395@@ -674,11 +684,11 @@ tryanotherirq:
6396 /* mark for change destination */
6397 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
6398
6399- /* Since we made a change, come back sooner to
6400+ /* Since we made a change, come back sooner to
6401 * check for more variation.
6402 */
6403 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6404- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6405+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6406 return;
6407 }
6408 goto tryanotherirq;
6409@@ -689,7 +699,7 @@ not_worth_the_effort:
6410 * upward
6411 */
6412 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
6413- balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6414+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6415 return;
6416 }
6417
6418@@ -728,13 +738,13 @@ static int __init balanced_irq_init(void
6419 cpumask_t tmp;
6420
6421 cpus_shift_right(tmp, cpu_online_map, 2);
6422- c = &boot_cpu_data;
6423+ c = &boot_cpu_data;
6424 /* When not overwritten by the command line ask subarchitecture. */
6425 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
6426 irqbalance_disabled = NO_BALANCE_IRQ;
6427 if (irqbalance_disabled)
6428 return 0;
6429-
6430+
6431 /* disable irqbalance completely if there is only one processor online */
6432 if (num_online_cpus() < 2) {
6433 irqbalance_disabled = 1;
6434@@ -748,16 +758,14 @@ static int __init balanced_irq_init(void
6435 physical_balance = 1;
6436
6437 for_each_online_cpu(i) {
6438- irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6439- irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6440+ irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6441+ irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6442 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
6443 printk(KERN_ERR "balanced_irq_init: out of memory");
6444 goto failed;
6445 }
6446- memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
6447- memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
6448 }
6449-
6450+
6451 printk(KERN_INFO "Starting balanced_irq\n");
6452 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
6453 return 0;
6454@@ -799,7 +807,7 @@ void send_IPI_self(int vector)
6455 /*
6456 * Send the IPI. The write to APIC_ICR fires this off.
6457 */
6458- apic_write_around(APIC_ICR, cfg);
6459+ apic_write(APIC_ICR, cfg);
6460 #endif
6461 }
6462 #endif /* !CONFIG_SMP */
6463@@ -853,10 +861,10 @@ static int find_irq_entry(int apic, int
6464 int i;
6465
6466 for (i = 0; i < mp_irq_entries; i++)
6467- if (mp_irqs[i].mpc_irqtype == type &&
6468- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
6469- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
6470- mp_irqs[i].mpc_dstirq == pin)
6471+ if (mp_irqs[i].mp_irqtype == type &&
6472+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
6473+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
6474+ mp_irqs[i].mp_dstirq == pin)
6475 return i;
6476
6477 return -1;
6478@@ -871,13 +879,13 @@ static int __init find_isa_irq_pin(int i
6479 int i;
6480
6481 for (i = 0; i < mp_irq_entries; i++) {
6482- int lbus = mp_irqs[i].mpc_srcbus;
6483+ int lbus = mp_irqs[i].mp_srcbus;
6484
6485 if (test_bit(lbus, mp_bus_not_pci) &&
6486- (mp_irqs[i].mpc_irqtype == type) &&
6487- (mp_irqs[i].mpc_srcbusirq == irq))
6488+ (mp_irqs[i].mp_irqtype == type) &&
6489+ (mp_irqs[i].mp_srcbusirq == irq))
6490
6491- return mp_irqs[i].mpc_dstirq;
6492+ return mp_irqs[i].mp_dstirq;
6493 }
6494 return -1;
6495 }
6496@@ -887,17 +895,17 @@ static int __init find_isa_irq_apic(int
6497 int i;
6498
6499 for (i = 0; i < mp_irq_entries; i++) {
6500- int lbus = mp_irqs[i].mpc_srcbus;
6501+ int lbus = mp_irqs[i].mp_srcbus;
6502
6503 if (test_bit(lbus, mp_bus_not_pci) &&
6504- (mp_irqs[i].mpc_irqtype == type) &&
6505- (mp_irqs[i].mpc_srcbusirq == irq))
6506+ (mp_irqs[i].mp_irqtype == type) &&
6507+ (mp_irqs[i].mp_srcbusirq == irq))
6508 break;
6509 }
6510 if (i < mp_irq_entries) {
6511 int apic;
6512- for(apic = 0; apic < nr_ioapics; apic++) {
6513- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
6514+ for (apic = 0; apic < nr_ioapics; apic++) {
6515+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
6516 return apic;
6517 }
6518 }
6519@@ -918,28 +926,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6520
6521 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
6522 "slot:%d, pin:%d.\n", bus, slot, pin);
6523- if (mp_bus_id_to_pci_bus[bus] == -1) {
6524+ if (test_bit(bus, mp_bus_not_pci)) {
6525 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
6526 return -1;
6527 }
6528 for (i = 0; i < mp_irq_entries; i++) {
6529- int lbus = mp_irqs[i].mpc_srcbus;
6530+ int lbus = mp_irqs[i].mp_srcbus;
6531
6532 for (apic = 0; apic < nr_ioapics; apic++)
6533- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
6534- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
6535+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
6536+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
6537 break;
6538
6539 if (!test_bit(lbus, mp_bus_not_pci) &&
6540- !mp_irqs[i].mpc_irqtype &&
6541+ !mp_irqs[i].mp_irqtype &&
6542 (bus == lbus) &&
6543- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
6544- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
6545+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
6546+ int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
6547
6548 if (!(apic || IO_APIC_IRQ(irq)))
6549 continue;
6550
6551- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
6552+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
6553 return irq;
6554 /*
6555 * Use the first all-but-pin matching entry as a
6556@@ -954,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6557 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
6558
6559 /*
6560- * This function currently is only a helper for the i386 smp boot process where
6561+ * This function currently is only a helper for the i386 smp boot process where
6562 * we need to reprogram the ioredtbls to cater for the cpus which have come online
6563 * so mask in all cases should simply be TARGET_CPUS
6564 */
6565@@ -1008,7 +1016,7 @@ static int EISA_ELCR(unsigned int irq)
6566 * EISA conforming in the MP table, that means its trigger type must
6567 * be read in from the ELCR */
6568
6569-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
6570+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
6571 #define default_EISA_polarity(idx) default_ISA_polarity(idx)
6572
6573 /* PCI interrupts are always polarity one level triggered,
6574@@ -1025,118 +1033,115 @@ static int EISA_ELCR(unsigned int irq)
6575
6576 static int MPBIOS_polarity(int idx)
6577 {
6578- int bus = mp_irqs[idx].mpc_srcbus;
6579+ int bus = mp_irqs[idx].mp_srcbus;
6580 int polarity;
6581
6582 /*
6583 * Determine IRQ line polarity (high active or low active):
6584 */
6585- switch (mp_irqs[idx].mpc_irqflag & 3)
6586+ switch (mp_irqs[idx].mp_irqflag & 3) {
6587+ case 0: /* conforms, ie. bus-type dependent polarity */
6588 {
6589- case 0: /* conforms, ie. bus-type dependent polarity */
6590- {
6591- polarity = test_bit(bus, mp_bus_not_pci)?
6592- default_ISA_polarity(idx):
6593- default_PCI_polarity(idx);
6594- break;
6595- }
6596- case 1: /* high active */
6597- {
6598- polarity = 0;
6599- break;
6600- }
6601- case 2: /* reserved */
6602- {
6603- printk(KERN_WARNING "broken BIOS!!\n");
6604- polarity = 1;
6605- break;
6606- }
6607- case 3: /* low active */
6608- {
6609- polarity = 1;
6610- break;
6611- }
6612- default: /* invalid */
6613- {
6614- printk(KERN_WARNING "broken BIOS!!\n");
6615- polarity = 1;
6616- break;
6617- }
6618+ polarity = test_bit(bus, mp_bus_not_pci)?
6619+ default_ISA_polarity(idx):
6620+ default_PCI_polarity(idx);
6621+ break;
6622+ }
6623+ case 1: /* high active */
6624+ {
6625+ polarity = 0;
6626+ break;
6627+ }
6628+ case 2: /* reserved */
6629+ {
6630+ printk(KERN_WARNING "broken BIOS!!\n");
6631+ polarity = 1;
6632+ break;
6633+ }
6634+ case 3: /* low active */
6635+ {
6636+ polarity = 1;
6637+ break;
6638+ }
6639+ default: /* invalid */
6640+ {
6641+ printk(KERN_WARNING "broken BIOS!!\n");
6642+ polarity = 1;
6643+ break;
6644+ }
6645 }
6646 return polarity;
6647 }
6648
6649 static int MPBIOS_trigger(int idx)
6650 {
6651- int bus = mp_irqs[idx].mpc_srcbus;
6652+ int bus = mp_irqs[idx].mp_srcbus;
6653 int trigger;
6654
6655 /*
6656 * Determine IRQ trigger mode (edge or level sensitive):
6657 */
6658- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
6659+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
6660+ case 0: /* conforms, ie. bus-type dependent */
6661 {
6662- case 0: /* conforms, ie. bus-type dependent */
6663- {
6664- trigger = test_bit(bus, mp_bus_not_pci)?
6665- default_ISA_trigger(idx):
6666- default_PCI_trigger(idx);
6667+ trigger = test_bit(bus, mp_bus_not_pci)?
6668+ default_ISA_trigger(idx):
6669+ default_PCI_trigger(idx);
6670 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
6671- switch (mp_bus_id_to_type[bus])
6672- {
6673- case MP_BUS_ISA: /* ISA pin */
6674- {
6675- /* set before the switch */
6676- break;
6677- }
6678- case MP_BUS_EISA: /* EISA pin */
6679- {
6680- trigger = default_EISA_trigger(idx);
6681- break;
6682- }
6683- case MP_BUS_PCI: /* PCI pin */
6684- {
6685- /* set before the switch */
6686- break;
6687- }
6688- case MP_BUS_MCA: /* MCA pin */
6689- {
6690- trigger = default_MCA_trigger(idx);
6691- break;
6692- }
6693- default:
6694- {
6695- printk(KERN_WARNING "broken BIOS!!\n");
6696- trigger = 1;
6697- break;
6698- }
6699- }
6700-#endif
6701+ switch (mp_bus_id_to_type[bus]) {
6702+ case MP_BUS_ISA: /* ISA pin */
6703+ {
6704+ /* set before the switch */
6705 break;
6706 }
6707- case 1: /* edge */
6708+ case MP_BUS_EISA: /* EISA pin */
6709 {
6710- trigger = 0;
6711+ trigger = default_EISA_trigger(idx);
6712 break;
6713 }
6714- case 2: /* reserved */
6715+ case MP_BUS_PCI: /* PCI pin */
6716 {
6717- printk(KERN_WARNING "broken BIOS!!\n");
6718- trigger = 1;
6719+ /* set before the switch */
6720 break;
6721 }
6722- case 3: /* level */
6723+ case MP_BUS_MCA: /* MCA pin */
6724 {
6725- trigger = 1;
6726+ trigger = default_MCA_trigger(idx);
6727 break;
6728 }
6729- default: /* invalid */
6730+ default:
6731 {
6732 printk(KERN_WARNING "broken BIOS!!\n");
6733- trigger = 0;
6734+ trigger = 1;
6735 break;
6736 }
6737 }
6738+#endif
6739+ break;
6740+ }
6741+ case 1: /* edge */
6742+ {
6743+ trigger = 0;
6744+ break;
6745+ }
6746+ case 2: /* reserved */
6747+ {
6748+ printk(KERN_WARNING "broken BIOS!!\n");
6749+ trigger = 1;
6750+ break;
6751+ }
6752+ case 3: /* level */
6753+ {
6754+ trigger = 1;
6755+ break;
6756+ }
6757+ default: /* invalid */
6758+ {
6759+ printk(KERN_WARNING "broken BIOS!!\n");
6760+ trigger = 0;
6761+ break;
6762+ }
6763+ }
6764 return trigger;
6765 }
6766
6767@@ -1153,16 +1158,16 @@ static inline int irq_trigger(int idx)
6768 static int pin_2_irq(int idx, int apic, int pin)
6769 {
6770 int irq, i;
6771- int bus = mp_irqs[idx].mpc_srcbus;
6772+ int bus = mp_irqs[idx].mp_srcbus;
6773
6774 /*
6775 * Debugging check, we are in big trouble if this message pops up!
6776 */
6777- if (mp_irqs[idx].mpc_dstirq != pin)
6778+ if (mp_irqs[idx].mp_dstirq != pin)
6779 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
6780
6781 if (test_bit(bus, mp_bus_not_pci))
6782- irq = mp_irqs[idx].mpc_srcbusirq;
6783+ irq = mp_irqs[idx].mp_srcbusirq;
6784 else {
6785 /*
6786 * PCI IRQs are mapped in order
6787@@ -1204,8 +1209,8 @@ static inline int IO_APIC_irq_trigger(in
6788
6789 for (apic = 0; apic < nr_ioapics; apic++) {
6790 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6791- idx = find_irq_entry(apic,pin,mp_INT);
6792- if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
6793+ idx = find_irq_entry(apic, pin, mp_INT);
6794+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
6795 return irq_trigger(idx);
6796 }
6797 }
6798@@ -1291,25 +1296,25 @@ static void __init setup_IO_APIC_irqs(vo
6799 /*
6800 * add it to the IO-APIC irq-routing table:
6801 */
6802- memset(&entry,0,sizeof(entry));
6803+ memset(&entry, 0, sizeof(entry));
6804
6805 entry.delivery_mode = INT_DELIVERY_MODE;
6806 entry.dest_mode = INT_DEST_MODE;
6807 entry.mask = 0; /* enable IRQ */
6808- entry.dest.logical.logical_dest =
6809+ entry.dest.logical.logical_dest =
6810 cpu_mask_to_apicid(TARGET_CPUS);
6811
6812- idx = find_irq_entry(apic,pin,mp_INT);
6813+ idx = find_irq_entry(apic, pin, mp_INT);
6814 if (idx == -1) {
6815 if (first_notcon) {
6816 apic_printk(APIC_VERBOSE, KERN_DEBUG
6817 " IO-APIC (apicid-pin) %d-%d",
6818- mp_ioapics[apic].mpc_apicid,
6819+ mp_ioapics[apic].mp_apicid,
6820 pin);
6821 first_notcon = 0;
6822 } else
6823 apic_printk(APIC_VERBOSE, ", %d-%d",
6824- mp_ioapics[apic].mpc_apicid, pin);
6825+ mp_ioapics[apic].mp_apicid, pin);
6826 continue;
6827 }
6828
6829@@ -1343,7 +1348,7 @@ static void __init setup_IO_APIC_irqs(vo
6830 vector = assign_irq_vector(irq);
6831 entry.vector = vector;
6832 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
6833-
6834+
6835 if (!apic && (irq < 16))
6836 disable_8259A_irq(irq);
6837 }
6838@@ -1355,27 +1360,23 @@ static void __init setup_IO_APIC_irqs(vo
6839 apic_printk(APIC_VERBOSE, " not connected.\n");
6840 }
6841
6842+#ifndef CONFIG_XEN
6843 /*
6844- * Set up the 8259A-master output pin:
6845+ * Set up the timer pin, possibly with the 8259A-master behind.
6846 */
6847-#ifndef CONFIG_XEN
6848-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
6849+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
6850+ int vector)
6851 {
6852 struct IO_APIC_route_entry entry;
6853
6854- memset(&entry,0,sizeof(entry));
6855-
6856- disable_8259A_irq(0);
6857-
6858- /* mask LVT0 */
6859- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6860+ memset(&entry, 0, sizeof(entry));
6861
6862 /*
6863 * We use logical delivery to get the timer IRQ
6864 * to the first CPU.
6865 */
6866 entry.dest_mode = INT_DEST_MODE;
6867- entry.mask = 0; /* unmask IRQ now */
6868+ entry.mask = 1; /* mask IRQ now */
6869 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6870 entry.delivery_mode = INT_DELIVERY_MODE;
6871 entry.polarity = 0;
6872@@ -1384,17 +1385,14 @@ static void __init setup_ExtINT_IRQ0_pin
6873
6874 /*
6875 * The timer IRQ doesn't have to know that behind the
6876- * scene we have a 8259A-master in AEOI mode ...
6877+ * scene we may have a 8259A-master in AEOI mode ...
6878 */
6879- irq_desc[0].chip = &ioapic_chip;
6880- set_irq_handler(0, handle_edge_irq);
6881+ ioapic_register_intr(0, vector, IOAPIC_EDGE);
6882
6883 /*
6884 * Add it to the IO-APIC irq-routing table:
6885 */
6886 ioapic_write_entry(apic, pin, entry);
6887-
6888- enable_8259A_irq(0);
6889 }
6890
6891 void __init print_IO_APIC(void)
6892@@ -1409,10 +1407,10 @@ void __init print_IO_APIC(void)
6893 if (apic_verbosity == APIC_QUIET)
6894 return;
6895
6896- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6897+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6898 for (i = 0; i < nr_ioapics; i++)
6899 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
6900- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
6901+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
6902
6903 /*
6904 * We are a bit conservative about what we expect. We have to
6905@@ -1431,7 +1429,7 @@ void __init print_IO_APIC(void)
6906 reg_03.raw = io_apic_read(apic, 3);
6907 spin_unlock_irqrestore(&ioapic_lock, flags);
6908
6909- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
6910+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
6911 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
6912 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
6913 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
6914@@ -1512,7 +1510,7 @@ void __init print_IO_APIC(void)
6915 return;
6916 }
6917
6918-static void print_APIC_bitfield (int base)
6919+static void print_APIC_bitfield(int base)
6920 {
6921 unsigned int v;
6922 int i, j;
6923@@ -1533,7 +1531,7 @@ static void print_APIC_bitfield (int bas
6924 }
6925 }
6926
6927-void /*__init*/ print_local_APIC(void * dummy)
6928+void /*__init*/ print_local_APIC(void *dummy)
6929 {
6930 unsigned int v, ver, maxlvt;
6931
6932@@ -1542,6 +1540,7 @@ void /*__init*/ print_local_APIC(void *
6933
6934 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
6935 smp_processor_id(), hard_smp_processor_id());
6936+ v = apic_read(APIC_ID);
6937 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
6938 GET_APIC_ID(read_apic_id()));
6939 v = apic_read(APIC_LVR);
6940@@ -1616,9 +1615,9 @@ void /*__init*/ print_local_APIC(void *
6941 printk("\n");
6942 }
6943
6944-void print_all_local_APICs (void)
6945+void print_all_local_APICs(void)
6946 {
6947- on_each_cpu(print_local_APIC, NULL, 1, 1);
6948+ on_each_cpu(print_local_APIC, NULL, 1);
6949 }
6950
6951 void /*__init*/ print_PIC(void)
6952@@ -1639,11 +1638,11 @@ void /*__init*/ print_PIC(void)
6953 v = inb(0xa0) << 8 | inb(0x20);
6954 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
6955
6956- outb(0x0b,0xa0);
6957- outb(0x0b,0x20);
6958+ outb(0x0b, 0xa0);
6959+ outb(0x0b, 0x20);
6960 v = inb(0xa0) << 8 | inb(0x20);
6961- outb(0x0a,0xa0);
6962- outb(0x0a,0x20);
6963+ outb(0x0a, 0xa0);
6964+ outb(0x0a, 0x20);
6965
6966 spin_unlock_irqrestore(&i8259A_lock, flags);
6967
6968@@ -1652,6 +1651,8 @@ void /*__init*/ print_PIC(void)
6969 v = inb(0x4d1) << 8 | inb(0x4d0);
6970 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
6971 }
6972+#else
6973+void __init print_IO_APIC(void) {}
6974 #endif /* !CONFIG_XEN */
6975
6976 static void __init enable_IO_APIC(void)
6977@@ -1681,7 +1682,7 @@ static void __init enable_IO_APIC(void)
6978 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
6979 }
6980 #ifndef CONFIG_XEN
6981- for(apic = 0; apic < nr_ioapics; apic++) {
6982+ for (apic = 0; apic < nr_ioapics; apic++) {
6983 int pin;
6984 /* See if any of the pins is in ExtINT mode */
6985 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6986@@ -1774,7 +1775,7 @@ void disable_IO_APIC(void)
6987 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
6988 */
6989
6990-#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
6991+#ifndef CONFIG_XEN
6992 static void __init setup_ioapic_ids_from_mpc(void)
6993 {
6994 union IO_APIC_reg_00 reg_00;
6995@@ -1784,6 +1785,11 @@ static void __init setup_ioapic_ids_from
6996 unsigned char old_id;
6997 unsigned long flags;
6998
6999+#ifdef CONFIG_X86_NUMAQ
7000+ if (found_numaq)
7001+ return;
7002+#endif
7003+
7004 /*
7005 * Don't check I/O APIC IDs for xAPIC systems. They have
7006 * no meaning without the serial APIC bus.
7007@@ -1806,15 +1812,15 @@ static void __init setup_ioapic_ids_from
7008 spin_lock_irqsave(&ioapic_lock, flags);
7009 reg_00.raw = io_apic_read(apic, 0);
7010 spin_unlock_irqrestore(&ioapic_lock, flags);
7011-
7012- old_id = mp_ioapics[apic].mpc_apicid;
7013
7014- if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
7015+ old_id = mp_ioapics[apic].mp_apicid;
7016+
7017+ if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
7018 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
7019- apic, mp_ioapics[apic].mpc_apicid);
7020+ apic, mp_ioapics[apic].mp_apicid);
7021 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7022 reg_00.bits.ID);
7023- mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
7024+ mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
7025 }
7026
7027 /*
7028@@ -1823,9 +1829,9 @@ static void __init setup_ioapic_ids_from
7029 * 'stuck on smp_invalidate_needed IPI wait' messages.
7030 */
7031 if (check_apicid_used(phys_id_present_map,
7032- mp_ioapics[apic].mpc_apicid)) {
7033+ mp_ioapics[apic].mp_apicid)) {
7034 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
7035- apic, mp_ioapics[apic].mpc_apicid);
7036+ apic, mp_ioapics[apic].mp_apicid);
7037 for (i = 0; i < get_physical_broadcast(); i++)
7038 if (!physid_isset(i, phys_id_present_map))
7039 break;
7040@@ -1834,13 +1840,13 @@ static void __init setup_ioapic_ids_from
7041 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7042 i);
7043 physid_set(i, phys_id_present_map);
7044- mp_ioapics[apic].mpc_apicid = i;
7045+ mp_ioapics[apic].mp_apicid = i;
7046 } else {
7047 physid_mask_t tmp;
7048- tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
7049+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
7050 apic_printk(APIC_VERBOSE, "Setting %d in the "
7051 "phys_id_present_map\n",
7052- mp_ioapics[apic].mpc_apicid);
7053+ mp_ioapics[apic].mp_apicid);
7054 physids_or(phys_id_present_map, phys_id_present_map, tmp);
7055 }
7056
7057@@ -1849,21 +1855,21 @@ static void __init setup_ioapic_ids_from
7058 * We need to adjust the IRQ routing table
7059 * if the ID changed.
7060 */
7061- if (old_id != mp_ioapics[apic].mpc_apicid)
7062+ if (old_id != mp_ioapics[apic].mp_apicid)
7063 for (i = 0; i < mp_irq_entries; i++)
7064- if (mp_irqs[i].mpc_dstapic == old_id)
7065- mp_irqs[i].mpc_dstapic
7066- = mp_ioapics[apic].mpc_apicid;
7067+ if (mp_irqs[i].mp_dstapic == old_id)
7068+ mp_irqs[i].mp_dstapic
7069+ = mp_ioapics[apic].mp_apicid;
7070
7071 /*
7072 * Read the right value from the MPC table and
7073 * write it into the ID register.
7074- */
7075+ */
7076 apic_printk(APIC_VERBOSE, KERN_INFO
7077 "...changing IO-APIC physical APIC ID to %d ...",
7078- mp_ioapics[apic].mpc_apicid);
7079+ mp_ioapics[apic].mp_apicid);
7080
7081- reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
7082+ reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
7083 spin_lock_irqsave(&ioapic_lock, flags);
7084 io_apic_write(apic, 0, reg_00.raw);
7085 spin_unlock_irqrestore(&ioapic_lock, flags);
7086@@ -1874,17 +1880,13 @@ static void __init setup_ioapic_ids_from
7087 spin_lock_irqsave(&ioapic_lock, flags);
7088 reg_00.raw = io_apic_read(apic, 0);
7089 spin_unlock_irqrestore(&ioapic_lock, flags);
7090- if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
7091+ if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
7092 printk("could not set ID!\n");
7093 else
7094 apic_printk(APIC_VERBOSE, " ok.\n");
7095 }
7096 }
7097-#else
7098-static void __init setup_ioapic_ids_from_mpc(void) { }
7099-#endif
7100
7101-#ifndef CONFIG_XEN
7102 int no_timer_check __initdata;
7103
7104 static int __init notimercheck(char *s)
7105@@ -2077,45 +2079,53 @@ static inline void init_IO_APIC_traps(vo
7106 * The local APIC irq-chip implementation:
7107 */
7108
7109-static void ack_apic(unsigned int irq)
7110+static void ack_lapic_irq(unsigned int irq)
7111 {
7112 ack_APIC_irq();
7113 }
7114
7115-static void mask_lapic_irq (unsigned int irq)
7116+static void mask_lapic_irq(unsigned int irq)
7117 {
7118 unsigned long v;
7119
7120 v = apic_read(APIC_LVT0);
7121- apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
7122+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
7123 }
7124
7125-static void unmask_lapic_irq (unsigned int irq)
7126+static void unmask_lapic_irq(unsigned int irq)
7127 {
7128 unsigned long v;
7129
7130 v = apic_read(APIC_LVT0);
7131- apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
7132+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
7133 }
7134
7135 static struct irq_chip lapic_chip __read_mostly = {
7136- .name = "local-APIC-edge",
7137+ .name = "local-APIC",
7138 .mask = mask_lapic_irq,
7139 .unmask = unmask_lapic_irq,
7140- .eoi = ack_apic,
7141+ .ack = ack_lapic_irq,
7142 };
7143
7144+static void lapic_register_intr(int irq, int vector)
7145+{
7146+ irq_desc[irq].status &= ~IRQ_LEVEL;
7147+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
7148+ "edge");
7149+ set_intr_gate(vector, interrupt[irq]);
7150+}
7151+
7152 static void __init setup_nmi(void)
7153 {
7154 /*
7155- * Dirty trick to enable the NMI watchdog ...
7156+ * Dirty trick to enable the NMI watchdog ...
7157 * We put the 8259A master into AEOI mode and
7158 * unmask on all local APICs LVT0 as NMI.
7159 *
7160 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
7161 * is from Maciej W. Rozycki - so we do not have to EOI from
7162 * the NMI handler or the timer interrupt.
7163- */
7164+ */
7165 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
7166
7167 enable_NMI_through_LVT0();
7168@@ -2191,11 +2201,16 @@ static inline void __init unlock_ExtINT_
7169 static inline void __init check_timer(void)
7170 {
7171 int apic1, pin1, apic2, pin2;
7172+ int no_pin1 = 0;
7173 int vector;
7174+ unsigned int ver;
7175 unsigned long flags;
7176
7177 local_irq_save(flags);
7178
7179+ ver = apic_read(APIC_LVR);
7180+ ver = GET_APIC_VERSION(ver);
7181+
7182 /*
7183 * get/set the timer IRQ vector:
7184 */
7185@@ -2204,34 +2219,54 @@ static inline void __init check_timer(vo
7186 set_intr_gate(vector, interrupt[0]);
7187
7188 /*
7189- * Subtle, code in do_timer_interrupt() expects an AEOI
7190- * mode for the 8259A whenever interrupts are routed
7191- * through I/O APICs. Also IRQ0 has to be enabled in
7192- * the 8259A which implies the virtual wire has to be
7193- * disabled in the local APIC.
7194+ * As IRQ0 is to be enabled in the 8259A, the virtual
7195+ * wire has to be disabled in the local APIC. Also
7196+ * timer interrupts need to be acknowledged manually in
7197+ * the 8259A for the i82489DX when using the NMI
7198+ * watchdog as that APIC treats NMIs as level-triggered.
7199+ * The AEOI mode will finish them in the 8259A
7200+ * automatically.
7201 */
7202- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7203+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7204 init_8259A(1);
7205- timer_ack = 1;
7206- if (timer_over_8254 > 0)
7207- enable_8259A_irq(0);
7208+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
7209
7210 pin1 = find_isa_irq_pin(0, mp_INT);
7211 apic1 = find_isa_irq_apic(0, mp_INT);
7212 pin2 = ioapic_i8259.pin;
7213 apic2 = ioapic_i8259.apic;
7214
7215- printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
7216- vector, apic1, pin1, apic2, pin2);
7217+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
7218+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
7219+ vector, apic1, pin1, apic2, pin2);
7220+
7221+ /*
7222+ * Some BIOS writers are clueless and report the ExtINTA
7223+ * I/O APIC input from the cascaded 8259A as the timer
7224+ * interrupt input. So just in case, if only one pin
7225+ * was found above, try it both directly and through the
7226+ * 8259A.
7227+ */
7228+ if (pin1 == -1) {
7229+ pin1 = pin2;
7230+ apic1 = apic2;
7231+ no_pin1 = 1;
7232+ } else if (pin2 == -1) {
7233+ pin2 = pin1;
7234+ apic2 = apic1;
7235+ }
7236
7237 if (pin1 != -1) {
7238 /*
7239 * Ok, does IRQ0 through the IOAPIC work?
7240 */
7241+ if (no_pin1) {
7242+ add_pin_to_irq(0, apic1, pin1);
7243+ setup_timer_IRQ0_pin(apic1, pin1, vector);
7244+ }
7245 unmask_IO_APIC_irq(0);
7246 if (timer_irq_works()) {
7247 if (nmi_watchdog == NMI_IO_APIC) {
7248- disable_8259A_irq(0);
7249 setup_nmi();
7250 enable_8259A_irq(0);
7251 }
7252@@ -2240,71 +2275,77 @@ static inline void __init check_timer(vo
7253 goto out;
7254 }
7255 clear_IO_APIC_pin(apic1, pin1);
7256- printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
7257- "IO-APIC\n");
7258- }
7259-
7260- printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
7261- if (pin2 != -1) {
7262- printk("\n..... (found pin %d) ...", pin2);
7263+ if (!no_pin1)
7264+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
7265+ "8254 timer not connected to IO-APIC\n");
7266+
7267+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
7268+ "(IRQ0) through the 8259A ...\n");
7269+ apic_printk(APIC_QUIET, KERN_INFO
7270+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
7271 /*
7272 * legacy devices should be connected to IO APIC #0
7273 */
7274- setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
7275+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7276+ setup_timer_IRQ0_pin(apic2, pin2, vector);
7277+ unmask_IO_APIC_irq(0);
7278+ enable_8259A_irq(0);
7279 if (timer_irq_works()) {
7280- printk("works.\n");
7281- if (pin1 != -1)
7282- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7283- else
7284- add_pin_to_irq(0, apic2, pin2);
7285+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
7286+ timer_through_8259 = 1;
7287 if (nmi_watchdog == NMI_IO_APIC) {
7288+ disable_8259A_irq(0);
7289 setup_nmi();
7290+ enable_8259A_irq(0);
7291 }
7292 goto out;
7293 }
7294 /*
7295 * Cleanup, just in case ...
7296 */
7297+ disable_8259A_irq(0);
7298 clear_IO_APIC_pin(apic2, pin2);
7299+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
7300 }
7301- printk(" failed.\n");
7302
7303 if (nmi_watchdog == NMI_IO_APIC) {
7304- printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
7305- nmi_watchdog = 0;
7306+ apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
7307+ "through the IO-APIC - disabling NMI Watchdog!\n");
7308+ nmi_watchdog = NMI_NONE;
7309 }
7310+ timer_ack = 0;
7311
7312- printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
7313+ apic_printk(APIC_QUIET, KERN_INFO
7314+ "...trying to set up timer as Virtual Wire IRQ...\n");
7315
7316- disable_8259A_irq(0);
7317- set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
7318- "fasteoi");
7319- apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7320+ lapic_register_intr(0, vector);
7321+ apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7322 enable_8259A_irq(0);
7323
7324 if (timer_irq_works()) {
7325- printk(" works.\n");
7326+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7327 goto out;
7328 }
7329- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7330- printk(" failed.\n");
7331+ disable_8259A_irq(0);
7332+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7333+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
7334
7335- printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
7336+ apic_printk(APIC_QUIET, KERN_INFO
7337+ "...trying to set up timer as ExtINT IRQ...\n");
7338
7339- timer_ack = 0;
7340 init_8259A(0);
7341 make_8259A_irq(0);
7342- apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
7343+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
7344
7345 unlock_ExtINT_logic();
7346
7347 if (timer_irq_works()) {
7348- printk(" works.\n");
7349+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7350 goto out;
7351 }
7352- printk(" failed :(.\n");
7353+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
7354 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
7355- "report. Then try booting with the 'noapic' option");
7356+ "report. Then try booting with the 'noapic' option.\n");
7357 out:
7358 local_irq_restore(flags);
7359 }
7360@@ -2314,11 +2355,21 @@ int timer_uses_ioapic_pin_0 = 0;
7361 #endif
7362
7363 /*
7364- *
7365- * IRQ's that are handled by the PIC in the MPS IOAPIC case.
7366- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
7367- * Linux doesn't really care, as it's not actually used
7368- * for any interrupt handling anyway.
7369+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
7370+ * to devices. However there may be an I/O APIC pin available for
7371+ * this interrupt regardless. The pin may be left unconnected, but
7372+ * typically it will be reused as an ExtINT cascade interrupt for
7373+ * the master 8259A. In the MPS case such a pin will normally be
7374+ * reported as an ExtINT interrupt in the MP table. With ACPI
7375+ * there is no provision for ExtINT interrupts, and in the absence
7376+ * of an override it would be treated as an ordinary ISA I/O APIC
7377+ * interrupt, that is edge-triggered and unmasked by default. We
7378+ * used to do this, but it caused problems on some systems because
7379+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
7380+ * the same ExtINT cascade interrupt to drive the local APIC of the
7381+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
7382+ * the I/O APIC in all cases now. No actual device should request
7383+ * it anyway. --macro
7384 */
7385 #define PIC_IRQS (1 << PIC_CASCADE_IR)
7386
7387@@ -2328,25 +2379,22 @@ void __init setup_IO_APIC(void)
7388 int i;
7389
7390 /* Reserve all the system vectors. */
7391- for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
7392+ for (i = first_system_vector; i < NR_VECTORS; i++)
7393 set_bit(i, used_vectors);
7394 #endif
7395
7396 enable_IO_APIC();
7397
7398- if (acpi_ioapic)
7399- io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
7400- else
7401- io_apic_irqs = ~PIC_IRQS;
7402+ io_apic_irqs = ~PIC_IRQS;
7403
7404 printk("ENABLING IO-APIC IRQs\n");
7405
7406+#ifndef CONFIG_XEN
7407 /*
7408 * Set up IO-APIC IRQ routing.
7409 */
7410 if (!acpi_ioapic)
7411 setup_ioapic_ids_from_mpc();
7412-#ifndef CONFIG_XEN
7413 sync_Arb_IDs();
7414 #endif
7415 setup_IO_APIC_irqs();
7416@@ -2356,28 +2404,14 @@ void __init setup_IO_APIC(void)
7417 print_IO_APIC();
7418 }
7419
7420-static int __init setup_disable_8254_timer(char *s)
7421-{
7422- timer_over_8254 = -1;
7423- return 1;
7424-}
7425-static int __init setup_enable_8254_timer(char *s)
7426-{
7427- timer_over_8254 = 2;
7428- return 1;
7429-}
7430-
7431-__setup("disable_8254_timer", setup_disable_8254_timer);
7432-__setup("enable_8254_timer", setup_enable_8254_timer);
7433-
7434 /*
7435 * Called after all the initialization is done. If we didnt find any
7436 * APIC bugs then we can allow the modify fast path
7437 */
7438-
7439+
7440 static int __init io_apic_bug_finalize(void)
7441 {
7442- if(sis_apic_bug == -1)
7443+ if (sis_apic_bug == -1)
7444 sis_apic_bug = 0;
7445 if (is_initial_xendomain()) {
7446 struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
7447@@ -2396,17 +2430,17 @@ struct sysfs_ioapic_data {
7448 struct sys_device dev;
7449 struct IO_APIC_route_entry entry[0];
7450 };
7451-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
7452+static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
7453
7454 static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
7455 {
7456 struct IO_APIC_route_entry *entry;
7457 struct sysfs_ioapic_data *data;
7458 int i;
7459-
7460+
7461 data = container_of(dev, struct sysfs_ioapic_data, dev);
7462 entry = data->entry;
7463- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7464+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7465 entry[i] = ioapic_read_entry(dev->id, i);
7466
7467 return 0;
7468@@ -2419,18 +2453,18 @@ static int ioapic_resume(struct sys_devi
7469 unsigned long flags;
7470 union IO_APIC_reg_00 reg_00;
7471 int i;
7472-
7473+
7474 data = container_of(dev, struct sysfs_ioapic_data, dev);
7475 entry = data->entry;
7476
7477 spin_lock_irqsave(&ioapic_lock, flags);
7478 reg_00.raw = io_apic_read(dev->id, 0);
7479- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
7480- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
7481+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
7482+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
7483 io_apic_write(dev->id, 0, reg_00.raw);
7484 }
7485 spin_unlock_irqrestore(&ioapic_lock, flags);
7486- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7487+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7488 ioapic_write_entry(dev->id, i, entry[i]);
7489
7490 return 0;
7491@@ -2444,24 +2478,23 @@ static struct sysdev_class ioapic_sysdev
7492
7493 static int __init ioapic_init_sysfs(void)
7494 {
7495- struct sys_device * dev;
7496+ struct sys_device *dev;
7497 int i, size, error = 0;
7498
7499 error = sysdev_class_register(&ioapic_sysdev_class);
7500 if (error)
7501 return error;
7502
7503- for (i = 0; i < nr_ioapics; i++ ) {
7504- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7505+ for (i = 0; i < nr_ioapics; i++) {
7506+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7507 * sizeof(struct IO_APIC_route_entry);
7508- mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
7509+ mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
7510 if (!mp_ioapic_data[i]) {
7511 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
7512 continue;
7513 }
7514- memset(mp_ioapic_data[i], 0, size);
7515 dev = &mp_ioapic_data[i]->dev;
7516- dev->id = i;
7517+ dev->id = i;
7518 dev->cls = &ioapic_sysdev_class;
7519 error = sysdev_register(dev);
7520 if (error) {
7521@@ -2538,7 +2571,7 @@ static int msi_compose_msg(struct pci_de
7522 msg->address_lo =
7523 MSI_ADDR_BASE_LO |
7524 ((INT_DEST_MODE == 0) ?
7525- MSI_ADDR_DEST_MODE_PHYSICAL:
7526+MSI_ADDR_DEST_MODE_PHYSICAL:
7527 MSI_ADDR_DEST_MODE_LOGICAL) |
7528 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7529 MSI_ADDR_REDIRECTION_CPU:
7530@@ -2549,7 +2582,7 @@ static int msi_compose_msg(struct pci_de
7531 MSI_DATA_TRIGGER_EDGE |
7532 MSI_DATA_LEVEL_ASSERT |
7533 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7534- MSI_DATA_DELIVERY_FIXED:
7535+MSI_DATA_DELIVERY_FIXED:
7536 MSI_DATA_DELIVERY_LOWPRI) |
7537 MSI_DATA_VECTOR(vector);
7538 }
7539@@ -2720,12 +2753,12 @@ int arch_setup_ht_irq(unsigned int irq,
7540 #endif /* CONFIG_HT_IRQ */
7541
7542 /* --------------------------------------------------------------------------
7543- ACPI-based IOAPIC Configuration
7544+ ACPI-based IOAPIC Configuration
7545 -------------------------------------------------------------------------- */
7546
7547 #ifdef CONFIG_ACPI
7548
7549-int __init io_apic_get_unique_id (int ioapic, int apic_id)
7550+int __init io_apic_get_unique_id(int ioapic, int apic_id)
7551 {
7552 #ifndef CONFIG_XEN
7553 union IO_APIC_reg_00 reg_00;
7554@@ -2735,10 +2768,10 @@ int __init io_apic_get_unique_id (int io
7555 int i = 0;
7556
7557 /*
7558- * The P4 platform supports up to 256 APIC IDs on two separate APIC
7559- * buses (one for LAPICs, one for IOAPICs), where predecessors only
7560+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
7561+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
7562 * supports up to 16 on one shared APIC bus.
7563- *
7564+ *
7565 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
7566 * advantage of new APIC bus architecture.
7567 */
7568@@ -2757,7 +2790,7 @@ int __init io_apic_get_unique_id (int io
7569 }
7570
7571 /*
7572- * Every APIC in a system must have a unique ID or we get lots of nice
7573+ * Every APIC in a system must have a unique ID or we get lots of nice
7574 * 'stuck on smp_invalidate_needed IPI wait' messages.
7575 */
7576 if (check_apicid_used(apic_id_map, apic_id)) {
7577@@ -2774,7 +2807,7 @@ int __init io_apic_get_unique_id (int io
7578 "trying %d\n", ioapic, apic_id, i);
7579
7580 apic_id = i;
7581- }
7582+ }
7583
7584 tmp = apicid_to_cpu_present(apic_id);
7585 physids_or(apic_id_map, apic_id_map, tmp);
7586@@ -2802,7 +2835,7 @@ int __init io_apic_get_unique_id (int io
7587 }
7588
7589
7590-int __init io_apic_get_version (int ioapic)
7591+int __init io_apic_get_version(int ioapic)
7592 {
7593 union IO_APIC_reg_01 reg_01;
7594 unsigned long flags;
7595@@ -2815,7 +2848,7 @@ int __init io_apic_get_version (int ioap
7596 }
7597
7598
7599-int __init io_apic_get_redir_entries (int ioapic)
7600+int __init io_apic_get_redir_entries(int ioapic)
7601 {
7602 union IO_APIC_reg_01 reg_01;
7603 unsigned long flags;
7604@@ -2828,7 +2861,7 @@ int __init io_apic_get_redir_entries (in
7605 }
7606
7607
7608-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
7609+int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
7610 {
7611 struct IO_APIC_route_entry entry;
7612
7613@@ -2844,7 +2877,7 @@ int io_apic_set_pci_routing (int ioapic,
7614 * corresponding device driver registers for this IRQ.
7615 */
7616
7617- memset(&entry,0,sizeof(entry));
7618+ memset(&entry, 0, sizeof(entry));
7619
7620 entry.delivery_mode = INT_DELIVERY_MODE;
7621 entry.dest_mode = INT_DEST_MODE;
7622@@ -2863,7 +2896,7 @@ int io_apic_set_pci_routing (int ioapic,
7623
7624 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
7625 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
7626- mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
7627+ mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
7628 edge_level, active_high_low);
7629
7630 ioapic_register_intr(irq, entry.vector, edge_level);
7631@@ -2884,8 +2917,8 @@ int acpi_get_override_irq(int bus_irq, i
7632 return -1;
7633
7634 for (i = 0; i < mp_irq_entries; i++)
7635- if (mp_irqs[i].mpc_irqtype == mp_INT &&
7636- mp_irqs[i].mpc_srcbusirq == bus_irq)
7637+ if (mp_irqs[i].mp_irqtype == mp_INT &&
7638+ mp_irqs[i].mp_srcbusirq == bus_irq)
7639 break;
7640 if (i >= mp_irq_entries)
7641 return -1;
7642@@ -2918,3 +2951,35 @@ static int __init parse_noapic(char *arg
7643 return 0;
7644 }
7645 early_param("noapic", parse_noapic);
7646+
7647+#ifndef CONFIG_XEN
7648+void __init ioapic_init_mappings(void)
7649+{
7650+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
7651+ int i;
7652+
7653+ for (i = 0; i < nr_ioapics; i++) {
7654+ if (smp_found_config) {
7655+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
7656+ if (!ioapic_phys) {
7657+ printk(KERN_ERR
7658+ "WARNING: bogus zero IO-APIC "
7659+ "address found in MPTABLE, "
7660+ "disabling IO/APIC support!\n");
7661+ smp_found_config = 0;
7662+ skip_ioapic_setup = 1;
7663+ goto fake_ioapic_page;
7664+ }
7665+ } else {
7666+fake_ioapic_page:
7667+ ioapic_phys = (unsigned long)
7668+ alloc_bootmem_pages(PAGE_SIZE);
7669+ ioapic_phys = __pa(ioapic_phys);
7670+ }
7671+ set_fixmap_nocache(idx, ioapic_phys);
7672+ printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
7673+ __fix_to_virt(idx), ioapic_phys);
7674+ idx++;
7675+ }
7676+}
7677+#endif
7678Index: head-2008-12-01/arch/x86/kernel/io_apic_64-xen.c
7679===================================================================
7680--- head-2008-12-01.orig/arch/x86/kernel/io_apic_64-xen.c 2008-12-01 11:44:55.000000000 +0100
7681+++ head-2008-12-01/arch/x86/kernel/io_apic_64-xen.c 2008-12-01 11:49:07.000000000 +0100
7682@@ -45,6 +45,7 @@
7683 #include <asm/proto.h>
7684 #include <asm/acpi.h>
7685 #include <asm/dma.h>
7686+#include <asm/i8259.h>
7687 #include <asm/nmi.h>
7688 #include <asm/msidef.h>
7689 #include <asm/hypertransport.h>
7690@@ -63,10 +64,16 @@ struct irq_cfg {
7691 };
7692
7693 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
7694-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7695+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7696
7697 static int assign_irq_vector(int irq, cpumask_t mask);
7698
7699+#ifndef CONFIG_XEN
7700+int first_system_vector = 0xfe;
7701+
7702+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
7703+#endif
7704+
7705 #define __apicdebuginit __init
7706
7707 int sis_apic_bug; /* not actually supported, dummy for compile */
7708@@ -89,14 +96,14 @@ unsigned long io_apic_irqs;
7709
7710 #define clear_IO_APIC() ((void)0)
7711 #else
7712-int timer_over_8254 __initdata = 1;
7713+int timer_through_8259 __initdata;
7714
7715 /* Where if anywhere is the i8259 connect in external int mode */
7716 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
7717 #endif
7718
7719 static DEFINE_SPINLOCK(ioapic_lock);
7720-DEFINE_SPINLOCK(vector_lock);
7721+static DEFINE_SPINLOCK(vector_lock);
7722
7723 /*
7724 * # of IRQ routing registers
7725@@ -104,15 +111,17 @@ DEFINE_SPINLOCK(vector_lock);
7726 int nr_ioapic_registers[MAX_IO_APICS];
7727
7728 /* I/O APIC entries */
7729-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
7730+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
7731 int nr_ioapics;
7732
7733 /* MP IRQ source entries */
7734-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7735+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7736
7737 /* # of MP IRQ source entries */
7738 int mp_irq_entries;
7739
7740+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
7741+
7742 /*
7743 * Rough estimation of how many shared IRQs there are, can
7744 * be changed anytime.
7745@@ -141,7 +150,7 @@ struct io_apic {
7746 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
7747 {
7748 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
7749- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
7750+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
7751 }
7752 #endif
7753
7754@@ -155,7 +164,7 @@ static inline unsigned int io_apic_read(
7755 struct physdev_apic apic_op;
7756 int ret;
7757
7758- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7759+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7760 apic_op.reg = reg;
7761 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
7762 if (ret)
7763@@ -173,7 +182,7 @@ static inline void io_apic_write(unsigne
7764 #else
7765 struct physdev_apic apic_op;
7766
7767- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7768+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7769 apic_op.reg = reg;
7770 apic_op.value = value;
7771 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
7772@@ -209,7 +218,7 @@ static bool io_apic_level_ack_pending(un
7773 break;
7774 reg = io_apic_read(entry->apic, 0x10 + pin*2);
7775 /* Is the remote IRR bit set? */
7776- if ((reg >> 14) & 1) {
7777+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
7778 spin_unlock_irqrestore(&ioapic_lock, flags);
7779 return true;
7780 }
7781@@ -308,7 +317,7 @@ static void __target_IO_APIC_irq(unsigne
7782 break;
7783 io_apic_write(apic, 0x11 + pin*2, dest);
7784 reg = io_apic_read(apic, 0x10 + pin*2);
7785- reg &= ~0x000000ff;
7786+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
7787 reg |= vector;
7788 io_apic_modify(apic, reg);
7789 if (!entry->next)
7790@@ -372,6 +381,26 @@ static void add_pin_to_irq(unsigned int
7791 }
7792
7793 #ifndef CONFIG_XEN
7794+/*
7795+ * Reroute an IRQ to a different pin.
7796+ */
7797+static void __init replace_pin_at_irq(unsigned int irq,
7798+ int oldapic, int oldpin,
7799+ int newapic, int newpin)
7800+{
7801+ struct irq_pin_list *entry = irq_2_pin + irq;
7802+
7803+ while (1) {
7804+ if (entry->apic == oldapic && entry->pin == oldpin) {
7805+ entry->apic = newapic;
7806+ entry->pin = newpin;
7807+ }
7808+ if (!entry->next)
7809+ break;
7810+ entry = irq_2_pin + entry->next;
7811+ }
7812+}
7813+
7814 #define __DO_ACTION(R, ACTION, FINAL) \
7815 \
7816 { \
7817@@ -399,10 +428,11 @@ static void add_pin_to_irq(unsigned int
7818 static void name##_IO_APIC_irq (unsigned int irq) \
7819 __DO_ACTION(R, ACTION, FINAL)
7820
7821-DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
7822- /* mask = 1 */
7823-DO_ACTION( __unmask, 0, &= 0xfffeffff, )
7824- /* mask = 0 */
7825+/* mask = 1 */
7826+DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
7827+
7828+/* mask = 0 */
7829+DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
7830
7831 static void mask_IO_APIC_irq (unsigned int irq)
7832 {
7833@@ -465,22 +495,6 @@ static int __init disable_timer_pin_setu
7834 }
7835 __setup("disable_timer_pin_1", disable_timer_pin_setup);
7836
7837-#ifndef CONFIG_XEN
7838-static int __init setup_disable_8254_timer(char *s)
7839-{
7840- timer_over_8254 = -1;
7841- return 1;
7842-}
7843-static int __init setup_enable_8254_timer(char *s)
7844-{
7845- timer_over_8254 = 2;
7846- return 1;
7847-}
7848-
7849-__setup("disable_8254_timer", setup_disable_8254_timer);
7850-__setup("enable_8254_timer", setup_enable_8254_timer);
7851-#endif /* !CONFIG_XEN */
7852-
7853
7854 /*
7855 * Find the IRQ entry number of a certain pin.
7856@@ -490,10 +504,10 @@ static int find_irq_entry(int apic, int
7857 int i;
7858
7859 for (i = 0; i < mp_irq_entries; i++)
7860- if (mp_irqs[i].mpc_irqtype == type &&
7861- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
7862- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
7863- mp_irqs[i].mpc_dstirq == pin)
7864+ if (mp_irqs[i].mp_irqtype == type &&
7865+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
7866+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
7867+ mp_irqs[i].mp_dstirq == pin)
7868 return i;
7869
7870 return -1;
7871@@ -508,13 +522,13 @@ static int __init find_isa_irq_pin(int i
7872 int i;
7873
7874 for (i = 0; i < mp_irq_entries; i++) {
7875- int lbus = mp_irqs[i].mpc_srcbus;
7876+ int lbus = mp_irqs[i].mp_srcbus;
7877
7878 if (test_bit(lbus, mp_bus_not_pci) &&
7879- (mp_irqs[i].mpc_irqtype == type) &&
7880- (mp_irqs[i].mpc_srcbusirq == irq))
7881+ (mp_irqs[i].mp_irqtype == type) &&
7882+ (mp_irqs[i].mp_srcbusirq == irq))
7883
7884- return mp_irqs[i].mpc_dstirq;
7885+ return mp_irqs[i].mp_dstirq;
7886 }
7887 return -1;
7888 }
7889@@ -524,17 +538,17 @@ static int __init find_isa_irq_apic(int
7890 int i;
7891
7892 for (i = 0; i < mp_irq_entries; i++) {
7893- int lbus = mp_irqs[i].mpc_srcbus;
7894+ int lbus = mp_irqs[i].mp_srcbus;
7895
7896 if (test_bit(lbus, mp_bus_not_pci) &&
7897- (mp_irqs[i].mpc_irqtype == type) &&
7898- (mp_irqs[i].mpc_srcbusirq == irq))
7899+ (mp_irqs[i].mp_irqtype == type) &&
7900+ (mp_irqs[i].mp_srcbusirq == irq))
7901 break;
7902 }
7903 if (i < mp_irq_entries) {
7904 int apic;
7905 for(apic = 0; apic < nr_ioapics; apic++) {
7906- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
7907+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
7908 return apic;
7909 }
7910 }
7911@@ -555,28 +569,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7912
7913 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
7914 bus, slot, pin);
7915- if (mp_bus_id_to_pci_bus[bus] == -1) {
7916+ if (test_bit(bus, mp_bus_not_pci)) {
7917 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
7918 return -1;
7919 }
7920 for (i = 0; i < mp_irq_entries; i++) {
7921- int lbus = mp_irqs[i].mpc_srcbus;
7922+ int lbus = mp_irqs[i].mp_srcbus;
7923
7924 for (apic = 0; apic < nr_ioapics; apic++)
7925- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
7926- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
7927+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
7928+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
7929 break;
7930
7931 if (!test_bit(lbus, mp_bus_not_pci) &&
7932- !mp_irqs[i].mpc_irqtype &&
7933+ !mp_irqs[i].mp_irqtype &&
7934 (bus == lbus) &&
7935- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
7936- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
7937+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
7938+ int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
7939
7940 if (!(apic || IO_APIC_IRQ(irq)))
7941 continue;
7942
7943- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
7944+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
7945 return irq;
7946 /*
7947 * Use the first all-but-pin matching entry as a
7948@@ -604,13 +618,13 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7949
7950 static int MPBIOS_polarity(int idx)
7951 {
7952- int bus = mp_irqs[idx].mpc_srcbus;
7953+ int bus = mp_irqs[idx].mp_srcbus;
7954 int polarity;
7955
7956 /*
7957 * Determine IRQ line polarity (high active or low active):
7958 */
7959- switch (mp_irqs[idx].mpc_irqflag & 3)
7960+ switch (mp_irqs[idx].mp_irqflag & 3)
7961 {
7962 case 0: /* conforms, ie. bus-type dependent polarity */
7963 if (test_bit(bus, mp_bus_not_pci))
7964@@ -646,13 +660,13 @@ static int MPBIOS_polarity(int idx)
7965
7966 static int MPBIOS_trigger(int idx)
7967 {
7968- int bus = mp_irqs[idx].mpc_srcbus;
7969+ int bus = mp_irqs[idx].mp_srcbus;
7970 int trigger;
7971
7972 /*
7973 * Determine IRQ trigger mode (edge or level sensitive):
7974 */
7975- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
7976+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
7977 {
7978 case 0: /* conforms, ie. bus-type dependent */
7979 if (test_bit(bus, mp_bus_not_pci))
7980@@ -699,16 +713,16 @@ static inline int irq_trigger(int idx)
7981 static int pin_2_irq(int idx, int apic, int pin)
7982 {
7983 int irq, i;
7984- int bus = mp_irqs[idx].mpc_srcbus;
7985+ int bus = mp_irqs[idx].mp_srcbus;
7986
7987 /*
7988 * Debugging check, we are in big trouble if this message pops up!
7989 */
7990- if (mp_irqs[idx].mpc_dstirq != pin)
7991+ if (mp_irqs[idx].mp_dstirq != pin)
7992 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
7993
7994 if (test_bit(bus, mp_bus_not_pci)) {
7995- irq = mp_irqs[idx].mpc_srcbusirq;
7996+ irq = mp_irqs[idx].mp_srcbusirq;
7997 } else {
7998 /*
7999 * PCI IRQs are mapped in order
8000@@ -722,6 +736,19 @@ static int pin_2_irq(int idx, int apic,
8001 return irq;
8002 }
8003
8004+void lock_vector_lock(void)
8005+{
8006+ /* Used to the online set of cpus does not change
8007+ * during assign_irq_vector.
8008+ */
8009+ spin_lock(&vector_lock);
8010+}
8011+
8012+void unlock_vector_lock(void)
8013+{
8014+ spin_unlock(&vector_lock);
8015+}
8016+
8017 static int __assign_irq_vector(int irq, cpumask_t mask)
8018 {
8019 struct physdev_irq irq_op;
8020@@ -773,7 +800,7 @@ static void __clear_irq_vector(int irq)
8021
8022 vector = cfg->vector;
8023 cpus_and(mask, cfg->domain, cpu_online_map);
8024- for_each_cpu_mask(cpu, mask)
8025+ for_each_cpu_mask_nr(cpu, mask)
8026 per_cpu(vector_irq, cpu)[vector] = -1;
8027
8028 cfg->vector = 0;
8029@@ -842,7 +869,7 @@ static void setup_IO_APIC_irq(int apic,
8030 apic_printk(APIC_VERBOSE,KERN_DEBUG
8031 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
8032 "IRQ %d Mode:%i Active:%i)\n",
8033- apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
8034+ apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
8035 irq, trigger, polarity);
8036
8037 /*
8038@@ -883,10 +910,10 @@ static void __init setup_IO_APIC_irqs(vo
8039 idx = find_irq_entry(apic,pin,mp_INT);
8040 if (idx == -1) {
8041 if (first_notcon) {
8042- apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8043+ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
8044 first_notcon = 0;
8045 } else
8046- apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8047+ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
8048 continue;
8049 }
8050 if (!first_notcon) {
8051@@ -908,26 +935,21 @@ static void __init setup_IO_APIC_irqs(vo
8052
8053 #ifndef CONFIG_XEN
8054 /*
8055- * Set up the 8259A-master output pin as broadcast to all
8056- * CPUs.
8057+ * Set up the timer pin, possibly with the 8259A-master behind.
8058 */
8059-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
8060+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
8061+ int vector)
8062 {
8063 struct IO_APIC_route_entry entry;
8064
8065 memset(&entry, 0, sizeof(entry));
8066
8067- disable_8259A_irq(0);
8068-
8069- /* mask LVT0 */
8070- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8071-
8072 /*
8073 * We use logical delivery to get the timer IRQ
8074 * to the first CPU.
8075 */
8076 entry.dest_mode = INT_DEST_MODE;
8077- entry.mask = 0; /* unmask IRQ now */
8078+ entry.mask = 1; /* mask IRQ now */
8079 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
8080 entry.delivery_mode = INT_DELIVERY_MODE;
8081 entry.polarity = 0;
8082@@ -936,7 +958,7 @@ static void __init setup_ExtINT_IRQ0_pin
8083
8084 /*
8085 * The timer IRQ doesn't have to know that behind the
8086- * scene we have a 8259A-master in AEOI mode ...
8087+ * scene we may have a 8259A-master in AEOI mode ...
8088 */
8089 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
8090
8091@@ -944,8 +966,6 @@ static void __init setup_ExtINT_IRQ0_pin
8092 * Add it to the IO-APIC irq-routing table:
8093 */
8094 ioapic_write_entry(apic, pin, entry);
8095-
8096- enable_8259A_irq(0);
8097 }
8098
8099 void __apicdebuginit print_IO_APIC(void)
8100@@ -962,7 +982,7 @@ void __apicdebuginit print_IO_APIC(void)
8101 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
8102 for (i = 0; i < nr_ioapics; i++)
8103 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
8104- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
8105+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
8106
8107 /*
8108 * We are a bit conservative about what we expect. We have to
8109@@ -980,7 +1000,7 @@ void __apicdebuginit print_IO_APIC(void)
8110 spin_unlock_irqrestore(&ioapic_lock, flags);
8111
8112 printk("\n");
8113- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
8114+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
8115 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
8116 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
8117
8118@@ -1072,6 +1092,7 @@ void __apicdebuginit print_local_APIC(vo
8119
8120 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
8121 smp_processor_id(), hard_smp_processor_id());
8122+ v = apic_read(APIC_ID);
8123 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
8124 v = apic_read(APIC_LVR);
8125 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
8126@@ -1141,7 +1162,7 @@ void __apicdebuginit print_local_APIC(vo
8127
8128 void print_all_local_APICs (void)
8129 {
8130- on_each_cpu(print_local_APIC, NULL, 1, 1);
8131+ on_each_cpu(print_local_APIC, NULL, 1);
8132 }
8133
8134 void __apicdebuginit print_PIC(void)
8135@@ -1175,6 +1196,8 @@ void __apicdebuginit print_PIC(void)
8136 v = inb(0x4d1) << 8 | inb(0x4d0);
8137 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
8138 }
8139+#else
8140+void __apicdebuginit print_IO_APIC(void) {}
8141 #endif /* !CONFIG_XEN */
8142
8143 void __init enable_IO_APIC(void)
8144@@ -1359,12 +1382,10 @@ static unsigned int startup_ioapic_irq(u
8145 static int ioapic_retrigger_irq(unsigned int irq)
8146 {
8147 struct irq_cfg *cfg = &irq_cfg[irq];
8148- cpumask_t mask;
8149 unsigned long flags;
8150
8151 spin_lock_irqsave(&vector_lock, flags);
8152- mask = cpumask_of_cpu(first_cpu(cfg->domain));
8153- send_IPI_mask(mask, cfg->vector);
8154+ send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
8155 spin_unlock_irqrestore(&vector_lock, flags);
8156
8157 return 1;
8158@@ -1545,7 +1566,7 @@ static inline void init_IO_APIC_traps(vo
8159 }
8160
8161 #ifndef CONFIG_XEN
8162-static void enable_lapic_irq (unsigned int irq)
8163+static void unmask_lapic_irq(unsigned int irq)
8164 {
8165 unsigned long v;
8166
8167@@ -1553,7 +1574,7 @@ static void enable_lapic_irq (unsigned i
8168 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
8169 }
8170
8171-static void disable_lapic_irq (unsigned int irq)
8172+static void mask_lapic_irq(unsigned int irq)
8173 {
8174 unsigned long v;
8175
8176@@ -1566,19 +1587,20 @@ static void ack_lapic_irq (unsigned int
8177 ack_APIC_irq();
8178 }
8179
8180-static void end_lapic_irq (unsigned int i) { /* nothing */ }
8181-
8182-static struct hw_interrupt_type lapic_irq_type __read_mostly = {
8183- .name = "local-APIC",
8184- .typename = "local-APIC-edge",
8185- .startup = NULL, /* startup_irq() not used for IRQ0 */
8186- .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
8187- .enable = enable_lapic_irq,
8188- .disable = disable_lapic_irq,
8189- .ack = ack_lapic_irq,
8190- .end = end_lapic_irq,
8191+static struct irq_chip lapic_chip __read_mostly = {
8192+ .name = "local-APIC",
8193+ .mask = mask_lapic_irq,
8194+ .unmask = unmask_lapic_irq,
8195+ .ack = ack_lapic_irq,
8196 };
8197
8198+static void lapic_register_intr(int irq)
8199+{
8200+ irq_desc[irq].status &= ~IRQ_LEVEL;
8201+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
8202+ "edge");
8203+}
8204+
8205 static void __init setup_nmi(void)
8206 {
8207 /*
8208@@ -1664,6 +1686,7 @@ static inline void __init check_timer(vo
8209 struct irq_cfg *cfg = irq_cfg + 0;
8210 int apic1, pin1, apic2, pin2;
8211 unsigned long flags;
8212+ int no_pin1 = 0;
8213
8214 local_irq_save(flags);
8215
8216@@ -1674,34 +1697,48 @@ static inline void __init check_timer(vo
8217 assign_irq_vector(0, TARGET_CPUS);
8218
8219 /*
8220- * Subtle, code in do_timer_interrupt() expects an AEOI
8221- * mode for the 8259A whenever interrupts are routed
8222- * through I/O APICs. Also IRQ0 has to be enabled in
8223- * the 8259A which implies the virtual wire has to be
8224- * disabled in the local APIC.
8225+ * As IRQ0 is to be enabled in the 8259A, the virtual
8226+ * wire has to be disabled in the local APIC.
8227 */
8228 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8229 init_8259A(1);
8230- if (timer_over_8254 > 0)
8231- enable_8259A_irq(0);
8232
8233 pin1 = find_isa_irq_pin(0, mp_INT);
8234 apic1 = find_isa_irq_apic(0, mp_INT);
8235 pin2 = ioapic_i8259.pin;
8236 apic2 = ioapic_i8259.apic;
8237
8238- apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
8239- cfg->vector, apic1, pin1, apic2, pin2);
8240+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
8241+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
8242+ cfg->vector, apic1, pin1, apic2, pin2);
8243+
8244+ /*
8245+ * Some BIOS writers are clueless and report the ExtINTA
8246+ * I/O APIC input from the cascaded 8259A as the timer
8247+ * interrupt input. So just in case, if only one pin
8248+ * was found above, try it both directly and through the
8249+ * 8259A.
8250+ */
8251+ if (pin1 == -1) {
8252+ pin1 = pin2;
8253+ apic1 = apic2;
8254+ no_pin1 = 1;
8255+ } else if (pin2 == -1) {
8256+ pin2 = pin1;
8257+ apic2 = apic1;
8258+ }
8259
8260 if (pin1 != -1) {
8261 /*
8262 * Ok, does IRQ0 through the IOAPIC work?
8263 */
8264+ if (no_pin1) {
8265+ add_pin_to_irq(0, apic1, pin1);
8266+ setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
8267+ }
8268 unmask_IO_APIC_irq(0);
8269 if (!no_timer_check && timer_irq_works()) {
8270- nmi_watchdog_default();
8271 if (nmi_watchdog == NMI_IO_APIC) {
8272- disable_8259A_irq(0);
8273 setup_nmi();
8274 enable_8259A_irq(0);
8275 }
8276@@ -1710,54 +1747,62 @@ static inline void __init check_timer(vo
8277 goto out;
8278 }
8279 clear_IO_APIC_pin(apic1, pin1);
8280- apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
8281- "connected to IO-APIC\n");
8282- }
8283-
8284- apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
8285- "through the 8259A ... ");
8286- if (pin2 != -1) {
8287- apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
8288- apic2, pin2);
8289+ if (!no_pin1)
8290+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
8291+ "8254 timer not connected to IO-APIC\n");
8292+
8293+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
8294+ "(IRQ0) through the 8259A ...\n");
8295+ apic_printk(APIC_QUIET, KERN_INFO
8296+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
8297 /*
8298 * legacy devices should be connected to IO APIC #0
8299 */
8300- setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
8301+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
8302+ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
8303+ unmask_IO_APIC_irq(0);
8304+ enable_8259A_irq(0);
8305 if (timer_irq_works()) {
8306- apic_printk(APIC_VERBOSE," works.\n");
8307- nmi_watchdog_default();
8308+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
8309+ timer_through_8259 = 1;
8310 if (nmi_watchdog == NMI_IO_APIC) {
8311+ disable_8259A_irq(0);
8312 setup_nmi();
8313+ enable_8259A_irq(0);
8314 }
8315 goto out;
8316 }
8317 /*
8318 * Cleanup, just in case ...
8319 */
8320+ disable_8259A_irq(0);
8321 clear_IO_APIC_pin(apic2, pin2);
8322+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
8323 }
8324- apic_printk(APIC_VERBOSE," failed.\n");
8325
8326 if (nmi_watchdog == NMI_IO_APIC) {
8327- printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
8328- nmi_watchdog = 0;
8329+ apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
8330+ "through the IO-APIC - disabling NMI Watchdog!\n");
8331+ nmi_watchdog = NMI_NONE;
8332 }
8333
8334- apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
8335+ apic_printk(APIC_QUIET, KERN_INFO
8336+ "...trying to set up timer as Virtual Wire IRQ...\n");
8337
8338- disable_8259A_irq(0);
8339- irq_desc[0].chip = &lapic_irq_type;
8340+ lapic_register_intr(0);
8341 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
8342 enable_8259A_irq(0);
8343
8344 if (timer_irq_works()) {
8345- apic_printk(APIC_VERBOSE," works.\n");
8346+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8347 goto out;
8348 }
8349+ disable_8259A_irq(0);
8350 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
8351- apic_printk(APIC_VERBOSE," failed.\n");
8352+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
8353
8354- apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
8355+ apic_printk(APIC_QUIET, KERN_INFO
8356+ "...trying to set up timer as ExtINT IRQ...\n");
8357
8358 init_8259A(0);
8359 make_8259A_irq(0);
8360@@ -1766,11 +1811,12 @@ static inline void __init check_timer(vo
8361 unlock_ExtINT_logic();
8362
8363 if (timer_irq_works()) {
8364- apic_printk(APIC_VERBOSE," works.\n");
8365+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8366 goto out;
8367 }
8368- apic_printk(APIC_VERBOSE," failed :(.\n");
8369- panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
8370+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
8371+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
8372+ "report. Then try booting with the 'noapic' option.\n");
8373 out:
8374 local_irq_restore(flags);
8375 }
8376@@ -1788,10 +1834,21 @@ __setup("no_timer_check", notimercheck);
8377
8378 /*
8379 *
8380- * IRQs that are handled by the PIC in the MPS IOAPIC case.
8381- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
8382- * Linux doesn't really care, as it's not actually used
8383- * for any interrupt handling anyway.
8384+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
8385+ * to devices. However there may be an I/O APIC pin available for
8386+ * this interrupt regardless. The pin may be left unconnected, but
8387+ * typically it will be reused as an ExtINT cascade interrupt for
8388+ * the master 8259A. In the MPS case such a pin will normally be
8389+ * reported as an ExtINT interrupt in the MP table. With ACPI
8390+ * there is no provision for ExtINT interrupts, and in the absence
8391+ * of an override it would be treated as an ordinary ISA I/O APIC
8392+ * interrupt, that is edge-triggered and unmasked by default. We
8393+ * used to do this, but it caused problems on some systems because
8394+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
8395+ * the same ExtINT cascade interrupt to drive the local APIC of the
8396+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
8397+ * the I/O APIC in all cases now. No actual device should request
8398+ * it anyway. --macro
8399 */
8400 #define PIC_IRQS (1<<2)
8401
8402@@ -1799,10 +1856,7 @@ void __init setup_IO_APIC(void)
8403 {
8404 enable_IO_APIC();
8405
8406- if (acpi_ioapic)
8407- io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
8408- else
8409- io_apic_irqs = ~PIC_IRQS;
8410+ io_apic_irqs = ~PIC_IRQS;
8411
8412 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
8413
8414@@ -1851,8 +1905,8 @@ static int ioapic_resume(struct sys_devi
8415
8416 spin_lock_irqsave(&ioapic_lock, flags);
8417 reg_00.raw = io_apic_read(dev->id, 0);
8418- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
8419- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
8420+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
8421+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
8422 io_apic_write(dev->id, 0, reg_00.raw);
8423 }
8424 spin_unlock_irqrestore(&ioapic_lock, flags);
8425@@ -2254,8 +2308,8 @@ int acpi_get_override_irq(int bus_irq, i
8426 return -1;
8427
8428 for (i = 0; i < mp_irq_entries; i++)
8429- if (mp_irqs[i].mpc_irqtype == mp_INT &&
8430- mp_irqs[i].mpc_srcbusirq == bus_irq)
8431+ if (mp_irqs[i].mp_irqtype == mp_INT &&
8432+ mp_irqs[i].mp_srcbusirq == bus_irq)
8433 break;
8434 if (i >= mp_irq_entries)
8435 return -1;
8436@@ -2349,7 +2403,7 @@ void __init ioapic_init_mappings(void)
8437 ioapic_res = ioapic_setup_resources();
8438 for (i = 0; i < nr_ioapics; i++) {
8439 if (smp_found_config) {
8440- ioapic_phys = mp_ioapics[i].mpc_apicaddr;
8441+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
8442 } else {
8443 ioapic_phys = (unsigned long)
8444 alloc_bootmem_pages(PAGE_SIZE);
8445Index: head-2008-12-01/arch/x86/kernel/ipi-xen.c
8446===================================================================
8447--- head-2008-12-01.orig/arch/x86/kernel/ipi-xen.c 2008-12-01 11:44:55.000000000 +0100
8448+++ head-2008-12-01/arch/x86/kernel/ipi-xen.c 2008-12-01 11:49:07.000000000 +0100
8449@@ -8,7 +8,6 @@
8450 #include <linux/kernel_stat.h>
8451 #include <linux/mc146818rtc.h>
8452 #include <linux/cache.h>
8453-#include <linux/interrupt.h>
8454 #include <linux/cpu.h>
8455 #include <linux/module.h>
8456
8457@@ -85,7 +84,7 @@ void __send_IPI_shortcut(unsigned int sh
8458 /*
8459 * Send the IPI. The write to APIC_ICR fires this off.
8460 */
8461- apic_write_around(APIC_ICR, cfg);
8462+ apic_write(APIC_ICR, cfg);
8463 #else
8464 int cpu;
8465
8466@@ -132,7 +131,7 @@ static inline void __send_IPI_dest_field
8467 * prepare target chip field
8468 */
8469 cfg = __prepare_ICR2(mask);
8470- apic_write_around(APIC_ICR2, cfg);
8471+ apic_write(APIC_ICR2, cfg);
8472
8473 /*
8474 * program the ICR
8475@@ -142,7 +141,7 @@ static inline void __send_IPI_dest_field
8476 /*
8477 * Send the IPI. The write to APIC_ICR fires this off.
8478 */
8479- apic_write_around(APIC_ICR, cfg);
8480+ apic_write(APIC_ICR, cfg);
8481 }
8482 #endif
8483
8484Index: head-2008-12-01/arch/x86/kernel/irq_32-xen.c
8485===================================================================
8486--- head-2008-12-01.orig/arch/x86/kernel/irq_32-xen.c 2008-12-01 11:44:55.000000000 +0100
8487+++ head-2008-12-01/arch/x86/kernel/irq_32-xen.c 2008-12-01 11:49:07.000000000 +0100
8488@@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
8489 #endif
8490 }
8491
8492+#ifdef CONFIG_DEBUG_STACKOVERFLOW
8493+/* Debugging check for stack overflow: is there less than 1KB free? */
8494+static int check_stack_overflow(void)
8495+{
8496+ long sp;
8497+
8498+ __asm__ __volatile__("andl %%esp,%0" :
8499+ "=r" (sp) : "0" (THREAD_SIZE - 1));
8500+
8501+ return sp < (sizeof(struct thread_info) + STACK_WARN);
8502+}
8503+
8504+static void print_stack_overflow(void)
8505+{
8506+ printk(KERN_WARNING "low stack detected by irq handler\n");
8507+ dump_stack();
8508+}
8509+
8510+#else
8511+static inline int check_stack_overflow(void) { return 0; }
8512+static inline void print_stack_overflow(void) { }
8513+#endif
8514+
8515 #ifdef CONFIG_4KSTACKS
8516 /*
8517 * per-CPU IRQ handling contexts (thread information and stack)
8518@@ -59,48 +82,26 @@ union irq_ctx {
8519
8520 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
8521 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
8522-#endif
8523-
8524-/*
8525- * do_IRQ handles all normal device IRQ's (the special
8526- * SMP cross-CPU interrupts have their own specific
8527- * handlers).
8528- */
8529-unsigned int do_IRQ(struct pt_regs *regs)
8530-{
8531- struct pt_regs *old_regs;
8532- /* high bit used in ret_from_ code */
8533- int irq = ~regs->orig_ax;
8534- struct irq_desc *desc = irq_desc + irq;
8535-#ifdef CONFIG_4KSTACKS
8536- union irq_ctx *curctx, *irqctx;
8537- u32 *isp;
8538-#endif
8539
8540- if (unlikely((unsigned)irq >= NR_IRQS)) {
8541- printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8542- __func__, irq);
8543- BUG();
8544- }
8545+static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8546+static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8547
8548- old_regs = set_irq_regs(regs);
8549- /*irq_enter();*/
8550-#ifdef CONFIG_DEBUG_STACKOVERFLOW
8551- /* Debugging check for stack overflow: is there less than 1KB free? */
8552- {
8553- long sp;
8554-
8555- __asm__ __volatile__("andl %%esp,%0" :
8556- "=r" (sp) : "0" (THREAD_SIZE - 1));
8557- if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
8558- printk("do_IRQ: stack overflow: %ld\n",
8559- sp - sizeof(struct thread_info));
8560- dump_stack();
8561- }
8562- }
8563-#endif
8564+static void call_on_stack(void *func, void *stack)
8565+{
8566+ asm volatile("xchgl %%ebx,%%esp \n"
8567+ "call *%%edi \n"
8568+ "movl %%ebx,%%esp \n"
8569+ : "=b" (stack)
8570+ : "0" (stack),
8571+ "D"(func)
8572+ : "memory", "cc", "edx", "ecx", "eax");
8573+}
8574
8575-#ifdef CONFIG_4KSTACKS
8576+static inline int
8577+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
8578+{
8579+ union irq_ctx *curctx, *irqctx;
8580+ u32 *isp, arg1, arg2;
8581
8582 curctx = (union irq_ctx *) current_thread_info();
8583 irqctx = hardirq_ctx[smp_processor_id()];
8584@@ -111,52 +112,39 @@ unsigned int do_IRQ(struct pt_regs *regs
8585 * handler) we can't do that and just have to keep using the
8586 * current stack (which is the irq stack already after all)
8587 */
8588- if (curctx != irqctx) {
8589- int arg1, arg2, bx;
8590-
8591- /* build the stack frame on the IRQ stack */
8592- isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8593- irqctx->tinfo.task = curctx->tinfo.task;
8594- irqctx->tinfo.previous_esp = current_stack_pointer;
8595+ if (unlikely(curctx == irqctx))
8596+ return 0;
8597
8598- /*
8599- * Copy the softirq bits in preempt_count so that the
8600- * softirq checks work in the hardirq context.
8601- */
8602- irqctx->tinfo.preempt_count =
8603- (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8604- (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8605-
8606- asm volatile(
8607- " xchgl %%ebx,%%esp \n"
8608- " call *%%edi \n"
8609- " movl %%ebx,%%esp \n"
8610- : "=a" (arg1), "=d" (arg2), "=b" (bx)
8611- : "0" (irq), "1" (desc), "2" (isp),
8612- "D" (desc->handle_irq)
8613- : "memory", "cc", "ecx"
8614- );
8615- } else
8616-#endif
8617- desc->handle_irq(irq, desc);
8618+ /* build the stack frame on the IRQ stack */
8619+ isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
8620+ irqctx->tinfo.task = curctx->tinfo.task;
8621+ irqctx->tinfo.previous_esp = current_stack_pointer;
8622
8623- /*irq_exit();*/
8624- set_irq_regs(old_regs);
8625+ /*
8626+ * Copy the softirq bits in preempt_count so that the
8627+ * softirq checks work in the hardirq context.
8628+ */
8629+ irqctx->tinfo.preempt_count =
8630+ (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8631+ (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8632+
8633+ if (unlikely(overflow))
8634+ call_on_stack(print_stack_overflow, isp);
8635+
8636+ asm volatile("xchgl %%ebx,%%esp \n"
8637+ "call *%%edi \n"
8638+ "movl %%ebx,%%esp \n"
8639+ : "=a" (arg1), "=d" (arg2), "=b" (isp)
8640+ : "0" (irq), "1" (desc), "2" (isp),
8641+ "D" (desc->handle_irq)
8642+ : "memory", "cc", "ecx");
8643 return 1;
8644 }
8645
8646-#ifdef CONFIG_4KSTACKS
8647-
8648-static char softirq_stack[NR_CPUS * THREAD_SIZE]
8649- __attribute__((__section__(".bss.page_aligned")));
8650-
8651-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
8652- __attribute__((__section__(".bss.page_aligned")));
8653-
8654 /*
8655 * allocate per-cpu stacks for hardirq and for softirq processing
8656 */
8657-void irq_ctx_init(int cpu)
8658+void __cpuinit irq_ctx_init(int cpu)
8659 {
8660 union irq_ctx *irqctx;
8661
8662@@ -164,25 +152,25 @@ void irq_ctx_init(int cpu)
8663 return;
8664
8665 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
8666- irqctx->tinfo.task = NULL;
8667- irqctx->tinfo.exec_domain = NULL;
8668- irqctx->tinfo.cpu = cpu;
8669- irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8670- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8671+ irqctx->tinfo.task = NULL;
8672+ irqctx->tinfo.exec_domain = NULL;
8673+ irqctx->tinfo.cpu = cpu;
8674+ irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8675+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8676
8677 hardirq_ctx[cpu] = irqctx;
8678
8679 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
8680- irqctx->tinfo.task = NULL;
8681- irqctx->tinfo.exec_domain = NULL;
8682- irqctx->tinfo.cpu = cpu;
8683- irqctx->tinfo.preempt_count = 0;
8684- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8685+ irqctx->tinfo.task = NULL;
8686+ irqctx->tinfo.exec_domain = NULL;
8687+ irqctx->tinfo.cpu = cpu;
8688+ irqctx->tinfo.preempt_count = 0;
8689+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8690
8691 softirq_ctx[cpu] = irqctx;
8692
8693- printk("CPU %u irqstacks, hard=%p soft=%p\n",
8694- cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8695+ printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
8696+ cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8697 }
8698
8699 void irq_ctx_exit(int cpu)
8700@@ -211,25 +199,56 @@ asmlinkage void do_softirq(void)
8701 /* build the stack frame on the softirq stack */
8702 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8703
8704- asm volatile(
8705- " xchgl %%ebx,%%esp \n"
8706- " call __do_softirq \n"
8707- " movl %%ebx,%%esp \n"
8708- : "=b"(isp)
8709- : "0"(isp)
8710- : "memory", "cc", "edx", "ecx", "eax"
8711- );
8712+ call_on_stack(__do_softirq, isp);
8713 /*
8714 * Shouldnt happen, we returned above if in_interrupt():
8715- */
8716+ */
8717 WARN_ON_ONCE(softirq_count());
8718 }
8719
8720 local_irq_restore(flags);
8721 }
8722+
8723+#else
8724+static inline int
8725+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
8726 #endif
8727
8728 /*
8729+ * do_IRQ handles all normal device IRQ's (the special
8730+ * SMP cross-CPU interrupts have their own specific
8731+ * handlers).
8732+ */
8733+unsigned int do_IRQ(struct pt_regs *regs)
8734+{
8735+ struct pt_regs *old_regs;
8736+ /* high bit used in ret_from_ code */
8737+ int overflow, irq = ~regs->orig_ax;
8738+ struct irq_desc *desc = irq_desc + irq;
8739+
8740+ if (unlikely((unsigned)irq >= NR_IRQS)) {
8741+ printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8742+ __func__, irq);
8743+ BUG();
8744+ }
8745+
8746+ old_regs = set_irq_regs(regs);
8747+ /*irq_enter();*/
8748+
8749+ overflow = check_stack_overflow();
8750+
8751+ if (!execute_on_irq_stack(overflow, desc, irq)) {
8752+ if (unlikely(overflow))
8753+ print_stack_overflow();
8754+ desc->handle_irq(irq, desc);
8755+ }
8756+
8757+ /*irq_exit();*/
8758+ set_irq_regs(old_regs);
8759+ return 1;
8760+}
8761+
8762+/*
8763 * Interrupt statistics:
8764 */
8765
8766@@ -337,6 +356,42 @@ skip:
8767 return 0;
8768 }
8769
8770+/*
8771+ * /proc/stat helpers
8772+ */
8773+u64 arch_irq_stat_cpu(unsigned int cpu)
8774+{
8775+ u64 sum = nmi_count(cpu);
8776+
8777+#ifdef CONFIG_X86_LOCAL_APIC
8778+ sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
8779+#endif
8780+#ifdef CONFIG_SMP
8781+ sum += per_cpu(irq_stat, cpu).irq_resched_count;
8782+ sum += per_cpu(irq_stat, cpu).irq_call_count;
8783+#ifndef CONFIG_XEN
8784+ sum += per_cpu(irq_stat, cpu).irq_tlb_count;
8785+#endif
8786+#endif
8787+#ifdef CONFIG_X86_MCE
8788+ sum += per_cpu(irq_stat, cpu).irq_thermal_count;
8789+#endif
8790+#ifdef CONFIG_X86_LOCAL_APIC
8791+ sum += per_cpu(irq_stat, cpu).irq_spurious_count;
8792+#endif
8793+ return sum;
8794+}
8795+
8796+u64 arch_irq_stat(void)
8797+{
8798+ u64 sum = atomic_read(&irq_err_count);
8799+
8800+#ifdef CONFIG_X86_IO_APIC
8801+ sum += atomic_read(&irq_mis_count);
8802+#endif
8803+ return sum;
8804+}
8805+
8806 #ifdef CONFIG_HOTPLUG_CPU
8807
8808 void fixup_irqs(cpumask_t map)
8809Index: head-2008-12-01/arch/x86/kernel/irq_64-xen.c
8810===================================================================
8811--- head-2008-12-01.orig/arch/x86/kernel/irq_64-xen.c 2008-12-01 11:37:10.000000000 +0100
8812+++ head-2008-12-01/arch/x86/kernel/irq_64-xen.c 2008-12-01 11:49:07.000000000 +0100
8813@@ -163,6 +163,34 @@ skip:
8814 }
8815
8816 /*
8817+ * /proc/stat helpers
8818+ */
8819+u64 arch_irq_stat_cpu(unsigned int cpu)
8820+{
8821+ u64 sum = cpu_pda(cpu)->__nmi_count;
8822+
8823+ sum += cpu_pda(cpu)->apic_timer_irqs;
8824+#ifdef CONFIG_SMP
8825+ sum += cpu_pda(cpu)->irq_resched_count;
8826+ sum += cpu_pda(cpu)->irq_call_count;
8827+#ifndef CONFIG_XEN
8828+ sum += cpu_pda(cpu)->irq_tlb_count;
8829+#endif
8830+#endif
8831+#ifdef CONFIG_X86_MCE
8832+ sum += cpu_pda(cpu)->irq_thermal_count;
8833+ sum += cpu_pda(cpu)->irq_threshold_count;
8834+#endif
8835+ sum += cpu_pda(cpu)->irq_spurious_count;
8836+ return sum;
8837+}
8838+
8839+u64 arch_irq_stat(void)
8840+{
8841+ return atomic_read(&irq_err_count);
8842+}
8843+
8844+/*
8845 * do_IRQ handles all normal device IRQ's (the special
8846 * SMP cross-CPU interrupts have their own specific
8847 * handlers).
8848Index: head-2008-12-01/arch/x86/kernel/ldt-xen.c
8849===================================================================
8850--- head-2008-12-01.orig/arch/x86/kernel/ldt-xen.c 2008-12-01 11:37:10.000000000 +0100
8851+++ head-2008-12-01/arch/x86/kernel/ldt-xen.c 2008-12-01 11:49:07.000000000 +0100
8852@@ -20,9 +20,9 @@
8853 #include <asm/mmu_context.h>
8854
8855 #ifdef CONFIG_SMP
8856-static void flush_ldt(void *null)
8857+static void flush_ldt(void *current_mm)
8858 {
8859- if (current->active_mm)
8860+ if (current->active_mm == current_mm)
8861 load_LDT(&current->active_mm->context);
8862 }
8863 #endif
8864@@ -62,8 +62,6 @@ static int alloc_ldt(mm_context_t *pc, i
8865
8866 if (reload) {
8867 #ifdef CONFIG_SMP
8868- cpumask_t mask;
8869-
8870 preempt_disable();
8871 #endif
8872 make_pages_readonly(newldt,
8873@@ -71,9 +69,9 @@ static int alloc_ldt(mm_context_t *pc, i
8874 XENFEAT_writable_descriptor_tables);
8875 load_LDT(pc);
8876 #ifdef CONFIG_SMP
8877- mask = cpumask_of_cpu(smp_processor_id());
8878- if (!cpus_equal(current->mm->cpu_vm_mask, mask))
8879- smp_call_function(flush_ldt, NULL, 1, 1);
8880+ if (!cpus_equal(current->mm->cpu_vm_mask,
8881+ cpumask_of_cpu(smp_processor_id())))
8882+ smp_call_function(flush_ldt, current->mm, 1);
8883 preempt_enable();
8884 #endif
8885 }
8886Index: head-2008-12-01/arch/x86/kernel/microcode-xen.c
8887===================================================================
8888--- head-2008-12-01.orig/arch/x86/kernel/microcode-xen.c 2008-12-01 11:44:55.000000000 +0100
8889+++ head-2008-12-01/arch/x86/kernel/microcode-xen.c 2008-12-01 11:49:07.000000000 +0100
8890@@ -5,13 +5,14 @@
8891 * 2006 Shaohua Li <shaohua.li@intel.com>
8892 *
8893 * This driver allows to upgrade microcode on Intel processors
8894- * belonging to IA-32 family - PentiumPro, Pentium II,
8895+ * belonging to IA-32 family - PentiumPro, Pentium II,
8896 * Pentium III, Xeon, Pentium 4, etc.
8897 *
8898- * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
8899- * Order Number 245472 or free download from:
8900- *
8901- * http://developer.intel.com/design/pentium4/manuals/245472.htm
8902+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
8903+ * Software Developer's Manual
8904+ * Order Number 253668 or free download from:
8905+ *
8906+ * http://developer.intel.com/design/pentium4/manuals/253668.htm
8907 *
8908 * For more information, go to http://www.urbanmyth.org/microcode
8909 *
8910@@ -26,6 +27,7 @@
8911 #include <linux/kernel.h>
8912 #include <linux/init.h>
8913 #include <linux/sched.h>
8914+#include <linux/smp_lock.h>
8915 #include <linux/cpumask.h>
8916 #include <linux/module.h>
8917 #include <linux/slab.h>
8918@@ -86,6 +88,7 @@ static int do_microcode_update (const vo
8919
8920 static int microcode_open (struct inode *unused1, struct file *unused2)
8921 {
8922+ cycle_kernel_lock();
8923 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
8924 }
8925
8926@@ -162,7 +165,7 @@ static int request_microcode(void)
8927 c->x86, c->x86_model, c->x86_mask);
8928 error = request_firmware(&firmware, name, &microcode_pdev->dev);
8929 if (error) {
8930- pr_debug("microcode: ucode data file %s load failed\n", name);
8931+ pr_debug("microcode: data file %s load failed\n", name);
8932 return error;
8933 }
8934
8935@@ -183,6 +186,9 @@ static int __init microcode_init (void)
8936 {
8937 int error;
8938
8939+ printk(KERN_INFO
8940+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8941+
8942 error = microcode_dev_init();
8943 if (error)
8944 return error;
8945@@ -195,8 +201,6 @@ static int __init microcode_init (void)
8946
8947 request_microcode();
8948
8949- printk(KERN_INFO
8950- "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8951 return 0;
8952 }
8953
8954Index: head-2008-12-01/arch/x86/kernel/mpparse-xen.c
8955===================================================================
8956--- head-2008-12-01.orig/arch/x86/kernel/mpparse-xen.c 2008-12-01 11:44:55.000000000 +0100
8957+++ head-2008-12-01/arch/x86/kernel/mpparse-xen.c 2008-12-01 11:49:07.000000000 +0100
8958@@ -25,6 +25,9 @@
8959 #include <asm/proto.h>
8960 #include <asm/acpi.h>
8961 #include <asm/bios_ebda.h>
8962+#include <asm/e820.h>
8963+#include <asm/trampoline.h>
8964+#include <asm/setup.h>
8965
8966 #include <mach_apic.h>
8967 #ifdef CONFIG_X86_32
8968@@ -32,28 +35,6 @@
8969 #include <mach_mpparse.h>
8970 #endif
8971
8972-/* Have we found an MP table */
8973-int smp_found_config;
8974-
8975-/*
8976- * Various Linux-internal data structures created from the
8977- * MP-table.
8978- */
8979-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
8980-int mp_bus_id_to_type[MAX_MP_BUSSES];
8981-#endif
8982-
8983-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
8984-int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
8985-
8986-static int mp_current_pci_id;
8987-
8988-int pic_mode;
8989-
8990-/*
8991- * Intel MP BIOS table parsing routines:
8992- */
8993-
8994 /*
8995 * Checksum an MP configuration block.
8996 */
8997@@ -68,20 +49,8 @@ static int __init mpf_checksum(unsigned
8998 return sum & 0xFF;
8999 }
9000
9001-#ifdef CONFIG_X86_NUMAQ
9002-/*
9003- * Have to match translation table entries to main table entries by counter
9004- * hence the mpc_record variable .... can't see a less disgusting way of
9005- * doing this ....
9006- */
9007-
9008-static int mpc_record;
9009-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
9010- __cpuinitdata;
9011-#endif
9012-
9013 #ifndef CONFIG_XEN
9014-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
9015+static void __init MP_processor_info(struct mpc_config_processor *m)
9016 {
9017 int apicid;
9018 char *bootup_cpu = "";
9019@@ -90,11 +59,12 @@ static void __cpuinit MP_processor_info(
9020 disabled_cpus++;
9021 return;
9022 }
9023-#ifdef CONFIG_X86_NUMAQ
9024- apicid = mpc_apic_id(m, translation_table[mpc_record]);
9025-#else
9026- apicid = m->mpc_apicid;
9027-#endif
9028+
9029+ if (x86_quirks->mpc_apic_id)
9030+ apicid = x86_quirks->mpc_apic_id(m);
9031+ else
9032+ apicid = m->mpc_apicid;
9033+
9034 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
9035 bootup_cpu = " (Bootup-CPU)";
9036 boot_cpu_physical_apicid = m->mpc_apicid;
9037@@ -104,24 +74,23 @@ static void __cpuinit MP_processor_info(
9038 generic_processor_info(apicid, m->mpc_apicver);
9039 }
9040 #else
9041-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
9042+static void __init MP_processor_info(struct mpc_config_processor *m)
9043 {
9044 num_processors++;
9045 }
9046 #endif /* CONFIG_XEN */
9047
9048+#ifdef CONFIG_X86_IO_APIC
9049 static void __init MP_bus_info(struct mpc_config_bus *m)
9050 {
9051 char str[7];
9052-
9053 memcpy(str, m->mpc_bustype, 6);
9054 str[6] = 0;
9055
9056-#ifdef CONFIG_X86_NUMAQ
9057- mpc_oem_bus_info(m, str, translation_table[mpc_record]);
9058-#else
9059- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
9060-#endif
9061+ if (x86_quirks->mpc_oem_bus_info)
9062+ x86_quirks->mpc_oem_bus_info(m, str);
9063+ else
9064+ apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
9065
9066 #if MAX_MP_BUSSES < 256
9067 if (m->mpc_busid >= MAX_MP_BUSSES) {
9068@@ -138,12 +107,10 @@ static void __init MP_bus_info(struct mp
9069 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
9070 #endif
9071 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
9072-#ifdef CONFIG_X86_NUMAQ
9073- mpc_oem_pci_bus(m, translation_table[mpc_record]);
9074-#endif
9075+ if (x86_quirks->mpc_oem_pci_bus)
9076+ x86_quirks->mpc_oem_pci_bus(m);
9077+
9078 clear_bit(m->mpc_busid, mp_bus_not_pci);
9079- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
9080- mp_current_pci_id++;
9081 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
9082 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
9083 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
9084@@ -154,6 +121,7 @@ static void __init MP_bus_info(struct mp
9085 } else
9086 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
9087 }
9088+#endif
9089
9090 #ifdef CONFIG_X86_IO_APIC
9091
9092@@ -183,117 +151,111 @@ static void __init MP_ioapic_info(struct
9093 if (bad_ioapic(m->mpc_apicaddr))
9094 return;
9095
9096- mp_ioapics[nr_ioapics] = *m;
9097+ mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
9098+ mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
9099+ mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
9100+ mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
9101+ mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
9102 nr_ioapics++;
9103 }
9104
9105-static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9106+static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
9107 {
9108- mp_irqs[mp_irq_entries] = *m;
9109- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
9110+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9111 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9112 m->mpc_irqtype, m->mpc_irqflag & 3,
9113 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
9114 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
9115- if (++mp_irq_entries == MAX_IRQ_SOURCES)
9116- panic("Max # of irq sources exceeded!!\n");
9117 }
9118
9119-#endif
9120-
9121-static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9122+static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
9123 {
9124- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
9125- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9126- m->mpc_irqtype, m->mpc_irqflag & 3,
9127- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9128- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9129+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9130+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9131+ mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
9132+ (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
9133+ mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
9134 }
9135
9136-#ifdef CONFIG_X86_NUMAQ
9137-static void __init MP_translation_info(struct mpc_config_translation *m)
9138+static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
9139+ struct mp_config_intsrc *mp_irq)
9140 {
9141- printk(KERN_INFO
9142- "Translation: record %d, type %d, quad %d, global %d, local %d\n",
9143- mpc_record, m->trans_type, m->trans_quad, m->trans_global,
9144- m->trans_local);
9145+ mp_irq->mp_dstapic = m->mpc_dstapic;
9146+ mp_irq->mp_type = m->mpc_type;
9147+ mp_irq->mp_irqtype = m->mpc_irqtype;
9148+ mp_irq->mp_irqflag = m->mpc_irqflag;
9149+ mp_irq->mp_srcbus = m->mpc_srcbus;
9150+ mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
9151+ mp_irq->mp_dstirq = m->mpc_dstirq;
9152+}
9153
9154- if (mpc_record >= MAX_MPC_ENTRY)
9155- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
9156- else
9157- translation_table[mpc_record] = m; /* stash this for later */
9158- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
9159- node_set_online(m->trans_quad);
9160+static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
9161+ struct mpc_config_intsrc *m)
9162+{
9163+ m->mpc_dstapic = mp_irq->mp_dstapic;
9164+ m->mpc_type = mp_irq->mp_type;
9165+ m->mpc_irqtype = mp_irq->mp_irqtype;
9166+ m->mpc_irqflag = mp_irq->mp_irqflag;
9167+ m->mpc_srcbus = mp_irq->mp_srcbus;
9168+ m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
9169+ m->mpc_dstirq = mp_irq->mp_dstirq;
9170 }
9171
9172-/*
9173- * Read/parse the MPC oem tables
9174- */
9175+static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
9176+ struct mpc_config_intsrc *m)
9177+{
9178+ if (mp_irq->mp_dstapic != m->mpc_dstapic)
9179+ return 1;
9180+ if (mp_irq->mp_type != m->mpc_type)
9181+ return 2;
9182+ if (mp_irq->mp_irqtype != m->mpc_irqtype)
9183+ return 3;
9184+ if (mp_irq->mp_irqflag != m->mpc_irqflag)
9185+ return 4;
9186+ if (mp_irq->mp_srcbus != m->mpc_srcbus)
9187+ return 5;
9188+ if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
9189+ return 6;
9190+ if (mp_irq->mp_dstirq != m->mpc_dstirq)
9191+ return 7;
9192+
9193+ return 0;
9194+}
9195
9196-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
9197- unsigned short oemsize)
9198+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9199 {
9200- int count = sizeof(*oemtable); /* the header size */
9201- unsigned char *oemptr = ((unsigned char *)oemtable) + count;
9202+ int i;
9203
9204- mpc_record = 0;
9205- printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
9206- oemtable);
9207- if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
9208- printk(KERN_WARNING
9209- "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
9210- oemtable->oem_signature[0], oemtable->oem_signature[1],
9211- oemtable->oem_signature[2], oemtable->oem_signature[3]);
9212- return;
9213- }
9214- if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
9215- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
9216- return;
9217- }
9218- while (count < oemtable->oem_length) {
9219- switch (*oemptr) {
9220- case MP_TRANSLATION:
9221- {
9222- struct mpc_config_translation *m =
9223- (struct mpc_config_translation *)oemptr;
9224- MP_translation_info(m);
9225- oemptr += sizeof(*m);
9226- count += sizeof(*m);
9227- ++mpc_record;
9228- break;
9229- }
9230- default:
9231- {
9232- printk(KERN_WARNING
9233- "Unrecognised OEM table entry type! - %d\n",
9234- (int)*oemptr);
9235- return;
9236- }
9237- }
9238+ print_MP_intsrc_info(m);
9239+
9240+ for (i = 0; i < mp_irq_entries; i++) {
9241+ if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
9242+ return;
9243 }
9244+
9245+ assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
9246+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
9247+ panic("Max # of irq sources exceeded!!\n");
9248 }
9249
9250-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
9251- char *productid)
9252+#endif
9253+
9254+static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9255 {
9256- if (strncmp(oem, "IBM NUMA", 8))
9257- printk("Warning! May not be a NUMA-Q system!\n");
9258- if (mpc->mpc_oemptr)
9259- smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
9260- mpc->mpc_oemsize);
9261+ apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
9262+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9263+ m->mpc_irqtype, m->mpc_irqflag & 3,
9264+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9265+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9266 }
9267-#endif /* CONFIG_X86_NUMAQ */
9268
9269 /*
9270 * Read/parse the MPC
9271 */
9272
9273-static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9274+static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
9275+ char *str)
9276 {
9277- char str[16];
9278- char oem[10];
9279- int count = sizeof(*mpc);
9280- unsigned char *mpt = ((unsigned char *)mpc) + count;
9281
9282 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
9283 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
9284@@ -316,19 +278,41 @@ static int __init smp_read_mpc(struct mp
9285 }
9286 memcpy(oem, mpc->mpc_oem, 8);
9287 oem[8] = 0;
9288- printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
9289+ printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
9290
9291 memcpy(str, mpc->mpc_productid, 12);
9292 str[12] = 0;
9293- printk("Product ID: %s ", str);
9294
9295-#ifdef CONFIG_X86_32
9296- mps_oem_check(mpc, oem, str);
9297-#endif
9298- printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
9299+ printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
9300
9301 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
9302
9303+ return 1;
9304+}
9305+
9306+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9307+{
9308+ char str[16];
9309+ char oem[10];
9310+
9311+ int count = sizeof(*mpc);
9312+ unsigned char *mpt = ((unsigned char *)mpc) + count;
9313+
9314+ if (!smp_check_mpc(mpc, oem, str))
9315+ return 0;
9316+
9317+#ifdef CONFIG_X86_32
9318+ /*
9319+ * need to make sure summit and es7000's mps_oem_check is safe to be
9320+ * called early via genericarch's mps_oem_check
9321+ */
9322+ if (early) {
9323+#ifdef CONFIG_X86_NUMAQ
9324+ numaq_mps_oem_check(mpc, oem, str);
9325+#endif
9326+ } else
9327+ mps_oem_check(mpc, oem, str);
9328+#endif
9329 /* save the local APIC address, it might be non-default */
9330 if (!acpi_lapic)
9331 mp_lapic_addr = mpc->mpc_lapic;
9332@@ -336,12 +320,17 @@ static int __init smp_read_mpc(struct mp
9333 if (early)
9334 return 1;
9335
9336+ if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
9337+ struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
9338+ x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
9339+ }
9340+
9341 /*
9342 * Now process the configuration blocks.
9343 */
9344-#ifdef CONFIG_X86_NUMAQ
9345- mpc_record = 0;
9346-#endif
9347+ if (x86_quirks->mpc_record)
9348+ *x86_quirks->mpc_record = 0;
9349+
9350 while (count < mpc->mpc_length) {
9351 switch (*mpt) {
9352 case MP_PROCESSOR:
9353@@ -359,7 +348,9 @@ static int __init smp_read_mpc(struct mp
9354 {
9355 struct mpc_config_bus *m =
9356 (struct mpc_config_bus *)mpt;
9357+#ifdef CONFIG_X86_IO_APIC
9358 MP_bus_info(m);
9359+#endif
9360 mpt += sizeof(*m);
9361 count += sizeof(*m);
9362 break;
9363@@ -405,10 +396,14 @@ static int __init smp_read_mpc(struct mp
9364 count = mpc->mpc_length;
9365 break;
9366 }
9367-#ifdef CONFIG_X86_NUMAQ
9368- ++mpc_record;
9369-#endif
9370+ if (x86_quirks->mpc_record)
9371+ (*x86_quirks->mpc_record)++;
9372 }
9373+
9374+#ifdef CONFIG_X86_GENERICARCH
9375+ generic_bigsmp_probe();
9376+#endif
9377+
9378 setup_apic_routing();
9379 if (!num_processors)
9380 printk(KERN_ERR "MPTABLE: no processors registered!\n");
9381@@ -434,7 +429,7 @@ static void __init construct_default_ioi
9382 intsrc.mpc_type = MP_INTSRC;
9383 intsrc.mpc_irqflag = 0; /* conforming */
9384 intsrc.mpc_srcbus = 0;
9385- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
9386+ intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
9387
9388 intsrc.mpc_irqtype = mp_INT;
9389
9390@@ -495,40 +490,11 @@ static void __init construct_default_ioi
9391 MP_intsrc_info(&intsrc);
9392 }
9393
9394-#endif
9395
9396-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9397+static void __init construct_ioapic_table(int mpc_default_type)
9398 {
9399- struct mpc_config_processor processor;
9400- struct mpc_config_bus bus;
9401-#ifdef CONFIG_X86_IO_APIC
9402 struct mpc_config_ioapic ioapic;
9403-#endif
9404- struct mpc_config_lintsrc lintsrc;
9405- int linttypes[2] = { mp_ExtINT, mp_NMI };
9406- int i;
9407-
9408- /*
9409- * local APIC has default address
9410- */
9411- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9412-
9413- /*
9414- * 2 CPUs, numbered 0 & 1.
9415- */
9416- processor.mpc_type = MP_PROCESSOR;
9417- /* Either an integrated APIC or a discrete 82489DX. */
9418- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9419- processor.mpc_cpuflag = CPU_ENABLED;
9420- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9421- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9422- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9423- processor.mpc_reserved[0] = 0;
9424- processor.mpc_reserved[1] = 0;
9425- for (i = 0; i < 2; i++) {
9426- processor.mpc_apicid = i;
9427- MP_processor_info(&processor);
9428- }
9429+ struct mpc_config_bus bus;
9430
9431 bus.mpc_type = MP_BUS;
9432 bus.mpc_busid = 0;
9433@@ -557,7 +523,6 @@ static inline void __init construct_defa
9434 MP_bus_info(&bus);
9435 }
9436
9437-#ifdef CONFIG_X86_IO_APIC
9438 ioapic.mpc_type = MP_IOAPIC;
9439 ioapic.mpc_apicid = 2;
9440 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9441@@ -569,7 +534,42 @@ static inline void __init construct_defa
9442 * We set up most of the low 16 IO-APIC pins according to MPS rules.
9443 */
9444 construct_default_ioirq_mptable(mpc_default_type);
9445+}
9446+#else
9447+static inline void __init construct_ioapic_table(int mpc_default_type) { }
9448 #endif
9449+
9450+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9451+{
9452+ struct mpc_config_processor processor;
9453+ struct mpc_config_lintsrc lintsrc;
9454+ int linttypes[2] = { mp_ExtINT, mp_NMI };
9455+ int i;
9456+
9457+ /*
9458+ * local APIC has default address
9459+ */
9460+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9461+
9462+ /*
9463+ * 2 CPUs, numbered 0 & 1.
9464+ */
9465+ processor.mpc_type = MP_PROCESSOR;
9466+ /* Either an integrated APIC or a discrete 82489DX. */
9467+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9468+ processor.mpc_cpuflag = CPU_ENABLED;
9469+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9470+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9471+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9472+ processor.mpc_reserved[0] = 0;
9473+ processor.mpc_reserved[1] = 0;
9474+ for (i = 0; i < 2; i++) {
9475+ processor.mpc_apicid = i;
9476+ MP_processor_info(&processor);
9477+ }
9478+
9479+ construct_ioapic_table(mpc_default_type);
9480+
9481 lintsrc.mpc_type = MP_LINTSRC;
9482 lintsrc.mpc_irqflag = 0; /* conforming */
9483 lintsrc.mpc_srcbusid = 0;
9484@@ -587,10 +587,14 @@ static struct intel_mp_floating *mpf_fou
9485 /*
9486 * Scan the memory blocks for an SMP configuration block.
9487 */
9488-static void __init __get_smp_config(unsigned early)
9489+static void __init __get_smp_config(unsigned int early)
9490 {
9491 struct intel_mp_floating *mpf = mpf_found;
9492
9493+ if (x86_quirks->mach_get_smp_config) {
9494+ if (x86_quirks->mach_get_smp_config(early))
9495+ return;
9496+ }
9497 if (acpi_lapic && early)
9498 return;
9499 /*
9500@@ -607,7 +611,7 @@ static void __init __get_smp_config(unsi
9501
9502 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
9503 mpf->mpf_specification);
9504-#ifdef CONFIG_X86_32
9505+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
9506 if (mpf->mpf_feature2 & (1 << 7)) {
9507 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
9508 pic_mode = 1;
9509@@ -639,7 +643,9 @@ static void __init __get_smp_config(unsi
9510 * override the defaults.
9511 */
9512 if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
9513+#ifdef CONFIG_X86_LOCAL_APIC
9514 smp_found_config = 0;
9515+#endif
9516 printk(KERN_ERR
9517 "BIOS bug, MP table errors detected!...\n");
9518 printk(KERN_ERR "... disabling SMP support. "
9519@@ -696,7 +702,8 @@ static int __init smp_scan_config(unsign
9520 unsigned int *bp = isa_bus_to_virt(base);
9521 struct intel_mp_floating *mpf;
9522
9523- Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
9524+ apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
9525+ bp, length);
9526 BUILD_BUG_ON(sizeof(*mpf) != 16);
9527
9528 while (length > 0) {
9529@@ -706,16 +713,22 @@ static int __init smp_scan_config(unsign
9530 !mpf_checksum((unsigned char *)bp, 16) &&
9531 ((mpf->mpf_specification == 1)
9532 || (mpf->mpf_specification == 4))) {
9533-
9534+#ifdef CONFIG_X86_LOCAL_APIC
9535 smp_found_config = 1;
9536+#endif
9537 mpf_found = mpf;
9538-#ifdef CONFIG_X86_32
9539+
9540 #ifndef CONFIG_XEN
9541 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9542 mpf, virt_to_phys(mpf));
9543- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
9544+
9545+ if (!reserve)
9546+ return 1;
9547+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
9548 BOOTMEM_DEFAULT);
9549 if (mpf->mpf_physptr) {
9550+ unsigned long size = PAGE_SIZE;
9551+#ifdef CONFIG_X86_32
9552 /*
9553 * We cannot access to MPC table to compute
9554 * table size yet, as only few megabytes from
9555@@ -725,27 +738,18 @@ static int __init smp_scan_config(unsign
9556 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
9557 * in reserve_bootmem.
9558 */
9559- unsigned long size = PAGE_SIZE;
9560 unsigned long end = max_low_pfn * PAGE_SIZE;
9561 if (mpf->mpf_physptr + size > end)
9562 size = end - mpf->mpf_physptr;
9563- reserve_bootmem(mpf->mpf_physptr, size,
9564+#endif
9565+ reserve_bootmem_generic(mpf->mpf_physptr, size,
9566 BOOTMEM_DEFAULT);
9567 }
9568 #else
9569 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9570 mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
9571 #endif
9572-#elif !defined(CONFIG_XEN)
9573- if (!reserve)
9574- return 1;
9575-
9576- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
9577- if (mpf->mpf_physptr)
9578- reserve_bootmem_generic(mpf->mpf_physptr,
9579- PAGE_SIZE);
9580-#endif
9581- return 1;
9582+ return 1;
9583 }
9584 bp += 4;
9585 length -= 16;
9586@@ -753,10 +757,15 @@ static int __init smp_scan_config(unsign
9587 return 0;
9588 }
9589
9590-static void __init __find_smp_config(unsigned reserve)
9591+static void __init __find_smp_config(unsigned int reserve)
9592 {
9593 #ifndef CONFIG_XEN
9594 unsigned int address;
9595+
9596+ if (x86_quirks->mach_find_smp_config) {
9597+ if (x86_quirks->mach_find_smp_config(reserve))
9598+ return;
9599+ }
9600 #endif
9601
9602 /*
9603@@ -805,300 +814,301 @@ void __init find_smp_config(void)
9604 __find_smp_config(1);
9605 }
9606
9607-/* --------------------------------------------------------------------------
9608- ACPI-based MP Configuration
9609- -------------------------------------------------------------------------- */
9610-
9611-/*
9612- * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
9613- */
9614-int es7000_plat;
9615-
9616-#ifdef CONFIG_ACPI
9617+#ifdef CONFIG_X86_IO_APIC
9618+static u8 __initdata irq_used[MAX_IRQ_SOURCES];
9619
9620-#ifdef CONFIG_X86_IO_APIC
9621+static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
9622+{
9623+ int i;
9624
9625-#define MP_ISA_BUS 0
9626+ if (m->mpc_irqtype != mp_INT)
9627+ return 0;
9628
9629-extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
9630+ if (m->mpc_irqflag != 0x0f)
9631+ return 0;
9632
9633-static int mp_find_ioapic(int gsi)
9634-{
9635- int i = 0;
9636+ /* not legacy */
9637
9638- /* Find the IOAPIC that manages this GSI. */
9639- for (i = 0; i < nr_ioapics; i++) {
9640- if ((gsi >= mp_ioapic_routing[i].gsi_base)
9641- && (gsi <= mp_ioapic_routing[i].gsi_end))
9642- return i;
9643+ for (i = 0; i < mp_irq_entries; i++) {
9644+ if (mp_irqs[i].mp_irqtype != mp_INT)
9645+ continue;
9646+
9647+ if (mp_irqs[i].mp_irqflag != 0x0f)
9648+ continue;
9649+
9650+ if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
9651+ continue;
9652+ if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
9653+ continue;
9654+ if (irq_used[i]) {
9655+ /* already claimed */
9656+ return -2;
9657+ }
9658+ irq_used[i] = 1;
9659+ return i;
9660 }
9661
9662- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
9663+ /* not found */
9664 return -1;
9665 }
9666
9667-static u8 __init uniq_ioapic_id(u8 id)
9668-{
9669-#ifdef CONFIG_X86_32
9670- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
9671- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
9672- return io_apic_get_unique_id(nr_ioapics, id);
9673- else
9674- return id;
9675-#else
9676- int i;
9677- DECLARE_BITMAP(used, 256);
9678- bitmap_zero(used, 256);
9679- for (i = 0; i < nr_ioapics; i++) {
9680- struct mpc_config_ioapic *ia = &mp_ioapics[i];
9681- __set_bit(ia->mpc_apicid, used);
9682- }
9683- if (!test_bit(id, used))
9684- return id;
9685- return find_first_zero_bit(used, 256);
9686+#define SPARE_SLOT_NUM 20
9687+
9688+static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
9689 #endif
9690-}
9691
9692-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
9693+static int __init replace_intsrc_all(struct mp_config_table *mpc,
9694+ unsigned long mpc_new_phys,
9695+ unsigned long mpc_new_length)
9696 {
9697- int idx = 0;
9698-
9699- if (bad_ioapic(address))
9700- return;
9701+#ifdef CONFIG_X86_IO_APIC
9702+ int i;
9703+ int nr_m_spare = 0;
9704+#endif
9705
9706- idx = nr_ioapics;
9707+ int count = sizeof(*mpc);
9708+ unsigned char *mpt = ((unsigned char *)mpc) + count;
9709
9710- mp_ioapics[idx].mpc_type = MP_IOAPIC;
9711- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
9712- mp_ioapics[idx].mpc_apicaddr = address;
9713+ printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
9714+ while (count < mpc->mpc_length) {
9715+ switch (*mpt) {
9716+ case MP_PROCESSOR:
9717+ {
9718+ struct mpc_config_processor *m =
9719+ (struct mpc_config_processor *)mpt;
9720+ mpt += sizeof(*m);
9721+ count += sizeof(*m);
9722+ break;
9723+ }
9724+ case MP_BUS:
9725+ {
9726+ struct mpc_config_bus *m =
9727+ (struct mpc_config_bus *)mpt;
9728+ mpt += sizeof(*m);
9729+ count += sizeof(*m);
9730+ break;
9731+ }
9732+ case MP_IOAPIC:
9733+ {
9734+ mpt += sizeof(struct mpc_config_ioapic);
9735+ count += sizeof(struct mpc_config_ioapic);
9736+ break;
9737+ }
9738+ case MP_INTSRC:
9739+ {
9740+#ifdef CONFIG_X86_IO_APIC
9741+ struct mpc_config_intsrc *m =
9742+ (struct mpc_config_intsrc *)mpt;
9743
9744-#ifndef CONFIG_XEN
9745- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
9746+ printk(KERN_INFO "OLD ");
9747+ print_MP_intsrc_info(m);
9748+ i = get_MP_intsrc_index(m);
9749+ if (i > 0) {
9750+ assign_to_mpc_intsrc(&mp_irqs[i], m);
9751+ printk(KERN_INFO "NEW ");
9752+ print_mp_irq_info(&mp_irqs[i]);
9753+ } else if (!i) {
9754+ /* legacy, do nothing */
9755+ } else if (nr_m_spare < SPARE_SLOT_NUM) {
9756+ /*
9757+ * not found (-1), or duplicated (-2)
9758+ * are invalid entries,
9759+ * we need to use the slot later
9760+ */
9761+ m_spare[nr_m_spare] = m;
9762+ nr_m_spare++;
9763+ }
9764 #endif
9765- mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
9766-#ifdef CONFIG_X86_32
9767- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
9768-#else
9769- mp_ioapics[idx].mpc_apicver = 0;
9770+ mpt += sizeof(struct mpc_config_intsrc);
9771+ count += sizeof(struct mpc_config_intsrc);
9772+ break;
9773+ }
9774+ case MP_LINTSRC:
9775+ {
9776+ struct mpc_config_lintsrc *m =
9777+ (struct mpc_config_lintsrc *)mpt;
9778+ mpt += sizeof(*m);
9779+ count += sizeof(*m);
9780+ break;
9781+ }
9782+ default:
9783+ /* wrong mptable */
9784+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
9785+ printk(KERN_ERR "type %x\n", *mpt);
9786+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
9787+ 1, mpc, mpc->mpc_length, 1);
9788+ goto out;
9789+ }
9790+ }
9791+
9792+#ifdef CONFIG_X86_IO_APIC
9793+ for (i = 0; i < mp_irq_entries; i++) {
9794+ if (irq_used[i])
9795+ continue;
9796+
9797+ if (mp_irqs[i].mp_irqtype != mp_INT)
9798+ continue;
9799+
9800+ if (mp_irqs[i].mp_irqflag != 0x0f)
9801+ continue;
9802+
9803+ if (nr_m_spare > 0) {
9804+ printk(KERN_INFO "*NEW* found ");
9805+ nr_m_spare--;
9806+ assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
9807+ m_spare[nr_m_spare] = NULL;
9808+ } else {
9809+ struct mpc_config_intsrc *m =
9810+ (struct mpc_config_intsrc *)mpt;
9811+ count += sizeof(struct mpc_config_intsrc);
9812+ if (!mpc_new_phys) {
9813+ printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
9814+ } else {
9815+ if (count <= mpc_new_length)
9816+ printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
9817+ else {
9818+ printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
9819+ goto out;
9820+ }
9821+ }
9822+ assign_to_mpc_intsrc(&mp_irqs[i], m);
9823+ mpc->mpc_length = count;
9824+ mpt += sizeof(struct mpc_config_intsrc);
9825+ }
9826+ print_mp_irq_info(&mp_irqs[i]);
9827+ }
9828 #endif
9829- /*
9830- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
9831- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
9832- */
9833- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
9834- mp_ioapic_routing[idx].gsi_base = gsi_base;
9835- mp_ioapic_routing[idx].gsi_end = gsi_base +
9836- io_apic_get_redir_entries(idx);
9837-
9838- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
9839- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
9840- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
9841- mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
9842+out:
9843+ /* update checksum */
9844+ mpc->mpc_checksum = 0;
9845+ mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
9846+ mpc->mpc_length);
9847
9848- nr_ioapics++;
9849+ return 0;
9850 }
9851
9852-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
9853-{
9854- struct mpc_config_intsrc intsrc;
9855- int ioapic = -1;
9856- int pin = -1;
9857-
9858- /*
9859- * Convert 'gsi' to 'ioapic.pin'.
9860- */
9861- ioapic = mp_find_ioapic(gsi);
9862- if (ioapic < 0)
9863- return;
9864- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9865+static int __initdata enable_update_mptable;
9866
9867- /*
9868- * TBD: This check is for faulty timer entries, where the override
9869- * erroneously sets the trigger to level, resulting in a HUGE
9870- * increase of timer interrupts!
9871- */
9872- if ((bus_irq == 0) && (trigger == 3))
9873- trigger = 1;
9874+static int __init update_mptable_setup(char *str)
9875+{
9876+ enable_update_mptable = 1;
9877+ return 0;
9878+}
9879+early_param("update_mptable", update_mptable_setup);
9880
9881- intsrc.mpc_type = MP_INTSRC;
9882- intsrc.mpc_irqtype = mp_INT;
9883- intsrc.mpc_irqflag = (trigger << 2) | polarity;
9884- intsrc.mpc_srcbus = MP_ISA_BUS;
9885- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
9886- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
9887- intsrc.mpc_dstirq = pin; /* INTIN# */
9888+static unsigned long __initdata mpc_new_phys;
9889+static unsigned long mpc_new_length __initdata = 4096;
9890
9891- MP_intsrc_info(&intsrc);
9892+/* alloc_mptable or alloc_mptable=4k */
9893+static int __initdata alloc_mptable;
9894+static int __init parse_alloc_mptable_opt(char *p)
9895+{
9896+ enable_update_mptable = 1;
9897+ alloc_mptable = 1;
9898+ if (!p)
9899+ return 0;
9900+ mpc_new_length = PAGE_SIZE << get_order(memparse(p, &p));
9901+ return 0;
9902 }
9903+early_param("alloc_mptable", parse_alloc_mptable_opt);
9904
9905-void __init mp_config_acpi_legacy_irqs(void)
9906+void __init early_reserve_e820_mpc_new(void)
9907 {
9908- struct mpc_config_intsrc intsrc;
9909- int i = 0;
9910- int ioapic = -1;
9911-
9912-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9913- /*
9914- * Fabricate the legacy ISA bus (bus #31).
9915- */
9916- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
9917+ if (enable_update_mptable && alloc_mptable) {
9918+ u64 startt = PAGE_SIZE;
9919+#ifdef CONFIG_X86_TRAMPOLINE
9920+ startt = TRAMPOLINE_BASE;
9921 #endif
9922- set_bit(MP_ISA_BUS, mp_bus_not_pci);
9923- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
9924-
9925- /*
9926- * Older generations of ES7000 have no legacy identity mappings
9927- */
9928- if (es7000_plat == 1)
9929- return;
9930-
9931- /*
9932- * Locate the IOAPIC that manages the ISA IRQs (0-15).
9933- */
9934- ioapic = mp_find_ioapic(0);
9935- if (ioapic < 0)
9936- return;
9937-
9938- intsrc.mpc_type = MP_INTSRC;
9939- intsrc.mpc_irqflag = 0; /* Conforming */
9940- intsrc.mpc_srcbus = MP_ISA_BUS;
9941-#ifdef CONFIG_X86_IO_APIC
9942- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
9943-#endif
9944- /*
9945- * Use the default configuration for the IRQs 0-15. Unless
9946- * overridden by (MADT) interrupt source override entries.
9947- */
9948- for (i = 0; i < 16; i++) {
9949- int idx;
9950-
9951- for (idx = 0; idx < mp_irq_entries; idx++) {
9952- struct mpc_config_intsrc *irq = mp_irqs + idx;
9953-
9954- /* Do we already have a mapping for this ISA IRQ? */
9955- if (irq->mpc_srcbus == MP_ISA_BUS
9956- && irq->mpc_srcbusirq == i)
9957- break;
9958-
9959- /* Do we already have a mapping for this IOAPIC pin */
9960- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
9961- (irq->mpc_dstirq == i))
9962- break;
9963- }
9964-
9965- if (idx != mp_irq_entries) {
9966- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
9967- continue; /* IRQ already used */
9968- }
9969-
9970- intsrc.mpc_irqtype = mp_INT;
9971- intsrc.mpc_srcbusirq = i; /* Identity mapped */
9972- intsrc.mpc_dstirq = i;
9973-
9974- MP_intsrc_info(&intsrc);
9975+ mpc_new_phys = early_reserve_e820(startt, mpc_new_length,
9976+ mpc_new_length);
9977 }
9978 }
9979
9980-int mp_register_gsi(u32 gsi, int triggering, int polarity)
9981+static int __init update_mp_table(void)
9982 {
9983- int ioapic;
9984- int ioapic_pin;
9985-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
9986-#define MAX_GSI_NUM 4096
9987-#define IRQ_COMPRESSION_START 64
9988+ char str[16];
9989+ char oem[10];
9990+ struct intel_mp_floating *mpf;
9991+ struct mp_config_table *mpc;
9992+ struct mp_config_table *mpc_new;
9993+
9994+ if (!enable_update_mptable)
9995+ return 0;
9996+
9997+ mpf = mpf_found;
9998+ if (!mpf)
9999+ return 0;
10000
10001- static int pci_irq = IRQ_COMPRESSION_START;
10002 /*
10003- * Mapping between Global System Interrupts, which
10004- * represent all possible interrupts, and IRQs
10005- * assigned to actual devices.
10006+ * Now see if we need to go further.
10007 */
10008- static int gsi_to_irq[MAX_GSI_NUM];
10009-#else
10010+ if (mpf->mpf_feature1 != 0)
10011+ return 0;
10012
10013- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
10014- return gsi;
10015-#endif
10016+ if (!mpf->mpf_physptr)
10017+ return 0;
10018
10019- /* Don't set up the ACPI SCI because it's already set up */
10020- if (acpi_gbl_FADT.sci_interrupt == gsi)
10021- return gsi;
10022+ mpc = isa_bus_to_virt(mpf->mpf_physptr);
10023
10024- ioapic = mp_find_ioapic(gsi);
10025- if (ioapic < 0) {
10026- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
10027- return gsi;
10028- }
10029+ if (!smp_check_mpc(mpc, oem, str))
10030+ return 0;
10031
10032- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
10033+ printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
10034+ printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
10035
10036-#ifndef CONFIG_X86_32
10037- if (ioapic_renumber_irq)
10038- gsi = ioapic_renumber_irq(ioapic, gsi);
10039-#endif
10040+ if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
10041+ mpc_new_phys = 0;
10042+ printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
10043+ mpc_new_length);
10044+ }
10045+
10046+ if (!mpc_new_phys) {
10047+ unsigned char old, new;
10048+ /* check if we can change the position */
10049+ mpc->mpc_checksum = 0;
10050+ old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10051+ mpc->mpc_checksum = 0xff;
10052+ new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10053+ if (old == new) {
10054+ printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
10055+ return 0;
10056+ }
10057+ printk(KERN_INFO "use in-position replacing\n");
10058+ } else {
10059+ maddr_t mpc_new_bus;
10060
10061- /*
10062- * Avoid pin reprogramming. PRTs typically include entries
10063- * with redundant pin->gsi mappings (but unique PCI devices);
10064- * we only program the IOAPIC on the first.
10065- */
10066- if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
10067- printk(KERN_ERR "Invalid reference to IOAPIC pin "
10068- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
10069- ioapic_pin);
10070- return gsi;
10071- }
10072- if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
10073- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
10074- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
10075-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10076- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
10077-#else
10078- return gsi;
10079-#endif
10080+ if (xen_create_contiguous_region((unsigned long)phys_to_virt(mpc_new_phys),
10081+ get_order(mpc_new_length), 32))
10082+ BUG();
10083+ mpc_new_bus = phys_to_machine(mpc_new_phys);
10084+ mpf->mpf_physptr = mpc_new_bus;
10085+ mpc_new = phys_to_virt(mpc_new_phys);
10086+ memcpy(mpc_new, mpc, mpc->mpc_length);
10087+ mpc = mpc_new;
10088+ /* check if we can modify that */
10089+ if (mpc_new_bus - mpf->mpf_physptr) {
10090+ struct intel_mp_floating *mpf_new;
10091+ /* steal 16 bytes from [0, 1k) */
10092+ printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
10093+ mpf_new = isa_bus_to_virt(0x400 - 16);
10094+ memcpy(mpf_new, mpf, 16);
10095+ mpf = mpf_new;
10096+ mpf->mpf_physptr = mpc_new_bus;
10097+ }
10098+ mpf->mpf_checksum = 0;
10099+ mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
10100+ printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
10101 }
10102
10103- set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
10104-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10105 /*
10106- * For GSI >= 64, use IRQ compression
10107+ * only replace the one with mp_INT and
10108+ * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
10109+ * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
10110+ * may need pci=routeirq for all coverage
10111 */
10112- if ((gsi >= IRQ_COMPRESSION_START)
10113- && (triggering == ACPI_LEVEL_SENSITIVE)) {
10114- /*
10115- * For PCI devices assign IRQs in order, avoiding gaps
10116- * due to unused I/O APIC pins.
10117- */
10118- int irq = gsi;
10119- if (gsi < MAX_GSI_NUM) {
10120- /*
10121- * Retain the VIA chipset work-around (gsi > 15), but
10122- * avoid a problem where the 8254 timer (IRQ0) is setup
10123- * via an override (so it's not on pin 0 of the ioapic),
10124- * and at the same time, the pin 0 interrupt is a PCI
10125- * type. The gsi > 15 test could cause these two pins
10126- * to be shared as IRQ0, and they are not shareable.
10127- * So test for this condition, and if necessary, avoid
10128- * the pin collision.
10129- */
10130- gsi = pci_irq++;
10131- /*
10132- * Don't assign IRQ used by ACPI SCI
10133- */
10134- if (gsi == acpi_gbl_FADT.sci_interrupt)
10135- gsi = pci_irq++;
10136- gsi_to_irq[irq] = gsi;
10137- } else {
10138- printk(KERN_ERR "GSI %u is too high\n", gsi);
10139- return gsi;
10140- }
10141- }
10142-#endif
10143- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
10144- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
10145- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
10146- return gsi;
10147+ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
10148+
10149+ return 0;
10150 }
10151
10152-#endif /* CONFIG_X86_IO_APIC */
10153-#endif /* CONFIG_ACPI */
10154+late_initcall(update_mp_table);
10155Index: head-2008-12-01/arch/x86/kernel/nmi.c
10156===================================================================
10157--- head-2008-12-01.orig/arch/x86/kernel/nmi.c 2008-12-03 15:48:43.000000000 +0100
10158+++ head-2008-12-01/arch/x86/kernel/nmi.c 2008-12-01 11:49:07.000000000 +0100
10159@@ -27,7 +27,9 @@
10160 #include <linux/kdebug.h>
10161 #include <linux/smp.h>
10162
10163+#ifndef CONFIG_XEN
10164 #include <asm/i8259.h>
10165+#endif
10166 #include <asm/io_apic.h>
10167 #include <asm/smp.h>
10168 #include <asm/nmi.h>
10169@@ -179,8 +181,10 @@ int __init check_nmi_watchdog(void)
10170 kfree(prev_nmi_count);
10171 return 0;
10172 error:
10173+#ifndef CONFIG_XEN
10174 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
10175 disable_8259A_irq(0);
10176+#endif
10177 #ifdef CONFIG_X86_32
10178 timer_ack = 0;
10179 #endif
10180Index: head-2008-12-01/arch/x86/kernel/pci-dma-xen.c
10181===================================================================
10182--- head-2008-12-01.orig/arch/x86/kernel/pci-dma-xen.c 2008-12-01 11:44:55.000000000 +0100
10183+++ head-2008-12-01/arch/x86/kernel/pci-dma-xen.c 2008-12-01 11:49:07.000000000 +0100
10184@@ -5,13 +5,13 @@
10185
10186 #include <asm/proto.h>
10187 #include <asm/dma.h>
10188-#include <asm/gart.h>
10189+#include <asm/iommu.h>
10190 #include <asm/calgary.h>
10191+#include <asm/amd_iommu.h>
10192
10193-int forbid_dac __read_mostly;
10194-EXPORT_SYMBOL(forbid_dac);
10195+static int forbid_dac __read_mostly;
10196
10197-const struct dma_mapping_ops *dma_ops;
10198+struct dma_mapping_ops *dma_ops;
10199 EXPORT_SYMBOL(dma_ops);
10200
10201 static int iommu_sac_force __read_mostly;
10202@@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_si
10203 void __init dma32_reserve_bootmem(void)
10204 {
10205 unsigned long size, align;
10206- if (end_pfn <= MAX_DMA32_PFN)
10207+ if (max_pfn <= MAX_DMA32_PFN)
10208 return;
10209
10210+ /*
10211+ * check aperture_64.c allocate_aperture() for reason about
10212+ * using 512M as goal
10213+ */
10214 align = 64ULL<<20;
10215 size = round_up(dma32_bootmem_size, align);
10216 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
10217- __pa(MAX_DMA_ADDRESS));
10218+ 512ULL<<20);
10219 if (dma32_bootmem_ptr)
10220 dma32_bootmem_size = size;
10221 else
10222@@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void)
10223 }
10224 static void __init dma32_free_bootmem(void)
10225 {
10226- int node;
10227
10228- if (end_pfn <= MAX_DMA32_PFN)
10229+ if (max_pfn <= MAX_DMA32_PFN)
10230 return;
10231
10232 if (!dma32_bootmem_ptr)
10233 return;
10234
10235- for_each_online_node(node)
10236- free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
10237- dma32_bootmem_size);
10238+ free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
10239
10240 dma32_bootmem_ptr = NULL;
10241 dma32_bootmem_size = 0;
10242@@ -107,7 +108,7 @@ static void __init dma32_free_bootmem(vo
10243 #define dma32_free_bootmem() ((void)0)
10244 #endif
10245
10246-static const struct dma_mapping_ops swiotlb_dma_ops = {
10247+static struct dma_mapping_ops swiotlb_dma_ops = {
10248 .mapping_error = swiotlb_dma_mapping_error,
10249 .map_single = swiotlb_map_single_phys,
10250 .unmap_single = swiotlb_unmap_single,
10251@@ -130,25 +131,31 @@ void __init pci_iommu_alloc(void)
10252 * The order of these functions is important for
10253 * fall-back/fail-over reasons
10254 */
10255-#ifdef CONFIG_GART_IOMMU
10256 gart_iommu_hole_init();
10257-#endif
10258
10259-#ifdef CONFIG_CALGARY_IOMMU
10260 detect_calgary();
10261-#endif
10262
10263 detect_intel_iommu();
10264
10265-#ifdef CONFIG_SWIOTLB
10266+ amd_iommu_detect();
10267+
10268 swiotlb_init();
10269 if (swiotlb) {
10270 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
10271 dma_ops = &swiotlb_dma_ops;
10272 }
10273-#endif
10274 }
10275
10276+#ifndef CONFIG_XEN
10277+unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
10278+{
10279+ unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
10280+
10281+ return size >> PAGE_SHIFT;
10282+}
10283+EXPORT_SYMBOL(iommu_num_pages);
10284+#endif
10285+
10286 /*
10287 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
10288 * documentation.
10289@@ -201,9 +208,7 @@ static __init int iommu_setup(char *p)
10290 swiotlb = 1;
10291 #endif
10292
10293-#ifdef CONFIG_GART_IOMMU
10294 gart_parse_options(p);
10295-#endif
10296
10297 #ifdef CONFIG_CALGARY_IOMMU
10298 if (!strncmp(p, "calgary", 7))
10299@@ -245,136 +250,19 @@ int range_straddles_page_boundary(paddr_
10300 !check_pages_physically_contiguous(pfn, offset, size));
10301 }
10302
10303-#ifdef CONFIG_X86_32
10304-int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
10305- dma_addr_t device_addr, size_t size, int flags)
10306-{
10307- void __iomem *mem_base = NULL;
10308- int pages = size >> PAGE_SHIFT;
10309- int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
10310-
10311- if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
10312- goto out;
10313- if (!size)
10314- goto out;
10315- if (dev->dma_mem)
10316- goto out;
10317-
10318- /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
10319-
10320- mem_base = ioremap(bus_addr, size);
10321- if (!mem_base)
10322- goto out;
10323-
10324- dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
10325- if (!dev->dma_mem)
10326- goto out;
10327- dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
10328- if (!dev->dma_mem->bitmap)
10329- goto free1_out;
10330-
10331- dev->dma_mem->virt_base = mem_base;
10332- dev->dma_mem->device_base = device_addr;
10333- dev->dma_mem->size = pages;
10334- dev->dma_mem->flags = flags;
10335-
10336- if (flags & DMA_MEMORY_MAP)
10337- return DMA_MEMORY_MAP;
10338-
10339- return DMA_MEMORY_IO;
10340-
10341- free1_out:
10342- kfree(dev->dma_mem);
10343- out:
10344- if (mem_base)
10345- iounmap(mem_base);
10346- return 0;
10347-}
10348-EXPORT_SYMBOL(dma_declare_coherent_memory);
10349-
10350-void dma_release_declared_memory(struct device *dev)
10351-{
10352- struct dma_coherent_mem *mem = dev->dma_mem;
10353-
10354- if (!mem)
10355- return;
10356- dev->dma_mem = NULL;
10357- iounmap(mem->virt_base);
10358- kfree(mem->bitmap);
10359- kfree(mem);
10360-}
10361-EXPORT_SYMBOL(dma_release_declared_memory);
10362-
10363-void *dma_mark_declared_memory_occupied(struct device *dev,
10364- dma_addr_t device_addr, size_t size)
10365-{
10366- struct dma_coherent_mem *mem = dev->dma_mem;
10367- int pos, err;
10368- int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
10369-
10370- pages >>= PAGE_SHIFT;
10371-
10372- if (!mem)
10373- return ERR_PTR(-EINVAL);
10374-
10375- pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
10376- err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
10377- if (err != 0)
10378- return ERR_PTR(err);
10379- return mem->virt_base + (pos << PAGE_SHIFT);
10380-}
10381-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
10382-
10383-static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
10384- dma_addr_t *dma_handle, void **ret)
10385-{
10386- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10387- int order = get_order(size);
10388-
10389- if (mem) {
10390- int page = bitmap_find_free_region(mem->bitmap, mem->size,
10391- order);
10392- if (page >= 0) {
10393- *dma_handle = mem->device_base + (page << PAGE_SHIFT);
10394- *ret = mem->virt_base + (page << PAGE_SHIFT);
10395- memset(*ret, 0, size);
10396- }
10397- if (mem->flags & DMA_MEMORY_EXCLUSIVE)
10398- *ret = NULL;
10399- }
10400- return (mem != NULL);
10401-}
10402-
10403-static int dma_release_coherent(struct device *dev, int order, void *vaddr)
10404-{
10405- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10406-
10407- if (mem && vaddr >= mem->virt_base && vaddr <
10408- (mem->virt_base + (mem->size << PAGE_SHIFT))) {
10409- int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
10410-
10411- bitmap_release_region(mem->bitmap, page, order);
10412- return 1;
10413- }
10414- return 0;
10415-}
10416-#else
10417-#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
10418-#define dma_release_coherent(dev, order, vaddr) (0)
10419-#endif /* CONFIG_X86_32 */
10420-
10421 int dma_supported(struct device *dev, u64 mask)
10422 {
10423+ struct dma_mapping_ops *ops = get_dma_ops(dev);
10424+
10425 #ifdef CONFIG_PCI
10426 if (mask > 0xffffffff && forbid_dac > 0) {
10427- printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
10428- dev->bus_id);
10429+ dev_info(dev, "PCI: Disallowing DAC for device\n");
10430 return 0;
10431 }
10432 #endif
10433
10434- if (dma_ops->dma_supported)
10435- return dma_ops->dma_supported(dev, mask);
10436+ if (ops->dma_supported)
10437+ return ops->dma_supported(dev, mask);
10438
10439 /* Copied from i386. Doesn't make much sense, because it will
10440 only work for pci_alloc_coherent.
10441@@ -395,8 +283,7 @@ int dma_supported(struct device *dev, u6
10442 type. Normally this doesn't make any difference, but gives
10443 more gentle handling of IOMMU overflow. */
10444 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
10445- printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
10446- dev->bus_id, mask);
10447+ dev_info(dev, "Force SAC with mask %Lx\n", mask);
10448 return 0;
10449 }
10450
10451@@ -422,6 +309,9 @@ void *
10452 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
10453 gfp_t gfp)
10454 {
10455+#ifndef CONFIG_XEN
10456+ struct dma_mapping_ops *ops = get_dma_ops(dev);
10457+#endif
10458 void *memory = NULL;
10459 struct page *page;
10460 unsigned long dma_mask = 0;
10461@@ -431,7 +321,7 @@ dma_alloc_coherent(struct device *dev, s
10462 /* ignore region specifiers */
10463 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
10464
10465- if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
10466+ if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
10467 return memory;
10468
10469 if (!dev) {
10470@@ -491,8 +381,8 @@ dma_alloc_coherent(struct device *dev, s
10471 /* Let low level make its own zone decisions */
10472 gfp &= ~(GFP_DMA32|GFP_DMA);
10473
10474- if (dma_ops->alloc_coherent)
10475- return dma_ops->alloc_coherent(dev, size,
10476+ if (ops->alloc_coherent)
10477+ return ops->alloc_coherent(dev, size,
10478 dma_handle, gfp);
10479 return NULL;
10480 }
10481@@ -504,14 +394,14 @@ dma_alloc_coherent(struct device *dev, s
10482 }
10483 }
10484
10485- if (dma_ops->alloc_coherent) {
10486+ if (ops->alloc_coherent) {
10487 free_pages((unsigned long)memory, order);
10488 gfp &= ~(GFP_DMA|GFP_DMA32);
10489- return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
10490+ return ops->alloc_coherent(dev, size, dma_handle, gfp);
10491 }
10492
10493- if (dma_ops->map_simple) {
10494- *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
10495+ if (ops->map_simple) {
10496+ *dma_handle = ops->map_simple(dev, virt_to_bus(memory),
10497 size,
10498 PCI_DMA_BIDIRECTIONAL);
10499 if (*dma_handle != bad_dma_address)
10500@@ -542,13 +432,17 @@ EXPORT_SYMBOL(dma_alloc_coherent);
10501 void dma_free_coherent(struct device *dev, size_t size,
10502 void *vaddr, dma_addr_t bus)
10503 {
10504+#ifndef CONFIG_XEN
10505+ struct dma_mapping_ops *ops = get_dma_ops(dev);
10506+#endif
10507+
10508 int order = get_order(size);
10509 WARN_ON(irqs_disabled()); /* for portability */
10510- if (dma_release_coherent(dev, order, vaddr))
10511+ if (dma_release_from_coherent(dev, order, vaddr))
10512 return;
10513 #ifndef CONFIG_XEN
10514- if (dma_ops->unmap_single)
10515- dma_ops->unmap_single(dev, bus, size, 0);
10516+ if (ops->unmap_single)
10517+ ops->unmap_single(dev, bus, size, 0);
10518 #endif
10519 xen_destroy_contiguous_region((unsigned long)vaddr, order);
10520 free_pages((unsigned long)vaddr, order);
10521@@ -557,15 +451,13 @@ EXPORT_SYMBOL(dma_free_coherent);
10522
10523 static int __init pci_iommu_init(void)
10524 {
10525-#ifdef CONFIG_CALGARY_IOMMU
10526 calgary_iommu_init();
10527-#endif
10528
10529 intel_iommu_init();
10530
10531-#ifdef CONFIG_GART_IOMMU
10532+ amd_iommu_init();
10533+
10534 gart_iommu_init();
10535-#endif
10536
10537 no_iommu_init();
10538 return 0;
10539Index: head-2008-12-01/arch/x86/kernel/pci-nommu-xen.c
10540===================================================================
10541--- head-2008-12-01.orig/arch/x86/kernel/pci-nommu-xen.c 2008-12-01 11:44:55.000000000 +0100
10542+++ head-2008-12-01/arch/x86/kernel/pci-nommu-xen.c 2008-12-01 11:49:07.000000000 +0100
10543@@ -79,18 +79,12 @@ gnttab_unmap_single(struct device *dev,
10544 gnttab_dma_unmap_page(dma_addr);
10545 }
10546
10547-static int nommu_mapping_error(dma_addr_t dma_addr)
10548-{
10549- return (dma_addr == bad_dma_address);
10550-}
10551-
10552-static const struct dma_mapping_ops nommu_dma_ops = {
10553+static struct dma_mapping_ops nommu_dma_ops = {
10554 .map_single = gnttab_map_single,
10555 .unmap_single = gnttab_unmap_single,
10556 .map_sg = gnttab_map_sg,
10557 .unmap_sg = gnttab_unmap_sg,
10558 .dma_supported = swiotlb_dma_supported,
10559- .mapping_error = nommu_mapping_error
10560 };
10561
10562 void __init no_iommu_init(void)
10563Index: head-2008-12-01/arch/x86/kernel/probe_roms_32.c
10564===================================================================
10565--- head-2008-12-01.orig/arch/x86/kernel/probe_roms_32.c 2008-12-03 15:48:43.000000000 +0100
10566+++ head-2008-12-01/arch/x86/kernel/probe_roms_32.c 2008-12-01 11:49:07.000000000 +0100
10567@@ -99,6 +99,11 @@ void __init probe_roms(void)
10568 unsigned char c;
10569 int i;
10570
10571+#ifdef CONFIG_XEN
10572+ if (!is_initial_xendomain())
10573+ return;
10574+#endif
10575+
10576 /* video rom */
10577 upper = adapter_rom_resources[0].start;
10578 for (start = video_rom_resource.start; start < upper; start += 2048) {
10579@@ -131,7 +136,7 @@ void __init probe_roms(void)
10580 upper = system_rom_resource.start;
10581
10582 /* check for extension rom (ignore length byte!) */
10583- rom = isa_bus_to_virt(extension_rom_resource.start);
10584+ rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
10585 if (romsignature(rom)) {
10586 length = extension_rom_resource.end - extension_rom_resource.start + 1;
10587 if (romchecksum(rom, length)) {
10588Index: head-2008-12-01/arch/x86/kernel/process-xen.c
10589===================================================================
10590--- head-2008-12-01.orig/arch/x86/kernel/process-xen.c 2008-12-01 11:44:55.000000000 +0100
10591+++ head-2008-12-01/arch/x86/kernel/process-xen.c 2008-12-01 11:49:07.000000000 +0100
10592@@ -6,6 +6,13 @@
10593 #include <linux/sched.h>
10594 #include <linux/module.h>
10595 #include <linux/pm.h>
10596+#include <linux/clockchips.h>
10597+#include <asm/system.h>
10598+
10599+unsigned long idle_halt;
10600+EXPORT_SYMBOL(idle_halt);
10601+unsigned long idle_nomwait;
10602+EXPORT_SYMBOL(idle_nomwait);
10603
10604 struct kmem_cache *task_xstate_cachep;
10605
10606@@ -45,6 +52,70 @@ void arch_task_cache_init(void)
10607 SLAB_PANIC, NULL);
10608 }
10609
10610+/*
10611+ * Idle related variables and functions
10612+ */
10613+unsigned long boot_option_idle_override = 0;
10614+EXPORT_SYMBOL(boot_option_idle_override);
10615+
10616+/*
10617+ * Powermanagement idle function, if any..
10618+ */
10619+void (*pm_idle)(void);
10620+EXPORT_SYMBOL(pm_idle);
10621+
10622+#ifdef CONFIG_X86_32
10623+/*
10624+ * This halt magic was a workaround for ancient floppy DMA
10625+ * wreckage. It should be safe to remove.
10626+ */
10627+static int hlt_counter;
10628+void disable_hlt(void)
10629+{
10630+ hlt_counter++;
10631+}
10632+EXPORT_SYMBOL(disable_hlt);
10633+
10634+void enable_hlt(void)
10635+{
10636+ hlt_counter--;
10637+}
10638+EXPORT_SYMBOL(enable_hlt);
10639+
10640+static inline int hlt_use_halt(void)
10641+{
10642+ return (!hlt_counter && boot_cpu_data.hlt_works_ok);
10643+}
10644+#else
10645+static inline int hlt_use_halt(void)
10646+{
10647+ return 1;
10648+}
10649+#endif
10650+
10651+/*
10652+ * We use this if we don't have any better
10653+ * idle routine..
10654+ */
10655+void xen_idle(void)
10656+{
10657+ current_thread_info()->status &= ~TS_POLLING;
10658+ /*
10659+ * TS_POLLING-cleared state must be visible before we
10660+ * test NEED_RESCHED:
10661+ */
10662+ smp_mb();
10663+
10664+ if (!need_resched())
10665+ safe_halt(); /* enables interrupts racelessly */
10666+ else
10667+ local_irq_enable();
10668+ current_thread_info()->status |= TS_POLLING;
10669+}
10670+#ifdef CONFIG_APM_MODULE
10671+EXPORT_SYMBOL(default_idle);
10672+#endif
10673+
10674 static void do_nothing(void *unused)
10675 {
10676 }
10677@@ -61,7 +132,7 @@ void cpu_idle_wait(void)
10678 {
10679 smp_mb();
10680 /* kick all the CPUs so that they exit out of pm_idle */
10681- smp_call_function(do_nothing, NULL, 0, 1);
10682+ smp_call_function(do_nothing, NULL, 1);
10683 }
10684 EXPORT_SYMBOL_GPL(cpu_idle_wait);
10685
10686@@ -125,60 +196,175 @@ static void poll_idle(void)
10687 *
10688 * idle=mwait overrides this decision and forces the usage of mwait.
10689 */
10690+static int __cpuinitdata force_mwait;
10691+
10692+#define MWAIT_INFO 0x05
10693+#define MWAIT_ECX_EXTENDED_INFO 0x01
10694+#define MWAIT_EDX_C1 0xf0
10695+
10696 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
10697 {
10698+ u32 eax, ebx, ecx, edx;
10699+
10700 if (force_mwait)
10701 return 1;
10702
10703- if (c->x86_vendor == X86_VENDOR_AMD) {
10704- switch(c->x86) {
10705- case 0x10:
10706- case 0x11:
10707- return 0;
10708+ if (c->cpuid_level < MWAIT_INFO)
10709+ return 0;
10710+
10711+ cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
10712+ /* Check, whether EDX has extended info about MWAIT */
10713+ if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
10714+ return 1;
10715+
10716+ /*
10717+ * edx enumerates MONITOR/MWAIT extensions. Check, whether
10718+ * C1 supports MWAIT
10719+ */
10720+ return (edx & MWAIT_EDX_C1);
10721+}
10722+
10723+/*
10724+ * Check for AMD CPUs, which have potentially C1E support
10725+ */
10726+static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
10727+{
10728+ if (c->x86_vendor != X86_VENDOR_AMD)
10729+ return 0;
10730+
10731+ if (c->x86 < 0x0F)
10732+ return 0;
10733+
10734+ /* Family 0x0f models < rev F do not have C1E */
10735+ if (c->x86 == 0x0f && c->x86_model < 0x40)
10736+ return 0;
10737+
10738+ return 1;
10739+}
10740+
10741+static cpumask_t c1e_mask = CPU_MASK_NONE;
10742+static int c1e_detected;
10743+
10744+void c1e_remove_cpu(int cpu)
10745+{
10746+ cpu_clear(cpu, c1e_mask);
10747+}
10748+
10749+/*
10750+ * C1E aware idle routine. We check for C1E active in the interrupt
10751+ * pending message MSR. If we detect C1E, then we handle it the same
10752+ * way as C3 power states (local apic timer and TSC stop)
10753+ */
10754+static void c1e_idle(void)
10755+{
10756+ if (need_resched())
10757+ return;
10758+
10759+ if (!c1e_detected) {
10760+ u32 lo, hi;
10761+
10762+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
10763+ if (lo & K8_INTP_C1E_ACTIVE_MASK) {
10764+ c1e_detected = 1;
10765+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
10766+ mark_tsc_unstable("TSC halt in AMD C1E");
10767+ printk(KERN_INFO "System has AMD C1E enabled\n");
10768+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
10769 }
10770 }
10771- return 1;
10772+
10773+ if (c1e_detected) {
10774+ int cpu = smp_processor_id();
10775+
10776+ if (!cpu_isset(cpu, c1e_mask)) {
10777+ cpu_set(cpu, c1e_mask);
10778+ /*
10779+			 * Force broadcast so ACPI cannot interfere. Needs
10780+			 * to run with interrupts enabled as it uses
10781+			 * smp_call_function().
10782+ */
10783+ local_irq_enable();
10784+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
10785+ &cpu);
10786+ printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
10787+ cpu);
10788+ local_irq_disable();
10789+ }
10790+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
10791+
10792+ default_idle();
10793+
10794+ /*
10795+ * The switch back from broadcast mode needs to be
10796+ * called with interrupts disabled.
10797+ */
10798+ local_irq_disable();
10799+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
10800+ local_irq_enable();
10801+ } else
10802+ default_idle();
10803 }
10804 #endif
10805
10806 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
10807 {
10808 #ifndef CONFIG_XEN
10809- static int selected;
10810-
10811- if (selected)
10812- return;
10813 #ifdef CONFIG_X86_SMP
10814 if (pm_idle == poll_idle && smp_num_siblings > 1) {
10815 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
10816 " performance may degrade.\n");
10817 }
10818 #endif
10819+ if (pm_idle)
10820+ return;
10821+
10822 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
10823 /*
10824- * Skip, if setup has overridden idle.
10825 * One CPU supports mwait => All CPUs supports mwait
10826 */
10827- if (!pm_idle) {
10828- printk(KERN_INFO "using mwait in idle threads.\n");
10829- pm_idle = mwait_idle;
10830- }
10831- }
10832- selected = 1;
10833+ printk(KERN_INFO "using mwait in idle threads.\n");
10834+ pm_idle = mwait_idle;
10835+ } else if (check_c1e_idle(c)) {
10836+ printk(KERN_INFO "using C1E aware idle routine\n");
10837+ pm_idle = c1e_idle;
10838+ } else
10839+ pm_idle = default_idle;
10840 #endif
10841 }
10842
10843 static int __init idle_setup(char *str)
10844 {
10845+ if (!str)
10846+ return -EINVAL;
10847+
10848 if (!strcmp(str, "poll")) {
10849 printk("using polling idle threads.\n");
10850 pm_idle = poll_idle;
10851- }
10852 #ifndef CONFIG_XEN
10853- else if (!strcmp(str, "mwait"))
10854+ } else if (!strcmp(str, "mwait"))
10855 force_mwait = 1;
10856+ else if (!strcmp(str, "halt")) {
10857+ /*
10858+ * When the boot option of idle=halt is added, halt is
10859+ * forced to be used for CPU idle. In such case CPU C2/C3
10860+ * won't be used again.
10861+ * To continue to load the CPU idle driver, don't touch
10862+ * the boot_option_idle_override.
10863+ */
10864+ pm_idle = default_idle;
10865+ idle_halt = 1;
10866+ return 0;
10867+ } else if (!strcmp(str, "nomwait")) {
10868+ /*
10869+ * If the boot option of "idle=nomwait" is added,
10870+ * it means that mwait will be disabled for CPU C2/C3
10871+ * states. In such case it won't touch the variable
10872+ * of boot_option_idle_override.
10873+ */
10874+ idle_nomwait = 1;
10875+ return 0;
10876 #endif
10877- else
10878+ } else
10879 return -1;
10880
10881 boot_option_idle_override = 1;
10882Index: head-2008-12-01/arch/x86/kernel/process_32-xen.c
10883===================================================================
10884--- head-2008-12-01.orig/arch/x86/kernel/process_32-xen.c 2008-12-01 11:44:55.000000000 +0100
10885+++ head-2008-12-01/arch/x86/kernel/process_32-xen.c 2008-12-01 11:49:07.000000000 +0100
10886@@ -59,15 +59,11 @@
10887 #include <asm/tlbflush.h>
10888 #include <asm/cpu.h>
10889 #include <asm/kdebug.h>
10890+#include <asm/idle.h>
10891
10892 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
10893 asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
10894
10895-static int hlt_counter;
10896-
10897-unsigned long boot_option_idle_override = 0;
10898-EXPORT_SYMBOL(boot_option_idle_override);
10899-
10900 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
10901 EXPORT_PER_CPU_SYMBOL(current_task);
10902
10903@@ -82,46 +78,27 @@ unsigned long thread_saved_pc(struct tas
10904 return ((unsigned long *)tsk->thread.sp)[3];
10905 }
10906
10907-/*
10908- * Powermanagement idle function, if any..
10909- */
10910-void (*pm_idle)(void);
10911-EXPORT_SYMBOL(pm_idle);
10912+#ifdef CONFIG_HOTPLUG_CPU
10913+#ifndef CONFIG_XEN
10914+#include <asm/nmi.h>
10915
10916-void disable_hlt(void)
10917+static void cpu_exit_clear(void)
10918 {
10919- hlt_counter++;
10920-}
10921+ int cpu = raw_smp_processor_id();
10922
10923-EXPORT_SYMBOL(disable_hlt);
10924-
10925-void enable_hlt(void)
10926-{
10927- hlt_counter--;
10928-}
10929+ idle_task_exit();
10930
10931-EXPORT_SYMBOL(enable_hlt);
10932+ cpu_uninit();
10933+ irq_ctx_exit(cpu);
10934
10935-static void xen_idle(void)
10936-{
10937- current_thread_info()->status &= ~TS_POLLING;
10938- /*
10939- * TS_POLLING-cleared state must be visible before we
10940- * test NEED_RESCHED:
10941- */
10942- smp_mb();
10943+ cpu_clear(cpu, cpu_callout_map);
10944+ cpu_clear(cpu, cpu_callin_map);
10945
10946- if (!need_resched())
10947- safe_halt(); /* enables interrupts racelessly */
10948- else
10949- local_irq_enable();
10950- current_thread_info()->status |= TS_POLLING;
10951+ numa_remove_cpu(cpu);
10952+ c1e_remove_cpu(cpu);
10953 }
10954-#ifdef CONFIG_APM_MODULE
10955-EXPORT_SYMBOL(default_idle);
10956 #endif
10957
10958-#ifdef CONFIG_HOTPLUG_CPU
10959 static inline void play_dead(void)
10960 {
10961 idle_task_exit();
10962@@ -152,13 +129,11 @@ void cpu_idle(void)
10963
10964 /* endless idle loop with no priority at all */
10965 while (1) {
10966- tick_nohz_stop_sched_tick();
10967+ tick_nohz_stop_sched_tick(1);
10968 while (!need_resched()) {
10969- void (*idle)(void);
10970
10971 check_pgt_cache();
10972 rmb();
10973- idle = xen_idle; /* no alternatives */
10974
10975 if (rcu_pending(cpu))
10976 rcu_check_callbacks(cpu, 0);
10977@@ -168,7 +143,10 @@ void cpu_idle(void)
10978
10979 local_irq_disable();
10980 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
10981- idle();
10982+ /* Don't trace irqs off for idle */
10983+ stop_critical_timings();
10984+ xen_idle();
10985+ start_critical_timings();
10986 }
10987 tick_nohz_restart_sched_tick();
10988 preempt_enable_no_resched();
10989Index: head-2008-12-01/arch/x86/kernel/process_64-xen.c
10990===================================================================
10991--- head-2008-12-01.orig/arch/x86/kernel/process_64-xen.c 2008-12-01 11:44:55.000000000 +0100
10992+++ head-2008-12-01/arch/x86/kernel/process_64-xen.c 2008-12-01 11:49:07.000000000 +0100
10993@@ -65,15 +65,6 @@ asmlinkage extern void ret_from_fork(voi
10994
10995 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
10996
10997-unsigned long boot_option_idle_override = 0;
10998-EXPORT_SYMBOL(boot_option_idle_override);
10999-
11000-/*
11001- * Powermanagement idle function, if any..
11002- */
11003-void (*pm_idle)(void);
11004-EXPORT_SYMBOL(pm_idle);
11005-
11006 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
11007
11008 void idle_notifier_register(struct notifier_block *n)
11009@@ -103,25 +94,13 @@ void exit_idle(void)
11010 __exit_idle();
11011 }
11012
11013-static void xen_idle(void)
11014-{
11015- current_thread_info()->status &= ~TS_POLLING;
11016- /*
11017- * TS_POLLING-cleared state must be visible before we
11018- * test NEED_RESCHED:
11019- */
11020- smp_mb();
11021- if (!need_resched())
11022- safe_halt(); /* enables interrupts racelessly */
11023- else
11024- local_irq_enable();
11025- current_thread_info()->status |= TS_POLLING;
11026-}
11027-
11028 #ifdef CONFIG_HOTPLUG_CPU
11029 static inline void play_dead(void)
11030 {
11031 idle_task_exit();
11032+#ifndef CONFIG_XEN
11033+ c1e_remove_cpu(raw_smp_processor_id());
11034+#endif
11035 local_irq_disable();
11036 cpu_clear(smp_processor_id(), cpu_initialized);
11037 preempt_enable_no_resched();
11038@@ -146,12 +125,11 @@ void cpu_idle(void)
11039 current_thread_info()->status |= TS_POLLING;
11040 /* endless idle loop with no priority at all */
11041 while (1) {
11042- tick_nohz_stop_sched_tick();
11043+ tick_nohz_stop_sched_tick(1);
11044 while (!need_resched()) {
11045- void (*idle)(void);
11046
11047 rmb();
11048- idle = xen_idle; /* no alternatives */
11049+
11050 if (cpu_is_offline(smp_processor_id()))
11051 play_dead();
11052 /*
11053@@ -161,7 +139,10 @@ void cpu_idle(void)
11054 */
11055 local_irq_disable();
11056 enter_idle();
11057- idle();
11058+ /* Don't trace irqs off for idle */
11059+ stop_critical_timings();
11060+ xen_idle();
11061+ start_critical_timings();
11062 /* In many cases the interrupt that ended idle
11063 has already called exit_idle. But some idle
11064 loops can be woken up without interrupt. */
11065@@ -271,7 +252,7 @@ void exit_thread(void)
11066 }
11067 }
11068
11069-void load_gs_index(unsigned gs)
11070+void xen_load_gs_index(unsigned gs)
11071 {
11072 WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
11073 }
11074@@ -373,10 +354,10 @@ int copy_thread(int nr, unsigned long cl
11075 p->thread.fs = me->thread.fs;
11076 p->thread.gs = me->thread.gs;
11077
11078- asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
11079- asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
11080- asm("mov %%es,%0" : "=m" (p->thread.es));
11081- asm("mov %%ds,%0" : "=m" (p->thread.ds));
11082+ savesegment(gs, p->thread.gsindex);
11083+ savesegment(fs, p->thread.fsindex);
11084+ savesegment(es, p->thread.es);
11085+ savesegment(ds, p->thread.ds);
11086
11087 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
11088 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
11089@@ -417,7 +398,9 @@ out:
11090 void
11091 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
11092 {
11093- asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
11094+ loadsegment(fs, 0);
11095+ loadsegment(es, 0);
11096+ loadsegment(ds, 0);
11097 load_gs_index(0);
11098 regs->ip = new_ip;
11099 regs->sp = new_sp;
11100@@ -557,8 +540,8 @@ static inline void __switch_to_xtra(stru
11101 struct task_struct *
11102 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
11103 {
11104- struct thread_struct *prev = &prev_p->thread,
11105- *next = &next_p->thread;
11106+ struct thread_struct *prev = &prev_p->thread;
11107+ struct thread_struct *next = &next_p->thread;
11108 int cpu = smp_processor_id();
11109 #ifndef CONFIG_X86_NO_TSS
11110 struct tss_struct *tss = &per_cpu(init_tss, cpu);
11111@@ -661,12 +644,25 @@ __switch_to(struct task_struct *prev_p,
11112 */
11113 if (unlikely(next->es))
11114 loadsegment(es, next->es);
11115-
11116+
11117 if (unlikely(next->ds))
11118 loadsegment(ds, next->ds);
11119
11120+ /*
11121+ * Leave lazy mode, flushing any hypercalls made here.
11122+ * This must be done before restoring TLS segments so
11123+ * the GDT and LDT are properly updated, and must be
11124+ * done before math_state_restore, so the TS bit is up
11125+ * to date.
11126+ */
11127+ arch_leave_lazy_cpu_mode();
11128+
11129 /*
11130 * Switch FS and GS.
11131+ *
11132+ * Segment register != 0 always requires a reload. Also
11133+ * reload when it has changed. When prev process used 64bit
11134+ * base always reload to avoid an information leak.
11135 */
11136 if (unlikely(next->fsindex))
11137 loadsegment(fs, next->fsindex);
11138@@ -687,7 +683,8 @@ __switch_to(struct task_struct *prev_p,
11139 write_pda(oldrsp, next->usersp);
11140 write_pda(pcurrent, next_p);
11141 write_pda(kernelstack,
11142- (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
11143+ (unsigned long)task_stack_page(next_p) +
11144+ THREAD_SIZE - PDA_STACKOFFSET);
11145 #ifdef CONFIG_CC_STACKPROTECTOR
11146 write_pda(stack_canary, next_p->stack_canary);
11147
11148@@ -848,7 +845,7 @@ long do_arch_prctl(struct task_struct *t
11149 set_32bit_tls(task, FS_TLS, addr);
11150 if (doit) {
11151 load_TLS(&task->thread, cpu);
11152- asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
11153+ loadsegment(fs, FS_TLS_SEL);
11154 }
11155 task->thread.fsindex = FS_TLS_SEL;
11156 task->thread.fs = 0;
11157@@ -858,7 +855,7 @@ long do_arch_prctl(struct task_struct *t
11158 if (doit) {
11159 /* set the selector to 0 to not confuse
11160 __switch_to */
11161- asm volatile("movl %0,%%fs" :: "r" (0));
11162+ loadsegment(fs, 0);
11163 ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
11164 addr);
11165 }
11166@@ -882,7 +879,7 @@ long do_arch_prctl(struct task_struct *t
11167 if (task->thread.gsindex == GS_TLS_SEL)
11168 base = read_32bit_tls(task, GS_TLS);
11169 else if (doit) {
11170- asm("movl %%gs,%0" : "=r" (gsindex));
11171+ savesegment(gs, gsindex);
11172 if (gsindex)
11173 rdmsrl(MSR_KERNEL_GS_BASE, base);
11174 else
11175Index: head-2008-12-01/arch/x86/kernel/quirks-xen.c
11176===================================================================
11177--- head-2008-12-01.orig/arch/x86/kernel/quirks-xen.c 2008-12-01 11:37:10.000000000 +0100
11178+++ head-2008-12-01/arch/x86/kernel/quirks-xen.c 2008-12-01 11:49:07.000000000 +0100
11179@@ -63,6 +63,7 @@ static enum {
11180 ICH_FORCE_HPET_RESUME,
11181 VT8237_FORCE_HPET_RESUME,
11182 NVIDIA_FORCE_HPET_RESUME,
11183+ ATI_FORCE_HPET_RESUME,
11184 } force_hpet_resume_type;
11185
11186 static void __iomem *rcba_base;
11187@@ -156,6 +157,8 @@ static void ich_force_enable_hpet(struct
11188
11189 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
11190 ich_force_enable_hpet);
11191+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
11192+ ich_force_enable_hpet);
11193 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
11194 ich_force_enable_hpet);
11195 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
11196@@ -172,6 +175,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
11197
11198 static struct pci_dev *cached_dev;
11199
11200+static void hpet_print_force_info(void)
11201+{
11202+ printk(KERN_INFO "HPET not enabled in BIOS. "
11203+ "You might try hpet=force boot option\n");
11204+}
11205+
11206 static void old_ich_force_hpet_resume(void)
11207 {
11208 u32 val;
11209@@ -251,8 +260,12 @@ static void old_ich_force_enable_hpet_us
11210 {
11211 if (hpet_force_user)
11212 old_ich_force_enable_hpet(dev);
11213+ else
11214+ hpet_print_force_info();
11215 }
11216
11217+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
11218+ old_ich_force_enable_hpet_user);
11219 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
11220 old_ich_force_enable_hpet_user);
11221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
11222@@ -288,9 +301,14 @@ static void vt8237_force_enable_hpet(str
11223 {
11224 u32 uninitialized_var(val);
11225
11226- if (!hpet_force_user || hpet_address || force_hpet_address)
11227+ if (hpet_address || force_hpet_address)
11228 return;
11229
11230+ if (!hpet_force_user) {
11231+ hpet_print_force_info();
11232+ return;
11233+ }
11234+
11235 pci_read_config_dword(dev, 0x68, &val);
11236 /*
11237 * Bit 7 is HPET enable bit.
11238@@ -328,6 +346,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_V
11239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
11240 vt8237_force_enable_hpet);
11241
11242+static void ati_force_hpet_resume(void)
11243+{
11244+ pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
11245+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
11246+}
11247+
11248+static void ati_force_enable_hpet(struct pci_dev *dev)
11249+{
11250+ u32 uninitialized_var(val);
11251+
11252+ if (hpet_address || force_hpet_address)
11253+ return;
11254+
11255+ if (!hpet_force_user) {
11256+ hpet_print_force_info();
11257+ return;
11258+ }
11259+
11260+ pci_write_config_dword(dev, 0x14, 0xfed00000);
11261+ pci_read_config_dword(dev, 0x14, &val);
11262+ force_hpet_address = val;
11263+ force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
11264+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
11265+ force_hpet_address);
11266+ cached_dev = dev;
11267+ return;
11268+}
11269+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
11270+ ati_force_enable_hpet);
11271+
11272 /*
11273 * Undocumented chipset feature taken from LinuxBIOS.
11274 */
11275@@ -341,8 +389,13 @@ static void nvidia_force_enable_hpet(str
11276 {
11277 u32 uninitialized_var(val);
11278
11279- if (!hpet_force_user || hpet_address || force_hpet_address)
11280+ if (hpet_address || force_hpet_address)
11281+ return;
11282+
11283+ if (!hpet_force_user) {
11284+ hpet_print_force_info();
11285 return;
11286+ }
11287
11288 pci_write_config_dword(dev, 0x44, 0xfed00001);
11289 pci_read_config_dword(dev, 0x44, &val);
11290@@ -395,6 +448,9 @@ void force_hpet_resume(void)
11291 case NVIDIA_FORCE_HPET_RESUME:
11292 nvidia_force_hpet_resume();
11293 return;
11294+ case ATI_FORCE_HPET_RESUME:
11295+ ati_force_hpet_resume();
11296+ return;
11297 default:
11298 break;
11299 }
11300Index: head-2008-12-01/arch/x86/kernel/setup-xen.c
11301===================================================================
11302--- head-2008-12-01.orig/arch/x86/kernel/setup-xen.c 2008-12-01 11:44:55.000000000 +0100
11303+++ head-2008-12-01/arch/x86/kernel/setup-xen.c 2008-12-01 11:49:07.000000000 +0100
11304@@ -1,141 +1,1147 @@
11305-#include <linux/kernel.h>
11306+/*
11307+ * Copyright (C) 1995 Linus Torvalds
11308+ *
11309+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11310+ *
11311+ * Memory region support
11312+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
11313+ *
11314+ * Added E820 sanitization routine (removes overlapping memory regions);
11315+ * Brian Moyle <bmoyle@mvista.com>, February 2001
11316+ *
11317+ * Moved CPU detection code to cpu/${cpu}.c
11318+ * Patrick Mochel <mochel@osdl.org>, March 2002
11319+ *
11320+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
11321+ * Alex Achenbach <xela@slit.de>, December 2002.
11322+ *
11323+ */
11324+
11325+/*
11326+ * This file handles the architecture-dependent parts of initialization
11327+ */
11328+
11329+#include <linux/sched.h>
11330+#include <linux/mm.h>
11331+#include <linux/mmzone.h>
11332+#include <linux/screen_info.h>
11333+#include <linux/ioport.h>
11334+#include <linux/acpi.h>
11335+#include <linux/apm_bios.h>
11336+#include <linux/initrd.h>
11337+#include <linux/bootmem.h>
11338+#include <linux/seq_file.h>
11339+#include <linux/console.h>
11340+#include <linux/mca.h>
11341+#include <linux/root_dev.h>
11342+#include <linux/highmem.h>
11343 #include <linux/module.h>
11344+#include <linux/efi.h>
11345 #include <linux/init.h>
11346-#include <linux/bootmem.h>
11347+#include <linux/edd.h>
11348+#include <linux/iscsi_ibft.h>
11349+#include <linux/nodemask.h>
11350+#include <linux/kexec.h>
11351+#include <linux/dmi.h>
11352+#include <linux/pfn.h>
11353+#include <linux/pci.h>
11354+#include <asm/pci-direct.h>
11355+#include <linux/init_ohci1394_dma.h>
11356+#include <linux/kvm_para.h>
11357+
11358+#include <linux/errno.h>
11359+#include <linux/kernel.h>
11360+#include <linux/stddef.h>
11361+#include <linux/unistd.h>
11362+#include <linux/ptrace.h>
11363+#include <linux/slab.h>
11364+#include <linux/user.h>
11365+#include <linux/delay.h>
11366+
11367+#include <linux/kallsyms.h>
11368+#include <linux/cpufreq.h>
11369+#include <linux/dma-mapping.h>
11370+#include <linux/ctype.h>
11371+#include <linux/uaccess.h>
11372+
11373 #include <linux/percpu.h>
11374-#include <asm/smp.h>
11375-#include <asm/percpu.h>
11376+#include <linux/crash_dump.h>
11377+
11378+#include <video/edid.h>
11379+
11380+#include <asm/mtrr.h>
11381+#include <asm/apic.h>
11382+#include <asm/e820.h>
11383+#include <asm/mpspec.h>
11384+#include <asm/setup.h>
11385+#include <asm/arch_hooks.h>
11386+#include <asm/efi.h>
11387 #include <asm/sections.h>
11388+#include <asm/dmi.h>
11389+#include <asm/io_apic.h>
11390+#include <asm/ist.h>
11391+#include <asm/vmi.h>
11392+#include <setup_arch.h>
11393+#include <asm/bios_ebda.h>
11394+#include <asm/cacheflush.h>
11395 #include <asm/processor.h>
11396-#include <asm/setup.h>
11397+#include <asm/bugs.h>
11398+
11399+#include <asm/system.h>
11400+#include <asm/vsyscall.h>
11401+#include <asm/smp.h>
11402+#include <asm/desc.h>
11403+#include <asm/dma.h>
11404+#include <asm/iommu.h>
11405+#include <asm/mmu_context.h>
11406+#include <asm/proto.h>
11407+
11408+#include <mach_apic.h>
11409+#include <asm/paravirt.h>
11410+
11411+#include <asm/percpu.h>
11412 #include <asm/topology.h>
11413-#include <asm/mpspec.h>
11414 #include <asm/apicdef.h>
11415+#ifdef CONFIG_X86_64
11416+#include <asm/numa_64.h>
11417+#endif
11418+
11419+#ifdef CONFIG_XEN
11420+#include <asm/hypervisor.h>
11421+#include <xen/interface/kexec.h>
11422+#include <xen/interface/memory.h>
11423+#include <xen/interface/nmi.h>
11424+#include <xen/interface/physdev.h>
11425+#include <xen/features.h>
11426+#include <xen/firmware.h>
11427+#include <xen/xencons.h>
11428+
11429+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
11430+EXPORT_SYMBOL(HYPERVISOR_shared_info);
11431
11432-#ifdef CONFIG_X86_LOCAL_APIC
11433-unsigned int num_processors;
11434-unsigned disabled_cpus __cpuinitdata;
11435-/* Processor that is doing the boot up */
11436-unsigned int boot_cpu_physical_apicid = -1U;
11437-EXPORT_SYMBOL(boot_cpu_physical_apicid);
11438+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
11439+static struct notifier_block xen_panic_block = {
11440+ xen_panic_event, NULL, 0 /* try to go last */
11441+};
11442
11443-DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
11444-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
11445+unsigned long *phys_to_machine_mapping;
11446+EXPORT_SYMBOL(phys_to_machine_mapping);
11447
11448-/* Bitmask of physically existing CPUs */
11449-physid_mask_t phys_cpu_present_map;
11450+unsigned long *pfn_to_mfn_frame_list_list,
11451+#ifdef CONFIG_X86_64
11452+ *pfn_to_mfn_frame_list[512];
11453+#else
11454+ *pfn_to_mfn_frame_list[128];
11455+#endif
11456+
11457+/* Raw start-of-day parameters from the hypervisor. */
11458+start_info_t *xen_start_info;
11459+EXPORT_SYMBOL(xen_start_info);
11460+#endif
11461+
11462+#ifndef ARCH_SETUP
11463+#define ARCH_SETUP
11464+#endif
11465+
11466+#ifndef CONFIG_XEN
11467+#ifndef CONFIG_DEBUG_BOOT_PARAMS
11468+struct boot_params __initdata boot_params;
11469+#else
11470+struct boot_params boot_params;
11471+#endif
11472 #endif
11473
11474-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
11475 /*
11476- * Copy data used in early init routines from the initial arrays to the
11477- * per cpu data areas. These arrays then become expendable and the
11478- * *_early_ptr's are zeroed indicating that the static arrays are gone.
11479+ * Machine setup..
11480 */
11481-static void __init setup_per_cpu_maps(void)
11482+static struct resource data_resource = {
11483+ .name = "Kernel data",
11484+ .start = 0,
11485+ .end = 0,
11486+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11487+};
11488+
11489+static struct resource code_resource = {
11490+ .name = "Kernel code",
11491+ .start = 0,
11492+ .end = 0,
11493+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11494+};
11495+
11496+static struct resource bss_resource = {
11497+ .name = "Kernel bss",
11498+ .start = 0,
11499+ .end = 0,
11500+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11501+};
11502+
11503+
11504+#ifdef CONFIG_X86_32
11505+#ifndef CONFIG_XEN
11506+/* This value is set up by the early boot code to point to the value
11507+ immediately after the boot time page tables. It contains a *physical*
11508+ address, and must not be in the .bss segment! */
11509+unsigned long init_pg_tables_start __initdata = ~0UL;
11510+unsigned long init_pg_tables_end __initdata = ~0UL;
11511+#endif
11512+
11513+static struct resource video_ram_resource = {
11514+ .name = "Video RAM area",
11515+ .start = 0xa0000,
11516+ .end = 0xbffff,
11517+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11518+};
11519+
11520+/* cpu data as detected by the assembly code in head.S */
11521+struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11522+/* common cpu data for all cpus */
11523+struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11524+EXPORT_SYMBOL(boot_cpu_data);
11525+#ifndef CONFIG_XEN
11526+static void set_mca_bus(int x)
11527 {
11528+#ifdef CONFIG_MCA
11529+ MCA_bus = x;
11530+#endif
11531+}
11532+
11533+unsigned int def_to_bigsmp;
11534+
11535+/* for MCA, but anyone else can use it if they want */
11536+unsigned int machine_id;
11537+unsigned int machine_submodel_id;
11538+unsigned int BIOS_revision;
11539+
11540+struct apm_info apm_info;
11541+EXPORT_SYMBOL(apm_info);
11542+#endif
11543+
11544+#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
11545+struct ist_info ist_info;
11546+EXPORT_SYMBOL(ist_info);
11547+#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
11548+struct ist_info ist_info;
11549+#endif
11550+
11551+#else
11552+struct cpuinfo_x86 boot_cpu_data __read_mostly;
11553+EXPORT_SYMBOL(boot_cpu_data);
11554+#endif
11555+
11556+
11557+#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
11558+unsigned long mmu_cr4_features;
11559+#else
11560+unsigned long mmu_cr4_features = X86_CR4_PAE;
11561+#endif
11562+
11563+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
11564+int bootloader_type;
11565+
11566+/*
11567+ * Early DMI memory
11568+ */
11569+int dmi_alloc_index;
11570+char dmi_alloc_data[DMI_MAX_DATA];
11571+
11572+/*
11573+ * Setup options
11574+ */
11575+struct screen_info screen_info;
11576+EXPORT_SYMBOL(screen_info);
11577+struct edid_info edid_info;
11578+EXPORT_SYMBOL_GPL(edid_info);
11579+
11580+extern int root_mountflags;
11581+
11582+unsigned long saved_video_mode;
11583+
11584+#define RAMDISK_IMAGE_START_MASK 0x07FF
11585+#define RAMDISK_PROMPT_FLAG 0x8000
11586+#define RAMDISK_LOAD_FLAG 0x4000
11587+
11588+static char __initdata command_line[COMMAND_LINE_SIZE];
11589+
11590+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
11591+struct edd edd;
11592+#ifdef CONFIG_EDD_MODULE
11593+EXPORT_SYMBOL(edd);
11594+#endif
11595 #ifndef CONFIG_XEN
11596- int cpu;
11597+/**
11598+ * copy_edd() - Copy the BIOS EDD information
11599+ * from boot_params into a safe place.
11600+ *
11601+ */
11602+static inline void copy_edd(void)
11603+{
11604+ memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
11605+ sizeof(edd.mbr_signature));
11606+ memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
11607+ edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
11608+ edd.edd_info_nr = boot_params.eddbuf_entries;
11609+}
11610+#endif
11611+#else
11612+static inline void copy_edd(void)
11613+{
11614+}
11615+#endif
11616+
11617+#ifdef CONFIG_BLK_DEV_INITRD
11618+
11619+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11620+
11621+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
11622+static void __init relocate_initrd(void)
11623+{
11624+
11625+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11626+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11627+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11628+ u64 ramdisk_here;
11629+ unsigned long slop, clen, mapaddr;
11630+ char *p, *q;
11631+
11632+ /* We need to move the initrd down into lowmem */
11633+ ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
11634+ PAGE_SIZE);
11635+
11636+ if (ramdisk_here == -1ULL)
11637+ panic("Cannot find place for new RAMDISK of size %lld\n",
11638+ ramdisk_size);
11639+
11640+	/* Note: this includes all the lowmem currently occupied by
11641+	   the initrd; we rely on that fact to keep the data intact. */
11642+ reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
11643+ "NEW RAMDISK");
11644+ initrd_start = ramdisk_here + PAGE_OFFSET;
11645+ initrd_end = initrd_start + ramdisk_size;
11646+ printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
11647+ ramdisk_here, ramdisk_here + ramdisk_size);
11648+
11649+ q = (char *)initrd_start;
11650+
11651+ /* Copy any lowmem portion of the initrd */
11652+ if (ramdisk_image < end_of_lowmem) {
11653+ clen = end_of_lowmem - ramdisk_image;
11654+ p = (char *)__va(ramdisk_image);
11655+ memcpy(q, p, clen);
11656+ q += clen;
11657+ ramdisk_image += clen;
11658+ ramdisk_size -= clen;
11659+ }
11660+
11661+ /* Copy the highmem portion of the initrd */
11662+ while (ramdisk_size) {
11663+ slop = ramdisk_image & ~PAGE_MASK;
11664+ clen = ramdisk_size;
11665+ if (clen > MAX_MAP_CHUNK-slop)
11666+ clen = MAX_MAP_CHUNK-slop;
11667+ mapaddr = ramdisk_image & PAGE_MASK;
11668+ p = early_ioremap(mapaddr, clen+slop);
11669+ memcpy(q, p+slop, clen);
11670+ early_iounmap(p, clen+slop);
11671+ q += clen;
11672+ ramdisk_image += clen;
11673+ ramdisk_size -= clen;
11674+ }
11675+	/* high pages are not converted by early_res_to_bootmem */
11676+ ramdisk_image = boot_params.hdr.ramdisk_image;
11677+ ramdisk_size = boot_params.hdr.ramdisk_size;
11678+ printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
11679+ " %08llx - %08llx\n",
11680+ ramdisk_image, ramdisk_image + ramdisk_size - 1,
11681+ ramdisk_here, ramdisk_here + ramdisk_size - 1);
11682+}
11683+#endif
11684+
11685+static void __init reserve_initrd(void)
11686+{
11687+#ifndef CONFIG_XEN
11688+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11689+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11690+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
11691+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11692
11693- for_each_possible_cpu(cpu) {
11694- per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
11695- per_cpu(x86_bios_cpu_apicid, cpu) =
11696- x86_bios_cpu_apicid_init[cpu];
11697-#ifdef CONFIG_NUMA
11698- per_cpu(x86_cpu_to_node_map, cpu) =
11699- x86_cpu_to_node_map_init[cpu];
11700+ if (!boot_params.hdr.type_of_loader ||
11701+ !ramdisk_image || !ramdisk_size)
11702+ return; /* No initrd provided by bootloader */
11703+#else
11704+ unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
11705+ unsigned long ramdisk_size = xen_start_info->mod_len;
11706+ unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
11707+ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11708+
11709+ if (!xen_start_info->mod_start || !ramdisk_size)
11710+ return; /* No initrd provided by bootloader */
11711 #endif
11712+
11713+ initrd_start = 0;
11714+
11715+ if (ramdisk_size >= (end_of_lowmem>>1)) {
11716+ free_early(ramdisk_image, ramdisk_end);
11717+ printk(KERN_ERR "initrd too large to handle, "
11718+ "disabling initrd\n");
11719+ return;
11720 }
11721
11722- /* indicate the early static arrays will soon be gone */
11723- x86_cpu_to_apicid_early_ptr = NULL;
11724- x86_bios_cpu_apicid_early_ptr = NULL;
11725-#ifdef CONFIG_NUMA
11726- x86_cpu_to_node_map_early_ptr = NULL;
11727+ printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
11728+ ramdisk_end);
11729+
11730+
11731+ if (ramdisk_end <= end_of_lowmem) {
11732+ /* All in lowmem, easy case */
11733+ /*
11734+ * don't need to reserve again, already reserved early
11735+ * in i386_start_kernel
11736+ */
11737+ initrd_start = ramdisk_image + PAGE_OFFSET;
11738+ initrd_end = initrd_start + ramdisk_size;
11739+#ifdef CONFIG_X86_64_XEN
11740+ initrd_below_start_ok = 1;
11741 #endif
11742+ return;
11743+ }
11744+
11745+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11746+ relocate_initrd();
11747+#else
11748+ printk(KERN_ERR "initrd extends beyond end of memory "
11749+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
11750+ ramdisk_end, end_of_lowmem);
11751+ initrd_start = 0;
11752 #endif
11753+ free_early(ramdisk_image, ramdisk_end);
11754 }
11755+#else
11756+static void __init reserve_initrd(void)
11757+{
11758+}
11759+#endif /* CONFIG_BLK_DEV_INITRD */
11760
11761-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
11762-cpumask_t *cpumask_of_cpu_map __read_mostly;
11763-EXPORT_SYMBOL(cpumask_of_cpu_map);
11764+static void __init parse_setup_data(void)
11765+{
11766+#ifndef CONFIG_XEN
11767+ struct setup_data *data;
11768+ u64 pa_data;
11769+
11770+ if (boot_params.hdr.version < 0x0209)
11771+ return;
11772+ pa_data = boot_params.hdr.setup_data;
11773+ while (pa_data) {
11774+ data = early_ioremap(pa_data, PAGE_SIZE);
11775+ switch (data->type) {
11776+ case SETUP_E820_EXT:
11777+ parse_e820_ext(data, pa_data);
11778+ break;
11779+ default:
11780+ break;
11781+ }
11782+ pa_data = data->next;
11783+ early_iounmap(data, PAGE_SIZE);
11784+ }
11785+#endif
11786+}
11787
11788-/* requires nr_cpu_ids to be initialized */
11789-static void __init setup_cpumask_of_cpu(void)
11790+static void __init e820_reserve_setup_data(void)
11791 {
11792- int i;
11793+#ifndef CONFIG_XEN
11794+ struct setup_data *data;
11795+ u64 pa_data;
11796+ int found = 0;
11797
11798- /* alloc_bootmem zeroes memory */
11799- cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
11800- for (i = 0; i < nr_cpu_ids; i++)
11801- cpu_set(i, cpumask_of_cpu_map[i]);
11802+ if (boot_params.hdr.version < 0x0209)
11803+ return;
11804+ pa_data = boot_params.hdr.setup_data;
11805+ while (pa_data) {
11806+ data = early_ioremap(pa_data, sizeof(*data));
11807+ e820_update_range(pa_data, sizeof(*data)+data->len,
11808+ E820_RAM, E820_RESERVED_KERN);
11809+ found = 1;
11810+ pa_data = data->next;
11811+ early_iounmap(data, sizeof(*data));
11812+ }
11813+ if (!found)
11814+ return;
11815+
11816+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
11817+ memcpy(&e820_saved, &e820, sizeof(struct e820map));
11818+ printk(KERN_INFO "extended physical RAM map:\n");
11819+ e820_print_map("reserve setup_data");
11820+#endif
11821 }
11822-#else
11823-static inline void setup_cpumask_of_cpu(void) { }
11824+
11825+static void __init reserve_early_setup_data(void)
11826+{
11827+#ifndef CONFIG_XEN
11828+ struct setup_data *data;
11829+ u64 pa_data;
11830+ char buf[32];
11831+
11832+ if (boot_params.hdr.version < 0x0209)
11833+ return;
11834+ pa_data = boot_params.hdr.setup_data;
11835+ while (pa_data) {
11836+ data = early_ioremap(pa_data, sizeof(*data));
11837+ sprintf(buf, "setup data %x", data->type);
11838+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
11839+ pa_data = data->next;
11840+ early_iounmap(data, sizeof(*data));
11841+ }
11842 #endif
11843+}
11844
11845-#ifdef CONFIG_X86_32
11846 /*
11847- * Great future not-so-futuristic plan: make i386 and x86_64 do it
11848- * the same way
11849+ * --------- Crashkernel reservation ------------------------------
11850+ */
11851+
11852+#ifdef CONFIG_KEXEC
11853+
11854+#ifndef CONFIG_XEN
11855+/**
11856+ * Reserve @size bytes of crashkernel memory at any suitable offset.
11857+ *
11858+ * @size: Size of the crashkernel memory to reserve.
11859+ * Returns the base address on success, and -1ULL on failure.
11860+ */
11861+unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
11862+{
11863+ const unsigned long long alignment = 16<<20; /* 16M */
11864+ unsigned long long start = 0LL;
11865+
11866+ while (1) {
11867+ int ret;
11868+
11869+ start = find_e820_area(start, ULONG_MAX, size, alignment);
11870+ if (start == -1ULL)
11871+ return start;
11872+
11873+ /* try to reserve it */
11874+ ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
11875+ if (ret >= 0)
11876+ return start;
11877+
11878+ start += alignment;
11879+ }
11880+}
11881+
11882+static inline unsigned long long get_total_mem(void)
11883+{
11884+ unsigned long long total;
11885+
11886+ total = max_low_pfn - min_low_pfn;
11887+#ifdef CONFIG_HIGHMEM
11888+ total += highend_pfn - highstart_pfn;
11889+#endif
11890+
11891+ return total << PAGE_SHIFT;
11892+}
11893+
11894+static void __init reserve_crashkernel(void)
11895+{
11896+ unsigned long long total_mem;
11897+ unsigned long long crash_size, crash_base;
11898+ int ret;
11899+
11900+ total_mem = get_total_mem();
11901+
11902+ ret = parse_crashkernel(boot_command_line, total_mem,
11903+ &crash_size, &crash_base);
11904+ if (ret != 0 || crash_size <= 0)
11905+ return;
11906+
11907+ /* 0 means: find the address automatically */
11908+ if (crash_base <= 0) {
11909+ crash_base = find_and_reserve_crashkernel(crash_size);
11910+ if (crash_base == -1ULL) {
11911+ pr_info("crashkernel reservation failed. "
11912+ "No suitable area found.\n");
11913+ return;
11914+ }
11915+ } else {
11916+ ret = reserve_bootmem_generic(crash_base, crash_size,
11917+ BOOTMEM_EXCLUSIVE);
11918+ if (ret < 0) {
11919+ pr_info("crashkernel reservation failed - "
11920+ "memory is in use\n");
11921+ return;
11922+ }
11923+ }
11924+
11925+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
11926+ "for crashkernel (System RAM: %ldMB)\n",
11927+ (unsigned long)(crash_size >> 20),
11928+ (unsigned long)(crash_base >> 20),
11929+ (unsigned long)(total_mem >> 20));
11930+
11931+ crashk_res.start = crash_base;
11932+ crashk_res.end = crash_base + crash_size - 1;
11933+ insert_resource(&iomem_resource, &crashk_res);
11934+}
11935+#else
11936+#define reserve_crashkernel xen_machine_kexec_setup_resources
11937+#endif
11938+#else
11939+static void __init reserve_crashkernel(void)
11940+{
11941+}
11942+#endif
11943+
11944+static struct resource standard_io_resources[] = {
11945+ { .name = "dma1", .start = 0x00, .end = 0x1f,
11946+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11947+ { .name = "pic1", .start = 0x20, .end = 0x21,
11948+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11949+ { .name = "timer0", .start = 0x40, .end = 0x43,
11950+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11951+ { .name = "timer1", .start = 0x50, .end = 0x53,
11952+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11953+ { .name = "keyboard", .start = 0x60, .end = 0x60,
11954+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11955+ { .name = "keyboard", .start = 0x64, .end = 0x64,
11956+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11957+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
11958+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11959+ { .name = "pic2", .start = 0xa0, .end = 0xa1,
11960+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11961+ { .name = "dma2", .start = 0xc0, .end = 0xdf,
11962+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11963+ { .name = "fpu", .start = 0xf0, .end = 0xff,
11964+ .flags = IORESOURCE_BUSY | IORESOURCE_IO }
11965+};
11966+
11967+static void __init reserve_standard_io_resources(void)
11968+{
11969+ int i;
11970+
11971+ /* Nothing to do if not running in dom0. */
11972+ if (!is_initial_xendomain())
11973+ return;
11974+
11975+ /* request I/O space for devices used on all i[345]86 PCs */
11976+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
11977+ request_resource(&ioport_resource, &standard_io_resources[i]);
11978+
11979+}
11980+
11981+#ifdef CONFIG_PROC_VMCORE
11982+/* elfcorehdr= specifies the location of elf core header
11983+ * stored by the crashed kernel. This option will be passed
11984+ * by kexec loader to the capture kernel.
11985 */
11986-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
11987-EXPORT_SYMBOL(__per_cpu_offset);
11988+static int __init setup_elfcorehdr(char *arg)
11989+{
11990+ char *end;
11991+ if (!arg)
11992+ return -EINVAL;
11993+ elfcorehdr_addr = memparse(arg, &end);
11994+ return end > arg ? 0 : -EINVAL;
11995+}
11996+early_param("elfcorehdr", setup_elfcorehdr);
11997 #endif
11998
11999+static struct x86_quirks default_x86_quirks __initdata;
12000+
12001+struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
12002+
12003 /*
12004- * Great future plan:
12005- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
12006- * Always point %gs to its beginning
12007+ * Determine if we were loaded by an EFI loader. If so, then we have also been
12008+ * passed the efi memmap, systab, etc., so we should use these data structures
12009+ * for initialization. Note, the efi init code path is determined by the
12010+ * global efi_enabled. This allows the same kernel image to be used on existing
12011+ * systems (with a traditional BIOS) as well as on EFI systems.
12012 */
12013-void __init setup_per_cpu_areas(void)
12014+/*
12015+ * setup_arch - architecture-specific boot-time initializations
12016+ *
12017+ * Note: On x86_64, fixmaps are ready for use even before this is called.
12018+ */
12019+
12020+void __init setup_arch(char **cmdline_p)
12021 {
12022- int i, highest_cpu = 0;
12023- unsigned long size;
12024+#ifdef CONFIG_XEN
12025+ unsigned int i;
12026+ unsigned long p2m_pages;
12027+ struct physdev_set_iopl set_iopl;
12028
12029-#ifdef CONFIG_HOTPLUG_CPU
12030- prefill_possible_map();
12031+#ifdef CONFIG_X86_32
12032+ /* Force a quick death if the kernel panics (not domain 0). */
12033+ extern int panic_timeout;
12034+ if (!panic_timeout && !is_initial_xendomain())
12035+ panic_timeout = 1;
12036 #endif
12037
12038- /* Copy section for each CPU (we discard the original) */
12039- size = PERCPU_ENOUGH_ROOM;
12040- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
12041- size);
12042-
12043- for_each_possible_cpu(i) {
12044- char *ptr;
12045-#ifndef CONFIG_NEED_MULTIPLE_NODES
12046- ptr = alloc_bootmem_pages(size);
12047-#else
12048- int node = early_cpu_to_node(i);
12049- if (!node_online(node) || !NODE_DATA(node)) {
12050- ptr = alloc_bootmem_pages(size);
12051- printk(KERN_INFO
12052- "cpu %d has no node or node-local memory\n", i);
12053- }
12054- else
12055- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
12056+ /* Register a call for panic conditions. */
12057+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
12058+
12059+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12060+ VMASST_TYPE_writable_pagetables));
12061+#ifdef CONFIG_X86_32
12062+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12063+ VMASST_TYPE_4gb_segments));
12064+#endif
12065+#endif /* CONFIG_XEN */
12066+
12067+#ifdef CONFIG_X86_32
12068+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
12069+ visws_early_detect();
12070+ pre_setup_arch_hook();
12071+#else
12072+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
12073+#endif
12074+
12075+ early_cpu_init();
12076+ early_ioremap_init();
12077+
12078+#ifndef CONFIG_XEN
12079+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
12080+ screen_info = boot_params.screen_info;
12081+ edid_info = boot_params.edid_info;
12082+#ifdef CONFIG_X86_32
12083+ apm_info.bios = boot_params.apm_bios_info;
12084+ ist_info = boot_params.ist_info;
12085+ if (boot_params.sys_desc_table.length != 0) {
12086+ set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
12087+ machine_id = boot_params.sys_desc_table.table[0];
12088+ machine_submodel_id = boot_params.sys_desc_table.table[1];
12089+ BIOS_revision = boot_params.sys_desc_table.table[2];
12090+ }
12091+#endif
12092+ saved_video_mode = boot_params.hdr.vid_mode;
12093+ bootloader_type = boot_params.hdr.type_of_loader;
12094+
12095+#ifdef CONFIG_BLK_DEV_RAM
12096+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
12097+ rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
12098+ rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
12099+#endif
12100+#ifdef CONFIG_EFI
12101+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
12102+#ifdef CONFIG_X86_32
12103+ "EL32",
12104+#else
12105+ "EL64",
12106+#endif
12107+ 4)) {
12108+ efi_enabled = 1;
12109+ efi_reserve_early();
12110+ }
12111+#endif
12112+#else /* CONFIG_XEN */
12113+#ifdef CONFIG_X86_32
12114+ /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
12115+ properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
12116+ */
12117+ ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
12118+#else
12119+ ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
12120+#endif
12121+ if (is_initial_xendomain()) {
12122+ const struct dom0_vga_console_info *info =
12123+ (void *)((char *)xen_start_info +
12124+ xen_start_info->console.dom0.info_off);
12125+
12126+ dom0_init_screen_info(info,
12127+ xen_start_info->console.dom0.info_size);
12128+ xen_start_info->console.domU.mfn = 0;
12129+ xen_start_info->console.domU.evtchn = 0;
12130+ } else
12131+ screen_info.orig_video_isVGA = 0;
12132+ copy_edid();
12133+#endif /* CONFIG_XEN */
12134+
12135+ ARCH_SETUP
12136+
12137+ setup_memory_map();
12138+ parse_setup_data();
12139+ /* update the e820_saved too */
12140+ e820_reserve_setup_data();
12141+
12142+ copy_edd();
12143+
12144+#ifndef CONFIG_XEN
12145+ if (!boot_params.hdr.root_flags)
12146+ root_mountflags &= ~MS_RDONLY;
12147 #endif
12148- if (!ptr)
12149- panic("Cannot allocate cpu data for CPU %d\n", i);
12150+ init_mm.start_code = (unsigned long) _text;
12151+ init_mm.end_code = (unsigned long) _etext;
12152+ init_mm.end_data = (unsigned long) _edata;
12153+#ifdef CONFIG_X86_32
12154+#ifndef CONFIG_XEN
12155+ init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
12156+#else
12157+ init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
12158+ xen_start_info->nr_pt_frames) << PAGE_SHIFT;
12159+#endif
12160+#else
12161+ init_mm.brk = (unsigned long) &_end;
12162+#endif
12163+
12164+ code_resource.start = virt_to_phys(_text);
12165+ code_resource.end = virt_to_phys(_etext)-1;
12166+ data_resource.start = virt_to_phys(_etext);
12167+ data_resource.end = virt_to_phys(_edata)-1;
12168+ bss_resource.start = virt_to_phys(&__bss_start);
12169+ bss_resource.end = virt_to_phys(&__bss_stop)-1;
12170+
12171+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
12172+ *cmdline_p = command_line;
12173+
12174+ parse_early_param();
12175+
12176 #ifdef CONFIG_X86_64
12177- cpu_pda(i)->data_offset = ptr - __per_cpu_start;
12178+ check_efer();
12179+#endif
12180+
12181+#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
12182+ /*
12183+ * Must be before kernel pagetables are setup
12184+ * or fixmap area is touched.
12185+ */
12186+ vmi_init();
12187+#endif
12188+
12189+ /* after early param, so could get panic from serial */
12190+ reserve_early_setup_data();
12191+
12192+ if (acpi_mps_check()) {
12193+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
12194+ disable_apic = 1;
12195+#endif
12196+ setup_clear_cpu_cap(X86_FEATURE_APIC);
12197+ }
12198+
12199+#ifdef CONFIG_PCI
12200+ if (pci_early_dump_regs)
12201+ early_dump_pci_devices();
12202+#endif
12203+
12204+ finish_e820_parsing();
12205+
12206+#ifdef CONFIG_X86_32
12207+ probe_roms();
12208+#endif
12209+
12210+#ifndef CONFIG_XEN
12211+ /* after parse_early_param, so could debug it */
12212+ insert_resource(&iomem_resource, &code_resource);
12213+ insert_resource(&iomem_resource, &data_resource);
12214+ insert_resource(&iomem_resource, &bss_resource);
12215+
12216+ if (efi_enabled)
12217+ efi_init();
12218+
12219+#ifdef CONFIG_X86_32
12220+ if (ppro_with_ram_bug()) {
12221+ e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
12222+ E820_RESERVED);
12223+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
12224+ printk(KERN_INFO "fixed physical RAM map:\n");
12225+ e820_print_map("bad_ppro");
12226+ }
12227 #else
12228- __per_cpu_offset[i] = ptr - __per_cpu_start;
12229+ early_gart_iommu_check();
12230 #endif
12231- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
12232+#endif /* CONFIG_XEN */
12233
12234- highest_cpu = i;
12235+ /*
12236+ * partially used pages are not usable - thus
12237+ * we are rounding upwards:
12238+ */
12239+ max_pfn = e820_end_of_ram_pfn();
12240+
12241+ /* preallocate 4k for mptable mpc */
12242+ early_reserve_e820_mpc_new();
12243+ /* update e820 for memory not covered by WB MTRRs */
12244+ mtrr_bp_init();
12245+#ifndef CONFIG_XEN
12246+ if (mtrr_trim_uncached_memory(max_pfn))
12247+ max_pfn = e820_end_of_ram_pfn();
12248+#endif
12249+
12250+#ifdef CONFIG_X86_32
12251+ /* max_low_pfn get updated here */
12252+ find_low_pfn_range();
12253+#else
12254+ num_physpages = max_pfn;
12255+ max_mapnr = max_pfn;
12256+
12257+
12258+ /* How many end-of-memory variables you have, grandma! */
12259+ /* need this before calling reserve_initrd */
12260+ if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
12261+ max_low_pfn = e820_end_of_low_ram_pfn();
12262+ else
12263+ max_low_pfn = max_pfn;
12264+
12265+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
12266+#endif
12267+
12268+ /* max_pfn_mapped is updated here */
12269+#ifdef CONFIG_X86_64_XEN
12270+ /*
12271+ * Due to the way initial table space gets calculated on Xen, we have
12272+ * to call init_memory_mapping() with the larger end address first.
12273+ */
12274+ if (max_pfn > max_low_pfn)
12275+ max_pfn_mapped = init_memory_mapping(1UL<<32,
12276+ max_pfn<<PAGE_SHIFT);
12277+ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
12278+ if (max_pfn > max_low_pfn)
12279+		/* can we preserve max_low_pfn? */
12280+ max_low_pfn = max_pfn;
12281+ else
12282+ max_pfn_mapped = max_low_pfn_mapped;
12283+#else
12284+ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
12285+ max_pfn_mapped = max_low_pfn_mapped;
12286+
12287+#ifdef CONFIG_X86_64
12288+ if (max_pfn > max_low_pfn) {
12289+ max_pfn_mapped = init_memory_mapping(1UL<<32,
12290+ max_pfn<<PAGE_SHIFT);
12291+		/* can we preserve max_low_pfn? */
12292+ max_low_pfn = max_pfn;
12293 }
12294+#endif
12295+#endif
12296
12297- nr_cpu_ids = highest_cpu + 1;
12298- printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
12299+ /*
12300+ * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
12301+ */
12302
12303- /* Setup percpu data maps */
12304- setup_per_cpu_maps();
12305+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
12306+ if (init_ohci1394_dma_early)
12307+ init_ohci1394_dma_on_all_controllers();
12308+#endif
12309
12310- /* Setup cpumask_of_cpu map */
12311- setup_cpumask_of_cpu();
12312-}
12313+ reserve_initrd();
12314+
12315+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12316+ vsmp_init();
12317+#endif
12318+
12319+ if (is_initial_xendomain())
12320+ dmi_scan_machine();
12321+
12322+ io_delay_init();
12323+
12324+#ifdef CONFIG_ACPI
12325+ if (!is_initial_xendomain()) {
12326+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
12327+ disable_acpi();
12328+ }
12329+#endif
12330+
12331+ /*
12332+ * Parse the ACPI tables for possible boot-time SMP configuration.
12333+ */
12334+ acpi_boot_table_init();
12335+
12336+#ifdef CONFIG_ACPI_NUMA
12337+ /*
12338+ * Parse SRAT to discover nodes.
12339+ */
12340+ acpi_numa_init();
12341+#endif
12342+
12343+ initmem_init(0, max_pfn);
12344
12345+#ifdef CONFIG_ACPI_SLEEP
12346+ /*
12347+ * Reserve low memory region for sleep support.
12348+ */
12349+ acpi_reserve_bootmem();
12350 #endif
12351+#ifdef CONFIG_X86_FIND_SMP_CONFIG
12352+ /*
12353+ * Find and reserve possible boot-time SMP configuration:
12354+ */
12355+ find_smp_config();
12356+#endif
12357+ reserve_crashkernel();
12358+
12359+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12360+ /*
12361+ * dma32_reserve_bootmem() allocates bootmem which may conflict
12362+ * with the crashkernel command line, so do that after
12363+ * reserve_crashkernel()
12364+ */
12365+ dma32_reserve_bootmem();
12366+#endif
12367+
12368+ reserve_ibft_region();
12369+
12370+#ifdef CONFIG_KVM_CLOCK
12371+ kvmclock_init();
12372+#endif
12373+
12374+ xen_pagetable_setup_start(swapper_pg_dir);
12375+ paging_init();
12376+ xen_pagetable_setup_done(swapper_pg_dir);
12377+ paravirt_post_allocator_init();
12378+
12379+#ifdef CONFIG_X86_64
12380+ map_vsyscall();
12381+#endif
12382+
12383+#ifdef CONFIG_XEN
12384+ p2m_pages = max_pfn;
12385+ if (xen_start_info->nr_pages > max_pfn) {
12386+ /*
12387+ * the max_pfn was shrunk (probably by mem= or highmem=
12388+ * kernel parameter); shrink reservation with the HV
12389+ */
12390+ struct xen_memory_reservation reservation = {
12391+ .address_bits = 0,
12392+ .extent_order = 0,
12393+ .domid = DOMID_SELF
12394+ };
12395+ unsigned int difference;
12396+ int ret;
12397+
12398+ difference = xen_start_info->nr_pages - max_pfn;
12399+
12400+ set_xen_guest_handle(reservation.extent_start,
12401+ ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
12402+ reservation.nr_extents = difference;
12403+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
12404+ &reservation);
12405+ BUG_ON(ret != difference);
12406+ }
12407+ else if (max_pfn > xen_start_info->nr_pages)
12408+ p2m_pages = xen_start_info->nr_pages;
12409+
12410+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
12411+ unsigned long i, j;
12412+ unsigned int k, fpp;
12413+
12414+ /* Make sure we have a large enough P->M table. */
12415+ phys_to_machine_mapping = alloc_bootmem_pages(
12416+ max_pfn * sizeof(unsigned long));
12417+ memset(phys_to_machine_mapping, ~0,
12418+ max_pfn * sizeof(unsigned long));
12419+ memcpy(phys_to_machine_mapping,
12420+ (unsigned long *)xen_start_info->mfn_list,
12421+ p2m_pages * sizeof(unsigned long));
12422+ free_bootmem(
12423+ __pa(xen_start_info->mfn_list),
12424+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
12425+ sizeof(unsigned long))));
12426+
12427+ /*
12428+ * Initialise the list of the frames that specify the list of
12429+ * frames that make up the p2m table. Used by save/restore.
12430+ */
12431+ pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
12432+
12433+ fpp = PAGE_SIZE/sizeof(unsigned long);
12434+ for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
12435+ if (j == fpp)
12436+ j = 0;
12437+ if (j == 0) {
12438+ k++;
12439+ BUG_ON(k>=ARRAY_SIZE(pfn_to_mfn_frame_list));
12440+ pfn_to_mfn_frame_list[k] =
12441+ alloc_bootmem_pages(PAGE_SIZE);
12442+ pfn_to_mfn_frame_list_list[k] =
12443+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
12444+ }
12445+ pfn_to_mfn_frame_list[k][j] =
12446+ virt_to_mfn(&phys_to_machine_mapping[i]);
12447+ }
12448+ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
12449+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
12450+ virt_to_mfn(pfn_to_mfn_frame_list_list);
12451+ }
12452+
12453+ /* Mark all ISA DMA channels in-use - using them wouldn't work. */
12454+ for (i = 0; i < MAX_DMA_CHANNELS; ++i)
12455+ if (i != 4 && request_dma(i, "xen") != 0)
12456+ BUG();
12457+#endif /* CONFIG_XEN */
12458+
12459+#ifdef CONFIG_X86_GENERICARCH
12460+ generic_apic_probe();
12461+#endif
12462+
12463+#ifndef CONFIG_XEN
12464+ early_quirks();
12465+#endif
12466+
12467+ /*
12468+ * Read APIC and some other early information from ACPI tables.
12469+ */
12470+ acpi_boot_init();
12471+
12472+#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
12473+ /*
12474+ * get boot-time SMP configuration:
12475+ */
12476+ if (smp_found_config)
12477+ get_smp_config();
12478+#endif
12479+
12480+ prefill_possible_map();
12481+#ifdef CONFIG_X86_64
12482+ init_cpu_to_node();
12483+#endif
12484+
12485+#ifndef CONFIG_XEN
12486+ init_apic_mappings();
12487+ ioapic_init_mappings();
12488+
12489+ kvm_guest_init();
12490+
12491+ e820_reserve_resources();
12492+ e820_mark_nosave_regions(max_low_pfn);
12493+#else
12494+ if (is_initial_xendomain())
12495+ e820_reserve_resources();
12496+#endif
12497+
12498+#ifdef CONFIG_X86_32
12499+ request_resource(&iomem_resource, &video_ram_resource);
12500+#endif
12501+ reserve_standard_io_resources();
12502+
12503+#ifndef CONFIG_XEN
12504+ e820_setup_gap();
12505+
12506+#ifdef CONFIG_VT
12507+#if defined(CONFIG_VGA_CONSOLE)
12508+ if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
12509+ conswitchp = &vga_con;
12510+#elif defined(CONFIG_DUMMY_CONSOLE)
12511+ conswitchp = &dummy_con;
12512+#endif
12513+#endif
12514+#else /* CONFIG_XEN */
12515+ if (is_initial_xendomain())
12516+ e820_setup_gap();
12517+
12518+ set_iopl.iopl = 1;
12519+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
12520+
12521+#ifdef CONFIG_VT
12522+#ifdef CONFIG_DUMMY_CONSOLE
12523+ conswitchp = &dummy_con;
12524+#endif
12525+#ifdef CONFIG_VGA_CONSOLE
12526+ if (is_initial_xendomain())
12527+ conswitchp = &vga_con;
12528+#endif
12529+#endif
12530+#endif /* CONFIG_XEN */
12531+}
12532+
12533+#ifdef CONFIG_XEN
12534+static int
12535+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
12536+{
12537+ HYPERVISOR_shutdown(SHUTDOWN_crash);
12538+ /* we're never actually going to get here... */
12539+ return NOTIFY_DONE;
12540+}
12541+#endif /* CONFIG_XEN */
12542Index: head-2008-12-01/arch/x86/kernel/setup64-xen.c
12543===================================================================
12544--- head-2008-12-01.orig/arch/x86/kernel/setup64-xen.c 2008-12-01 11:44:55.000000000 +0100
12545+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12546@@ -1,370 +0,0 @@
12547-/*
12548- * X86-64 specific CPU setup.
12549- * Copyright (C) 1995 Linus Torvalds
12550- * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
12551- * See setup.c for older changelog.
12552- *
12553- * Jun Nakajima <jun.nakajima@intel.com>
12554- * Modified for Xen
12555- *
12556- */
12557-#include <linux/init.h>
12558-#include <linux/kernel.h>
12559-#include <linux/sched.h>
12560-#include <linux/string.h>
12561-#include <linux/bootmem.h>
12562-#include <linux/bitops.h>
12563-#include <linux/module.h>
12564-#include <linux/kgdb.h>
12565-#include <asm/pda.h>
12566-#include <asm/pgtable.h>
12567-#include <asm/processor.h>
12568-#include <asm/desc.h>
12569-#include <asm/atomic.h>
12570-#include <asm/mmu_context.h>
12571-#include <asm/smp.h>
12572-#include <asm/i387.h>
12573-#include <asm/percpu.h>
12574-#include <asm/proto.h>
12575-#include <asm/sections.h>
12576-#include <asm/setup.h>
12577-#include <asm/genapic.h>
12578-#ifdef CONFIG_XEN
12579-#include <asm/hypervisor.h>
12580-#endif
12581-
12582-#ifndef CONFIG_DEBUG_BOOT_PARAMS
12583-struct boot_params __initdata boot_params;
12584-#else
12585-struct boot_params boot_params;
12586-#endif
12587-
12588-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
12589-
12590-struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
12591-EXPORT_SYMBOL(_cpu_pda);
12592-struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
12593-
12594-#ifndef CONFIG_X86_NO_IDT
12595-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
12596-#endif
12597-
12598-char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
12599-
12600-unsigned long __supported_pte_mask __read_mostly = ~0UL;
12601-EXPORT_SYMBOL(__supported_pte_mask);
12602-
12603-static int do_not_nx __cpuinitdata = 0;
12604-
12605-/* noexec=on|off
12606-Control non executable mappings for 64bit processes.
12607-
12608-on Enable(default)
12609-off Disable
12610-*/
12611-static int __init nonx_setup(char *str)
12612-{
12613- if (!str)
12614- return -EINVAL;
12615- if (!strncmp(str, "on", 2)) {
12616- __supported_pte_mask |= _PAGE_NX;
12617- do_not_nx = 0;
12618- } else if (!strncmp(str, "off", 3)) {
12619- do_not_nx = 1;
12620- __supported_pte_mask &= ~_PAGE_NX;
12621- }
12622- return 0;
12623-}
12624-early_param("noexec", nonx_setup);
12625-
12626-int force_personality32 = 0;
12627-
12628-/* noexec32=on|off
12629-Control non executable heap for 32bit processes.
12630-To control the stack too use noexec=off
12631-
12632-on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
12633-off PROT_READ implies PROT_EXEC
12634-*/
12635-static int __init nonx32_setup(char *str)
12636-{
12637- if (!strcmp(str, "on"))
12638- force_personality32 &= ~READ_IMPLIES_EXEC;
12639- else if (!strcmp(str, "off"))
12640- force_personality32 |= READ_IMPLIES_EXEC;
12641- return 1;
12642-}
12643-__setup("noexec32=", nonx32_setup);
12644-
12645-#ifdef CONFIG_XEN
12646-static void __init_refok switch_pt(int cpu)
12647-{
12648- if (cpu == 0)
12649- xen_init_pt();
12650- xen_pt_switch(__pa_symbol(init_level4_pgt));
12651- xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
12652-}
12653-#define switch_pt() switch_pt(cpu)
12654-
12655-static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12656-{
12657- unsigned long frames[16];
12658- unsigned long va;
12659- int f;
12660-
12661- for (va = gdt_descr->address, f = 0;
12662- va < gdt_descr->address + gdt_descr->size;
12663- va += PAGE_SIZE, f++) {
12664- frames[f] = virt_to_mfn(va);
12665- make_page_readonly(
12666- (void *)va, XENFEAT_writable_descriptor_tables);
12667- }
12668- if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
12669- sizeof (struct desc_struct)))
12670- BUG();
12671-}
12672-#else
12673-static void switch_pt(void)
12674-{
12675- asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
12676-}
12677-
12678-static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12679-{
12680- load_gdt(gdt_descr);
12681- load_idt(idt_descr);
12682-}
12683-#endif
12684-
12685-void pda_init(int cpu)
12686-{
12687- struct x8664_pda *pda = cpu_pda(cpu);
12688-
12689- /* Setup up data that may be needed in __get_free_pages early */
12690- asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
12691-#ifndef CONFIG_XEN
12692- /* Memory clobbers used to order PDA accessed */
12693- mb();
12694- wrmsrl(MSR_GS_BASE, pda);
12695- mb();
12696-#else
12697- if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
12698- (unsigned long)pda))
12699- BUG();
12700-#endif
12701- pda->cpunumber = cpu;
12702- pda->irqcount = -1;
12703- pda->kernelstack =
12704- (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
12705- pda->active_mm = &init_mm;
12706- pda->mmu_state = 0;
12707-
12708- if (cpu == 0) {
12709- /* others are initialized in smpboot.c */
12710- pda->pcurrent = &init_task;
12711- pda->irqstackptr = boot_cpu_stack;
12712- } else {
12713- pda->irqstackptr = (char *)
12714- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
12715- if (!pda->irqstackptr)
12716- panic("cannot allocate irqstack for cpu %d", cpu);
12717- }
12718-
12719- switch_pt();
12720-
12721- pda->irqstackptr += IRQSTACKSIZE-64;
12722-}
12723-
12724-#ifndef CONFIG_X86_NO_TSS
12725-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
12726-__attribute__((section(".bss.page_aligned")));
12727-#endif
12728-
12729-extern asmlinkage void ignore_sysret(void);
12730-
12731-/* May not be marked __init: used by software suspend */
12732-void syscall_init(void)
12733-{
12734-#ifndef CONFIG_XEN
12735- /*
12736- * LSTAR and STAR live in a bit strange symbiosis.
12737- * They both write to the same internal register. STAR allows to set CS/DS
12738- * but only a 32bit target. LSTAR sets the 64bit rip.
12739- */
12740- wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
12741- wrmsrl(MSR_LSTAR, system_call);
12742- wrmsrl(MSR_CSTAR, ignore_sysret);
12743-
12744- /* Flags to clear on syscall */
12745- wrmsrl(MSR_SYSCALL_MASK,
12746- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
12747-#endif
12748-#ifdef CONFIG_IA32_EMULATION
12749- syscall32_cpu_init ();
12750-#else
12751- {
12752- static const struct callback_register cstar = {
12753- .type = CALLBACKTYPE_syscall32,
12754- .address = (unsigned long)ignore_sysret
12755- };
12756- if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
12757- printk(KERN_WARN "Unable to register CSTAR callback\n");
12758- }
12759-#endif
12760-}
12761-
12762-void __cpuinit check_efer(void)
12763-{
12764- unsigned long efer;
12765-
12766- rdmsrl(MSR_EFER, efer);
12767- if (!(efer & EFER_NX) || do_not_nx) {
12768- __supported_pte_mask &= ~_PAGE_NX;
12769- }
12770-}
12771-
12772-unsigned long kernel_eflags;
12773-
12774-#ifndef CONFIG_X86_NO_TSS
12775-/*
12776- * Copies of the original ist values from the tss are only accessed during
12777- * debugging, no special alignment required.
12778- */
12779-DEFINE_PER_CPU(struct orig_ist, orig_ist);
12780-#endif
12781-
12782-/*
12783- * cpu_init() initializes state that is per-CPU. Some data is already
12784- * initialized (naturally) in the bootstrap process, such as the GDT
12785- * and IDT. We reload them nevertheless, this function acts as a
12786- * 'CPU state barrier', nothing should get across.
12787- * A lot of state is already set up in PDA init.
12788- */
12789-void __cpuinit cpu_init (void)
12790-{
12791- int cpu = stack_smp_processor_id();
12792-#ifndef CONFIG_X86_NO_TSS
12793- struct tss_struct *t = &per_cpu(init_tss, cpu);
12794- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
12795- unsigned long v;
12796- char *estacks = NULL;
12797- unsigned i;
12798-#endif
12799- struct task_struct *me;
12800-
12801- /* CPU 0 is initialised in head64.c */
12802- if (cpu != 0) {
12803- pda_init(cpu);
12804- }
12805-#ifndef CONFIG_X86_NO_TSS
12806- else
12807- estacks = boot_exception_stacks;
12808-#endif
12809-
12810- me = current;
12811-
12812- if (cpu_test_and_set(cpu, cpu_initialized))
12813- panic("CPU#%d already initialized!\n", cpu);
12814-
12815- printk("Initializing CPU#%d\n", cpu);
12816-
12817- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
12818-
12819- /*
12820- * Initialize the per-CPU GDT with the boot GDT,
12821- * and set up the GDT descriptor:
12822- */
12823-#ifndef CONFIG_XEN
12824- if (cpu)
12825- memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
12826-#endif
12827-
12828- cpu_gdt_descr[cpu].size = GDT_SIZE;
12829- cpu_gdt_init(&cpu_gdt_descr[cpu]);
12830-
12831- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
12832- syscall_init();
12833-
12834- wrmsrl(MSR_FS_BASE, 0);
12835- wrmsrl(MSR_KERNEL_GS_BASE, 0);
12836- barrier();
12837-
12838- check_efer();
12839-
12840-#ifndef CONFIG_X86_NO_TSS
12841- /*
12842- * set up and load the per-CPU TSS
12843- */
12844- for (v = 0; v < N_EXCEPTION_STACKS; v++) {
12845- static const unsigned int order[N_EXCEPTION_STACKS] = {
12846- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
12847- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
12848- };
12849- if (cpu) {
12850- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
12851- if (!estacks)
12852- panic("Cannot allocate exception stack %ld %d\n",
12853- v, cpu);
12854- }
12855- estacks += PAGE_SIZE << order[v];
12856- orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
12857- }
12858-
12859- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
12860- /*
12861- * <= is required because the CPU will access up to
12862- * 8 bits beyond the end of the IO permission bitmap.
12863- */
12864- for (i = 0; i <= IO_BITMAP_LONGS; i++)
12865- t->io_bitmap[i] = ~0UL;
12866-#endif
12867-
12868- atomic_inc(&init_mm.mm_count);
12869- me->active_mm = &init_mm;
12870- if (me->mm)
12871- BUG();
12872- enter_lazy_tlb(&init_mm, me);
12873-
12874-#ifndef CONFIG_X86_NO_TSS
12875- set_tss_desc(cpu, t);
12876-#endif
12877-#ifndef CONFIG_XEN
12878- load_TR_desc();
12879-#endif
12880- load_LDT(&init_mm.context);
12881-
12882-#ifdef CONFIG_KGDB
12883- /*
12884- * If the kgdb is connected no debug regs should be altered. This
12885- * is only applicable when KGDB and a KGDB I/O module are built
12886- * into the kernel and you are using early debugging with
12887- * kgdbwait. KGDB will control the kernel HW breakpoint registers.
12888- */
12889- if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
12890- arch_kgdb_ops.correct_hw_break();
12891- else {
12892-#endif
12893- /*
12894- * Clear all 6 debug registers:
12895- */
12896-
12897- set_debugreg(0UL, 0);
12898- set_debugreg(0UL, 1);
12899- set_debugreg(0UL, 2);
12900- set_debugreg(0UL, 3);
12901- set_debugreg(0UL, 6);
12902- set_debugreg(0UL, 7);
12903-#ifdef CONFIG_KGDB
12904- /* If the kgdb is connected no debug regs should be altered. */
12905- }
12906-#endif
12907-
12908- fpu_init();
12909-
12910- asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
12911- if (raw_irqs_disabled())
12912- kernel_eflags &= ~X86_EFLAGS_IF;
12913-
12914- if (is_uv_system())
12915- uv_cpu_init();
12916-}
12917Index: head-2008-12-01/arch/x86/kernel/setup_32-xen.c
12918===================================================================
12919--- head-2008-12-01.orig/arch/x86/kernel/setup_32-xen.c 2008-12-01 11:44:55.000000000 +0100
12920+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12921@@ -1,1151 +0,0 @@
12922-/*
12923- * Copyright (C) 1995 Linus Torvalds
12924- *
12925- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
12926- *
12927- * Memory region support
12928- * David Parsons <orc@pell.chi.il.us>, July-August 1999
12929- *
12930- * Added E820 sanitization routine (removes overlapping memory regions);
12931- * Brian Moyle <bmoyle@mvista.com>, February 2001
12932- *
12933- * Moved CPU detection code to cpu/${cpu}.c
12934- * Patrick Mochel <mochel@osdl.org>, March 2002
12935- *
12936- * Provisions for empty E820 memory regions (reported by certain BIOSes).
12937- * Alex Achenbach <xela@slit.de>, December 2002.
12938- *
12939- */
12940-
12941-/*
12942- * This file handles the architecture-dependent parts of initialization
12943- */
12944-
12945-#include <linux/sched.h>
12946-#include <linux/mm.h>
12947-#include <linux/mmzone.h>
12948-#include <linux/screen_info.h>
12949-#include <linux/ioport.h>
12950-#include <linux/acpi.h>
12951-#include <linux/apm_bios.h>
12952-#include <linux/initrd.h>
12953-#include <linux/bootmem.h>
12954-#include <linux/seq_file.h>
12955-#include <linux/console.h>
12956-#include <linux/mca.h>
12957-#include <linux/root_dev.h>
12958-#include <linux/highmem.h>
12959-#include <linux/module.h>
12960-#include <linux/efi.h>
12961-#include <linux/init.h>
12962-#include <linux/edd.h>
12963-#include <linux/iscsi_ibft.h>
12964-#include <linux/nodemask.h>
12965-#include <linux/kernel.h>
12966-#include <linux/percpu.h>
12967-#include <linux/notifier.h>
12968-#include <linux/kexec.h>
12969-#include <linux/crash_dump.h>
12970-#include <linux/dmi.h>
12971-#include <linux/pfn.h>
12972-#include <linux/pci.h>
12973-#include <linux/init_ohci1394_dma.h>
12974-#include <linux/kvm_para.h>
12975-
12976-#include <video/edid.h>
12977-
12978-#include <asm/mtrr.h>
12979-#include <asm/apic.h>
12980-#include <asm/e820.h>
12981-#include <asm/mpspec.h>
12982-#include <asm/mmzone.h>
12983-#include <asm/setup.h>
12984-#include <asm/arch_hooks.h>
12985-#include <asm/sections.h>
12986-#include <asm/io_apic.h>
12987-#include <asm/ist.h>
12988-#include <asm/io.h>
12989-#include <asm/hypervisor.h>
12990-#include <xen/interface/physdev.h>
12991-#include <xen/interface/memory.h>
12992-#include <xen/features.h>
12993-#include <xen/firmware.h>
12994-#include <xen/xencons.h>
12995-#include <setup_arch.h>
12996-#include <asm/bios_ebda.h>
12997-#include <asm/cacheflush.h>
12998-#include <asm/processor.h>
12999-
13000-#ifdef CONFIG_XEN
13001-#include <xen/interface/kexec.h>
13002-#endif
13003-
13004-static int xen_panic_event(struct notifier_block *, unsigned long, void *);
13005-static struct notifier_block xen_panic_block = {
13006- xen_panic_event, NULL, 0 /* try to go last */
13007-};
13008-
13009-/*
13010- * Machine setup..
13011- */
13012-static struct resource data_resource = {
13013- .name = "Kernel data",
13014- .start = 0,
13015- .end = 0,
13016- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13017-};
13018-
13019-static struct resource code_resource = {
13020- .name = "Kernel code",
13021- .start = 0,
13022- .end = 0,
13023- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13024-};
13025-
13026-static struct resource bss_resource = {
13027- .name = "Kernel bss",
13028- .start = 0,
13029- .end = 0,
13030- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13031-};
13032-
13033-static struct resource video_ram_resource = {
13034- .name = "Video RAM area",
13035- .start = 0xa0000,
13036- .end = 0xbffff,
13037- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13038-};
13039-
13040-static struct resource standard_io_resources[] = { {
13041- .name = "dma1",
13042- .start = 0x0000,
13043- .end = 0x001f,
13044- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13045-}, {
13046- .name = "pic1",
13047- .start = 0x0020,
13048- .end = 0x0021,
13049- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13050-}, {
13051- .name = "timer0",
13052- .start = 0x0040,
13053- .end = 0x0043,
13054- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13055-}, {
13056- .name = "timer1",
13057- .start = 0x0050,
13058- .end = 0x0053,
13059- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13060-}, {
13061- .name = "keyboard",
13062- .start = 0x0060,
13063- .end = 0x0060,
13064- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13065-}, {
13066- .name = "keyboard",
13067- .start = 0x0064,
13068- .end = 0x0064,
13069- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13070-}, {
13071- .name = "dma page reg",
13072- .start = 0x0080,
13073- .end = 0x008f,
13074- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13075-}, {
13076- .name = "pic2",
13077- .start = 0x00a0,
13078- .end = 0x00a1,
13079- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13080-}, {
13081- .name = "dma2",
13082- .start = 0x00c0,
13083- .end = 0x00df,
13084- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13085-}, {
13086- .name = "fpu",
13087- .start = 0x00f0,
13088- .end = 0x00ff,
13089- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13090-} };
13091-
13092-/* cpu data as detected by the assembly code in head.S */
13093-struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13094-/* common cpu data for all cpus */
13095-struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13096-EXPORT_SYMBOL(boot_cpu_data);
13097-
13098-unsigned int def_to_bigsmp;
13099-
13100-#ifndef CONFIG_X86_PAE
13101-unsigned long mmu_cr4_features;
13102-#else
13103-unsigned long mmu_cr4_features = X86_CR4_PAE;
13104-#endif
13105-
13106-/* for MCA, but anyone else can use it if they want */
13107-unsigned int machine_id;
13108-unsigned int machine_submodel_id;
13109-unsigned int BIOS_revision;
13110-
13111-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
13112-int bootloader_type;
13113-
13114-/* user-defined highmem size */
13115-static unsigned int highmem_pages = -1;
13116-
13117-/*
13118- * Setup options
13119- */
13120-struct screen_info screen_info;
13121-EXPORT_SYMBOL(screen_info);
13122-struct apm_info apm_info;
13123-EXPORT_SYMBOL(apm_info);
13124-struct edid_info edid_info;
13125-EXPORT_SYMBOL_GPL(edid_info);
13126-#ifndef CONFIG_XEN
13127-#define copy_edid() (edid_info = boot_params.edid_info)
13128-#endif
13129-struct ist_info ist_info;
13130-#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
13131- defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
13132-EXPORT_SYMBOL(ist_info);
13133-#endif
13134-
13135-extern void early_cpu_init(void);
13136-extern int root_mountflags;
13137-
13138-unsigned long saved_video_mode;
13139-
13140-#define RAMDISK_IMAGE_START_MASK 0x07FF
13141-#define RAMDISK_PROMPT_FLAG 0x8000
13142-#define RAMDISK_LOAD_FLAG 0x4000
13143-
13144-static char __initdata command_line[COMMAND_LINE_SIZE];
13145-
13146-#ifndef CONFIG_DEBUG_BOOT_PARAMS
13147-struct boot_params __initdata boot_params;
13148-#else
13149-struct boot_params boot_params;
13150-#endif
13151-
13152-/*
13153- * Point at the empty zero page to start with. We map the real shared_info
13154- * page as soon as fixmap is up and running.
13155- */
13156-shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
13157-EXPORT_SYMBOL(HYPERVISOR_shared_info);
13158-
13159-unsigned long *phys_to_machine_mapping;
13160-unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
13161-EXPORT_SYMBOL(phys_to_machine_mapping);
13162-
13163-/* Raw start-of-day parameters from the hypervisor. */
13164-start_info_t *xen_start_info;
13165-EXPORT_SYMBOL(xen_start_info);
13166-
13167-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
13168-struct edd edd;
13169-#ifdef CONFIG_EDD_MODULE
13170-EXPORT_SYMBOL(edd);
13171-#endif
13172-#ifndef CONFIG_XEN
13173-/**
13174- * copy_edd() - Copy the BIOS EDD information
13175- * from boot_params into a safe place.
13176- *
13177- */
13178-static inline void copy_edd(void)
13179-{
13180- memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
13181- sizeof(edd.mbr_signature));
13182- memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
13183- edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
13184- edd.edd_info_nr = boot_params.eddbuf_entries;
13185-}
13186-#endif
13187-#else
13188-static inline void copy_edd(void)
13189-{
13190-}
13191-#endif
13192-
13193-int __initdata user_defined_memmap;
13194-
13195-/*
13196- * "mem=nopentium" disables the 4MB page tables.
13197- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
13198- * to <mem>, overriding the bios size.
13199- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
13200- * <start> to <start>+<mem>, overriding the bios size.
13201- *
13202- * HPA tells me bootloaders need to parse mem=, so no new
13203- * option should be mem= [also see Documentation/i386/boot.txt]
13204- */
13205-static int __init parse_mem(char *arg)
13206-{
13207- if (!arg)
13208- return -EINVAL;
13209-
13210- if (strcmp(arg, "nopentium") == 0) {
13211- setup_clear_cpu_cap(X86_FEATURE_PSE);
13212- } else {
13213- /* If the user specifies memory size, we
13214- * limit the BIOS-provided memory map to
13215- * that size. exactmap can be used to specify
13216- * the exact map. mem=number can be used to
13217- * trim the existing memory map.
13218- */
13219- unsigned long long mem_size;
13220-
13221- mem_size = memparse(arg, &arg);
13222- limit_regions(mem_size);
13223- user_defined_memmap = 1;
13224- }
13225- return 0;
13226-}
13227-early_param("mem", parse_mem);
13228-
13229-#ifdef CONFIG_PROC_VMCORE
13230-/* elfcorehdr= specifies the location of elf core header
13231- * stored by the crashed kernel.
13232- */
13233-static int __init parse_elfcorehdr(char *arg)
13234-{
13235- if (!arg)
13236- return -EINVAL;
13237-
13238- elfcorehdr_addr = memparse(arg, &arg);
13239- return 0;
13240-}
13241-early_param("elfcorehdr", parse_elfcorehdr);
13242-#endif /* CONFIG_PROC_VMCORE */
13243-
13244-/*
13245- * highmem=size forces highmem to be exactly 'size' bytes.
13246- * This works even on boxes that have no highmem otherwise.
13247- * This also works to reduce highmem size on bigger boxes.
13248- */
13249-static int __init parse_highmem(char *arg)
13250-{
13251- if (!arg)
13252- return -EINVAL;
13253-
13254- highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
13255- return 0;
13256-}
13257-early_param("highmem", parse_highmem);
13258-
13259-/*
13260- * vmalloc=size forces the vmalloc area to be exactly 'size'
13261- * bytes. This can be used to increase (or decrease) the
13262- * vmalloc area - the default is 128m.
13263- */
13264-static int __init parse_vmalloc(char *arg)
13265-{
13266- if (!arg)
13267- return -EINVAL;
13268-
13269- __VMALLOC_RESERVE = memparse(arg, &arg);
13270- return 0;
13271-}
13272-early_param("vmalloc", parse_vmalloc);
13273-
13274-#ifndef CONFIG_XEN
13275-/*
13276- * reservetop=size reserves a hole at the top of the kernel address space which
13277- * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
13278- * so relocating the fixmap can be done before paging initialization.
13279- */
13280-static int __init parse_reservetop(char *arg)
13281-{
13282- unsigned long address;
13283-
13284- if (!arg)
13285- return -EINVAL;
13286-
13287- address = memparse(arg, &arg);
13288- reserve_top_address(address);
13289- return 0;
13290-}
13291-early_param("reservetop", parse_reservetop);
13292-#endif
13293-
13294-/*
13295- * Determine low and high memory ranges:
13296- */
13297-unsigned long __init find_max_low_pfn(void)
13298-{
13299- unsigned long max_low_pfn;
13300-
13301- max_low_pfn = max_pfn;
13302- if (max_low_pfn > MAXMEM_PFN) {
13303- if (highmem_pages == -1)
13304- highmem_pages = max_pfn - MAXMEM_PFN;
13305- if (highmem_pages + MAXMEM_PFN < max_pfn)
13306- max_pfn = MAXMEM_PFN + highmem_pages;
13307- if (highmem_pages + MAXMEM_PFN > max_pfn) {
13308- printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
13309- highmem_pages = 0;
13310- }
13311- max_low_pfn = MAXMEM_PFN;
13312-#ifndef CONFIG_HIGHMEM
13313- /* Maximum memory usable is what is directly addressable */
13314- printk(KERN_WARNING "Warning only %ldMB will be used.\n",
13315- MAXMEM>>20);
13316- if (max_pfn > MAX_NONPAE_PFN)
13317- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13318- else
13319- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
13320- max_pfn = MAXMEM_PFN;
13321-#else /* !CONFIG_HIGHMEM */
13322-#ifndef CONFIG_HIGHMEM64G
13323- if (max_pfn > MAX_NONPAE_PFN) {
13324- max_pfn = MAX_NONPAE_PFN;
13325- printk(KERN_WARNING "Warning only 4GB will be used.\n");
13326- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13327- }
13328-#endif /* !CONFIG_HIGHMEM64G */
13329-#endif /* !CONFIG_HIGHMEM */
13330- } else {
13331- if (highmem_pages == -1)
13332- highmem_pages = 0;
13333-#ifdef CONFIG_HIGHMEM
13334- if (highmem_pages >= max_pfn) {
13335- printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
13336- highmem_pages = 0;
13337- }
13338- if (highmem_pages) {
13339- if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
13340- printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
13341- highmem_pages = 0;
13342- }
13343- max_low_pfn -= highmem_pages;
13344- }
13345-#else
13346- if (highmem_pages)
13347- printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
13348-#endif
13349- }
13350- return max_low_pfn;
13351-}
13352-
13353-#ifndef CONFIG_XEN
13354-#define BIOS_LOWMEM_KILOBYTES 0x413
13355-
13356-/*
13357- * The BIOS places the EBDA/XBDA at the top of conventional
13358- * memory, and usually decreases the reported amount of
13359- * conventional memory (int 0x12) too. This also contains a
13360- * workaround for Dell systems that neglect to reserve EBDA.
13361- * The same workaround also avoids a problem with the AMD768MPX
13362- * chipset: reserve a page before VGA to prevent PCI prefetch
13363- * into it (errata #56). Usually the page is reserved anyways,
13364- * unless you have no PS/2 mouse plugged in.
13365- */
13366-static void __init reserve_ebda_region(void)
13367-{
13368- unsigned int lowmem, ebda_addr;
13369-
13370- /* To determine the position of the EBDA and the */
13371- /* end of conventional memory, we need to look at */
13372- /* the BIOS data area. In a paravirtual environment */
13373- /* that area is absent. We'll just have to assume */
13374- /* that the paravirt case can handle memory setup */
13375- /* correctly, without our help. */
13376- if (paravirt_enabled())
13377- return;
13378-
13379- /* end of low (conventional) memory */
13380- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
13381- lowmem <<= 10;
13382-
13383- /* start of EBDA area */
13384- ebda_addr = get_bios_ebda();
13385-
13386- /* Fixup: bios puts an EBDA in the top 64K segment */
13387- /* of conventional memory, but does not adjust lowmem. */
13388- if ((lowmem - ebda_addr) <= 0x10000)
13389- lowmem = ebda_addr;
13390-
13391- /* Fixup: bios does not report an EBDA at all. */
13392- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
13393- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
13394- lowmem = 0x9f000;
13395-
13396- /* Paranoia: should never happen, but... */
13397- if ((lowmem == 0) || (lowmem >= 0x100000))
13398- lowmem = 0x9f000;
13399-
13400- /* reserve all memory between lowmem and the 1MB mark */
13401- reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
13402-}
13403-#endif
13404-
13405-#ifndef CONFIG_NEED_MULTIPLE_NODES
13406-static void __init setup_bootmem_allocator(void);
13407-static unsigned long __init setup_memory(void)
13408-{
13409- /*
13410- * partially used pages are not usable - thus
13411- * we are rounding upwards:
13412- */
13413- min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
13414- xen_start_info->nr_pt_frames;
13415-
13416- max_low_pfn = find_max_low_pfn();
13417-
13418-#ifdef CONFIG_HIGHMEM
13419- highstart_pfn = highend_pfn = max_pfn;
13420- if (max_pfn > max_low_pfn) {
13421- highstart_pfn = max_low_pfn;
13422- }
13423- printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
13424- pages_to_mb(highend_pfn - highstart_pfn));
13425- num_physpages = highend_pfn;
13426- high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
13427-#else
13428- num_physpages = max_low_pfn;
13429- high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
13430-#endif
13431-#ifdef CONFIG_FLATMEM
13432- max_mapnr = num_physpages;
13433-#endif
13434- printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
13435- pages_to_mb(max_low_pfn));
13436-
13437- setup_bootmem_allocator();
13438-
13439- return max_low_pfn;
13440-}
13441-
13442-static void __init zone_sizes_init(void)
13443-{
13444- unsigned long max_zone_pfns[MAX_NR_ZONES];
13445- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
13446- max_zone_pfns[ZONE_DMA] =
13447- virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
13448- max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
13449-#ifdef CONFIG_HIGHMEM
13450- max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
13451- add_active_range(0, 0, highend_pfn);
13452-#else
13453- add_active_range(0, 0, max_low_pfn);
13454-#endif
13455-
13456- free_area_init_nodes(max_zone_pfns);
13457-}
13458-#else
13459-extern unsigned long __init setup_memory(void);
13460-extern void zone_sizes_init(void);
13461-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
13462-
13463-static inline unsigned long long get_total_mem(void)
13464-{
13465- unsigned long long total;
13466-
13467- total = max_low_pfn - min_low_pfn;
13468-#ifdef CONFIG_HIGHMEM
13469- total += highend_pfn - highstart_pfn;
13470-#endif
13471-
13472- return total << PAGE_SHIFT;
13473-}
13474-
13475-#ifdef CONFIG_KEXEC
13476-#ifndef CONFIG_XEN
13477-static void __init reserve_crashkernel(void)
13478-{
13479- unsigned long long total_mem;
13480- unsigned long long crash_size, crash_base;
13481- int ret;
13482-
13483- total_mem = get_total_mem();
13484-
13485- ret = parse_crashkernel(boot_command_line, total_mem,
13486- &crash_size, &crash_base);
13487- if (ret == 0 && crash_size > 0) {
13488- if (crash_base > 0) {
13489- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
13490- "for crashkernel (System RAM: %ldMB)\n",
13491- (unsigned long)(crash_size >> 20),
13492- (unsigned long)(crash_base >> 20),
13493- (unsigned long)(total_mem >> 20));
13494-
13495- if (reserve_bootmem(crash_base, crash_size,
13496- BOOTMEM_EXCLUSIVE) < 0) {
13497- printk(KERN_INFO "crashkernel reservation "
13498- "failed - memory is in use\n");
13499- return;
13500- }
13501-
13502- crashk_res.start = crash_base;
13503- crashk_res.end = crash_base + crash_size - 1;
13504- } else
13505- printk(KERN_INFO "crashkernel reservation failed - "
13506- "you have to specify a base address\n");
13507- }
13508-}
13509-#else
13510-#define reserve_crashkernel xen_machine_kexec_setup_resources
13511-#endif
13512-#else
13513-static inline void __init reserve_crashkernel(void)
13514-{}
13515-#endif
13516-
13517-#ifdef CONFIG_BLK_DEV_INITRD
13518-
13519-static bool do_relocate_initrd = false;
13520-
13521-static void __init reserve_initrd(void)
13522-{
13523- unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
13524- unsigned long ramdisk_size = xen_start_info->mod_len;
13525- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
13526- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13527- unsigned long ramdisk_here;
13528-
13529- initrd_start = 0;
13530-
13531- if (!xen_start_info->mod_start || !ramdisk_size)
13532- return; /* No initrd provided by bootloader */
13533-
13534- if (ramdisk_end < ramdisk_image) {
13535- printk(KERN_ERR "initrd wraps around end of memory, "
13536- "disabling initrd\n");
13537- return;
13538- }
13539- if (ramdisk_size >= end_of_lowmem/2) {
13540- printk(KERN_ERR "initrd too large to handle, "
13541- "disabling initrd\n");
13542- return;
13543- }
13544- if (ramdisk_end <= end_of_lowmem) {
13545- /* All in lowmem, easy case */
13546- reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
13547- initrd_start = ramdisk_image + PAGE_OFFSET;
13548- initrd_end = initrd_start+ramdisk_size;
13549- return;
13550- }
13551-
13552- /* We need to move the initrd down into lowmem */
13553- ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
13554-
13555- /* Note: this includes all the lowmem currently occupied by
13556- the initrd, we rely on that fact to keep the data intact. */
13557- reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
13558- initrd_start = ramdisk_here + PAGE_OFFSET;
13559- initrd_end = initrd_start + ramdisk_size;
13560-
13561- do_relocate_initrd = true;
13562-}
13563-
13564-#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
13565-
13566-static void __init relocate_initrd(void)
13567-{
13568- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
13569- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
13570- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13571- unsigned long ramdisk_here;
13572- unsigned long slop, clen, mapaddr;
13573- char *p, *q;
13574-
13575- if (!do_relocate_initrd)
13576- return;
13577-
13578- ramdisk_here = initrd_start - PAGE_OFFSET;
13579-
13580- q = (char *)initrd_start;
13581-
13582- /* Copy any lowmem portion of the initrd */
13583- if (ramdisk_image < end_of_lowmem) {
13584- clen = end_of_lowmem - ramdisk_image;
13585- p = (char *)__va(ramdisk_image);
13586- memcpy(q, p, clen);
13587- q += clen;
13588- ramdisk_image += clen;
13589- ramdisk_size -= clen;
13590- }
13591-
13592- /* Copy the highmem portion of the initrd */
13593- while (ramdisk_size) {
13594- slop = ramdisk_image & ~PAGE_MASK;
13595- clen = ramdisk_size;
13596- if (clen > MAX_MAP_CHUNK-slop)
13597- clen = MAX_MAP_CHUNK-slop;
13598- mapaddr = ramdisk_image & PAGE_MASK;
13599- p = early_ioremap(mapaddr, clen+slop);
13600- memcpy(q, p+slop, clen);
13601- early_iounmap(p, clen+slop);
13602- q += clen;
13603- ramdisk_image += clen;
13604- ramdisk_size -= clen;
13605- }
13606-}
13607-
13608-#endif /* CONFIG_BLK_DEV_INITRD */
13609-
13610-void __init setup_bootmem_allocator(void)
13611-{
13612- unsigned long bootmap_size;
13613- /*
13614- * Initialize the boot-time allocator (with low memory only):
13615- */
13616- bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
13617-
13618- register_bootmem_low_pages(max_low_pfn);
13619-
13620- /*
13621- * Reserve the bootmem bitmap itself as well. We do this in two
13622- * steps (first step was init_bootmem()) because this catches
13623- * the (very unlikely) case of us accidentally initializing the
13624- * bootmem allocator with an invalid RAM area.
13625- */
13626- reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
13627- bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
13628- BOOTMEM_DEFAULT);
13629-
13630-#ifndef CONFIG_XEN
13631- /*
13632- * reserve physical page 0 - it's a special BIOS page on many boxes,
13633- * enabling clean reboots, SMP operation, laptop functions.
13634- */
13635- reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
13636-
13637- /* reserve EBDA region */
13638- reserve_ebda_region();
13639-
13640-#ifdef CONFIG_SMP
13641- /*
13642- * But first pinch a few for the stack/trampoline stuff
13643- * FIXME: Don't need the extra page at 4K, but need to fix
13644- * trampoline before removing it. (see the GDT stuff)
13645- */
13646- reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
13647-#endif
13648-#ifdef CONFIG_ACPI_SLEEP
13649- /*
13650- * Reserve low memory region for sleep support.
13651- */
13652- acpi_reserve_bootmem();
13653-#endif
13654-#endif /* !CONFIG_XEN */
13655-
13656-#ifdef CONFIG_BLK_DEV_INITRD
13657- reserve_initrd();
13658-#endif
13659- numa_kva_reserve();
13660- reserve_crashkernel();
13661-
13662- reserve_ibft_region();
13663-}
13664-
13665-/*
13666- * The node 0 pgdat is initialized before all of these because
13667- * it's needed for bootmem. node>0 pgdats have their virtual
13668- * space allocated before the pagetables are in place to access
13669- * them, so they can't be cleared then.
13670- *
13671- * This should all compile down to nothing when NUMA is off.
13672- */
13673-static void __init remapped_pgdat_init(void)
13674-{
13675- int nid;
13676-
13677- for_each_online_node(nid) {
13678- if (nid != 0)
13679- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
13680- }
13681-}
13682-
13683-#ifdef CONFIG_MCA
13684-static void set_mca_bus(int x)
13685-{
13686- MCA_bus = x;
13687-}
13688-#else
13689-static void set_mca_bus(int x) { }
13690-#endif
13691-
13692-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
13693-char * __init __attribute__((weak)) memory_setup(void)
13694-{
13695- return machine_specific_memory_setup();
13696-}
13697-
13698-#ifdef CONFIG_NUMA
13699-/*
13700- * In the golden day, when everything among i386 and x86_64 will be
13701- * integrated, this will not live here
13702- */
13703-void *x86_cpu_to_node_map_early_ptr;
13704-int x86_cpu_to_node_map_init[NR_CPUS] = {
13705- [0 ... NR_CPUS-1] = NUMA_NO_NODE
13706-};
13707-DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
13708-#endif
13709-
13710-/*
13711- * Determine if we were loaded by an EFI loader. If so, then we have also been
13712- * passed the efi memmap, systab, etc., so we should use these data structures
13713- * for initialization. Note, the efi init code path is determined by the
13714- * global efi_enabled. This allows the same kernel image to be used on existing
13715- * systems (with a traditional BIOS) as well as on EFI systems.
13716- */
13717-void __init setup_arch(char **cmdline_p)
13718-{
13719- int i, j, k, fpp;
13720- struct physdev_set_iopl set_iopl;
13721- unsigned long max_low_pfn;
13722- unsigned long p2m_pages;
13723-
13724- /* Force a quick death if the kernel panics (not domain 0). */
13725- extern int panic_timeout;
13726- if (!panic_timeout && !is_initial_xendomain())
13727- panic_timeout = 1;
13728-
13729- /* Register a call for panic conditions. */
13730- atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
13731-
13732- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13733- VMASST_TYPE_4gb_segments));
13734- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13735- VMASST_TYPE_writable_pagetables));
13736-
13737- memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
13738- pre_setup_arch_hook();
13739- early_cpu_init();
13740- early_ioremap_init();
13741-#ifdef CONFIG_SMP
13742- prefill_possible_map();
13743-#endif
13744-
13745-#ifdef CONFIG_EFI
13746- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
13747- "EL32", 4))
13748- efi_enabled = 1;
13749-#endif
13750-
13751- /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
13752- properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
13753- */
13754- ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
13755- screen_info = boot_params.screen_info;
13756- copy_edid();
13757- apm_info.bios = boot_params.apm_bios_info;
13758- ist_info = boot_params.ist_info;
13759- saved_video_mode = boot_params.hdr.vid_mode;
13760- if( boot_params.sys_desc_table.length != 0 ) {
13761- set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
13762- machine_id = boot_params.sys_desc_table.table[0];
13763- machine_submodel_id = boot_params.sys_desc_table.table[1];
13764- BIOS_revision = boot_params.sys_desc_table.table[2];
13765- }
13766- bootloader_type = boot_params.hdr.type_of_loader;
13767-
13768- if (is_initial_xendomain()) {
13769- const struct dom0_vga_console_info *info =
13770- (void *)((char *)xen_start_info +
13771- xen_start_info->console.dom0.info_off);
13772-
13773- dom0_init_screen_info(info,
13774- xen_start_info->console.dom0.info_size);
13775- xen_start_info->console.domU.mfn = 0;
13776- xen_start_info->console.domU.evtchn = 0;
13777- } else
13778- screen_info.orig_video_isVGA = 0;
13779-
13780-#ifdef CONFIG_BLK_DEV_RAM
13781- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
13782- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
13783- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
13784-#endif
13785-
13786- ARCH_SETUP
13787-
13788- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
13789- print_memory_map(memory_setup());
13790-
13791- copy_edd();
13792-
13793- if (!boot_params.hdr.root_flags)
13794- root_mountflags &= ~MS_RDONLY;
13795- init_mm.start_code = (unsigned long) _text;
13796- init_mm.end_code = (unsigned long) _etext;
13797- init_mm.end_data = (unsigned long) _edata;
13798- init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
13799- xen_start_info->nr_pt_frames) << PAGE_SHIFT;
13800-
13801- code_resource.start = virt_to_phys(_text);
13802- code_resource.end = virt_to_phys(_etext)-1;
13803- data_resource.start = virt_to_phys(_etext);
13804- data_resource.end = virt_to_phys(_edata)-1;
13805- bss_resource.start = virt_to_phys(&__bss_start);
13806- bss_resource.end = virt_to_phys(&__bss_stop)-1;
13807-
13808- if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
13809- i = COMMAND_LINE_SIZE;
13810- memcpy(boot_command_line, xen_start_info->cmd_line, i);
13811- boot_command_line[i - 1] = '\0';
13812- parse_early_param();
13813-
13814- if (user_defined_memmap) {
13815- printk(KERN_INFO "user-defined physical RAM map:\n");
13816- print_memory_map("user");
13817- }
13818-
13819- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
13820- *cmdline_p = command_line;
13821-
13822- if (efi_enabled)
13823- efi_init();
13824-
13825- /* update e820 for memory not covered by WB MTRRs */
13826- propagate_e820_map();
13827- mtrr_bp_init();
13828-#ifndef CONFIG_XEN
13829- if (mtrr_trim_uncached_memory(max_pfn))
13830- propagate_e820_map();
13831-#endif
13832-
13833- max_low_pfn = setup_memory();
13834-
13835-#ifdef CONFIG_KVM_CLOCK
13836- kvmclock_init();
13837-#endif
13838-
13839-#ifdef CONFIG_VMI
13840- /*
13841- * Must be after max_low_pfn is determined, and before kernel
13842- * pagetables are setup.
13843- */
13844- vmi_init();
13845-#endif
13846- kvm_guest_init();
13847-
13848- /*
13849- * NOTE: before this point _nobody_ is allowed to allocate
13850- * any memory using the bootmem allocator. Although the
13851- * allocator is now initialised only the first 8Mb of the kernel
13852- * virtual address space has been mapped. All allocations before
13853- * paging_init() has completed must use the alloc_bootmem_low_pages()
13854- * variant (which allocates DMA'able memory) and care must be taken
13855- * not to exceed the 8Mb limit.
13856- */
13857-
13858-#ifdef CONFIG_SMP
13859- smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
13860-#endif
13861- paging_init();
13862-
13863- /*
13864- * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
13865- */
13866-
13867-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
13868- if (init_ohci1394_dma_early)
13869- init_ohci1394_dma_on_all_controllers();
13870-#endif
13871-
13872- remapped_pgdat_init();
13873- sparse_init();
13874- zone_sizes_init();
13875-
13876-#ifdef CONFIG_X86_FIND_SMP_CONFIG
13877- /*
13878- * Find and reserve possible boot-time SMP configuration:
13879- */
13880- find_smp_config();
13881-#endif
13882-
13883- p2m_pages = max_pfn;
13884- if (xen_start_info->nr_pages > max_pfn) {
13885- /*
13886- * the max_pfn was shrunk (probably by mem= or highmem=
13887- * kernel parameter); shrink reservation with the HV
13888- */
13889- struct xen_memory_reservation reservation = {
13890- .address_bits = 0,
13891- .extent_order = 0,
13892- .domid = DOMID_SELF
13893- };
13894- unsigned int difference;
13895- int ret;
13896-
13897- difference = xen_start_info->nr_pages - max_pfn;
13898-
13899- set_xen_guest_handle(reservation.extent_start,
13900- ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
13901- reservation.nr_extents = difference;
13902- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
13903- &reservation);
13904- BUG_ON (ret != difference);
13905- }
13906- else if (max_pfn > xen_start_info->nr_pages)
13907- p2m_pages = xen_start_info->nr_pages;
13908-
13909- /* Make sure we have a correctly sized P->M table. */
13910- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
13911- phys_to_machine_mapping = alloc_bootmem_low_pages(
13912- max_pfn * sizeof(unsigned long));
13913- memset(phys_to_machine_mapping, ~0,
13914- max_pfn * sizeof(unsigned long));
13915- memcpy(phys_to_machine_mapping,
13916- (unsigned long *)xen_start_info->mfn_list,
13917- p2m_pages * sizeof(unsigned long));
13918- free_bootmem(
13919- __pa(xen_start_info->mfn_list),
13920- PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
13921- sizeof(unsigned long))));
13922-
13923- /*
13924- * Initialise the list of the frames that specify the list of
13925- * frames that make up the p2m table. Used by save/restore
13926- */
13927- pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
13928-
13929- fpp = PAGE_SIZE/sizeof(unsigned long);
13930- for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
13931- if ((j % fpp) == 0) {
13932- k++;
13933- BUG_ON(k>=16);
13934- pfn_to_mfn_frame_list[k] =
13935- alloc_bootmem_low_pages(PAGE_SIZE);
13936- pfn_to_mfn_frame_list_list[k] =
13937- virt_to_mfn(pfn_to_mfn_frame_list[k]);
13938- j=0;
13939- }
13940- pfn_to_mfn_frame_list[k][j] =
13941- virt_to_mfn(&phys_to_machine_mapping[i]);
13942- }
13943- HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
13944- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
13945- virt_to_mfn(pfn_to_mfn_frame_list_list);
13946- }
13947-
13948- /* Mark all ISA DMA channels in-use - using them wouldn't work. */
13949- for (i = 0; i < MAX_DMA_CHANNELS; ++i)
13950- if (i != 4 && request_dma(i, "xen") != 0)
13951- BUG();
13952-
13953- /*
13954- * NOTE: at this point the bootmem allocator is fully available.
13955- */
13956-
13957-#ifdef CONFIG_BLK_DEV_INITRD
13958- relocate_initrd();
13959-#endif
13960-
13961- paravirt_post_allocator_init();
13962-
13963- if (is_initial_xendomain())
13964- dmi_scan_machine();
13965-
13966- io_delay_init();
13967-
13968-#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
13969- /*
13970- * setup to use the early static init tables during kernel startup
13971- * X86_SMP will exclude sub-arches that don't deal well with it.
13972- */
13973- x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
13974- x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
13975-#ifdef CONFIG_NUMA
13976- x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
13977-#endif
13978-#endif
13979-
13980-#ifdef CONFIG_X86_GENERICARCH
13981- generic_apic_probe();
13982-#endif
13983-
13984- set_iopl.iopl = 1;
13985- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
13986-
13987-#ifdef CONFIG_ACPI
13988- if (!is_initial_xendomain()) {
13989- printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
13990- acpi_disabled = 1;
13991- acpi_ht = 0;
13992- }
13993-
13994- /*
13995- * Parse the ACPI tables for possible boot-time SMP configuration.
13996- */
13997- acpi_boot_table_init();
13998-#endif
13999-
14000-#ifndef CONFIG_XEN
14001- early_quirks();
14002-#endif
14003-
14004-#ifdef CONFIG_ACPI
14005- acpi_boot_init();
14006-
14007-#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
14008- if (def_to_bigsmp)
14009- printk(KERN_WARNING "More than 8 CPUs detected and "
14010- "CONFIG_X86_PC cannot handle it.\nUse "
14011- "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
14012-#endif
14013-#endif
14014-#ifdef CONFIG_X86_LOCAL_APIC
14015- if (smp_found_config)
14016- get_smp_config();
14017-#endif
14018-
14019- e820_register_memory();
14020- e820_mark_nosave_regions();
14021-
14022- if (is_initial_xendomain()) {
14023-#ifdef CONFIG_VT
14024-#if defined(CONFIG_VGA_CONSOLE)
14025- if (!efi_enabled ||
14026- (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14027- conswitchp = &vga_con;
14028-#elif defined(CONFIG_DUMMY_CONSOLE)
14029- conswitchp = &dummy_con;
14030-#endif
14031-#endif
14032- } else {
14033-#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14034- conswitchp = &dummy_con;
14035-#endif
14036- }
14037-}
14038-
14039-static int
14040-xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14041-{
14042- HYPERVISOR_shutdown(SHUTDOWN_crash);
14043- /* we're never actually going to get here... */
14044- return NOTIFY_DONE;
14045-}
14046-
14047-/*
14048- * Request address space for all standard resources
14049- *
14050- * This is called just before pcibios_init(), which is also a
14051- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
14052- */
14053-static int __init request_standard_resources(void)
14054-{
14055- int i;
14056-
14057- /* Nothing to do if not running in dom0. */
14058- if (!is_initial_xendomain())
14059- return 0;
14060-
14061- printk(KERN_INFO "Setting up standard PCI resources\n");
14062- init_iomem_resources(&code_resource, &data_resource, &bss_resource);
14063-
14064- request_resource(&iomem_resource, &video_ram_resource);
14065-
14066- /* request I/O space for devices used on all i[345]86 PCs */
14067- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14068- request_resource(&ioport_resource, &standard_io_resources[i]);
14069- return 0;
14070-}
14071-
14072-subsys_initcall(request_standard_resources);
14073Index: head-2008-12-01/arch/x86/kernel/setup_64-xen.c
14074===================================================================
14075--- head-2008-12-01.orig/arch/x86/kernel/setup_64-xen.c 2008-12-01 11:44:55.000000000 +0100
14076+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
14077@@ -1,1433 +0,0 @@
14078-/*
14079- * Copyright (C) 1995 Linus Torvalds
14080- */
14081-
14082-/*
14083- * This file handles the architecture-dependent parts of initialization
14084- */
14085-
14086-#include <linux/errno.h>
14087-#include <linux/sched.h>
14088-#include <linux/kernel.h>
14089-#include <linux/mm.h>
14090-#include <linux/stddef.h>
14091-#include <linux/unistd.h>
14092-#include <linux/ptrace.h>
14093-#include <linux/slab.h>
14094-#include <linux/user.h>
14095-#include <linux/screen_info.h>
14096-#include <linux/ioport.h>
14097-#include <linux/delay.h>
14098-#include <linux/init.h>
14099-#include <linux/initrd.h>
14100-#include <linux/highmem.h>
14101-#include <linux/bootmem.h>
14102-#include <linux/module.h>
14103-#include <asm/processor.h>
14104-#include <linux/console.h>
14105-#include <linux/seq_file.h>
14106-#include <linux/crash_dump.h>
14107-#include <linux/root_dev.h>
14108-#include <linux/pci.h>
14109-#include <asm/pci-direct.h>
14110-#include <linux/efi.h>
14111-#include <linux/acpi.h>
14112-#include <linux/kallsyms.h>
14113-#include <linux/edd.h>
14114-#include <linux/iscsi_ibft.h>
14115-#include <linux/mmzone.h>
14116-#include <linux/kexec.h>
14117-#include <linux/cpufreq.h>
14118-#include <linux/dmi.h>
14119-#include <linux/dma-mapping.h>
14120-#include <linux/ctype.h>
14121-#include <linux/sort.h>
14122-#include <linux/uaccess.h>
14123-#include <linux/init_ohci1394_dma.h>
14124-#include <linux/kvm_para.h>
14125-
14126-#include <asm/mtrr.h>
14127-#include <asm/uaccess.h>
14128-#include <asm/system.h>
14129-#include <asm/vsyscall.h>
14130-#include <asm/io.h>
14131-#include <asm/smp.h>
14132-#include <asm/msr.h>
14133-#include <asm/desc.h>
14134-#include <video/edid.h>
14135-#include <asm/e820.h>
14136-#include <asm/dma.h>
14137-#include <asm/gart.h>
14138-#include <asm/mpspec.h>
14139-#include <asm/mmu_context.h>
14140-#include <asm/proto.h>
14141-#include <asm/setup.h>
14142-#include <asm/numa.h>
14143-#include <asm/sections.h>
14144-#include <asm/dmi.h>
14145-#include <asm/cacheflush.h>
14146-#include <asm/mce.h>
14147-#include <asm/ds.h>
14148-#include <asm/topology.h>
14149-#include <asm/pat.h>
14150-
14151-#include <mach_apic.h>
14152-#ifdef CONFIG_XEN
14153-#include <linux/percpu.h>
14154-#include <xen/interface/physdev.h>
14155-#include "setup_arch_pre.h"
14156-#include <asm/hypervisor.h>
14157-#include <xen/interface/nmi.h>
14158-#include <xen/features.h>
14159-#include <xen/firmware.h>
14160-#include <xen/xencons.h>
14161-#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
14162-#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
14163-#include <asm/mach-xen/setup_arch_post.h>
14164-#include <xen/interface/memory.h>
14165-
14166-#ifdef CONFIG_XEN
14167-#include <xen/interface/kexec.h>
14168-#endif
14169-
14170-extern unsigned long start_pfn;
14171-extern struct edid_info edid_info;
14172-
14173-shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
14174-EXPORT_SYMBOL(HYPERVISOR_shared_info);
14175-
14176-static int xen_panic_event(struct notifier_block *, unsigned long, void *);
14177-static struct notifier_block xen_panic_block = {
14178- xen_panic_event, NULL, 0 /* try to go last */
14179-};
14180-
14181-unsigned long *phys_to_machine_mapping;
14182-unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
14183-
14184-EXPORT_SYMBOL(phys_to_machine_mapping);
14185-
14186-DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
14187-DEFINE_PER_CPU(int, nr_multicall_ents);
14188-
14189-/* Raw start-of-day parameters from the hypervisor. */
14190-start_info_t *xen_start_info;
14191-EXPORT_SYMBOL(xen_start_info);
14192-#endif
14193-
14194-/*
14195- * Machine setup..
14196- */
14197-
14198-struct cpuinfo_x86 boot_cpu_data __read_mostly;
14199-EXPORT_SYMBOL(boot_cpu_data);
14200-
14201-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
14202-
14203-unsigned long mmu_cr4_features;
14204-
14205-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
14206-int bootloader_type;
14207-
14208-unsigned long saved_video_mode;
14209-
14210-int force_mwait __cpuinitdata;
14211-
14212-/*
14213- * Early DMI memory
14214- */
14215-int dmi_alloc_index;
14216-char dmi_alloc_data[DMI_MAX_DATA];
14217-
14218-/*
14219- * Setup options
14220- */
14221-struct screen_info screen_info;
14222-EXPORT_SYMBOL(screen_info);
14223-struct sys_desc_table_struct {
14224- unsigned short length;
14225- unsigned char table[0];
14226-};
14227-
14228-struct edid_info edid_info;
14229-EXPORT_SYMBOL_GPL(edid_info);
14230-
14231-extern int root_mountflags;
14232-
14233-char __initdata command_line[COMMAND_LINE_SIZE];
14234-
14235-static struct resource standard_io_resources[] = {
14236- { .name = "dma1", .start = 0x00, .end = 0x1f,
14237- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14238- { .name = "pic1", .start = 0x20, .end = 0x21,
14239- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14240- { .name = "timer0", .start = 0x40, .end = 0x43,
14241- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14242- { .name = "timer1", .start = 0x50, .end = 0x53,
14243- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14244- { .name = "keyboard", .start = 0x60, .end = 0x60,
14245- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14246- { .name = "keyboard", .start = 0x64, .end = 0x64,
14247- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14248- { .name = "dma page reg", .start = 0x80, .end = 0x8f,
14249- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14250- { .name = "pic2", .start = 0xa0, .end = 0xa1,
14251- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14252- { .name = "dma2", .start = 0xc0, .end = 0xdf,
14253- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14254- { .name = "fpu", .start = 0xf0, .end = 0xff,
14255- .flags = IORESOURCE_BUSY | IORESOURCE_IO }
14256-};
14257-
14258-#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
14259-
14260-static struct resource data_resource = {
14261- .name = "Kernel data",
14262- .start = 0,
14263- .end = 0,
14264- .flags = IORESOURCE_RAM,
14265-};
14266-static struct resource code_resource = {
14267- .name = "Kernel code",
14268- .start = 0,
14269- .end = 0,
14270- .flags = IORESOURCE_RAM,
14271-};
14272-static struct resource bss_resource = {
14273- .name = "Kernel bss",
14274- .start = 0,
14275- .end = 0,
14276- .flags = IORESOURCE_RAM,
14277-};
14278-
14279-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
14280-
14281-#ifdef CONFIG_PROC_VMCORE
14282-/* elfcorehdr= specifies the location of elf core header
14283- * stored by the crashed kernel. This option will be passed
14284- * by kexec loader to the capture kernel.
14285- */
14286-static int __init setup_elfcorehdr(char *arg)
14287-{
14288- char *end;
14289- if (!arg)
14290- return -EINVAL;
14291- elfcorehdr_addr = memparse(arg, &end);
14292- return end > arg ? 0 : -EINVAL;
14293-}
14294-early_param("elfcorehdr", setup_elfcorehdr);
14295-#endif
14296-
14297-#ifndef CONFIG_NUMA
14298-static void __init
14299-contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
14300-{
14301- unsigned long bootmap_size, bootmap;
14302-
14303- bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
14304- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
14305- PAGE_SIZE);
14306- if (bootmap == -1L)
14307- panic("Cannot find bootmem map of size %ld\n", bootmap_size);
14308- bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
14309- e820_register_active_regions(0, start_pfn, end_pfn);
14310-#ifdef CONFIG_XEN
14311- free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
14312- early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
14313-#else
14314- free_bootmem_with_active_regions(0, end_pfn);
14315- early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
14316-#endif
14317- reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
14318-}
14319-#endif
14320-
14321-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
14322-struct edd edd;
14323-#ifdef CONFIG_EDD_MODULE
14324-EXPORT_SYMBOL(edd);
14325-#endif
14326-#ifndef CONFIG_XEN
14327-/**
14328- * copy_edd() - Copy the BIOS EDD information
14329- * from boot_params into a safe place.
14330- *
14331- */
14332-static inline void copy_edd(void)
14333-{
14334- memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
14335- sizeof(edd.mbr_signature));
14336- memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
14337- edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
14338- edd.edd_info_nr = boot_params.eddbuf_entries;
14339-}
14340-#endif
14341-#else
14342-static inline void copy_edd(void)
14343-{
14344-}
14345-#endif
14346-
14347-#ifdef CONFIG_KEXEC
14348-#ifndef CONFIG_XEN
14349-static void __init reserve_crashkernel(void)
14350-{
14351- unsigned long long total_mem;
14352- unsigned long long crash_size, crash_base;
14353- int ret;
14354-
14355- total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
14356-
14357- ret = parse_crashkernel(boot_command_line, total_mem,
14358- &crash_size, &crash_base);
14359- if (ret == 0 && crash_size) {
14360- if (crash_base <= 0) {
14361- printk(KERN_INFO "crashkernel reservation failed - "
14362- "you have to specify a base address\n");
14363- return;
14364- }
14365-
14366- if (reserve_bootmem(crash_base, crash_size,
14367- BOOTMEM_EXCLUSIVE) < 0) {
14368- printk(KERN_INFO "crashkernel reservation failed - "
14369- "memory is in use\n");
14370- return;
14371- }
14372-
14373- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
14374- "for crashkernel (System RAM: %ldMB)\n",
14375- (unsigned long)(crash_size >> 20),
14376- (unsigned long)(crash_base >> 20),
14377- (unsigned long)(total_mem >> 20));
14378- crashk_res.start = crash_base;
14379- crashk_res.end = crash_base + crash_size - 1;
14380- insert_resource(&iomem_resource, &crashk_res);
14381- }
14382-}
14383-#else
14384-#define reserve_crashkernel xen_machine_kexec_setup_resources
14385-#endif
14386-#else
14387-static inline void __init reserve_crashkernel(void)
14388-{}
14389-#endif
14390-
14391-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
14392-void __attribute__((weak)) __init memory_setup(void)
14393-{
14394- machine_specific_memory_setup();
14395-}
14396-
14397-static void __init parse_setup_data(void)
14398-{
14399- struct setup_data *data;
14400- unsigned long pa_data;
14401-
14402- if (boot_params.hdr.version < 0x0209)
14403- return;
14404- pa_data = boot_params.hdr.setup_data;
14405- while (pa_data) {
14406- data = early_ioremap(pa_data, PAGE_SIZE);
14407- switch (data->type) {
14408- default:
14409- break;
14410- }
14411-#ifndef CONFIG_DEBUG_BOOT_PARAMS
14412- free_early(pa_data, pa_data+sizeof(*data)+data->len);
14413-#endif
14414- pa_data = data->next;
14415- early_iounmap(data, PAGE_SIZE);
14416- }
14417-}
14418-
14419-#ifdef CONFIG_PCI_MMCONFIG
14420-extern void __cpuinit fam10h_check_enable_mmcfg(void);
14421-extern void __init check_enable_amd_mmconf_dmi(void);
14422-#else
14423-void __cpuinit fam10h_check_enable_mmcfg(void)
14424-{
14425-}
14426-void __init check_enable_amd_mmconf_dmi(void)
14427-{
14428-}
14429-#endif
14430-
14431-/*
14432- * setup_arch - architecture-specific boot-time initializations
14433- *
14434- * Note: On x86_64, fixmaps are ready for use even before this is called.
14435- */
14436-void __init setup_arch(char **cmdline_p)
14437-{
14438- unsigned i;
14439-
14440-#ifdef CONFIG_XEN
14441- extern struct e820map machine_e820;
14442-
14443- printk(KERN_INFO "Command line: %s\n", boot_command_line);
14444-
14445- /* Register a call for panic conditions. */
14446- atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
14447-
14448- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
14449- VMASST_TYPE_writable_pagetables));
14450-
14451- early_ioremap_init();
14452-
14453- ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
14454- screen_info = boot_params.screen_info;
14455-
14456- if (is_initial_xendomain()) {
14457- const struct dom0_vga_console_info *info =
14458- (void *)((char *)xen_start_info +
14459- xen_start_info->console.dom0.info_off);
14460-
14461- dom0_init_screen_info(info,
14462- xen_start_info->console.dom0.info_size);
14463- xen_start_info->console.domU.mfn = 0;
14464- xen_start_info->console.domU.evtchn = 0;
14465- } else
14466- screen_info.orig_video_isVGA = 0;
14467-
14468- copy_edid();
14469-#else
14470- printk(KERN_INFO "Command line: %s\n", boot_command_line);
14471-
14472- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
14473- screen_info = boot_params.screen_info;
14474- edid_info = boot_params.edid_info;
14475-#endif /* !CONFIG_XEN */
14476- saved_video_mode = boot_params.hdr.vid_mode;
14477- bootloader_type = boot_params.hdr.type_of_loader;
14478-
14479-#ifdef CONFIG_BLK_DEV_RAM
14480- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
14481- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
14482- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
14483-#endif
14484-#ifdef CONFIG_EFI
14485- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
14486- "EL64", 4))
14487- efi_enabled = 1;
14488-#endif
14489-
14490- ARCH_SETUP
14491-
14492- memory_setup();
14493- copy_edd();
14494-
14495- if (!boot_params.hdr.root_flags)
14496- root_mountflags &= ~MS_RDONLY;
14497- init_mm.start_code = (unsigned long) &_text;
14498- init_mm.end_code = (unsigned long) &_etext;
14499- init_mm.end_data = (unsigned long) &_edata;
14500- init_mm.brk = (unsigned long) &_end;
14501-
14502- code_resource.start = virt_to_phys(&_text);
14503- code_resource.end = virt_to_phys(&_etext)-1;
14504- data_resource.start = virt_to_phys(&_etext);
14505- data_resource.end = virt_to_phys(&_edata)-1;
14506- bss_resource.start = virt_to_phys(&__bss_start);
14507- bss_resource.end = virt_to_phys(&__bss_stop)-1;
14508-
14509- early_identify_cpu(&boot_cpu_data);
14510-
14511- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
14512- *cmdline_p = command_line;
14513-
14514- parse_setup_data();
14515-
14516- parse_early_param();
14517-
14518-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
14519- if (init_ohci1394_dma_early)
14520- init_ohci1394_dma_on_all_controllers();
14521-#endif
14522-
14523- finish_e820_parsing();
14524-
14525-#ifndef CONFIG_XEN
14526- /* after parse_early_param, so could debug it */
14527- insert_resource(&iomem_resource, &code_resource);
14528- insert_resource(&iomem_resource, &data_resource);
14529- insert_resource(&iomem_resource, &bss_resource);
14530-#endif
14531-
14532- early_gart_iommu_check();
14533-
14534- e820_register_active_regions(0, 0, -1UL);
14535- /*
14536- * partially used pages are not usable - thus
14537- * we are rounding upwards:
14538- */
14539- end_pfn = e820_end_of_ram();
14540- /* update e820 for memory not covered by WB MTRRs */
14541- mtrr_bp_init();
14542-#ifndef CONFIG_XEN
14543- if (mtrr_trim_uncached_memory(end_pfn)) {
14544- e820_register_active_regions(0, 0, -1UL);
14545- end_pfn = e820_end_of_ram();
14546- }
14547-#endif
14548-
14549- num_physpages = end_pfn;
14550- max_mapnr = end_pfn;
14551-
14552- check_efer();
14553-
14554- max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
14555- if (efi_enabled)
14556- efi_init();
14557-
14558-#ifndef CONFIG_XEN
14559- vsmp_init();
14560-#endif
14561-
14562- if (is_initial_xendomain())
14563- dmi_scan_machine();
14564-
14565- io_delay_init();
14566-
14567-#ifdef CONFIG_KVM_CLOCK
14568- kvmclock_init();
14569-#endif
14570-
14571-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
14572- /* setup to use the early static init tables during kernel startup */
14573- x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
14574- x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
14575-#ifdef CONFIG_NUMA
14576- x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
14577-#endif
14578-#endif
14579-
14580- /* How many end-of-memory variables you have, grandma! */
14581- max_low_pfn = end_pfn;
14582- max_pfn = end_pfn;
14583- high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
14584-
14585- /* Remove active ranges so rediscovery with NUMA-awareness happens */
14586- remove_all_active_ranges();
14587-
14588-#ifdef CONFIG_ACPI_NUMA
14589- /*
14590- * Parse SRAT to discover nodes.
14591- */
14592- acpi_numa_init();
14593-#endif
14594-
14595-#ifdef CONFIG_NUMA
14596- numa_initmem_init(0, end_pfn);
14597-#else
14598- contig_initmem_init(0, end_pfn);
14599-#endif
14600-
14601-#ifndef CONFIG_XEN
14602- dma32_reserve_bootmem();
14603-
14604-#ifdef CONFIG_ACPI_SLEEP
14605- /*
14606- * Reserve low memory region for sleep support.
14607- */
14608- acpi_reserve_bootmem();
14609-#endif
14610-
14611- if (efi_enabled)
14612- efi_reserve_bootmem();
14613-#endif
14614-
14615-#ifdef CONFIG_BLK_DEV_INITRD
14616-#ifdef CONFIG_XEN
14617- if (xen_start_info->mod_start) {
14618- unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
14619- unsigned long ramdisk_size = xen_start_info->mod_len;
14620-#else
14621- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
14622- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
14623- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
14624-#endif
14625- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
14626- unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
14627-
14628- if (ramdisk_end <= end_of_mem) {
14629- /*
14630- * don't need to reserve again, already reserved early
14631- * in x86_64_start_kernel, and early_res_to_bootmem
14632- * convert that to reserved in bootmem
14633- */
14634- initrd_start = ramdisk_image + PAGE_OFFSET;
14635- initrd_end = initrd_start+ramdisk_size;
14636-#ifdef CONFIG_XEN
14637- initrd_below_start_ok = 1;
14638-#endif
14639- } else {
14640- free_bootmem(ramdisk_image, ramdisk_size);
14641- printk(KERN_ERR "initrd extends beyond end of memory "
14642- "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
14643- ramdisk_end, end_of_mem);
14644- initrd_start = 0;
14645- }
14646- }
14647-#endif
14648- reserve_crashkernel();
14649-
14650- reserve_ibft_region();
14651-
14652- paging_init();
14653- map_vsyscall();
14654-#ifdef CONFIG_X86_LOCAL_APIC
14655- /*
14656- * Find and reserve possible boot-time SMP configuration:
14657- */
14658- find_smp_config();
14659-#endif
14660-#ifdef CONFIG_XEN
14661- {
14662- int i, j, k, fpp;
14663- unsigned long p2m_pages;
14664-
14665- p2m_pages = end_pfn;
14666- if (xen_start_info->nr_pages > end_pfn) {
14667- /*
14668- * the end_pfn was shrunk (probably by mem= or highmem=
14669- * kernel parameter); shrink reservation with the HV
14670- */
14671- struct xen_memory_reservation reservation = {
14672- .address_bits = 0,
14673- .extent_order = 0,
14674- .domid = DOMID_SELF
14675- };
14676- unsigned int difference;
14677- int ret;
14678-
14679- difference = xen_start_info->nr_pages - end_pfn;
14680-
14681- set_xen_guest_handle(reservation.extent_start,
14682- ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
14683- reservation.nr_extents = difference;
14684- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
14685- &reservation);
14686- BUG_ON (ret != difference);
14687- }
14688- else if (end_pfn > xen_start_info->nr_pages)
14689- p2m_pages = xen_start_info->nr_pages;
14690-
14691- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
14692- /* Make sure we have a large enough P->M table. */
14693- phys_to_machine_mapping = alloc_bootmem_pages(
14694- end_pfn * sizeof(unsigned long));
14695- memset(phys_to_machine_mapping, ~0,
14696- end_pfn * sizeof(unsigned long));
14697- memcpy(phys_to_machine_mapping,
14698- (unsigned long *)xen_start_info->mfn_list,
14699- p2m_pages * sizeof(unsigned long));
14700- free_bootmem(
14701- __pa(xen_start_info->mfn_list),
14702- PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
14703- sizeof(unsigned long))));
14704-
14705- /*
14706- * Initialise the list of the frames that specify the
14707- * list of frames that make up the p2m table. Used by
14708- * save/restore.
14709- */
14710- pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
14711-
14712- fpp = PAGE_SIZE/sizeof(unsigned long);
14713- for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
14714- if ((j % fpp) == 0) {
14715- k++;
14716- BUG_ON(k>=fpp);
14717- pfn_to_mfn_frame_list[k] =
14718- alloc_bootmem_pages(PAGE_SIZE);
14719- pfn_to_mfn_frame_list_list[k] =
14720- virt_to_mfn(pfn_to_mfn_frame_list[k]);
14721- j=0;
14722- }
14723- pfn_to_mfn_frame_list[k][j] =
14724- virt_to_mfn(&phys_to_machine_mapping[i]);
14725- }
14726- HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
14727- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
14728- virt_to_mfn(pfn_to_mfn_frame_list_list);
14729- }
14730-
14731- /* Mark all ISA DMA channels in-use - using them wouldn't work. */
14732- for (i = 0; i < MAX_DMA_CHANNELS; ++i)
14733- if (i != 4 && request_dma(i, "xen") != 0)
14734- BUG();
14735- }
14736-
14737-#ifdef CONFIG_ACPI
14738- if (!is_initial_xendomain()) {
14739- acpi_disabled = 1;
14740- acpi_ht = 0;
14741- }
14742-#endif
14743-#endif
14744-
14745-#ifndef CONFIG_XEN
14746- early_quirks();
14747-#endif
14748-
14749-#ifdef CONFIG_ACPI
14750- /*
14751- * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
14752- * Call this early for SRAT node setup.
14753- */
14754- acpi_boot_table_init();
14755-
14756- /*
14757- * Read APIC and some other early information from ACPI tables.
14758- */
14759- acpi_boot_init();
14760-#endif
14761-
14762- init_cpu_to_node();
14763-
14764-#ifdef CONFIG_X86_LOCAL_APIC
14765- /*
14766- * get boot-time SMP configuration:
14767- */
14768- if (smp_found_config)
14769- get_smp_config();
14770-#ifndef CONFIG_XEN
14771- init_apic_mappings();
14772- ioapic_init_mappings();
14773-#endif
14774-#endif
14775-#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
14776- prefill_possible_map();
14777-#endif
14778-
14779- kvm_guest_init();
14780-
14781- /*
14782- * We trust e820 completely. No explicit ROM probing in memory.
14783- */
14784-#ifdef CONFIG_XEN
14785- if (is_initial_xendomain())
14786- e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
14787-#else
14788- e820_reserve_resources(e820.map, e820.nr_map);
14789- e820_mark_nosave_regions();
14790-#endif
14791-
14792- /* request I/O space for devices used on all i[345]86 PCs */
14793- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14794- request_resource(&ioport_resource, &standard_io_resources[i]);
14795-
14796-#ifdef CONFIG_XEN
14797- if (is_initial_xendomain())
14798- e820_setup_gap(machine_e820.map, machine_e820.nr_map);
14799-#else
14800- e820_setup_gap(e820.map, e820.nr_map);
14801-#endif
14802-
14803-#ifdef CONFIG_XEN
14804- {
14805- struct physdev_set_iopl set_iopl;
14806-
14807- set_iopl.iopl = 1;
14808- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
14809-
14810- if (is_initial_xendomain()) {
14811-#ifdef CONFIG_VT
14812-#if defined(CONFIG_VGA_CONSOLE)
14813- conswitchp = &vga_con;
14814-#elif defined(CONFIG_DUMMY_CONSOLE)
14815- conswitchp = &dummy_con;
14816-#endif
14817-#endif
14818- } else {
14819-#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14820- conswitchp = &dummy_con;
14821-#endif
14822- }
14823- }
14824-#else /* CONFIG_XEN */
14825-
14826-#ifdef CONFIG_VT
14827-#if defined(CONFIG_VGA_CONSOLE)
14828- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14829- conswitchp = &vga_con;
14830-#elif defined(CONFIG_DUMMY_CONSOLE)
14831- conswitchp = &dummy_con;
14832-#endif
14833-#endif
14834-
14835-#endif /* !CONFIG_XEN */
14836-
14837- /* do this before identify_cpu for boot cpu */
14838- check_enable_amd_mmconf_dmi();
14839-}
14840-
14841-#ifdef CONFIG_XEN
14842-static int
14843-xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14844-{
14845- HYPERVISOR_shutdown(SHUTDOWN_crash);
14846- /* we're never actually going to get here... */
14847- return NOTIFY_DONE;
14848-}
14849-#endif /* !CONFIG_XEN */
14850-
14851-
14852-static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
14853-{
14854- unsigned int *v;
14855-
14856- if (c->extended_cpuid_level < 0x80000004)
14857- return 0;
14858-
14859- v = (unsigned int *) c->x86_model_id;
14860- cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
14861- cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
14862- cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
14863- c->x86_model_id[48] = 0;
14864- return 1;
14865-}
14866-
14867-
14868-static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
14869-{
14870- unsigned int n, dummy, eax, ebx, ecx, edx;
14871-
14872- n = c->extended_cpuid_level;
14873-
14874- if (n >= 0x80000005) {
14875- cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
14876- printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
14877- "D cache %dK (%d bytes/line)\n",
14878- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
14879- c->x86_cache_size = (ecx>>24) + (edx>>24);
14880- /* On K8 L1 TLB is inclusive, so don't count it */
14881- c->x86_tlbsize = 0;
14882- }
14883-
14884- if (n >= 0x80000006) {
14885- cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
14886- ecx = cpuid_ecx(0x80000006);
14887- c->x86_cache_size = ecx >> 16;
14888- c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
14889-
14890- printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
14891- c->x86_cache_size, ecx & 0xFF);
14892- }
14893- if (n >= 0x80000008) {
14894- cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
14895- c->x86_virt_bits = (eax >> 8) & 0xff;
14896- c->x86_phys_bits = eax & 0xff;
14897- }
14898-}
14899-
14900-#ifdef CONFIG_NUMA
14901-static int __cpuinit nearby_node(int apicid)
14902-{
14903- int i, node;
14904-
14905- for (i = apicid - 1; i >= 0; i--) {
14906- node = apicid_to_node[i];
14907- if (node != NUMA_NO_NODE && node_online(node))
14908- return node;
14909- }
14910- for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
14911- node = apicid_to_node[i];
14912- if (node != NUMA_NO_NODE && node_online(node))
14913- return node;
14914- }
14915- return first_node(node_online_map); /* Shouldn't happen */
14916-}
14917-#endif
14918-
14919-/*
14920- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
14921- * Assumes number of cores is a power of two.
14922- */
14923-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
14924-{
14925-#ifdef CONFIG_SMP
14926- unsigned bits;
14927-#ifdef CONFIG_NUMA
14928- int cpu = smp_processor_id();
14929- int node = 0;
14930- unsigned apicid = hard_smp_processor_id();
14931-#endif
14932- bits = c->x86_coreid_bits;
14933-
14934- /* Low order bits define the core id (index of core in socket) */
14935- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
14936- /* Convert the initial APIC ID into the socket ID */
14937- c->phys_proc_id = c->initial_apicid >> bits;
14938-
14939-#ifdef CONFIG_NUMA
14940- node = c->phys_proc_id;
14941- if (apicid_to_node[apicid] != NUMA_NO_NODE)
14942- node = apicid_to_node[apicid];
14943- if (!node_online(node)) {
14944- /* Two possibilities here:
14945- - The CPU is missing memory and no node was created.
14946- In that case try picking one from a nearby CPU
14947- - The APIC IDs differ from the HyperTransport node IDs
14948- which the K8 northbridge parsing fills in.
14949- Assume they are all increased by a constant offset,
14950- but in the same order as the HT nodeids.
14951- If that doesn't result in a usable node fall back to the
14952- path for the previous case. */
14953-
14954- int ht_nodeid = c->initial_apicid;
14955-
14956- if (ht_nodeid >= 0 &&
14957- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
14958- node = apicid_to_node[ht_nodeid];
14959- /* Pick a nearby node */
14960- if (!node_online(node))
14961- node = nearby_node(apicid);
14962- }
14963- numa_set_node(cpu, node);
14964-
14965- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
14966-#endif
14967-#endif
14968-}
14969-
14970-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
14971-{
14972-#ifdef CONFIG_SMP
14973- unsigned bits, ecx;
14974-
14975- /* Multi core CPU? */
14976- if (c->extended_cpuid_level < 0x80000008)
14977- return;
14978-
14979- ecx = cpuid_ecx(0x80000008);
14980-
14981- c->x86_max_cores = (ecx & 0xff) + 1;
14982-
14983- /* CPU telling us the core id bits shift? */
14984- bits = (ecx >> 12) & 0xF;
14985-
14986- /* Otherwise recompute */
14987- if (bits == 0) {
14988- while ((1 << bits) < c->x86_max_cores)
14989- bits++;
14990- }
14991-
14992- c->x86_coreid_bits = bits;
14993-
14994-#endif
14995-}
14996-
14997-#define ENABLE_C1E_MASK 0x18000000
14998-#define CPUID_PROCESSOR_SIGNATURE 1
14999-#define CPUID_XFAM 0x0ff00000
15000-#define CPUID_XFAM_K8 0x00000000
15001-#define CPUID_XFAM_10H 0x00100000
15002-#define CPUID_XFAM_11H 0x00200000
15003-#define CPUID_XMOD 0x000f0000
15004-#define CPUID_XMOD_REV_F 0x00040000
15005-
15006-#ifndef CONFIG_XEN
15007-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
15008-static __cpuinit int amd_apic_timer_broken(void)
15009-{
15010- u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
15011-
15012- switch (eax & CPUID_XFAM) {
15013- case CPUID_XFAM_K8:
15014- if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
15015- break;
15016- case CPUID_XFAM_10H:
15017- case CPUID_XFAM_11H:
15018- rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
15019- if (lo & ENABLE_C1E_MASK)
15020- return 1;
15021- break;
15022- default:
15023- /* err on the side of caution */
15024- return 1;
15025- }
15026- return 0;
15027-}
15028-#endif
15029-
15030-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
15031-{
15032- early_init_amd_mc(c);
15033-
15034- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
15035- if (c->x86_power & (1<<8))
15036- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15037-}
15038-
15039-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
15040-{
15041- unsigned level;
15042-
15043-#ifdef CONFIG_SMP
15044- unsigned long value;
15045-
15046- /*
15047- * Disable TLB flush filter by setting HWCR.FFDIS on K8
15048- * bit 6 of msr C001_0015
15049- *
15050- * Errata 63 for SH-B3 steppings
15051- * Errata 122 for all steppings (F+ have it disabled by default)
15052- */
15053- if (c->x86 == 15) {
15054- rdmsrl(MSR_K8_HWCR, value);
15055- value |= 1 << 6;
15056- wrmsrl(MSR_K8_HWCR, value);
15057- }
15058-#endif
15059-
15060- /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
15061- 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
15062- clear_cpu_cap(c, 0*32+31);
15063-
15064- /* On C+ stepping K8 rep microcode works well for copy/memset */
15065- level = cpuid_eax(1);
15066- if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
15067- level >= 0x0f58))
15068- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15069- if (c->x86 == 0x10 || c->x86 == 0x11)
15070- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15071-
15072- /* Enable workaround for FXSAVE leak */
15073- if (c->x86 >= 6)
15074- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
15075-
15076- level = get_model_name(c);
15077- if (!level) {
15078- switch (c->x86) {
15079- case 15:
15080- /* Should distinguish Models here, but this is only
15081- a fallback anyways. */
15082- strcpy(c->x86_model_id, "Hammer");
15083- break;
15084- }
15085- }
15086- display_cacheinfo(c);
15087-
15088- /* Multi core CPU? */
15089- if (c->extended_cpuid_level >= 0x80000008)
15090- amd_detect_cmp(c);
15091-
15092- if (c->extended_cpuid_level >= 0x80000006 &&
15093- (cpuid_edx(0x80000006) & 0xf000))
15094- num_cache_leaves = 4;
15095- else
15096- num_cache_leaves = 3;
15097-
15098- if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
15099- set_cpu_cap(c, X86_FEATURE_K8);
15100-
15101- /* MFENCE stops RDTSC speculation */
15102- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
15103-
15104- if (c->x86 == 0x10)
15105- fam10h_check_enable_mmcfg();
15106-
15107-#ifndef CONFIG_XEN
15108- if (amd_apic_timer_broken())
15109- disable_apic_timer = 1;
15110-
15111- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
15112- unsigned long long tseg;
15113-
15114- /*
15115- * Split up direct mapping around the TSEG SMM area.
15116- * Don't do it for gbpages because there seems very little
15117- * benefit in doing so.
15118- */
15119- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
15120- (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
15121- set_memory_4k((unsigned long)__va(tseg), 1);
15122- }
15123-#endif
15124-}
15125-
15126-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
15127-{
15128-#ifdef CONFIG_SMP
15129- u32 eax, ebx, ecx, edx;
15130- int index_msb, core_bits;
15131-
15132- cpuid(1, &eax, &ebx, &ecx, &edx);
15133-
15134-
15135- if (!cpu_has(c, X86_FEATURE_HT))
15136- return;
15137- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
15138- goto out;
15139-
15140- smp_num_siblings = (ebx & 0xff0000) >> 16;
15141-
15142- if (smp_num_siblings == 1) {
15143- printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
15144- } else if (smp_num_siblings > 1) {
15145-
15146- if (smp_num_siblings > NR_CPUS) {
15147- printk(KERN_WARNING "CPU: Unsupported number of "
15148- "siblings %d", smp_num_siblings);
15149- smp_num_siblings = 1;
15150- return;
15151- }
15152-
15153- index_msb = get_count_order(smp_num_siblings);
15154- c->phys_proc_id = phys_pkg_id(index_msb);
15155-
15156- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
15157-
15158- index_msb = get_count_order(smp_num_siblings);
15159-
15160- core_bits = get_count_order(c->x86_max_cores);
15161-
15162- c->cpu_core_id = phys_pkg_id(index_msb) &
15163- ((1 << core_bits) - 1);
15164- }
15165-out:
15166- if ((c->x86_max_cores * smp_num_siblings) > 1) {
15167- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
15168- c->phys_proc_id);
15169- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
15170- c->cpu_core_id);
15171- }
15172-
15173-#endif
15174-}
15175-
15176-/*
15177- * find out the number of processor cores on the die
15178- */
15179-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
15180-{
15181- unsigned int eax, t;
15182-
15183- if (c->cpuid_level < 4)
15184- return 1;
15185-
15186- cpuid_count(4, 0, &eax, &t, &t, &t);
15187-
15188- if (eax & 0x1f)
15189- return ((eax >> 26) + 1);
15190- else
15191- return 1;
15192-}
15193-
15194-static void __cpuinit srat_detect_node(void)
15195-{
15196-#ifdef CONFIG_NUMA
15197- unsigned node;
15198- int cpu = smp_processor_id();
15199- int apicid = hard_smp_processor_id();
15200-
15201- /* Don't do the funky fallback heuristics the AMD version employs
15202- for now. */
15203- node = apicid_to_node[apicid];
15204- if (node == NUMA_NO_NODE || !node_online(node))
15205- node = first_node(node_online_map);
15206- numa_set_node(cpu, node);
15207-
15208- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
15209-#endif
15210-}
15211-
15212-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
15213-{
15214- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
15215- (c->x86 == 0x6 && c->x86_model >= 0x0e))
15216- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15217-}
15218-
15219-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
15220-{
15221- /* Cache sizes */
15222- unsigned n;
15223-
15224- init_intel_cacheinfo(c);
15225- if (c->cpuid_level > 9) {
15226- unsigned eax = cpuid_eax(10);
15227- /* Check for version and the number of counters */
15228- if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
15229- set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
15230- }
15231-
15232- if (cpu_has_ds) {
15233- unsigned int l1, l2;
15234- rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
15235- if (!(l1 & (1<<11)))
15236- set_cpu_cap(c, X86_FEATURE_BTS);
15237- if (!(l1 & (1<<12)))
15238- set_cpu_cap(c, X86_FEATURE_PEBS);
15239- }
15240-
15241-
15242- if (cpu_has_bts)
15243- ds_init_intel(c);
15244-
15245- n = c->extended_cpuid_level;
15246- if (n >= 0x80000008) {
15247- unsigned eax = cpuid_eax(0x80000008);
15248- c->x86_virt_bits = (eax >> 8) & 0xff;
15249- c->x86_phys_bits = eax & 0xff;
15250- /* CPUID workaround for Intel 0F34 CPU */
15251- if (c->x86_vendor == X86_VENDOR_INTEL &&
15252- c->x86 == 0xF && c->x86_model == 0x3 &&
15253- c->x86_mask == 0x4)
15254- c->x86_phys_bits = 36;
15255- }
15256-
15257- if (c->x86 == 15)
15258- c->x86_cache_alignment = c->x86_clflush_size * 2;
15259- if (c->x86 == 6)
15260- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15261- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15262- c->x86_max_cores = intel_num_cpu_cores(c);
15263-
15264- srat_detect_node();
15265-}
15266-
15267-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
15268-{
15269- if (c->x86 == 0x6 && c->x86_model >= 0xf)
15270- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15271-}
15272-
15273-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
15274-{
15275- /* Cache sizes */
15276- unsigned n;
15277-
15278- n = c->extended_cpuid_level;
15279- if (n >= 0x80000008) {
15280- unsigned eax = cpuid_eax(0x80000008);
15281- c->x86_virt_bits = (eax >> 8) & 0xff;
15282- c->x86_phys_bits = eax & 0xff;
15283- }
15284-
15285- if (c->x86 == 0x6 && c->x86_model >= 0xf) {
15286- c->x86_cache_alignment = c->x86_clflush_size * 2;
15287- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15288- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15289- }
15290- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15291-}
15292-
15293-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
15294-{
15295- char *v = c->x86_vendor_id;
15296-
15297- if (!strcmp(v, "AuthenticAMD"))
15298- c->x86_vendor = X86_VENDOR_AMD;
15299- else if (!strcmp(v, "GenuineIntel"))
15300- c->x86_vendor = X86_VENDOR_INTEL;
15301- else if (!strcmp(v, "CentaurHauls"))
15302- c->x86_vendor = X86_VENDOR_CENTAUR;
15303- else
15304- c->x86_vendor = X86_VENDOR_UNKNOWN;
15305-}
15306-
15307-/* Do some early cpuid on the boot CPU to get some parameter that are
15308- needed before check_bugs. Everything advanced is in identify_cpu
15309- below. */
15310-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
15311-{
15312- u32 tfms, xlvl;
15313-
15314- c->loops_per_jiffy = loops_per_jiffy;
15315- c->x86_cache_size = -1;
15316- c->x86_vendor = X86_VENDOR_UNKNOWN;
15317- c->x86_model = c->x86_mask = 0; /* So far unknown... */
15318- c->x86_vendor_id[0] = '\0'; /* Unset */
15319- c->x86_model_id[0] = '\0'; /* Unset */
15320- c->x86_clflush_size = 64;
15321- c->x86_cache_alignment = c->x86_clflush_size;
15322- c->x86_max_cores = 1;
15323- c->x86_coreid_bits = 0;
15324- c->extended_cpuid_level = 0;
15325- memset(&c->x86_capability, 0, sizeof c->x86_capability);
15326-
15327- /* Get vendor name */
15328- cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
15329- (unsigned int *)&c->x86_vendor_id[0],
15330- (unsigned int *)&c->x86_vendor_id[8],
15331- (unsigned int *)&c->x86_vendor_id[4]);
15332-
15333- get_cpu_vendor(c);
15334-
15335- /* Initialize the standard set of capabilities */
15336- /* Note that the vendor-specific code below might override */
15337-
15338- /* Intel-defined flags: level 0x00000001 */
15339- if (c->cpuid_level >= 0x00000001) {
15340- __u32 misc;
15341- cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
15342- &c->x86_capability[0]);
15343- c->x86 = (tfms >> 8) & 0xf;
15344- c->x86_model = (tfms >> 4) & 0xf;
15345- c->x86_mask = tfms & 0xf;
15346- if (c->x86 == 0xf)
15347- c->x86 += (tfms >> 20) & 0xff;
15348- if (c->x86 >= 0x6)
15349- c->x86_model += ((tfms >> 16) & 0xF) << 4;
15350- if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
15351- c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
15352- } else {
15353- /* Have CPUID level 0 only - unheard of */
15354- c->x86 = 4;
15355- }
15356-
15357- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
15358-#ifdef CONFIG_SMP
15359- c->phys_proc_id = c->initial_apicid;
15360-#endif
15361- /* AMD-defined flags: level 0x80000001 */
15362- xlvl = cpuid_eax(0x80000000);
15363- c->extended_cpuid_level = xlvl;
15364- if ((xlvl & 0xffff0000) == 0x80000000) {
15365- if (xlvl >= 0x80000001) {
15366- c->x86_capability[1] = cpuid_edx(0x80000001);
15367- c->x86_capability[6] = cpuid_ecx(0x80000001);
15368- }
15369- if (xlvl >= 0x80000004)
15370- get_model_name(c); /* Default name */
15371- }
15372-
15373- /* Transmeta-defined flags: level 0x80860001 */
15374- xlvl = cpuid_eax(0x80860000);
15375- if ((xlvl & 0xffff0000) == 0x80860000) {
15376- /* Don't set x86_cpuid_level here for now to not confuse. */
15377- if (xlvl >= 0x80860001)
15378- c->x86_capability[2] = cpuid_edx(0x80860001);
15379- }
15380-
15381- c->extended_cpuid_level = cpuid_eax(0x80000000);
15382- if (c->extended_cpuid_level >= 0x80000007)
15383- c->x86_power = cpuid_edx(0x80000007);
15384-
15385- switch (c->x86_vendor) {
15386- case X86_VENDOR_AMD:
15387- early_init_amd(c);
15388- break;
15389- case X86_VENDOR_INTEL:
15390- early_init_intel(c);
15391- break;
15392- case X86_VENDOR_CENTAUR:
15393- early_init_centaur(c);
15394- break;
15395- }
15396-
15397- validate_pat_support(c);
15398-}
15399-
15400-/*
15401- * This does the hard work of actually picking apart the CPU stuff...
15402- */
15403-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
15404-{
15405- int i;
15406-
15407- early_identify_cpu(c);
15408-
15409- init_scattered_cpuid_features(c);
15410-
15411- c->apicid = phys_pkg_id(0);
15412-
15413- /*
15414- * Vendor-specific initialization. In this section we
15415- * canonicalize the feature flags, meaning if there are
15416- * features a certain CPU supports which CPUID doesn't
15417- * tell us, CPUID claiming incorrect flags, or other bugs,
15418- * we handle them here.
15419- *
15420- * At the end of this section, c->x86_capability better
15421- * indicate the features this CPU genuinely supports!
15422- */
15423- switch (c->x86_vendor) {
15424- case X86_VENDOR_AMD:
15425- init_amd(c);
15426- break;
15427-
15428- case X86_VENDOR_INTEL:
15429- init_intel(c);
15430- break;
15431-
15432- case X86_VENDOR_CENTAUR:
15433- init_centaur(c);
15434- break;
15435-
15436- case X86_VENDOR_UNKNOWN:
15437- default:
15438- display_cacheinfo(c);
15439- break;
15440- }
15441-
15442- detect_ht(c);
15443-
15444- /*
15445- * On SMP, boot_cpu_data holds the common feature set between
15446- * all CPUs; so make sure that we indicate which features are
15447- * common between the CPUs. The first time this routine gets
15448- * executed, c == &boot_cpu_data.
15449- */
15450- if (c != &boot_cpu_data) {
15451- /* AND the already accumulated flags with these */
15452- for (i = 0; i < NCAPINTS; i++)
15453- boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
15454- }
15455-
15456- /* Clear all flags overriden by options */
15457- for (i = 0; i < NCAPINTS; i++)
15458- c->x86_capability[i] &= ~cleared_cpu_caps[i];
15459-
15460-#ifdef CONFIG_X86_MCE
15461- mcheck_init(c);
15462-#endif
15463- select_idle_routine(c);
15464-
15465-#ifdef CONFIG_NUMA
15466- numa_add_cpu(smp_processor_id());
15467-#endif
15468-
15469-}
15470-
15471-void __cpuinit identify_boot_cpu(void)
15472-{
15473- identify_cpu(&boot_cpu_data);
15474-}
15475-
15476-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
15477-{
15478- BUG_ON(c == &boot_cpu_data);
15479- identify_cpu(c);
15480- mtrr_ap_init();
15481-}
15482-
15483-static __init int setup_noclflush(char *arg)
15484-{
15485- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
15486- return 1;
15487-}
15488-__setup("noclflush", setup_noclflush);
15489-
15490-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
15491-{
15492- if (c->x86_model_id[0])
15493- printk(KERN_CONT "%s", c->x86_model_id);
15494-
15495- if (c->x86_mask || c->cpuid_level >= 0)
15496- printk(KERN_CONT " stepping %02x\n", c->x86_mask);
15497- else
15498- printk(KERN_CONT "\n");
15499-}
15500-
15501-static __init int setup_disablecpuid(char *arg)
15502-{
15503- int bit;
15504- if (get_option(&arg, &bit) && bit < NCAPINTS*32)
15505- setup_clear_cpu_cap(bit);
15506- else
15507- return 0;
15508- return 1;
15509-}
15510-__setup("clearcpuid=", setup_disablecpuid);
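For reference, the early_identify_cpu() block removed above derives the CPU family, model and stepping from CPUID leaf 1, applying the extended-family and extended-model adjustments. A minimal stand-alone sketch of the same decode (illustrative only, not part of the patch; assumes GCC's <cpuid.h> helper):

#include <stdio.h>
#include <cpuid.h>	/* __get_cpuid() wrapper around the CPUID instruction */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int family, model, stepping;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;	/* CPUID leaf 1 not available */

	/* Same decode as in early_identify_cpu() above. */
	family   = (eax >> 8) & 0xf;
	model    = (eax >> 4) & 0xf;
	stepping =  eax       & 0xf;
	if (family == 0xf)
		family += (eax >> 20) & 0xff;		/* extended family */
	if (family >= 0x6)
		model  += ((eax >> 16) & 0xf) << 4;	/* extended model */

	printf("family 0x%x model 0x%x stepping 0x%x\n", family, model, stepping);
	return 0;
}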
15511Index: head-2008-12-01/arch/x86/kernel/setup_percpu-xen.c
15512===================================================================
15513--- /dev/null 1970-01-01 00:00:00.000000000 +0000
15514+++ head-2008-12-01/arch/x86/kernel/setup_percpu-xen.c 2008-12-01 11:49:07.000000000 +0100
15515@@ -0,0 +1,385 @@
15516+#include <linux/kernel.h>
15517+#include <linux/module.h>
15518+#include <linux/init.h>
15519+#include <linux/bootmem.h>
15520+#include <linux/percpu.h>
15521+#include <linux/kexec.h>
15522+#include <linux/crash_dump.h>
15523+#include <asm/smp.h>
15524+#include <asm/percpu.h>
15525+#include <asm/sections.h>
15526+#include <asm/processor.h>
15527+#include <asm/setup.h>
15528+#include <asm/topology.h>
15529+#include <asm/mpspec.h>
15530+#include <asm/apicdef.h>
15531+#include <asm/highmem.h>
15532+
15533+#ifdef CONFIG_X86_LOCAL_APIC
15534+unsigned int num_processors;
15535+unsigned disabled_cpus __cpuinitdata;
15536+/* Processor that is doing the boot up */
15537+unsigned int boot_cpu_physical_apicid = -1U;
15538+unsigned int max_physical_apicid;
15539+EXPORT_SYMBOL(boot_cpu_physical_apicid);
15540+
15541+/* Bitmask of physically existing CPUs */
15542+physid_mask_t phys_cpu_present_map;
15543+#endif
15544+
15545+/* map cpu index to physical APIC ID */
15546+#ifndef CONFIG_XEN
15547+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
15548+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
15549+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15550+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
15551+#else
15552+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
15553+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15554+#endif
15555+
15556+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
15557+#define X86_64_NUMA 1
15558+
15559+/* map cpu index to node index */
15560+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
15561+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
15562+
15563+/* which logical CPUs are on which nodes */
15564+cpumask_t *node_to_cpumask_map;
15565+EXPORT_SYMBOL(node_to_cpumask_map);
15566+
15567+/* setup node_to_cpumask_map */
15568+static void __init setup_node_to_cpumask_map(void);
15569+
15570+#else
15571+static inline void setup_node_to_cpumask_map(void) { }
15572+#endif
15573+
15574+#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
15575+/*
15576+ * Copy data used in early init routines from the initial arrays to the
15577+ * per cpu data areas. These arrays then become expendable and the
15578+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
15579+ */
15580+static void __init setup_per_cpu_maps(void)
15581+{
15582+#ifndef CONFIG_XEN
15583+ int cpu;
15584+
15585+ for_each_possible_cpu(cpu) {
15586+ per_cpu(x86_cpu_to_apicid, cpu) =
15587+ early_per_cpu_map(x86_cpu_to_apicid, cpu);
15588+ per_cpu(x86_bios_cpu_apicid, cpu) =
15589+ early_per_cpu_map(x86_bios_cpu_apicid, cpu);
15590+#ifdef X86_64_NUMA
15591+ per_cpu(x86_cpu_to_node_map, cpu) =
15592+ early_per_cpu_map(x86_cpu_to_node_map, cpu);
15593+#endif
15594+ }
15595+
15596+ /* indicate the early static arrays will soon be gone */
15597+ early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
15598+ early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
15599+#ifdef X86_64_NUMA
15600+ early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
15601+#endif
15602+#endif
15603+}
15604+
15605+#ifdef CONFIG_X86_32
15606+/*
15607+ * Great future not-so-futuristic plan: make i386 and x86_64 do it
15608+ * the same way
15609+ */
15610+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
15611+EXPORT_SYMBOL(__per_cpu_offset);
15612+static inline void setup_cpu_pda_map(void) { }
15613+
15614+#elif !defined(CONFIG_SMP)
15615+static inline void setup_cpu_pda_map(void) { }
15616+
15617+#else /* CONFIG_SMP && CONFIG_X86_64 */
15618+
15619+/*
15620+ * Allocate cpu_pda pointer table and array via alloc_bootmem.
15621+ */
15622+static void __init setup_cpu_pda_map(void)
15623+{
15624+ char *pda;
15625+ struct x8664_pda **new_cpu_pda;
15626+ unsigned long size;
15627+ int cpu;
15628+
15629+ size = roundup(sizeof(struct x8664_pda), cache_line_size());
15630+
15631+ /* allocate cpu_pda array and pointer table */
15632+ {
15633+ unsigned long tsize = nr_cpu_ids * sizeof(void *);
15634+ unsigned long asize = size * (nr_cpu_ids - 1);
15635+
15636+ tsize = roundup(tsize, cache_line_size());
15637+ new_cpu_pda = alloc_bootmem(tsize + asize);
15638+ pda = (char *)new_cpu_pda + tsize;
15639+ }
15640+
15641+ /* initialize pointer table to static pda's */
15642+ for_each_possible_cpu(cpu) {
15643+ if (cpu == 0) {
15644+ /* leave boot cpu pda in place */
15645+ new_cpu_pda[0] = cpu_pda(0);
15646+ continue;
15647+ }
15648+ new_cpu_pda[cpu] = (struct x8664_pda *)pda;
15649+ new_cpu_pda[cpu]->in_bootmem = 1;
15650+ pda += size;
15651+ }
15652+
15653+ /* point to new pointer table */
15654+ _cpu_pda = new_cpu_pda;
15655+}
15656+#endif
15657+
15658+/*
15659+ * Great future plan:
15660+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
15661+ * Always point %gs to its beginning
15662+ */
15663+void __init setup_per_cpu_areas(void)
15664+{
15665+ ssize_t size = PERCPU_ENOUGH_ROOM;
15666+ char *ptr;
15667+ int cpu;
15668+
15669+ /* Setup cpu_pda map */
15670+ setup_cpu_pda_map();
15671+
15672+ /* Copy section for each CPU (we discard the original) */
15673+ size = PERCPU_ENOUGH_ROOM;
15674+ printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
15675+ size);
15676+
15677+ for_each_possible_cpu(cpu) {
15678+#ifndef CONFIG_NEED_MULTIPLE_NODES
15679+ ptr = alloc_bootmem_pages(size);
15680+#else
15681+ int node = early_cpu_to_node(cpu);
15682+ if (!node_online(node) || !NODE_DATA(node)) {
15683+ ptr = alloc_bootmem_pages(size);
15684+ printk(KERN_INFO
15685+ "cpu %d has no node %d or node-local memory\n",
15686+ cpu, node);
15687+ }
15688+ else
15689+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
15690+#endif
15691+ per_cpu_offset(cpu) = ptr - __per_cpu_start;
15692+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
15693+
15694+ }
15695+
15696+ printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
15697+ NR_CPUS, nr_cpu_ids, nr_node_ids);
15698+
15699+ /* Setup percpu data maps */
15700+ setup_per_cpu_maps();
15701+
15702+ /* Setup node to cpumask map */
15703+ setup_node_to_cpumask_map();
15704+}
15705+
15706+#endif
15707+
15708+#ifdef X86_64_NUMA
15709+
15710+/*
15711+ * Allocate node_to_cpumask_map based on number of available nodes
15712+ * Requires node_possible_map to be valid.
15713+ *
15714+ * Note: node_to_cpumask() is not valid until after this is done.
15715+ */
15716+static void __init setup_node_to_cpumask_map(void)
15717+{
15718+ unsigned int node, num = 0;
15719+ cpumask_t *map;
15720+
15721+ /* setup nr_node_ids if not done yet */
15722+ if (nr_node_ids == MAX_NUMNODES) {
15723+ for_each_node_mask(node, node_possible_map)
15724+ num = node;
15725+ nr_node_ids = num + 1;
15726+ }
15727+
15728+ /* allocate the map */
15729+ map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
15730+
15731+	pr_debug("Node to cpumask map at %p for %d nodes\n",
15732+		 map, nr_node_ids);
15733+
15734+ /* node_to_cpumask() will now work */
15735+ node_to_cpumask_map = map;
15736+}
15737+
15738+void __cpuinit numa_set_node(int cpu, int node)
15739+{
15740+ int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
15741+
15742+ if (cpu_pda(cpu) && node != NUMA_NO_NODE)
15743+ cpu_pda(cpu)->nodenumber = node;
15744+
15745+ if (cpu_to_node_map)
15746+ cpu_to_node_map[cpu] = node;
15747+
15748+ else if (per_cpu_offset(cpu))
15749+ per_cpu(x86_cpu_to_node_map, cpu) = node;
15750+
15751+ else
15752+ pr_debug("Setting node for non-present cpu %d\n", cpu);
15753+}
15754+
15755+void __cpuinit numa_clear_node(int cpu)
15756+{
15757+ numa_set_node(cpu, NUMA_NO_NODE);
15758+}
15759+
15760+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
15761+
15762+void __cpuinit numa_add_cpu(int cpu)
15763+{
15764+ cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
15765+}
15766+
15767+void __cpuinit numa_remove_cpu(int cpu)
15768+{
15769+ cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
15770+}
15771+
15772+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
15773+
15774+/*
15775+ * --------- debug versions of the numa functions ---------
15776+ */
15777+static void __cpuinit numa_set_cpumask(int cpu, int enable)
15778+{
15779+ int node = cpu_to_node(cpu);
15780+ cpumask_t *mask;
15781+ char buf[64];
15782+
15783+ if (node_to_cpumask_map == NULL) {
15784+ printk(KERN_ERR "node_to_cpumask_map NULL\n");
15785+ dump_stack();
15786+ return;
15787+ }
15788+
15789+ mask = &node_to_cpumask_map[node];
15790+ if (enable)
15791+ cpu_set(cpu, *mask);
15792+ else
15793+ cpu_clear(cpu, *mask);
15794+
15795+ cpulist_scnprintf(buf, sizeof(buf), *mask);
15796+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
15797+ enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
15798+ }
15799+
15800+void __cpuinit numa_add_cpu(int cpu)
15801+{
15802+ numa_set_cpumask(cpu, 1);
15803+}
15804+
15805+void __cpuinit numa_remove_cpu(int cpu)
15806+{
15807+ numa_set_cpumask(cpu, 0);
15808+}
15809+
15810+int cpu_to_node(int cpu)
15811+{
15812+ if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
15813+ printk(KERN_WARNING
15814+ "cpu_to_node(%d): usage too early!\n", cpu);
15815+ dump_stack();
15816+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15817+ }
15818+ return per_cpu(x86_cpu_to_node_map, cpu);
15819+}
15820+EXPORT_SYMBOL(cpu_to_node);
15821+
15822+/*
15823+ * Same function as cpu_to_node() but used if called before the
15824+ * per_cpu areas are setup.
15825+ */
15826+int early_cpu_to_node(int cpu)
15827+{
15828+ if (early_per_cpu_ptr(x86_cpu_to_node_map))
15829+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15830+
15831+ if (!per_cpu_offset(cpu)) {
15832+ printk(KERN_WARNING
15833+ "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
15834+ dump_stack();
15835+ return NUMA_NO_NODE;
15836+ }
15837+ return per_cpu(x86_cpu_to_node_map, cpu);
15838+}
15839+
15840+
15841+/* empty cpumask */
15842+static const cpumask_t cpu_mask_none;
15843+
15844+/*
15845+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
15846+ */
15847+const cpumask_t *_node_to_cpumask_ptr(int node)
15848+{
15849+ if (node_to_cpumask_map == NULL) {
15850+ printk(KERN_WARNING
15851+ "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
15852+ node);
15853+ dump_stack();
15854+ return (const cpumask_t *)&cpu_online_map;
15855+ }
15856+ if (node >= nr_node_ids) {
15857+ printk(KERN_WARNING
15858+ "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
15859+ node, nr_node_ids);
15860+ dump_stack();
15861+ return &cpu_mask_none;
15862+ }
15863+ return &node_to_cpumask_map[node];
15864+}
15865+EXPORT_SYMBOL(_node_to_cpumask_ptr);
15866+
15867+/*
15868+ * Returns a bitmask of CPUs on Node 'node'.
15869+ *
15870+ * Side note: this function creates the returned cpumask on the stack
15871+ * so with a high NR_CPUS count, excessive stack space is used. The
15872+ * node_to_cpumask_ptr function should be used whenever possible.
15873+ */
15874+cpumask_t node_to_cpumask(int node)
15875+{
15876+ if (node_to_cpumask_map == NULL) {
15877+ printk(KERN_WARNING
15878+ "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
15879+ dump_stack();
15880+ return cpu_online_map;
15881+ }
15882+ if (node >= nr_node_ids) {
15883+ printk(KERN_WARNING
15884+ "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
15885+ node, nr_node_ids);
15886+ dump_stack();
15887+ return cpu_mask_none;
15888+ }
15889+ return node_to_cpumask_map[node];
15890+}
15891+EXPORT_SYMBOL(node_to_cpumask);
15892+
15893+/*
15894+ * --------- end of debug versions of the numa functions ---------
15895+ */
15896+
15897+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
15898+
15899+#endif /* X86_64_NUMA */
15900+
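The new setup_per_cpu_areas() above gives every possible CPU its own copy of the .data.percpu template and records the distance to that copy in per_cpu_offset(cpu). Conceptually, a per-CPU variable access then reduces to "template address plus that CPU's offset"; a simplified sketch (illustrative only, not the kernel's actual macro expansion, and my_per_cpu is a hypothetical name):

/* __per_cpu_offset[] is filled in by setup_per_cpu_areas() above. */
extern unsigned long __per_cpu_offset[];

/* Hypothetical helper, for illustration of the offset arithmetic only. */
#define my_per_cpu(var, cpu) \
	(*(typeof(&(var)))((char *)&(var) + __per_cpu_offset[(cpu)]))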
15901Index: head-2008-12-01/arch/x86/kernel/smp-xen.c
15902===================================================================
15903--- head-2008-12-01.orig/arch/x86/kernel/smp-xen.c 2008-12-01 11:44:55.000000000 +0100
15904+++ head-2008-12-01/arch/x86/kernel/smp-xen.c 2008-12-01 11:49:07.000000000 +0100
15905@@ -121,132 +121,14 @@ void xen_smp_send_reschedule(int cpu)
15906 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
15907 }
15908
15909-/*
15910- * Structure and data for smp_call_function(). This is designed to minimise
15911- * static memory requirements. It also looks cleaner.
15912- */
15913-static DEFINE_SPINLOCK(call_lock);
15914-
15915-struct call_data_struct {
15916- void (*func) (void *info);
15917- void *info;
15918- atomic_t started;
15919- atomic_t finished;
15920- int wait;
15921-};
15922-
15923-void lock_ipi_call_lock(void)
15924+void xen_send_call_func_single_ipi(int cpu)
15925 {
15926- spin_lock_irq(&call_lock);
15927+ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR);
15928 }
15929
15930-void unlock_ipi_call_lock(void)
15931+void xen_send_call_func_ipi(cpumask_t mask)
15932 {
15933- spin_unlock_irq(&call_lock);
15934-}
15935-
15936-static struct call_data_struct *call_data;
15937-
15938-static void __smp_call_function(void (*func) (void *info), void *info,
15939- int nonatomic, int wait)
15940-{
15941- struct call_data_struct data;
15942- int cpus = num_online_cpus() - 1;
15943-
15944- if (!cpus)
15945- return;
15946-
15947- data.func = func;
15948- data.info = info;
15949- atomic_set(&data.started, 0);
15950- data.wait = wait;
15951- if (wait)
15952- atomic_set(&data.finished, 0);
15953-
15954- call_data = &data;
15955- mb();
15956-
15957- /* Send a message to all other CPUs and wait for them to respond */
15958- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
15959-
15960- /* Wait for response */
15961- while (atomic_read(&data.started) != cpus)
15962- cpu_relax();
15963-
15964- if (wait)
15965- while (atomic_read(&data.finished) != cpus)
15966- cpu_relax();
15967-}
15968-
15969-
15970-/**
15971- * smp_call_function_mask(): Run a function on a set of other CPUs.
15972- * @mask: The set of cpus to run on. Must not include the current cpu.
15973- * @func: The function to run. This must be fast and non-blocking.
15974- * @info: An arbitrary pointer to pass to the function.
15975- * @wait: If true, wait (atomically) until function has completed on other CPUs.
15976- *
15977- * Returns 0 on success, else a negative status code.
15978- *
15979- * If @wait is true, then returns once @func has returned; otherwise
15980- * it returns just before the target cpu calls @func.
15981- *
15982- * You must not call this function with disabled interrupts or from a
15983- * hardware interrupt handler or from a bottom half handler.
15984- */
15985-int
15986-xen_smp_call_function_mask(cpumask_t mask,
15987- void (*func)(void *), void *info,
15988- int wait)
15989-{
15990- struct call_data_struct data;
15991- cpumask_t allbutself;
15992- int cpus;
15993-
15994- /* Can deadlock when called with interrupts disabled */
15995- WARN_ON(irqs_disabled());
15996-
15997- /* Holding any lock stops cpus from going down. */
15998- spin_lock(&call_lock);
15999-
16000- allbutself = cpu_online_map;
16001- cpu_clear(smp_processor_id(), allbutself);
16002-
16003- cpus_and(mask, mask, allbutself);
16004- cpus = cpus_weight(mask);
16005-
16006- if (!cpus) {
16007- spin_unlock(&call_lock);
16008- return 0;
16009- }
16010-
16011- data.func = func;
16012- data.info = info;
16013- atomic_set(&data.started, 0);
16014- data.wait = wait;
16015- if (wait)
16016- atomic_set(&data.finished, 0);
16017-
16018- call_data = &data;
16019- wmb();
16020-
16021- /* Send a message to other CPUs */
16022- if (cpus_equal(mask, allbutself) &&
16023- cpus_equal(cpu_online_map, cpu_callout_map))
16024- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
16025- else
16026- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16027-
16028- /* Wait for response */
16029- while (atomic_read(&data.started) != cpus)
16030- cpu_relax();
16031-
16032- if (wait)
16033- while (atomic_read(&data.finished) != cpus)
16034- cpu_relax();
16035- spin_unlock(&call_lock);
16036-
16037- return 0;
16038+ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16039 }
16040
16041 static void stop_this_cpu(void *dummy)
16042@@ -268,15 +150,10 @@ static void stop_this_cpu(void *dummy)
16043
16044 void xen_smp_send_stop(void)
16045 {
16046- int nolock;
16047 unsigned long flags;
16048
16049- /* Don't deadlock on the call lock in panic */
16050- nolock = !spin_trylock(&call_lock);
16051+ smp_call_function(stop_this_cpu, NULL, 0);
16052 local_irq_save(flags);
16053- __smp_call_function(stop_this_cpu, NULL, 0, 0);
16054- if (!nolock)
16055- spin_unlock(&call_lock);
16056 disable_all_local_evtchn();
16057 local_irq_restore(flags);
16058 }
16059@@ -298,21 +175,8 @@ irqreturn_t smp_reschedule_interrupt(int
16060
16061 irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
16062 {
16063- void (*func) (void *info) = call_data->func;
16064- void *info = call_data->info;
16065- int wait = call_data->wait;
16066-
16067- /*
16068- * Notify initiating CPU that I've grabbed the data and am
16069- * about to execute the function
16070- */
16071- mb();
16072- atomic_inc(&call_data->started);
16073- /*
16074- * At this point the info structure may be out of scope unless wait==1
16075- */
16076 irq_enter();
16077- (*func)(info);
16078+ generic_smp_call_function_interrupt();
16079 #ifdef CONFIG_X86_32
16080 __get_cpu_var(irq_stat).irq_call_count++;
16081 #else
16082@@ -320,10 +184,19 @@ irqreturn_t smp_call_function_interrupt(
16083 #endif
16084 irq_exit();
16085
16086- if (wait) {
16087- mb();
16088- atomic_inc(&call_data->finished);
16089- }
16090+ return IRQ_HANDLED;
16091+}
16092+
16093+irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id)
16094+{
16095+ irq_enter();
16096+ generic_smp_call_function_single_interrupt();
16097+#ifdef CONFIG_X86_32
16098+ __get_cpu_var(irq_stat).irq_call_count++;
16099+#else
16100+ add_pda(irq_call_count, 1);
16101+#endif
16102+ irq_exit();
16103
16104 return IRQ_HANDLED;
16105 }
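The hunk above drops the Xen-specific call_data handshake in favor of the generic cross-CPU call helpers introduced in 2.6.27; the interrupt handlers now simply dispatch to generic_smp_call_function_interrupt() and its single-target counterpart. Callers use the three-argument form seen in xen_smp_send_stop(), for example (illustrative sketch, not part of the patch; the function names are placeholders):

static void drain_something(void *info)
{
	/* runs on each targeted CPU, in interrupt context */
}

void example_cross_call(void *arg)
{
	smp_call_function(drain_something, arg, 1);		/* all other CPUs, wait for completion */
	smp_call_function_single(0, drain_something, arg, 1);	/* just CPU 0 */
}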
16106Index: head-2008-12-01/arch/x86/kernel/time_32-xen.c
16107===================================================================
16108--- head-2008-12-01.orig/arch/x86/kernel/time_32-xen.c 2008-12-01 11:44:55.000000000 +0100
16109+++ head-2008-12-01/arch/x86/kernel/time_32-xen.c 2008-12-01 11:58:30.000000000 +0100
16110@@ -470,7 +470,7 @@ irqreturn_t timer_interrupt(int irq, voi
16111
16112 /* Keep nmi watchdog up to date */
16113 #ifdef __i386__
16114- per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
16115+ x86_add_percpu(irq_stat.irq0_irqs, 1);
16116 #else
16117 add_pda(irq0_irqs, 1);
16118 #endif
16119@@ -748,9 +748,7 @@ void __init time_init(void)
16120
16121 update_wallclock();
16122
16123-#ifndef CONFIG_X86_64
16124 use_tsc_delay();
16125-#endif
16126
16127 /* Cannot request_irq() until kmem is initialised. */
16128 late_time_init = setup_cpu0_timer_irq;
16129@@ -807,7 +805,8 @@ static void stop_hz_timer(void)
16130
16131 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
16132 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
16133- (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
16134+ (j = get_next_timer_interrupt(jiffies),
16135+ time_before_eq(j, jiffies))) {
16136 cpu_clear(cpu, nohz_cpu_mask);
16137 j = jiffies + 1;
16138 }
16139Index: head-2008-12-01/arch/x86/kernel/traps_32-xen.c
16140===================================================================
16141--- head-2008-12-01.orig/arch/x86/kernel/traps_32-xen.c 2008-12-01 11:44:55.000000000 +0100
16142+++ head-2008-12-01/arch/x86/kernel/traps_32-xen.c 2008-12-01 11:49:07.000000000 +0100
16143@@ -1,5 +1,6 @@
16144 /*
16145 * Copyright (C) 1991, 1992 Linus Torvalds
16146+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
16147 *
16148 * Pentium III FXSR, SSE support
16149 * Gareth Hughes <gareth@valinux.com>, May 2000
16150@@ -57,11 +58,10 @@
16151 #include <asm/nmi.h>
16152 #include <asm/smp.h>
16153 #include <asm/io.h>
16154+#include <asm/traps.h>
16155
16156 #include "mach_traps.h"
16157
16158-int panic_on_unrecovered_nmi;
16159-
16160 #ifndef CONFIG_XEN
16161 DECLARE_BITMAP(used_vectors, NR_VECTORS);
16162 EXPORT_SYMBOL_GPL(used_vectors);
16163@@ -82,43 +82,22 @@ gate_desc idt_table[256]
16164 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
16165 #endif
16166
16167-asmlinkage void divide_error(void);
16168-asmlinkage void debug(void);
16169-asmlinkage void nmi(void);
16170-asmlinkage void int3(void);
16171-asmlinkage void overflow(void);
16172-asmlinkage void bounds(void);
16173-asmlinkage void invalid_op(void);
16174-asmlinkage void device_not_available(void);
16175-asmlinkage void coprocessor_segment_overrun(void);
16176-asmlinkage void invalid_TSS(void);
16177-asmlinkage void segment_not_present(void);
16178-asmlinkage void stack_segment(void);
16179-asmlinkage void general_protection(void);
16180-asmlinkage void page_fault(void);
16181-asmlinkage void coprocessor_error(void);
16182-asmlinkage void simd_coprocessor_error(void);
16183-asmlinkage void alignment_check(void);
16184-#ifndef CONFIG_XEN
16185-asmlinkage void spurious_interrupt_bug(void);
16186-#else
16187-asmlinkage void fixup_4gb_segment(void);
16188-#endif
16189-asmlinkage void machine_check(void);
16190-
16191+int panic_on_unrecovered_nmi;
16192 int kstack_depth_to_print = 24;
16193 static unsigned int code_bytes = 64;
16194+static int ignore_nmis;
16195+static int die_counter;
16196
16197 void printk_address(unsigned long address, int reliable)
16198 {
16199 #ifdef CONFIG_KALLSYMS
16200- char namebuf[KSYM_NAME_LEN];
16201 unsigned long offset = 0;
16202 unsigned long symsize;
16203 const char *symname;
16204- char reliab[4] = "";
16205- char *delim = ":";
16206 char *modname;
16207+ char *delim = ":";
16208+ char namebuf[KSYM_NAME_LEN];
16209+ char reliab[4] = "";
16210
16211 symname = kallsyms_lookup(address, &symsize, &offset,
16212 &modname, namebuf);
16213@@ -138,22 +117,23 @@ void printk_address(unsigned long addres
16214 #endif
16215 }
16216
16217-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
16218+static inline int valid_stack_ptr(struct thread_info *tinfo,
16219+ void *p, unsigned int size)
16220 {
16221- return p > (void *)tinfo &&
16222- p <= (void *)tinfo + THREAD_SIZE - size;
16223+ void *t = tinfo;
16224+ return p > t && p <= t + THREAD_SIZE - size;
16225 }
16226
16227 /* The form of the top of the frame on the stack */
16228 struct stack_frame {
16229- struct stack_frame *next_frame;
16230- unsigned long return_address;
16231+ struct stack_frame *next_frame;
16232+ unsigned long return_address;
16233 };
16234
16235 static inline unsigned long
16236 print_context_stack(struct thread_info *tinfo,
16237- unsigned long *stack, unsigned long bp,
16238- const struct stacktrace_ops *ops, void *data)
16239+ unsigned long *stack, unsigned long bp,
16240+ const struct stacktrace_ops *ops, void *data)
16241 {
16242 struct stack_frame *frame = (struct stack_frame *)bp;
16243
16244@@ -175,8 +155,6 @@ print_context_stack(struct thread_info *
16245 return bp;
16246 }
16247
16248-#define MSG(msg) ops->warning(data, msg)
16249-
16250 void dump_trace(struct task_struct *task, struct pt_regs *regs,
16251 unsigned long *stack, unsigned long bp,
16252 const struct stacktrace_ops *ops, void *data)
16253@@ -186,7 +164,6 @@ void dump_trace(struct task_struct *task
16254
16255 if (!stack) {
16256 unsigned long dummy;
16257-
16258 stack = &dummy;
16259 if (task != current)
16260 stack = (unsigned long *)task->thread.sp;
16261@@ -204,7 +181,7 @@ void dump_trace(struct task_struct *task
16262 }
16263 #endif
16264
16265- while (1) {
16266+ for (;;) {
16267 struct thread_info *context;
16268
16269 context = (struct thread_info *)
16270@@ -256,15 +233,15 @@ static void print_trace_address(void *da
16271 }
16272
16273 static const struct stacktrace_ops print_trace_ops = {
16274- .warning = print_trace_warning,
16275- .warning_symbol = print_trace_warning_symbol,
16276- .stack = print_trace_stack,
16277- .address = print_trace_address,
16278+ .warning = print_trace_warning,
16279+ .warning_symbol = print_trace_warning_symbol,
16280+ .stack = print_trace_stack,
16281+ .address = print_trace_address,
16282 };
16283
16284 static void
16285 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16286- unsigned long *stack, unsigned long bp, char *log_lvl)
16287+ unsigned long *stack, unsigned long bp, char *log_lvl)
16288 {
16289 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16290 printk("%s =======================\n", log_lvl);
16291@@ -359,15 +336,14 @@ void show_registers(struct pt_regs *regs
16292 printk(KERN_EMERG "Code: ");
16293
16294 ip = (u8 *)regs->ip - code_prologue;
16295- if (ip < (u8 *)PAGE_OFFSET ||
16296- probe_kernel_address(ip, c)) {
16297+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
16298 /* try starting at EIP */
16299 ip = (u8 *)regs->ip;
16300 code_len = code_len - code_prologue + 1;
16301 }
16302 for (i = 0; i < code_len; i++, ip++) {
16303 if (ip < (u8 *)PAGE_OFFSET ||
16304- probe_kernel_address(ip, c)) {
16305+ probe_kernel_address(ip, c)) {
16306 printk(" Bad EIP value.");
16307 break;
16308 }
16309@@ -392,7 +368,53 @@ int is_valid_bugaddr(unsigned long ip)
16310 return ud2 == 0x0b0f;
16311 }
16312
16313-static int die_counter;
16314+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
16315+static int die_owner = -1;
16316+static unsigned int die_nest_count;
16317+
16318+unsigned __kprobes long oops_begin(void)
16319+{
16320+ unsigned long flags;
16321+
16322+ oops_enter();
16323+
16324+ if (die_owner != raw_smp_processor_id()) {
16325+ console_verbose();
16326+ raw_local_irq_save(flags);
16327+ __raw_spin_lock(&die_lock);
16328+ die_owner = smp_processor_id();
16329+ die_nest_count = 0;
16330+ bust_spinlocks(1);
16331+ } else {
16332+ raw_local_irq_save(flags);
16333+ }
16334+ die_nest_count++;
16335+ return flags;
16336+}
16337+
16338+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
16339+{
16340+ bust_spinlocks(0);
16341+ die_owner = -1;
16342+ add_taint(TAINT_DIE);
16343+ __raw_spin_unlock(&die_lock);
16344+ raw_local_irq_restore(flags);
16345+
16346+ if (!regs)
16347+ return;
16348+
16349+ if (kexec_should_crash(current))
16350+ crash_kexec(regs);
16351+
16352+ if (in_interrupt())
16353+ panic("Fatal exception in interrupt");
16354+
16355+ if (panic_on_oops)
16356+ panic("Fatal exception");
16357+
16358+ oops_exit();
16359+ do_exit(signr);
16360+}
16361
16362 int __kprobes __die(const char *str, struct pt_regs *regs, long err)
16363 {
16364@@ -410,26 +432,22 @@ int __kprobes __die(const char *str, str
16365 printk("DEBUG_PAGEALLOC");
16366 #endif
16367 printk("\n");
16368-
16369 if (notify_die(DIE_OOPS, str, regs, err,
16370- current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
16371+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
16372+ return 1;
16373
16374- show_registers(regs);
16375- /* Executive summary in case the oops scrolled away */
16376- sp = (unsigned long) (&regs->sp);
16377- savesegment(ss, ss);
16378- if (user_mode(regs)) {
16379- sp = regs->sp;
16380- ss = regs->ss & 0xffff;
16381- }
16382- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16383- print_symbol("%s", regs->ip);
16384- printk(" SS:ESP %04x:%08lx\n", ss, sp);
16385-
16386- return 0;
16387- }
16388-
16389- return 1;
16390+ show_registers(regs);
16391+ /* Executive summary in case the oops scrolled away */
16392+ sp = (unsigned long) (&regs->sp);
16393+ savesegment(ss, ss);
16394+ if (user_mode(regs)) {
16395+ sp = regs->sp;
16396+ ss = regs->ss & 0xffff;
16397+ }
16398+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16399+ print_symbol("%s", regs->ip);
16400+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
16401+ return 0;
16402 }
16403
16404 /*
16405@@ -438,31 +456,9 @@ int __kprobes __die(const char *str, str
16406 */
16407 void die(const char *str, struct pt_regs *regs, long err)
16408 {
16409- static struct {
16410- raw_spinlock_t lock;
16411- u32 lock_owner;
16412- int lock_owner_depth;
16413- } die = {
16414- .lock = __RAW_SPIN_LOCK_UNLOCKED,
16415- .lock_owner = -1,
16416- .lock_owner_depth = 0
16417- };
16418- unsigned long flags;
16419-
16420- oops_enter();
16421+ unsigned long flags = oops_begin();
16422
16423- if (die.lock_owner != raw_smp_processor_id()) {
16424- console_verbose();
16425- raw_local_irq_save(flags);
16426- __raw_spin_lock(&die.lock);
16427- die.lock_owner = smp_processor_id();
16428- die.lock_owner_depth = 0;
16429- bust_spinlocks(1);
16430- } else {
16431- raw_local_irq_save(flags);
16432- }
16433-
16434- if (++die.lock_owner_depth < 3) {
16435+ if (die_nest_count < 3) {
16436 report_bug(regs->ip, regs);
16437
16438 if (__die(str, regs, err))
16439@@ -471,26 +467,7 @@ void die(const char *str, struct pt_regs
16440 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
16441 }
16442
16443- bust_spinlocks(0);
16444- die.lock_owner = -1;
16445- add_taint(TAINT_DIE);
16446- __raw_spin_unlock(&die.lock);
16447- raw_local_irq_restore(flags);
16448-
16449- if (!regs)
16450- return;
16451-
16452- if (kexec_should_crash(current))
16453- crash_kexec(regs);
16454-
16455- if (in_interrupt())
16456- panic("Fatal exception in interrupt");
16457-
16458- if (panic_on_oops)
16459- panic("Fatal exception");
16460-
16461- oops_exit();
16462- do_exit(SIGSEGV);
16463+ oops_end(flags, regs, SIGSEGV);
16464 }
16465
16466 static inline void
16467@@ -554,7 +531,7 @@ void do_##name(struct pt_regs *regs, lon
16468 { \
16469 trace_hardirqs_fixup(); \
16470 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16471- == NOTIFY_STOP) \
16472+ == NOTIFY_STOP) \
16473 return; \
16474 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
16475 }
16476@@ -570,7 +547,7 @@ void do_##name(struct pt_regs *regs, lon
16477 info.si_code = sicode; \
16478 info.si_addr = (void __user *)siaddr; \
16479 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16480- == NOTIFY_STOP) \
16481+ == NOTIFY_STOP) \
16482 return; \
16483 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
16484 }
16485@@ -579,7 +556,7 @@ void do_##name(struct pt_regs *regs, lon
16486 void do_##name(struct pt_regs *regs, long error_code) \
16487 { \
16488 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16489- == NOTIFY_STOP) \
16490+ == NOTIFY_STOP) \
16491 return; \
16492 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
16493 }
16494@@ -594,28 +571,29 @@ void do_##name(struct pt_regs *regs, lon
16495 info.si_addr = (void __user *)siaddr; \
16496 trace_hardirqs_fixup(); \
16497 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16498- == NOTIFY_STOP) \
16499+ == NOTIFY_STOP) \
16500 return; \
16501 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
16502 }
16503
16504-DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16505+DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16506 #ifndef CONFIG_KPROBES
16507 DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
16508 #endif
16509 DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
16510 DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
16511-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16512-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16513+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16514+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16515 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
16516-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16517-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16518+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16519+DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16520 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
16521 DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
16522
16523-void __kprobes do_general_protection(struct pt_regs * regs,
16524- long error_code)
16525+void __kprobes
16526+do_general_protection(struct pt_regs *regs, long error_code)
16527 {
16528+ struct task_struct *tsk;
16529 struct thread_struct *thread;
16530
16531 thread = &current->thread;
16532@@ -623,23 +601,24 @@ void __kprobes do_general_protection(str
16533 if (regs->flags & X86_VM_MASK)
16534 goto gp_in_vm86;
16535
16536+ tsk = current;
16537 if (!user_mode(regs))
16538 goto gp_in_kernel;
16539
16540- current->thread.error_code = error_code;
16541- current->thread.trap_no = 13;
16542+ tsk->thread.error_code = error_code;
16543+ tsk->thread.trap_no = 13;
16544
16545- if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
16546- printk_ratelimit()) {
16547+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
16548+ printk_ratelimit()) {
16549 printk(KERN_INFO
16550- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16551- current->comm, task_pid_nr(current),
16552- regs->ip, regs->sp, error_code);
16553+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16554+ tsk->comm, task_pid_nr(tsk),
16555+ regs->ip, regs->sp, error_code);
16556 print_vma_addr(" in ", regs->ip);
16557 printk("\n");
16558 }
16559
16560- force_sig(SIGSEGV, current);
16561+ force_sig(SIGSEGV, tsk);
16562 return;
16563
16564 gp_in_vm86:
16565@@ -648,14 +627,15 @@ gp_in_vm86:
16566 return;
16567
16568 gp_in_kernel:
16569- if (!fixup_exception(regs)) {
16570- current->thread.error_code = error_code;
16571- current->thread.trap_no = 13;
16572- if (notify_die(DIE_GPF, "general protection fault", regs,
16573+ if (fixup_exception(regs))
16574+ return;
16575+
16576+ tsk->thread.error_code = error_code;
16577+ tsk->thread.trap_no = 13;
16578+ if (notify_die(DIE_GPF, "general protection fault", regs,
16579 error_code, 13, SIGSEGV) == NOTIFY_STOP)
16580- return;
16581- die("general protection fault", regs, error_code);
16582- }
16583+ return;
16584+ die("general protection fault", regs, error_code);
16585 }
16586
16587 static notrace __kprobes void
16588@@ -722,9 +702,9 @@ unknown_nmi_error(unsigned char reason,
16589
16590 static DEFINE_SPINLOCK(nmi_print_lock);
16591
16592-void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
16593+void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
16594 {
16595- if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16596+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16597 return;
16598
16599 spin_lock(&nmi_print_lock);
16600@@ -733,10 +713,12 @@ void notrace __kprobes die_nmi(struct pt
16601 * to get a message out:
16602 */
16603 bust_spinlocks(1);
16604- printk(KERN_EMERG "%s", msg);
16605+ printk(KERN_EMERG "%s", str);
16606 printk(" on CPU%d, ip %08lx, registers:\n",
16607 smp_processor_id(), regs->ip);
16608 show_registers(regs);
16609+ if (do_panic)
16610+ panic("Non maskable interrupt");
16611 console_silent();
16612 spin_unlock(&nmi_print_lock);
16613 bust_spinlocks(0);
16614@@ -756,14 +738,17 @@ void notrace __kprobes die_nmi(struct pt
16615 static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
16616 {
16617 unsigned char reason = 0;
16618+ int cpu;
16619
16620- /* Only the BSP gets external NMIs from the system: */
16621- if (!smp_processor_id())
16622+ cpu = smp_processor_id();
16623+
16624+ /* Only the BSP gets external NMIs from the system. */
16625+ if (!cpu)
16626 reason = get_nmi_reason();
16627
16628 if (!(reason & 0xc0)) {
16629 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
16630- == NOTIFY_STOP)
16631+ == NOTIFY_STOP)
16632 return;
16633 #ifdef CONFIG_X86_LOCAL_APIC
16634 /*
16635@@ -772,7 +757,7 @@ static notrace __kprobes void default_do
16636 */
16637 if (nmi_watchdog_tick(regs, reason))
16638 return;
16639- if (!do_nmi_callback(regs, smp_processor_id()))
16640+ if (!do_nmi_callback(regs, cpu))
16641 unknown_nmi_error(reason, regs);
16642 #else
16643 unknown_nmi_error(reason, regs);
16644@@ -782,6 +767,8 @@ static notrace __kprobes void default_do
16645 }
16646 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
16647 return;
16648+
16649+ /* AK: following checks seem to be broken on modern chipsets. FIXME */
16650 if (reason & 0x80)
16651 mem_parity_error(reason, regs);
16652 if (reason & 0x40)
16653@@ -793,8 +780,6 @@ static notrace __kprobes void default_do
16654 reassert_nmi();
16655 }
16656
16657-static int ignore_nmis;
16658-
16659 notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
16660 {
16661 int cpu;
16662@@ -879,7 +864,7 @@ void __kprobes do_debug(struct pt_regs *
16663 tsk->thread.debugctlmsr = 0;
16664
16665 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
16666- SIGTRAP) == NOTIFY_STOP)
16667+ SIGTRAP) == NOTIFY_STOP)
16668 return;
16669 /* It's safe to allow irq's after DR6 has been saved */
16670 if (regs->flags & X86_EFLAGS_IF)
16671@@ -940,9 +925,8 @@ clear_TF_reenable:
16672 void math_error(void __user *ip)
16673 {
16674 struct task_struct *task;
16675- unsigned short cwd;
16676- unsigned short swd;
16677 siginfo_t info;
16678+ unsigned short cwd, swd;
16679
16680 /*
16681 * Save the info for the exception handler and clear the error.
16682@@ -961,7 +945,7 @@ void math_error(void __user *ip)
16683 * C1 reg you need in case of a stack fault, 0x040 is the stack
16684 * fault bit. We should only be taking one exception at a time,
16685 * so if this combination doesn't produce any single exception,
16686- * then we have a bad program that isn't syncronizing its FPU usage
16687+ * then we have a bad program that isn't synchronizing its FPU usage
16688 * and it will suffer the consequences since we won't be able to
16689 * fully reproduce the context of the exception
16690 */
16691@@ -970,7 +954,7 @@ void math_error(void __user *ip)
16692 switch (swd & ~cwd & 0x3f) {
16693 case 0x000: /* No unmasked exception */
16694 return;
16695- default: /* Multiple exceptions */
16696+ default: /* Multiple exceptions */
16697 break;
16698 case 0x001: /* Invalid Op */
16699 /*
16700@@ -1006,8 +990,8 @@ void do_coprocessor_error(struct pt_regs
16701 static void simd_math_error(void __user *ip)
16702 {
16703 struct task_struct *task;
16704- unsigned short mxcsr;
16705 siginfo_t info;
16706+ unsigned short mxcsr;
16707
16708 /*
16709 * Save the info for the exception handler and clear the error.
16710@@ -1084,7 +1068,7 @@ void do_spurious_interrupt_bug(struct pt
16711
16712 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
16713 {
16714- struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
16715+ struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
16716 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
16717 unsigned long new_kesp = kesp - base;
16718 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
16719Index: head-2008-12-01/arch/x86/kernel/traps_64-xen.c
16720===================================================================
16721--- head-2008-12-01.orig/arch/x86/kernel/traps_64-xen.c 2008-12-01 11:44:55.000000000 +0100
16722+++ head-2008-12-01/arch/x86/kernel/traps_64-xen.c 2008-12-01 11:49:07.000000000 +0100
16723@@ -10,73 +10,56 @@
16724 * 'Traps.c' handles hardware traps and faults after we have saved some
16725 * state in 'entry.S'.
16726 */
16727-#include <linux/sched.h>
16728+#include <linux/moduleparam.h>
16729+#include <linux/interrupt.h>
16730+#include <linux/kallsyms.h>
16731+#include <linux/spinlock.h>
16732+#include <linux/kprobes.h>
16733+#include <linux/uaccess.h>
16734+#include <linux/utsname.h>
16735+#include <linux/kdebug.h>
16736 #include <linux/kernel.h>
16737+#include <linux/module.h>
16738+#include <linux/ptrace.h>
16739 #include <linux/string.h>
16740+#include <linux/unwind.h>
16741+#include <linux/delay.h>
16742 #include <linux/errno.h>
16743-#include <linux/ptrace.h>
16744+#include <linux/kexec.h>
16745+#include <linux/sched.h>
16746 #include <linux/timer.h>
16747-#include <linux/mm.h>
16748 #include <linux/init.h>
16749-#include <linux/delay.h>
16750-#include <linux/spinlock.h>
16751-#include <linux/interrupt.h>
16752-#include <linux/kallsyms.h>
16753-#include <linux/module.h>
16754-#include <linux/moduleparam.h>
16755-#include <linux/nmi.h>
16756-#include <linux/kprobes.h>
16757-#include <linux/kexec.h>
16758-#include <linux/unwind.h>
16759-#include <linux/uaccess.h>
16760 #include <linux/bug.h>
16761-#include <linux/kdebug.h>
16762-#include <linux/utsname.h>
16763-
16764-#include <mach_traps.h>
16765+#include <linux/nmi.h>
16766+#include <linux/mm.h>
16767
16768 #if defined(CONFIG_EDAC)
16769 #include <linux/edac.h>
16770 #endif
16771
16772-#include <asm/system.h>
16773-#include <asm/io.h>
16774-#include <asm/atomic.h>
16775+#include <asm/stacktrace.h>
16776+#include <asm/processor.h>
16777 #include <asm/debugreg.h>
16778+#include <asm/atomic.h>
16779+#include <asm/system.h>
16780+#include <asm/unwind.h>
16781 #include <asm/desc.h>
16782 #include <asm/i387.h>
16783-#include <asm/processor.h>
16784-#include <asm/unwind.h>
16785+#include <asm/nmi.h>
16786 #include <asm/smp.h>
16787+#include <asm/io.h>
16788 #include <asm/pgalloc.h>
16789-#include <asm/pda.h>
16790 #include <asm/proto.h>
16791-#include <asm/nmi.h>
16792-#include <asm/stacktrace.h>
16793+#include <asm/pda.h>
16794+#include <asm/traps.h>
16795
16796-asmlinkage void divide_error(void);
16797-asmlinkage void debug(void);
16798-asmlinkage void nmi(void);
16799-asmlinkage void int3(void);
16800-asmlinkage void overflow(void);
16801-asmlinkage void bounds(void);
16802-asmlinkage void invalid_op(void);
16803-asmlinkage void device_not_available(void);
16804-asmlinkage void double_fault(void);
16805-asmlinkage void coprocessor_segment_overrun(void);
16806-asmlinkage void invalid_TSS(void);
16807-asmlinkage void segment_not_present(void);
16808-asmlinkage void stack_segment(void);
16809-asmlinkage void general_protection(void);
16810-asmlinkage void page_fault(void);
16811-asmlinkage void coprocessor_error(void);
16812-asmlinkage void simd_coprocessor_error(void);
16813-asmlinkage void reserved(void);
16814-asmlinkage void alignment_check(void);
16815-asmlinkage void machine_check(void);
16816-asmlinkage void spurious_interrupt_bug(void);
16817+#include <mach_traps.h>
16818
16819+int panic_on_unrecovered_nmi;
16820+int kstack_depth_to_print = 12;
16821 static unsigned int code_bytes = 64;
16822+static int ignore_nmis;
16823+static int die_counter;
16824
16825 static inline void conditional_sti(struct pt_regs *regs)
16826 {
16827@@ -100,34 +83,9 @@ static inline void preempt_conditional_c
16828 dec_preempt_count();
16829 }
16830
16831-int kstack_depth_to_print = 12;
16832-
16833 void printk_address(unsigned long address, int reliable)
16834 {
16835-#ifdef CONFIG_KALLSYMS
16836- unsigned long offset = 0, symsize;
16837- const char *symname;
16838- char *modname;
16839- char *delim = ":";
16840- char namebuf[KSYM_NAME_LEN];
16841- char reliab[4] = "";
16842-
16843- symname = kallsyms_lookup(address, &symsize, &offset,
16844- &modname, namebuf);
16845- if (!symname) {
16846- printk(" [<%016lx>]\n", address);
16847- return;
16848- }
16849- if (!reliable)
16850- strcpy(reliab, "? ");
16851-
16852- if (!modname)
16853- modname = delim = "";
16854- printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
16855- address, reliab, delim, modname, delim, symname, offset, symsize);
16856-#else
16857- printk(" [<%016lx>]\n", address);
16858-#endif
16859+ printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
16860 }
16861
16862 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
16863@@ -206,8 +164,6 @@ static unsigned long *in_exception_stack
16864 return NULL;
16865 }
16866
16867-#define MSG(txt) ops->warning(data, txt)
16868-
16869 /*
16870 * x86-64 can have up to three kernel stacks:
16871 * process stack
16872@@ -234,11 +190,11 @@ struct stack_frame {
16873 unsigned long return_address;
16874 };
16875
16876-
16877-static inline unsigned long print_context_stack(struct thread_info *tinfo,
16878- unsigned long *stack, unsigned long bp,
16879- const struct stacktrace_ops *ops, void *data,
16880- unsigned long *end)
16881+static inline unsigned long
16882+print_context_stack(struct thread_info *tinfo,
16883+ unsigned long *stack, unsigned long bp,
16884+ const struct stacktrace_ops *ops, void *data,
16885+ unsigned long *end)
16886 {
16887 struct stack_frame *frame = (struct stack_frame *)bp;
16888
16889@@ -260,7 +216,7 @@ static inline unsigned long print_contex
16890 return bp;
16891 }
16892
16893-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
16894+void dump_trace(struct task_struct *task, struct pt_regs *regs,
16895 unsigned long *stack, unsigned long bp,
16896 const struct stacktrace_ops *ops, void *data)
16897 {
16898@@ -269,36 +225,34 @@ void dump_trace(struct task_struct *tsk,
16899 unsigned used = 0;
16900 struct thread_info *tinfo;
16901
16902- if (!tsk)
16903- tsk = current;
16904- tinfo = task_thread_info(tsk);
16905+ if (!task)
16906+ task = current;
16907
16908 if (!stack) {
16909 unsigned long dummy;
16910 stack = &dummy;
16911- if (tsk && tsk != current)
16912- stack = (unsigned long *)tsk->thread.sp;
16913+ if (task && task != current)
16914+ stack = (unsigned long *)task->thread.sp;
16915 }
16916
16917 #ifdef CONFIG_FRAME_POINTER
16918 if (!bp) {
16919- if (tsk == current) {
16920+ if (task == current) {
16921 /* Grab bp right from our regs */
16922- asm("movq %%rbp, %0" : "=r" (bp):);
16923+ asm("movq %%rbp, %0" : "=r" (bp) :);
16924 } else {
16925 /* bp is the last reg pushed by switch_to */
16926- bp = *(unsigned long *) tsk->thread.sp;
16927+ bp = *(unsigned long *) task->thread.sp;
16928 }
16929 }
16930 #endif
16931
16932-
16933-
16934 /*
16935 * Print function call entries in all stacks, starting at the
16936 * current stack address. If the stacks consist of nested
16937 * exceptions
16938 */
16939+ tinfo = task_thread_info(task);
16940 for (;;) {
16941 char *id;
16942 unsigned long *estack_end;
16943@@ -383,18 +337,24 @@ static const struct stacktrace_ops print
16944 .address = print_trace_address,
16945 };
16946
16947-void
16948-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
16949- unsigned long bp)
16950+static void
16951+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16952+ unsigned long *stack, unsigned long bp, char *log_lvl)
16953 {
16954 printk("\nCall Trace:\n");
16955- dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
16956+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16957 printk("\n");
16958 }
16959
16960+void show_trace(struct task_struct *task, struct pt_regs *regs,
16961+ unsigned long *stack, unsigned long bp)
16962+{
16963+ show_trace_log_lvl(task, regs, stack, bp, "");
16964+}
16965+
16966 static void
16967-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
16968- unsigned long bp)
16969+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
16970+ unsigned long *sp, unsigned long bp, char *log_lvl)
16971 {
16972 unsigned long *stack;
16973 int i;
16974@@ -406,14 +366,14 @@ _show_stack(struct task_struct *tsk, str
16975 // back trace for this cpu.
16976
16977 if (sp == NULL) {
16978- if (tsk)
16979- sp = (unsigned long *)tsk->thread.sp;
16980+ if (task)
16981+ sp = (unsigned long *)task->thread.sp;
16982 else
16983 sp = (unsigned long *)&sp;
16984 }
16985
16986 stack = sp;
16987- for(i=0; i < kstack_depth_to_print; i++) {
16988+ for (i = 0; i < kstack_depth_to_print; i++) {
16989 if (stack >= irqstack && stack <= irqstack_end) {
16990 if (stack == irqstack_end) {
16991 stack = (unsigned long *) (irqstack_end[-1]);
16992@@ -428,12 +388,12 @@ _show_stack(struct task_struct *tsk, str
16993 printk(" %016lx", *stack++);
16994 touch_nmi_watchdog();
16995 }
16996- show_trace(tsk, regs, sp, bp);
16997+ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
16998 }
16999
17000-void show_stack(struct task_struct *tsk, unsigned long * sp)
17001+void show_stack(struct task_struct *task, unsigned long *sp)
17002 {
17003- _show_stack(tsk, NULL, sp, 0);
17004+ show_stack_log_lvl(task, NULL, sp, 0, "");
17005 }
17006
17007 /*
17008@@ -441,8 +401,8 @@ void show_stack(struct task_struct *tsk,
17009 */
17010 void dump_stack(void)
17011 {
17012- unsigned long dummy;
17013 unsigned long bp = 0;
17014+ unsigned long stack;
17015
17016 #ifdef CONFIG_FRAME_POINTER
17017 if (!bp)
17018@@ -454,7 +414,7 @@ void dump_stack(void)
17019 init_utsname()->release,
17020 (int)strcspn(init_utsname()->version, " "),
17021 init_utsname()->version);
17022- show_trace(NULL, NULL, &dummy, bp);
17023+ show_trace(NULL, NULL, &stack, bp);
17024 }
17025
17026 EXPORT_SYMBOL(dump_stack);
17027@@ -465,12 +425,8 @@ void show_registers(struct pt_regs *regs
17028 unsigned long sp;
17029 const int cpu = smp_processor_id();
17030 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
17031- u8 *ip;
17032- unsigned int code_prologue = code_bytes * 43 / 64;
17033- unsigned int code_len = code_bytes;
17034
17035 sp = regs->sp;
17036- ip = (u8 *) regs->ip - code_prologue;
17037 printk("CPU %d ", cpu);
17038 __show_regs(regs);
17039 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
17040@@ -481,15 +437,22 @@ void show_registers(struct pt_regs *regs
17041 * time of the fault..
17042 */
17043 if (!user_mode(regs)) {
17044+ unsigned int code_prologue = code_bytes * 43 / 64;
17045+ unsigned int code_len = code_bytes;
17046 unsigned char c;
17047+ u8 *ip;
17048+
17049 printk("Stack: ");
17050- _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
17051+ show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
17052+ regs->bp, "");
17053 printk("\n");
17054
17055 printk(KERN_EMERG "Code: ");
17056+
17057+ ip = (u8 *)regs->ip - code_prologue;
17058 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
17059 /* try starting at RIP */
17060- ip = (u8 *) regs->ip;
17061+ ip = (u8 *)regs->ip;
17062 code_len = code_len - code_prologue + 1;
17063 }
17064 for (i = 0; i < code_len; i++, ip++) {
17065@@ -505,7 +468,7 @@ void show_registers(struct pt_regs *regs
17066 }
17067 }
17068 printk("\n");
17069-}
17070+}
17071
17072 int is_valid_bugaddr(unsigned long ip)
17073 {
17074@@ -545,7 +508,7 @@ unsigned __kprobes long oops_begin(void)
17075 }
17076
17077 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
17078-{
17079+{
17080 die_owner = -1;
17081 bust_spinlocks(0);
17082 die_nest_count--;
17083@@ -563,10 +526,9 @@ void __kprobes oops_end(unsigned long fl
17084 do_exit(signr);
17085 }
17086
17087-int __kprobes __die(const char * str, struct pt_regs * regs, long err)
17088+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
17089 {
17090- static int die_counter;
17091- printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
17092+ printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
17093 #ifdef CONFIG_PREEMPT
17094 printk("PREEMPT ");
17095 #endif
17096@@ -577,8 +539,10 @@ int __kprobes __die(const char * str, st
17097 printk("DEBUG_PAGEALLOC");
17098 #endif
17099 printk("\n");
17100- if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17101+ if (notify_die(DIE_OOPS, str, regs, err,
17102+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17103 return 1;
17104+
17105 show_registers(regs);
17106 add_taint(TAINT_DIE);
17107 /* Executive summary in case the oops scrolled away */
17108@@ -590,7 +554,7 @@ int __kprobes __die(const char * str, st
17109 return 0;
17110 }
17111
17112-void die(const char * str, struct pt_regs * regs, long err)
17113+void die(const char *str, struct pt_regs *regs, long err)
17114 {
17115 unsigned long flags = oops_begin();
17116
17117@@ -608,8 +572,7 @@ die_nmi(char *str, struct pt_regs *regs,
17118 {
17119 unsigned long flags;
17120
17121- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
17122- NOTIFY_STOP)
17123+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
17124 return;
17125
17126 flags = oops_begin();
17127@@ -617,7 +580,9 @@ die_nmi(char *str, struct pt_regs *regs,
17128 * We are in trouble anyway, lets at least try
17129 * to get a message out.
17130 */
17131- printk(str, smp_processor_id());
17132+ printk(KERN_EMERG "%s", str);
17133+ printk(" on CPU%d, ip %08lx, registers:\n",
17134+ smp_processor_id(), regs->ip);
17135 show_registers(regs);
17136 if (kexec_should_crash(current))
17137 crash_kexec(regs);
17138@@ -630,44 +595,44 @@ die_nmi(char *str, struct pt_regs *regs,
17139 }
17140 #endif
17141
17142-static void __kprobes do_trap(int trapnr, int signr, char *str,
17143- struct pt_regs * regs, long error_code,
17144- siginfo_t *info)
17145+static void __kprobes
17146+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
17147+ long error_code, siginfo_t *info)
17148 {
17149 struct task_struct *tsk = current;
17150
17151- if (user_mode(regs)) {
17152- /*
17153- * We want error_code and trap_no set for userspace
17154- * faults and kernelspace faults which result in
17155- * die(), but not kernelspace faults which are fixed
17156- * up. die() gives the process no chance to handle
17157- * the signal and notice the kernel fault information,
17158- * so that won't result in polluting the information
17159- * about previously queued, but not yet delivered,
17160- * faults. See also do_general_protection below.
17161- */
17162- tsk->thread.error_code = error_code;
17163- tsk->thread.trap_no = trapnr;
17164+ if (!user_mode(regs))
17165+ goto kernel_trap;
17166
17167- if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17168- printk_ratelimit()) {
17169- printk(KERN_INFO
17170- "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17171- tsk->comm, tsk->pid, str,
17172- regs->ip, regs->sp, error_code);
17173- print_vma_addr(" in ", regs->ip);
17174- printk("\n");
17175- }
17176+ /*
17177+ * We want error_code and trap_no set for userspace faults and
17178+ * kernelspace faults which result in die(), but not
17179+ * kernelspace faults which are fixed up. die() gives the
17180+ * process no chance to handle the signal and notice the
17181+ * kernel fault information, so that won't result in polluting
17182+ * the information about previously queued, but not yet
17183+ * delivered, faults. See also do_general_protection below.
17184+ */
17185+ tsk->thread.error_code = error_code;
17186+ tsk->thread.trap_no = trapnr;
17187
17188- if (info)
17189- force_sig_info(signr, info, tsk);
17190- else
17191- force_sig(signr, tsk);
17192- return;
17193+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17194+ printk_ratelimit()) {
17195+ printk(KERN_INFO
17196+ "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17197+ tsk->comm, tsk->pid, str,
17198+ regs->ip, regs->sp, error_code);
17199+ print_vma_addr(" in ", regs->ip);
17200+ printk("\n");
17201 }
17202
17203+ if (info)
17204+ force_sig_info(signr, info, tsk);
17205+ else
17206+ force_sig(signr, tsk);
17207+ return;
17208
17209+kernel_trap:
17210 if (!fixup_exception(regs)) {
17211 tsk->thread.error_code = error_code;
17212 tsk->thread.trap_no = trapnr;
17213@@ -677,41 +642,39 @@ static void __kprobes do_trap(int trapnr
17214 }
17215
17216 #define DO_ERROR(trapnr, signr, str, name) \
17217-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17218-{ \
17219- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17220- == NOTIFY_STOP) \
17221- return; \
17222+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17223+{ \
17224+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17225+ == NOTIFY_STOP) \
17226+ return; \
17227 conditional_sti(regs); \
17228- do_trap(trapnr, signr, str, regs, error_code, NULL); \
17229+ do_trap(trapnr, signr, str, regs, error_code, NULL); \
17230 }
17231
17232-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17233-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17234-{ \
17235- siginfo_t info; \
17236- info.si_signo = signr; \
17237- info.si_errno = 0; \
17238- info.si_code = sicode; \
17239- info.si_addr = (void __user *)siaddr; \
17240- trace_hardirqs_fixup(); \
17241- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17242- == NOTIFY_STOP) \
17243- return; \
17244+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17245+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17246+{ \
17247+ siginfo_t info; \
17248+ info.si_signo = signr; \
17249+ info.si_errno = 0; \
17250+ info.si_code = sicode; \
17251+ info.si_addr = (void __user *)siaddr; \
17252+ trace_hardirqs_fixup(); \
17253+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17254+ == NOTIFY_STOP) \
17255+ return; \
17256 conditional_sti(regs); \
17257- do_trap(trapnr, signr, str, regs, error_code, &info); \
17258+ do_trap(trapnr, signr, str, regs, error_code, &info); \
17259 }
17260
17261-DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17262-DO_ERROR( 4, SIGSEGV, "overflow", overflow)
17263-DO_ERROR( 5, SIGSEGV, "bounds", bounds)
17264-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17265-DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
17266-DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17267+DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17268+DO_ERROR(4, SIGSEGV, "overflow", overflow)
17269+DO_ERROR(5, SIGSEGV, "bounds", bounds)
17270+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17271+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17272 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
17273-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17274+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17275 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
17276-DO_ERROR(18, SIGSEGV, "reserved", reserved)
17277
17278 /* Runs on IST stack */
17279 asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
17280@@ -741,31 +704,34 @@ asmlinkage void do_double_fault(struct p
17281 die(str, regs, error_code);
17282 }
17283
17284-asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
17285- long error_code)
17286+asmlinkage void __kprobes
17287+do_general_protection(struct pt_regs *regs, long error_code)
17288 {
17289- struct task_struct *tsk = current;
17290+ struct task_struct *tsk;
17291
17292 conditional_sti(regs);
17293
17294- if (user_mode(regs)) {
17295- tsk->thread.error_code = error_code;
17296- tsk->thread.trap_no = 13;
17297+ tsk = current;
17298+ if (!user_mode(regs))
17299+ goto gp_in_kernel;
17300
17301- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17302- printk_ratelimit()) {
17303- printk(KERN_INFO
17304- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17305- tsk->comm, tsk->pid,
17306- regs->ip, regs->sp, error_code);
17307- print_vma_addr(" in ", regs->ip);
17308- printk("\n");
17309- }
17310+ tsk->thread.error_code = error_code;
17311+ tsk->thread.trap_no = 13;
17312
17313- force_sig(SIGSEGV, tsk);
17314- return;
17315- }
17316+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17317+ printk_ratelimit()) {
17318+ printk(KERN_INFO
17319+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17320+ tsk->comm, tsk->pid,
17321+ regs->ip, regs->sp, error_code);
17322+ print_vma_addr(" in ", regs->ip);
17323+ printk("\n");
17324+ }
17325
17326+ force_sig(SIGSEGV, tsk);
17327+ return;
17328+
17329+gp_in_kernel:
17330 if (fixup_exception(regs))
17331 return;
17332
17333@@ -778,14 +744,14 @@ asmlinkage void __kprobes do_general_pro
17334 }
17335
17336 static notrace __kprobes void
17337-mem_parity_error(unsigned char reason, struct pt_regs * regs)
17338+mem_parity_error(unsigned char reason, struct pt_regs *regs)
17339 {
17340 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
17341 reason);
17342 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
17343
17344 #if defined(CONFIG_EDAC)
17345- if(edac_handler_set()) {
17346+ if (edac_handler_set()) {
17347 edac_atomic_assert_error();
17348 return;
17349 }
17350@@ -801,7 +767,7 @@ mem_parity_error(unsigned char reason, s
17351 }
17352
17353 static notrace __kprobes void
17354-io_check_error(unsigned char reason, struct pt_regs * regs)
17355+io_check_error(unsigned char reason, struct pt_regs *regs)
17356 {
17357 printk("NMI: IOCK error (debug interrupt?)\n");
17358 show_registers(regs);
17359@@ -827,14 +793,14 @@ unknown_nmi_error(unsigned char reason,
17360
17361 /* Runs on IST stack. This code must keep interrupts off all the time.
17362 Nested NMIs are prevented by the CPU. */
17363-asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17364+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17365 {
17366 unsigned char reason = 0;
17367 int cpu;
17368
17369 cpu = smp_processor_id();
17370
17371- /* Only the BSP gets external NMIs from the system. */
17372+ /* Only the BSP gets external NMIs from the system. */
17373 if (!cpu)
17374 reason = get_nmi_reason();
17375
17376@@ -847,33 +813,58 @@ asmlinkage notrace __kprobes void defau
17377 * Ok, so this is none of the documented NMI sources,
17378 * so it must be the NMI watchdog.
17379 */
17380- if (nmi_watchdog_tick(regs,reason))
17381+ if (nmi_watchdog_tick(regs, reason))
17382 return;
17383 #endif
17384- if (!do_nmi_callback(regs,cpu))
17385+ if (!do_nmi_callback(regs, cpu))
17386 unknown_nmi_error(reason, regs);
17387
17388 return;
17389 }
17390 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
17391- return;
17392+ return;
17393
17394 /* AK: following checks seem to be broken on modern chipsets. FIXME */
17395-
17396 if (reason & 0x80)
17397 mem_parity_error(reason, regs);
17398 if (reason & 0x40)
17399 io_check_error(reason, regs);
17400 }
17401
17402+asmlinkage notrace __kprobes void
17403+do_nmi(struct pt_regs *regs, long error_code)
17404+{
17405+ nmi_enter();
17406+
17407+ add_pda(__nmi_count, 1);
17408+
17409+ if (!ignore_nmis)
17410+ default_do_nmi(regs);
17411+
17412+ nmi_exit();
17413+}
17414+
17415+void stop_nmi(void)
17416+{
17417+ acpi_nmi_disable();
17418+ ignore_nmis++;
17419+}
17420+
17421+void restart_nmi(void)
17422+{
17423+ ignore_nmis--;
17424+ acpi_nmi_enable();
17425+}
17426+
17427 /* runs on IST stack. */
17428 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
17429 {
17430 trace_hardirqs_fixup();
17431
17432- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
17433+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
17434+ == NOTIFY_STOP)
17435 return;
17436- }
17437+
17438 preempt_conditional_sti(regs);
17439 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
17440 preempt_conditional_cli(regs);
17441@@ -904,8 +895,8 @@ asmlinkage __kprobes struct pt_regs *syn
17442 asmlinkage void __kprobes do_debug(struct pt_regs * regs,
17443 unsigned long error_code)
17444 {
17445- unsigned long condition;
17446 struct task_struct *tsk = current;
17447+ unsigned long condition;
17448 siginfo_t info;
17449
17450 trace_hardirqs_fixup();
17451@@ -926,21 +917,19 @@ asmlinkage void __kprobes do_debug(struc
17452
17453 /* Mask out spurious debug traps due to lazy DR7 setting */
17454 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
17455- if (!tsk->thread.debugreg7) {
17456+ if (!tsk->thread.debugreg7)
17457 goto clear_dr7;
17458- }
17459 }
17460
17461 tsk->thread.debugreg6 = condition;
17462
17463-
17464 /*
17465 * Single-stepping through TF: make sure we ignore any events in
17466 * kernel space (but re-enable TF when returning to user mode).
17467 */
17468 if (condition & DR_STEP) {
17469- if (!user_mode(regs))
17470- goto clear_TF_reenable;
17471+ if (!user_mode(regs))
17472+ goto clear_TF_reenable;
17473 }
17474
17475 /* Ok, finally something we can handle */
17476@@ -953,7 +942,7 @@ asmlinkage void __kprobes do_debug(struc
17477 force_sig_info(SIGTRAP, &info, tsk);
17478
17479 clear_dr7:
17480- set_debugreg(0UL, 7);
17481+ set_debugreg(0, 7);
17482 preempt_conditional_cli(regs);
17483 return;
17484
17485@@ -961,6 +950,7 @@ clear_TF_reenable:
17486 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
17487 regs->flags &= ~X86_EFLAGS_TF;
17488 preempt_conditional_cli(regs);
17489+ return;
17490 }
17491
17492 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
17493@@ -983,7 +973,7 @@ static int kernel_math_error(struct pt_r
17494 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
17495 {
17496 void __user *ip = (void __user *)(regs->ip);
17497- struct task_struct * task;
17498+ struct task_struct *task;
17499 siginfo_t info;
17500 unsigned short cwd, swd;
17501
17502@@ -1016,30 +1006,30 @@ asmlinkage void do_coprocessor_error(str
17503 cwd = get_fpu_cwd(task);
17504 swd = get_fpu_swd(task);
17505 switch (swd & ~cwd & 0x3f) {
17506- case 0x000:
17507- default:
17508- break;
17509- case 0x001: /* Invalid Op */
17510- /*
17511- * swd & 0x240 == 0x040: Stack Underflow
17512- * swd & 0x240 == 0x240: Stack Overflow
17513- * User must clear the SF bit (0x40) if set
17514- */
17515- info.si_code = FPE_FLTINV;
17516- break;
17517- case 0x002: /* Denormalize */
17518- case 0x010: /* Underflow */
17519- info.si_code = FPE_FLTUND;
17520- break;
17521- case 0x004: /* Zero Divide */
17522- info.si_code = FPE_FLTDIV;
17523- break;
17524- case 0x008: /* Overflow */
17525- info.si_code = FPE_FLTOVF;
17526- break;
17527- case 0x020: /* Precision */
17528- info.si_code = FPE_FLTRES;
17529- break;
17530+ case 0x000: /* No unmasked exception */
17531+ default: /* Multiple exceptions */
17532+ break;
17533+ case 0x001: /* Invalid Op */
17534+ /*
17535+ * swd & 0x240 == 0x040: Stack Underflow
17536+ * swd & 0x240 == 0x240: Stack Overflow
17537+ * User must clear the SF bit (0x40) if set
17538+ */
17539+ info.si_code = FPE_FLTINV;
17540+ break;
17541+ case 0x002: /* Denormalize */
17542+ case 0x010: /* Underflow */
17543+ info.si_code = FPE_FLTUND;
17544+ break;
17545+ case 0x004: /* Zero Divide */
17546+ info.si_code = FPE_FLTDIV;
17547+ break;
17548+ case 0x008: /* Overflow */
17549+ info.si_code = FPE_FLTOVF;
17550+ break;
17551+ case 0x020: /* Precision */
17552+ info.si_code = FPE_FLTRES;
17553+ break;
17554 }
17555 force_sig_info(SIGFPE, &info, task);
17556 }
17557@@ -1052,7 +1042,7 @@ asmlinkage void bad_intr(void)
17558 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
17559 {
17560 void __user *ip = (void __user *)(regs->ip);
17561- struct task_struct * task;
17562+ struct task_struct *task;
17563 siginfo_t info;
17564 unsigned short mxcsr;
17565
17566@@ -1080,25 +1070,25 @@ asmlinkage void do_simd_coprocessor_erro
17567 */
17568 mxcsr = get_fpu_mxcsr(task);
17569 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
17570- case 0x000:
17571- default:
17572- break;
17573- case 0x001: /* Invalid Op */
17574- info.si_code = FPE_FLTINV;
17575- break;
17576- case 0x002: /* Denormalize */
17577- case 0x010: /* Underflow */
17578- info.si_code = FPE_FLTUND;
17579- break;
17580- case 0x004: /* Zero Divide */
17581- info.si_code = FPE_FLTDIV;
17582- break;
17583- case 0x008: /* Overflow */
17584- info.si_code = FPE_FLTOVF;
17585- break;
17586- case 0x020: /* Precision */
17587- info.si_code = FPE_FLTRES;
17588- break;
17589+ case 0x000:
17590+ default:
17591+ break;
17592+ case 0x001: /* Invalid Op */
17593+ info.si_code = FPE_FLTINV;
17594+ break;
17595+ case 0x002: /* Denormalize */
17596+ case 0x010: /* Underflow */
17597+ info.si_code = FPE_FLTUND;
17598+ break;
17599+ case 0x004: /* Zero Divide */
17600+ info.si_code = FPE_FLTDIV;
17601+ break;
17602+ case 0x008: /* Overflow */
17603+ info.si_code = FPE_FLTOVF;
17604+ break;
17605+ case 0x020: /* Precision */
17606+ info.si_code = FPE_FLTRES;
17607+ break;
17608 }
17609 force_sig_info(SIGFPE, &info, task);
17610 }
17611@@ -1118,7 +1108,7 @@ asmlinkage void __attribute__((weak)) mc
17612 }
17613
17614 /*
17615- * 'math_state_restore()' saves the current math information in the
17616+ * 'math_state_restore()' saves the current math information in the
17617 * old math state array, and gets the new ones from the current task
17618 *
17619 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
17620@@ -1145,7 +1135,14 @@ asmlinkage void math_state_restore(void)
17621
17622 /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
17623
17624- restore_fpu_checking(&me->thread.xstate->fxsave);
17625+ /*
17626+ * Paranoid restore. send a SIGSEGV if we fail to restore the state.
17627+ */
17628+ if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
17629+ stts();
17630+ force_sig(SIGSEGV, me);
17631+ return;
17632+ }
17633 task_thread_info(me)->status |= TS_USEDFPU;
17634 me->fpu_counter++;
17635 }
17636@@ -1190,13 +1187,12 @@ void __init trap_init(void)
17637 ret = HYPERVISOR_set_trap_table(trap_table);
17638 if (ret)
17639 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
17640-
17641 /*
17642 * initialize the per thread extended state:
17643 */
17644- init_thread_xstate();
17645+ init_thread_xstate();
17646 /*
17647- * Should be a barrier for any external CPU state.
17648+ * Should be a barrier for any external CPU state:
17649 */
17650 cpu_init();
17651 }
17652@@ -1212,27 +1208,25 @@ void __cpuinit smp_trap_init(trap_info_t
17653 }
17654 }
17655
17656-
17657 static int __init oops_setup(char *s)
17658-{
17659+{
17660 if (!s)
17661 return -EINVAL;
17662 if (!strcmp(s, "panic"))
17663 panic_on_oops = 1;
17664 return 0;
17665-}
17666+}
17667 early_param("oops", oops_setup);
17668
17669 static int __init kstack_setup(char *s)
17670 {
17671 if (!s)
17672 return -EINVAL;
17673- kstack_depth_to_print = simple_strtoul(s,NULL,0);
17674+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
17675 return 0;
17676 }
17677 early_param("kstack", kstack_setup);
17678
17679-
17680 static int __init code_bytes_setup(char *s)
17681 {
17682 code_bytes = simple_strtoul(s, NULL, 0);
17683Index: head-2008-12-01/arch/x86/kernel/vsyscall_64-xen.c
17684===================================================================
17685--- head-2008-12-01.orig/arch/x86/kernel/vsyscall_64-xen.c 2008-12-01 11:44:55.000000000 +0100
17686+++ head-2008-12-01/arch/x86/kernel/vsyscall_64-xen.c 2008-12-01 11:49:07.000000000 +0100
17687@@ -42,7 +42,8 @@
17688 #include <asm/topology.h>
17689 #include <asm/vgtod.h>
17690
17691-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
17692+#define __vsyscall(nr) \
17693+ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
17694 #define __syscall_clobber "r11","cx","memory"
17695
17696 /*
17697@@ -264,10 +265,7 @@ static void __cpuinit vsyscall_set_cpu(i
17698 d |= cpu;
17699 d |= (node & 0xf) << 12;
17700 d |= (node >> 4) << 48;
17701- if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
17702- + GDT_ENTRY_PER_CPU),
17703- d))
17704- BUG();
17705+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
17706 }
17707
17708 static void __cpuinit cpu_vsyscall_init(void *arg)
17709@@ -281,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_bl
17710 {
17711 long cpu = (long)arg;
17712 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
17713- smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
17714+ smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
17715 return NOTIFY_DONE;
17716 }
17717
17718@@ -311,7 +309,7 @@ static int __init vsyscall_init(void)
17719 #ifdef CONFIG_SYSCTL
17720 register_sysctl_table(kernel_root_table2);
17721 #endif
17722- on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
17723+ on_each_cpu(cpu_vsyscall_init, NULL, 1);
17724 hotcpu_notifier(cpu_vsyscall_notifier, 0);
17725 return 0;
17726 }
17727Index: head-2008-12-01/arch/x86/mach-xen/setup.c
17728===================================================================
17729--- head-2008-12-01.orig/arch/x86/mach-xen/setup.c 2008-12-01 11:37:10.000000000 +0100
17730+++ head-2008-12-01/arch/x86/mach-xen/setup.c 2008-12-01 11:49:07.000000000 +0100
17731@@ -17,6 +17,8 @@
17732 #include <xen/interface/callback.h>
17733 #include <xen/interface/memory.h>
17734
17735+#ifdef CONFIG_X86_32
17736+
17737 #ifdef CONFIG_HOTPLUG_CPU
17738 #define DEFAULT_SEND_IPI (1)
17739 #else
17740@@ -44,51 +46,6 @@ static int __init print_ipi_mode(void)
17741
17742 late_initcall(print_ipi_mode);
17743
17744-/**
17745- * machine_specific_memory_setup - Hook for machine specific memory setup.
17746- *
17747- * Description:
17748- * This is included late in kernel/setup.c so that it can make
17749- * use of all of the static functions.
17750- **/
17751-
17752-char * __init machine_specific_memory_setup(void)
17753-{
17754- int rc;
17755- struct xen_memory_map memmap;
17756- /*
17757- * This is rather large for a stack variable but this early in
17758- * the boot process we know we have plenty slack space.
17759- */
17760- struct e820entry map[E820MAX];
17761-
17762- memmap.nr_entries = E820MAX;
17763- set_xen_guest_handle(memmap.buffer, map);
17764-
17765- rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
17766- if ( rc == -ENOSYS ) {
17767- memmap.nr_entries = 1;
17768- map[0].addr = 0ULL;
17769- map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
17770- /* 8MB slack (to balance backend allocations). */
17771- map[0].size += 8ULL << 20;
17772- map[0].type = E820_RAM;
17773- rc = 0;
17774- }
17775- BUG_ON(rc);
17776-
17777- sanitize_e820_map(map, (char *)&memmap.nr_entries);
17778-
17779- BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
17780-
17781- return "Xen";
17782-}
17783-
17784-
17785-extern void hypervisor_callback(void);
17786-extern void failsafe_callback(void);
17787-extern void nmi(void);
17788-
17789 unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
17790 EXPORT_SYMBOL(machine_to_phys_mapping);
17791 unsigned int machine_to_phys_order;
17792@@ -121,33 +78,66 @@ void __init pre_setup_arch_hook(void)
17793 (unsigned long *)xen_start_info->mfn_list;
17794 }
17795
17796+#endif /* CONFIG_X86_32 */
17797+
17798+extern void hypervisor_callback(void);
17799+extern void failsafe_callback(void);
17800+extern void nmi(void);
17801+
17802+#ifdef CONFIG_X86_64
17803+#include <asm/proto.h>
17804+#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
17805+#else
17806+#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
17807+#endif
17808+
17809 void __init machine_specific_arch_setup(void)
17810 {
17811 int ret;
17812 static struct callback_register __initdata event = {
17813 .type = CALLBACKTYPE_event,
17814- .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
17815+ .address = CALLBACK_ADDR(hypervisor_callback)
17816 };
17817 static struct callback_register __initdata failsafe = {
17818 .type = CALLBACKTYPE_failsafe,
17819- .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
17820+ .address = CALLBACK_ADDR(failsafe_callback)
17821+ };
17822+#ifdef CONFIG_X86_64
17823+ static struct callback_register __initdata syscall = {
17824+ .type = CALLBACKTYPE_syscall,
17825+ .address = CALLBACK_ADDR(system_call)
17826 };
17827+#endif
17828+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17829 static struct callback_register __initdata nmi_cb = {
17830 .type = CALLBACKTYPE_nmi,
17831- .address = { __KERNEL_CS, (unsigned long)nmi },
17832+ .address = CALLBACK_ADDR(nmi)
17833 };
17834+#endif
17835
17836 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
17837 if (ret == 0)
17838 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
17839+#ifdef CONFIG_X86_64
17840+ if (ret == 0)
17841+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
17842+#endif
17843 #if CONFIG_XEN_COMPAT <= 0x030002
17844+#ifdef CONFIG_X86_32
17845 if (ret == -ENOSYS)
17846 ret = HYPERVISOR_set_callbacks(
17847 event.address.cs, event.address.eip,
17848 failsafe.address.cs, failsafe.address.eip);
17849+#else
17850+ ret = HYPERVISOR_set_callbacks(
17851+ event.address,
17852+ failsafe.address,
17853+ syscall.address);
17854+#endif
17855 #endif
17856 BUG_ON(ret);
17857
17858+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17859 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
17860 #if CONFIG_XEN_COMPAT <= 0x030002
17861 if (ret == -ENOSYS) {
17862@@ -158,15 +148,43 @@ void __init machine_specific_arch_setup(
17863 HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
17864 }
17865 #endif
17866+#endif
17867
17868+#ifdef CONFIG_X86_32
17869 /* Do an early initialization of the fixmap area */
17870 {
17871 extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
17872 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
17873 pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
17874 pmd_t *pmd = pmd_offset(pud, addr);
17875+ unsigned int i;
17876
17877 make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
17878 set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
17879+
17880+#define __FIXADDR_TOP (-PAGE_SIZE)
17881+#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
17882+ != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
17883+ FIX_BUG_ON(SHARED_INFO);
17884+ FIX_BUG_ON(ISAMAP_BEGIN);
17885+ FIX_BUG_ON(ISAMAP_END);
17886+#undef __FIXADDR_TOP
17887+ BUG_ON(pte_index(hypervisor_virt_start));
17888+
17889+ /* Switch to the real shared_info page, and clear the
17890+ * dummy page. */
17891+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
17892+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
17893+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
17894+
17895+ /* Setup mapping of lower 1st MB */
17896+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
17897+ if (is_initial_xendomain())
17898+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
17899+ else
17900+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
17901+ virt_to_machine(empty_zero_page),
17902+ PAGE_KERNEL_RO);
17903 }
17904+#endif
17905 }
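
The setup-arch hunk above registers the event, failsafe, (on 64-bit) syscall and NMI callbacks through CALLBACKOP_register and only falls back to the legacy HYPERVISOR_set_callbacks interface when a pre-3.0.3 hypervisor rejects the new op with -ENOSYS. The following user-space mock is illustrative only, not part of the patch: the stand-in functions replace the real hypercalls, and it reproduces just the 32-bit flavour of that register-then-fall-back flow.

    /*
     * Illustrative only -- NOT part of the patch.  User-space mock of the
     * 32-bit register-then-fall-back flow shown in the hunk above; the
     * stand-in functions below replace the real Xen hypercalls.
     */
    #include <assert.h>
    #include <errno.h>
    #include <stdio.h>

    static int hypervisor_is_old = 1;   /* pretend we run on pre-3.0.3 Xen */

    static int callback_op(unsigned long addr)      /* CALLBACKOP_register */
    {
            (void)addr;
            return hypervisor_is_old ? -ENOSYS : 0; /* new op not supported */
    }

    static int set_callbacks(unsigned long event, unsigned long failsafe)
    {                                               /* legacy 3.0.2 interface */
            printf("legacy set_callbacks(%#lx, %#lx)\n", event, failsafe);
            return 0;
    }

    static void hypervisor_callback(void) { }
    static void failsafe_callback(void) { }

    int main(void)
    {
            int ret = callback_op((unsigned long)hypervisor_callback);
            if (ret == 0)
                    ret = callback_op((unsigned long)failsafe_callback);
            if (ret == -ENOSYS)         /* CONFIG_XEN_COMPAT <= 0x030002 path */
                    ret = set_callbacks((unsigned long)hypervisor_callback,
                                        (unsigned long)failsafe_callback);
            assert(ret == 0);           /* the kernel BUG()s on failure */
            return 0;
    }
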
17906Index: head-2008-12-01/arch/x86/mm/fault-xen.c
17907===================================================================
17908--- head-2008-12-01.orig/arch/x86/mm/fault-xen.c 2008-12-01 11:44:55.000000000 +0100
17909+++ head-2008-12-01/arch/x86/mm/fault-xen.c 2008-12-01 11:49:07.000000000 +0100
17910@@ -10,6 +10,7 @@
17911 #include <linux/string.h>
17912 #include <linux/types.h>
17913 #include <linux/ptrace.h>
17914+#include <linux/mmiotrace.h>
17915 #include <linux/mman.h>
17916 #include <linux/mm.h>
17917 #include <linux/smp.h>
17918@@ -49,17 +50,23 @@
17919 #define PF_RSVD (1<<3)
17920 #define PF_INSTR (1<<4)
17921
17922+static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
17923+{
17924+#ifdef CONFIG_MMIOTRACE_HOOKS
17925+ if (unlikely(is_kmmio_active()))
17926+ if (kmmio_handler(regs, addr) == 1)
17927+ return -1;
17928+#endif
17929+ return 0;
17930+}
17931+
17932 static inline int notify_page_fault(struct pt_regs *regs)
17933 {
17934 #ifdef CONFIG_KPROBES
17935 int ret = 0;
17936
17937 /* kprobe_running() needs smp_processor_id() */
17938-#ifdef CONFIG_X86_32
17939 if (!user_mode_vm(regs)) {
17940-#else
17941- if (!user_mode(regs)) {
17942-#endif
17943 preempt_disable();
17944 if (kprobe_running() && kprobe_fault_handler(regs, 14))
17945 ret = 1;
17946@@ -409,11 +416,7 @@ static void show_fault_oops(struct pt_re
17947 printk(KERN_CONT "NULL pointer dereference");
17948 else
17949 printk(KERN_CONT "paging request");
17950-#ifdef CONFIG_X86_32
17951- printk(KERN_CONT " at %08lx\n", address);
17952-#else
17953- printk(KERN_CONT " at %016lx\n", address);
17954-#endif
17955+ printk(KERN_CONT " at %p\n", (void *) address);
17956 printk(KERN_ALERT "IP:");
17957 printk_address(regs->ip, 1);
17958 dump_pagetable(address);
17959@@ -628,6 +631,8 @@ void __kprobes do_page_fault(struct pt_r
17960
17961 if (notify_page_fault(regs))
17962 return;
17963+ if (unlikely(kmmio_fault(regs, address)))
17964+ return;
17965
17966 /*
17967 * We fault-in kernel-space virtual memory on-demand. The
17968@@ -832,14 +837,10 @@ bad_area_nosemaphore:
17969 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17970 printk_ratelimit()) {
17971 printk(
17972-#ifdef CONFIG_X86_32
17973- "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
17974-#else
17975- "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
17976-#endif
17977+ "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
17978 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
17979- tsk->comm, task_pid_nr(tsk), address, regs->ip,
17980- regs->sp, error_code);
17981+ tsk->comm, task_pid_nr(tsk), address,
17982+ (void *) regs->ip, (void *) regs->sp, error_code);
17983 print_vma_addr(" in ", regs->ip);
17984 printk("\n");
17985 }
17986@@ -947,81 +948,45 @@ LIST_HEAD(pgd_list);
17987 void vmalloc_sync_all(void)
17988 {
17989 #ifdef CONFIG_X86_32
17990- /*
17991- * Note that races in the updates of insync and start aren't
17992- * problematic: insync can only get set bits added, and updates to
17993- * start are only improving performance (without affecting correctness
17994- * if undone).
17995- * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
17996- * This change works just fine with 2-level paging too.
17997- */
17998-#define sync_index(a) ((a) >> PMD_SHIFT)
17999- static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
18000- static unsigned long start = TASK_SIZE;
18001- unsigned long address;
18002+ unsigned long address = VMALLOC_START & PGDIR_MASK;
18003
18004 if (SHARED_KERNEL_PMD)
18005 return;
18006
18007 BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
18008- for (address = start;
18009- address < hypervisor_virt_start;
18010- address += PMD_SIZE) {
18011- if (!test_bit(sync_index(address), insync)) {
18012- unsigned long flags;
18013- struct page *page;
18014-
18015- spin_lock_irqsave(&pgd_lock, flags);
18016- /* XEN: failure path assumes non-empty pgd_list. */
18017- if (unlikely(list_empty(&pgd_list))) {
18018- spin_unlock_irqrestore(&pgd_lock, flags);
18019- return;
18020- }
18021- list_for_each_entry(page, &pgd_list, lru) {
18022- if (!vmalloc_sync_one(page_address(page),
18023- address))
18024- break;
18025- }
18026- spin_unlock_irqrestore(&pgd_lock, flags);
18027- if (!page)
18028- set_bit(sync_index(address), insync);
18029+ for (; address < hypervisor_virt_start; address += PMD_SIZE) {
18030+ unsigned long flags;
18031+ struct page *page;
18032+
18033+ spin_lock_irqsave(&pgd_lock, flags);
18034+ list_for_each_entry(page, &pgd_list, lru) {
18035+ if (!vmalloc_sync_one(page_address(page),
18036+ address))
18037+ break;
18038 }
18039- if (address == start && test_bit(sync_index(address), insync))
18040- start = address + PMD_SIZE;
18041+ spin_unlock_irqrestore(&pgd_lock, flags);
18042 }
18043 #else /* CONFIG_X86_64 */
18044- /*
18045- * Note that races in the updates of insync and start aren't
18046- * problematic: insync can only get set bits added, and updates to
18047- * start are only improving performance (without affecting correctness
18048- * if undone).
18049- */
18050- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
18051- static unsigned long start = VMALLOC_START & PGDIR_MASK;
18052+ unsigned long start = VMALLOC_START & PGDIR_MASK;
18053 unsigned long address;
18054
18055 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
18056- if (!test_bit(pgd_index(address), insync)) {
18057- const pgd_t *pgd_ref = pgd_offset_k(address);
18058- unsigned long flags;
18059- struct page *page;
18060-
18061- if (pgd_none(*pgd_ref))
18062- continue;
18063- spin_lock_irqsave(&pgd_lock, flags);
18064- list_for_each_entry(page, &pgd_list, lru) {
18065- pgd_t *pgd;
18066- pgd = (pgd_t *)page_address(page) + pgd_index(address);
18067- if (pgd_none(*pgd))
18068- set_pgd(pgd, *pgd_ref);
18069- else
18070- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18071- }
18072- spin_unlock_irqrestore(&pgd_lock, flags);
18073- set_bit(pgd_index(address), insync);
18074+ const pgd_t *pgd_ref = pgd_offset_k(address);
18075+ unsigned long flags;
18076+ struct page *page;
18077+
18078+ if (pgd_none(*pgd_ref))
18079+ continue;
18080+ spin_lock_irqsave(&pgd_lock, flags);
18081+ list_for_each_entry(page, &pgd_list, lru) {
18082+ pgd_t *pgd;
18083+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
18084+ if (pgd_none(*pgd))
18085+ set_pgd(pgd, *pgd_ref);
18086+ else
18087+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18088 }
18089- if (address == start)
18090- start = address + PGDIR_SIZE;
18091+ spin_unlock_irqrestore(&pgd_lock, flags);
18092 }
18093 #endif
18094 }
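
The fault-xen.c hunk above adds the kmmio/mmiotrace hook to do_page_fault() and strips vmalloc_sync_all() of its 'insync' bitmap: the function now simply walks the whole vmalloc range in PMD/PGDIR strides on every call and copies any missing reference entry into each pgd on pgd_list. Below is a toy model of that simplified sync loop, illustrative only, with plain arrays standing in for page tables and for pgd_list.

    /*
     * Illustrative only -- NOT part of the patch.  A user-space model of the
     * simplified vmalloc_sync_all() above: each stride of the vmalloc range
     * is pushed into every per-process "pgd" that lacks it; the 'insync'
     * bitmap removed by the hunk no longer short-circuits anything.
     */
    #include <assert.h>
    #include <stdio.h>

    #define ENTRIES 8                 /* stand-in for PTRS_PER_PGD        */
    #define NR_MMS  3                 /* stand-in for the pgd_list length */

    static unsigned long ref_pgd[ENTRIES];        /* init_mm's entries    */
    static unsigned long pgds[NR_MMS][ENTRIES];   /* per-process copies   */

    static void vmalloc_sync_all_mock(void)
    {
            for (int idx = 0; idx < ENTRIES; idx++) {    /* one stride each */
                    if (!ref_pgd[idx])
                            continue;                    /* pgd_none(*pgd_ref) */
                    for (int mm = 0; mm < NR_MMS; mm++) {
                            if (!pgds[mm][idx])
                                    pgds[mm][idx] = ref_pgd[idx];   /* set_pgd() */
                            else                         /* must already agree */
                                    assert(pgds[mm][idx] == ref_pgd[idx]);
                    }
            }
    }

    int main(void)
    {
            ref_pgd[2] = 0x1000;      /* kernel just mapped a new vmalloc area */
            vmalloc_sync_all_mock();
            printf("pgd[0][2] = %#lx\n", pgds[0][2]);    /* prints 0x1000 */
            return 0;
    }
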
18095Index: head-2008-12-01/arch/x86/mm/hypervisor.c
18096===================================================================
18097--- head-2008-12-01.orig/arch/x86/mm/hypervisor.c 2008-12-01 11:37:10.000000000 +0100
18098+++ head-2008-12-01/arch/x86/mm/hypervisor.c 2008-12-01 11:49:07.000000000 +0100
18099@@ -837,42 +837,9 @@ int write_ldt_entry(struct desc_struct *
18100 return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
18101 }
18102
18103-#define MAX_BATCHED_FULL_PTES 32
18104-
18105-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
18106- unsigned long addr, unsigned long end, pgprot_t newprot,
18107- int dirty_accountable)
18108+int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
18109+ int type)
18110 {
18111- int rc = 0, i = 0;
18112- mmu_update_t u[MAX_BATCHED_FULL_PTES];
18113- pte_t *pte;
18114- spinlock_t *ptl;
18115-
18116- if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
18117- return 0;
18118-
18119- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
18120- do {
18121- if (pte_present(*pte)) {
18122- pte_t ptent = pte_modify(*pte, newprot);
18123-
18124- if (dirty_accountable && pte_dirty(ptent))
18125- ptent = pte_mkwrite(ptent);
18126- u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
18127- | ((unsigned long)pte & ~PAGE_MASK)
18128- | MMU_PT_UPDATE_PRESERVE_AD;
18129- u[i].val = __pte_val(ptent);
18130- if (++i == MAX_BATCHED_FULL_PTES) {
18131- if ((rc = HYPERVISOR_mmu_update(
18132- &u[0], i, NULL, DOMID_SELF)) != 0)
18133- break;
18134- i = 0;
18135- }
18136- }
18137- } while (pte++, addr += PAGE_SIZE, addr != end);
18138- if (i)
18139- rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
18140- pte_unmap_unlock(pte - 1, ptl);
18141- BUG_ON(rc && rc != -ENOSYS);
18142- return !rc;
18143+ maddr_t mach_gp = virt_to_machine(gdt + entry);
18144+ return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
18145 }
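
The hypervisor.c hunk above drops the batched xen_change_pte_range() helper and adds write_gdt_entry(), which, like write_ldt_entry() before it, updates one 8-byte descriptor slot by handing its machine address to HYPERVISOR_update_descriptor(). The sketch below is illustrative only: a toy pfn-to-mfn table stands in for Xen's real virt-to-machine translation, and none of these helper names are kernel APIs.

    /*
     * Illustrative only -- NOT part of the patch.  Shows the idea behind
     * write_gdt_entry() above: compute the *machine* address of one GDT
     * slot, then ask the hypervisor to rewrite that 8-byte descriptor.
     * virt_to_machine_mock() and pfn_to_mfn[] are toys, not kernel code.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

    static const uint64_t pfn_to_mfn[4] = { 0x2a0, 0x117, 0x3f3, 0x001 };

    static uint64_t virt_to_machine_mock(uintptr_t va, uintptr_t va_base)
    {
            uintptr_t pa  = va - va_base;                   /* toy __pa()    */
            uint64_t  mfn = pfn_to_mfn[pa >> PAGE_SHIFT];   /* machine frame */
            return (mfn << PAGE_SHIFT) | (pa & ~PAGE_MASK); /* keep offset   */
    }

    static uint64_t gdt[4096 / 8];                          /* one GDT page  */

    int main(void)
    {
            int entry = 3;
            uint64_t desc  = 0x00cf9a000000ffffULL;         /* flat code seg */
            uint64_t maddr = virt_to_machine_mock((uintptr_t)&gdt[entry],
                                                  (uintptr_t)gdt);

            /* the kernel would now call HYPERVISOR_update_descriptor(maddr, desc) */
            printf("update_descriptor(maddr=%#llx, desc=%#llx)\n",
                   (unsigned long long)maddr, (unsigned long long)desc);
            return 0;
    }
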
18146Index: head-2008-12-01/arch/x86/mm/init_32-xen.c
18147===================================================================
18148--- head-2008-12-01.orig/arch/x86/mm/init_32-xen.c 2008-12-01 11:44:55.000000000 +0100
18149+++ head-2008-12-01/arch/x86/mm/init_32-xen.c 2008-12-01 11:49:07.000000000 +0100
18150@@ -54,6 +54,7 @@
18151
18152 unsigned int __VMALLOC_RESERVE = 128 << 20;
18153
18154+unsigned long max_low_pfn_mapped;
18155 unsigned long max_pfn_mapped;
18156
18157 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18158@@ -61,6 +62,27 @@ unsigned long highstart_pfn, highend_pfn
18159
18160 static noinline int do_test_wp_bit(void);
18161
18162+
18163+static unsigned long __initdata table_start;
18164+static unsigned long __initdata table_end;
18165+static unsigned long __initdata table_top;
18166+
18167+static int __initdata after_init_bootmem;
18168+
18169+static __init void *alloc_low_page(unsigned long *phys)
18170+{
18171+ unsigned long pfn = table_end++;
18172+ void *adr;
18173+
18174+ if (pfn >= table_top)
18175+ panic("alloc_low_page: ran out of memory");
18176+
18177+ adr = __va(pfn * PAGE_SIZE);
18178+ memset(adr, 0, PAGE_SIZE);
18179+ *phys = pfn * PAGE_SIZE;
18180+ return adr;
18181+}
18182+
18183 /*
18184 * Creates a middle page table and puts a pointer to it in the
18185 * given global directory entry. This only returns the gd entry
18186@@ -72,9 +94,12 @@ static pmd_t * __init one_md_table_init(
18187 pmd_t *pmd_table;
18188
18189 #ifdef CONFIG_X86_PAE
18190+ unsigned long phys;
18191 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
18192- pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18193-
18194+ if (after_init_bootmem)
18195+ pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18196+ else
18197+ pmd_table = (pmd_t *)alloc_low_page(&phys);
18198 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
18199 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
18200 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
18201@@ -101,12 +126,16 @@ static pte_t * __init one_page_table_ini
18202 #endif
18203 pte_t *page_table = NULL;
18204
18205+ if (after_init_bootmem) {
18206 #ifdef CONFIG_DEBUG_PAGEALLOC
18207- page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18208+ page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18209 #endif
18210- if (!page_table) {
18211- page_table =
18212+ if (!page_table)
18213+ page_table =
18214 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18215+ } else {
18216+ unsigned long phys;
18217+ page_table = (pte_t *)alloc_low_page(&phys);
18218 }
18219
18220 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
18221@@ -167,24 +196,24 @@ static inline int is_kernel_text(unsigne
18222 * of max_low_pfn pages, by creating page tables starting from address
18223 * PAGE_OFFSET:
18224 */
18225-static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
18226+static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
18227+ unsigned long start_pfn,
18228+ unsigned long end_pfn,
18229+ int use_pse)
18230 {
18231 int pgd_idx, pmd_idx, pte_ofs;
18232 unsigned long pfn;
18233 pgd_t *pgd;
18234 pmd_t *pmd;
18235 pte_t *pte;
18236+ unsigned pages_2m = 0, pages_4k = 0;
18237
18238- unsigned long max_ram_pfn = xen_start_info->nr_pages;
18239- if (max_ram_pfn > max_low_pfn)
18240- max_ram_pfn = max_low_pfn;
18241+ if (!cpu_has_pse)
18242+ use_pse = 0;
18243
18244- pgd_idx = pgd_index(PAGE_OFFSET);
18245+ pfn = start_pfn;
18246+ pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18247 pgd = pgd_base + pgd_idx;
18248- pfn = 0;
18249- pmd_idx = pmd_index(PAGE_OFFSET);
18250- pte_ofs = pte_index(PAGE_OFFSET);
18251-
18252 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
18253 #ifdef CONFIG_XEN
18254 /*
18255@@ -198,10 +227,16 @@ static void __init kernel_physical_mappi
18256 #else
18257 pmd = one_md_table_init(pgd);
18258 #endif
18259- if (pfn >= max_low_pfn)
18260+
18261+ if (pfn >= end_pfn)
18262 continue;
18263+#ifdef CONFIG_X86_PAE
18264+ pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18265 pmd += pmd_idx;
18266- for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
18267+#else
18268+ pmd_idx = 0;
18269+#endif
18270+ for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
18271 pmd++, pmd_idx++) {
18272 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
18273
18274@@ -211,13 +246,8 @@ static void __init kernel_physical_mappi
18275 /*
18276 * Map with big pages if possible, otherwise
18277 * create normal page tables:
18278- *
18279- * Don't use a large page for the first 2/4MB of memory
18280- * because there are often fixed size MTRRs in there
18281- * and overlapping MTRRs into large pages can cause
18282- * slowdowns.
18283 */
18284- if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
18285+ if (use_pse) {
18286 unsigned int addr2;
18287 pgprot_t prot = PAGE_KERNEL_LARGE;
18288
18289@@ -228,49 +258,35 @@ static void __init kernel_physical_mappi
18290 is_kernel_text(addr2))
18291 prot = PAGE_KERNEL_LARGE_EXEC;
18292
18293+ pages_2m++;
18294 set_pmd(pmd, pfn_pmd(pfn, prot));
18295
18296 pfn += PTRS_PER_PTE;
18297- max_pfn_mapped = pfn;
18298 continue;
18299 }
18300 pte = one_page_table_init(pmd);
18301
18302- for (pte += pte_ofs;
18303- pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
18304+ pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18305+ pte += pte_ofs;
18306+ for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
18307 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
18308 pgprot_t prot = PAGE_KERNEL;
18309
18310 /* XEN: Only map initial RAM allocation. */
18311- if ((pfn >= max_ram_pfn) || pte_present(*pte))
18312+ if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
18313 continue;
18314 if (is_kernel_text(addr))
18315 prot = PAGE_KERNEL_EXEC;
18316
18317+ pages_4k++;
18318 set_pte(pte, pfn_pte(pfn, prot));
18319 }
18320- max_pfn_mapped = pfn;
18321- pte_ofs = 0;
18322 }
18323- pmd_idx = 0;
18324 }
18325+ update_page_count(PG_LEVEL_2M, pages_2m);
18326+ update_page_count(PG_LEVEL_4K, pages_4k);
18327 }
18328
18329-#ifndef CONFIG_XEN
18330-
18331-static inline int page_kills_ppro(unsigned long pagenr)
18332-{
18333- if (pagenr >= 0x70000 && pagenr <= 0x7003F)
18334- return 1;
18335- return 0;
18336-}
18337-
18338-#else
18339-
18340-#define page_kills_ppro(p) 0
18341-
18342-#endif
18343-
18344 /*
18345 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
18346 * is valid. The argument is a physical page number.
18347@@ -331,30 +347,63 @@ static void __init permanent_kmaps_init(
18348 pkmap_page_table = pte;
18349 }
18350
18351-void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
18352+static void __init add_one_highpage_init(struct page *page, int pfn)
18353 {
18354- if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
18355- ClearPageReserved(page);
18356- init_page_count(page);
18357- if (pfn < xen_start_info->nr_pages)
18358- __free_page(page);
18359- totalhigh_pages++;
18360- } else
18361- SetPageReserved(page);
18362+ ClearPageReserved(page);
18363+ init_page_count(page);
18364+ if (pfn < xen_start_info->nr_pages)
18365+ __free_page(page);
18366+ totalhigh_pages++;
18367+}
18368+
18369+struct add_highpages_data {
18370+ unsigned long start_pfn;
18371+ unsigned long end_pfn;
18372+};
18373+
18374+static int __init add_highpages_work_fn(unsigned long start_pfn,
18375+ unsigned long end_pfn, void *datax)
18376+{
18377+ int node_pfn;
18378+ struct page *page;
18379+ unsigned long final_start_pfn, final_end_pfn;
18380+ struct add_highpages_data *data;
18381+
18382+ data = (struct add_highpages_data *)datax;
18383+
18384+ final_start_pfn = max(start_pfn, data->start_pfn);
18385+ final_end_pfn = min(end_pfn, data->end_pfn);
18386+ if (final_start_pfn >= final_end_pfn)
18387+ return 0;
18388+
18389+ for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
18390+ node_pfn++) {
18391+ if (!pfn_valid(node_pfn))
18392+ continue;
18393+ page = pfn_to_page(node_pfn);
18394+ add_one_highpage_init(page, node_pfn);
18395+ }
18396+
18397+ return 0;
18398+
18399+}
18400+
18401+void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
18402+ unsigned long end_pfn)
18403+{
18404+ struct add_highpages_data data;
18405+
18406+ data.start_pfn = start_pfn;
18407+ data.end_pfn = end_pfn;
18408+
18409+ work_with_active_regions(nid, add_highpages_work_fn, &data);
18410 }
18411
18412 #ifndef CONFIG_NUMA
18413-static void __init set_highmem_pages_init(int bad_ppro)
18414+static void __init set_highmem_pages_init(void)
18415 {
18416- int pfn;
18417+ add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
18418
18419- for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
18420- /*
18421- * Holes under sparsemem might not have no mem_map[]:
18422- */
18423- if (pfn_valid(pfn))
18424- add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
18425- }
18426 totalram_pages += totalhigh_pages;
18427 }
18428 #endif /* !CONFIG_NUMA */
18429@@ -362,24 +411,11 @@ static void __init set_highmem_pages_ini
18430 #else
18431 # define kmap_init() do { } while (0)
18432 # define permanent_kmaps_init(pgd_base) do { } while (0)
18433-# define set_highmem_pages_init(bad_ppro) do { } while (0)
18434+# define set_highmem_pages_init() do { } while (0)
18435 #endif /* CONFIG_HIGHMEM */
18436
18437-pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
18438-EXPORT_SYMBOL(__PAGE_KERNEL);
18439-
18440-pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
18441-
18442 pgd_t *swapper_pg_dir;
18443
18444-static void __init xen_pagetable_setup_start(pgd_t *base)
18445-{
18446-}
18447-
18448-static void __init xen_pagetable_setup_done(pgd_t *base)
18449-{
18450-}
18451-
18452 /*
18453 * Build a proper pagetable for the kernel mappings. Up until this
18454 * point, we've been running on some set of pagetables constructed by
18455@@ -399,27 +435,10 @@ static void __init xen_pagetable_setup_d
18456 * be partially populated, and so it avoids stomping on any existing
18457 * mappings.
18458 */
18459-static void __init pagetable_init(void)
18460+static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
18461 {
18462- pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18463 unsigned long vaddr, end;
18464
18465- xen_pagetable_setup_start(pgd_base);
18466-
18467- /* Enable PSE if available */
18468- if (cpu_has_pse)
18469- set_in_cr4(X86_CR4_PSE);
18470-
18471- /* Enable PGE if available */
18472- if (cpu_has_pge) {
18473- set_in_cr4(X86_CR4_PGE);
18474- __PAGE_KERNEL |= _PAGE_GLOBAL;
18475- __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
18476- }
18477-
18478- kernel_physical_mapping_init(pgd_base);
18479- remap_numa_kva();
18480-
18481 /*
18482 * Fixed mappings, only the page table structure has to be
18483 * created - mappings will be set by set_fixmap():
18484@@ -429,10 +448,13 @@ static void __init pagetable_init(void)
18485 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
18486 page_table_range_init(vaddr, end, pgd_base);
18487 early_ioremap_reset();
18488+}
18489
18490- permanent_kmaps_init(pgd_base);
18491+static void __init pagetable_init(void)
18492+{
18493+ pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18494
18495- xen_pagetable_setup_done(pgd_base);
18496+ permanent_kmaps_init(pgd_base);
18497 }
18498
18499 #if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
18500@@ -475,7 +497,7 @@ void zap_low_mappings(void)
18501
18502 int nx_enabled;
18503
18504-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
18505+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
18506 EXPORT_SYMBOL_GPL(__supported_pte_mask);
18507
18508 #ifdef CONFIG_X86_PAE
18509@@ -528,42 +550,369 @@ static void __init set_nx(void)
18510 }
18511 #endif
18512
18513+/* user-defined highmem size */
18514+static unsigned int highmem_pages = -1;
18515+
18516 /*
18517- * paging_init() sets up the page tables - note that the first 8MB are
18518- * already mapped by head.S.
18519- *
18520- * This routines also unmaps the page at virtual kernel address 0, so
18521- * that we can trap those pesky NULL-reference errors in the kernel.
18522+ * highmem=size forces highmem to be exactly 'size' bytes.
18523+ * This works even on boxes that have no highmem otherwise.
18524+ * This also works to reduce highmem size on bigger boxes.
18525 */
18526-void __init paging_init(void)
18527+static int __init parse_highmem(char *arg)
18528+{
18529+ if (!arg)
18530+ return -EINVAL;
18531+
18532+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
18533+ return 0;
18534+}
18535+early_param("highmem", parse_highmem);
18536+
18537+/*
18538+ * Determine low and high memory ranges:
18539+ */
18540+void __init find_low_pfn_range(void)
18541+{
18542+ /* it could update max_pfn */
18543+
18544+ /* max_low_pfn is 0, we already have early_res support */
18545+
18546+ max_low_pfn = max_pfn;
18547+ if (max_low_pfn > MAXMEM_PFN) {
18548+ if (highmem_pages == -1)
18549+ highmem_pages = max_pfn - MAXMEM_PFN;
18550+ if (highmem_pages + MAXMEM_PFN < max_pfn)
18551+ max_pfn = MAXMEM_PFN + highmem_pages;
18552+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
18553+ printk(KERN_WARNING "only %luMB highmem pages "
18554+ "available, ignoring highmem size of %uMB.\n",
18555+ pages_to_mb(max_pfn - MAXMEM_PFN),
18556+ pages_to_mb(highmem_pages));
18557+ highmem_pages = 0;
18558+ }
18559+ max_low_pfn = MAXMEM_PFN;
18560+#ifndef CONFIG_HIGHMEM
18561+ /* Maximum memory usable is what is directly addressable */
18562+ printk(KERN_WARNING "Warning only %ldMB will be used.\n",
18563+ MAXMEM>>20);
18564+ if (max_pfn > MAX_NONPAE_PFN)
18565+ printk(KERN_WARNING
18566+ "Use a HIGHMEM64G enabled kernel.\n");
18567+ else
18568+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
18569+ max_pfn = MAXMEM_PFN;
18570+#else /* !CONFIG_HIGHMEM */
18571+#ifndef CONFIG_HIGHMEM64G
18572+ if (max_pfn > MAX_NONPAE_PFN) {
18573+ max_pfn = MAX_NONPAE_PFN;
18574+ printk(KERN_WARNING "Warning only 4GB will be used."
18575+ "Use a HIGHMEM64G enabled kernel.\n");
18576+ }
18577+#endif /* !CONFIG_HIGHMEM64G */
18578+#endif /* !CONFIG_HIGHMEM */
18579+ } else {
18580+ if (highmem_pages == -1)
18581+ highmem_pages = 0;
18582+#ifdef CONFIG_HIGHMEM
18583+ if (highmem_pages >= max_pfn) {
18584+ printk(KERN_ERR "highmem size specified (%uMB) is "
18585+ "bigger than pages available (%luMB)!.\n",
18586+ pages_to_mb(highmem_pages),
18587+ pages_to_mb(max_pfn));
18588+ highmem_pages = 0;
18589+ }
18590+ if (highmem_pages) {
18591+ if (max_low_pfn - highmem_pages <
18592+ 64*1024*1024/PAGE_SIZE){
18593+ printk(KERN_ERR "highmem size %uMB results in "
18594+ "smaller than 64MB lowmem, ignoring it.\n"
18595+ , pages_to_mb(highmem_pages));
18596+ highmem_pages = 0;
18597+ }
18598+ max_low_pfn -= highmem_pages;
18599+ }
18600+#else
18601+ if (highmem_pages)
18602+ printk(KERN_ERR "ignoring highmem size on non-highmem"
18603+ " kernel!\n");
18604+#endif
18605+ }
18606+}
18607+
18608+#ifndef CONFIG_NEED_MULTIPLE_NODES
18609+void __init initmem_init(unsigned long start_pfn,
18610+ unsigned long end_pfn)
18611+{
18612+#ifdef CONFIG_HIGHMEM
18613+ highstart_pfn = highend_pfn = max_pfn;
18614+ if (max_pfn > max_low_pfn)
18615+ highstart_pfn = max_low_pfn;
18616+ memory_present(0, 0, highend_pfn);
18617+ e820_register_active_regions(0, 0, highend_pfn);
18618+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
18619+ pages_to_mb(highend_pfn - highstart_pfn));
18620+ num_physpages = highend_pfn;
18621+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
18622+#else
18623+ memory_present(0, 0, max_low_pfn);
18624+ e820_register_active_regions(0, 0, max_low_pfn);
18625+ num_physpages = max_low_pfn;
18626+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
18627+#endif
18628+#ifdef CONFIG_FLATMEM
18629+ max_mapnr = num_physpages;
18630+#endif
18631+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
18632+ pages_to_mb(max_low_pfn));
18633+
18634+ setup_bootmem_allocator();
18635+}
18636+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
18637+
18638+static void __init zone_sizes_init(void)
18639+{
18640+ unsigned long max_zone_pfns[MAX_NR_ZONES];
18641+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
18642+ max_zone_pfns[ZONE_DMA] =
18643+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
18644+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
18645+#ifdef CONFIG_HIGHMEM
18646+ max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
18647+#endif
18648+
18649+ free_area_init_nodes(max_zone_pfns);
18650+}
18651+
18652+void __init setup_bootmem_allocator(void)
18653 {
18654 int i;
18655+ unsigned long bootmap_size, bootmap;
18656+ unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
18657+
18658+ /*
18659+ * Initialize the boot-time allocator (with low memory only):
18660+ */
18661+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
18662+ bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
18663+ max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
18664+ PAGE_SIZE);
18665+ if (bootmap == -1L)
18666+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
18667+ reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
18668+
18669+ /* don't touch min_low_pfn */
18670+ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
18671+ min_low_pfn, end_pfn);
18672+ printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
18673+ max_pfn_mapped<<PAGE_SHIFT);
18674+ printk(KERN_INFO " low ram: %08lx - %08lx\n",
18675+ min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
18676+ printk(KERN_INFO " bootmap %08lx - %08lx\n",
18677+ bootmap, bootmap + bootmap_size);
18678+ for_each_online_node(i)
18679+ free_bootmem_with_active_regions(i, end_pfn);
18680+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
18681+
18682+ after_init_bootmem = 1;
18683+}
18684+
18685+static unsigned long __init extend_init_mapping(unsigned long tables_space)
18686+{
18687+ unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
18688+ + xen_start_info->nr_pt_frames;
18689+ unsigned long start = start_pfn, va;
18690+ pgd_t *pgd;
18691+ pud_t *pud;
18692+ pmd_t *pmd;
18693+ pte_t *pte;
18694+
18695+ /* Kill mapping of low 1MB. */
18696+ for (va = PAGE_OFFSET; va < (unsigned long)&_text; va += PAGE_SIZE)
18697+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
18698+ BUG();
18699+
18700+ /* Ensure init mappings cover kernel text/data and initial tables. */
18701+ while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
18702+ pgd = pgd_offset_k(va);
18703+ pud = pud_offset(pgd, va);
18704+ pmd = pmd_offset(pud, va);
18705+ if (pmd_none(*pmd)) {
18706+ unsigned long pa = start_pfn++ << PAGE_SHIFT;
18707+
18708+ memset(__va(pa), 0, PAGE_SIZE);
18709+ make_lowmem_page_readonly(__va(pa),
18710+ XENFEAT_writable_page_tables);
18711+ xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
18712+ }
18713+ pte = pte_offset_kernel(pmd, va);
18714+ if (pte_none(*pte)) {
18715+ pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
18716+
18717+ if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
18718+ BUG();
18719+ }
18720+ va += PAGE_SIZE;
18721+ }
18722+
18723+ /* Finally, blow away any spurious initial mappings. */
18724+ while (1) {
18725+ pgd = pgd_offset_k(va);
18726+ pud = pud_offset(pgd, va);
18727+ pmd = pmd_offset(pud, va);
18728+ if (pmd_none(*pmd))
18729+ break;
18730+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
18731+ BUG();
18732+ va += PAGE_SIZE;
18733+ }
18734+
18735+ if (start_pfn > start)
18736+ reserve_early(start << PAGE_SHIFT,
18737+ start_pfn << PAGE_SHIFT, "INITMAP");
18738+
18739+ return start_pfn;
18740+}
18741+
18742+static void __init find_early_table_space(unsigned long end)
18743+{
18744+ unsigned long puds, pmds, ptes, tables;
18745+
18746+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
18747+ tables = PAGE_ALIGN(puds * sizeof(pud_t));
18748+
18749+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
18750+ tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
18751+
18752+ if (cpu_has_pse) {
18753+ unsigned long extra;
18754+
18755+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
18756+ extra += PMD_SIZE;
18757+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
18758+ } else
18759+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
18760+
18761+ tables += PAGE_ALIGN(ptes * sizeof(pte_t));
18762+
18763+ /* for fixmap */
18764+ tables += PAGE_SIZE
18765+ * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK)
18766+ - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK))
18767+ >> PMD_SHIFT);
18768+
18769+ table_start = extend_init_mapping(tables);
18770+
18771+ table_end = table_start;
18772+ table_top = table_start + (tables>>PAGE_SHIFT);
18773+
18774+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
18775+ end, table_start << PAGE_SHIFT,
18776+ (table_start << PAGE_SHIFT) + tables);
18777+}
18778+
18779+unsigned long __init_refok init_memory_mapping(unsigned long start,
18780+ unsigned long end)
18781+{
18782+ pgd_t *pgd_base = swapper_pg_dir;
18783+ unsigned long start_pfn, end_pfn;
18784+ unsigned long big_page_start;
18785+
18786+ /*
18787+ * Find space for the kernel direct mapping tables.
18788+ */
18789+ if (!after_init_bootmem)
18790+ find_early_table_space(end);
18791
18792 #ifdef CONFIG_X86_PAE
18793 set_nx();
18794 if (nx_enabled)
18795 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
18796 #endif
18797+
18798+ /* Enable PSE if available */
18799+ if (cpu_has_pse)
18800+ set_in_cr4(X86_CR4_PSE);
18801+
18802+ /* Enable PGE if available */
18803+ if (cpu_has_pge) {
18804+ set_in_cr4(X86_CR4_PGE);
18805+ __supported_pte_mask |= _PAGE_GLOBAL;
18806+ }
18807+
18808+ /*
18809+ * Don't use a large page for the first 2/4MB of memory
18810+ * because there are often fixed size MTRRs in there
18811+ * and overlapping MTRRs into large pages can cause
18812+ * slowdowns.
18813+ */
18814+ big_page_start = PMD_SIZE;
18815+
18816+ if (start < big_page_start) {
18817+ start_pfn = start >> PAGE_SHIFT;
18818+ end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
18819+ } else {
18820+ /* head is not big page alignment ? */
18821+ start_pfn = start >> PAGE_SHIFT;
18822+ end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18823+ << (PMD_SHIFT - PAGE_SHIFT);
18824+ }
18825+ if (start_pfn < end_pfn)
18826+ kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
18827+
18828+ /* big page range */
18829+ start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18830+ << (PMD_SHIFT - PAGE_SHIFT);
18831+ if (start_pfn < (big_page_start >> PAGE_SHIFT))
18832+ start_pfn = big_page_start >> PAGE_SHIFT;
18833+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
18834+ if (start_pfn < end_pfn)
18835+ kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
18836+ cpu_has_pse);
18837+
18838+ /* tail is not big page alignment ? */
18839+ start_pfn = end_pfn;
18840+ if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
18841+ end_pfn = end >> PAGE_SHIFT;
18842+ if (start_pfn < end_pfn)
18843+ kernel_physical_mapping_init(pgd_base, start_pfn,
18844+ end_pfn, 0);
18845+ }
18846+
18847+ early_ioremap_page_table_range_init(pgd_base);
18848+
18849+ __flush_tlb_all();
18850+
18851+ if (!after_init_bootmem)
18852+ reserve_early(table_start << PAGE_SHIFT,
18853+ table_end << PAGE_SHIFT, "PGTABLE");
18854+
18855+ if (!after_init_bootmem)
18856+ early_memtest(start, end);
18857+
18858+ return end >> PAGE_SHIFT;
18859+}
18860+
18861+
18862+/*
18863+ * paging_init() sets up the page tables - note that the first 8MB are
18864+ * already mapped by head.S.
18865+ *
18866+ * This routines also unmaps the page at virtual kernel address 0, so
18867+ * that we can trap those pesky NULL-reference errors in the kernel.
18868+ */
18869+void __init paging_init(void)
18870+{
18871 pagetable_init();
18872
18873 __flush_tlb_all();
18874
18875 kmap_init();
18876
18877- /* Switch to the real shared_info page, and clear the
18878- * dummy page. */
18879- set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
18880- HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
18881- memset(empty_zero_page, 0, sizeof(empty_zero_page));
18882-
18883- /* Setup mapping of lower 1st MB */
18884- for (i = 0; i < NR_FIX_ISAMAPS; i++)
18885- if (is_initial_xendomain())
18886- set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
18887- else
18888- __set_fixmap(FIX_ISAMAP_BEGIN - i,
18889- virt_to_machine(empty_zero_page),
18890- PAGE_KERNEL_RO);
18891+ /*
18892+ * NOTE: at this point the bootmem allocator is fully available.
18893+ */
18894+ sparse_init();
18895+ zone_sizes_init();
18896 }
18897
18898 /*
18899@@ -598,7 +947,7 @@ static struct kcore_list kcore_mem, kcor
18900 void __init mem_init(void)
18901 {
18902 int codesize, reservedpages, datasize, initsize;
18903- int tmp, bad_ppro;
18904+ int tmp;
18905 unsigned long pfn;
18906
18907 pci_iommu_alloc();
18908@@ -606,19 +955,6 @@ void __init mem_init(void)
18909 #ifdef CONFIG_FLATMEM
18910 BUG_ON(!mem_map);
18911 #endif
18912- bad_ppro = ppro_with_ram_bug();
18913-
18914-#ifdef CONFIG_HIGHMEM
18915- /* check that fixmap and pkmap do not overlap */
18916- if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
18917- printk(KERN_ERR
18918- "fixmap and kmap areas overlap - this will crash\n");
18919- printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
18920- PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
18921- FIXADDR_START);
18922- BUG();
18923- }
18924-#endif
18925 /* this will put all low memory onto the freelists */
18926 totalram_pages += free_all_bootmem();
18927 /* XEN: init and count low-mem pages outside initial allocation. */
18928@@ -636,7 +972,7 @@ void __init mem_init(void)
18929 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
18930 reservedpages++;
18931
18932- set_highmem_pages_init(bad_ppro);
18933+ set_highmem_pages_init();
18934
18935 codesize = (unsigned long) &_etext - (unsigned long) &_text;
18936 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
18937@@ -657,7 +993,6 @@ void __init mem_init(void)
18938 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
18939 );
18940
18941-#if 1 /* double-sanity-check paranoia */
18942 printk(KERN_INFO "virtual kernel memory layout:\n"
18943 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
18944 #ifdef CONFIG_HIGHMEM
18945@@ -698,7 +1033,6 @@ void __init mem_init(void)
18946 #endif
18947 BUG_ON(VMALLOC_START > VMALLOC_END);
18948 BUG_ON((unsigned long)high_memory > VMALLOC_START);
18949-#endif /* double-sanity-check paranoia */
18950
18951 if (boot_cpu_data.wp_works_ok < 0)
18952 test_wp_bit();
18953@@ -755,6 +1089,8 @@ void mark_rodata_ro(void)
18954 unsigned long start = PFN_ALIGN(_text);
18955 unsigned long size = PFN_ALIGN(_etext) - start;
18956
18957+#ifndef CONFIG_DYNAMIC_FTRACE
18958+ /* Dynamic tracing modifies the kernel text section */
18959 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
18960 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
18961 size >> 10);
18962@@ -767,6 +1103,8 @@ void mark_rodata_ro(void)
18963 printk(KERN_INFO "Testing CPA: write protecting again\n");
18964 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
18965 #endif
18966+#endif /* CONFIG_DYNAMIC_FTRACE */
18967+
18968 start += size;
18969 size = (unsigned long)__end_rodata - start;
18970 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
18971@@ -829,3 +1167,9 @@ void free_initrd_mem(unsigned long start
18972 free_init_pages("initrd memory", start, end);
18973 }
18974 #endif
18975+
18976+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
18977+ int flags)
18978+{
18979+ return reserve_bootmem(phys, len, flags);
18980+}
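
The init_32-xen.c hunk above rewrites init_memory_mapping() so the direct mapping is built in three passes: a 4k-mapped head below the first large-page boundary (where fixed-size MTRRs usually sit), a large-page middle, and a 4k tail for the unaligned remainder. The small user-space program below is illustrative only; it reuses the hunk's pfn arithmetic (assuming PAE, i.e. PMD_SHIFT = 21) to print the three ranges for a sample memory size.

    /*
     * Illustrative only -- NOT part of the patch.  Reproduces, in user space,
     * how the new init_memory_mapping() splits [start, end) into a 4k head,
     * a large-page middle and a 4k tail.  Assumes PAE (2 MiB large pages).
     */
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PMD_SHIFT  21
    #define PMD_SIZE   (1UL << PMD_SHIFT)

    static void show(const char *what, unsigned long spfn, unsigned long epfn)
    {
            if (spfn < epfn)
                    printf("%-10s pfn [%#7lx, %#7lx)\n", what, spfn, epfn);
    }

    int main(void)
    {
            unsigned long start = 0x0, end = 0x40001000UL;  /* 1 GiB + 4 KiB */
            unsigned long big_page_start = PMD_SIZE;
            unsigned long start_pfn, end_pfn;

            /* head: 4k pages below the first large-page boundary */
            if (start < big_page_start) {
                    start_pfn = start >> PAGE_SHIFT;
                    end_pfn = (big_page_start < end ? big_page_start : end)
                              >> PAGE_SHIFT;
            } else {
                    start_pfn = start >> PAGE_SHIFT;
                    end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
                              << (PMD_SHIFT - PAGE_SHIFT);
            }
            show("head 4k", start_pfn, end_pfn);

            /* middle: everything 2 MiB-aligned gets large pages (if PSE) */
            start_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
                        << (PMD_SHIFT - PAGE_SHIFT);
            if (start_pfn < (big_page_start >> PAGE_SHIFT))
                    start_pfn = big_page_start >> PAGE_SHIFT;
            end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
            show("middle 2M", start_pfn, end_pfn);

            /* tail: the unaligned remainder falls back to 4k pages */
            start_pfn = end_pfn;
            if (start_pfn > (big_page_start >> PAGE_SHIFT)) {
                    end_pfn = end >> PAGE_SHIFT;
                    show("tail 4k", start_pfn, end_pfn);
            }
            return 0;
    }
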
18981Index: head-2008-12-01/arch/x86/mm/init_64-xen.c
18982===================================================================
18983--- head-2008-12-01.orig/arch/x86/mm/init_64-xen.c 2008-12-01 11:44:55.000000000 +0100
18984+++ head-2008-12-01/arch/x86/mm/init_64-xen.c 2008-12-01 11:49:07.000000000 +0100
18985@@ -21,6 +21,7 @@
18986 #include <linux/swap.h>
18987 #include <linux/smp.h>
18988 #include <linux/init.h>
18989+#include <linux/initrd.h>
18990 #include <linux/pagemap.h>
18991 #include <linux/bootmem.h>
18992 #include <linux/proc_fs.h>
18993@@ -52,6 +53,14 @@
18994
18995 #include <xen/features.h>
18996
18997+/*
18998+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
18999+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
19000+ * apertures, ACPI and other tables without having to play with fixmaps.
19001+ */
19002+unsigned long max_low_pfn_mapped;
19003+unsigned long max_pfn_mapped;
19004+
19005 #if CONFIG_XEN_COMPAT <= 0x030002
19006 unsigned int __kernel_page_user;
19007 EXPORT_SYMBOL(__kernel_page_user);
19008@@ -60,12 +69,11 @@ EXPORT_SYMBOL(__kernel_page_user);
19009 int after_bootmem;
19010
19011 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
19012-extern unsigned long start_pfn;
19013
19014 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
19015 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
19016
19017-int direct_gbpages __meminitdata
19018+int direct_gbpages
19019 #ifdef CONFIG_DIRECT_GBPAGES
19020 = 1
19021 #endif
19022@@ -145,55 +153,23 @@ void __meminit early_make_page_readonly(
19023 * around without checking the pgd every time.
19024 */
19025
19026-void show_mem(void)
19027-{
19028- long i, total = 0, reserved = 0;
19029- long shared = 0, cached = 0;
19030- struct page *page;
19031- pg_data_t *pgdat;
19032-
19033- printk(KERN_INFO "Mem-info:\n");
19034- show_free_areas();
19035- for_each_online_pgdat(pgdat) {
19036- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
19037- /*
19038- * This loop can take a while with 256 GB and
19039- * 4k pages so defer the NMI watchdog:
19040- */
19041- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
19042- touch_nmi_watchdog();
19043-
19044- if (!pfn_valid(pgdat->node_start_pfn + i))
19045- continue;
19046-
19047- page = pfn_to_page(pgdat->node_start_pfn + i);
19048- total++;
19049- if (PageReserved(page))
19050- reserved++;
19051- else if (PageSwapCache(page))
19052- cached++;
19053- else if (page_count(page))
19054- shared += page_count(page) - 1;
19055- }
19056- }
19057- printk(KERN_INFO "%lu pages of RAM\n", total);
19058- printk(KERN_INFO "%lu reserved pages\n", reserved);
19059- printk(KERN_INFO "%lu pages shared\n", shared);
19060- printk(KERN_INFO "%lu pages swap cached\n", cached);
19061-}
19062-
19063 static unsigned long __meminitdata table_start;
19064+static unsigned long __meminitdata table_cur;
19065 static unsigned long __meminitdata table_end;
19066
19067-static __init void *spp_getpage(void)
19068+/*
19069+ * NOTE: This function is marked __ref because it calls __init function
19070+ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
19071+ */
19072+static __ref void *spp_getpage(void)
19073 {
19074 void *ptr;
19075
19076 if (after_bootmem)
19077 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
19078- else if (start_pfn < table_end) {
19079- ptr = __va(start_pfn << PAGE_SHIFT);
19080- start_pfn++;
19081+ else if (table_cur < table_end) {
19082+ ptr = __va(table_cur << PAGE_SHIFT);
19083+ table_cur++;
19084 memset(ptr, 0, PAGE_SIZE);
19085 } else
19086 ptr = alloc_bootmem_pages(PAGE_SIZE);
19087@@ -208,30 +184,18 @@ static __init void *spp_getpage(void)
19088 return ptr;
19089 }
19090
19091-#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
19092-#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
19093-
19094-static __init void
19095-set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
19096+void
19097+set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
19098 {
19099- pgd_t *pgd;
19100 pud_t *pud;
19101 pmd_t *pmd;
19102- pte_t *pte, new_pte;
19103-
19104- pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
19105+ pte_t *pte;
19106
19107- pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
19108- if (pgd_none(*pgd)) {
19109- printk(KERN_ERR
19110- "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19111- return;
19112- }
19113- pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
19114+ pud = pud_page + pud_index(vaddr);
19115 if (pud_none(*pud)) {
19116 pmd = (pmd_t *) spp_getpage();
19117 make_page_readonly(pmd, XENFEAT_writable_page_tables);
19118- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19119+ pud_populate(&init_mm, pud, pmd);
19120 if (pmd != pmd_offset(pud, 0)) {
19121 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19122 pmd, pmd_offset(pud, 0));
19123@@ -242,19 +206,20 @@ set_pte_phys(unsigned long vaddr, unsign
19124 if (pmd_none(*pmd)) {
19125 pte = (pte_t *) spp_getpage();
19126 make_page_readonly(pte, XENFEAT_writable_page_tables);
19127- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19128+ pmd_populate_kernel(&init_mm, pmd, pte);
19129 if (pte != pte_offset_kernel(pmd, 0)) {
19130 printk(KERN_ERR "PAGETABLE BUG #02!\n");
19131 return;
19132 }
19133 }
19134- if (pgprot_val(prot))
19135- new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
19136- else
19137- new_pte = __pte(0);
19138
19139 pte = pte_offset_kernel(pmd, vaddr);
19140 if (!pte_none(*pte) && __pte_val(new_pte) &&
19141+#ifdef CONFIG_ACPI
19142+ /* __acpi_map_table() fails to properly call clear_fixmap() */
19143+ (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19144+ vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19145+#endif
19146 __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19147 pte_ERROR(*pte);
19148 set_pte(pte, new_pte);
19149@@ -266,15 +231,13 @@ set_pte_phys(unsigned long vaddr, unsign
19150 __flush_tlb_one(vaddr);
19151 }
19152
19153-static __init void
19154-set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
19155+void
19156+set_pte_vaddr(unsigned long vaddr, pte_t pteval)
19157 {
19158 pgd_t *pgd;
19159- pud_t *pud;
19160- pmd_t *pmd;
19161- pte_t *pte, new_pte;
19162+ pud_t *pud_page;
19163
19164- pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
19165+ pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
19166
19167 pgd = pgd_offset_k(vaddr);
19168 if (pgd_none(*pgd)) {
19169@@ -282,47 +245,51 @@ set_pte_phys_ma(unsigned long vaddr, uns
19170 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19171 return;
19172 }
19173- pud = pud_offset(pgd, vaddr);
19174- if (pud_none(*pud)) {
19175- pmd = (pmd_t *) spp_getpage();
19176- make_page_readonly(pmd, XENFEAT_writable_page_tables);
19177- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19178- if (pmd != pmd_offset(pud, 0)) {
19179- printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19180- pmd, pmd_offset(pud, 0));
19181+ pud_page = (pud_t*)pgd_page_vaddr(*pgd);
19182+ set_pte_vaddr_pud(pud_page, vaddr, pteval);
19183+}
19184+
19185+#ifndef CONFIG_XEN
19186+/*
19187+ * Create large page table mappings for a range of physical addresses.
19188+ */
19189+static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
19190+ pgprot_t prot)
19191+{
19192+ pgd_t *pgd;
19193+ pud_t *pud;
19194+ pmd_t *pmd;
19195+
19196+ BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
19197+ for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
19198+ pgd = pgd_offset_k((unsigned long)__va(phys));
19199+ if (pgd_none(*pgd)) {
19200+ pud = (pud_t *) spp_getpage();
19201+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
19202+ _PAGE_USER));
19203 }
19204- }
19205- pmd = pmd_offset(pud, vaddr);
19206- if (pmd_none(*pmd)) {
19207- pte = (pte_t *) spp_getpage();
19208- make_page_readonly(pte, XENFEAT_writable_page_tables);
19209- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19210- if (pte != pte_offset_kernel(pmd, 0)) {
19211- printk(KERN_ERR "PAGETABLE BUG #02!\n");
19212- return;
19213+ pud = pud_offset(pgd, (unsigned long)__va(phys));
19214+ if (pud_none(*pud)) {
19215+ pmd = (pmd_t *) spp_getpage();
19216+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
19217+ _PAGE_USER));
19218 }
19219+ pmd = pmd_offset(pud, phys);
19220+ BUG_ON(!pmd_none(*pmd));
19221+ set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
19222 }
19223- new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
19224+}
19225
19226- pte = pte_offset_kernel(pmd, vaddr);
19227- if (!pte_none(*pte) && __pte_val(new_pte) &&
19228-#ifdef CONFIG_ACPI
19229- /* __acpi_map_table() fails to properly call clear_fixmap() */
19230- (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19231- vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19232-#endif
19233- __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19234- pte_ERROR(*pte);
19235- set_pte(pte, new_pte);
19236+void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
19237+{
19238+ __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
19239+}
19240
19241- /*
19242- * It's enough to flush this one mapping.
19243- * (PGE mappings get flushed as well)
19244- */
19245- __flush_tlb_one(vaddr);
19246+void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
19247+{
19248+ __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
19249 }
19250
19251-#ifndef CONFIG_XEN
19252 /*
19253 * The head.S code sets up the kernel high mapping:
19254 *
19255@@ -352,33 +319,9 @@ void __init cleanup_highmap(void)
19256 }
19257 #endif
19258
19259-/* NOTE: this is meant to be run only at boot */
19260-void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
19261-{
19262- unsigned long address = __fix_to_virt(idx);
19263-
19264- if (idx >= __end_of_fixed_addresses) {
19265- printk(KERN_ERR "Invalid __set_fixmap\n");
19266- return;
19267- }
19268- switch (idx) {
19269- case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
19270- set_pte_phys(address, phys, prot, 0);
19271- set_pte_phys(address, phys, prot, 1);
19272- break;
19273- case FIX_EARLYCON_MEM_BASE:
19274- xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
19275- pfn_pte_ma(phys >> PAGE_SHIFT, prot));
19276- break;
19277- default:
19278- set_pte_phys_ma(address, phys, prot);
19279- break;
19280- }
19281-}
19282-
19283 static __meminit void *alloc_static_page(unsigned long *phys)
19284 {
19285- unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
19286+ unsigned long va = (table_cur << PAGE_SHIFT) + __START_KERNEL_map;
19287
19288 if (after_bootmem) {
19289 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
19290@@ -387,13 +330,12 @@ static __meminit void *alloc_static_page
19291 return adr;
19292 }
19293
19294- *phys = start_pfn << PAGE_SHIFT;
19295- start_pfn++;
19296- memset((void *)va, 0, PAGE_SIZE);
19297- return (void *)va;
19298+ BUG_ON(!table_cur);
19299+ *phys = table_cur++ << PAGE_SHIFT;
19300+ return memset((void *)va, 0, PAGE_SIZE);
19301 }
19302
19303-#define PTE_SIZE PAGE_SIZE
19304+#define unmap_low_page(p) ((void)(p))
19305
19306 static inline int __meminit make_readonly(unsigned long paddr)
19307 {
19308@@ -408,7 +350,7 @@ static inline int __meminit make_readonl
19309 /* Make old page tables read-only. */
19310 if (!xen_feature(XENFEAT_writable_page_tables)
19311 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
19312- && (paddr < (start_pfn << PAGE_SHIFT)))
19313+ && (paddr < (table_cur << PAGE_SHIFT)))
19314 readonly = 1;
19315
19316 /*
19317@@ -425,118 +367,129 @@ static inline int __meminit make_readonl
19318 return readonly;
19319 }
19320
19321-#ifndef CONFIG_XEN
19322-/* Must run before zap_low_mappings */
19323-__meminit void *early_ioremap(unsigned long addr, unsigned long size)
19324+static unsigned long __meminit
19325+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
19326 {
19327- pmd_t *pmd, *last_pmd;
19328- unsigned long vaddr;
19329- int i, pmds;
19330-
19331- pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19332- vaddr = __START_KERNEL_map;
19333- pmd = level2_kernel_pgt;
19334- last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
19335-
19336- for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
19337- for (i = 0; i < pmds; i++) {
19338- if (pmd_present(pmd[i]))
19339- goto continue_outer_loop;
19340- }
19341- vaddr += addr & ~PMD_MASK;
19342- addr &= PMD_MASK;
19343+ unsigned pages = 0;
19344+ unsigned long last_map_addr = end;
19345+ int i;
19346
19347- for (i = 0; i < pmds; i++, addr += PMD_SIZE)
19348- set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
19349- __flush_tlb_all();
19350-
19351- return (void *)vaddr;
19352-continue_outer_loop:
19353- ;
19354+ pte_t *pte = pte_page + pte_index(addr);
19355+
19356+ for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
19357+ unsigned long pteval = addr | __PAGE_KERNEL;
19358+
19359+ if (addr >= (after_bootmem
19360+ ? end
19361+ : xen_start_info->nr_pages << PAGE_SHIFT))
19362+ break;
19363+
19364+ if (__pte_val(*pte))
19365+ continue;
19366+
19367+ if (make_readonly(addr))
19368+ pteval &= ~_PAGE_RW;
19369+ if (0)
19370+ printk(" pte=%p addr=%lx pte=%016lx\n",
19371+ pte, addr, pteval);
19372+ if (!after_bootmem)
19373+ *pte = __pte(pteval & __supported_pte_mask);
19374+ else
19375+ set_pte(pte, __pte(pteval & __supported_pte_mask));
19376+ last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
19377+ pages++;
19378 }
19379- printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
19380- return NULL;
19381+ update_page_count(PG_LEVEL_4K, pages);
19382+
19383+ return last_map_addr;
19384 }
19385
19386-/*
19387- * To avoid virtual aliases later:
19388- */
19389-__meminit void early_iounmap(void *addr, unsigned long size)
19390+static unsigned long __meminit
19391+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
19392 {
19393- unsigned long vaddr;
19394- pmd_t *pmd;
19395- int i, pmds;
19396-
19397- vaddr = (unsigned long)addr;
19398- pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19399- pmd = level2_kernel_pgt + pmd_index(vaddr);
19400+ pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
19401
19402- for (i = 0; i < pmds; i++)
19403- pmd_clear(pmd + i);
19404-
19405- __flush_tlb_all();
19406+ BUG_ON(!after_bootmem);
19407+ return phys_pte_init(pte, address, end);
19408 }
19409-#endif
19410
19411 static unsigned long __meminit
19412-phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
19413+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
19414+ unsigned long page_size_mask)
19415 {
19416+ unsigned long pages = 0;
19417+ unsigned long last_map_addr = end;
19418+ unsigned long start = address;
19419+
19420 int i = pmd_index(address);
19421
19422- for (; i < PTRS_PER_PMD; i++) {
19423+ for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
19424 unsigned long pte_phys;
19425- pmd_t *pmd = pmd_page + i;
19426- pte_t *pte, *pte_save;
19427- int k;
19428+ pmd_t *pmd = pmd_page + pmd_index(address);
19429+ pte_t *pte;
19430
19431 if (address >= end)
19432 break;
19433
19434 if (__pmd_val(*pmd)) {
19435- address += PMD_SIZE;
19436+ if (!pmd_large(*pmd)) {
19437+ spin_lock(&init_mm.page_table_lock);
19438+ last_map_addr = phys_pte_update(pmd, address,
19439+ end);
19440+ spin_unlock(&init_mm.page_table_lock);
19441+ }
19442+ /* Count entries we're using from level2_ident_pgt */
19443+ if (start == 0)
19444+ pages++;
19445 continue;
19446 }
19447
19448- pte = alloc_static_page(&pte_phys);
19449- pte_save = pte;
19450- for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
19451- unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
19452-
19453- if (address >= (after_bootmem
19454- ? end
19455- : xen_start_info->nr_pages << PAGE_SHIFT))
19456- pteval = 0;
19457- else if (make_readonly(address))
19458- pteval &= ~_PAGE_RW;
19459- set_pte(pte, __pte(pteval & __supported_pte_mask));
19460+ if (page_size_mask & (1<<PG_LEVEL_2M)) {
19461+ pages++;
19462+ spin_lock(&init_mm.page_table_lock);
19463+ set_pte((pte_t *)pmd,
19464+ pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19465+ spin_unlock(&init_mm.page_table_lock);
19466+ last_map_addr = (address & PMD_MASK) + PMD_SIZE;
19467+ continue;
19468 }
19469+
19470+ pte = alloc_static_page(&pte_phys);
19471+ last_map_addr = phys_pte_init(pte, address, end);
19472+ unmap_low_page(pte);
19473+
19474 if (!after_bootmem) {
19475- early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19476- *pmd = __pmd(pte_phys | _KERNPG_TABLE);
19477+ early_make_page_readonly(pte, XENFEAT_writable_page_tables);
19478+ *pmd = __pmd(pte_phys | _PAGE_TABLE);
19479 } else {
19480- make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19481- set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
19482+ make_page_readonly(pte, XENFEAT_writable_page_tables);
19483+ spin_lock(&init_mm.page_table_lock);
19484+ pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
19485+ spin_unlock(&init_mm.page_table_lock);
19486 }
19487 }
19488- return address;
19489+ update_page_count(PG_LEVEL_2M, pages);
19490+ return last_map_addr;
19491 }
19492
19493 static unsigned long __meminit
19494-phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
19495+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
19496+ unsigned long page_size_mask)
19497 {
19498 pmd_t *pmd = pmd_offset(pud, 0);
19499 unsigned long last_map_addr;
19500
19501- spin_lock(&init_mm.page_table_lock);
19502- last_map_addr = phys_pmd_init(pmd, address, end);
19503- spin_unlock(&init_mm.page_table_lock);
19504+ BUG_ON(!after_bootmem);
19505+ last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
19506 __flush_tlb_all();
19507 return last_map_addr;
19508 }
19509
19510 static unsigned long __meminit
19511-phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
19512+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
19513+ unsigned long page_size_mask)
19514 {
19515+ unsigned long pages = 0;
19516 unsigned long last_map_addr = end;
19517 int i = pud_index(addr);
19518
19519@@ -550,29 +503,59 @@ phys_pud_init(pud_t *pud_page, unsigned
19520
19521 if (__pud_val(*pud)) {
19522 if (!pud_large(*pud))
19523- last_map_addr = phys_pmd_update(pud, addr, end);
19524+ last_map_addr = phys_pmd_update(pud, addr, end,
19525+ page_size_mask);
19526 continue;
19527 }
19528
19529- if (direct_gbpages) {
19530+ if (page_size_mask & (1<<PG_LEVEL_1G)) {
19531+ pages++;
19532+ spin_lock(&init_mm.page_table_lock);
19533 set_pte((pte_t *)pud,
19534 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19535+ spin_unlock(&init_mm.page_table_lock);
19536 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
19537 continue;
19538 }
19539
19540 pmd = alloc_static_page(&pmd_phys);
19541+ last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
19542+ unmap_low_page(pmd);
19543
19544- spin_lock(&init_mm.page_table_lock);
19545- *pud = __pud(pmd_phys | _KERNPG_TABLE);
19546- last_map_addr = phys_pmd_init(pmd, addr, end);
19547- spin_unlock(&init_mm.page_table_lock);
19548-
19549- early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
19550+ if (!after_bootmem) {
19551+ early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
19552+ if (page_size_mask & (1 << PG_LEVEL_NUM))
19553+ xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
19554+ else
19555+ *pud = __pud(pmd_phys | _PAGE_TABLE);
19556+ } else {
19557+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
19558+ spin_lock(&init_mm.page_table_lock);
19559+ pud_populate(&init_mm, pud, __va(pmd_phys));
19560+ spin_unlock(&init_mm.page_table_lock);
19561+ }
19562 }
19563 __flush_tlb_all();
19564+ update_page_count(PG_LEVEL_1G, pages);
19565
19566- return last_map_addr >> PAGE_SHIFT;
19567+ return last_map_addr;
19568+}
19569+
19570+static unsigned long __meminit
19571+phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
19572+ unsigned long page_size_mask)
19573+{
19574+ pud_t *pud;
19575+
19576+ if (!after_bootmem) {
19577+ unsigned long addr = __pgd_val(*pgd), *page;
19578+
19579+ addr_to_page(addr, page);
19580+ pud = (pud_t *)page;
19581+ } else
19582+ pud = (pud_t *)pgd_page_vaddr(*pgd);
19583+
19584+ return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
19585 }
19586
19587 void __init xen_init_pt(void)
19588@@ -654,7 +637,7 @@ void __init xen_init_pt(void)
19589 static void __init extend_init_mapping(unsigned long tables_space)
19590 {
19591 unsigned long va = __START_KERNEL_map;
19592- unsigned long start = start_pfn;
19593+ unsigned long start = table_cur;
19594 unsigned long phys, addr, *pte_page;
19595 pmd_t *pmd;
19596 pte_t *pte, new_pte;
19597@@ -674,7 +657,7 @@ static void __init extend_init_mapping(u
19598
19599 /* Ensure init mappings cover kernel text/data and initial tables. */
19600 while (va < (__START_KERNEL_map
19601- + (start_pfn << PAGE_SHIFT)
19602+ + (table_cur << PAGE_SHIFT)
19603 + tables_space)) {
19604 pmd = (pmd_t *)&page[pmd_index(va)];
19605 if (pmd_none(*pmd)) {
19606@@ -706,9 +689,9 @@ static void __init extend_init_mapping(u
19607 va += PAGE_SIZE;
19608 }
19609
19610- if (start_pfn > start)
19611+ if (table_cur > start)
19612 reserve_early(start << PAGE_SHIFT,
19613- start_pfn << PAGE_SHIFT, "INITMAP");
19614+ table_cur << PAGE_SHIFT, "INITMAP");
19615 }
19616
19617 static void __init find_early_table_space(unsigned long end)
19618@@ -717,23 +700,25 @@ static void __init find_early_table_spac
19619
19620 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
19621 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
19622- ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
19623+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
19624
19625 tables = round_up(puds * 8, PAGE_SIZE) +
19626 round_up(pmds * 8, PAGE_SIZE) +
19627 round_up(ptes * 8, PAGE_SIZE);
19628
19629+ table_cur = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
19630+ xen_start_info->nr_pt_frames;
19631+
19632 extend_init_mapping(tables);
19633
19634- table_start = start_pfn;
19635+ table_start = table_cur;
19636 table_end = table_start + (tables>>PAGE_SHIFT);
19637
19638- early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
19639- end, table_start << PAGE_SHIFT,
19640- (table_start << PAGE_SHIFT) + tables);
19641+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
19642+ end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
19643 }
19644
19645-static void __init xen_finish_init_mapping(void)
19646+static void __init xen_finish_init_mapping(bool reserve)
19647 {
19648 unsigned long i, start, end;
19649
19650@@ -762,7 +747,8 @@ static void __init xen_finish_init_mappi
19651 if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
19652 BUG();
19653
19654- /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
19655+ /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
19656+ start = table_cur;
19657 table_end = ~0UL;
19658
19659 /*
19660@@ -789,8 +775,11 @@ static void __init xen_finish_init_mappi
19661 << PAGE_SHIFT,
19662 PAGE_KERNEL_RO);
19663
19664- /* Disable the 'start_pfn' allocator. */
19665- table_end = start_pfn;
19666+ /* Disable the 'table_cur' allocator. */
19667+ table_end = table_cur;
19668+ if (reserve && table_cur > start)
19669+ reserve_early(start << PAGE_SHIFT,
19670+ table_cur << PAGE_SHIFT, "FIXMAP");
19671 }
19672
19673 static void __init init_gbpages(void)
19674@@ -801,126 +790,89 @@ static void __init init_gbpages(void)
19675 direct_gbpages = 0;
19676 }
19677
19678-#ifdef CONFIG_MEMTEST_BOOTPARAM
19679-
19680-static void __init memtest(unsigned long start_phys, unsigned long size,
19681- unsigned pattern)
19682+static unsigned long __init kernel_physical_mapping_init(unsigned long start,
19683+ unsigned long end,
19684+ unsigned long page_size_mask)
19685 {
19686- unsigned long i;
19687- unsigned long *start;
19688- unsigned long start_bad;
19689- unsigned long last_bad;
19690- unsigned long val;
19691- unsigned long start_phys_aligned;
19692- unsigned long count;
19693- unsigned long incr;
19694-
19695- switch (pattern) {
19696- case 0:
19697- val = 0UL;
19698- break;
19699- case 1:
19700- val = -1UL;
19701- break;
19702- case 2:
19703- val = 0x5555555555555555UL;
19704- break;
19705- case 3:
19706- val = 0xaaaaaaaaaaaaaaaaUL;
19707- break;
19708- default:
19709- return;
19710- }
19711
19712- incr = sizeof(unsigned long);
19713- start_phys_aligned = ALIGN(start_phys, incr);
19714- count = (size - (start_phys_aligned - start_phys))/incr;
19715- start = __va(start_phys_aligned);
19716- start_bad = 0;
19717- last_bad = 0;
19718-
19719- for (i = 0; i < count; i++)
19720- start[i] = val;
19721- for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
19722- if (*start != val) {
19723- if (start_phys_aligned == last_bad + incr) {
19724- last_bad += incr;
19725- } else {
19726- if (start_bad) {
19727- printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19728- val, start_bad, last_bad + incr);
19729- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19730- }
19731- start_bad = last_bad = start_phys_aligned;
19732- }
19733- }
19734- }
19735- if (start_bad) {
19736- printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19737- val, start_bad, last_bad + incr);
19738- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19739- }
19740-
19741-}
19742-
19743-static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
19744-
19745-static int __init parse_memtest(char *arg)
19746-{
19747- if (arg)
19748- memtest_pattern = simple_strtoul(arg, NULL, 0);
19749- return 0;
19750-}
19751+ unsigned long next, last_map_addr = end;
19752
19753-early_param("memtest", parse_memtest);
19754+ start = (unsigned long)__va(start);
19755+ end = (unsigned long)__va(end);
19756
19757-static void __init early_memtest(unsigned long start, unsigned long end)
19758-{
19759- u64 t_start, t_size;
19760- unsigned pattern;
19761+ for (; start < end; start = next) {
19762+ pgd_t *pgd = pgd_offset_k(start);
19763+ unsigned long pud_phys;
19764+ pud_t *pud;
19765
19766- if (!memtest_pattern)
19767- return;
19768+ next = (start + PGDIR_SIZE) & PGDIR_MASK;
19769+ if (next > end)
19770+ next = end;
19771
19772- printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
19773- for (pattern = 0; pattern < memtest_pattern; pattern++) {
19774- t_start = start;
19775- t_size = 0;
19776- while (t_start < end) {
19777- t_start = find_e820_area_size(t_start, &t_size, 1);
19778-
19779- /* done ? */
19780- if (t_start >= end)
19781- break;
19782- if (t_start + t_size > end)
19783- t_size = end - t_start;
19784-
19785- printk(KERN_CONT "\n %016llx - %016llx pattern %d",
19786- (unsigned long long)t_start,
19787- (unsigned long long)t_start + t_size, pattern);
19788+ if (__pgd_val(*pgd)) {
19789+ last_map_addr = phys_pud_update(pgd, __pa(start),
19790+ __pa(end), page_size_mask);
19791+ continue;
19792+ }
19793
19794- memtest(t_start, t_size, pattern);
19795+ pud = alloc_static_page(&pud_phys);
19796+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
19797+ page_size_mask);
19798+ unmap_low_page(pud);
19799
19800- t_start += t_size;
19801+ if(!after_bootmem) {
19802+ early_make_page_readonly(pud, XENFEAT_writable_page_tables);
19803+ xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
19804+ } else {
19805+ make_page_readonly(pud, XENFEAT_writable_page_tables);
19806+ spin_lock(&init_mm.page_table_lock);
19807+ pgd_populate(&init_mm, pgd, __va(pud_phys));
19808+ spin_unlock(&init_mm.page_table_lock);
19809 }
19810 }
19811- printk(KERN_CONT "\n");
19812+
19813+ return last_map_addr;
19814 }
19815-#else
19816-static void __init early_memtest(unsigned long start, unsigned long end)
19817+
19818+struct map_range {
19819+ unsigned long start;
19820+ unsigned long end;
19821+ unsigned page_size_mask;
19822+};
19823+
19824+#define NR_RANGE_MR 5
19825+
19826+static int save_mr(struct map_range *mr, int nr_range,
19827+ unsigned long start_pfn, unsigned long end_pfn,
19828+ unsigned long page_size_mask)
19829 {
19830+
19831+ if (start_pfn < end_pfn) {
19832+ if (nr_range >= NR_RANGE_MR)
19833+ panic("run out of range for init_memory_mapping\n");
19834+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
19835+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
19836+ mr[nr_range].page_size_mask = page_size_mask;
19837+ nr_range++;
19838+ }
19839+
19840+ return nr_range;
19841 }
19842-#endif
19843
19844 /*
19845 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
19846 * This runs before bootmem is initialized and gets pages directly from
19847 * the physical memory. To access them they are temporarily mapped.
19848 */
19849-unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
19850+unsigned long __init_refok init_memory_mapping(unsigned long start,
19851+ unsigned long end)
19852 {
19853- unsigned long next, last_map_addr = end;
19854- unsigned long start_phys = start, end_phys = end;
19855+ unsigned long last_map_addr = 0;
19856+ unsigned long page_size_mask = 0;
19857+ unsigned long start_pfn, end_pfn;
19858+ bool first = !table_start;
19859+ struct map_range mr[NR_RANGE_MR];
19860+ int nr_range, i;
19861
19862 printk(KERN_INFO "init_memory_mapping\n");
19863
19864@@ -931,51 +883,123 @@ unsigned long __init_refok init_memory_m
19865 * memory mapped. Unfortunately this is done currently before the
19866 * nodes are discovered.
19867 */
19868- if (!after_bootmem) {
19869+ if (!after_bootmem)
19870 init_gbpages();
19871- find_early_table_space(end);
19872- }
19873
19874- start = (unsigned long)__va(start);
19875- end = (unsigned long)__va(end);
19876+ if (direct_gbpages)
19877+ page_size_mask |= 1 << PG_LEVEL_1G;
19878+ if (cpu_has_pse)
19879+ page_size_mask |= 1 << PG_LEVEL_2M;
19880+
19881+ memset(mr, 0, sizeof(mr));
19882+ nr_range = 0;
19883+
19884+ /* head if not big page alignment ?*/
19885+ start_pfn = start >> PAGE_SHIFT;
19886+ end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
19887+ << (PMD_SHIFT - PAGE_SHIFT);
19888+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
19889+
19890+ /* big page (2M) range*/
19891+ start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
19892+ << (PMD_SHIFT - PAGE_SHIFT);
19893+ end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
19894+ << (PUD_SHIFT - PAGE_SHIFT);
19895+ if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
19896+ end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
19897+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
19898+ page_size_mask & (1<<PG_LEVEL_2M));
19899+
19900+ /* big page (1G) range */
19901+ start_pfn = end_pfn;
19902+ end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
19903+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
19904+ page_size_mask &
19905+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
19906+
19907+ /* tail is not big page (1G) alignment */
19908+ start_pfn = end_pfn;
19909+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
19910+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
19911+ page_size_mask & (1<<PG_LEVEL_2M));
19912+
19913+ /* tail is not big page (2M) alignment */
19914+ start_pfn = end_pfn;
19915+ end_pfn = end>>PAGE_SHIFT;
19916+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
19917+
19918+ /* try to merge same page size and continuous */
19919+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
19920+ unsigned long old_start;
19921+ if (mr[i].end != mr[i+1].start ||
19922+ mr[i].page_size_mask != mr[i+1].page_size_mask)
19923+ continue;
19924+ /* move it */
19925+ old_start = mr[i].start;
19926+ memmove(&mr[i], &mr[i+1],
19927+ (nr_range - 1 - i) * sizeof (struct map_range));
19928+ mr[i--].start = old_start;
19929+ nr_range--;
19930+ }
19931
19932- for (; start < end; start = next) {
19933- pgd_t *pgd = pgd_offset_k(start);
19934- unsigned long pud_phys;
19935- pud_t *pud;
19936+ for (i = 0; i < nr_range; i++)
19937+ printk(KERN_DEBUG " %010lx - %010lx page %s\n",
19938+ mr[i].start, mr[i].end,
19939+ (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
19940+ (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
19941
19942- if (after_bootmem)
19943- pud = pud_offset(pgd, start & PGDIR_MASK);
19944- else
19945- pud = alloc_static_page(&pud_phys);
19946- next = start + PGDIR_SIZE;
19947- if (next > end)
19948- next = end;
19949- last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
19950- if (!after_bootmem) {
19951- early_make_page_readonly(pud, XENFEAT_writable_page_tables);
19952- set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
19953- }
19954- }
19955+ if (first)
19956+ find_early_table_space(end);
19957
19958- if (!after_bootmem) {
19959- BUG_ON(start_pfn != table_end);
19960- xen_finish_init_mapping();
19961+ for (i = 0; i < nr_range; i++)
19962+ last_map_addr = kernel_physical_mapping_init(
19963+ mr[i].start, mr[i].end,
19964+ mr[i].page_size_mask);
19965+
19966+ BUG_ON(table_cur > table_end);
19967+ if (start < (table_start << PAGE_SHIFT)) {
19968+ WARN_ON(table_cur != table_end);
19969+ xen_finish_init_mapping(!first);
19970 }
19971
19972 __flush_tlb_all();
19973
19974- if (!after_bootmem)
19975+ if (first && table_end > table_start)
19976 reserve_early(table_start << PAGE_SHIFT,
19977 table_end << PAGE_SHIFT, "PGTABLE");
19978
19979+ printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
19980+ last_map_addr, end);
19981+
19982 if (!after_bootmem)
19983- early_memtest(start_phys, end_phys);
19984+ early_memtest(start, end);
19985
19986- return last_map_addr;
19987+ return last_map_addr >> PAGE_SHIFT;
19988 }
19989
19990 #ifndef CONFIG_NUMA
19991+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
19992+{
19993+ unsigned long bootmap_size, bootmap;
19994+
19995+ e820_register_active_regions(0, start_pfn, end_pfn);
19996+#ifdef CONFIG_XEN
19997+ if (end_pfn > xen_start_info->nr_pages)
19998+ end_pfn = xen_start_info->nr_pages;
19999+#endif
20000+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
20001+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
20002+ PAGE_SIZE);
20003+ if (bootmap == -1L)
20004+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
20005+ /* don't touch min_low_pfn */
20006+ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
20007+ 0, end_pfn);
20008+ free_bootmem_with_active_regions(0, end_pfn);
20009+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
20010+ reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
20011+}
20012+
20013 void __init paging_init(void)
20014 {
20015 unsigned long max_zone_pfns[MAX_NR_ZONES];
20016@@ -983,9 +1007,9 @@ void __init paging_init(void)
20017 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
20018 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
20019 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
20020- max_zone_pfns[ZONE_NORMAL] = end_pfn;
20021+ max_zone_pfns[ZONE_NORMAL] = max_pfn;
20022
20023- memory_present(0, 0, end_pfn);
20024+ memory_present(0, 0, max_pfn);
20025 sparse_init();
20026 free_area_init_nodes(max_zone_pfns);
20027
20028@@ -1076,8 +1100,8 @@ void __init mem_init(void)
20029 init_page_count(pfn_to_page(pfn));
20030 totalram_pages++;
20031 }
20032- reservedpages = end_pfn - totalram_pages -
20033- absent_pages_in_range(0, end_pfn);
20034+ reservedpages = max_pfn - totalram_pages -
20035+ absent_pages_in_range(0, max_pfn);
20036 after_bootmem = 1;
20037
20038 codesize = (unsigned long) &_etext - (unsigned long) &_text;
20039@@ -1096,7 +1120,7 @@ void __init mem_init(void)
20040 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
20041 "%ldk reserved, %ldk data, %ldk init)\n",
20042 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
20043- end_pfn << (PAGE_SHIFT-10),
20044+ max_pfn << (PAGE_SHIFT-10),
20045 codesize >> 10,
20046 reservedpages << (PAGE_SHIFT-10),
20047 datasize >> 10,
20048@@ -1159,6 +1183,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
20049 void mark_rodata_ro(void)
20050 {
20051 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
20052+ unsigned long rodata_start =
20053+ ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20054+
20055+#ifdef CONFIG_DYNAMIC_FTRACE
20056+ /* Dynamic tracing modifies the kernel text section */
20057+ start = rodata_start;
20058+#endif
20059
20060 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
20061 (end - start) >> 10);
20062@@ -1168,8 +1199,7 @@ void mark_rodata_ro(void)
20063 * The rodata section (but not the kernel text!) should also be
20064 * not-executable.
20065 */
20066- start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20067- set_memory_nx(start, (end - start) >> PAGE_SHIFT);
20068+ set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
20069
20070 rodata_test();
20071
20072@@ -1191,24 +1221,26 @@ void free_initrd_mem(unsigned long start
20073 }
20074 #endif
20075
20076-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
20077+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
20078+ int flags)
20079 {
20080 #ifdef CONFIG_NUMA
20081 int nid, next_nid;
20082+ int ret;
20083 #endif
20084 unsigned long pfn = phys >> PAGE_SHIFT;
20085
20086- if (pfn >= end_pfn) {
20087+ if (pfn >= max_pfn) {
20088 /*
20089 * This can happen with kdump kernels when accessing
20090 * firmware tables:
20091 */
20092 if (pfn < max_pfn_mapped)
20093- return;
20094+ return -EFAULT;
20095
20096- printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
20097+ printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
20098 phys, len);
20099- return;
20100+ return -EFAULT;
20101 }
20102
20103 /* Should check here against the e820 map to avoid double free */
20104@@ -1216,9 +1248,13 @@ void __init reserve_bootmem_generic(unsi
20105 nid = phys_to_nid(phys);
20106 next_nid = phys_to_nid(phys + len - 1);
20107 if (nid == next_nid)
20108- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
20109+ ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
20110 else
20111- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20112+ ret = reserve_bootmem(phys, len, flags);
20113+
20114+ if (ret != 0)
20115+ return ret;
20116+
20117 #else
20118 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20119 #endif
20120@@ -1231,6 +1267,8 @@ void __init reserve_bootmem_generic(unsi
20121 set_dma_reserve(dma_reserve);
20122 }
20123 #endif
20124+
20125+ return 0;
20126 }
20127
20128 int kern_addr_valid(unsigned long addr)
20129@@ -1335,7 +1373,7 @@ vmemmap_populate(struct page *start_page
20130 pmd_t *pmd;
20131
20132 for (; addr < end; addr = next) {
20133- next = pmd_addr_end(addr, end);
20134+ void *p = NULL;
20135
20136 pgd = vmemmap_pgd_populate(addr, node);
20137 if (!pgd)
20138@@ -1345,33 +1383,51 @@ vmemmap_populate(struct page *start_page
20139 if (!pud)
20140 return -ENOMEM;
20141
20142- pmd = pmd_offset(pud, addr);
20143- if (pmd_none(*pmd)) {
20144- pte_t entry;
20145- void *p;
20146+ if (!cpu_has_pse) {
20147+ next = (addr + PAGE_SIZE) & PAGE_MASK;
20148+ pmd = vmemmap_pmd_populate(pud, addr, node);
20149+
20150+ if (!pmd)
20151+ return -ENOMEM;
20152+
20153+ p = vmemmap_pte_populate(pmd, addr, node);
20154
20155- p = vmemmap_alloc_block(PMD_SIZE, node);
20156 if (!p)
20157 return -ENOMEM;
20158
20159- entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20160- PAGE_KERNEL_LARGE);
20161- set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20162-
20163- /* check to see if we have contiguous blocks */
20164- if (p_end != p || node_start != node) {
20165- if (p_start)
20166- printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20167- addr_start, addr_end-1, p_start, p_end-1, node_start);
20168- addr_start = addr;
20169- node_start = node;
20170- p_start = p;
20171- }
20172- addr_end = addr + PMD_SIZE;
20173- p_end = p + PMD_SIZE;
20174+ addr_end = addr + PAGE_SIZE;
20175+ p_end = p + PAGE_SIZE;
20176 } else {
20177- vmemmap_verify((pte_t *)pmd, node, addr, next);
20178+ next = pmd_addr_end(addr, end);
20179+
20180+ pmd = pmd_offset(pud, addr);
20181+ if (pmd_none(*pmd)) {
20182+ pte_t entry;
20183+
20184+ p = vmemmap_alloc_block(PMD_SIZE, node);
20185+ if (!p)
20186+ return -ENOMEM;
20187+
20188+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20189+ PAGE_KERNEL_LARGE);
20190+ set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20191+
20192+ /* check to see if we have contiguous blocks */
20193+ if (p_end != p || node_start != node) {
20194+ if (p_start)
20195+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20196+ addr_start, addr_end-1, p_start, p_end-1, node_start);
20197+ addr_start = addr;
20198+ node_start = node;
20199+ p_start = p;
20200+ }
20201+
20202+ addr_end = addr + PMD_SIZE;
20203+ p_end = p + PMD_SIZE;
20204+ } else
20205+ vmemmap_verify((pte_t *)pmd, node, addr, next);
20206 }
20207+
20208 }
20209 return 0;
20210 }
20211Index: head-2008-12-01/arch/x86/mm/ioremap-xen.c
20212===================================================================
20213--- head-2008-12-01.orig/arch/x86/mm/ioremap-xen.c 2008-12-01 11:44:55.000000000 +0100
20214+++ head-2008-12-01/arch/x86/mm/ioremap-xen.c 2008-12-01 11:49:07.000000000 +0100
20215@@ -13,6 +13,7 @@
20216 #include <linux/pfn.h>
20217 #include <linux/slab.h>
20218 #include <linux/vmalloc.h>
20219+#include <linux/mmiotrace.h>
20220
20221 #include <asm/cacheflush.h>
20222 #include <asm/e820.h>
20223@@ -274,7 +275,8 @@ int ioremap_check_change_attr(unsigned l
20224 for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
20225 unsigned long pfn = mfn_to_local_pfn(mfn);
20226
20227- if (pfn >= max_pfn_mapped)
20228+ if (pfn >= max_low_pfn_mapped &&
20229+ (pfn < (1UL<<(32 - PAGE_SHIFT)) || pfn >= max_pfn_mapped))
20230 continue;
20231 rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
20232 PAGE_SIZE, prot_val);
20233@@ -297,11 +299,14 @@ static void __iomem *__ioremap_caller(re
20234 {
20235 unsigned long mfn, offset, vaddr;
20236 resource_size_t last_addr;
20237+ const resource_size_t unaligned_phys_addr = phys_addr;
20238+ const unsigned long unaligned_size = size;
20239 struct vm_struct *area;
20240 unsigned long new_prot_val;
20241 pgprot_t prot;
20242 int retval;
20243 domid_t domid = DOMID_IO;
20244+ void __iomem *ret_addr;
20245
20246 /* Don't allow wraparound or zero size */
20247 last_addr = phys_addr + size - 1;
20248@@ -318,7 +323,7 @@ static void __iomem *__ioremap_caller(re
20249 /*
20250 * Don't remap the low PCI/ISA area, it's always mapped..
20251 */
20252- if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
20253+ if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
20254 return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
20255
20256 /*
20257@@ -342,7 +347,7 @@ static void __iomem *__ioremap_caller(re
20258 phys_addr &= PAGE_MASK;
20259 size = PAGE_ALIGN(last_addr+1) - phys_addr;
20260
20261- retval = reserve_memtype(phys_addr, phys_addr + size,
20262+ retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
20263 prot_val, &new_prot_val);
20264 if (retval) {
20265 pr_debug("Warning: reserve_memtype returned %d\n", retval);
20266@@ -410,7 +415,10 @@ static void __iomem *__ioremap_caller(re
20267 return NULL;
20268 }
20269
20270- return (void __iomem *) (vaddr + offset);
20271+ ret_addr = (void __iomem *) (vaddr + offset);
20272+ mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
20273+
20274+ return ret_addr;
20275 }
20276
20277 /**
20278@@ -438,7 +446,7 @@ void __iomem *ioremap_nocache(resource_s
20279 {
20280 /*
20281 * Ideally, this should be:
20282- * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20283+ * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20284 *
20285 * Till we fix all X drivers to use ioremap_wc(), we will use
20286 * UC MINUS.
20287@@ -462,7 +470,7 @@ EXPORT_SYMBOL(ioremap_nocache);
20288 */
20289 void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
20290 {
20291- if (pat_wc_enabled)
20292+ if (pat_enabled)
20293 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
20294 __builtin_return_address(0));
20295 else
20296@@ -502,6 +510,14 @@ static void __iomem *ioremap_default(res
20297 }
20298 #endif
20299
20300+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
20301+ unsigned long prot_val)
20302+{
20303+ return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
20304+ __builtin_return_address(0));
20305+}
20306+EXPORT_SYMBOL(ioremap_prot);
20307+
20308 /**
20309 * iounmap - Free a IO remapping
20310 * @addr: virtual address from ioremap_*
20311@@ -526,6 +542,8 @@ void iounmap(volatile void __iomem *addr
20312 addr = (volatile void __iomem *)
20313 (PAGE_MASK & (unsigned long __force)addr);
20314
20315+ mmiotrace_iounmap(addr);
20316+
20317 /* Use the vm area unlocked, assuming the caller
20318 ensures there isn't another iounmap for the same address
20319 in parallel. Reuse of the virtual address is prevented by
20320@@ -533,7 +551,7 @@ void iounmap(volatile void __iomem *addr
20321 cpa takes care of the direct mappings. */
20322 read_lock(&vmlist_lock);
20323 for (p = vmlist; p; p = p->next) {
20324- if (p->addr == addr)
20325+ if (p->addr == (void __force *)addr)
20326 break;
20327 }
20328 read_unlock(&vmlist_lock);
20329@@ -547,7 +565,7 @@ void iounmap(volatile void __iomem *addr
20330 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
20331
20332 /* Finally remove it */
20333- o = remove_vm_area((void *)addr);
20334+ o = remove_vm_area((void __force *)addr);
20335 BUG_ON(p != o || o == NULL);
20336 kfree(p);
20337 }
20338@@ -567,7 +585,7 @@ void *xlate_dev_mem_ptr(unsigned long ph
20339 if (page_is_ram(start >> PAGE_SHIFT))
20340 return __va(phys);
20341
20342- addr = (void *)ioremap_default(start, PAGE_SIZE);
20343+ addr = (void __force *)ioremap_default(start, PAGE_SIZE);
20344 if (addr)
20345 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
20346
20347@@ -595,8 +613,7 @@ static int __init early_ioremap_debug_se
20348 early_param("early_ioremap_debug", early_ioremap_debug_setup);
20349
20350 static __initdata int after_paging_init;
20351-static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
20352- __section(.bss.page_aligned);
20353+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
20354
20355 #ifdef CONFIG_X86_32
20356 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
20357@@ -693,10 +710,11 @@ static void __init __early_set_fixmap(en
20358 return;
20359 }
20360 pte = early_ioremap_pte(addr);
20361+
20362 if (pgprot_val(flags))
20363 set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
20364 else
20365- pte_clear(NULL, addr, pte);
20366+ pte_clear(&init_mm, addr, pte);
20367 __flush_tlb_one(addr);
20368 }
20369
20370@@ -724,13 +742,11 @@ static int __init check_early_ioremap_le
20371 {
20372 if (!early_ioremap_nested)
20373 return 0;
20374-
20375- printk(KERN_WARNING
20376+ WARN(1, KERN_WARNING
20377 "Debug warning: early ioremap leak of %d areas detected.\n",
20378- early_ioremap_nested);
20379+ early_ioremap_nested);
20380 printk(KERN_WARNING
20381- "please boot with early_ioremap_debug and report the dmesg.\n");
20382- WARN_ON(1);
20383+ "please boot with early_ioremap_debug and report the dmesg.\n");
20384
20385 return 1;
20386 }
20387Index: head-2008-12-01/arch/x86/mm/pageattr-xen.c
20388===================================================================
20389--- head-2008-12-01.orig/arch/x86/mm/pageattr-xen.c 2008-12-01 12:19:27.000000000 +0100
20390+++ head-2008-12-01/arch/x86/mm/pageattr-xen.c 2008-12-01 11:49:07.000000000 +0100
20391@@ -34,6 +34,47 @@ struct cpa_data {
20392 unsigned force_split : 1;
20393 };
20394
20395+#ifdef CONFIG_PROC_FS
20396+static unsigned long direct_pages_count[PG_LEVEL_NUM];
20397+
20398+void update_page_count(int level, unsigned long pages)
20399+{
20400+ unsigned long flags;
20401+
20402+ /* Protect against CPA */
20403+ spin_lock_irqsave(&pgd_lock, flags);
20404+ direct_pages_count[level] += pages;
20405+ spin_unlock_irqrestore(&pgd_lock, flags);
20406+}
20407+
20408+static void split_page_count(int level)
20409+{
20410+ direct_pages_count[level]--;
20411+ direct_pages_count[level - 1] += PTRS_PER_PTE;
20412+}
20413+
20414+int arch_report_meminfo(char *page)
20415+{
20416+ int n = sprintf(page, "DirectMap4k: %8lu kB\n",
20417+ direct_pages_count[PG_LEVEL_4K] << 2);
20418+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
20419+ n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
20420+ direct_pages_count[PG_LEVEL_2M] << 11);
20421+#else
20422+ n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
20423+ direct_pages_count[PG_LEVEL_2M] << 12);
20424+#endif
20425+#ifdef CONFIG_X86_64
20426+ if (direct_gbpages)
20427+ n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
20428+ direct_pages_count[PG_LEVEL_1G] << 20);
20429+#endif
20430+ return n;
20431+}
20432+#else
20433+static inline void split_page_count(int level) { }
20434+#endif
20435+
20436 #ifdef CONFIG_X86_64
20437
20438 static inline unsigned long highmap_start_pfn(void)
20439@@ -106,7 +147,7 @@ static void cpa_flush_all(unsigned long
20440 {
20441 BUG_ON(irqs_disabled());
20442
20443- on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
20444+ on_each_cpu(__cpa_flush_all, (void *) cache, 1);
20445 }
20446
20447 static void __cpa_flush_range(void *arg)
20448@@ -127,7 +168,7 @@ static void cpa_flush_range(unsigned lon
20449 BUG_ON(irqs_disabled());
20450 WARN_ON(PAGE_ALIGN(start) != start);
20451
20452- on_each_cpu(__cpa_flush_range, NULL, 1, 1);
20453+ on_each_cpu(__cpa_flush_range, NULL, 1);
20454
20455 if (!cache)
20456 return;
20457@@ -229,6 +270,7 @@ pte_t *lookup_address(unsigned long addr
20458
20459 return pte_offset_kernel(pmd, address);
20460 }
20461+EXPORT_SYMBOL_GPL(lookup_address);
20462
20463 /*
20464 * Set the new pmd in all the pgds we know about:
20465@@ -509,6 +551,16 @@ static int split_large_page(pte_t *kpte,
20466 }
20467 #endif
20468
20469+ if (address >= (unsigned long)__va(0) &&
20470+ address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
20471+ split_page_count(level);
20472+
20473+#ifdef CONFIG_X86_64
20474+ if (address >= (unsigned long)__va(1UL<<32) &&
20475+ address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
20476+ split_page_count(level);
20477+#endif
20478+
20479 /*
20480 * Get the target mfn from the original entry:
20481 */
20482@@ -565,10 +617,9 @@ repeat:
20483 if (!__pte_val(old_pte)) {
20484 if (!primary)
20485 return 0;
20486- printk(KERN_WARNING "CPA: called for zero pte. "
20487+ WARN(1, KERN_WARNING "CPA: called for zero pte. "
20488 "vaddr = %lx cpa->vaddr = %lx\n", address,
20489 cpa->vaddr);
20490- WARN_ON(1);
20491 return -EINVAL;
20492 }
20493
20494@@ -633,15 +684,24 @@ static int cpa_process_alias(struct cpa_
20495 struct cpa_data alias_cpa;
20496 int ret = 0;
20497
20498- if (cpa->pfn > max_pfn_mapped)
20499+ if (cpa->pfn >= max_pfn_mapped)
20500 return 0;
20501
20502+#ifdef CONFIG_X86_64
20503+ if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
20504+ return 0;
20505+#endif
20506 /*
20507 * No need to redo, when the primary call touched the direct
20508 * mapping already:
20509 */
20510- if (!within(cpa->vaddr, PAGE_OFFSET,
20511- PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
20512+ if (!(within(cpa->vaddr, PAGE_OFFSET,
20513+ PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
20514+#ifdef CONFIG_X86_64
20515+ || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
20516+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
20517+#endif
20518+ )) {
20519
20520 alias_cpa = *cpa;
20521 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
20522@@ -809,7 +869,7 @@ int set_memory_uc(unsigned long addr, in
20523 /*
20524 * for now UC MINUS. see comments in ioremap_nocache()
20525 */
20526- if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20527+ if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20528 _PAGE_CACHE_UC_MINUS, NULL))
20529 return -EINVAL;
20530
20531@@ -825,10 +885,10 @@ int _set_memory_wc(unsigned long addr, i
20532
20533 int set_memory_wc(unsigned long addr, int numpages)
20534 {
20535- if (!pat_wc_enabled)
20536+ if (!pat_enabled)
20537 return set_memory_uc(addr, numpages);
20538
20539- if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20540+ if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20541 _PAGE_CACHE_WC, NULL))
20542 return -EINVAL;
20543
20544@@ -844,7 +904,7 @@ int _set_memory_wb(unsigned long addr, i
20545
20546 int set_memory_wb(unsigned long addr, int numpages)
20547 {
20548- free_memtype(addr, addr + numpages * PAGE_SIZE);
20549+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
20550
20551 return _set_memory_wb(addr, numpages);
20552 }
20553Index: head-2008-12-01/arch/x86/mm/pat-xen.c
20554===================================================================
20555--- head-2008-12-01.orig/arch/x86/mm/pat-xen.c 2008-12-01 11:44:55.000000000 +0100
20556+++ head-2008-12-01/arch/x86/mm/pat-xen.c 2008-12-01 11:49:07.000000000 +0100
20557@@ -12,6 +12,8 @@
20558 #include <linux/gfp.h>
20559 #include <linux/fs.h>
20560 #include <linux/bootmem.h>
20561+#include <linux/debugfs.h>
20562+#include <linux/seq_file.h>
20563
20564 #include <asm/msr.h>
20565 #include <asm/tlbflush.h>
20566@@ -26,11 +28,11 @@
20567 #include <asm/io.h>
20568
20569 #ifdef CONFIG_X86_PAT
20570-int __read_mostly pat_wc_enabled = 1;
20571+int __read_mostly pat_enabled = 1;
20572
20573 void __cpuinit pat_disable(char *reason)
20574 {
20575- pat_wc_enabled = 0;
20576+ pat_enabled = 0;
20577 printk(KERN_INFO "%s\n", reason);
20578 }
20579
20580@@ -42,6 +44,19 @@ static int __init nopat(char *str)
20581 early_param("nopat", nopat);
20582 #endif
20583
20584+
20585+static int debug_enable;
20586+static int __init pat_debug_setup(char *str)
20587+{
20588+ debug_enable = 1;
20589+ return 0;
20590+}
20591+__setup("debugpat", pat_debug_setup);
20592+
20593+#define dprintk(fmt, arg...) \
20594+ do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
20595+
20596+
20597 static u64 __read_mostly boot_pat_state;
20598
20599 enum {
20600@@ -53,24 +68,25 @@ enum {
20601 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
20602 };
20603
20604-#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
20605+#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
20606
20607 void pat_init(void)
20608 {
20609 u64 pat;
20610
20611- if (!pat_wc_enabled)
20612+ if (!pat_enabled)
20613 return;
20614
20615 /* Paranoia check. */
20616- if (!cpu_has_pat) {
20617- printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
20618+ if (!cpu_has_pat && boot_pat_state) {
20619 /*
20620- * Panic if this happens on the secondary CPU, and we
20621+ * If this happens we are on a secondary CPU, but
20622 * switched to PAT on the boot CPU. We have no way to
20623 * undo PAT.
20624- */
20625- BUG_ON(boot_pat_state);
20626+ */
20627+ printk(KERN_ERR "PAT enabled, "
20628+ "but not supported by secondary CPU\n");
20629+ BUG();
20630 }
20631
20632 #ifndef CONFIG_XEN
20633@@ -87,8 +103,8 @@ void pat_init(void)
20634 * 011 UC _PAGE_CACHE_UC
20635 * PAT bit unused
20636 */
20637- pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
20638- PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
20639+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
20640+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
20641
20642 /* Boot CPU check */
20643 if (!boot_pat_state)
20644@@ -113,13 +129,13 @@ void pat_init(void)
20645 static char *cattr_name(unsigned long flags)
20646 {
20647 switch (flags & _PAGE_CACHE_MASK) {
20648- case _PAGE_CACHE_UC: return "uncached";
20649- case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20650- case _PAGE_CACHE_WB: return "write-back";
20651- case _PAGE_CACHE_WC: return "write-combining";
20652- case _PAGE_CACHE_WP: return "write-protected";
20653- case _PAGE_CACHE_WT: return "write-through";
20654- default: return "broken";
20655+ case _PAGE_CACHE_UC: return "uncached";
20656+ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20657+ case _PAGE_CACHE_WB: return "write-back";
20658+ case _PAGE_CACHE_WC: return "write-combining";
20659+ case _PAGE_CACHE_WP: return "write-protected";
20660+ case _PAGE_CACHE_WT: return "write-through";
20661+ default: return "broken";
20662 }
20663 }
20664
20665@@ -157,49 +173,55 @@ static DEFINE_SPINLOCK(memtype_lock); /
20666 * The intersection is based on "Effective Memory Type" tables in IA-32
20667 * SDM vol 3a
20668 */
20669-static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
20670- unsigned long *ret_prot)
20671+static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
20672 {
20673- unsigned long pat_type;
20674- u8 mtrr_type;
20675-
20676- pat_type = prot & _PAGE_CACHE_MASK;
20677- prot &= (~_PAGE_CACHE_MASK);
20678-
20679- /*
20680- * We return the PAT request directly for types where PAT takes
20681- * precedence with respect to MTRR and for UC_MINUS.
20682- * Consistency checks with other PAT requests is done later
20683- * while going through memtype list.
20684- */
20685- if (pat_type == _PAGE_CACHE_WC) {
20686- *ret_prot = prot | _PAGE_CACHE_WC;
20687- return 0;
20688- } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
20689- *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
20690- return 0;
20691- } else if (pat_type == _PAGE_CACHE_UC) {
20692- *ret_prot = prot | _PAGE_CACHE_UC;
20693- return 0;
20694- }
20695-
20696 /*
20697 * Look for MTRR hint to get the effective type in case where PAT
20698 * request is for WB.
20699 */
20700- mtrr_type = mtrr_type_lookup(start, end);
20701+ if (req_type == _PAGE_CACHE_WB) {
20702+ u8 mtrr_type;
20703
20704- if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
20705- *ret_prot = prot | _PAGE_CACHE_UC;
20706- } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
20707- *ret_prot = prot | _PAGE_CACHE_WC;
20708- } else {
20709- *ret_prot = prot | _PAGE_CACHE_WB;
20710+ mtrr_type = mtrr_type_lookup(start, end);
20711+ if (mtrr_type == MTRR_TYPE_UNCACHABLE)
20712+ return _PAGE_CACHE_UC;
20713+ if (mtrr_type == MTRR_TYPE_WRCOMB)
20714+ return _PAGE_CACHE_WC;
20715+ }
20716+
20717+ return req_type;
20718+}
20719+
20720+static int chk_conflict(struct memtype *new, struct memtype *entry,
20721+ unsigned long *type)
20722+{
20723+ if (new->type != entry->type) {
20724+ if (type) {
20725+ new->type = entry->type;
20726+ *type = entry->type;
20727+ } else
20728+ goto conflict;
20729 }
20730
20731+ /* check overlaps with more than one entry in the list */
20732+ list_for_each_entry_continue(entry, &memtype_list, nd) {
20733+ if (new->end <= entry->start)
20734+ break;
20735+ else if (new->type != entry->type)
20736+ goto conflict;
20737+ }
20738 return 0;
20739+
20740+ conflict:
20741+ printk(KERN_INFO "%s:%d conflicting memory types "
20742+ "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
20743+ new->end, cattr_name(new->type), cattr_name(entry->type));
20744+ return -EBUSY;
20745 }
20746
20747+static struct memtype *cached_entry;
20748+static u64 cached_start;
20749+
20750 /*
20751 * req_type typically has one of the:
20752 * - _PAGE_CACHE_WB
20753@@ -210,37 +232,36 @@ static int pat_x_mtrr_type(u64 start, u6
20754 * req_type will have a special case value '-1', when requester want to inherit
20755 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
20756 *
20757- * If ret_type is NULL, function will return an error if it cannot reserve the
20758- * region with req_type. If ret_type is non-null, function will return
20759- * available type in ret_type in case of no error. In case of any error
20760+ * If new_type is NULL, function will return an error if it cannot reserve the
20761+ * region with req_type. If new_type is non-NULL, function will return
20762+ * available type in new_type in case of no error. In case of any error
20763 * it will return a negative return value.
20764 */
20765 int reserve_memtype(u64 start, u64 end, unsigned long req_type,
20766- unsigned long *ret_type)
20767+ unsigned long *new_type)
20768 {
20769- struct memtype *new_entry = NULL;
20770- struct memtype *parse;
20771+ struct memtype *new, *entry;
20772 unsigned long actual_type;
20773+ struct list_head *where;
20774 int err = 0;
20775
20776- /* Only track when pat_wc_enabled */
20777- if (!pat_wc_enabled) {
20778+ BUG_ON(start >= end); /* end is exclusive */
20779+
20780+ if (!pat_enabled) {
20781 /* This is identical to page table setting without PAT */
20782- if (ret_type) {
20783- if (req_type == -1) {
20784- *ret_type = _PAGE_CACHE_WB;
20785- } else {
20786- *ret_type = req_type;
20787- }
20788+ if (new_type) {
20789+ if (req_type == -1)
20790+ *new_type = _PAGE_CACHE_WB;
20791+ else
20792+ *new_type = req_type & _PAGE_CACHE_MASK;
20793 }
20794 return 0;
20795 }
20796
20797 /* Low ISA region is always mapped WB in page table. No need to track */
20798- if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
20799- if (ret_type)
20800- *ret_type = _PAGE_CACHE_WB;
20801-
20802+ if (is_ISA_range(start, end - 1)) {
20803+ if (new_type)
20804+ *new_type = _PAGE_CACHE_WB;
20805 return 0;
20806 }
20807
20808@@ -253,206 +274,118 @@ int reserve_memtype(u64 start, u64 end,
20809 */
20810 u8 mtrr_type = mtrr_type_lookup(start, end);
20811
20812- if (mtrr_type == MTRR_TYPE_WRBACK) {
20813- req_type = _PAGE_CACHE_WB;
20814+ if (mtrr_type == MTRR_TYPE_WRBACK)
20815 actual_type = _PAGE_CACHE_WB;
20816- } else {
20817- req_type = _PAGE_CACHE_UC_MINUS;
20818+ else
20819 actual_type = _PAGE_CACHE_UC_MINUS;
20820- }
20821- } else {
20822- req_type &= _PAGE_CACHE_MASK;
20823- err = pat_x_mtrr_type(start, end, req_type, &actual_type);
20824- }
20825-
20826- if (err) {
20827- if (ret_type)
20828- *ret_type = actual_type;
20829+ } else
20830+ actual_type = pat_x_mtrr_type(start, end,
20831+ req_type & _PAGE_CACHE_MASK);
20832
20833- return -EINVAL;
20834- }
20835-
20836- new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
20837- if (!new_entry)
20838+ new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
20839+ if (!new)
20840 return -ENOMEM;
20841
20842- new_entry->start = start;
20843- new_entry->end = end;
20844- new_entry->type = actual_type;
20845+ new->start = start;
20846+ new->end = end;
20847+ new->type = actual_type;
20848
20849- if (ret_type)
20850- *ret_type = actual_type;
20851+ if (new_type)
20852+ *new_type = actual_type;
20853
20854 spin_lock(&memtype_lock);
20855
20856- /* Search for existing mapping that overlaps the current range */
20857- list_for_each_entry(parse, &memtype_list, nd) {
20858- struct memtype *saved_ptr;
20859+ if (cached_entry && start >= cached_start)
20860+ entry = cached_entry;
20861+ else
20862+ entry = list_entry(&memtype_list, struct memtype, nd);
20863
20864- if (parse->start >= end) {
20865- pr_debug("New Entry\n");
20866- list_add(&new_entry->nd, parse->nd.prev);
20867- new_entry = NULL;
20868+ /* Search for existing mapping that overlaps the current range */
20869+ where = NULL;
20870+ list_for_each_entry_continue(entry, &memtype_list, nd) {
20871+ if (end <= entry->start) {
20872+ where = entry->nd.prev;
20873+ cached_entry = list_entry(where, struct memtype, nd);
20874 break;
20875- }
20876-
20877- if (start <= parse->start && end >= parse->start) {
20878- if (actual_type != parse->type && ret_type) {
20879- actual_type = parse->type;
20880- *ret_type = actual_type;
20881- new_entry->type = actual_type;
20882- }
20883-
20884- if (actual_type != parse->type) {
20885- printk(
20886- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20887- current->comm, current->pid,
20888- start, end,
20889- cattr_name(actual_type),
20890- cattr_name(parse->type));
20891- err = -EBUSY;
20892- break;
20893- }
20894-
20895- saved_ptr = parse;
20896- /*
20897- * Check to see whether the request overlaps more
20898- * than one entry in the list
20899- */
20900- list_for_each_entry_continue(parse, &memtype_list, nd) {
20901- if (end <= parse->start) {
20902- break;
20903- }
20904-
20905- if (actual_type != parse->type) {
20906- printk(
20907- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20908- current->comm, current->pid,
20909- start, end,
20910- cattr_name(actual_type),
20911- cattr_name(parse->type));
20912- err = -EBUSY;
20913- break;
20914- }
20915- }
20916-
20917- if (err) {
20918- break;
20919+ } else if (start <= entry->start) { /* end > entry->start */
20920+ err = chk_conflict(new, entry, new_type);
20921+ if (!err) {
20922+ dprintk("Overlap at 0x%Lx-0x%Lx\n",
20923+ entry->start, entry->end);
20924+ where = entry->nd.prev;
20925+ cached_entry = list_entry(where,
20926+ struct memtype, nd);
20927 }
20928-
20929- pr_debug("Overlap at 0x%Lx-0x%Lx\n",
20930- saved_ptr->start, saved_ptr->end);
20931- /* No conflict. Go ahead and add this new entry */
20932- list_add(&new_entry->nd, saved_ptr->nd.prev);
20933- new_entry = NULL;
20934 break;
20935- }
20936-
20937- if (start < parse->end) {
20938- if (actual_type != parse->type && ret_type) {
20939- actual_type = parse->type;
20940- *ret_type = actual_type;
20941- new_entry->type = actual_type;
20942- }
20943-
20944- if (actual_type != parse->type) {
20945- printk(
20946- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20947- current->comm, current->pid,
20948- start, end,
20949- cattr_name(actual_type),
20950- cattr_name(parse->type));
20951- err = -EBUSY;
20952- break;
20953- }
20954-
20955- saved_ptr = parse;
20956- /*
20957- * Check to see whether the request overlaps more
20958- * than one entry in the list
20959- */
20960- list_for_each_entry_continue(parse, &memtype_list, nd) {
20961- if (end <= parse->start) {
20962- break;
20963- }
20964-
20965- if (actual_type != parse->type) {
20966- printk(
20967- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20968- current->comm, current->pid,
20969- start, end,
20970- cattr_name(actual_type),
20971- cattr_name(parse->type));
20972- err = -EBUSY;
20973- break;
20974+ } else if (start < entry->end) { /* start > entry->start */
20975+ err = chk_conflict(new, entry, new_type);
20976+ if (!err) {
20977+ dprintk("Overlap at 0x%Lx-0x%Lx\n",
20978+ entry->start, entry->end);
20979+ cached_entry = list_entry(entry->nd.prev,
20980+ struct memtype, nd);
20981+
20982+ /*
20983+ * Move to right position in the linked
20984+ * list to add this new entry
20985+ */
20986+ list_for_each_entry_continue(entry,
20987+ &memtype_list, nd) {
20988+ if (start <= entry->start) {
20989+ where = entry->nd.prev;
20990+ break;
20991+ }
20992 }
20993 }
20994-
20995- if (err) {
20996- break;
20997- }
20998-
20999- pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
21000- saved_ptr->start, saved_ptr->end);
21001- /* No conflict. Go ahead and add this new entry */
21002- list_add(&new_entry->nd, &saved_ptr->nd);
21003- new_entry = NULL;
21004 break;
21005 }
21006 }
21007
21008 if (err) {
21009- printk(KERN_INFO
21010- "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
21011- start, end, cattr_name(new_entry->type),
21012- cattr_name(req_type));
21013- kfree(new_entry);
21014+ printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
21015+ "track %s, req %s\n",
21016+ start, end, cattr_name(new->type), cattr_name(req_type));
21017+ kfree(new);
21018 spin_unlock(&memtype_lock);
21019 return err;
21020 }
21021
21022- if (new_entry) {
21023- /* No conflict. Not yet added to the list. Add to the tail */
21024- list_add_tail(&new_entry->nd, &memtype_list);
21025- pr_debug("New Entry\n");
21026- }
21027+ cached_start = start;
21028
21029- if (ret_type) {
21030- pr_debug(
21031- "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21032- start, end, cattr_name(actual_type),
21033- cattr_name(req_type), cattr_name(*ret_type));
21034- } else {
21035- pr_debug(
21036- "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
21037- start, end, cattr_name(actual_type),
21038- cattr_name(req_type));
21039- }
21040+ if (where)
21041+ list_add(&new->nd, where);
21042+ else
21043+ list_add_tail(&new->nd, &memtype_list);
21044
21045 spin_unlock(&memtype_lock);
21046+
21047+ dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21048+ start, end, cattr_name(new->type), cattr_name(req_type),
21049+ new_type ? cattr_name(*new_type) : "-");
21050+
21051 return err;
21052 }
21053
21054 int free_memtype(u64 start, u64 end)
21055 {
21056- struct memtype *ml;
21057+ struct memtype *entry;
21058 int err = -EINVAL;
21059
21060- /* Only track when pat_wc_enabled */
21061- if (!pat_wc_enabled) {
21062+ if (!pat_enabled)
21063 return 0;
21064- }
21065
21066 /* Low ISA region is always mapped WB. No need to track */
21067- if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
21068+ if (is_ISA_range(start, end - 1))
21069 return 0;
21070- }
21071
21072 spin_lock(&memtype_lock);
21073- list_for_each_entry(ml, &memtype_list, nd) {
21074- if (ml->start == start && ml->end == end) {
21075- list_del(&ml->nd);
21076- kfree(ml);
21077+ list_for_each_entry(entry, &memtype_list, nd) {
21078+ if (entry->start == start && entry->end == end) {
21079+ if (cached_entry == entry || cached_start == start)
21080+ cached_entry = NULL;
21081+
21082+ list_del(&entry->nd);
21083+ kfree(entry);
21084 err = 0;
21085 break;
21086 }
21087@@ -464,27 +397,19 @@ int free_memtype(u64 start, u64 end)
21088 current->comm, current->pid, start, end);
21089 }
21090
21091- pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21092+ dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21093 return err;
21094 }
21095
21096
21097-/*
21098- * /dev/mem mmap interface. The memtype used for mapping varies:
21099- * - Use UC for mappings with O_SYNC flag
21100- * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
21101- * inherit the memtype from existing mapping.
21102- * - Else use UC_MINUS memtype (for backward compatibility with existing
21103- * X drivers.
21104- */
21105 pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
21106 unsigned long size, pgprot_t vma_prot)
21107 {
21108 return vma_prot;
21109 }
21110
21111-#ifdef CONFIG_NONPROMISC_DEVMEM
21112-/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
21113+#ifdef CONFIG_STRICT_DEVMEM
21114+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
21115 static inline int range_is_allowed(unsigned long mfn, unsigned long size)
21116 {
21117 return 1;
21118@@ -508,20 +433,20 @@ static inline int range_is_allowed(unsig
21119 }
21120 return 1;
21121 }
21122-#endif /* CONFIG_NONPROMISC_DEVMEM */
21123+#endif /* CONFIG_STRICT_DEVMEM */
21124
21125 int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
21126 unsigned long size, pgprot_t *vma_prot)
21127 {
21128 u64 addr = (u64)mfn << PAGE_SHIFT;
21129- unsigned long flags = _PAGE_CACHE_UC_MINUS;
21130+ unsigned long flags = -1;
21131 int retval;
21132
21133 if (!range_is_allowed(mfn, size))
21134 return 0;
21135
21136 if (file->f_flags & O_SYNC) {
21137- flags = _PAGE_CACHE_UC;
21138+ flags = _PAGE_CACHE_UC_MINUS;
21139 }
21140
21141 #ifndef CONFIG_X86_32
21142@@ -534,25 +459,26 @@ int phys_mem_access_prot_allowed(struct
21143 * caching for the high addresses through the KEN pin, but
21144 * we maintain the tradition of paranoia in this code.
21145 */
21146- if (!pat_wc_enabled &&
21147- ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
21148- test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
21149- test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
21150- test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
21151- (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21152+ if (!pat_enabled &&
21153+ !(boot_cpu_has(X86_FEATURE_MTRR) ||
21154+ boot_cpu_has(X86_FEATURE_K6_MTRR) ||
21155+ boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
21156+ boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
21157+ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21158 flags = _PAGE_CACHE_UC;
21159 }
21160 #endif
21161 #endif
21162
21163 /*
21164- * With O_SYNC, we can only take UC mapping. Fail if we cannot.
21165+ * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
21166+ *
21167 * Without O_SYNC, we want to get
21168 * - WB for WB-able memory and no other conflicting mappings
21169 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
21170 * - Inherit from confliting mappings otherwise
21171 */
21172- if (flags != _PAGE_CACHE_UC_MINUS) {
21173+ if (flags != -1) {
21174 retval = reserve_memtype(addr, addr + size, flags, NULL);
21175 } else {
21176 retval = reserve_memtype(addr, addr + size, -1, &flags);
21177@@ -600,3 +526,88 @@ void unmap_devmem(unsigned long mfn, uns
21178 free_memtype(addr, addr + size);
21179 }
21180
21181+#if defined(CONFIG_DEBUG_FS)
21182+
21183+/* get Nth element of the linked list */
21184+static struct memtype *memtype_get_idx(loff_t pos)
21185+{
21186+ struct memtype *list_node, *print_entry;
21187+ int i = 1;
21188+
21189+ print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21190+ if (!print_entry)
21191+ return NULL;
21192+
21193+ spin_lock(&memtype_lock);
21194+ list_for_each_entry(list_node, &memtype_list, nd) {
21195+ if (pos == i) {
21196+ *print_entry = *list_node;
21197+ spin_unlock(&memtype_lock);
21198+ return print_entry;
21199+ }
21200+ ++i;
21201+ }
21202+ spin_unlock(&memtype_lock);
21203+ kfree(print_entry);
21204+ return NULL;
21205+}
21206+
21207+static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
21208+{
21209+ if (*pos == 0) {
21210+ ++*pos;
21211+ seq_printf(seq, "PAT memtype list:\n");
21212+ }
21213+
21214+ return memtype_get_idx(*pos);
21215+}
21216+
21217+static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
21218+{
21219+ ++*pos;
21220+ return memtype_get_idx(*pos);
21221+}
21222+
21223+static void memtype_seq_stop(struct seq_file *seq, void *v)
21224+{
21225+}
21226+
21227+static int memtype_seq_show(struct seq_file *seq, void *v)
21228+{
21229+ struct memtype *print_entry = (struct memtype *)v;
21230+
21231+ seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
21232+ print_entry->start, print_entry->end);
21233+ kfree(print_entry);
21234+ return 0;
21235+}
21236+
21237+static struct seq_operations memtype_seq_ops = {
21238+ .start = memtype_seq_start,
21239+ .next = memtype_seq_next,
21240+ .stop = memtype_seq_stop,
21241+ .show = memtype_seq_show,
21242+};
21243+
21244+static int memtype_seq_open(struct inode *inode, struct file *file)
21245+{
21246+ return seq_open(file, &memtype_seq_ops);
21247+}
21248+
21249+static const struct file_operations memtype_fops = {
21250+ .open = memtype_seq_open,
21251+ .read = seq_read,
21252+ .llseek = seq_lseek,
21253+ .release = seq_release,
21254+};
21255+
21256+static int __init pat_memtype_list_init(void)
21257+{
21258+ debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
21259+ NULL, &memtype_fops);
21260+ return 0;
21261+}
21262+
21263+late_initcall(pat_memtype_list_init);
21264+
21265+#endif /* CONFIG_DEBUG_FS */
21266Index: head-2008-12-01/arch/x86/mm/pgtable-xen.c
21267===================================================================
21268--- head-2008-12-01.orig/arch/x86/mm/pgtable-xen.c 2008-12-01 11:46:22.000000000 +0100
21269+++ head-2008-12-01/arch/x86/mm/pgtable-xen.c 2008-12-01 11:49:07.000000000 +0100
21270@@ -4,6 +4,7 @@
21271 #include <asm/pgalloc.h>
21272 #include <asm/pgtable.h>
21273 #include <asm/tlb.h>
21274+#include <asm/fixmap.h>
21275 #include <asm/hypervisor.h>
21276 #include <asm/mmu_context.h>
21277
21278@@ -410,15 +411,9 @@ static inline void pgd_list_del(pgd_t *p
21279 static void pgd_ctor(void *p)
21280 {
21281 pgd_t *pgd = p;
21282- unsigned long flags;
21283
21284 pgd_test_and_unpin(pgd);
21285
21286- /* Clear usermode parts of PGD */
21287- memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
21288-
21289- spin_lock_irqsave(&pgd_lock, flags);
21290-
21291 /* If the pgd points to a shared pagetable level (either the
21292 ptes in non-PAE, or shared PMD in PAE), then just copy the
21293 references from swapper_pg_dir. */
21294@@ -440,13 +435,9 @@ static void pgd_ctor(void *p)
21295 __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
21296 #endif
21297
21298-#ifndef CONFIG_X86_PAE
21299 /* list required to sync kernel mapping updates */
21300 if (!SHARED_KERNEL_PMD)
21301 pgd_list_add(pgd);
21302-#endif
21303-
21304- spin_unlock_irqrestore(&pgd_lock, flags);
21305 }
21306
21307 static void pgd_dtor(void *pgd)
21308@@ -475,33 +466,6 @@ static void pgd_dtor(void *pgd)
21309
21310 #ifdef CONFIG_X86_PAE
21311 /*
21312- * Mop up any pmd pages which may still be attached to the pgd.
21313- * Normally they will be freed by munmap/exit_mmap, but any pmd we
21314- * preallocate which never got a corresponding vma will need to be
21315- * freed manually.
21316- */
21317-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21318-{
21319- int i;
21320-
21321- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
21322- pgd_t pgd = pgdp[i];
21323-
21324- if (__pgd_val(pgd) != 0) {
21325- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21326-
21327- pgdp[i] = xen_make_pgd(0);
21328-
21329- paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21330- pmd_free(mm, pmd);
21331- }
21332- }
21333-
21334- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21335- xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21336-}
21337-
21338-/*
21339 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
21340 * updating the top-level pagetable entries to guarantee the
21341 * processor notices the update. Since this is expensive, and
21342@@ -512,61 +476,7 @@ static void pgd_mop_up_pmds(struct mm_st
21343 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
21344 * and initialize the kernel pmds here.
21345 */
21346-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21347-{
21348- pud_t *pud;
21349- pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
21350- unsigned long addr, flags;
21351- int i;
21352-
21353- /*
21354- * We can race save/restore (if we sleep during a GFP_KERNEL memory
21355- * allocation). We therefore store virtual addresses of pmds as they
21356- * do not change across save/restore, and poke the machine addresses
21357- * into the pgdir under the pgd_lock.
21358- */
21359- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
21360- pmds[i] = pmd_alloc_one(mm, addr);
21361- if (!pmds[i])
21362- goto out_oom;
21363- }
21364-
21365- spin_lock_irqsave(&pgd_lock, flags);
21366-
21367- /* Protect against save/restore: move below 4GB under pgd_lock. */
21368- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21369- && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21370- spin_unlock_irqrestore(&pgd_lock, flags);
21371-out_oom:
21372- while (i--)
21373- pmd_free(mm, pmds[i]);
21374- return 0;
21375- }
21376-
21377- /* Copy kernel pmd contents and write-protect the new pmds. */
21378- pud = pud_offset(pgd, 0);
21379- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
21380- i++, pud++, addr += PUD_SIZE) {
21381- if (i >= KERNEL_PGD_BOUNDARY) {
21382- memcpy(pmds[i],
21383- (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21384- sizeof(pmd_t) * PTRS_PER_PMD);
21385- make_lowmem_page_readonly(
21386- pmds[i], XENFEAT_writable_page_tables);
21387- }
21388-
21389- /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21390- pud_populate(mm, pud, pmds[i]);
21391- }
21392-
21393- /* List required to sync kernel mapping updates and
21394- * to pin/unpin on save/restore. */
21395- pgd_list_add(pgd);
21396-
21397- spin_unlock_irqrestore(&pgd_lock, flags);
21398-
21399- return 1;
21400-}
21401+#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
21402
21403 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
21404 {
21405@@ -596,16 +506,97 @@ void pud_populate(struct mm_struct *mm,
21406 xen_tlb_flush();
21407 }
21408 #else /* !CONFIG_X86_PAE */
21409+
21410 /* No need to prepopulate any pagetable entries in non-PAE modes. */
21411-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21412+#define PREALLOCATED_PMDS 0
21413+
21414+#endif /* CONFIG_X86_PAE */
21415+
21416+static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
21417 {
21418- return 1;
21419+ int i;
21420+
21421+ if (contig)
21422+ xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
21423+
21424+ for(i = 0; i < PREALLOCATED_PMDS; i++)
21425+ if (pmds[i])
21426+ pmd_free(mm, pmds[i]);
21427 }
21428
21429-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
21430+static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
21431 {
21432+ int i;
21433+ bool failed = false;
21434+
21435+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
21436+ pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
21437+ if (pmd == NULL)
21438+ failed = true;
21439+ pmds[i] = pmd;
21440+ }
21441+
21442+ if (failed) {
21443+ free_pmds(pmds, mm, false);
21444+ return -ENOMEM;
21445+ }
21446+
21447+ return 0;
21448+}
21449+
21450+/*
21451+ * Mop up any pmd pages which may still be attached to the pgd.
21452+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
21453+ * preallocate which never got a corresponding vma will need to be
21454+ * freed manually.
21455+ */
21456+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21457+{
21458+ int i;
21459+
21460+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
21461+ pgd_t pgd = pgdp[i];
21462+
21463+ if (__pgd_val(pgd) != 0) {
21464+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21465+
21466+ pgdp[i] = xen_make_pgd(0);
21467+
21468+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21469+ pmd_free(mm, pmd);
21470+ }
21471+ }
21472+
21473+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21474+ xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21475+}
21476+
21477+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
21478+{
21479+ pud_t *pud;
21480+ unsigned long addr;
21481+ int i;
21482+
21483+ if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
21484+ return;
21485+
21486+ pud = pud_offset(pgd, 0);
21487+ for (addr = i = 0; i < PREALLOCATED_PMDS;
21488+ i++, pud++, addr += PUD_SIZE) {
21489+ pmd_t *pmd = pmds[i];
21490+
21491+ if (i >= KERNEL_PGD_BOUNDARY) {
21492+ memcpy(pmd,
21493+ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21494+ sizeof(pmd_t) * PTRS_PER_PMD);
21495+ make_lowmem_page_readonly(
21496+ pmd, XENFEAT_writable_page_tables);
21497+ }
21498+
21499+ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21500+ pud_populate(mm, pud, pmd);
21501+ }
21502 }
21503-#endif /* CONFIG_X86_PAE */
21504
21505 #ifdef CONFIG_X86_64
21506 /* We allocate two contiguous pages for kernel and user. */
21507@@ -616,19 +607,52 @@ static void pgd_mop_up_pmds(struct mm_st
21508
21509 pgd_t *pgd_alloc(struct mm_struct *mm)
21510 {
21511- pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21512+ pgd_t *pgd;
21513+ pmd_t *pmds[PREALLOCATED_PMDS];
21514+ unsigned long flags;
21515+
21516+ pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21517+
21518+ if (pgd == NULL)
21519+ goto out;
21520
21521- /* so that alloc_pd can use it */
21522 mm->pgd = pgd;
21523- if (pgd)
21524- pgd_ctor(pgd);
21525
21526- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
21527- free_pages((unsigned long)pgd, PGD_ORDER);
21528- pgd = NULL;
21529+ if (preallocate_pmds(pmds, mm) != 0)
21530+ goto out_free_pgd;
21531+
21532+ if (paravirt_pgd_alloc(mm) != 0)
21533+ goto out_free_pmds;
21534+
21535+ /*
21536+ * Make sure that pre-populating the pmds is atomic with
21537+ * respect to anything walking the pgd_list, so that they
21538+ * never see a partially populated pgd.
21539+ */
21540+ spin_lock_irqsave(&pgd_lock, flags);
21541+
21542+#ifdef CONFIG_X86_PAE
21543+ /* Protect against save/restore: move below 4GB under pgd_lock. */
21544+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21545+ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21546+ spin_unlock_irqrestore(&pgd_lock, flags);
21547+ goto out_free_pmds;
21548 }
21549+#endif
21550+
21551+ pgd_ctor(pgd);
21552+ pgd_prepopulate_pmd(mm, pgd, pmds);
21553+
21554+ spin_unlock_irqrestore(&pgd_lock, flags);
21555
21556 return pgd;
21557+
21558+out_free_pmds:
21559+ free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
21560+out_free_pgd:
21561+ free_pages((unsigned long)pgd, PGD_ORDER);
21562+out:
21563+ return NULL;
21564 }
21565
21566 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
21567@@ -644,6 +668,7 @@ void pgd_free(struct mm_struct *mm, pgd_
21568 pgd_dtor(pgd);
21569
21570 pgd_mop_up_pmds(mm, pgd);
21571+ paravirt_pgd_free(mm, pgd);
21572 free_pages((unsigned long)pgd, PGD_ORDER);
21573 }
21574
21575@@ -685,7 +710,7 @@ int ptep_test_and_clear_young(struct vm_
21576
21577 if (pte_young(*ptep))
21578 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
21579- &ptep->pte);
21580+ (unsigned long *) &ptep->pte);
21581
21582 if (ret)
21583 pte_update(vma->vm_mm, addr, ptep);
21584@@ -707,3 +732,42 @@ int ptep_clear_flush_young(struct vm_are
21585
21586 return young;
21587 }
21588+
21589+int fixmaps_set;
21590+
21591+void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21592+{
21593+ unsigned long address = __fix_to_virt(idx);
21594+ pte_t pte;
21595+
21596+ if (idx >= __end_of_fixed_addresses) {
21597+ BUG();
21598+ return;
21599+ }
21600+
21601+ switch (idx) {
21602+#ifdef CONFIG_X86_64
21603+ extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
21604+
21605+ case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
21606+ pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21607+ set_pte_vaddr_pud(level3_user_pgt, address, pte);
21608+ break;
21609+ case FIX_EARLYCON_MEM_BASE:
21610+ xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
21611+ pfn_pte_ma(phys >> PAGE_SHIFT, flags));
21612+ fixmaps_set++;
21613+ return;
21614+#else
21615+ case FIX_WP_TEST:
21616+ case FIX_VDSO:
21617+ pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21618+ break;
21619+#endif
21620+ default:
21621+ pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21622+ break;
21623+ }
21624+ set_pte_vaddr(address, pte);
21625+ fixmaps_set++;
21626+}
21627Index: head-2008-12-01/arch/x86/mm/pgtable_32-xen.c
21628===================================================================
21629--- head-2008-12-01.orig/arch/x86/mm/pgtable_32-xen.c 2008-12-01 11:44:55.000000000 +0100
21630+++ head-2008-12-01/arch/x86/mm/pgtable_32-xen.c 2008-12-01 11:49:07.000000000 +0100
21631@@ -25,51 +25,49 @@
21632 #include <xen/features.h>
21633 #include <asm/hypervisor.h>
21634
21635-void show_mem(void)
21636+/*
21637+ * Associate a virtual page frame with a given physical page frame
21638+ * and protection flags for that frame.
21639+ */
21640+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
21641 {
21642- int total = 0, reserved = 0;
21643- int shared = 0, cached = 0;
21644- int highmem = 0;
21645- struct page *page;
21646- pg_data_t *pgdat;
21647- unsigned long i;
21648- unsigned long flags;
21649-
21650- printk(KERN_INFO "Mem-info:\n");
21651- show_free_areas();
21652- for_each_online_pgdat(pgdat) {
21653- pgdat_resize_lock(pgdat, &flags);
21654- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
21655- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
21656- touch_nmi_watchdog();
21657- page = pgdat_page_nr(pgdat, i);
21658- total++;
21659- if (PageHighMem(page))
21660- highmem++;
21661- if (PageReserved(page))
21662- reserved++;
21663- else if (PageSwapCache(page))
21664- cached++;
21665- else if (page_count(page))
21666- shared += page_count(page) - 1;
21667- }
21668- pgdat_resize_unlock(pgdat, &flags);
21669- }
21670- printk(KERN_INFO "%d pages of RAM\n", total);
21671- printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
21672- printk(KERN_INFO "%d reserved pages\n", reserved);
21673- printk(KERN_INFO "%d pages shared\n", shared);
21674- printk(KERN_INFO "%d pages swap cached\n", cached);
21675-
21676- printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
21677- printk(KERN_INFO "%lu pages writeback\n",
21678- global_page_state(NR_WRITEBACK));
21679- printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
21680- printk(KERN_INFO "%lu pages slab\n",
21681- global_page_state(NR_SLAB_RECLAIMABLE) +
21682- global_page_state(NR_SLAB_UNRECLAIMABLE));
21683- printk(KERN_INFO "%lu pages pagetables\n",
21684- global_page_state(NR_PAGETABLE));
21685+#ifndef CONFIG_XEN
21686+ pgd_t *pgd;
21687+ pud_t *pud;
21688+ pmd_t *pmd;
21689+ pte_t *pte;
21690+
21691+ pgd = swapper_pg_dir + pgd_index(vaddr);
21692+ if (pgd_none(*pgd)) {
21693+ BUG();
21694+ return;
21695+ }
21696+ pud = pud_offset(pgd, vaddr);
21697+ if (pud_none(*pud)) {
21698+ BUG();
21699+ return;
21700+ }
21701+ pmd = pmd_offset(pud, vaddr);
21702+ if (pmd_none(*pmd)) {
21703+ BUG();
21704+ return;
21705+ }
21706+ pte = pte_offset_kernel(pmd, vaddr);
21707+ if (pte_val(pteval))
21708+ set_pte_present(&init_mm, vaddr, pte, pteval);
21709+ else
21710+ pte_clear(&init_mm, vaddr, pte);
21711+
21712+ /*
21713+ * It's enough to flush this one mapping.
21714+ * (PGE mappings get flushed as well)
21715+ */
21716+ __flush_tlb_one(vaddr);
21717+#else
21718+ if (HYPERVISOR_update_va_mapping(vaddr, pteval,
21719+ UVMF_INVLPG|UVMF_ALL))
21720+ BUG();
21721+#endif
21722 }
21723
21724 /*
21725@@ -107,35 +105,10 @@ void set_pmd_pfn(unsigned long vaddr, un
21726 __flush_tlb_one(vaddr);
21727 }
21728
21729-static int fixmaps;
21730 unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
21731 unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
21732 EXPORT_SYMBOL(__FIXADDR_TOP);
21733
21734-void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21735-{
21736- unsigned long address = __fix_to_virt(idx);
21737- pte_t pte;
21738-
21739- if (idx >= __end_of_fixed_addresses) {
21740- BUG();
21741- return;
21742- }
21743- switch (idx) {
21744- case FIX_WP_TEST:
21745- case FIX_VDSO:
21746- pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21747- break;
21748- default:
21749- pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21750- break;
21751- }
21752- if (HYPERVISOR_update_va_mapping(address, pte,
21753- UVMF_INVLPG|UVMF_ALL))
21754- BUG();
21755- fixmaps++;
21756-}
21757-
21758 /**
21759 * reserve_top_address - reserves a hole in the top of kernel address space
21760 * @reserve - size of hole to reserve
21761@@ -145,13 +118,48 @@ void __set_fixmap (enum fixed_addresses
21762 */
21763 void __init reserve_top_address(unsigned long reserve)
21764 {
21765- BUG_ON(fixmaps > 0);
21766+ BUG_ON(fixmaps_set > 0);
21767 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
21768 (int)-reserve);
21769 __FIXADDR_TOP = -reserve - PAGE_SIZE;
21770 __VMALLOC_RESERVE += reserve;
21771 }
21772
21773+/*
21774+ * vmalloc=size forces the vmalloc area to be exactly 'size'
21775+ * bytes. This can be used to increase (or decrease) the
21776+ * vmalloc area - the default is 128m.
21777+ */
21778+static int __init parse_vmalloc(char *arg)
21779+{
21780+ if (!arg)
21781+ return -EINVAL;
21782+
21783+ __VMALLOC_RESERVE = memparse(arg, &arg);
21784+ return 0;
21785+}
21786+early_param("vmalloc", parse_vmalloc);
21787+
21788+#ifndef CONFIG_XEN
21789+/*
21790+ * reservetop=size reserves a hole at the top of the kernel address space which
21791+ * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
21792+ * so relocating the fixmap can be done before paging initialization.
21793+ */
21794+static int __init parse_reservetop(char *arg)
21795+{
21796+ unsigned long address;
21797+
21798+ if (!arg)
21799+ return -EINVAL;
21800+
21801+ address = memparse(arg, &arg);
21802+ reserve_top_address(address);
21803+ return 0;
21804+}
21805+early_param("reservetop", parse_reservetop);
21806+#endif
21807+
21808 void make_lowmem_page_readonly(void *va, unsigned int feature)
21809 {
21810 pte_t *pte;
21811Index: head-2008-12-01/arch/x86/pci/amd_bus.c
21812===================================================================
21813--- head-2008-12-01.orig/arch/x86/pci/amd_bus.c 2008-12-03 15:48:43.000000000 +0100
21814+++ head-2008-12-01/arch/x86/pci/amd_bus.c 2008-12-01 11:49:07.000000000 +0100
21815@@ -607,6 +607,14 @@ static int __init pci_io_ecs_init(void)
21816 for_each_online_cpu(cpu)
21817 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
21818 (void *)(long)cpu);
21819+#ifdef CONFIG_XEN
21820+ {
21821+ u64 reg;
21822+ rdmsrl(MSR_AMD64_NB_CFG, reg);
21823+ if (!(reg & ENABLE_CF8_EXT_CFG))
21824+ return 0;
21825+ }
21826+#endif
21827 pci_probe |= PCI_HAS_IO_ECS;
21828
21829 return 0;
21830@@ -614,6 +622,10 @@ static int __init pci_io_ecs_init(void)
21831
21832 static int __init amd_postcore_init(void)
21833 {
21834+#ifdef CONFIG_XEN
21835+ if (!is_initial_xendomain())
21836+ return 0;
21837+#endif
21838 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
21839 return 0;
21840
21841Index: head-2008-12-01/arch/x86/pci/irq-xen.c
21842===================================================================
21843--- head-2008-12-01.orig/arch/x86/pci/irq-xen.c 2008-12-01 11:44:55.000000000 +0100
21844+++ head-2008-12-01/arch/x86/pci/irq-xen.c 2008-12-01 11:49:07.000000000 +0100
21845@@ -11,8 +11,8 @@
21846 #include <linux/slab.h>
21847 #include <linux/interrupt.h>
21848 #include <linux/dmi.h>
21849-#include <asm/io.h>
21850-#include <asm/smp.h>
21851+#include <linux/io.h>
21852+#include <linux/smp.h>
21853 #include <asm/io_apic.h>
21854 #include <linux/irq.h>
21855 #include <linux/acpi.h>
21856@@ -45,7 +45,8 @@ struct irq_router {
21857 char *name;
21858 u16 vendor, device;
21859 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
21860- int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
21861+ int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
21862+ int new);
21863 };
21864
21865 struct irq_router_handler {
21866@@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_d
21867 * and perform checksum verification.
21868 */
21869
21870-static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
21871+static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
21872 {
21873 struct irq_routing_table *rt;
21874 int i;
21875@@ -74,10 +75,11 @@ static inline struct irq_routing_table *
21876 rt->size < sizeof(struct irq_routing_table))
21877 return NULL;
21878 sum = 0;
21879- for (i=0; i < rt->size; i++)
21880+ for (i = 0; i < rt->size; i++)
21881 sum += addr[i];
21882 if (!sum) {
21883- DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
21884+ DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
21885+ rt);
21886 return rt;
21887 }
21888 return NULL;
21889@@ -104,7 +106,9 @@ static struct irq_routing_table * __init
21890 return rt;
21891 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
21892 }
21893- for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
21894+ for (addr = (u8 *) isa_bus_to_virt(0xf0000);
21895+ addr < (u8 *) isa_bus_to_virt(0x100000);
21896+ addr += 16) {
21897 rt = pirq_check_routing_table(addr);
21898 if (rt)
21899 return rt;
21900@@ -126,20 +130,20 @@ static void __init pirq_peer_trick(void)
21901 struct irq_info *e;
21902
21903 memset(busmap, 0, sizeof(busmap));
21904- for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
21905+ for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
21906 e = &rt->slots[i];
21907 #ifdef DEBUG
21908 {
21909 int j;
21910 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
21911- for(j=0; j<4; j++)
21912+ for (j = 0; j < 4; j++)
21913 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
21914 DBG("\n");
21915 }
21916 #endif
21917 busmap[e->bus] = 1;
21918 }
21919- for(i = 1; i < 256; i++) {
21920+ for (i = 1; i < 256; i++) {
21921 int node;
21922 if (!busmap[i] || pci_find_bus(0, i))
21923 continue;
21924@@ -187,7 +191,8 @@ static unsigned int read_config_nybble(s
21925 return (nr & 1) ? (x >> 4) : (x & 0xf);
21926 }
21927
21928-static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
21929+static void write_config_nybble(struct pci_dev *router, unsigned offset,
21930+ unsigned nr, unsigned int val)
21931 {
21932 u8 x;
21933 unsigned reg = offset + (nr >> 1);
21934@@ -289,7 +294,7 @@ static int pirq_ite_get(struct pci_dev *
21935 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
21936
21937 WARN_ON_ONCE(pirq > 4);
21938- return read_config_nybble(router,0x43, pirqmap[pirq-1]);
21939+ return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
21940 }
21941
21942 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21943@@ -318,7 +323,7 @@ static int pirq_opti_set(struct pci_dev
21944
21945 /*
21946 * Cyrix: nibble offset 0x5C
21947- * 0x5C bits 7:4 is INTB bits 3:0 is INTA
21948+ * 0x5C bits 7:4 is INTB bits 3:0 is INTA
21949 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
21950 */
21951 static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
21952@@ -354,7 +359,7 @@ static int pirq_cyrix_set(struct pci_dev
21953 * Apparently there are systems implementing PCI routing table using
21954 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
21955 * We try our best to handle both link mappings.
21956- *
21957+ *
21958 * Currently (2003-05-21) it appears most SiS chipsets follow the
21959 * definition of routing registers from the SiS-5595 southbridge.
21960 * According to the SiS 5595 datasheets the revision id's of the
21961@@ -374,7 +379,7 @@ static int pirq_cyrix_set(struct pci_dev
21962 *
21963 * 0x62: USBIRQ:
21964 * bit 6 OHCI function disabled (0), enabled (1)
21965- *
21966+ *
21967 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
21968 *
21969 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
21970@@ -437,7 +442,7 @@ static int pirq_vlsi_get(struct pci_dev
21971 {
21972 WARN_ON_ONCE(pirq >= 9);
21973 if (pirq > 8) {
21974- printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
21975+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
21976 return 0;
21977 }
21978 return read_config_nybble(router, 0x74, pirq-1);
21979@@ -447,7 +452,7 @@ static int pirq_vlsi_set(struct pci_dev
21980 {
21981 WARN_ON_ONCE(pirq >= 9);
21982 if (pirq > 8) {
21983- printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
21984+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
21985 return 0;
21986 }
21987 write_config_nybble(router, 0x74, pirq-1, irq);
21988@@ -471,7 +476,8 @@ static int pirq_serverworks_get(struct p
21989 return inb(0xc01) & 0xf;
21990 }
21991
21992-static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21993+static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
21994+ int pirq, int irq)
21995 {
21996 outb(pirq, 0xc00);
21997 outb(irq, 0xc01);
21998@@ -491,22 +497,20 @@ static int pirq_amd756_get(struct pci_de
21999 u8 irq;
22000 irq = 0;
22001 if (pirq <= 4)
22002- {
22003 irq = read_config_nybble(router, 0x56, pirq - 1);
22004- }
22005- printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
22006- dev->vendor, dev->device, pirq, irq);
22007+ dev_info(&dev->dev,
22008+ "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
22009+ dev->vendor, dev->device, pirq, irq);
22010 return irq;
22011 }
22012
22013 static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22014 {
22015- printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
22016- dev->vendor, dev->device, pirq, irq);
22017+ dev_info(&dev->dev,
22018+ "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
22019+ dev->vendor, dev->device, pirq, irq);
22020 if (pirq <= 4)
22021- {
22022 write_config_nybble(router, 0x56, pirq - 1, irq);
22023- }
22024 return 1;
22025 }
22026
22027@@ -553,50 +557,51 @@ static __init int intel_router_probe(str
22028 if (pci_dev_present(pirq_440gx))
22029 return 0;
22030
22031- switch(device)
22032- {
22033- case PCI_DEVICE_ID_INTEL_82371FB_0:
22034- case PCI_DEVICE_ID_INTEL_82371SB_0:
22035- case PCI_DEVICE_ID_INTEL_82371AB_0:
22036- case PCI_DEVICE_ID_INTEL_82371MX:
22037- case PCI_DEVICE_ID_INTEL_82443MX_0:
22038- case PCI_DEVICE_ID_INTEL_82801AA_0:
22039- case PCI_DEVICE_ID_INTEL_82801AB_0:
22040- case PCI_DEVICE_ID_INTEL_82801BA_0:
22041- case PCI_DEVICE_ID_INTEL_82801BA_10:
22042- case PCI_DEVICE_ID_INTEL_82801CA_0:
22043- case PCI_DEVICE_ID_INTEL_82801CA_12:
22044- case PCI_DEVICE_ID_INTEL_82801DB_0:
22045- case PCI_DEVICE_ID_INTEL_82801E_0:
22046- case PCI_DEVICE_ID_INTEL_82801EB_0:
22047- case PCI_DEVICE_ID_INTEL_ESB_1:
22048- case PCI_DEVICE_ID_INTEL_ICH6_0:
22049- case PCI_DEVICE_ID_INTEL_ICH6_1:
22050- case PCI_DEVICE_ID_INTEL_ICH7_0:
22051- case PCI_DEVICE_ID_INTEL_ICH7_1:
22052- case PCI_DEVICE_ID_INTEL_ICH7_30:
22053- case PCI_DEVICE_ID_INTEL_ICH7_31:
22054- case PCI_DEVICE_ID_INTEL_ESB2_0:
22055- case PCI_DEVICE_ID_INTEL_ICH8_0:
22056- case PCI_DEVICE_ID_INTEL_ICH8_1:
22057- case PCI_DEVICE_ID_INTEL_ICH8_2:
22058- case PCI_DEVICE_ID_INTEL_ICH8_3:
22059- case PCI_DEVICE_ID_INTEL_ICH8_4:
22060- case PCI_DEVICE_ID_INTEL_ICH9_0:
22061- case PCI_DEVICE_ID_INTEL_ICH9_1:
22062- case PCI_DEVICE_ID_INTEL_ICH9_2:
22063- case PCI_DEVICE_ID_INTEL_ICH9_3:
22064- case PCI_DEVICE_ID_INTEL_ICH9_4:
22065- case PCI_DEVICE_ID_INTEL_ICH9_5:
22066- case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22067- case PCI_DEVICE_ID_INTEL_ICH10_0:
22068- case PCI_DEVICE_ID_INTEL_ICH10_1:
22069- case PCI_DEVICE_ID_INTEL_ICH10_2:
22070- case PCI_DEVICE_ID_INTEL_ICH10_3:
22071- r->name = "PIIX/ICH";
22072- r->get = pirq_piix_get;
22073- r->set = pirq_piix_set;
22074- return 1;
22075+ switch (device) {
22076+ case PCI_DEVICE_ID_INTEL_82371FB_0:
22077+ case PCI_DEVICE_ID_INTEL_82371SB_0:
22078+ case PCI_DEVICE_ID_INTEL_82371AB_0:
22079+ case PCI_DEVICE_ID_INTEL_82371MX:
22080+ case PCI_DEVICE_ID_INTEL_82443MX_0:
22081+ case PCI_DEVICE_ID_INTEL_82801AA_0:
22082+ case PCI_DEVICE_ID_INTEL_82801AB_0:
22083+ case PCI_DEVICE_ID_INTEL_82801BA_0:
22084+ case PCI_DEVICE_ID_INTEL_82801BA_10:
22085+ case PCI_DEVICE_ID_INTEL_82801CA_0:
22086+ case PCI_DEVICE_ID_INTEL_82801CA_12:
22087+ case PCI_DEVICE_ID_INTEL_82801DB_0:
22088+ case PCI_DEVICE_ID_INTEL_82801E_0:
22089+ case PCI_DEVICE_ID_INTEL_82801EB_0:
22090+ case PCI_DEVICE_ID_INTEL_ESB_1:
22091+ case PCI_DEVICE_ID_INTEL_ICH6_0:
22092+ case PCI_DEVICE_ID_INTEL_ICH6_1:
22093+ case PCI_DEVICE_ID_INTEL_ICH7_0:
22094+ case PCI_DEVICE_ID_INTEL_ICH7_1:
22095+ case PCI_DEVICE_ID_INTEL_ICH7_30:
22096+ case PCI_DEVICE_ID_INTEL_ICH7_31:
22097+ case PCI_DEVICE_ID_INTEL_ESB2_0:
22098+ case PCI_DEVICE_ID_INTEL_ICH8_0:
22099+ case PCI_DEVICE_ID_INTEL_ICH8_1:
22100+ case PCI_DEVICE_ID_INTEL_ICH8_2:
22101+ case PCI_DEVICE_ID_INTEL_ICH8_3:
22102+ case PCI_DEVICE_ID_INTEL_ICH8_4:
22103+ case PCI_DEVICE_ID_INTEL_ICH9_0:
22104+ case PCI_DEVICE_ID_INTEL_ICH9_1:
22105+ case PCI_DEVICE_ID_INTEL_ICH9_2:
22106+ case PCI_DEVICE_ID_INTEL_ICH9_3:
22107+ case PCI_DEVICE_ID_INTEL_ICH9_4:
22108+ case PCI_DEVICE_ID_INTEL_ICH9_5:
22109+ case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22110+ case PCI_DEVICE_ID_INTEL_ICH10_0:
22111+ case PCI_DEVICE_ID_INTEL_ICH10_1:
22112+ case PCI_DEVICE_ID_INTEL_ICH10_2:
22113+ case PCI_DEVICE_ID_INTEL_ICH10_3:
22114+ case PCI_DEVICE_ID_INTEL_PCH_0:
22115+ case PCI_DEVICE_ID_INTEL_PCH_1:
22116+ r->name = "PIIX/ICH";
22117+ r->get = pirq_piix_get;
22118+ r->set = pirq_piix_set;
22119+ return 1;
22120 }
22121 return 0;
22122 }
22123@@ -610,7 +615,7 @@ static __init int via_router_probe(struc
22124 * workarounds for some buggy BIOSes
22125 */
22126 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
22127- switch(router->device) {
22128+ switch (router->device) {
22129 case PCI_DEVICE_ID_VIA_82C686:
22130 /*
22131 * Asus k7m bios wrongly reports 82C686A
22132@@ -635,7 +640,7 @@ static __init int via_router_probe(struc
22133 }
22134 }
22135
22136- switch(device) {
22137+ switch (device) {
22138 case PCI_DEVICE_ID_VIA_82C586_0:
22139 r->name = "VIA";
22140 r->get = pirq_via586_get;
22141@@ -658,28 +663,27 @@ static __init int via_router_probe(struc
22142
22143 static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22144 {
22145- switch(device)
22146- {
22147- case PCI_DEVICE_ID_VLSI_82C534:
22148- r->name = "VLSI 82C534";
22149- r->get = pirq_vlsi_get;
22150- r->set = pirq_vlsi_set;
22151- return 1;
22152+ switch (device) {
22153+ case PCI_DEVICE_ID_VLSI_82C534:
22154+ r->name = "VLSI 82C534";
22155+ r->get = pirq_vlsi_get;
22156+ r->set = pirq_vlsi_set;
22157+ return 1;
22158 }
22159 return 0;
22160 }
22161
22162
22163-static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22164+static __init int serverworks_router_probe(struct irq_router *r,
22165+ struct pci_dev *router, u16 device)
22166 {
22167- switch(device)
22168- {
22169- case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22170- case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22171- r->name = "ServerWorks";
22172- r->get = pirq_serverworks_get;
22173- r->set = pirq_serverworks_set;
22174- return 1;
22175+ switch (device) {
22176+ case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22177+ case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22178+ r->name = "ServerWorks";
22179+ r->get = pirq_serverworks_get;
22180+ r->set = pirq_serverworks_set;
22181+ return 1;
22182 }
22183 return 0;
22184 }
22185@@ -688,7 +692,7 @@ static __init int sis_router_probe(struc
22186 {
22187 if (device != PCI_DEVICE_ID_SI_503)
22188 return 0;
22189-
22190+
22191 r->name = "SIS";
22192 r->get = pirq_sis_get;
22193 r->set = pirq_sis_set;
22194@@ -697,50 +701,45 @@ static __init int sis_router_probe(struc
22195
22196 static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22197 {
22198- switch(device)
22199- {
22200- case PCI_DEVICE_ID_CYRIX_5520:
22201- r->name = "NatSemi";
22202- r->get = pirq_cyrix_get;
22203- r->set = pirq_cyrix_set;
22204- return 1;
22205+ switch (device) {
22206+ case PCI_DEVICE_ID_CYRIX_5520:
22207+ r->name = "NatSemi";
22208+ r->get = pirq_cyrix_get;
22209+ r->set = pirq_cyrix_set;
22210+ return 1;
22211 }
22212 return 0;
22213 }
22214
22215 static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22216 {
22217- switch(device)
22218- {
22219- case PCI_DEVICE_ID_OPTI_82C700:
22220- r->name = "OPTI";
22221- r->get = pirq_opti_get;
22222- r->set = pirq_opti_set;
22223- return 1;
22224+ switch (device) {
22225+ case PCI_DEVICE_ID_OPTI_82C700:
22226+ r->name = "OPTI";
22227+ r->get = pirq_opti_get;
22228+ r->set = pirq_opti_set;
22229+ return 1;
22230 }
22231 return 0;
22232 }
22233
22234 static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22235 {
22236- switch(device)
22237- {
22238- case PCI_DEVICE_ID_ITE_IT8330G_0:
22239- r->name = "ITE";
22240- r->get = pirq_ite_get;
22241- r->set = pirq_ite_set;
22242- return 1;
22243+ switch (device) {
22244+ case PCI_DEVICE_ID_ITE_IT8330G_0:
22245+ r->name = "ITE";
22246+ r->get = pirq_ite_get;
22247+ r->set = pirq_ite_set;
22248+ return 1;
22249 }
22250 return 0;
22251 }
22252
22253 static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22254 {
22255- switch(device)
22256- {
22257+ switch (device) {
22258 case PCI_DEVICE_ID_AL_M1533:
22259 case PCI_DEVICE_ID_AL_M1563:
22260- printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
22261 r->name = "ALI";
22262 r->get = pirq_ali_get;
22263 r->set = pirq_ali_set;
22264@@ -751,25 +750,24 @@ static __init int ali_router_probe(struc
22265
22266 static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22267 {
22268- switch(device)
22269- {
22270- case PCI_DEVICE_ID_AMD_VIPER_740B:
22271- r->name = "AMD756";
22272- break;
22273- case PCI_DEVICE_ID_AMD_VIPER_7413:
22274- r->name = "AMD766";
22275- break;
22276- case PCI_DEVICE_ID_AMD_VIPER_7443:
22277- r->name = "AMD768";
22278- break;
22279- default:
22280- return 0;
22281+ switch (device) {
22282+ case PCI_DEVICE_ID_AMD_VIPER_740B:
22283+ r->name = "AMD756";
22284+ break;
22285+ case PCI_DEVICE_ID_AMD_VIPER_7413:
22286+ r->name = "AMD766";
22287+ break;
22288+ case PCI_DEVICE_ID_AMD_VIPER_7443:
22289+ r->name = "AMD768";
22290+ break;
22291+ default:
22292+ return 0;
22293 }
22294 r->get = pirq_amd756_get;
22295 r->set = pirq_amd756_set;
22296 return 1;
22297 }
22298-
22299+
22300 static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22301 {
22302 switch (device) {
22303@@ -811,7 +809,7 @@ static struct pci_dev *pirq_router_dev;
22304 * FIXME: should we have an option to say "generic for
22305 * chipset" ?
22306 */
22307-
22308+
22309 static void __init pirq_find_router(struct irq_router *r)
22310 {
22311 struct irq_routing_table *rt = pirq_table;
22312@@ -830,7 +828,7 @@ static void __init pirq_find_router(stru
22313 r->name = "default";
22314 r->get = NULL;
22315 r->set = NULL;
22316-
22317+
22318 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
22319 rt->rtr_vendor, rt->rtr_device);
22320
22321@@ -841,19 +839,19 @@ static void __init pirq_find_router(stru
22322 return;
22323 }
22324
22325- for( h = pirq_routers; h->vendor; h++) {
22326+ for (h = pirq_routers; h->vendor; h++) {
22327 /* First look for a router match */
22328- if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
22329+ if (rt->rtr_vendor == h->vendor &&
22330+ h->probe(r, pirq_router_dev, rt->rtr_device))
22331 break;
22332 /* Fall back to a device match */
22333- if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
22334+ if (pirq_router_dev->vendor == h->vendor &&
22335+ h->probe(r, pirq_router_dev, pirq_router_dev->device))
22336 break;
22337 }
22338- printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
22339- pirq_router.name,
22340- pirq_router_dev->vendor,
22341- pirq_router_dev->device,
22342- pci_name(pirq_router_dev));
22343+ dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
22344+ pirq_router.name,
22345+ pirq_router_dev->vendor, pirq_router_dev->device);
22346
22347 /* The device remains referenced for the kernel lifetime */
22348 }
22349@@ -861,11 +859,13 @@ static void __init pirq_find_router(stru
22350 static struct irq_info *pirq_get_info(struct pci_dev *dev)
22351 {
22352 struct irq_routing_table *rt = pirq_table;
22353- int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
22354+ int entries = (rt->size - sizeof(struct irq_routing_table)) /
22355+ sizeof(struct irq_info);
22356 struct irq_info *info;
22357
22358 for (info = rt->slots; entries--; info++)
22359- if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22360+ if (info->bus == dev->bus->number &&
22361+ PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22362 return info;
22363 return NULL;
22364 }
22365@@ -884,7 +884,7 @@ static int pcibios_lookup_irq(struct pci
22366 /* Find IRQ pin */
22367 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
22368 if (!pin) {
22369- DBG(KERN_DEBUG " -> no interrupt pin\n");
22370+ dev_dbg(&dev->dev, "no interrupt pin\n");
22371 return 0;
22372 }
22373 pin = pin - 1;
22374@@ -893,20 +893,21 @@ static int pcibios_lookup_irq(struct pci
22375
22376 if (!pirq_table)
22377 return 0;
22378-
22379- DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
22380+
22381 info = pirq_get_info(dev);
22382 if (!info) {
22383- DBG(" -> not found in routing table\n" KERN_DEBUG);
22384+ dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
22385+ 'A' + pin);
22386 return 0;
22387 }
22388 pirq = info->irq[pin].link;
22389 mask = info->irq[pin].bitmap;
22390 if (!pirq) {
22391- DBG(" -> not routed\n" KERN_DEBUG);
22392+ dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
22393 return 0;
22394 }
22395- DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
22396+ dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
22397+ 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
22398 mask &= pcibios_irq_mask;
22399
22400 /* Work around broken HP Pavilion Notebooks which assign USB to
22401@@ -919,7 +920,8 @@ static int pcibios_lookup_irq(struct pci
22402 }
22403
22404 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
22405- if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
22406+ if (acer_tm360_irqrouting && dev->irq == 11 &&
22407+ dev->vendor == PCI_VENDOR_ID_O2) {
22408 pirq = 0x68;
22409 mask = 0x400;
22410 dev->irq = r->get(pirq_router_dev, dev, pirq);
22411@@ -932,51 +934,50 @@ static int pcibios_lookup_irq(struct pci
22412 */
22413 newirq = dev->irq;
22414 if (newirq && !((1 << newirq) & mask)) {
22415- if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
22416- else printk("\n" KERN_WARNING
22417- "PCI: IRQ %i for device %s doesn't match PIRQ mask "
22418- "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
22419- pci_name(dev));
22420+ if (pci_probe & PCI_USE_PIRQ_MASK)
22421+ newirq = 0;
22422+ else
22423+ dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
22424+ "%#x; try pci=usepirqmask\n", newirq, mask);
22425 }
22426 if (!newirq && assign) {
22427 for (i = 0; i < 16; i++) {
22428 if (!(mask & (1 << i)))
22429 continue;
22430- if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
22431+ if (pirq_penalty[i] < pirq_penalty[newirq] &&
22432+ can_request_irq(i, IRQF_SHARED))
22433 newirq = i;
22434 }
22435 }
22436- DBG(" -> newirq=%d", newirq);
22437+ dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
22438
22439 /* Check if it is hardcoded */
22440 if ((pirq & 0xf0) == 0xf0) {
22441 irq = pirq & 0xf;
22442- DBG(" -> hardcoded IRQ %d\n", irq);
22443- msg = "Hardcoded";
22444- } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22445- ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
22446- DBG(" -> got IRQ %d\n", irq);
22447- msg = "Found";
22448+ msg = "hardcoded";
22449+ } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22450+ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
22451+ msg = "found";
22452 eisa_set_level_irq(irq);
22453- } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22454- DBG(" -> assigning IRQ %d", newirq);
22455+ } else if (newirq && r->set &&
22456+ (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22457 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
22458 eisa_set_level_irq(newirq);
22459- DBG(" ... OK\n");
22460- msg = "Assigned";
22461+ msg = "assigned";
22462 irq = newirq;
22463 }
22464 }
22465
22466 if (!irq) {
22467- DBG(" ... failed\n");
22468 if (newirq && mask == (1 << newirq)) {
22469- msg = "Guessed";
22470+ msg = "guessed";
22471 irq = newirq;
22472- } else
22473+ } else {
22474+ dev_dbg(&dev->dev, "can't route interrupt\n");
22475 return 0;
22476+ }
22477 }
22478- printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
22479+ dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
22480
22481 /* Update IRQ for all devices with the same pirq value */
22482 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
22483@@ -988,20 +989,25 @@ static int pcibios_lookup_irq(struct pci
22484 if (!info)
22485 continue;
22486 if (info->irq[pin].link == pirq) {
22487- /* We refuse to override the dev->irq information. Give a warning! */
22488- if ( dev2->irq && dev2->irq != irq && \
22489+ /*
22490+ * We refuse to override the dev->irq
22491+ * information. Give a warning!
22492+ */
22493+ if (dev2->irq && dev2->irq != irq && \
22494 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
22495- ((1 << dev2->irq) & mask)) ) {
22496+ ((1 << dev2->irq) & mask))) {
22497 #ifndef CONFIG_PCI_MSI
22498- printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
22499- pci_name(dev2), dev2->irq, irq);
22500+ dev_info(&dev2->dev, "IRQ routing conflict: "
22501+ "have IRQ %d, want IRQ %d\n",
22502+ dev2->irq, irq);
22503 #endif
22504- continue;
22505- }
22506+ continue;
22507+ }
22508 dev2->irq = irq;
22509 pirq_penalty[irq]++;
22510 if (dev != dev2)
22511- printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
22512+ dev_info(&dev->dev, "sharing IRQ %d with %s\n",
22513+ irq, pci_name(dev2));
22514 }
22515 }
22516 return 1;
22517@@ -1015,15 +1021,20 @@ static void __init pcibios_fixup_irqs(vo
22518 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
22519 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
22520 /*
22521- * If the BIOS has set an out of range IRQ number, just ignore it.
22522- * Also keep track of which IRQ's are already in use.
22523+ * If the BIOS has set an out of range IRQ number, just
22524+ * ignore it. Also keep track of which IRQ's are
22525+ * already in use.
22526 */
22527 if (dev->irq >= 16) {
22528- DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
22529+ dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
22530 dev->irq = 0;
22531 }
22532- /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
22533- if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
22534+ /*
22535+ * If the IRQ is already assigned to a PCI device,
22536+ * ignore its ISA use penalty
22537+ */
22538+ if (pirq_penalty[dev->irq] >= 100 &&
22539+ pirq_penalty[dev->irq] < 100000)
22540 pirq_penalty[dev->irq] = 0;
22541 pirq_penalty[dev->irq]++;
22542 }
22543@@ -1035,13 +1046,17 @@ static void __init pcibios_fixup_irqs(vo
22544 /*
22545 * Recalculate IRQ numbers if we use the I/O APIC.
22546 */
22547- if (io_apic_assign_pci_irqs)
22548- {
22549+ if (io_apic_assign_pci_irqs) {
22550 int irq;
22551
22552 if (pin) {
22553- pin--; /* interrupt pins are numbered starting from 1 */
22554- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
22555+ /*
22556+ * interrupt pins are numbered starting
22557+ * from 1
22558+ */
22559+ pin--;
22560+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
22561+ PCI_SLOT(dev->devfn), pin);
22562 /*
22563 * Busses behind bridges are typically not listed in the MP-table.
22564 * In this case we have to look up the IRQ based on the parent bus,
22565@@ -1049,18 +1064,18 @@ static void __init pcibios_fixup_irqs(vo
22566 * busses itself so we should get into this branch reliably.
22567 */
22568 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22569- struct pci_dev * bridge = dev->bus->self;
22570+ struct pci_dev *bridge = dev->bus->self;
22571
22572 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22573- irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22574+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22575 PCI_SLOT(bridge->devfn), pin);
22576 if (irq >= 0)
22577- printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22578- pci_name(bridge), 'A' + pin, irq);
22579+ dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
22580+ pci_name(bridge),
22581+ 'A' + pin, irq);
22582 }
22583 if (irq >= 0) {
22584- printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22585- pci_name(dev), 'A' + pin, irq);
22586+ dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
22587 dev->irq = irq;
22588 }
22589 }
22590@@ -1082,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq
22591 {
22592 if (!broken_hp_bios_irq9) {
22593 broken_hp_bios_irq9 = 1;
22594- printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22595+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22596+ d->ident);
22597 }
22598 return 0;
22599 }
22600@@ -1095,7 +1111,8 @@ static int __init fix_acer_tm360_irqrout
22601 {
22602 if (!acer_tm360_irqrouting) {
22603 acer_tm360_irqrouting = 1;
22604- printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22605+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22606+ d->ident);
22607 }
22608 return 0;
22609 }
22610@@ -1107,7 +1124,8 @@ static struct dmi_system_id __initdata p
22611 .matches = {
22612 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
22613 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
22614- DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
22615+ DMI_MATCH(DMI_PRODUCT_VERSION,
22616+ "HP Pavilion Notebook Model GE"),
22617 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
22618 },
22619 },
22620@@ -1122,7 +1140,7 @@ static struct dmi_system_id __initdata p
22621 { }
22622 };
22623
22624-static int __init pcibios_irq_init(void)
22625+int __init pcibios_irq_init(void)
22626 {
22627 DBG(KERN_DEBUG "PCI: IRQ init\n");
22628
22629@@ -1142,11 +1160,14 @@ static int __init pcibios_irq_init(void)
22630 pirq_find_router(&pirq_router);
22631 if (pirq_table->exclusive_irqs) {
22632 int i;
22633- for (i=0; i<16; i++)
22634+ for (i = 0; i < 16; i++)
22635 if (!(pirq_table->exclusive_irqs & (1 << i)))
22636 pirq_penalty[i] += 100;
22637 }
22638- /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
22639+ /*
22640+ * If we're using the I/O APIC, avoid using the PCI IRQ
22641+ * routing table
22642+ */
22643 if (io_apic_assign_pci_irqs)
22644 pirq_table = NULL;
22645 }
22646@@ -1157,9 +1178,6 @@ static int __init pcibios_irq_init(void)
22647 return 0;
22648 }
22649
22650-subsys_initcall(pcibios_irq_init);
22651-
22652-
22653 static void pirq_penalize_isa_irq(int irq, int active)
22654 {
22655 /*
22656@@ -1193,7 +1211,7 @@ static int pirq_enable_irq(struct pci_de
22657 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
22658 char *msg = "";
22659
22660- pin--; /* interrupt pins are numbered starting from 1 */
22661+ pin--; /* interrupt pins are numbered starting from 1 */
22662
22663 if (io_apic_assign_pci_irqs) {
22664 int irq;
22665@@ -1207,35 +1225,41 @@ static int pirq_enable_irq(struct pci_de
22666 */
22667 temp_dev = dev;
22668 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22669- struct pci_dev * bridge = dev->bus->self;
22670+ struct pci_dev *bridge = dev->bus->self;
22671
22672 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22673- irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22674+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22675 PCI_SLOT(bridge->devfn), pin);
22676 if (irq >= 0)
22677- printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22678- pci_name(bridge), 'A' + pin, irq);
22679+ dev_warn(&dev->dev, "using bridge %s "
22680+ "INT %c to get IRQ %d\n",
22681+ pci_name(bridge), 'A' + pin,
22682+ irq);
22683 dev = bridge;
22684 }
22685 dev = temp_dev;
22686 if (irq >= 0) {
22687- printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22688- pci_name(dev), 'A' + pin, irq);
22689+ dev_info(&dev->dev, "PCI->APIC IRQ transform: "
22690+ "INT %c -> IRQ %d\n", 'A' + pin, irq);
22691 dev->irq = irq;
22692 return 0;
22693 } else
22694- msg = " Probably buggy MP table.";
22695+ msg = "; probably buggy MP table";
22696 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
22697 msg = "";
22698 else
22699- msg = " Please try using pci=biosirq.";
22700+ msg = "; please try using pci=biosirq";
22701
22702- /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
22703- if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
22704+ /*
22705+ * With IDE legacy devices the IRQ lookup failure is not
22706+ * a problem..
22707+ */
22708+ if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
22709+ !(dev->class & 0x5))
22710 return 0;
22711
22712- printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
22713- 'A' + pin, pci_name(dev), msg);
22714+ dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
22715+ 'A' + pin, msg);
22716 }
22717 return 0;
22718 }
22719Index: head-2008-12-01/arch/x86/vdso/Makefile
22720===================================================================
22721--- head-2008-12-01.orig/arch/x86/vdso/Makefile 2008-12-01 11:37:10.000000000 +0100
22722+++ head-2008-12-01/arch/x86/vdso/Makefile 2008-12-01 11:49:07.000000000 +0100
22723@@ -65,9 +65,7 @@ obj-$(VDSO32-y) += vdso32-syms.lds
22724 vdso32.so-$(VDSO32-y) += int80
22725 vdso32.so-$(CONFIG_COMPAT) += syscall
22726 vdso32.so-$(VDSO32-y) += sysenter
22727-xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
22728-xen-vdso32-$(CONFIG_X86_32) += syscall
22729-vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
22730+vdso32.so-$(CONFIG_X86_XEN) += syscall
22731
22732 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
22733
22734Index: head-2008-12-01/arch/x86/vdso/vdso32.S
22735===================================================================
22736--- head-2008-12-01.orig/arch/x86/vdso/vdso32.S 2008-12-01 11:37:10.000000000 +0100
22737+++ head-2008-12-01/arch/x86/vdso/vdso32.S 2008-12-01 11:49:07.000000000 +0100
22738@@ -9,7 +9,7 @@ vdso32_int80_end:
22739
22740 .globl vdso32_syscall_start, vdso32_syscall_end
22741 vdso32_syscall_start:
22742-#ifdef CONFIG_COMPAT
22743+#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
22744 .incbin "arch/x86/vdso/vdso32-syscall.so"
22745 #endif
22746 vdso32_syscall_end:
22747@@ -19,16 +19,4 @@ vdso32_sysenter_start:
22748 .incbin "arch/x86/vdso/vdso32-sysenter.so"
22749 vdso32_sysenter_end:
22750
22751-#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
22752- .globl vdso32_int80_start, vdso32_int80_end
22753-vdso32_int80_start:
22754- .incbin "arch/x86/vdso/vdso32-int80.so"
22755-vdso32_int80_end:
22756-#elif defined(CONFIG_X86_XEN)
22757- .globl vdso32_syscall_start, vdso32_syscall_end
22758-vdso32_syscall_start:
22759- .incbin "arch/x86/vdso/vdso32-syscall.so"
22760-vdso32_syscall_end:
22761-#endif
22762-
22763 __FINIT
22764Index: head-2008-12-01/arch/x86/vdso/vdso32-setup-xen.c
22765===================================================================
22766--- head-2008-12-01.orig/arch/x86/vdso/vdso32-setup-xen.c 2008-12-01 11:44:55.000000000 +0100
22767+++ head-2008-12-01/arch/x86/vdso/vdso32-setup-xen.c 2008-12-01 11:49:07.000000000 +0100
22768@@ -195,50 +195,28 @@ static __init void relocate_vdso(Elf32_E
22769 }
22770 }
22771
22772-/*
22773- * These symbols are defined by vdso32.S to mark the bounds
22774- * of the ELF DSO images included therein.
22775- */
22776-extern const char vdso32_default_start, vdso32_default_end;
22777-extern const char vdso32_sysenter_start, vdso32_sysenter_end;
22778 static struct page *vdso32_pages[1];
22779
22780 #ifdef CONFIG_X86_64
22781
22782-#if CONFIG_XEN_COMPAT < 0x030200
22783-static int use_int80 = 1;
22784-#endif
22785-static int use_sysenter __read_mostly = -1;
22786-
22787-#define vdso32_sysenter() (use_sysenter > 0)
22788+#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
22789+#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
22790
22791-/* May not be __init: called during resume */
22792-void syscall32_cpu_init(void)
22793+void __cpuinit syscall32_cpu_init(void)
22794 {
22795- static const struct callback_register cstar = {
22796+ static /*const*/ struct callback_register __cpuinitdata cstar = {
22797 .type = CALLBACKTYPE_syscall32,
22798 .address = (unsigned long)ia32_cstar_target
22799 };
22800- static const struct callback_register sysenter = {
22801+ static /*const*/ struct callback_register __cpuinitdata sysenter = {
22802 .type = CALLBACKTYPE_sysenter,
22803 .address = (unsigned long)ia32_sysenter_target
22804 };
22805
22806- if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
22807- (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
22808-#if CONFIG_XEN_COMPAT < 0x030200
22809- return;
22810- use_int80 = 0;
22811-#else
22812- BUG();
22813-#endif
22814-
22815- if (use_sysenter < 0) {
22816- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
22817- use_sysenter = 1;
22818- if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
22819- use_sysenter = 1;
22820- }
22821+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
22822+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
22823+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)
22824+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
22825 }
22826
22827 #define compat_uses_vma 1
22828@@ -250,6 +228,7 @@ static inline void map_compat_vdso(int m
22829 #else /* CONFIG_X86_32 */
22830
22831 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
22832+#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
22833
22834 extern asmlinkage void ia32pv_cstar_target(void);
22835 static /*const*/ struct callback_register __cpuinitdata cstar = {
22836@@ -265,13 +244,13 @@ void __cpuinit enable_sep_cpu(void)
22837 .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
22838 };
22839
22840- if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
22841+ if (vdso32_syscall()) {
22842 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
22843 BUG();
22844 return;
22845 }
22846
22847- if (!boot_cpu_has(X86_FEATURE_SEP))
22848+ if (!vdso32_sysenter())
22849 return;
22850
22851 if (xen_feature(XENFEAT_supervisor_mode_kernel))
22852@@ -341,34 +320,26 @@ int __init sysenter_setup(void)
22853
22854 #ifdef CONFIG_X86_32
22855 gate_vma_init();
22856-#endif
22857
22858-#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
22859- if (use_int80) {
22860- extern const char vdso32_int80_start, vdso32_int80_end;
22861-
22862- vsyscall = &vdso32_int80_start;
22863- vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
22864- } else
22865-#elif defined(CONFIG_X86_32)
22866- if (boot_cpu_has(X86_FEATURE_SYSCALL)
22867- && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
22868- || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
22869- setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
22870- barrier(); /* until clear_bit()'s constraints are correct ... */
22871 if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
22872- extern const char vdso32_syscall_start, vdso32_syscall_end;
22873-
22874+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
22875+ && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
22876+ setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
22877+ else {
22878+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
22879+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
22880+ }
22881+ }
22882+#endif
22883+ if (vdso32_syscall()) {
22884 vsyscall = &vdso32_syscall_start;
22885 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
22886- } else
22887-#endif
22888- if (!vdso32_sysenter()) {
22889- vsyscall = &vdso32_default_start;
22890- vsyscall_len = &vdso32_default_end - &vdso32_default_start;
22891- } else {
22892+ } else if (vdso32_sysenter()){
22893 vsyscall = &vdso32_sysenter_start;
22894 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
22895+ } else {
22896+ vsyscall = &vdso32_int80_start;
22897+ vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
22898 }
22899
22900 memcpy(syscall_page, vsyscall, vsyscall_len);
22901Index: head-2008-12-01/arch/x86/xen/Kconfig
22902===================================================================
22903--- head-2008-12-01.orig/arch/x86/xen/Kconfig 2008-12-01 11:36:47.000000000 +0100
22904+++ head-2008-12-01/arch/x86/xen/Kconfig 2008-12-01 11:49:07.000000000 +0100
22905@@ -17,7 +17,7 @@ config XEN_MAX_DOMAIN_MEMORY
22906 int "Maximum allowed size of a domain in gigabytes"
22907 default 8 if X86_32
22908 default 32 if X86_64
22909- depends on XEN
22910+ depends on PARAVIRT_XEN
22911 help
22912 The pseudo-physical to machine address array is sized
22913 according to the maximum possible memory size of a Xen
22914@@ -26,5 +26,5 @@ config XEN_MAX_DOMAIN_MEMORY
22915
22916 config XEN_SAVE_RESTORE
22917 bool
22918- depends on PM
22919+ depends on PARAVIRT_XEN && PM
22920 default y
22921\ No newline at end of file
22922Index: head-2008-12-01/drivers/acpi/processor_core.c
22923===================================================================
22924--- head-2008-12-01.orig/drivers/acpi/processor_core.c 2008-12-01 11:44:55.000000000 +0100
22925+++ head-2008-12-01/drivers/acpi/processor_core.c 2008-12-01 11:49:07.000000000 +0100
22926@@ -721,9 +721,11 @@ static int __cpuinit acpi_processor_star
22927 if (result)
22928 goto end;
22929
22930- sysdev = get_cpu_sysdev(pr->id);
22931- if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
22932- return -EFAULT;
22933+ if (pr->id != -1) {
22934+ sysdev = get_cpu_sysdev(pr->id);
22935+ if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
22936+ return -EFAULT;
22937+ }
22938
22939 status = acpi_install_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
22940 acpi_processor_notify, pr);
22941@@ -895,7 +897,8 @@ static int acpi_processor_remove(struct
22942 status = acpi_remove_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
22943 acpi_processor_notify);
22944
22945- sysfs_remove_link(&device->dev.kobj, "sysdev");
22946+ if (pr->id != -1)
22947+ sysfs_remove_link(&device->dev.kobj, "sysdev");
22948
22949 acpi_processor_remove_fs(device);
22950
22951Index: head-2008-12-01/drivers/char/tpm/tpm_vtpm.c
22952===================================================================
22953--- head-2008-12-01.orig/drivers/char/tpm/tpm_vtpm.c 2008-12-03 15:48:43.000000000 +0100
22954+++ head-2008-12-01/drivers/char/tpm/tpm_vtpm.c 2008-12-01 11:49:07.000000000 +0100
22955@@ -347,7 +347,7 @@ static int _vtpm_send_queued(struct tpm_
22956 {
22957 int rc;
22958 int error = 0;
22959- long flags;
22960+ unsigned long flags;
22961 unsigned char buffer[1];
22962 struct vtpm_state *vtpms;
22963 vtpms = (struct vtpm_state *)chip_get_private(chip);
22964Index: head-2008-12-01/drivers/misc/Kconfig
22965===================================================================
22966--- head-2008-12-01.orig/drivers/misc/Kconfig 2008-12-03 15:48:43.000000000 +0100
22967+++ head-2008-12-01/drivers/misc/Kconfig 2008-12-01 11:49:07.000000000 +0100
22968@@ -438,7 +438,7 @@ config ENCLOSURE_SERVICES
22969 config SGI_XP
22970 tristate "Support communication between SGI SSIs"
22971 depends on NET
22972- depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP
22973+ depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP && !XEN
22974 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
22975 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
22976 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
22977@@ -465,7 +465,7 @@ config HP_ILO
22978
22979 config SGI_GRU
22980 tristate "SGI GRU driver"
22981- depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP
22982+ depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP && !XEN
22983 default n
22984 select MMU_NOTIFIER
22985 ---help---
22986Index: head-2008-12-01/drivers/pci/msi-xen.c
22987===================================================================
22988--- head-2008-12-01.orig/drivers/pci/msi-xen.c 2008-12-01 11:44:55.000000000 +0100
22989+++ head-2008-12-01/drivers/pci/msi-xen.c 2008-12-01 11:49:07.000000000 +0100
22990@@ -90,12 +90,10 @@ arch_teardown_msi_irqs(struct pci_dev *d
22991 }
22992 #endif
22993
22994-static void msi_set_enable(struct pci_dev *dev, int enable)
22995+static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
22996 {
22997- int pos;
22998 u16 control;
22999
23000- pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
23001 if (pos) {
23002 pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
23003 control &= ~PCI_MSI_FLAGS_ENABLE;
23004@@ -105,6 +103,11 @@ static void msi_set_enable(struct pci_de
23005 }
23006 }
23007
23008+static void msi_set_enable(struct pci_dev *dev, int enable)
23009+{
23010+ __msi_set_enable(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), enable);
23011+}
23012+
23013 static void msix_set_enable(struct pci_dev *dev, int enable)
23014 {
23015 int pos;
23016@@ -573,9 +576,8 @@ int pci_enable_msi(struct pci_dev* dev)
23017
23018 /* Check whether driver already requested for MSI-X irqs */
23019 if (dev->msix_enabled) {
23020- printk(KERN_INFO "PCI: %s: Can't enable MSI. "
23021- "Device already has MSI-X enabled\n",
23022- pci_name(dev));
23023+ dev_info(&dev->dev, "can't enable MSI "
23024+ "(MSI-X already enabled)\n");
23025 return -EINVAL;
23026 }
23027
23028@@ -707,9 +709,8 @@ int pci_enable_msix(struct pci_dev* dev,
23029 temp = dev->irq;
23030 /* Check whether driver already requested for MSI vector */
23031 if (dev->msi_enabled) {
23032- printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
23033- "Device already has an MSI irq assigned\n",
23034- pci_name(dev));
23035+ dev_info(&dev->dev, "can't enable MSI-X "
23036+ "(MSI IRQ already assigned)\n");
23037 return -EINVAL;
23038 }
23039
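
The msi-xen.c change above splits msi_set_enable() so that the MSI capability offset is looked up once and handed to __msi_set_enable(), and it converts the "already enabled" messages to dev_info(). A small stand-alone sketch of the lookup-once pattern; the config-space array, find_msi_cap() and cfg_read()/cfg_write() are invented stubs, only the flag manipulation mirrors the hunk:

#include <stdio.h>

#define PCI_MSI_FLAGS        2
#define PCI_MSI_FLAGS_ENABLE 0x1

static unsigned short cfg_space[128];          /* pretend PCI config space */

static int find_msi_cap(void)                    { return 0x50; } /* fake offset */
static unsigned short cfg_read(int off)          { return cfg_space[off]; }
static void cfg_write(int off, unsigned short v) { cfg_space[off] = v; }

static void __msi_set_enable(int pos, int enable)
{
        if (pos) {
                unsigned short control = cfg_read(pos + PCI_MSI_FLAGS);

                control &= ~PCI_MSI_FLAGS_ENABLE;
                if (enable)
                        control |= PCI_MSI_FLAGS_ENABLE;
                cfg_write(pos + PCI_MSI_FLAGS, control);
        }
}

static void msi_set_enable(int enable)
{
        __msi_set_enable(find_msi_cap(), enable);  /* capability looked up once */
}

int main(void)
{
        msi_set_enable(1);
        printf("MSI control word: %#x\n", cfg_read(find_msi_cap() + PCI_MSI_FLAGS));
        return 0;
}
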
23040Index: head-2008-12-01/drivers/pci/quirks.c
23041===================================================================
23042--- head-2008-12-01.orig/drivers/pci/quirks.c 2008-12-03 15:48:43.000000000 +0100
23043+++ head-2008-12-01/drivers/pci/quirks.c 2008-12-01 11:49:07.000000000 +0100
23044@@ -42,9 +42,7 @@ static void __devinit quirk_release_reso
23045 /* PCI Host Bridge isn't a target device */
23046 return;
23047 }
23048- printk(KERN_INFO
23049- "PCI: Disable device and release resources [%s].\n",
23050- pci_name(dev));
23051+ dev_info(&dev->dev, "disable device and release resources\n");
23052 pci_disable_device(dev);
23053
23054 for (i=0; i < PCI_NUM_RESOURCES; i++) {
23055Index: head-2008-12-01/drivers/pci/setup-res.c
23056===================================================================
23057--- head-2008-12-01.orig/drivers/pci/setup-res.c 2008-12-03 15:48:43.000000000 +0100
23058+++ head-2008-12-01/drivers/pci/setup-res.c 2008-12-01 11:50:17.000000000 +0100
23059@@ -129,7 +129,7 @@ int pci_claim_resource(struct pci_dev *d
23060 #ifdef CONFIG_PCI_REASSIGN
23061 void pci_disable_bridge_window(struct pci_dev *dev)
23062 {
23063- printk(KERN_DEBUG "PCI: Disable bridge window on %s\n", pci_name(dev));
23064+ dev_dbg(&dev->dev, "disable bridge window\n");
23065
23066 /* MMIO Base/Limit */
23067 pci_write_config_dword(dev, PCI_MEMORY_BASE, 0x0000fff0);
23068@@ -190,8 +190,8 @@ int pci_assign_resource(struct pci_dev *
23069 res->flags &= ~IORESOURCE_STARTALIGN;
23070 if (resno < PCI_BRIDGE_RESOURCES) {
23071 #ifdef CONFIG_PCI_REASSIGN
23072- printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23073- "%016llx - %016llx\n", resno, pci_name(dev),
23074+ dev_dbg(&dev->dev, "assign resource(%d) "
23075+ "%016llx - %016llx\n", resno,
23076 (unsigned long long)res->start,
23077 (unsigned long long)res->end);
23078 #endif
23079@@ -235,8 +235,8 @@ int pci_assign_resource_fixed(struct pci
23080 (unsigned long long)res->end);
23081 } else if (resno < PCI_BRIDGE_RESOURCES) {
23082 #ifdef CONFIG_PCI_REASSIGN
23083- printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23084- "%016llx - %016llx\n", resno, pci_name(dev),
23085+ dev_dbg(&dev->dev, "assign resource(%d) "
23086+ "%016llx - %016llx\n", resno,
23087 (unsigned long long)res->start,
23088 (unsigned long long)res->end);
23089 #endif
23090Index: head-2008-12-01/drivers/xen/Makefile
23091===================================================================
23092--- head-2008-12-01.orig/drivers/xen/Makefile 2008-12-01 11:44:55.000000000 +0100
23093+++ head-2008-12-01/drivers/xen/Makefile 2008-12-01 11:49:07.000000000 +0100
23094@@ -1,4 +1,4 @@
23095-obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
23096+obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o
23097 xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
23098 xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
23099
23100Index: head-2008-12-01/drivers/xen/balloon/sysfs.c
23101===================================================================
23102--- head-2008-12-01.orig/drivers/xen/balloon/sysfs.c 2008-12-01 11:37:10.000000000 +0100
23103+++ head-2008-12-01/drivers/xen/balloon/sysfs.c 2008-12-01 11:49:07.000000000 +0100
23104@@ -45,6 +45,7 @@
23105
23106 #define BALLOON_SHOW(name, format, args...) \
23107 static ssize_t show_##name(struct sys_device *dev, \
23108+ struct sysdev_attribute *attr, \
23109 char *buf) \
23110 { \
23111 return sprintf(buf, format, ##args); \
23112@@ -59,14 +60,15 @@ BALLOON_SHOW(hard_limit_kb,
23113 (bs.hard_limit!=~0UL) ? PAGES2KB(bs.hard_limit) : 0);
23114 BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
23115
23116-static ssize_t show_target_kb(struct sys_device *dev, char *buf)
23117+static ssize_t show_target_kb(struct sys_device *dev,
23118+ struct sysdev_attribute *attr, char *buf)
23119 {
23120 return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
23121 }
23122
23123 static ssize_t store_target_kb(struct sys_device *dev,
23124- const char *buf,
23125- size_t count)
23126+ struct sysdev_attribute *attr,
23127+ const char *buf, size_t count)
23128 {
23129 char memstring[64], *endchar;
23130 unsigned long long target_bytes;
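
The balloon/sysfs.c hunk exists because 2.6.27 extended sysdev attribute callbacks to receive the struct sysdev_attribute pointer as well. The two prototypes below contrast the old and new shapes; the signatures are taken from the hunk, while the surrounding scaffolding is only there so the snippet compiles on its own:

#include <stdio.h>
#include <sys/types.h>

struct sys_device;
struct sysdev_attribute;

/* 2.6.26 shape, as the old balloon show_/store_ helpers used it */
typedef ssize_t (*show_old_t)(struct sys_device *dev, char *buf);

/* 2.6.27 shape: the attribute pointer is now passed through as well */
typedef ssize_t (*show_new_t)(struct sys_device *dev,
                              struct sysdev_attribute *attr, char *buf);

int main(void)
{
        printf("old callback takes 2 arguments, new callback takes 3\n");
        return 0;
}
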
23131Index: head-2008-12-01/drivers/xen/blktap/blktap.c
23132===================================================================
23133--- head-2008-12-01.orig/drivers/xen/blktap/blktap.c 2008-12-01 11:44:55.000000000 +0100
23134+++ head-2008-12-01/drivers/xen/blktap/blktap.c 2008-12-01 11:49:07.000000000 +0100
23135@@ -54,6 +54,7 @@
23136 #include <linux/gfp.h>
23137 #include <linux/poll.h>
23138 #include <linux/delay.h>
23139+#include <linux/nsproxy.h>
23140 #include <asm/tlbflush.h>
23141
23142 #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
23143@@ -483,7 +484,7 @@ found:
23144
23145 if ((class = get_xen_class()) != NULL)
23146 device_create(class, NULL, MKDEV(blktap_major, minor),
23147- "blktap%d", minor);
23148+ NULL, "blktap%d", minor);
23149 }
23150
23151 out:
23152@@ -1686,7 +1687,8 @@ static int __init blkif_init(void)
23153 * We only create the device when a request of a new device is
23154 * made.
23155 */
23156- device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
23157+ device_create(class, NULL, MKDEV(blktap_major, 0), NULL,
23158+ "blktap0");
23159 } else {
23160 /* this is bad, but not fatal */
23161 WPRINTK("blktap: sysfs xen_class not created\n");
23162Index: head-2008-12-01/drivers/xen/char/mem.c
23163===================================================================
23164--- head-2008-12-01.orig/drivers/xen/char/mem.c 2008-12-01 11:44:55.000000000 +0100
23165+++ head-2008-12-01/drivers/xen/char/mem.c 2008-12-01 11:49:07.000000000 +0100
23166@@ -35,7 +35,7 @@ static inline int uncached_access(struct
23167
23168 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
23169 {
23170-#ifdef CONFIG_NONPROMISC_DEVMEM
23171+#ifdef CONFIG_STRICT_DEVMEM
23172 u64 from = ((u64)pfn) << PAGE_SHIFT;
23173 u64 to = from + size;
23174 u64 cursor = from;
23175@@ -172,7 +172,10 @@ static void mmap_mem_close(struct vm_are
23176
23177 static struct vm_operations_struct mmap_mem_ops = {
23178 .open = mmap_mem_open,
23179- .close = mmap_mem_close
23180+ .close = mmap_mem_close,
23181+#ifdef CONFIG_HAVE_IOREMAP_PROT
23182+ .access = generic_access_phys
23183+#endif
23184 };
23185
23186 static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
23187Index: head-2008-12-01/drivers/xen/console/console.c
23188===================================================================
23189--- head-2008-12-01.orig/drivers/xen/console/console.c 2008-12-01 11:44:55.000000000 +0100
23190+++ head-2008-12-01/drivers/xen/console/console.c 2008-12-01 11:49:07.000000000 +0100
23191@@ -416,9 +416,7 @@ static void __xencons_tx_flush(void)
23192
23193 if (work_done && (xencons_tty != NULL)) {
23194 wake_up_interruptible(&xencons_tty->write_wait);
23195- if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
23196- (xencons_tty->ldisc.write_wakeup != NULL))
23197- (xencons_tty->ldisc.write_wakeup)(xencons_tty);
23198+ tty_wakeup(xencons_tty);
23199 }
23200 }
23201
23202@@ -619,8 +617,8 @@ static void xencons_close(struct tty_str
23203 tty->closing = 1;
23204 tty_wait_until_sent(tty, 0);
23205 tty_driver_flush_buffer(tty);
23206- if (tty->ldisc.flush_buffer != NULL)
23207- tty->ldisc.flush_buffer(tty);
23208+ if (tty->ldisc.ops->flush_buffer != NULL)
23209+ tty->ldisc.ops->flush_buffer(tty);
23210 tty->closing = 0;
23211 spin_lock_irqsave(&xencons_lock, flags);
23212 xencons_tty = NULL;
23213Index: head-2008-12-01/drivers/xen/core/evtchn.c
23214===================================================================
23215--- head-2008-12-01.orig/drivers/xen/core/evtchn.c 2008-12-01 11:37:10.000000000 +0100
23216+++ head-2008-12-01/drivers/xen/core/evtchn.c 2008-12-03 15:53:53.000000000 +0100
23217@@ -744,9 +744,9 @@ static struct irq_chip dynirq_chip = {
23218 };
23219
23220 /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
23221-static int pirq_eoi_does_unmask;
23222+static bool pirq_eoi_does_unmask;
23223 static DECLARE_BITMAP(pirq_needs_eoi, ALIGN(NR_PIRQS, PAGE_SIZE * 8))
23224- __attribute__ ((__section__(".bss.page_aligned"), __aligned__(PAGE_SIZE)));
23225+ __page_aligned_bss;
23226
23227 static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
23228 {
23229@@ -1002,6 +1002,7 @@ void xen_poll_irq(int irq)
23230 BUG();
23231 }
23232
23233+#ifdef CONFIG_PM_SLEEP
23234 static void restore_cpu_virqs(unsigned int cpu)
23235 {
23236 struct evtchn_bind_virq bind_virq;
23237@@ -1094,6 +1095,7 @@ void irq_resume(void)
23238 }
23239
23240 }
23241+#endif
23242
23243 #if defined(CONFIG_X86_IO_APIC)
23244 #define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
23245@@ -1175,7 +1177,7 @@ void __init xen_init_IRQ(void)
23246 BUG_ON(!bitmap_empty(pirq_needs_eoi, PAGE_SIZE * 8));
23247 eoi_mfn.mfn = virt_to_bus(pirq_needs_eoi) >> PAGE_SHIFT;
23248 if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_mfn, &eoi_mfn) == 0)
23249- pirq_eoi_does_unmask = 1;
23250+ pirq_eoi_does_unmask = true;
23251
23252 /* No event channels are 'live' right now. */
23253 for (i = 0; i < NR_EVENT_CHANNELS; i++)
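
Several hunks in this patch (evtchn.c above, gnttab.c and machine_reboot.c below) wrap suspend/resume-only helpers in CONFIG_PM_SLEEP so they compile away on kernels built without sleep support. A tiny sketch of how such a guard behaves; irq_resume_stub() is a placeholder, build it with and without -DCONFIG_PM_SLEEP to see both paths:

#include <stdio.h>

#ifdef CONFIG_PM_SLEEP
static void irq_resume_stub(void)
{
        printf("re-binding event channels after resume\n");
}
#endif

int main(void)
{
#ifdef CONFIG_PM_SLEEP
        irq_resume_stub();
#else
        printf("built without PM sleep support; resume path compiled out\n");
#endif
        return 0;
}
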
23254Index: head-2008-12-01/drivers/xen/core/gnttab.c
23255===================================================================
23256--- head-2008-12-01.orig/drivers/xen/core/gnttab.c 2008-12-03 15:48:43.000000000 +0100
23257+++ head-2008-12-01/drivers/xen/core/gnttab.c 2008-12-02 09:26:17.000000000 +0100
23258@@ -449,6 +449,7 @@ static int map_pte_fn(pte_t *pte, struct
23259 return 0;
23260 }
23261
23262+#ifdef CONFIG_PM_SLEEP
23263 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
23264 unsigned long addr, void *data)
23265 {
23266@@ -456,6 +457,7 @@ static int unmap_pte_fn(pte_t *pte, stru
23267 set_pte_at(&init_mm, addr, pte, __pte(0));
23268 return 0;
23269 }
23270+#endif
23271
23272 void *arch_gnttab_alloc_shared(unsigned long *frames)
23273 {
23274@@ -633,6 +635,75 @@ void __gnttab_dma_map_page(struct page *
23275 } while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
23276 }
23277
23278+#ifdef __HAVE_ARCH_PTE_SPECIAL
23279+
23280+static unsigned int GNTMAP_pte_special;
23281+
23282+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
23283+ unsigned int count)
23284+{
23285+ unsigned int i;
23286+
23287+ if (unlikely(cmd != GNTTABOP_map_grant_ref))
23288+ count = 0;
23289+
23290+ for (i = 0; i < count; ++i, ++map) {
23291+ if (!(map->flags & GNTMAP_host_map)
23292+ || !(map->flags & GNTMAP_application_map))
23293+ continue;
23294+ if (GNTMAP_pte_special)
23295+ map->flags |= GNTMAP_pte_special;
23296+ else {
23297+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
23298+ return true;
23299+ }
23300+ }
23301+
23302+ return false;
23303+}
23304+EXPORT_SYMBOL(gnttab_pre_map_adjust);
23305+
23306+#if CONFIG_XEN_COMPAT < 0x030400
23307+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
23308+{
23309+ unsigned int i;
23310+ int rc = 0;
23311+
23312+ for (i = 0; i < count && rc == 0; ++i, ++map) {
23313+ pte_t pte;
23314+
23315+ if (!(map->flags & GNTMAP_host_map)
23316+ || !(map->flags & GNTMAP_application_map))
23317+ continue;
23318+
23319+#ifdef CONFIG_X86
23320+ pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
23321+ | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
23322+ | _PAGE_SPECIAL)
23323+ & __supported_pte_mask);
23324+#else
23325+#error Architecture not yet supported.
23326+#endif
23327+ if (!(map->flags & GNTMAP_readonly))
23328+ pte = pte_mkwrite(pte);
23329+
23330+ if (map->flags & GNTMAP_contains_pte) {
23331+ mmu_update_t u;
23332+
23333+ u.ptr = map->host_addr;
23334+ u.val = __pte_val(pte);
23335+ rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
23336+ } else
23337+ rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
23338+ }
23339+
23340+ return rc;
23341+}
23342+EXPORT_SYMBOL(gnttab_post_map_adjust);
23343+#endif
23344+
23345+#endif /* __HAVE_ARCH_PTE_SPECIAL */
23346+
23347 int gnttab_resume(void)
23348 {
23349 if (max_nr_grant_frames() < nr_grant_frames)
23350@@ -640,6 +711,7 @@ int gnttab_resume(void)
23351 return gnttab_map(0, nr_grant_frames - 1);
23352 }
23353
23354+#ifdef CONFIG_PM_SLEEP
23355 int gnttab_suspend(void)
23356 {
23357 #ifdef CONFIG_X86
23358@@ -649,6 +721,7 @@ int gnttab_suspend(void)
23359 #endif
23360 return 0;
23361 }
23362+#endif
23363
23364 #else /* !CONFIG_XEN */
23365
23366@@ -759,6 +832,18 @@ int __devinit gnttab_init(void)
23367 gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
23368 gnttab_free_head = NR_RESERVED_ENTRIES;
23369
23370+#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
23371+ if (!xen_feature(XENFEAT_auto_translated_physmap)
23372+ && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
23373+#ifdef CONFIG_X86
23374+ GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
23375+ >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
23376+#else
23377+#error Architecture not yet supported.
23378+#endif
23379+ }
23380+#endif
23381+
23382 return 0;
23383
23384 ini_nomem:
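
The new gnttab_pre_map_adjust()/gnttab_post_map_adjust() pair above marks user-space grant mappings as "special" PTEs: on hypervisors that advertise XENFEAT_gnttab_map_avail_bits the flag is attached to the map request, otherwise the PTE has to be patched after the hypercall (see the HYPERVISOR_grant_table_op() change later in this patch). A compact model of that decision; the flag values are invented, only the control flow follows the hunk:

#include <stdbool.h>
#include <stdio.h>

#define GNTMAP_HOST_MAP        0x1
#define GNTMAP_APPLICATION_MAP 0x2
#define GNTMAP_PTE_SPECIAL     0x4   /* hypothetical stand-in for the real encoding */

static bool pre_map_adjust(unsigned *flags, bool hv_supports_avail_bits)
{
        if (!(*flags & GNTMAP_HOST_MAP) || !(*flags & GNTMAP_APPLICATION_MAP))
                return false;                  /* kernel-only mapping: nothing to do */
        if (hv_supports_avail_bits) {
                *flags |= GNTMAP_PTE_SPECIAL;  /* hypervisor sets the special bit */
                return false;
        }
        return true;                           /* caller must fix the PTE afterwards */
}

int main(void)
{
        unsigned flags = GNTMAP_HOST_MAP | GNTMAP_APPLICATION_MAP;
        bool fixup = pre_map_adjust(&flags, false);

        printf("flags=%#x, post-map fixup needed: %s\n", flags, fixup ? "yes" : "no");
        return 0;
}
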
23385Index: head-2008-12-01/drivers/xen/core/machine_kexec.c
23386===================================================================
23387--- head-2008-12-01.orig/drivers/xen/core/machine_kexec.c 2008-12-01 11:44:55.000000000 +0100
23388+++ head-2008-12-01/drivers/xen/core/machine_kexec.c 2008-12-01 11:49:07.000000000 +0100
23389@@ -90,7 +90,7 @@ void __init xen_machine_kexec_setup_reso
23390 xen_hypervisor_res.start = range.start;
23391 xen_hypervisor_res.end = range.start + range.size - 1;
23392 xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
23393-#ifdef CONFIG_X86_64
23394+#ifdef CONFIG_X86
23395 insert_resource(&iomem_resource, &xen_hypervisor_res);
23396 #endif
23397
23398@@ -105,7 +105,7 @@ void __init xen_machine_kexec_setup_reso
23399 if (range.size) {
23400 crashk_res.start = range.start;
23401 crashk_res.end = range.start + range.size - 1;
23402-#ifdef CONFIG_X86_64
23403+#ifdef CONFIG_X86
23404 insert_resource(&iomem_resource, &crashk_res);
23405 #endif
23406 }
23407@@ -152,7 +152,7 @@ void __init xen_machine_kexec_setup_reso
23408 return;
23409 }
23410
23411-#ifndef CONFIG_X86_64
23412+#ifndef CONFIG_X86
23413 void __init xen_machine_kexec_register_resources(struct resource *res)
23414 {
23415 request_resource(res, &xen_hypervisor_res);
23416Index: head-2008-12-01/drivers/xen/core/machine_reboot.c
23417===================================================================
23418--- head-2008-12-01.orig/drivers/xen/core/machine_reboot.c 2008-12-01 11:44:55.000000000 +0100
23419+++ head-2008-12-01/drivers/xen/core/machine_reboot.c 2008-12-01 11:49:07.000000000 +0100
23420@@ -65,6 +65,7 @@ EXPORT_SYMBOL(machine_restart);
23421 EXPORT_SYMBOL(machine_halt);
23422 EXPORT_SYMBOL(machine_power_off);
23423
23424+#ifdef CONFIG_PM_SLEEP
23425 static void pre_suspend(void)
23426 {
23427 HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
23428@@ -119,6 +120,7 @@ static void post_suspend(int suspend_can
23429 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
23430 virt_to_mfn(pfn_to_mfn_frame_list_list);
23431 }
23432+#endif
23433
23434 #else /* !(defined(__i386__) || defined(__x86_64__)) */
23435
23436@@ -137,6 +139,7 @@ static void post_suspend(int suspend_can
23437
23438 #endif
23439
23440+#ifdef CONFIG_PM_SLEEP
23441 struct suspend {
23442 int fast_suspend;
23443 void (*resume_notifier)(int);
23444@@ -230,7 +233,8 @@ int __xen_suspend(int fast_suspend, void
23445
23446 if (fast_suspend) {
23447 xenbus_suspend();
23448- err = stop_machine_run(take_machine_down, &suspend, 0);
23449+ err = stop_machine(take_machine_down, &suspend,
23450+ &cpumask_of_cpu(0));
23451 if (err < 0)
23452 xenbus_suspend_cancel();
23453 } else {
23454@@ -253,3 +257,4 @@ int __xen_suspend(int fast_suspend, void
23455
23456 return 0;
23457 }
23458+#endif
23459Index: head-2008-12-01/drivers/xen/core/reboot.c
23460===================================================================
23461--- head-2008-12-01.orig/drivers/xen/core/reboot.c 2008-12-01 11:36:47.000000000 +0100
23462+++ head-2008-12-01/drivers/xen/core/reboot.c 2008-12-01 11:49:07.000000000 +0100
23463@@ -29,17 +29,12 @@ MODULE_LICENSE("Dual BSD/GPL");
23464 /* Ignore multiple shutdown requests. */
23465 static int shutting_down = SHUTDOWN_INVALID;
23466
23467-/* Was last suspend request cancelled? */
23468-static int suspend_cancelled;
23469-
23470 /* Can we leave APs online when we suspend? */
23471 static int fast_suspend;
23472
23473 static void __shutdown_handler(struct work_struct *unused);
23474 static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
23475
23476-static int setup_suspend_evtchn(void);
23477-
23478 int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
23479
23480 static int shutdown_process(void *__unused)
23481@@ -69,6 +64,13 @@ static int shutdown_process(void *__unus
23482 return 0;
23483 }
23484
23485+#ifdef CONFIG_PM_SLEEP
23486+
23487+static int setup_suspend_evtchn(void);
23488+
23489+/* Was last suspend request cancelled? */
23490+static int suspend_cancelled;
23491+
23492 static void xen_resume_notifier(int _suspend_cancelled)
23493 {
23494 int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
23495@@ -118,6 +120,10 @@ static int xen_suspend(void *__unused)
23496 return 0;
23497 }
23498
23499+#else
23500+# define xen_suspend NULL
23501+#endif
23502+
23503 static void switch_shutdown_state(int new_state)
23504 {
23505 int prev_state, old_state = SHUTDOWN_INVALID;
23506@@ -194,8 +200,10 @@ static void shutdown_handler(struct xenb
23507 new_state = SHUTDOWN_POWEROFF;
23508 else if (strcmp(str, "reboot") == 0)
23509 ctrl_alt_del();
23510+#ifdef CONFIG_PM_SLEEP
23511 else if (strcmp(str, "suspend") == 0)
23512 new_state = SHUTDOWN_SUSPEND;
23513+#endif
23514 else if (strcmp(str, "halt") == 0)
23515 new_state = SHUTDOWN_HALT;
23516 else
23517@@ -247,6 +255,7 @@ static struct xenbus_watch sysrq_watch =
23518 .callback = sysrq_handler
23519 };
23520
23521+#ifdef CONFIG_PM_SLEEP
23522 static irqreturn_t suspend_int(int irq, void* dev_id)
23523 {
23524 switch_shutdown_state(SHUTDOWN_SUSPEND);
23525@@ -274,6 +283,9 @@ static int setup_suspend_evtchn(void)
23526
23527 return 0;
23528 }
23529+#else
23530+#define setup_suspend_evtchn() 0
23531+#endif
23532
23533 static int setup_shutdown_watcher(void)
23534 {
23535Index: head-2008-12-01/drivers/xen/core/smpboot.c
23536===================================================================
23537--- head-2008-12-01.orig/drivers/xen/core/smpboot.c 2008-12-01 11:44:55.000000000 +0100
23538+++ head-2008-12-01/drivers/xen/core/smpboot.c 2008-12-01 11:49:07.000000000 +0100
23539@@ -27,6 +27,7 @@
23540
23541 extern irqreturn_t smp_reschedule_interrupt(int, void *);
23542 extern irqreturn_t smp_call_function_interrupt(int, void *);
23543+extern irqreturn_t smp_call_function_single_interrupt(int, void *);
23544
23545 extern int local_setup_timer(unsigned int cpu);
23546 extern void local_teardown_timer(unsigned int cpu);
23547@@ -54,8 +55,10 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
23548
23549 static DEFINE_PER_CPU(int, resched_irq);
23550 static DEFINE_PER_CPU(int, callfunc_irq);
23551+static DEFINE_PER_CPU(int, call1func_irq);
23552 static char resched_name[NR_CPUS][15];
23553 static char callfunc_name[NR_CPUS][15];
23554+static char call1func_name[NR_CPUS][15];
23555
23556 #ifdef CONFIG_X86_LOCAL_APIC
23557 #define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
23558@@ -77,8 +80,10 @@ void __init prefill_possible_map(void)
23559
23560 for (i = 0; i < NR_CPUS; i++) {
23561 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
23562- if (rc >= 0)
23563+ if (rc >= 0) {
23564 cpu_set(i, cpu_possible_map);
23565+ nr_cpu_ids = i + 1;
23566+ }
23567 }
23568 }
23569
23570@@ -114,7 +119,8 @@ static int __cpuinit xen_smp_intr_init(u
23571 {
23572 int rc;
23573
23574- per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
23575+ per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) =
23576+ per_cpu(call1func_irq, cpu) = -1;
23577
23578 sprintf(resched_name[cpu], "resched%u", cpu);
23579 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
23580@@ -138,6 +144,17 @@ static int __cpuinit xen_smp_intr_init(u
23581 goto fail;
23582 per_cpu(callfunc_irq, cpu) = rc;
23583
23584+ sprintf(call1func_name[cpu], "call1func%u", cpu);
23585+ rc = bind_ipi_to_irqhandler(CALL_FUNC_SINGLE_VECTOR,
23586+ cpu,
23587+ smp_call_function_single_interrupt,
23588+ IRQF_DISABLED|IRQF_NOBALANCING,
23589+ call1func_name[cpu],
23590+ NULL);
23591+ if (rc < 0)
23592+ goto fail;
23593+ per_cpu(call1func_irq, cpu) = rc;
23594+
23595 rc = xen_spinlock_init(cpu);
23596 if (rc < 0)
23597 goto fail;
23598@@ -152,6 +169,8 @@ static int __cpuinit xen_smp_intr_init(u
23599 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23600 if (per_cpu(callfunc_irq, cpu) >= 0)
23601 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23602+ if (per_cpu(call1func_irq, cpu) >= 0)
23603+ unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23604 xen_spinlock_cleanup(cpu);
23605 return rc;
23606 }
23607@@ -164,6 +183,7 @@ static void __cpuexit xen_smp_intr_exit(
23608
23609 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23610 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23611+ unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23612 xen_spinlock_cleanup(cpu);
23613 }
23614 #endif
23615@@ -171,11 +191,7 @@ static void __cpuexit xen_smp_intr_exit(
23616 void __cpuinit cpu_bringup(void)
23617 {
23618 cpu_init();
23619-#ifdef __i386__
23620 identify_secondary_cpu(&current_cpu_data);
23621-#else
23622- identify_cpu(&current_cpu_data);
23623-#endif
23624 touch_softlockup_watchdog();
23625 preempt_disable();
23626 local_irq_enable();
23627@@ -255,9 +271,6 @@ void __init smp_prepare_cpus(unsigned in
23628 struct task_struct *idle;
23629 int apicid;
23630 struct vcpu_get_physid cpu_id;
23631-#ifdef __x86_64__
23632- struct desc_ptr *gdt_descr;
23633-#endif
23634 void *gdt_addr;
23635
23636 apicid = 0;
23637@@ -270,7 +283,7 @@ void __init smp_prepare_cpus(unsigned in
23638
23639 current_thread_info()->cpu = 0;
23640
23641- for (cpu = 0; cpu < NR_CPUS; cpu++) {
23642+ for_each_possible_cpu (cpu) {
23643 cpus_clear(per_cpu(cpu_sibling_map, cpu));
23644 cpus_clear(per_cpu(cpu_core_map, cpu));
23645 }
23646@@ -297,21 +310,10 @@ void __init smp_prepare_cpus(unsigned in
23647 if (IS_ERR(idle))
23648 panic("failed fork for CPU %d", cpu);
23649
23650-#ifdef __x86_64__
23651- gdt_descr = &cpu_gdt_descr[cpu];
23652- gdt_descr->address = get_zeroed_page(GFP_KERNEL);
23653- if (unlikely(!gdt_descr->address)) {
23654- printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
23655- cpu);
23656- continue;
23657- }
23658- gdt_descr->size = GDT_SIZE;
23659- memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
23660- gdt_addr = (void *)gdt_descr->address;
23661-#else
23662+#ifdef __i386__
23663 init_gdt(cpu);
23664- gdt_addr = get_cpu_gdt_table(cpu);
23665 #endif
23666+ gdt_addr = get_cpu_gdt_table(cpu);
23667 make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
23668
23669 apicid = cpu;
23670Index: head-2008-12-01/drivers/xen/core/spinlock.c
23671===================================================================
23672--- head-2008-12-01.orig/drivers/xen/core/spinlock.c 2008-12-01 11:37:10.000000000 +0100
23673+++ head-2008-12-01/drivers/xen/core/spinlock.c 2008-12-01 11:51:53.000000000 +0100
23674@@ -73,9 +73,9 @@ int xen_spin_wait(raw_spinlock_t *lock,
23675 /* announce we're spinning */
23676 spinning.ticket = token;
23677 spinning.lock = lock;
23678- spinning.prev = __get_cpu_var(spinning);
23679+ spinning.prev = x86_read_percpu(spinning);
23680 smp_wmb();
23681- __get_cpu_var(spinning) = &spinning;
23682+ x86_write_percpu(spinning, &spinning);
23683
23684 /* clear pending */
23685 xen_clear_irq_pending(irq);
23686@@ -102,7 +102,7 @@ int xen_spin_wait(raw_spinlock_t *lock,
23687 kstat_this_cpu.irqs[irq] += !rc;
23688
23689 /* announce we're done */
23690- __get_cpu_var(spinning) = spinning.prev;
23691+ x86_write_percpu(spinning, spinning.prev);
23692 rm_lock = &__get_cpu_var(spinning_rm_lock);
23693 raw_local_irq_save(flags);
23694 __raw_write_lock(rm_lock);
23695Index: head-2008-12-01/drivers/xen/fbfront/xenfb.c
23696===================================================================
23697--- head-2008-12-01.orig/drivers/xen/fbfront/xenfb.c 2008-12-01 11:44:55.000000000 +0100
23698+++ head-2008-12-01/drivers/xen/fbfront/xenfb.c 2008-12-01 11:49:07.000000000 +0100
23699@@ -18,6 +18,7 @@
23700 * frame buffer.
23701 */
23702
23703+#include <linux/console.h>
23704 #include <linux/kernel.h>
23705 #include <linux/errno.h>
23706 #include <linux/fb.h>
23707@@ -544,6 +545,28 @@ static unsigned long vmalloc_to_mfn(void
23708 return pfn_to_mfn(vmalloc_to_pfn(address));
23709 }
23710
23711+static __devinit void
23712+xenfb_make_preferred_console(void)
23713+{
23714+ struct console *c;
23715+
23716+ if (console_set_on_cmdline)
23717+ return;
23718+
23719+ acquire_console_sem();
23720+ for (c = console_drivers; c; c = c->next) {
23721+ if (!strcmp(c->name, "tty") && c->index == 0)
23722+ break;
23723+ }
23724+ release_console_sem();
23725+ if (c) {
23726+ unregister_console(c);
23727+ c->flags |= CON_CONSDEV;
23728+ c->flags &= ~CON_PRINTBUFFER; /* don't print again */
23729+ register_console(c);
23730+ }
23731+}
23732+
23733 static int __devinit xenfb_probe(struct xenbus_device *dev,
23734 const struct xenbus_device_id *id)
23735 {
23736@@ -673,6 +696,7 @@ static int __devinit xenfb_probe(struct
23737 goto error;
23738 }
23739
23740+ xenfb_make_preferred_console();
23741 return 0;
23742
23743 error_nomem:
23744@@ -881,4 +905,5 @@ static void __exit xenfb_cleanup(void)
23745 module_init(xenfb_init);
23746 module_exit(xenfb_cleanup);
23747
23748+MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
23749 MODULE_LICENSE("GPL");
23750Index: head-2008-12-01/drivers/xen/fbfront/xenkbd.c
23751===================================================================
23752--- head-2008-12-01.orig/drivers/xen/fbfront/xenkbd.c 2008-12-01 11:36:07.000000000 +0100
23753+++ head-2008-12-01/drivers/xen/fbfront/xenkbd.c 2008-12-01 11:49:07.000000000 +0100
23754@@ -350,4 +350,5 @@ static void __exit xenkbd_cleanup(void)
23755 module_init(xenkbd_init);
23756 module_exit(xenkbd_cleanup);
23757
23758+MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
23759 MODULE_LICENSE("GPL");
23760Index: head-2008-12-01/drivers/xen/gntdev/gntdev.c
23761===================================================================
23762--- head-2008-12-01.orig/drivers/xen/gntdev/gntdev.c 2008-12-01 11:44:55.000000000 +0100
23763+++ head-2008-12-01/drivers/xen/gntdev/gntdev.c 2008-12-01 11:49:07.000000000 +0100
23764@@ -418,7 +418,7 @@ static int __init gntdev_init(void)
23765 }
23766
23767 device = device_create(class, NULL, MKDEV(gntdev_major, 0),
23768- GNTDEV_NAME);
23769+ NULL, GNTDEV_NAME);
23770 if (IS_ERR(device)) {
23771 printk(KERN_ERR "Error creating gntdev device in xen_class\n");
23772 printk(KERN_ERR "gntdev created with major number = %d\n",
23773Index: head-2008-12-01/drivers/xen/netfront/accel.c
23774===================================================================
23775--- head-2008-12-01.orig/drivers/xen/netfront/accel.c 2008-12-01 11:36:55.000000000 +0100
23776+++ head-2008-12-01/drivers/xen/netfront/accel.c 2008-12-01 11:49:07.000000000 +0100
23777@@ -28,6 +28,7 @@
23778 * IN THE SOFTWARE.
23779 */
23780
23781+#include <linux/version.h>
23782 #include <linux/netdevice.h>
23783 #include <linux/skbuff.h>
23784 #include <linux/list.h>
23785Index: head-2008-12-01/drivers/xen/netfront/netfront.c
23786===================================================================
23787--- head-2008-12-01.orig/drivers/xen/netfront/netfront.c 2008-12-01 11:44:55.000000000 +0100
23788+++ head-2008-12-01/drivers/xen/netfront/netfront.c 2008-12-01 11:49:07.000000000 +0100
23789@@ -640,7 +640,7 @@ static int network_open(struct net_devic
23790 }
23791 spin_unlock_bh(&np->rx_lock);
23792
23793- network_maybe_wake_tx(dev);
23794+ netif_start_queue(dev);
23795
23796 return 0;
23797 }
23798Index: head-2008-12-01/drivers/xen/sfc_netback/accel.h
23799===================================================================
23800--- head-2008-12-01.orig/drivers/xen/sfc_netback/accel.h 2008-12-03 15:48:43.000000000 +0100
23801+++ head-2008-12-01/drivers/xen/sfc_netback/accel.h 2008-12-01 11:49:07.000000000 +0100
23802@@ -25,6 +25,7 @@
23803 #ifndef NETBACK_ACCEL_H
23804 #define NETBACK_ACCEL_H
23805
23806+#include <linux/version.h>
23807 #include <linux/slab.h>
23808 #include <linux/ip.h>
23809 #include <linux/tcp.h>
23810Index: head-2008-12-01/drivers/xen/sfc_netfront/accel.h
23811===================================================================
23812--- head-2008-12-01.orig/drivers/xen/sfc_netfront/accel.h 2008-12-01 11:29:05.000000000 +0100
23813+++ head-2008-12-01/drivers/xen/sfc_netfront/accel.h 2008-12-01 11:49:07.000000000 +0100
23814@@ -35,6 +35,7 @@
23815 #include <xen/evtchn.h>
23816
23817 #include <linux/kernel.h>
23818+#include <linux/version.h>
23819 #include <linux/list.h>
23820
23821 enum netfront_accel_post_status {
23822Index: head-2008-12-01/drivers/xen/xenbus/xenbus_comms.c
23823===================================================================
23824--- head-2008-12-01.orig/drivers/xen/xenbus/xenbus_comms.c 2008-12-01 11:36:47.000000000 +0100
23825+++ head-2008-12-01/drivers/xen/xenbus/xenbus_comms.c 2008-12-01 11:49:07.000000000 +0100
23826@@ -228,14 +228,11 @@ int xb_init_comms(void)
23827 intf->rsp_cons = intf->rsp_prod;
23828 }
23829
23830+#if defined(CONFIG_XEN) || defined(MODULE)
23831 if (xenbus_irq)
23832 unbind_from_irqhandler(xenbus_irq, &xb_waitq);
23833
23834-#if defined(CONFIG_XEN) || defined(MODULE)
23835 err = bind_caller_port_to_irqhandler(
23836-#else
23837- err = bind_evtchn_to_irqhandler(
23838-#endif
23839 xen_store_evtchn, wake_waiting,
23840 0, "xenbus", &xb_waitq);
23841 if (err <= 0) {
23842@@ -244,6 +241,20 @@ int xb_init_comms(void)
23843 }
23844
23845 xenbus_irq = err;
23846+#else
23847+ if (xenbus_irq) {
23848+ /* Already have an irq; assume we're resuming */
23849+ rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
23850+ } else {
23851+ err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
23852+ 0, "xenbus", &xb_waitq);
23853+ if (err <= 0) {
23854+ printk(KERN_ERR "XENBUS request irq failed %i\n", err);
23855+ return err;
23856+ }
23857+ xenbus_irq = err;
23858+ }
23859+#endif
23860
23861 return 0;
23862 }
23863Index: head-2008-12-01/drivers/xen/xenbus/xenbus_probe.c
23864===================================================================
23865--- head-2008-12-01.orig/drivers/xen/xenbus/xenbus_probe.c 2008-12-01 11:44:55.000000000 +0100
23866+++ head-2008-12-01/drivers/xen/xenbus/xenbus_probe.c 2008-12-01 11:49:07.000000000 +0100
23867@@ -36,6 +36,7 @@
23868 __FUNCTION__, __LINE__, ##args)
23869
23870 #include <linux/kernel.h>
23871+#include <linux/version.h>
23872 #include <linux/err.h>
23873 #include <linux/string.h>
23874 #include <linux/ctype.h>
23875Index: head-2008-12-01/fs/aio.c
23876===================================================================
23877--- head-2008-12-01.orig/fs/aio.c 2008-12-01 11:44:55.000000000 +0100
23878+++ head-2008-12-01/fs/aio.c 2008-12-01 11:49:07.000000000 +0100
23879@@ -1319,7 +1319,7 @@ static int make_aio_fd(struct kioctx *io
23880 int fd;
23881 struct file *file;
23882
23883- fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
23884+ fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
23885 if (fd < 0)
23886 return fd;
23887
23888Index: head-2008-12-01/include/asm-generic/pgtable.h
23889===================================================================
23890--- head-2008-12-01.orig/include/asm-generic/pgtable.h 2008-12-01 11:29:05.000000000 +0100
23891+++ head-2008-12-01/include/asm-generic/pgtable.h 2008-12-01 11:49:07.000000000 +0100
23892@@ -99,10 +99,6 @@ static inline void ptep_set_wrprotect(st
23893 }
23894 #endif
23895
23896-#ifndef arch_change_pte_range
23897-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
23898-#endif
23899-
23900 #ifndef __HAVE_ARCH_PTE_SAME
23901 #define pte_same(A,B) (pte_val(A) == pte_val(B))
23902 #endif
23903Index: head-2008-12-01/include/asm-x86/dma-mapping.h
23904===================================================================
23905--- head-2008-12-01.orig/include/asm-x86/dma-mapping.h 2008-12-01 11:44:55.000000000 +0100
23906+++ head-2008-12-01/include/asm-x86/dma-mapping.h 2008-12-01 11:49:07.000000000 +0100
23907@@ -74,7 +74,7 @@ static inline struct dma_mapping_ops *ge
23908 /* Make sure we keep the same behaviour */
23909 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
23910 {
23911-#ifdef CONFIG_X86_32
23912+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
23913 return 0;
23914 #else
23915 struct dma_mapping_ops *ops = get_dma_ops(dev);
23916Index: head-2008-12-01/include/asm-x86/mach-xen/asm/desc.h
23917===================================================================
23918--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/desc.h 2008-12-01 11:44:55.000000000 +0100
23919+++ head-2008-12-01/include/asm-x86/mach-xen/asm/desc.h 2008-12-01 11:49:07.000000000 +0100
23920@@ -31,11 +31,17 @@ extern struct desc_ptr idt_descr;
23921 extern gate_desc idt_table[];
23922 #endif
23923
23924+struct gdt_page {
23925+ struct desc_struct gdt[GDT_ENTRIES];
23926+} __attribute__((aligned(PAGE_SIZE)));
23927+DECLARE_PER_CPU(struct gdt_page, gdt_page);
23928+
23929+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
23930+{
23931+ return per_cpu(gdt_page, cpu).gdt;
23932+}
23933+
23934 #ifdef CONFIG_X86_64
23935-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
23936-extern struct desc_ptr cpu_gdt_descr[];
23937-/* the cpu gdt accessor */
23938-#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
23939
23940 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
23941 unsigned dpl, unsigned ist, unsigned seg)
23942@@ -53,16 +59,6 @@ static inline void pack_gate(gate_desc *
23943 }
23944
23945 #else
23946-struct gdt_page {
23947- struct desc_struct gdt[GDT_ENTRIES];
23948-} __attribute__((aligned(PAGE_SIZE)));
23949-DECLARE_PER_CPU(struct gdt_page, gdt_page);
23950-
23951-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
23952-{
23953- return per_cpu(gdt_page, cpu).gdt;
23954-}
23955-
23956 static inline void pack_gate(gate_desc *gate, unsigned char type,
23957 unsigned long base, unsigned dpl, unsigned flags,
23958 unsigned short seg)
23959@@ -333,6 +329,28 @@ static inline void set_intr_gate(unsigne
23960 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
23961 }
23962
23963+#define SYS_VECTOR_FREE 0
23964+#define SYS_VECTOR_ALLOCED 1
23965+
23966+extern int first_system_vector;
23967+extern char system_vectors[];
23968+
23969+static inline void alloc_system_vector(int vector)
23970+{
23971+ if (system_vectors[vector] == SYS_VECTOR_FREE) {
23972+ system_vectors[vector] = SYS_VECTOR_ALLOCED;
23973+ if (first_system_vector > vector)
23974+ first_system_vector = vector;
23975+ } else
23976+ BUG();
23977+}
23978+
23979+static inline void alloc_intr_gate(unsigned int n, void *addr)
23980+{
23981+ alloc_system_vector(n);
23982+ set_intr_gate(n, addr);
23983+}
23984+
23985 /*
23986 * This routine sets up an interrupt gate at directory privilege level 3.
23987 */
23988Index: head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap.h
23989===================================================================
23990--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/fixmap.h 2008-12-01 11:44:55.000000000 +0100
23991+++ head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap.h 2008-12-01 11:49:07.000000000 +0100
23992@@ -7,7 +7,58 @@
23993 # include "fixmap_64.h"
23994 #endif
23995
23996+extern int fixmaps_set;
23997+
23998+void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
23999+
24000+static inline void __set_fixmap(enum fixed_addresses idx,
24001+ maddr_t phys, pgprot_t flags)
24002+{
24003+ xen_set_fixmap(idx, phys, flags);
24004+}
24005+
24006+#define set_fixmap(idx, phys) \
24007+ __set_fixmap(idx, phys, PAGE_KERNEL)
24008+
24009+/*
24010+ * Some hardware wants to get fixmapped without caching.
24011+ */
24012+#define set_fixmap_nocache(idx, phys) \
24013+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24014+
24015 #define clear_fixmap(idx) \
24016 __set_fixmap(idx, 0, __pgprot(0))
24017
24018+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24019+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24020+
24021+extern void __this_fixmap_does_not_exist(void);
24022+
24023+/*
24024+ * 'index to address' translation. If anyone tries to use the idx
24025+ * directly without translation, we catch the bug with a NULL-deference
24026+ * kernel oops. Illegal ranges of incoming indices are caught too.
24027+ */
24028+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24029+{
24030+ /*
24031+ * this branch gets completely eliminated after inlining,
24032+ * except when someone tries to use fixaddr indices in an
24033+ * illegal way. (such as mixing up address types or using
24034+ * out-of-range indices).
24035+ *
24036+ * If it doesn't get removed, the linker will complain
24037+ * loudly with a reasonably clear error message..
24038+ */
24039+ if (idx >= __end_of_fixed_addresses)
24040+ __this_fixmap_does_not_exist();
24041+
24042+ return __fix_to_virt(idx);
24043+}
24044+
24045+static inline unsigned long virt_to_fix(const unsigned long vaddr)
24046+{
24047+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24048+ return __virt_to_fix(vaddr);
24049+}
24050 #endif
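
The fixmap.h rework above hoists __set_fixmap(), fix_to_virt() and virt_to_fix() out of the 32/64-bit headers into the common one. The address math itself is unchanged: slot idx maps to FIXADDR_TOP - (idx << PAGE_SHIFT). A runnable illustration with sample constants (the real FIXADDR_TOP differs per configuration):

#include <stdio.h>

#define PAGE_SHIFT   12
#define FIXADDR_TOP  0xfffff000ul   /* sample value, not the kernel's */

#define __fix_to_virt(x)  (FIXADDR_TOP - ((unsigned long)(x) << PAGE_SHIFT))

int main(void)
{
        unsigned idx;

        for (idx = 0; idx < 4; idx++)
                printf("fixmap slot %u -> %#lx\n", idx, __fix_to_virt(idx));
        return 0;
}
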
24051Index: head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_32.h
24052===================================================================
24053--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2008-12-01 11:44:55.000000000 +0100
24054+++ head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_32.h 2008-12-01 11:49:07.000000000 +0100
24055@@ -58,10 +58,17 @@ enum fixed_addresses {
24056 #ifdef CONFIG_X86_LOCAL_APIC
24057 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24058 #endif
24059-#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
24060+#ifndef CONFIG_XEN
24061+#ifdef CONFIG_X86_IO_APIC
24062 FIX_IO_APIC_BASE_0,
24063 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
24064 #endif
24065+#else
24066+ FIX_SHARED_INFO,
24067+#define NR_FIX_ISAMAPS 256
24068+ FIX_ISAMAP_END,
24069+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24070+#endif
24071 #ifdef CONFIG_X86_VISWS_APIC
24072 FIX_CO_CPU, /* Cobalt timer */
24073 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
24074@@ -78,51 +85,38 @@ enum fixed_addresses {
24075 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
24076 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
24077 #endif
24078-#ifdef CONFIG_ACPI
24079- FIX_ACPI_BEGIN,
24080- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24081-#endif
24082 #ifdef CONFIG_PCI_MMCONFIG
24083 FIX_PCIE_MCFG,
24084 #endif
24085 #ifdef CONFIG_PARAVIRT
24086 FIX_PARAVIRT_BOOTMAP,
24087 #endif
24088- FIX_SHARED_INFO,
24089-#define NR_FIX_ISAMAPS 256
24090- FIX_ISAMAP_END,
24091- FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24092 __end_of_permanent_fixed_addresses,
24093 /*
24094 * 256 temporary boot-time mappings, used by early_ioremap(),
24095 * before ioremap() is functional.
24096 *
24097- * We round it up to the next 512 pages boundary so that we
24098+ * We round it up to the next 256 pages boundary so that we
24099 * can have a single pgd entry and a single pte table:
24100 */
24101 #define NR_FIX_BTMAPS 64
24102 #define FIX_BTMAPS_NESTING 4
24103- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24104- (__end_of_permanent_fixed_addresses & 511),
24105+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
24106+ (__end_of_permanent_fixed_addresses & 255),
24107 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24108 FIX_WP_TEST,
24109+#ifdef CONFIG_ACPI
24110+ FIX_ACPI_BEGIN,
24111+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24112+#endif
24113 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24114 FIX_OHCI1394_BASE,
24115 #endif
24116 __end_of_fixed_addresses
24117 };
24118
24119-extern void __set_fixmap(enum fixed_addresses idx,
24120- maddr_t phys, pgprot_t flags);
24121 extern void reserve_top_address(unsigned long reserve);
24122
24123-#define set_fixmap(idx, phys) \
24124- __set_fixmap(idx, phys, PAGE_KERNEL)
24125-/*
24126- * Some hardware wants to get fixmapped without caching.
24127- */
24128-#define set_fixmap_nocache(idx, phys) \
24129- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24130
24131 #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
24132
24133@@ -131,38 +125,5 @@ extern void reserve_top_address(unsigned
24134 #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
24135 #define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
24136
24137-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24138-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24139-
24140-extern void __this_fixmap_does_not_exist(void);
24141-
24142-/*
24143- * 'index to address' translation. If anyone tries to use the idx
24144- * directly without tranlation, we catch the bug with a NULL-deference
24145- * kernel oops. Illegal ranges of incoming indices are caught too.
24146- */
24147-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24148-{
24149- /*
24150- * this branch gets completely eliminated after inlining,
24151- * except when someone tries to use fixaddr indices in an
24152- * illegal way. (such as mixing up address types or using
24153- * out-of-range indices).
24154- *
24155- * If it doesn't get removed, the linker will complain
24156- * loudly with a reasonably clear error message..
24157- */
24158- if (idx >= __end_of_fixed_addresses)
24159- __this_fixmap_does_not_exist();
24160-
24161- return __fix_to_virt(idx);
24162-}
24163-
24164-static inline unsigned long virt_to_fix(const unsigned long vaddr)
24165-{
24166- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24167- return __virt_to_fix(vaddr);
24168-}
24169-
24170 #endif /* !__ASSEMBLY__ */
24171 #endif
24172Index: head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_64.h
24173===================================================================
24174--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2008-12-01 11:44:55.000000000 +0100
24175+++ head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_64.h 2008-12-01 11:49:07.000000000 +0100
24176@@ -12,6 +12,7 @@
24177 #define _ASM_FIXMAP_64_H
24178
24179 #include <linux/kernel.h>
24180+#include <asm/acpi.h>
24181 #include <asm/apicdef.h>
24182 #include <asm/page.h>
24183 #include <asm/vsyscall.h>
24184@@ -40,7 +41,6 @@ enum fixed_addresses {
24185 VSYSCALL_HPET,
24186 FIX_DBGP_BASE,
24187 FIX_EARLYCON_MEM_BASE,
24188- FIX_HPET_BASE,
24189 #ifdef CONFIG_X86_LOCAL_APIC
24190 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24191 #endif
24192@@ -53,14 +53,21 @@ enum fixed_addresses {
24193 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
24194 + MAX_EFI_IO_PAGES - 1,
24195 #endif
24196+#ifdef CONFIG_PARAVIRT
24197+ FIX_PARAVIRT_BOOTMAP,
24198+#else
24199+ FIX_SHARED_INFO,
24200+#endif
24201 #ifdef CONFIG_ACPI
24202 FIX_ACPI_BEGIN,
24203 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24204 #endif
24205- FIX_SHARED_INFO,
24206 #define NR_FIX_ISAMAPS 256
24207 FIX_ISAMAP_END,
24208 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24209+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24210+ FIX_OHCI1394_BASE,
24211+#endif
24212 __end_of_permanent_fixed_addresses,
24213 /*
24214 * 256 temporary boot-time mappings, used by early_ioremap(),
24215@@ -71,27 +78,12 @@ enum fixed_addresses {
24216 */
24217 #define NR_FIX_BTMAPS 64
24218 #define FIX_BTMAPS_NESTING 4
24219- FIX_BTMAP_END =
24220- __end_of_permanent_fixed_addresses + 512 -
24221+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24222 (__end_of_permanent_fixed_addresses & 511),
24223 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24224-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24225- FIX_OHCI1394_BASE,
24226-#endif
24227 __end_of_fixed_addresses
24228 };
24229
24230-extern void __set_fixmap(enum fixed_addresses idx,
24231- unsigned long phys, pgprot_t flags);
24232-
24233-#define set_fixmap(idx, phys) \
24234- __set_fixmap(idx, phys, PAGE_KERNEL)
24235-/*
24236- * Some hardware wants to get fixmapped without caching.
24237- */
24238-#define set_fixmap_nocache(idx, phys) \
24239- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24240-
24241 #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
24242 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
24243 #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
24244@@ -100,30 +92,4 @@ extern void __set_fixmap(enum fixed_addr
24245 #define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
24246 #define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
24247
24248-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24249-
24250-extern void __this_fixmap_does_not_exist(void);
24251-
24252-/*
24253- * 'index to address' translation. If anyone tries to use the idx
24254- * directly without translation, we catch the bug with a NULL-deference
24255- * kernel oops. Illegal ranges of incoming indices are caught too.
24256- */
24257-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24258-{
24259- /*
24260- * this branch gets completely eliminated after inlining,
24261- * except when someone tries to use fixaddr indices in an
24262- * illegal way. (such as mixing up address types or using
24263- * out-of-range indices).
24264- *
24265- * If it doesn't get removed, the linker will complain
24266- * loudly with a reasonably clear error message..
24267- */
24268- if (idx >= __end_of_fixed_addresses)
24269- __this_fixmap_does_not_exist();
24270-
24271- return __fix_to_virt(idx);
24272-}
24273-
24274 #endif
24275Index: head-2008-12-01/include/asm-x86/mach-xen/asm/highmem.h
24276===================================================================
24277--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/highmem.h 2008-12-01 11:44:55.000000000 +0100
24278+++ head-2008-12-01/include/asm-x86/mach-xen/asm/highmem.h 2008-12-01 11:49:07.000000000 +0100
24279@@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *p
24280
24281 #define flush_cache_kmaps() do { } while (0)
24282
24283+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
24284+ unsigned long end_pfn);
24285+
24286 void clear_highpage(struct page *);
24287 static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
24288 {
24289Index: head-2008-12-01/include/asm-x86/mach-xen/asm/hypercall.h
24290===================================================================
24291--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/hypercall.h 2008-12-01 11:36:55.000000000 +0100
24292+++ head-2008-12-01/include/asm-x86/mach-xen/asm/hypercall.h 2008-12-01 11:49:07.000000000 +0100
24293@@ -323,9 +323,19 @@ static inline int __must_check
24294 HYPERVISOR_grant_table_op(
24295 unsigned int cmd, void *uop, unsigned int count)
24296 {
24297+ bool fixup = false;
24298+ int rc;
24299+
24300 if (arch_use_lazy_mmu_mode())
24301 xen_multicall_flush(false);
24302- return _hypercall3(int, grant_table_op, cmd, uop, count);
24303+#ifdef GNTTABOP_map_grant_ref
24304+ if (cmd == GNTTABOP_map_grant_ref)
24305+#endif
24306+ fixup = gnttab_pre_map_adjust(cmd, uop, count);
24307+ rc = _hypercall3(int, grant_table_op, cmd, uop, count);
24308+ if (rc == 0 && fixup)
24309+ rc = gnttab_post_map_adjust(uop, count);
24310+ return rc;
24311 }
24312
24313 static inline int __must_check
24314Index: head-2008-12-01/include/asm-x86/mach-xen/asm/hypervisor.h
24315===================================================================
24316--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2008-12-01 11:37:10.000000000 +0100
24317+++ head-2008-12-01/include/asm-x86/mach-xen/asm/hypervisor.h 2008-12-01 11:49:07.000000000 +0100
24318@@ -35,7 +35,6 @@
24319
24320 #include <linux/types.h>
24321 #include <linux/kernel.h>
24322-#include <linux/version.h>
24323 #include <linux/errno.h>
24324 #include <xen/interface/xen.h>
24325 #include <xen/interface/platform.h>
24326@@ -171,6 +170,20 @@ static inline void arch_flush_lazy_mmu_m
24327 }
24328 #endif
24329
24330+struct gnttab_map_grant_ref;
24331+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
24332+ unsigned int count);
24333+#if CONFIG_XEN_COMPAT < 0x030400
24334+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
24335+#else
24336+static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
24337+ unsigned int count)
24338+{
24339+ BUG();
24340+ return -ENOSYS;
24341+}
24342+#endif
24343+
24344 #else /* CONFIG_XEN */
24345
24346 static inline void xen_multicall_flush(bool ignore) {}
24347@@ -179,6 +192,9 @@ static inline void xen_multicall_flush(b
24348 #define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; })
24349 #define xen_multi_mmuext_op(...) ({ BUG(); -ENOSYS; })
24350
24351+#define gnttab_pre_map_adjust(...) false
24352+#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
24353+
24354 #endif /* CONFIG_XEN */
24355
24356 #if defined(CONFIG_X86_64)
24357Index: head-2008-12-01/include/asm-x86/mach-xen/asm/io.h
24358===================================================================
24359--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/io.h 2008-12-01 11:44:55.000000000 +0100
24360+++ head-2008-12-01/include/asm-x86/mach-xen/asm/io.h 2008-12-01 11:49:07.000000000 +0100
24361@@ -3,6 +3,76 @@
24362
24363 #define ARCH_HAS_IOREMAP_WC
24364
24365+#include <linux/compiler.h>
24366+
24367+/*
24368+ * early_ioremap() and early_iounmap() are for temporary early boot-time
24369+ * mappings, before the real ioremap() is functional.
24370+ * A boot-time mapping is currently limited to at most 16 pages.
24371+ */
24372+#ifndef __ASSEMBLY__
24373+extern void early_ioremap_init(void);
24374+extern void early_ioremap_clear(void);
24375+extern void early_ioremap_reset(void);
24376+extern void *early_ioremap(unsigned long offset, unsigned long size);
24377+extern void early_iounmap(void *addr, unsigned long size);
24378+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24379+#endif
24380+
24381+#define build_mmio_read(name, size, type, reg, barrier) \
24382+static inline type name(const volatile void __iomem *addr) \
24383+{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
24384+:"m" (*(volatile type __force *)addr) barrier); return ret; }
24385+
24386+#define build_mmio_write(name, size, type, reg, barrier) \
24387+static inline void name(type val, volatile void __iomem *addr) \
24388+{ asm volatile("mov" size " %0,%1": :reg (val), \
24389+"m" (*(volatile type __force *)addr) barrier); }
24390+
24391+build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
24392+build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
24393+build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
24394+
24395+build_mmio_read(__readb, "b", unsigned char, "=q", )
24396+build_mmio_read(__readw, "w", unsigned short, "=r", )
24397+build_mmio_read(__readl, "l", unsigned int, "=r", )
24398+
24399+build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
24400+build_mmio_write(writew, "w", unsigned short, "r", :"memory")
24401+build_mmio_write(writel, "l", unsigned int, "r", :"memory")
24402+
24403+build_mmio_write(__writeb, "b", unsigned char, "q", )
24404+build_mmio_write(__writew, "w", unsigned short, "r", )
24405+build_mmio_write(__writel, "l", unsigned int, "r", )
24406+
24407+#define readb_relaxed(a) __readb(a)
24408+#define readw_relaxed(a) __readw(a)
24409+#define readl_relaxed(a) __readl(a)
24410+#define __raw_readb __readb
24411+#define __raw_readw __readw
24412+#define __raw_readl __readl
24413+
24414+#define __raw_writeb __writeb
24415+#define __raw_writew __writew
24416+#define __raw_writel __writel
24417+
24418+#define mmiowb() barrier()
24419+
24420+#ifdef CONFIG_X86_64
24421+build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
24422+build_mmio_read(__readq, "q", unsigned long, "=r", )
24423+build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
24424+build_mmio_write(__writeq, "q", unsigned long, "r", )
24425+
24426+#define readq_relaxed(a) __readq(a)
24427+#define __raw_readq __readq
24428+#define __raw_writeq writeq
24429+
24430+/* Let people know we have them */
24431+#define readq readq
24432+#define writeq writeq
24433+#endif
24434+
24435 #ifdef CONFIG_X86_32
24436 # include "io_32.h"
24437 #else
24438@@ -19,4 +89,17 @@ extern int ioremap_check_change_attr(uns
24439 unsigned long prot_val);
24440 extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
24441
24442+/*
24443+ * early_ioremap() and early_iounmap() are for temporary early boot-time
24444+ * mappings, before the real ioremap() is functional.
24445+ * A boot-time mapping is currently limited to at most 16 pages.
24446+ */
24447+extern void early_ioremap_init(void);
24448+extern void early_ioremap_clear(void);
24449+extern void early_ioremap_reset(void);
24450+extern void *early_ioremap(unsigned long offset, unsigned long size);
24451+extern void early_iounmap(void *addr, unsigned long size);
24452+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24453+
24454+
24455 #endif /* _ASM_X86_IO_H */
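
The io.h hunk above replaces the open-coded readX/writeX helpers with the build_mmio_read()/build_mmio_write() generator macros. The sketch below shows roughly what one expansion, readl, looks like; it reads an ordinary variable instead of a device register so it can run in user space, and it is x86-only inline assembly:

#include <stdio.h>

/* Approximate expansion of
 * build_mmio_read(readl, "l", unsigned int, "=r", :"memory"),
 * with __iomem/__force dropped for user space. */
static inline unsigned int readl_demo(const volatile void *addr)
{
        unsigned int ret;

        asm volatile("movl %1,%0"
                     : "=r" (ret)
                     : "m" (*(const volatile unsigned int *)addr)
                     : "memory");
        return ret;
}

int main(void)
{
        volatile unsigned int reg = 0xdeadbeef;   /* stands in for an MMIO register */

        printf("readl_demo -> %#x\n", readl_demo((const volatile void *)&reg));
        return 0;
}
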
24456Index: head-2008-12-01/include/asm-x86/mach-xen/asm/io_32.h
24457===================================================================
24458--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/io_32.h 2008-12-01 11:44:55.000000000 +0100
24459+++ head-2008-12-01/include/asm-x86/mach-xen/asm/io_32.h 2008-12-01 11:49:07.000000000 +0100
24460@@ -123,6 +123,8 @@ static inline void *phys_to_virt(unsigne
24461 */
24462 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
24463 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
24464+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
24465+ unsigned long prot_val);
24466
24467 /*
24468 * The default ioremap() behavior is non-cached:
24469@@ -135,18 +137,6 @@ static inline void __iomem *ioremap(reso
24470 extern void iounmap(volatile void __iomem *addr);
24471
24472 /*
24473- * early_ioremap() and early_iounmap() are for temporary early boot-time
24474- * mappings, before the real ioremap() is functional.
24475- * A boot-time mapping is currently limited to at most 16 pages.
24476- */
24477-extern void early_ioremap_init(void);
24478-extern void early_ioremap_clear(void);
24479-extern void early_ioremap_reset(void);
24480-extern void *early_ioremap(unsigned long offset, unsigned long size);
24481-extern void early_iounmap(void *addr, unsigned long size);
24482-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24483-
24484-/*
24485 * ISA I/O bus memory addresses are 1:1 with the physical address.
24486 */
24487 #define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
24488@@ -162,55 +152,6 @@ extern void __iomem *fix_ioremap(unsigne
24489 #define virt_to_bus(_x) phys_to_machine(__pa(_x))
24490 #define bus_to_virt(_x) __va(machine_to_phys(_x))
24491
24492-/*
24493- * readX/writeX() are used to access memory mapped devices. On some
24494- * architectures the memory mapped IO stuff needs to be accessed
24495- * differently. On the x86 architecture, we just read/write the
24496- * memory location directly.
24497- */
24498-
24499-static inline unsigned char readb(const volatile void __iomem *addr)
24500-{
24501- return *(volatile unsigned char __force *)addr;
24502-}
24503-
24504-static inline unsigned short readw(const volatile void __iomem *addr)
24505-{
24506- return *(volatile unsigned short __force *)addr;
24507-}
24508-
24509-static inline unsigned int readl(const volatile void __iomem *addr)
24510-{
24511- return *(volatile unsigned int __force *) addr;
24512-}
24513-
24514-#define readb_relaxed(addr) readb(addr)
24515-#define readw_relaxed(addr) readw(addr)
24516-#define readl_relaxed(addr) readl(addr)
24517-#define __raw_readb readb
24518-#define __raw_readw readw
24519-#define __raw_readl readl
24520-
24521-static inline void writeb(unsigned char b, volatile void __iomem *addr)
24522-{
24523- *(volatile unsigned char __force *)addr = b;
24524-}
24525-
24526-static inline void writew(unsigned short b, volatile void __iomem *addr)
24527-{
24528- *(volatile unsigned short __force *)addr = b;
24529-}
24530-
24531-static inline void writel(unsigned int b, volatile void __iomem *addr)
24532-{
24533- *(volatile unsigned int __force *)addr = b;
24534-}
24535-#define __raw_writeb writeb
24536-#define __raw_writew writew
24537-#define __raw_writel writel
24538-
24539-#define mmiowb()
24540-
24541 static inline void
24542 memset_io(volatile void __iomem *addr, unsigned char val, int count)
24543 {
24544Index: head-2008-12-01/include/asm-x86/mach-xen/asm/io_64.h
24545===================================================================
24546--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/io_64.h 2008-12-01 11:44:55.000000000 +0100
24547+++ head-2008-12-01/include/asm-x86/mach-xen/asm/io_64.h 2008-12-01 11:49:07.000000000 +0100
24548@@ -190,6 +190,8 @@ extern void early_iounmap(void *addr, un
24549 */
24550 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
24551 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
24552+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
24553+ unsigned long prot_val);
24554
24555 /*
24556 * The default ioremap() behavior is non-cached:
24557@@ -220,77 +222,6 @@ extern void __iomem *fix_ioremap(unsigne
24558 #define virt_to_bus(_x) phys_to_machine(__pa(_x))
24559 #define bus_to_virt(_x) __va(machine_to_phys(_x))
24560
24561-/*
24562- * readX/writeX() are used to access memory mapped devices. On some
24563- * architectures the memory mapped IO stuff needs to be accessed
24564- * differently. On the x86 architecture, we just read/write the
24565- * memory location directly.
24566- */
24567-
24568-static inline __u8 __readb(const volatile void __iomem *addr)
24569-{
24570- return *(__force volatile __u8 *)addr;
24571-}
24572-
24573-static inline __u16 __readw(const volatile void __iomem *addr)
24574-{
24575- return *(__force volatile __u16 *)addr;
24576-}
24577-
24578-static __always_inline __u32 __readl(const volatile void __iomem *addr)
24579-{
24580- return *(__force volatile __u32 *)addr;
24581-}
24582-
24583-static inline __u64 __readq(const volatile void __iomem *addr)
24584-{
24585- return *(__force volatile __u64 *)addr;
24586-}
24587-
24588-#define readb(x) __readb(x)
24589-#define readw(x) __readw(x)
24590-#define readl(x) __readl(x)
24591-#define readq(x) __readq(x)
24592-#define readb_relaxed(a) readb(a)
24593-#define readw_relaxed(a) readw(a)
24594-#define readl_relaxed(a) readl(a)
24595-#define readq_relaxed(a) readq(a)
24596-#define __raw_readb readb
24597-#define __raw_readw readw
24598-#define __raw_readl readl
24599-#define __raw_readq readq
24600-
24601-#define mmiowb()
24602-
24603-static inline void __writel(__u32 b, volatile void __iomem *addr)
24604-{
24605- *(__force volatile __u32 *)addr = b;
24606-}
24607-
24608-static inline void __writeq(__u64 b, volatile void __iomem *addr)
24609-{
24610- *(__force volatile __u64 *)addr = b;
24611-}
24612-
24613-static inline void __writeb(__u8 b, volatile void __iomem *addr)
24614-{
24615- *(__force volatile __u8 *)addr = b;
24616-}
24617-
24618-static inline void __writew(__u16 b, volatile void __iomem *addr)
24619-{
24620- *(__force volatile __u16 *)addr = b;
24621-}
24622-
24623-#define writeq(val, addr) __writeq((val), (addr))
24624-#define writel(val, addr) __writel((val), (addr))
24625-#define writew(val, addr) __writew((val), (addr))
24626-#define writeb(val, addr) __writeb((val), (addr))
24627-#define __raw_writeb writeb
24628-#define __raw_writew writew
24629-#define __raw_writel writel
24630-#define __raw_writeq writeq
24631-
24632 void __memcpy_fromio(void *, unsigned long, unsigned);
24633 void __memcpy_toio(unsigned long, const void *, unsigned);
24634
24635Index: head-2008-12-01/include/asm-x86/mach-xen/asm/irq_vectors.h
24636===================================================================
24637--- /dev/null 1970-01-01 00:00:00.000000000 +0000
24638+++ head-2008-12-01/include/asm-x86/mach-xen/asm/irq_vectors.h 2008-12-01 11:49:07.000000000 +0100
24639@@ -0,0 +1,52 @@
24640+#ifndef _ASM_IRQ_VECTORS_H
24641+#define _ASM_IRQ_VECTORS_H
24642+
24643+#ifdef CONFIG_X86_32
24644+# define SYSCALL_VECTOR 0x80
24645+#else
24646+# define IA32_SYSCALL_VECTOR 0x80
24647+#endif
24648+
24649+#define RESCHEDULE_VECTOR 0
24650+#define CALL_FUNCTION_VECTOR 1
24651+#define CALL_FUNC_SINGLE_VECTOR 2
24652+#define SPIN_UNLOCK_VECTOR 3
24653+#define NR_IPIS 4
24654+
24655+/*
24656+ * The maximum number of vectors supported by i386 processors
24657+ * is limited to 256. For processors other than i386, NR_VECTORS
24658+ * should be changed accordingly.
24659+ */
24660+#define NR_VECTORS 256
24661+
24662+#define FIRST_VM86_IRQ 3
24663+#define LAST_VM86_IRQ 15
24664+#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
24665+
24666+/*
24667+ * The flat IRQ space is divided into two regions:
24668+ * 1. A one-to-one mapping of real physical IRQs. This space is only used
24669+ * if we have physical device-access privilege. This region is at the
24670+ * start of the IRQ space so that existing device drivers do not need
24671+ * to be modified to translate physical IRQ numbers into our IRQ space.
24672+ * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
24673+ * are bound using the provided bind/unbind functions.
24674+ */
24675+
24676+#define PIRQ_BASE 0
24677+#if !defined(MAX_IO_APICS)
24678+# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
24679+#elif NR_CPUS < MAX_IO_APICS
24680+# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
24681+#else
24682+# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
24683+#endif
24684+
24685+#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
24686+#define NR_DYNIRQS 256
24687+
24688+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
24689+#define NR_IRQ_VECTORS NR_IRQS
24690+
24691+#endif /* _ASM_IRQ_VECTORS_H */
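The new irq_vectors.h carves a flat IRQ space into a physical-IRQ region followed by a dynamic region for Xen event-channel IRQs. A small sketch of that arithmetic, with made-up NR_CPUS and MAX_IO_APICS values standing in for the real configuration:

/*
 * Hedged sketch of the IRQ-space layout defined above.  The CPU and
 * IO-APIC counts are example values, not taken from any real config.
 */
#include <stdio.h>

#define NR_VECTORS      256
#define NR_CPUS         32              /* example value */
#define MAX_IO_APICS    64              /* example value */

/* NR_PIRQS uses the smaller of the two counts when MAX_IO_APICS is defined */
#define NR_PIRQS     (NR_VECTORS + 32 * (NR_CPUS < MAX_IO_APICS ? NR_CPUS : MAX_IO_APICS))
#define PIRQ_BASE    0
#define DYNIRQ_BASE  (PIRQ_BASE + NR_PIRQS)
#define NR_DYNIRQS   256

int main(void)
{
        /* Physical IRQs occupy the low end; dynamically bound IRQs follow. */
        printf("PIRQs:   %d .. %d\n", PIRQ_BASE, DYNIRQ_BASE - 1);
        printf("DYNIRQs: %d .. %d\n", DYNIRQ_BASE, DYNIRQ_BASE + NR_DYNIRQS - 1);
        return 0;
}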
24692Index: head-2008-12-01/include/asm-x86/mach-xen/asm/irqflags.h
24693===================================================================
24694--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/irqflags.h 2008-12-01 11:44:55.000000000 +0100
24695+++ head-2008-12-01/include/asm-x86/mach-xen/asm/irqflags.h 2008-12-01 11:49:07.000000000 +0100
24696@@ -118,7 +118,7 @@ static inline void halt(void)
24697
24698 #ifndef CONFIG_X86_64
24699 #define INTERRUPT_RETURN iret
24700-#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
24701+#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
24702 sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
24703 __TEST_PENDING ; \
24704 jnz 14f /* process more events if necessary... */ ; \
24705@@ -175,18 +175,6 @@ static inline void trace_hardirqs_fixup_
24706 #else
24707
24708 #ifdef CONFIG_X86_64
24709-/*
24710- * Currently paravirt can't handle swapgs nicely when we
24711- * don't have a stack we can rely on (such as a user space
24712- * stack). So we either find a way around these or just fault
24713- * and emulate if a guest tries to call swapgs directly.
24714- *
24715- * Either way, this is a good way to document that we don't
24716- * have a reliable stack. x86_64 only.
24717- */
24718-#define SWAPGS_UNSAFE_STACK swapgs
24719-#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
24720-#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
24721 #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
24722 #define ARCH_LOCKDEP_SYS_EXIT_IRQ \
24723 TRACE_IRQS_ON; \
24724@@ -198,24 +186,6 @@ static inline void trace_hardirqs_fixup_
24725 TRACE_IRQS_OFF;
24726
24727 #else
24728-#define ARCH_TRACE_IRQS_ON \
24729- pushl %eax; \
24730- pushl %ecx; \
24731- pushl %edx; \
24732- call trace_hardirqs_on; \
24733- popl %edx; \
24734- popl %ecx; \
24735- popl %eax;
24736-
24737-#define ARCH_TRACE_IRQS_OFF \
24738- pushl %eax; \
24739- pushl %ecx; \
24740- pushl %edx; \
24741- call trace_hardirqs_off; \
24742- popl %edx; \
24743- popl %ecx; \
24744- popl %eax;
24745-
24746 #define ARCH_LOCKDEP_SYS_EXIT \
24747 pushl %eax; \
24748 pushl %ecx; \
24749@@ -229,8 +199,8 @@ static inline void trace_hardirqs_fixup_
24750 #endif
24751
24752 #ifdef CONFIG_TRACE_IRQFLAGS
24753-# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
24754-# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
24755+# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
24756+# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
24757 #else
24758 # define TRACE_IRQS_ON
24759 # define TRACE_IRQS_OFF
24760Index: head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context.h
24761===================================================================
24762--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/mmu_context.h 2008-12-01 11:36:55.000000000 +0100
24763+++ head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context.h 2008-12-01 11:49:07.000000000 +0100
24764@@ -1,5 +1,42 @@
24765+#ifndef __ASM_X86_MMU_CONTEXT_H
24766+#define __ASM_X86_MMU_CONTEXT_H
24767+
24768+#include <asm/desc.h>
24769+#include <asm/atomic.h>
24770+#include <asm/pgalloc.h>
24771+#include <asm/tlbflush.h>
24772+
24773+void arch_exit_mmap(struct mm_struct *mm);
24774+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24775+
24776+void mm_pin(struct mm_struct *mm);
24777+void mm_unpin(struct mm_struct *mm);
24778+void mm_pin_all(void);
24779+
24780+static inline void xen_activate_mm(struct mm_struct *prev,
24781+ struct mm_struct *next)
24782+{
24783+ if (!PagePinned(virt_to_page(next->pgd)))
24784+ mm_pin(next);
24785+}
24786+
24787+/*
24788+ * Used for LDT copy/destruction.
24789+ */
24790+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24791+void destroy_context(struct mm_struct *mm);
24792+
24793 #ifdef CONFIG_X86_32
24794 # include "mmu_context_32.h"
24795 #else
24796 # include "mmu_context_64.h"
24797 #endif
24798+
24799+#define activate_mm(prev, next) \
24800+do { \
24801+ xen_activate_mm(prev, next); \
24802+ switch_mm((prev), (next), NULL); \
24803+} while (0);
24804+
24805+
24806+#endif /* __ASM_X86_MMU_CONTEXT_H */
24807Index: head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_32.h
24808===================================================================
24809--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2008-12-01 11:44:55.000000000 +0100
24810+++ head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_32.h 2008-12-01 11:49:07.000000000 +0100
24811@@ -1,32 +1,6 @@
24812 #ifndef __I386_SCHED_H
24813 #define __I386_SCHED_H
24814
24815-#include <asm/desc.h>
24816-#include <asm/atomic.h>
24817-#include <asm/pgalloc.h>
24818-#include <asm/tlbflush.h>
24819-
24820-void arch_exit_mmap(struct mm_struct *mm);
24821-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24822-
24823-void mm_pin(struct mm_struct *mm);
24824-void mm_unpin(struct mm_struct *mm);
24825-void mm_pin_all(void);
24826-
24827-static inline void xen_activate_mm(struct mm_struct *prev,
24828- struct mm_struct *next)
24829-{
24830- if (!PagePinned(virt_to_page(next->pgd)))
24831- mm_pin(next);
24832-}
24833-
24834-/*
24835- * Used for LDT copy/destruction.
24836- */
24837-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24838-void destroy_context(struct mm_struct *mm);
24839-
24840-
24841 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24842 {
24843 #if 0 /* XEN: no lazy tlb */
24844@@ -107,10 +81,4 @@ static inline void switch_mm(struct mm_s
24845 #define deactivate_mm(tsk, mm) \
24846 asm("movl %0,%%gs": :"r" (0));
24847
24848-#define activate_mm(prev, next) \
24849-do { \
24850- xen_activate_mm(prev, next); \
24851- switch_mm((prev), (next), NULL); \
24852-} while (0)
24853-
24854 #endif
24855Index: head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_64.h
24856===================================================================
24857--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2008-12-01 11:44:55.000000000 +0100
24858+++ head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_64.h 2008-12-01 11:49:07.000000000 +0100
24859@@ -1,23 +1,6 @@
24860 #ifndef __X86_64_MMU_CONTEXT_H
24861 #define __X86_64_MMU_CONTEXT_H
24862
24863-#include <asm/desc.h>
24864-#include <asm/atomic.h>
24865-#include <asm/pgalloc.h>
24866-#include <asm/page.h>
24867-#include <asm/pda.h>
24868-#include <asm/pgtable.h>
24869-#include <asm/tlbflush.h>
24870-
24871-void arch_exit_mmap(struct mm_struct *mm);
24872-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24873-
24874-/*
24875- * possibly do the LDT unload here?
24876- */
24877-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24878-void destroy_context(struct mm_struct *mm);
24879-
24880 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24881 {
24882 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
24883@@ -58,10 +41,6 @@ static inline void __prepare_arch_switch
24884 }
24885 }
24886
24887-extern void mm_pin(struct mm_struct *mm);
24888-extern void mm_unpin(struct mm_struct *mm);
24889-void mm_pin_all(void);
24890-
24891 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
24892 struct task_struct *tsk)
24893 {
24894@@ -124,11 +103,4 @@ do { \
24895 asm volatile("movl %0,%%fs"::"r"(0)); \
24896 } while (0)
24897
24898-static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
24899-{
24900- if (!PagePinned(virt_to_page(next->pgd)))
24901- mm_pin(next);
24902- switch_mm(prev, next, NULL);
24903-}
24904-
24905 #endif
24906Index: head-2008-12-01/include/asm-x86/mach-xen/asm/page.h
24907===================================================================
24908--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/page.h 2008-12-01 11:44:55.000000000 +0100
24909+++ head-2008-12-01/include/asm-x86/mach-xen/asm/page.h 2008-12-01 11:49:07.000000000 +0100
24910@@ -16,9 +16,9 @@
24911 * below. The preprocessor will warn if the two definitions aren't identical.
24912 */
24913 #define _PAGE_BIT_PRESENT 0
24914-#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
24915-#define _PAGE_BIT_IO 9
24916-#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
24917+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
24918+#define _PAGE_BIT_IO 11
24919+#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
24920
24921 #define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
24922 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
24923@@ -28,8 +28,11 @@
24924 (ie, 32-bit PAE). */
24925 #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
24926
24927-/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
24928-#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
24929+/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
24930+#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
24931+
24932+/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
24933+#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
24934
24935 #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
24936 #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
24937@@ -39,8 +42,7 @@
24938 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
24939 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
24940
24941-/* to align the pointer to the (next) page boundary */
24942-#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
24943+#define HUGE_MAX_HSTATE 2
24944
24945 #ifndef __ASSEMBLY__
24946 #include <linux/types.h>
24947@@ -61,9 +63,17 @@
24948
24949 #ifndef __ASSEMBLY__
24950
24951+typedef struct { pgdval_t pgd; } pgd_t;
24952+typedef struct { pgprotval_t pgprot; } pgprot_t;
24953+
24954 extern int page_is_ram(unsigned long pagenr);
24955 extern int devmem_is_allowed(unsigned long pagenr);
24956+extern void map_devmem(unsigned long pfn, unsigned long size,
24957+ pgprot_t vma_prot);
24958+extern void unmap_devmem(unsigned long pfn, unsigned long size,
24959+ pgprot_t vma_prot);
24960
24961+extern unsigned long max_low_pfn_mapped;
24962 extern unsigned long max_pfn_mapped;
24963
24964 struct page;
24965@@ -84,15 +94,11 @@ static inline void copy_user_page(void *
24966 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
24967 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
24968
24969-typedef struct { pgprotval_t pgprot; } pgprot_t;
24970-
24971 #define pgprot_val(x) ((x).pgprot)
24972 #define __pgprot(x) ((pgprot_t) { (x) } )
24973
24974 #include <asm/maddr.h>
24975
24976-typedef struct { pgdval_t pgd; } pgd_t;
24977-
24978 #define __pgd_ma(x) ((pgd_t) { (x) } )
24979 static inline pgd_t xen_make_pgd(pgdval_t val)
24980 {
24981@@ -196,6 +202,11 @@ static inline pteval_t xen_pte_val(pte_t
24982 return ret;
24983 }
24984
24985+static inline pteval_t xen_pte_flags(pte_t pte)
24986+{
24987+ return __pte_val(pte) & PTE_FLAGS_MASK;
24988+}
24989+
24990 #define pgd_val(x) xen_pgd_val(x)
24991 #define __pgd(x) xen_make_pgd(x)
24992
24993@@ -210,6 +221,7 @@ static inline pteval_t xen_pte_val(pte_t
24994 #endif
24995
24996 #define pte_val(x) xen_pte_val(x)
24997+#define pte_flags(x) xen_pte_flags(x)
24998 #define __pte(x) xen_make_pte(x)
24999
25000 #define __pa(x) __phys_addr((unsigned long)(x))
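The page.h changes above split the old PTE_MASK into PTE_PFN_MASK and PTE_FLAGS_MASK. A hedged sketch of how a pte value decomposes under those masks; the 46-bit physical mask and 4 KiB page size are the usual x86-64 figures, assumed here rather than read from the hunk:

/*
 * Hedged sketch: split a made-up 64-bit PTE into frame number and flags
 * using the PTE_PFN_MASK / PTE_FLAGS_MASK definitions introduced above.
 */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t pteval_t;

#define PAGE_SHIFT              12
#define __PHYSICAL_MASK_SHIFT   46
#define __PHYSICAL_MASK         ((pteval_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
#define PAGE_MASK               (~(pteval_t)((1ULL << PAGE_SHIFT) - 1))

#define PTE_PFN_MASK    (PAGE_MASK & __PHYSICAL_MASK)   /* frame number bits */
#define PTE_FLAGS_MASK  (~PTE_PFN_MASK)                  /* everything else   */

int main(void)
{
        pteval_t pte = 0x80000000123450e7ULL;   /* made-up PTE: NX bit, flags, pfn */

        printf("pfn   = %#llx\n",
               (unsigned long long)((pte & PTE_PFN_MASK) >> PAGE_SHIFT));
        printf("flags = %#llx\n",
               (unsigned long long)(pte & PTE_FLAGS_MASK));
        return 0;
}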
25001Index: head-2008-12-01/include/asm-x86/mach-xen/asm/page_64.h
25002===================================================================
25003--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/page_64.h 2008-12-01 11:44:55.000000000 +0100
25004+++ head-2008-12-01/include/asm-x86/mach-xen/asm/page_64.h 2008-12-01 11:49:07.000000000 +0100
25005@@ -26,6 +26,12 @@
25006 #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
25007 #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
25008
25009+/*
25010+ * Set __PAGE_OFFSET to the most negative possible address +
25011+ * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
25012+ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
25013+ * what Xen requires.
25014+ */
25015 #define __PAGE_OFFSET _AC(0xffff880000000000, UL)
25016
25017 #define __PHYSICAL_START CONFIG_PHYSICAL_START
25018@@ -63,7 +69,8 @@
25019 void clear_page(void *page);
25020 void copy_page(void *to, void *from);
25021
25022-extern unsigned long end_pfn;
25023+/* duplicated to the one in bootmem.h */
25024+extern unsigned long max_pfn;
25025
25026 static inline unsigned long __phys_addr(unsigned long x)
25027 {
25028@@ -91,6 +98,11 @@ typedef union { pteval_t pte; unsigned i
25029 extern unsigned long init_memory_mapping(unsigned long start,
25030 unsigned long end);
25031
25032+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
25033+
25034+extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
25035+extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
25036+
25037 #endif /* !__ASSEMBLY__ */
25038
25039 #ifdef CONFIG_FLATMEM
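The new comment above describes __PAGE_OFFSET as the start of the canonical negative half plus sixteen pgd slots. A quick arithmetic check of that claim, assuming the usual 512 GiB per pgd slot of 4-level x86-64 paging:

/*
 * Hedged arithmetic check only; the 512 GiB PGDIR_SIZE value is assumed,
 * not taken from this hunk.
 */
#include <stdio.h>

int main(void)
{
        unsigned long long canonical_base = 0xffff800000000000ULL;
        unsigned long long pgdir_size     = 1ULL << 39;        /* 512 GiB per pgd slot */

        /* expected result: 0xffff880000000000, matching __PAGE_OFFSET above */
        printf("__PAGE_OFFSET = %#llx\n", canonical_base + 16 * pgdir_size);
        return 0;
}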
25040Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pci.h
25041===================================================================
25042--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pci.h 2008-12-01 11:44:55.000000000 +0100
25043+++ head-2008-12-01/include/asm-x86/mach-xen/asm/pci.h 2008-12-01 11:49:07.000000000 +0100
25044@@ -21,6 +21,8 @@ struct pci_sysdata {
25045 #endif
25046 };
25047
25048+extern int pci_routeirq;
25049+
25050 /* scan a bus after allocating a pci_sysdata for it */
25051 extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
25052 int node);
25053Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pci_32.h
25054===================================================================
25055--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pci_32.h 2008-12-01 11:36:55.000000000 +0100
25056+++ head-2008-12-01/include/asm-x86/mach-xen/asm/pci_32.h 2008-12-01 11:49:07.000000000 +0100
25057@@ -38,12 +38,14 @@ struct pci_dev;
25058 #define PCI_DMA_BUS_IS_PHYS (1)
25059
25060 /* pci_unmap_{page,single} is a nop so... */
25061-#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
25062-#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
25063-#define pci_unmap_addr(PTR, ADDR_NAME) (0)
25064-#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
25065-#define pci_unmap_len(PTR, LEN_NAME) (0)
25066-#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
25067+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
25068+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
25069+#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
25070+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
25071+ do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
25072+#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
25073+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
25074+ do { break; } while (pci_unmap_len(PTR, LEN_NAME))
25075
25076 #endif
25077
25078Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgalloc.h
25079===================================================================
25080--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2008-12-01 11:44:55.000000000 +0100
25081+++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgalloc.h 2008-12-01 11:49:07.000000000 +0100
25082@@ -7,6 +7,9 @@
25083
25084 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
25085
25086+static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
25087+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
25088+
25089 static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
25090 static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
25091 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
25092Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable.h
25093===================================================================
25094--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable.h 2008-12-01 11:44:55.000000000 +0100
25095+++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable.h 2008-12-01 11:49:07.000000000 +0100
25096@@ -13,11 +13,12 @@
25097 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
25098 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
25099 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
25100-#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
25101+#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
25102+#define _PAGE_BIT_UNUSED2 10
25103+#define _PAGE_BIT_IO 11 /* Mapped page is I/O or foreign and
25104 * has no associated page struct. */
25105-#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
25106-#define _PAGE_BIT_UNUSED3 11
25107 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
25108+#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
25109 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
25110
25111 /* If _PAGE_BIT_PRESENT is clear, we use these: */
25112@@ -28,34 +29,31 @@
25113 /* if the user mapped it with PROT_NONE; pte_present gives true */
25114 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
25115
25116-/*
25117- * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
25118- * sign-extended value on 32-bit with all 1's in the upper word,
25119- * which preserves the upper pte values on 64-bit ptes:
25120- */
25121-#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25122-#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
25123-#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
25124-#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
25125-#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
25126-#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
25127-#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
25128-#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
25129-#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
25130-#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25131-#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
25132-#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
25133-#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
25134-#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
25135+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25136+#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
25137+#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
25138+#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
25139+#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
25140+#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
25141+#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
25142+#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
25143+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
25144+#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
25145+#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2)
25146+#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25147+#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
25148+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
25149+#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
25150+#define __HAVE_ARCH_PTE_SPECIAL
25151
25152 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
25153-#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
25154+#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
25155 #else
25156-#define _PAGE_NX 0
25157+#define _PAGE_NX (_AT(pteval_t, 0))
25158 #endif
25159
25160-#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
25161-#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
25162+#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
25163+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
25164
25165 #ifndef __ASSEMBLY__
25166 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
25167@@ -71,8 +69,8 @@ extern unsigned int __kernel_page_user;
25168 _PAGE_DIRTY | __kernel_page_user)
25169
25170 /* Set of bits not changed in pte_modify */
25171-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25172- _PAGE_ACCESSED | _PAGE_DIRTY)
25173+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25174+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
25175
25176 /*
25177 * PAT settings are part of the hypervisor interface, which sets the
25178@@ -102,19 +100,9 @@ extern unsigned int __kernel_page_user;
25179 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
25180 _PAGE_ACCESSED)
25181
25182-#ifdef CONFIG_X86_32
25183-#define _PAGE_KERNEL_EXEC \
25184- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
25185-#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
25186-
25187-#ifndef __ASSEMBLY__
25188-extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
25189-#endif /* __ASSEMBLY__ */
25190-#else
25191 #define __PAGE_KERNEL_EXEC \
25192 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
25193 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
25194-#endif
25195
25196 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
25197 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
25198@@ -125,25 +113,22 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
25199 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
25200 #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
25201 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
25202+#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
25203 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
25204
25205-/*
25206- * We don't support GLOBAL page in xenolinux64
25207- */
25208-#define MAKE_GLOBAL(x) __pgprot((x))
25209-
25210-#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
25211-#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
25212-#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
25213-#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
25214-#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
25215-#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
25216-#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
25217-#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
25218-#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
25219-#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
25220-#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
25221-#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25222+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
25223+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
25224+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
25225+#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
25226+#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
25227+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
25228+#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
25229+#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
25230+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
25231+#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
25232+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
25233+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
25234+#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25235
25236 /* xwr */
25237 #define __P000 PAGE_NONE
25238@@ -182,27 +167,27 @@ extern struct list_head pgd_list;
25239 */
25240 static inline int pte_dirty(pte_t pte)
25241 {
25242- return __pte_val(pte) & _PAGE_DIRTY;
25243+ return pte_flags(pte) & _PAGE_DIRTY;
25244 }
25245
25246 static inline int pte_young(pte_t pte)
25247 {
25248- return __pte_val(pte) & _PAGE_ACCESSED;
25249+ return pte_flags(pte) & _PAGE_ACCESSED;
25250 }
25251
25252 static inline int pte_write(pte_t pte)
25253 {
25254- return __pte_val(pte) & _PAGE_RW;
25255+ return pte_flags(pte) & _PAGE_RW;
25256 }
25257
25258 static inline int pte_file(pte_t pte)
25259 {
25260- return __pte_val(pte) & _PAGE_FILE;
25261+ return pte_flags(pte) & _PAGE_FILE;
25262 }
25263
25264 static inline int pte_huge(pte_t pte)
25265 {
25266- return __pte_val(pte) & _PAGE_PSE;
25267+ return pte_flags(pte) & _PAGE_PSE;
25268 }
25269
25270 static inline int pte_global(pte_t pte)
25271@@ -212,12 +197,12 @@ static inline int pte_global(pte_t pte)
25272
25273 static inline int pte_exec(pte_t pte)
25274 {
25275- return !(__pte_val(pte) & _PAGE_NX);
25276+ return !(pte_flags(pte) & _PAGE_NX);
25277 }
25278
25279 static inline int pte_special(pte_t pte)
25280 {
25281- return 0;
25282+ return pte_flags(pte) & _PAGE_SPECIAL;
25283 }
25284
25285 static inline int pmd_large(pmd_t pte)
25286@@ -228,22 +213,22 @@ static inline int pmd_large(pmd_t pte)
25287
25288 static inline pte_t pte_mkclean(pte_t pte)
25289 {
25290- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
25291+ return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
25292 }
25293
25294 static inline pte_t pte_mkold(pte_t pte)
25295 {
25296- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
25297+ return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
25298 }
25299
25300 static inline pte_t pte_wrprotect(pte_t pte)
25301 {
25302- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
25303+ return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
25304 }
25305
25306 static inline pte_t pte_mkexec(pte_t pte)
25307 {
25308- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
25309+ return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
25310 }
25311
25312 static inline pte_t pte_mkdirty(pte_t pte)
25313@@ -268,7 +253,7 @@ static inline pte_t pte_mkhuge(pte_t pte
25314
25315 static inline pte_t pte_clrhuge(pte_t pte)
25316 {
25317- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
25318+ return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
25319 }
25320
25321 static inline pte_t pte_mkglobal(pte_t pte)
25322@@ -283,35 +268,46 @@ static inline pte_t pte_clrglobal(pte_t
25323
25324 static inline pte_t pte_mkspecial(pte_t pte)
25325 {
25326- return pte;
25327+ return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
25328 }
25329
25330 extern pteval_t __supported_pte_mask;
25331
25332 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
25333 {
25334- return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
25335- pgprot_val(pgprot)) & __supported_pte_mask);
25336+ pgprotval_t prot = pgprot_val(pgprot);
25337+
25338+ if (prot & _PAGE_PRESENT)
25339+ prot &= __supported_pte_mask;
25340+ return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25341 }
25342
25343 static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
25344 {
25345- return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
25346- pgprot_val(pgprot)) & __supported_pte_mask);
25347+ pgprotval_t prot = pgprot_val(pgprot);
25348+
25349+ if (prot & _PAGE_PRESENT)
25350+ prot &= __supported_pte_mask;
25351+ return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25352 }
25353
25354 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
25355 {
25356- return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
25357- pgprot_val(pgprot)) & __supported_pte_mask);
25358+ pgprotval_t prot = pgprot_val(pgprot);
25359+
25360+ if (prot & _PAGE_PRESENT)
25361+ prot &= __supported_pte_mask;
25362+ return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25363 }
25364
25365 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
25366 {
25367- pteval_t val = pte_val(pte);
25368+ pgprotval_t prot = pgprot_val(newprot);
25369+ pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
25370
25371- val &= _PAGE_CHG_MASK;
25372- val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
25373+ if (prot & _PAGE_PRESENT)
25374+ prot &= __supported_pte_mask;
25375+ val |= prot & ~_PAGE_CHG_MASK;
25376
25377 return __pte(val);
25378 }
25379@@ -325,9 +321,11 @@ static inline pgprot_t pgprot_modify(pgp
25380 return __pgprot(preservebits | addbits);
25381 }
25382
25383-#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
25384+#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
25385
25386-#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
25387+#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \
25388+ ? pgprot_val(p) & __supported_pte_mask \
25389+ : pgprot_val(p))
25390
25391 #ifndef __ASSEMBLY__
25392 #define __HAVE_PHYS_MEM_ACCESS_PROT
25393@@ -338,6 +336,17 @@ int phys_mem_access_prot_allowed(struct
25394 unsigned long size, pgprot_t *vma_prot);
25395 #endif
25396
25397+/* Install a pte for a particular vaddr in kernel space. */
25398+void set_pte_vaddr(unsigned long vaddr, pte_t pte);
25399+
25400+#ifndef CONFIG_XEN
25401+extern void native_pagetable_setup_start(pgd_t *base);
25402+extern void native_pagetable_setup_done(pgd_t *base);
25403+#else
25404+static inline void xen_pagetable_setup_start(pgd_t *base) {}
25405+static inline void xen_pagetable_setup_done(pgd_t *base) {}
25406+#endif
25407+
25408 #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
25409 #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
25410
25411@@ -373,6 +382,26 @@ int phys_mem_access_prot_allowed(struct
25412 # include "pgtable_64.h"
25413 #endif
25414
25415+/*
25416+ * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25417+ *
25418+ * this macro returns the index of the entry in the pgd page which would
25419+ * control the given virtual address
25420+ */
25421+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25422+
25423+/*
25424+ * pgd_offset() returns a (pgd_t *)
25425+ * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25426+ */
25427+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25428+/*
25429+ * a shortcut which implies the use of the kernel's pgd, instead
25430+ * of a process's
25431+ */
25432+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25433+
25434+
25435 #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
25436 #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
25437
25438@@ -383,8 +412,15 @@ enum {
25439 PG_LEVEL_4K,
25440 PG_LEVEL_2M,
25441 PG_LEVEL_1G,
25442+ PG_LEVEL_NUM
25443 };
25444
25445+#ifdef CONFIG_PROC_FS
25446+extern void update_page_count(int level, unsigned long pages);
25447+#else
25448+static inline void update_page_count(int level, unsigned long pages) { }
25449+#endif
25450+
25451 /*
25452 * Helper function that returns the kernel pagetable entry controlling
25453 * the virtual address 'address'. NULL means no pagetable entry present.
25454@@ -441,6 +477,8 @@ static inline void xen_pte_clear(struct
25455 * race with other CPU's that might be updating the dirty
25456 * bit at the same time.
25457 */
25458+struct vm_area_struct;
25459+
25460 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
25461 extern int ptep_set_access_flags(struct vm_area_struct *vma,
25462 unsigned long address, pte_t *ptep,
25463@@ -523,9 +561,6 @@ static inline void clone_pgd_range(pgd_t
25464 memcpy(dst, src, count * sizeof(pgd_t));
25465 }
25466
25467-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
25468- xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
25469-
25470 #define arbitrary_virt_to_machine(va) \
25471 ({ \
25472 unsigned int __lvl; \
25473@@ -535,6 +570,44 @@ static inline void clone_pgd_range(pgd_t
25474 | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
25475 })
25476
25477+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
25478+static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
25479+ pte_t *ptep)
25480+{
25481+#if CONFIG_XEN_COMPAT < 0x030300
25482+ if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
25483+ return ptep_get_and_clear(mm, addr, ptep);
25484+#endif
25485+ return *ptep;
25486+}
25487+
25488+#ifdef CONFIG_HIGHPTE
25489+extern void *high_memory;
25490+#endif
25491+
25492+static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
25493+ pte_t *ptep, pte_t pte)
25494+{
25495+ mmu_update_t u;
25496+
25497+#if CONFIG_XEN_COMPAT < 0x030300
25498+ if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
25499+ set_pte_at(mm, addr, ptep, pte);
25500+ return;
25501+ }
25502+#endif
25503+#ifdef CONFIG_HIGHPTE
25504+ if ((void *)ptep > high_memory)
25505+ u.ptr = arbitrary_virt_to_machine(ptep)
25506+ | MMU_PT_UPDATE_PRESERVE_AD;
25507+ else
25508+#endif
25509+ u.ptr = virt_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
25510+ u.val = __pte_val(pte);
25511+ if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
25512+ BUG();
25513+}
25514+
25515 #include <asm-generic/pgtable.h>
25516
25517 #include <xen/features.h>
25518@@ -563,10 +636,6 @@ int touch_pte_range(struct mm_struct *mm
25519 unsigned long address,
25520 unsigned long size);
25521
25522-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
25523- unsigned long addr, unsigned long end, pgprot_t newprot,
25524- int dirty_accountable);
25525-
25526 #endif /* __ASSEMBLY__ */
25527
25528 #endif /* _ASM_X86_PGTABLE_H */
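A recurring theme in the pgtable.h hunk is that __supported_pte_mask is now applied only when _PAGE_PRESENT is set, so not-present encodings (swap entries, PROT_NONE) keep all their bits. A minimal user-space sketch of that behaviour, with illustrative bit positions and a pretend no-NX CPU:

/*
 * Hedged sketch of the reworked pfn_pte() logic: filter the protection
 * bits through __supported_pte_mask only for present mappings.
 */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t pteval_t;

#define PAGE_SHIFT      12
#define _PAGE_PRESENT   ((pteval_t)1 << 0)
#define _PAGE_RW        ((pteval_t)1 << 1)
#define _PAGE_NX        ((pteval_t)1 << 63)

/* pretend the CPU lacks NX support, as __supported_pte_mask would encode */
static const pteval_t __supported_pte_mask = ~_PAGE_NX;

static pteval_t pfn_pte(unsigned long pfn, pteval_t prot)
{
        if (prot & _PAGE_PRESENT)               /* only filter real mappings */
                prot &= __supported_pte_mask;
        return ((pteval_t)pfn << PAGE_SHIFT) | prot;
}

int main(void)
{
        printf("present:     %#llx\n",          /* NX bit gets stripped */
               (unsigned long long)pfn_pte(0x1234, _PAGE_PRESENT | _PAGE_RW | _PAGE_NX));
        printf("not present: %#llx\n",          /* NX bit survives */
               (unsigned long long)pfn_pte(0x1234, _PAGE_NX));
        return 0;
}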
25529Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable-3level.h
25530===================================================================
25531--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-12-01 11:44:55.000000000 +0100
25532+++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-12-01 11:49:07.000000000 +0100
25533@@ -14,11 +14,11 @@
25534 #define pmd_ERROR(e) \
25535 printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
25536 __FILE__, __LINE__, &(e), __pmd_val(e), \
25537- (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25538+ (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25539 #define pgd_ERROR(e) \
25540 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
25541 __FILE__, __LINE__, &(e), __pgd_val(e), \
25542- (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25543+ (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25544
25545 static inline int pud_none(pud_t pud)
25546 {
25547@@ -27,7 +27,7 @@ static inline int pud_none(pud_t pud)
25548 }
25549 static inline int pud_bad(pud_t pud)
25550 {
25551- return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25552+ return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25553 }
25554
25555 static inline int pud_present(pud_t pud)
25556@@ -102,9 +102,9 @@ static inline void pud_clear(pud_t *pudp
25557 xen_tlb_flush();
25558 }
25559
25560-#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
25561+#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
25562
25563-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
25564+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
25565
25566
25567 /* Find an entry in the second-level page table.. */
25568Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_32.h
25569===================================================================
25570--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-12-01 11:44:55.000000000 +0100
25571+++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-12-01 11:49:07.000000000 +0100
25572@@ -89,10 +89,10 @@ extern unsigned long pg0[];
25573 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
25574 can temporarily clear it. */
25575 #define pmd_present(x) (__pmd_val(x))
25576-#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25577+#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25578 #else
25579 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
25580-#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25581+#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25582 #endif
25583
25584
25585@@ -119,26 +119,6 @@ extern unsigned long pg0[];
25586 */
25587 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
25588
25589-/*
25590- * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25591- *
25592- * this macro returns the index of the entry in the pgd page which would
25593- * control the given virtual address
25594- */
25595-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25596-#define pgd_index_k(addr) pgd_index((addr))
25597-
25598-/*
25599- * pgd_offset() returns a (pgd_t *)
25600- * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25601- */
25602-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25603-
25604-/*
25605- * a shortcut which implies the use of the kernel's pgd, instead
25606- * of a process's
25607- */
25608-#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25609
25610 static inline int pud_large(pud_t pud) { return 0; }
25611
25612@@ -165,7 +145,7 @@ static inline int pud_large(pud_t pud) {
25613 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25614
25615 #define pmd_page_vaddr(pmd) \
25616- ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
25617+ ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
25618
25619 #if defined(CONFIG_HIGHPTE)
25620 #define pte_offset_map(dir, address) \
25621Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_64.h
25622===================================================================
25623--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-12-01 11:44:55.000000000 +0100
25624+++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-12-01 11:49:07.000000000 +0100
25625@@ -23,6 +23,8 @@ extern void xen_init_pt(void);
25626 extern pud_t level3_kernel_pgt[512];
25627 extern pud_t level3_ident_pgt[512];
25628 extern pmd_t level2_kernel_pgt[512];
25629+extern pmd_t level2_fixmap_pgt[512];
25630+extern pmd_t level2_ident_pgt[512];
25631 extern pgd_t init_level4_pgt[];
25632
25633 #define swapper_pg_dir init_level4_pgt
25634@@ -79,6 +81,9 @@ extern void paging_init(void);
25635
25636 struct mm_struct;
25637
25638+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
25639+
25640+
25641 #define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
25642
25643 static inline void xen_set_pte(pte_t *ptep, pte_t pte)
25644@@ -150,24 +155,24 @@ static inline void xen_pgd_clear(pgd_t *
25645 #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
25646 #define VMEMMAP_START _AC(0xffffe20000000000, UL)
25647 #define MODULES_VADDR _AC(0xffffffffa0000000, UL)
25648-#define MODULES_END _AC(0xfffffffffff00000, UL)
25649+#define MODULES_END _AC(0xffffffffff000000, UL)
25650 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
25651
25652 #ifndef __ASSEMBLY__
25653
25654 static inline int pgd_bad(pgd_t pgd)
25655 {
25656- return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25657+ return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25658 }
25659
25660 static inline int pud_bad(pud_t pud)
25661 {
25662- return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25663+ return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25664 }
25665
25666 static inline int pmd_bad(pmd_t pmd)
25667 {
25668- return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25669+ return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25670 }
25671
25672 #define pte_none(x) (!(x).pte)
25673@@ -175,7 +180,7 @@ static inline int pmd_bad(pmd_t pmd)
25674
25675 #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
25676
25677-#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
25678+#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
25679 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
25680 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
25681 #define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
25682@@ -200,11 +205,8 @@ static inline int pmd_bad(pmd_t pmd)
25683 * Level 4 access.
25684 */
25685 #define pgd_page_vaddr(pgd) \
25686- ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
25687+ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
25688 #define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
25689-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25690-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25691-#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
25692 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
25693 static inline int pgd_large(pgd_t pgd) { return 0; }
25694 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
25695@@ -226,7 +228,7 @@ static inline int pud_large(pud_t pte)
25696 }
25697
25698 /* PMD - Level 2 access */
25699-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
25700+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
25701 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25702
25703 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
25704Index: head-2008-12-01/include/asm-x86/mach-xen/asm/processor.h
25705===================================================================
25706--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/processor.h 2008-12-01 11:44:55.000000000 +0100
25707+++ head-2008-12-01/include/asm-x86/mach-xen/asm/processor.h 2008-12-01 11:49:07.000000000 +0100
25708@@ -134,7 +134,7 @@ extern __u32 cleared_cpu_caps[NCAPINTS
25709 #ifdef CONFIG_SMP
25710 DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25711 #define cpu_data(cpu) per_cpu(cpu_info, cpu)
25712-#define current_cpu_data cpu_data(smp_processor_id())
25713+#define current_cpu_data __get_cpu_var(cpu_info)
25714 #else
25715 #define cpu_data(cpu) boot_cpu_data
25716 #define current_cpu_data boot_cpu_data
25717@@ -153,7 +153,7 @@ static inline int hlt_works(int cpu)
25718
25719 extern void cpu_detect(struct cpuinfo_x86 *c);
25720
25721-extern void identify_cpu(struct cpuinfo_x86 *);
25722+extern void early_cpu_init(void);
25723 extern void identify_boot_cpu(void);
25724 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25725 extern void print_cpu_info(struct cpuinfo_x86 *);
25726@@ -267,15 +267,11 @@ struct tss_struct {
25727 struct thread_struct *io_bitmap_owner;
25728
25729 /*
25730- * Pad the TSS to be cacheline-aligned (size is 0x100):
25731- */
25732- unsigned long __cacheline_filler[35];
25733- /*
25734 * .. and then another 0x100 bytes for the emergency kernel stack:
25735 */
25736 unsigned long stack[64];
25737
25738-} __attribute__((packed));
25739+} ____cacheline_aligned;
25740
25741 DECLARE_PER_CPU(struct tss_struct, init_tss);
25742
25743@@ -668,11 +664,36 @@ static inline void __sti_mwait(unsigned
25744
25745 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25746
25747-extern int force_mwait;
25748-
25749 extern void select_idle_routine(const struct cpuinfo_x86 *c);
25750
25751 extern unsigned long boot_option_idle_override;
25752+extern unsigned long idle_halt;
25753+extern unsigned long idle_nomwait;
25754+
25755+#ifndef CONFIG_XEN
25756+/*
25757+ * on systems with caches, caches must be flashed as the absolute
25758+ * last instruction before going into a suspended halt. Otherwise,
25759+ * dirty data can linger in the cache and become stale on resume,
25760+ * leading to strange errors.
25761+ *
25762+ * perform a variety of operations to guarantee that the compiler
25763+ * will not reorder instructions. wbinvd itself is serializing
25764+ * so the processor will not reorder.
25765+ *
25766+ * Systems without cache can just go into halt.
25767+ */
25768+static inline void wbinvd_halt(void)
25769+{
25770+ mb();
25771+ /* check for clflush to determine if wbinvd is legal */
25772+ if (cpu_has_clflush)
25773+ asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
25774+ else
25775+ while (1)
25776+ halt();
25777+}
25778+#endif
25779
25780 extern void enable_sep_cpu(void);
25781 extern int sysenter_setup(void);
25782Index: head-2008-12-01/include/asm-x86/mach-xen/asm/segment.h
25783===================================================================
25784--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/segment.h 2008-12-01 11:44:55.000000000 +0100
25785+++ head-2008-12-01/include/asm-x86/mach-xen/asm/segment.h 2008-12-01 11:49:07.000000000 +0100
25786@@ -1,6 +1,15 @@
25787 #ifndef _ASM_X86_SEGMENT_H_
25788 #define _ASM_X86_SEGMENT_H_
25789
25790+/* Constructor for a conventional segment GDT (or LDT) entry */
25791+/* This is a macro so it can be used in initializers */
25792+#define GDT_ENTRY(flags, base, limit) \
25793+ ((((base) & 0xff000000ULL) << (56-24)) | \
25794+ (((flags) & 0x0000f0ffULL) << 40) | \
25795+ (((limit) & 0x000f0000ULL) << (48-16)) | \
25796+ (((base) & 0x00ffffffULL) << 16) | \
25797+ (((limit) & 0x0000ffffULL)))
25798+
25799 /* Simple and small GDT entries for booting only */
25800
25801 #define GDT_ENTRY_BOOT_CS 2
25802@@ -61,18 +70,14 @@
25803 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
25804
25805 #define GDT_ENTRY_DEFAULT_USER_CS 14
25806-#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
25807
25808 #define GDT_ENTRY_DEFAULT_USER_DS 15
25809-#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
25810
25811 #define GDT_ENTRY_KERNEL_BASE 12
25812
25813 #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
25814-#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25815
25816 #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
25817-#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25818
25819 #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
25820 #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
25821@@ -143,10 +148,11 @@
25822 #else
25823 #include <asm/cache.h>
25824
25825-#define __KERNEL_CS 0x10
25826-#define __KERNEL_DS 0x18
25827+#define GDT_ENTRY_KERNEL32_CS 1
25828+#define GDT_ENTRY_KERNEL_CS 2
25829+#define GDT_ENTRY_KERNEL_DS 3
25830
25831-#define __KERNEL32_CS 0x08
25832+#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
25833
25834 /*
25835 * we cannot use the same code segment descriptor for user and kernel
25836@@ -154,10 +160,10 @@
25837 * The segment offset needs to contain a RPL. Grr. -AK
25838 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
25839 */
25840-
25841-#define __USER32_CS 0x23 /* 4*8+3 */
25842-#define __USER_DS 0x2b /* 5*8+3 */
25843-#define __USER_CS 0x33 /* 6*8+3 */
25844+#define GDT_ENTRY_DEFAULT_USER32_CS 4
25845+#define GDT_ENTRY_DEFAULT_USER_DS 5
25846+#define GDT_ENTRY_DEFAULT_USER_CS 6
25847+#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
25848 #define __USER32_DS __USER_DS
25849
25850 #define GDT_ENTRY_TSS 8 /* needs two entries */
25851@@ -179,6 +185,11 @@
25852
25853 #endif
25854
25855+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25856+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25857+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
25858+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
25859+
25860 /* User mode is privilege level 3 */
25861 #define USER_RPL 0x3
25862 /* LDT segment has TI set, GDT has it cleared */
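The segment.h hunk replaces hard-coded 64-bit selector values with GDT_ENTRY_* indices and adds a GDT_ENTRY() constructor. The sketch below reuses that constructor to build a flat ring-0 code descriptor and derive the selector; the 0xc09b flags and 0xfffff limit are illustrative values, not taken from the patch:

/*
 * Hedged sketch: GDT_ENTRY() copied from the hunk above, fed with an
 * illustrative flat 4 GiB ring-0 code segment.
 */
#include <stdio.h>

#define GDT_ENTRY(flags, base, limit)                    \
        ((((base)  & 0xff000000ULL) << (56 - 24)) |      \
         (((flags) & 0x0000f0ffULL) << 40)        |      \
         (((limit) & 0x000f0000ULL) << (48 - 16)) |      \
         (((base)  & 0x00ffffffULL) << 16)        |      \
          ((limit) & 0x0000ffffULL))

#define GDT_ENTRY_KERNEL_CS     2
#define __KERNEL_CS             (GDT_ENTRY_KERNEL_CS * 8)       /* index * 8, RPL 0 */

int main(void)
{
        /* flags 0xc09b: present, DPL 0, code, 32-bit, 4 KiB granularity */
        unsigned long long desc = GDT_ENTRY(0xc09b, 0, 0xfffff);

        printf("descriptor = %#llx\n", desc);           /* 0xcf9b000000ffff */
        printf("__KERNEL_CS selector = %#x\n", __KERNEL_CS);
        return 0;
}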
25863Index: head-2008-12-01/include/asm-x86/mach-xen/asm/smp.h
25864===================================================================
25865--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/smp.h 2008-12-01 11:44:55.000000000 +0100
25866+++ head-2008-12-01/include/asm-x86/mach-xen/asm/smp.h 2008-12-01 11:49:07.000000000 +0100
25867@@ -25,23 +25,16 @@ extern cpumask_t cpu_initialized;
25868 extern void (*mtrr_hook)(void);
25869 extern void zap_low_mappings(void);
25870
25871+extern int __cpuinit get_local_pda(int cpu);
25872+
25873 extern int smp_num_siblings;
25874 extern unsigned int num_processors;
25875 extern cpumask_t cpu_initialized;
25876
25877-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
25878-extern u16 x86_cpu_to_apicid_init[];
25879-extern u16 x86_bios_cpu_apicid_init[];
25880-extern void *x86_cpu_to_apicid_early_ptr;
25881-extern void *x86_bios_cpu_apicid_early_ptr;
25882-#else
25883-#define x86_cpu_to_apicid_early_ptr NULL
25884-#define x86_bios_cpu_apicid_early_ptr NULL
25885-#endif
25886-
25887 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
25888 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
25889 DECLARE_PER_CPU(u16, cpu_llc_id);
25890+
25891 DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
25892 DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
25893
25894@@ -63,9 +56,9 @@ struct smp_ops {
25895
25896 void (*smp_send_stop)(void);
25897 void (*smp_send_reschedule)(int cpu);
25898- int (*smp_call_function_mask)(cpumask_t mask,
25899- void (*func)(void *info), void *info,
25900- int wait);
25901+
25902+ void (*send_call_func_ipi)(cpumask_t mask);
25903+ void (*send_call_func_single_ipi)(int cpu);
25904 };
25905
25906 /* Globals due to paravirt */
25907@@ -106,11 +99,14 @@ static inline void smp_send_reschedule(i
25908 smp_ops.smp_send_reschedule(cpu);
25909 }
25910
25911-static inline int smp_call_function_mask(cpumask_t mask,
25912- void (*func) (void *info), void *info,
25913- int wait)
25914+static inline void arch_send_call_function_single_ipi(int cpu)
25915 {
25916- return smp_ops.smp_call_function_mask(mask, func, info, wait);
25917+ smp_ops.send_call_func_single_ipi(cpu);
25918+}
25919+
25920+static inline void arch_send_call_function_ipi(cpumask_t mask)
25921+{
25922+ smp_ops.send_call_func_ipi(mask);
25923 }
25924
25925 void native_smp_prepare_boot_cpu(void);
25926@@ -122,23 +118,19 @@ int native_cpu_up(unsigned int cpunum);
25927
25928 void xen_smp_send_stop(void);
25929 void xen_smp_send_reschedule(int cpu);
25930-int xen_smp_call_function_mask(cpumask_t mask,
25931- void (*func) (void *info), void *info,
25932- int wait);
25933+void xen_send_call_func_ipi(cpumask_t mask);
25934+void xen_send_call_func_single_ipi(int cpu);
25935
25936 #define smp_send_stop xen_smp_send_stop
25937 #define smp_send_reschedule xen_smp_send_reschedule
25938-#define smp_call_function_mask xen_smp_call_function_mask
25939-
25940-extern void prefill_possible_map(void);
25941+#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
25942+#define arch_send_call_function_ipi xen_send_call_func_ipi
25943
25944 #endif /* CONFIG_XEN */
25945
25946 extern int __cpu_disable(void);
25947 extern void __cpu_die(unsigned int cpu);
25948
25949-extern void prefill_possible_map(void);
25950-
25951 void smp_store_cpu_info(int id);
25952 #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
25953
25954@@ -149,6 +141,14 @@ static inline int num_booting_cpus(void)
25955 }
25956 #endif /* CONFIG_SMP */
25957
25958+#if defined(CONFIG_SMP) && (defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_XEN))
25959+extern void prefill_possible_map(void);
25960+#else
25961+static inline void prefill_possible_map(void)
25962+{
25963+}
25964+#endif
25965+
25966 extern unsigned disabled_cpus __cpuinitdata;
25967
25968 #ifdef CONFIG_X86_32_SMP
25969@@ -216,12 +216,8 @@ static inline int hard_smp_processor_id(
25970 #endif /* CONFIG_X86_LOCAL_APIC */
25971
25972 #ifdef CONFIG_HOTPLUG_CPU
25973-extern void cpu_exit_clear(void);
25974 extern void cpu_uninit(void);
25975 #endif
25976
25977-extern void smp_alloc_memory(void);
25978-extern void lock_ipi_call_lock(void);
25979-extern void unlock_ipi_call_lock(void);
25980 #endif /* __ASSEMBLY__ */
25981 #endif
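[Note on the smp.h hunk above, not part of the patch: it tracks the 2.6.27 move from a per-architecture smp_call_function_mask() smp_ops member to the generic kernel/smp.c helpers, which queue the call data themselves and only ask the architecture to raise the IPIs. A rough sketch of how the new hooks are driven; the call-single-data queueing is elided and the function names here are illustrative, not the real kernel/smp.c code:]

    /* Sketch: the generic code queues (func, info) and then uses the arch hooks. */
    static void sketch_call_one(int cpu, void (*func)(void *), void *info)
    {
            /* ...queue (func, info) on cpu's call_single_queue... */
            arch_send_call_function_single_ipi(cpu);   /* Xen: xen_send_call_func_single_ipi() */
    }

    static void sketch_call_many(cpumask_t mask, void (*func)(void *), void *info)
    {
            /* ...queue (func, info) on the global call_function queue... */
            arch_send_call_function_ipi(mask);         /* Xen: xen_send_call_func_ipi() */
    }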
25982Index: head-2008-12-01/include/asm-x86/mach-xen/asm/spinlock.h
25983===================================================================
25984--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/spinlock.h 2008-12-01 11:44:55.000000000 +0100
25985+++ head-2008-12-01/include/asm-x86/mach-xen/asm/spinlock.h 2008-12-01 11:49:07.000000000 +0100
25986@@ -65,14 +65,14 @@ extern void xen_spin_kick(raw_spinlock_t
25987 */
25988 #if (NR_CPUS < 256)
25989 #define TICKET_SHIFT 8
25990-#define __raw_spin_lock_preamble \
25991+#define __ticket_spin_lock_preamble \
25992 asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
25993 "cmpb %h0, %b0\n\t" \
25994 "sete %1" \
25995 : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
25996 : "0" (0x0100) \
25997 : "memory", "cc")
25998-#define __raw_spin_lock_body \
25999+#define __ticket_spin_lock_body \
26000 asm("1:\t" \
26001 "cmpb %h0, %b0\n\t" \
26002 "je 2f\n\t" \
26003@@ -88,7 +88,7 @@ extern void xen_spin_kick(raw_spinlock_t
26004 : "memory", "cc")
26005
26006
26007-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26008+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26009 {
26010 int tmp, new;
26011
26012@@ -107,7 +107,7 @@ static __always_inline int __raw_spin_tr
26013 return tmp;
26014 }
26015
26016-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26017+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26018 {
26019 unsigned int token;
26020 unsigned char kick;
26021@@ -124,7 +124,7 @@ static __always_inline void __raw_spin_u
26022 }
26023 #else
26024 #define TICKET_SHIFT 16
26025-#define __raw_spin_lock_preamble \
26026+#define __ticket_spin_lock_preamble \
26027 do { \
26028 unsigned int tmp; \
26029 asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
26030@@ -136,7 +136,7 @@ static __always_inline void __raw_spin_u
26031 : "0" (0x00010000) \
26032 : "memory", "cc"); \
26033 } while (0)
26034-#define __raw_spin_lock_body \
26035+#define __ticket_spin_lock_body \
26036 do { \
26037 unsigned int tmp; \
26038 asm("shldl $16, %0, %2\n" \
26039@@ -155,7 +155,7 @@ static __always_inline void __raw_spin_u
26040 : "memory", "cc"); \
26041 } while (0)
26042
26043-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26044+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26045 {
26046 int tmp;
26047 int new;
26048@@ -177,7 +177,7 @@ static __always_inline int __raw_spin_tr
26049 return tmp;
26050 }
26051
26052-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26053+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26054 {
26055 unsigned int token, tmp;
26056 bool kick;
26057@@ -195,49 +195,145 @@ static __always_inline void __raw_spin_u
26058 }
26059 #endif
26060
26061-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26062+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
26063 {
26064 int tmp = ACCESS_ONCE(lock->slock);
26065
26066 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
26067 }
26068
26069-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26070+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
26071 {
26072 int tmp = ACCESS_ONCE(lock->slock);
26073
26074 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
26075 }
26076
26077-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26078+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
26079 {
26080 unsigned int token, count;
26081 bool free;
26082
26083- __raw_spin_lock_preamble;
26084+ __ticket_spin_lock_preamble;
26085 if (unlikely(!free))
26086 token = xen_spin_adjust(lock, token);
26087 do {
26088 count = 1 << 10;
26089- __raw_spin_lock_body;
26090+ __ticket_spin_lock_body;
26091 } while (unlikely(!count) && !xen_spin_wait(lock, token));
26092 }
26093
26094-static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26095- unsigned long flags)
26096+static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
26097+ unsigned long flags)
26098 {
26099 unsigned int token, count;
26100 bool free;
26101
26102- __raw_spin_lock_preamble;
26103+ __ticket_spin_lock_preamble;
26104 if (unlikely(!free))
26105 token = xen_spin_adjust(lock, token);
26106 do {
26107 count = 1 << 10;
26108- __raw_spin_lock_body;
26109+ __ticket_spin_lock_body;
26110 } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
26111 }
26112
26113+#ifdef CONFIG_PARAVIRT
26114+/*
26115+ * Define virtualization-friendly old-style lock byte lock, for use in
26116+ * pv_lock_ops if desired.
26117+ *
26118+ * This differs from the pre-2.6.24 spinlock by always using xchgb
26119+ * rather than decb to take the lock; this allows it to use a
26120+ * zero-initialized lock structure. It also maintains a 1-byte
26121+ * contention counter, so that we can implement
26122+ * __byte_spin_is_contended.
26123+ */
26124+struct __byte_spinlock {
26125+ s8 lock;
26126+ s8 spinners;
26127+};
26128+
26129+static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
26130+{
26131+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26132+ return bl->lock != 0;
26133+}
26134+
26135+static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
26136+{
26137+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26138+ return bl->spinners != 0;
26139+}
26140+
26141+static inline void __byte_spin_lock(raw_spinlock_t *lock)
26142+{
26143+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26144+ s8 val = 1;
26145+
26146+ asm("1: xchgb %1, %0\n"
26147+ " test %1,%1\n"
26148+ " jz 3f\n"
26149+ " " LOCK_PREFIX "incb %2\n"
26150+ "2: rep;nop\n"
26151+ " cmpb $1, %0\n"
26152+ " je 2b\n"
26153+ " " LOCK_PREFIX "decb %2\n"
26154+ " jmp 1b\n"
26155+ "3:"
26156+ : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
26157+}
26158+
26159+static inline int __byte_spin_trylock(raw_spinlock_t *lock)
26160+{
26161+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26162+ u8 old = 1;
26163+
26164+ asm("xchgb %1,%0"
26165+ : "+m" (bl->lock), "+q" (old) : : "memory");
26166+
26167+ return old == 0;
26168+}
26169+
26170+static inline void __byte_spin_unlock(raw_spinlock_t *lock)
26171+{
26172+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26173+ smp_wmb();
26174+ bl->lock = 0;
26175+}
26176+#else /* !CONFIG_PARAVIRT */
26177+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26178+{
26179+ return __ticket_spin_is_locked(lock);
26180+}
26181+
26182+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26183+{
26184+ return __ticket_spin_is_contended(lock);
26185+}
26186+
26187+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26188+{
26189+ __ticket_spin_lock(lock);
26190+}
26191+
26192+static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26193+ unsigned long flags)
26194+{
26195+ __ticket_spin_lock_flags(lock, flags);
26196+}
26197+
26198+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26199+{
26200+ return __ticket_spin_trylock(lock);
26201+}
26202+
26203+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26204+{
26205+ __ticket_spin_unlock(lock);
26206+}
26207+#endif /* CONFIG_PARAVIRT */
26208+
26209 static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
26210 {
26211 while (__raw_spin_is_locked(lock))
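[Note on the spinlock.h hunk above, not part of the patch: the __raw_spin_* to __ticket_spin_* renames, plus the byte lock under CONFIG_PARAVIRT, mirror the ticket/byte-lock split introduced upstream in 2.6.27. The logic that the LOCK xaddw/xaddl preamble implements can be modelled roughly as below; the byte order of owner/next is inferred from the asm, and the real code spins with a bounded count and falls back to xen_spin_wait() instead of busy-waiting:]

    /* Illustrative model only -- not the in-tree implementation. */
    struct ticket_model { unsigned char owner, next; };

    static void ticket_lock_model(struct ticket_model *t)
    {
            /* like the LOCK xaddw: atomically take the next ticket number */
            unsigned char me = __sync_fetch_and_add(&t->next, 1);

            while (t->owner != me)          /* wait until our ticket is served      */
                    ;                       /* (real code blocks in xen_spin_wait)  */
    }

    static void ticket_unlock_model(struct ticket_model *t)
    {
            t->owner++;                     /* hand the lock to the next waiter     */
    }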
26212Index: head-2008-12-01/include/asm-x86/mach-xen/asm/system.h
26213===================================================================
26214--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/system.h 2008-12-01 11:44:55.000000000 +0100
26215+++ head-2008-12-01/include/asm-x86/mach-xen/asm/system.h 2008-12-01 11:49:07.000000000 +0100
26216@@ -137,7 +137,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
26217 #define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
26218 #define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
26219
26220-extern void load_gs_index(unsigned);
26221+extern void xen_load_gs_index(unsigned);
26222
26223 /*
26224 * Load a segment. Fall back on loading the zero
26225@@ -154,14 +154,14 @@ extern void load_gs_index(unsigned);
26226 "jmp 2b\n" \
26227 ".previous\n" \
26228 _ASM_EXTABLE(1b,3b) \
26229- : :"r" (value), "r" (0))
26230+ : :"r" (value), "r" (0) : "memory")
26231
26232
26233 /*
26234 * Save a segment register away
26235 */
26236 #define savesegment(seg, value) \
26237- asm volatile("mov %%" #seg ",%0":"=rm" (value))
26238+ asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
26239
26240 static inline unsigned long get_limit(unsigned long segment)
26241 {
26242@@ -269,6 +269,7 @@ static inline void xen_wbinvd(void)
26243 #ifdef CONFIG_X86_64
26244 #define read_cr8() (xen_read_cr8())
26245 #define write_cr8(x) (xen_write_cr8(x))
26246+#define load_gs_index xen_load_gs_index
26247 #endif
26248
26249 /* Clear the 'TS' bit */
26250@@ -287,13 +288,12 @@ static inline void clflush(volatile void
26251 void disable_hlt(void);
26252 void enable_hlt(void);
26253
26254-extern int es7000_plat;
26255 void cpu_idle_wait(void);
26256
26257 extern unsigned long arch_align_stack(unsigned long sp);
26258 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
26259
26260-void default_idle(void);
26261+void xen_idle(void);
26262
26263 /*
26264 * Force strict CPU ordering.
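[Note on the system.h hunk above, not part of the patch: besides routing load_gs_index() to xen_load_gs_index(), it adds a "memory" clobber to loadsegment()/savesegment() so the compiler cannot reorder memory accesses around the segment switch, and narrows savesegment() to an "=r" output. Typical callers are unaffected, e.g. (sketch):]

    /* Sketch of typical callers (unchanged by this hunk): */
    unsigned short gsindex;
    savesegment(gs, gsindex);       /* read the current %gs selector into gsindex          */
    loadsegment(fs, 0);             /* load %fs, falling back to the zero selector on fault */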
26265Index: head-2008-12-01/include/asm-x86/mach-xen/asm/xor_64.h
26266===================================================================
26267--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/xor_64.h 2008-12-01 11:44:55.000000000 +0100
26268+++ head-2008-12-01/include/asm-x86/mach-xen/asm/xor_64.h 2008-12-01 11:49:07.000000000 +0100
26269@@ -1,3 +1,6 @@
26270+#ifndef ASM_X86__XOR_64_H
26271+#define ASM_X86__XOR_64_H
26272+
26273 /*
26274 * x86-64 changes / gcc fixes from Andi Kleen.
26275 * Copyright 2002 Andi Kleen, SuSE Labs.
26276@@ -330,3 +333,5 @@ do { \
26277 We may also be able to load into the L1 only depending on how the cpu
26278 deals with a load to a line that is being prefetched. */
26279 #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
26280+
26281+#endif /* ASM_X86__XOR_64_H */
26282Index: head-2008-12-01/include/asm-x86/mach-xen/irq_vectors.h
26283===================================================================
26284--- head-2008-12-01.orig/include/asm-x86/mach-xen/irq_vectors.h 2008-12-01 11:37:10.000000000 +0100
26285+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26286@@ -1,126 +0,0 @@
26287-/*
26288- * This file should contain #defines for all of the interrupt vector
26289- * numbers used by this architecture.
26290- *
26291- * In addition, there are some standard defines:
26292- *
26293- * FIRST_EXTERNAL_VECTOR:
26294- * The first free place for external interrupts
26295- *
26296- * SYSCALL_VECTOR:
26297- * The IRQ vector a syscall makes the user to kernel transition
26298- * under.
26299- *
26300- * TIMER_IRQ:
26301- * The IRQ number the timer interrupt comes in at.
26302- *
26303- * NR_IRQS:
26304- * The total number of interrupt vectors (including all the
26305- * architecture specific interrupts) needed.
26306- *
26307- */
26308-#ifndef _ASM_IRQ_VECTORS_H
26309-#define _ASM_IRQ_VECTORS_H
26310-
26311-/*
26312- * IDT vectors usable for external interrupt sources start
26313- * at 0x20:
26314- */
26315-#define FIRST_EXTERNAL_VECTOR 0x20
26316-
26317-#define SYSCALL_VECTOR 0x80
26318-
26319-/*
26320- * Vectors 0x20-0x2f are used for ISA interrupts.
26321- */
26322-
26323-#if 0
26324-/*
26325- * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
26326- *
26327- * some of the following vectors are 'rare', they are merged
26328- * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
26329- * TLB, reschedule and local APIC vectors are performance-critical.
26330- *
26331- * Vectors 0xf0-0xfa are free (reserved for future Linux use).
26332- */
26333-#define SPURIOUS_APIC_VECTOR 0xff
26334-#define ERROR_APIC_VECTOR 0xfe
26335-#define INVALIDATE_TLB_VECTOR 0xfd
26336-#define RESCHEDULE_VECTOR 0xfc
26337-#define CALL_FUNCTION_VECTOR 0xfb
26338-
26339-#define THERMAL_APIC_VECTOR 0xf0
26340-/*
26341- * Local APIC timer IRQ vector is on a different priority level,
26342- * to work around the 'lost local interrupt if more than 2 IRQ
26343- * sources per level' errata.
26344- */
26345-#define LOCAL_TIMER_VECTOR 0xef
26346-#endif
26347-
26348-#define SPURIOUS_APIC_VECTOR 0xff
26349-#define ERROR_APIC_VECTOR 0xfe
26350-
26351-/*
26352- * First APIC vector available to drivers: (vectors 0x30-0xee)
26353- * we start at 0x31 to spread out vectors evenly between priority
26354- * levels. (0x80 is the syscall vector)
26355- */
26356-#define FIRST_DEVICE_VECTOR 0x31
26357-#define FIRST_SYSTEM_VECTOR 0xef
26358-
26359-/*
26360- * 16 8259A IRQ's, 208 potential APIC interrupt sources.
26361- * Right now the APIC is mostly only used for SMP.
26362- * 256 vectors is an architectural limit. (we can have
26363- * more than 256 devices theoretically, but they will
26364- * have to use shared interrupts)
26365- * Since vectors 0x00-0x1f are used/reserved for the CPU,
26366- * the usable vector space is 0x20-0xff (224 vectors)
26367- */
26368-
26369-#define RESCHEDULE_VECTOR 0
26370-#define CALL_FUNCTION_VECTOR 1
26371-#define SPIN_UNLOCK_VECTOR 2
26372-#define NR_IPIS 3
26373-
26374-/*
26375- * The maximum number of vectors supported by i386 processors
26376- * is limited to 256. For processors other than i386, NR_VECTORS
26377- * should be changed accordingly.
26378- */
26379-#define NR_VECTORS 256
26380-
26381-#define FPU_IRQ 13
26382-
26383-#define FIRST_VM86_IRQ 3
26384-#define LAST_VM86_IRQ 15
26385-#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
26386-
26387-/*
26388- * The flat IRQ space is divided into two regions:
26389- * 1. A one-to-one mapping of real physical IRQs. This space is only used
26390- * if we have physical device-access privilege. This region is at the
26391- * start of the IRQ space so that existing device drivers do not need
26392- * to be modified to translate physical IRQ numbers into our IRQ space.
26393- * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
26394- * are bound using the provided bind/unbind functions.
26395- */
26396-
26397-#define PIRQ_BASE 0
26398-#if !defined(MAX_IO_APICS)
26399-# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26400-#elif NR_CPUS < MAX_IO_APICS
26401-# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26402-#else
26403-# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
26404-#endif
26405-
26406-#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
26407-#define NR_DYNIRQS 256
26408-
26409-#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
26410-#define NR_IRQ_VECTORS NR_IRQS
26411-
26412-#endif /* _ASM_IRQ_VECTORS_H */
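[Note on the irq_vectors.h removal above, not part of the patch: for reference, the IRQ-space layout being deleted works out numerically as follows, assuming NR_VECTORS = 256, NR_CPUS = 32 and NR_CPUS < MAX_IO_APICS:]

    /* Worked example of the removed layout (values depend on the config):
     *   PIRQ_BASE   = 0
     *   NR_PIRQS    = NR_VECTORS + 32 * NR_CPUS = 256 + 32*32 = 1280
     *   DYNIRQ_BASE = PIRQ_BASE + NR_PIRQS                    = 1280
     *   NR_DYNIRQS  = 256
     *   NR_IRQS     = NR_PIRQS + NR_DYNIRQS                   = 1536
     * Physical IRQs occupy [0, NR_PIRQS); event-channel "dynamic" IRQs follow.
     */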
26413Index: head-2008-12-01/include/asm-x86/mach-xen/setup_arch_post.h
26414===================================================================
26415--- head-2008-12-01.orig/include/asm-x86/mach-xen/setup_arch_post.h 2008-12-03 15:48:43.000000000 +0100
26416+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26417@@ -1,63 +0,0 @@
26418-/**
26419- * machine_specific_* - Hooks for machine specific setup.
26420- *
26421- * Description:
26422- * This is included late in kernel/setup.c so that it can make
26423- * use of all of the static functions.
26424- **/
26425-
26426-#include <xen/interface/callback.h>
26427-
26428-extern void hypervisor_callback(void);
26429-extern void failsafe_callback(void);
26430-extern void nmi(void);
26431-
26432-static void __init machine_specific_arch_setup(void)
26433-{
26434- int ret;
26435- static struct callback_register __initdata event = {
26436- .type = CALLBACKTYPE_event,
26437- .address = (unsigned long) hypervisor_callback,
26438- };
26439- static struct callback_register __initdata failsafe = {
26440- .type = CALLBACKTYPE_failsafe,
26441- .address = (unsigned long)failsafe_callback,
26442- };
26443- static struct callback_register __initdata syscall = {
26444- .type = CALLBACKTYPE_syscall,
26445- .address = (unsigned long)system_call,
26446- };
26447-#ifdef CONFIG_X86_LOCAL_APIC
26448- static struct callback_register __initdata nmi_cb = {
26449- .type = CALLBACKTYPE_nmi,
26450- .address = (unsigned long)nmi,
26451- };
26452-#endif
26453-
26454- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
26455- if (ret == 0)
26456- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
26457- if (ret == 0)
26458- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
26459-#if CONFIG_XEN_COMPAT <= 0x030002
26460- if (ret == -ENOSYS)
26461- ret = HYPERVISOR_set_callbacks(
26462- event.address,
26463- failsafe.address,
26464- syscall.address);
26465-#endif
26466- BUG_ON(ret);
26467-
26468-#ifdef CONFIG_X86_LOCAL_APIC
26469- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
26470-#if CONFIG_XEN_COMPAT <= 0x030002
26471- if (ret == -ENOSYS) {
26472- static struct xennmi_callback __initdata cb = {
26473- .handler_address = (unsigned long)nmi
26474- };
26475-
26476- HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
26477- }
26478-#endif
26479-#endif
26480-}
26481Index: head-2008-12-01/include/asm-x86/mach-xen/setup_arch_pre.h
26482===================================================================
26483--- head-2008-12-01.orig/include/asm-x86/mach-xen/setup_arch_pre.h 2008-12-03 15:48:43.000000000 +0100
26484+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26485@@ -1,5 +0,0 @@
26486-/* Hook to call BIOS initialisation function */
26487-
26488-#define ARCH_SETUP machine_specific_arch_setup();
26489-
26490-static void __init machine_specific_arch_setup(void);
26491Index: head-2008-12-01/include/asm-x86/traps.h
26492===================================================================
26493--- head-2008-12-01.orig/include/asm-x86/traps.h 2008-12-03 15:48:43.000000000 +0100
26494+++ head-2008-12-01/include/asm-x86/traps.h 2008-12-01 11:49:07.000000000 +0100
26495@@ -23,6 +23,9 @@ asmlinkage void spurious_interrupt_bug(v
26496 #ifdef CONFIG_X86_MCE
26497 asmlinkage void machine_check(void);
26498 #endif /* CONFIG_X86_MCE */
26499+#ifdef CONFIG_X86_XEN
26500+asmlinkage void fixup_4gb_segment(void);
26501+#endif
26502
26503 void do_divide_error(struct pt_regs *, long);
26504 void do_overflow(struct pt_regs *, long);
26505@@ -48,6 +51,9 @@ void math_error(void __user *);
26506 void do_coprocessor_error(struct pt_regs *, long);
26507 void do_simd_coprocessor_error(struct pt_regs *, long);
26508 void do_spurious_interrupt_bug(struct pt_regs *, long);
26509+#ifdef CONFIG_XEN
26510+void do_fixup_4gb_segment(struct pt_regs *, long);
26511+#endif
26512 unsigned long patch_espfix_desc(unsigned long, unsigned long);
26513 asmlinkage void math_emulate(long);
26514
26515Index: head-2008-12-01/include/asm-x86/xen/hypercall.h
26516===================================================================
26517--- head-2008-12-01.orig/include/asm-x86/xen/hypercall.h 2008-12-03 15:48:43.000000000 +0100
26518+++ head-2008-12-01/include/asm-x86/xen/hypercall.h 2008-12-01 11:49:07.000000000 +0100
26519@@ -264,7 +264,7 @@ HYPERVISOR_fpu_taskswitch(int set)
26520 static inline int
26521 HYPERVISOR_sched_op(int cmd, void *arg)
26522 {
26523- return _hypercall2(int, sched_op_new, cmd, arg);
26524+ return _hypercall2(int, sched_op, cmd, arg);
26525 }
26526
26527 static inline long
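[Note on the hypercall.h hunk above, not part of the patch: the change simply drops the transitional sched_op_new hypercall name; callers keep the same two-argument form. A minimal sketch of a caller, with SCHEDOP_yield assumed to come from xen/interface/sched.h and the function name purely illustrative:]

    /* Sketch: voluntarily give the CPU back to the hypervisor. */
    static inline void xen_yield_sketch(void)
    {
            HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
    }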
26528Index: head-2008-12-01/include/asm-x86/xen/interface_64.h
26529===================================================================
26530--- head-2008-12-01.orig/include/asm-x86/xen/interface_64.h 2008-12-03 15:48:43.000000000 +0100
26531+++ head-2008-12-01/include/asm-x86/xen/interface_64.h 2008-12-01 11:49:07.000000000 +0100
26532@@ -136,7 +136,7 @@ struct cpu_user_regs {
26533 uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
26534 uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
26535 };
26536-DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
26537+DEFINE_XEN_GUEST_HANDLE_STRUCT(cpu_user_regs);
26538
26539 #undef __DECL_REG
26540
26541Index: head-2008-12-01/include/linux/page-flags.h
26542===================================================================
26543--- head-2008-12-01.orig/include/linux/page-flags.h 2008-12-01 11:48:52.000000000 +0100
26544+++ head-2008-12-01/include/linux/page-flags.h 2008-12-01 11:49:07.000000000 +0100
26545@@ -109,9 +109,11 @@ enum pageflags {
26546 /* Filesystems */
26547 PG_checked = PG_owner_priv_1,
26548
26549+#ifdef CONFIG_PARAVIRT_XEN
26550 /* XEN */
26551 PG_pinned = PG_owner_priv_1,
26552 PG_savepinned = PG_dirty,
26553+#endif
26554
26555 /* SLOB */
26556 PG_slob_page = PG_active,
26557@@ -185,8 +187,12 @@ PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU,
26558 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
26559 __PAGEFLAG(Slab, slab)
26560 PAGEFLAG(Checked, checked) /* Used by some filesystems */
26561+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
26562 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
26563+#endif
26564+#ifdef CONFIG_PARAVIRT_XEN
26565 PAGEFLAG(SavePinned, savepinned); /* Xen */
26566+#endif
26567 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
26568 PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
26569 __SETPAGEFLAG(Private, private)
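[Note on the page-flags.h hunk above, not part of the patch: guarding the Xen flag declarations matters because PAGEFLAG()/TESTSCFLAG() generate the accessor functions, so with the added #ifdefs helpers such as PagePinned()/SetPagePinned() only exist in Xen-enabled configurations. Roughly, the generated accessors look like this (approximate sketch of the macro expansion):]

    /* Approximate expansion of PAGEFLAG(Pinned, pinned): */
    static inline int PagePinned(struct page *page)
    {
            return test_bit(PG_pinned, &page->flags);
    }
    static inline void SetPagePinned(struct page *page)
    {
            set_bit(PG_pinned, &page->flags);
    }
    static inline void ClearPagePinned(struct page *page)
    {
            clear_bit(PG_pinned, &page->flags);
    }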
26570Index: head-2008-12-01/include/xen/interface/memory.h
26571===================================================================
26572--- head-2008-12-01.orig/include/xen/interface/memory.h 2008-12-01 11:44:55.000000000 +0100
26573+++ head-2008-12-01/include/xen/interface/memory.h 2008-12-01 11:49:07.000000000 +0100
26574@@ -82,6 +82,7 @@ struct xen_memory_reservation {
26575 domid_t domid;
26576
26577 };
26578+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_memory_reservation);
26579 typedef struct xen_memory_reservation xen_memory_reservation_t;
26580 DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
26581
26582@@ -167,11 +168,7 @@ struct xen_machphys_mfn_list {
26583 * any large discontiguities in the machine address space, 2MB gaps in
26584 * the machphys table will be represented by an MFN base of zero.
26585 */
26586-#ifndef CONFIG_PARAVIRT_XEN
26587 XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
26588-#else
26589- ulong extent_start;
26590-#endif
26591
26592 /*
26593 * Number of extents written to the above array. This will be smaller
26594@@ -179,6 +176,7 @@ struct xen_machphys_mfn_list {
26595 */
26596 unsigned int nr_extents;
26597 };
26598+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
26599 typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
26600 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
26601
26602@@ -218,6 +216,7 @@ struct xen_add_to_physmap {
26603 /* GPFN where the source mapping page should appear. */
26604 xen_pfn_t gpfn;
26605 };
26606+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
26607 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
26608 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
26609
26610@@ -250,21 +249,13 @@ struct xen_translate_gpfn_list {
26611 xen_ulong_t nr_gpfns;
26612
26613 /* List of GPFNs to translate. */
26614-#ifndef CONFIG_PARAVIRT_XEN
26615 XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
26616-#else
26617- ulong gpfn_list;
26618-#endif
26619
26620 /*
26621 * Output list to contain MFN translations. May be the same as the input
26622 * list (in which case each input GPFN is overwritten with the output MFN).
26623 */
26624-#ifndef CONFIG_PARAVIRT_XEN
26625 XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
26626-#else
26627- ulong mfn_list;
26628-#endif
26629 };
26630 DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
26631 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
26632Index: head-2008-12-01/kernel/hrtimer.c
26633===================================================================
26634--- head-2008-12-01.orig/kernel/hrtimer.c 2008-12-03 15:48:43.000000000 +0100
26635+++ head-2008-12-01/kernel/hrtimer.c 2008-12-01 11:49:07.000000000 +0100
26636@@ -1084,7 +1084,7 @@ ktime_t hrtimer_get_remaining(const stru
26637 }
26638 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
26639
26640-#ifdef CONFIG_NO_HZ
26641+#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26642 /**
26643 * hrtimer_get_next_event - get the time until next expiry event
26644 *
26645Index: head-2008-12-01/kernel/timer.c
26646===================================================================
26647--- head-2008-12-01.orig/kernel/timer.c 2008-12-03 15:48:43.000000000 +0100
26648+++ head-2008-12-01/kernel/timer.c 2008-12-01 11:49:07.000000000 +0100
26649@@ -815,7 +815,7 @@ static inline void __run_timers(struct t
26650 spin_unlock_irq(&base->lock);
26651 }
26652
26653-#ifdef CONFIG_NO_HZ
26654+#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26655 /*
26656 * Find out when the next timer event is due to happen. This
26657 * is used on S/390 to stop all activity when a cpus is idle.
26658Index: head-2008-12-01/lib/swiotlb-xen.c
26659===================================================================
26660--- head-2008-12-01.orig/lib/swiotlb-xen.c 2008-12-01 11:44:55.000000000 +0100
26661+++ head-2008-12-01/lib/swiotlb-xen.c 2008-12-01 11:49:07.000000000 +0100
26662@@ -750,7 +750,7 @@ swiotlb_sync_sg_for_device(struct device
26663 }
26664
26665 int
26666-swiotlb_dma_mapping_error(dma_addr_t dma_addr)
26667+swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
26668 {
26669 return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
26670 }
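[Note on the swiotlb-xen.c hunk above, not part of the patch: it follows the 2.6.27 DMA API change in which dma_mapping_error() and the underlying mapping_error operations gained a struct device argument. A driver-side check after the change looks like this (sketch; dev, buf and len are placeholders from the surrounding driver code):]

    /* Sketch of a caller inside a driver's mapping path: */
    dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
    if (dma_mapping_error(dev, handle))
            return -EIO;    /* mapping failed, e.g. only the overflow buffer was left */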
26671Index: head-2008-12-01/mm/mprotect.c
26672===================================================================
26673--- head-2008-12-01.orig/mm/mprotect.c 2008-12-01 11:29:05.000000000 +0100
26674+++ head-2008-12-01/mm/mprotect.c 2008-12-01 11:49:07.000000000 +0100
26675@@ -92,8 +92,6 @@ static inline void change_pmd_range(stru
26676 next = pmd_addr_end(addr, end);
26677 if (pmd_none_or_clear_bad(pmd))
26678 continue;
26679- if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
26680- continue;
26681 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
26682 } while (pmd++, addr = next, addr != end);
26683 }