1 From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
2 Subject: [PATCH] Linux: Update to 2.6.27
3 Patch-mainline: 2.6.27
4
5 This patch contains the differences between Linux 2.6.26 and 2.6.27.
6
7 Acked-by: Jeff Mahoney <jeffm@suse.com>
8 Automatically created from "patches.kernel.org/patch-2.6.27" by xen-port-patches.py
9
10 Index: head-2008-12-01/arch/x86/Kconfig
11 ===================================================================
12 --- head-2008-12-01.orig/arch/x86/Kconfig 2008-12-01 11:44:55.000000000 +0100
13 +++ head-2008-12-01/arch/x86/Kconfig 2008-12-01 11:49:07.000000000 +0100
14 @@ -590,7 +590,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
15 config AMD_IOMMU
16 bool "AMD IOMMU support"
17 select SWIOTLB
18 - depends on X86_64 && PCI && ACPI
19 + depends on X86_64 && PCI && ACPI && !X86_64_XEN
20 help
21 With this option you can enable support for AMD IOMMU hardware in
22 your system. An IOMMU is a hardware component which provides
23 @@ -625,8 +625,10 @@ config MAXSMP
24
25 config NR_CPUS
26 int "Maximum number of CPUs (2-512)" if !MAXSMP
27 + range 2 32 if XEN
28 range 2 512
29 depends on SMP
30 + default "32" if MAXSMP && XEN
31 default "4096" if MAXSMP
32 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
33 default "16" if X86_64_XEN
34 @@ -1223,7 +1225,7 @@ config MTRR
35 config MTRR_SANITIZER
36 bool
37 prompt "MTRR cleanup support"
38 - depends on MTRR
39 + depends on MTRR && !XEN
40 help
41 Convert MTRR layout from continuous to discrete, so X drivers can
42 add writeback entries.
43 Index: head-2008-12-01/arch/x86/Kconfig.debug
44 ===================================================================
45 --- head-2008-12-01.orig/arch/x86/Kconfig.debug 2008-12-01 11:37:10.000000000 +0100
46 +++ head-2008-12-01/arch/x86/Kconfig.debug 2008-12-01 11:49:07.000000000 +0100
47 @@ -25,6 +25,7 @@ config STRICT_DEVMEM
48 config X86_VERBOSE_BOOTUP
49 bool "Enable verbose x86 bootup info messages"
50 default y
51 + depends on !XEN
52 help
53 Enables the informational output from the decompression stage
54 (e.g. bzImage) of the boot. If you disable this you will still
55 @@ -166,7 +167,7 @@ config MMIOTRACE_HOOKS
56
57 config MMIOTRACE
58 bool "Memory mapped IO tracing"
59 - depends on DEBUG_KERNEL && PCI
60 + depends on DEBUG_KERNEL && PCI && !XEN
61 select TRACING
62 select MMIOTRACE_HOOKS
63 help
64 Index: head-2008-12-01/arch/x86/Makefile
65 ===================================================================
66 --- head-2008-12-01.orig/arch/x86/Makefile 2008-12-01 11:36:55.000000000 +0100
67 +++ head-2008-12-01/arch/x86/Makefile 2008-12-01 11:49:07.000000000 +0100
68 @@ -116,8 +116,8 @@ mflags-$(CONFIG_X86_VOYAGER) := -Iinclud
69 mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
70
71 # Xen subarch support
72 -mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen
73 -mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/
74 +mflags-$(CONFIG_XEN) := -Iinclude/asm-x86/mach-xen
75 +mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
76
77 # generic subarchitecture
78 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
79 @@ -128,8 +128,10 @@ mcore-$(CONFIG_X86_GENERICARCH) := arch/
80 mflags-y += -Iinclude/asm-x86/mach-default
81
82 # 64 bit does not support subarch support - clear sub arch variables
83 +ifneq ($(CONFIG_XEN),y)
84 fcore-$(CONFIG_X86_64) :=
85 mcore-$(CONFIG_X86_64) :=
86 +endif
87
88 KBUILD_CFLAGS += $(mflags-y)
89 KBUILD_AFLAGS += $(mflags-y)
90 Index: head-2008-12-01/arch/x86/ia32/ia32entry-xen.S
91 ===================================================================
92 --- head-2008-12-01.orig/arch/x86/ia32/ia32entry-xen.S 2008-12-01 11:44:55.000000000 +0100
93 +++ head-2008-12-01/arch/x86/ia32/ia32entry-xen.S 2008-12-01 11:49:07.000000000 +0100
94 @@ -15,6 +15,16 @@
95 #include <asm/irqflags.h>
96 #include <linux/linkage.h>
97
98 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
99 +#include <linux/elf-em.h>
100 +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
101 +#define __AUDIT_ARCH_LE 0x40000000
102 +
103 +#ifndef CONFIG_AUDITSYSCALL
104 +#define sysexit_audit int_ret_from_sys_call
105 +#define sysretl_audit int_ret_from_sys_call
106 +#endif
107 +
108 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
109
110 .macro IA32_ARG_FIXUP noebp=0
111 @@ -37,6 +47,11 @@
112 movq %rax,R8(%rsp)
113 .endm
114
115 + /*
116 + * Reload arg registers from stack in case ptrace changed them.
117 + * We don't reload %eax because syscall_trace_enter() returned
118 + * the value it wants us to use in the table lookup.
119 + */
120 .macro LOAD_ARGS32 offset
121 movl \offset(%rsp),%r11d
122 movl \offset+8(%rsp),%r10d
123 @@ -46,7 +61,6 @@
124 movl \offset+48(%rsp),%edx
125 movl \offset+56(%rsp),%esi
126 movl \offset+64(%rsp),%edi
127 - movl \offset+72(%rsp),%eax
128 .endm
129
130 .macro CFI_STARTPROC32 simple
131 @@ -61,6 +75,19 @@
132 CFI_UNDEFINED r15
133 .endm
134
135 +#ifdef CONFIG_PARAVIRT
136 +ENTRY(native_usergs_sysret32)
137 + swapgs
138 + sysretl
139 +ENDPROC(native_usergs_sysret32)
140 +
141 +ENTRY(native_irq_enable_sysexit)
142 + swapgs
143 + sti
144 + sysexit
145 +ENDPROC(native_irq_enable_sysexit)
146 +#endif
147 +
148 /*
149 * 32bit SYSENTER instruction entry.
150 *
151 @@ -98,7 +125,7 @@ ENTRY(ia32_sysenter_target)
152 CFI_RESTORE rcx
153 movl %ebp,%ebp /* zero extension */
154 movl %eax,%eax
155 - movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
156 + movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
157 movl $__USER32_DS,40(%rsp)
158 movq %rbp,32(%rsp)
159 movl $__USER32_CS,16(%rsp)
160 @@ -113,19 +140,79 @@ ENTRY(ia32_sysenter_target)
161 .quad 1b,ia32_badarg
162 .previous
163 GET_THREAD_INFO(%r10)
164 - orl $TS_COMPAT,threadinfo_status(%r10)
165 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
166 + orl $TS_COMPAT,TI_status(%r10)
167 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
168 jnz sysenter_tracesys
169 -sysenter_do_call:
170 cmpl $(IA32_NR_syscalls-1),%eax
171 ja ia32_badsys
172 +sysenter_do_call:
173 IA32_ARG_FIXUP 1
174 +sysenter_dispatch:
175 call *ia32_sys_call_table(,%rax,8)
176 movq %rax,RAX-ARGOFFSET(%rsp)
177 + GET_THREAD_INFO(%r10)
178 + DISABLE_INTERRUPTS(CLBR_NONE)
179 + TRACE_IRQS_OFF
180 + testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
181 + jnz sysexit_audit
182 + jmp int_ret_from_sys_call
183 +
184 +#ifdef CONFIG_AUDITSYSCALL
185 + .macro auditsys_entry_common
186 + movl %esi,%r9d /* 6th arg: 4th syscall arg */
187 + movl %edx,%r8d /* 5th arg: 3rd syscall arg */
188 + /* (already in %ecx) 4th arg: 2nd syscall arg */
189 + movl %ebx,%edx /* 3rd arg: 1st syscall arg */
190 + movl %eax,%esi /* 2nd arg: syscall number */
191 + movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
192 + call audit_syscall_entry
193 + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
194 + cmpl $(IA32_NR_syscalls-1),%eax
195 + ja ia32_badsys
196 + movl %ebx,%edi /* reload 1st syscall arg */
197 + movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
198 + movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
199 + movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
200 + movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
201 + .endm
202 +
203 + .macro auditsys_exit exit,ebpsave=RBP
204 + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
205 + jnz int_ret_from_sys_call
206 + TRACE_IRQS_ON
207 + ENABLE_INTERRUPTS(CLBR_NONE)
208 + movl %eax,%esi /* second arg, syscall return value */
209 + cmpl $0,%eax /* is it < 0? */
210 + setl %al /* 1 if so, 0 if not */
211 + movzbl %al,%edi /* zero-extend that into %edi */
212 + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
213 + call audit_syscall_exit
214 + GET_THREAD_INFO(%r10)
215 + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
216 + movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
217 + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
218 + DISABLE_INTERRUPTS(CLBR_NONE)
219 + TRACE_IRQS_OFF
220 + testl %edi,TI_flags(%r10)
221 + jnz int_with_check
222 jmp int_ret_from_sys_call
223 + .endm
224 +
225 +sysenter_auditsys:
226 + auditsys_entry_common
227 + movl %ebp,%r9d /* reload 6th syscall arg */
228 + jmp sysenter_dispatch
229 +
230 +sysexit_audit:
231 + auditsys_exit sysexit_from_sys_call
232 +#endif
233
234 sysenter_tracesys:
235 xchgl %r9d,%ebp
236 +#ifdef CONFIG_AUDITSYSCALL
237 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
238 + jz sysenter_auditsys
239 +#endif
240 SAVE_REST
241 CLEAR_RREGS
242 movq %r9,R9(%rsp)
243 @@ -186,18 +273,38 @@ ENTRY(ia32_cstar_target)
244 .quad 1b,ia32_badarg
245 .previous
246 GET_THREAD_INFO(%r10)
247 - orl $TS_COMPAT,threadinfo_status(%r10)
248 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
249 + orl $TS_COMPAT,TI_status(%r10)
250 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
251 jnz cstar_tracesys
252 cstar_do_call:
253 cmpl $IA32_NR_syscalls-1,%eax
254 ja ia32_badsys
255 IA32_ARG_FIXUP 1
256 +cstar_dispatch:
257 call *ia32_sys_call_table(,%rax,8)
258 movq %rax,RAX-ARGOFFSET(%rsp)
259 + GET_THREAD_INFO(%r10)
260 + DISABLE_INTERRUPTS(CLBR_NONE)
261 + testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
262 + jnz sysretl_audit
263 jmp int_ret_from_sys_call
264
265 -cstar_tracesys:
266 +#ifdef CONFIG_AUDITSYSCALL
267 +cstar_auditsys:
268 + movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
269 + auditsys_entry_common
270 + movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
271 + jmp cstar_dispatch
272 +
273 +sysretl_audit:
274 + auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
275 +#endif
276 +
277 +cstar_tracesys:
278 +#ifdef CONFIG_AUDITSYSCALL
279 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
280 + jz cstar_auditsys
281 +#endif
282 xchgl %r9d,%ebp
283 SAVE_REST
284 CLEAR_RREGS
285 @@ -263,8 +370,8 @@ ENTRY(ia32_syscall)
286 this could be a problem. */
287 SAVE_ARGS 0,0,1
288 GET_THREAD_INFO(%r10)
289 - orl $TS_COMPAT,threadinfo_status(%r10)
290 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
291 + orl $TS_COMPAT,TI_status(%r10)
292 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
293 jnz ia32_tracesys
294 ia32_do_syscall:
295 cmpl $(IA32_NR_syscalls-1),%eax
296 @@ -309,13 +416,11 @@ quiet_ni_syscall:
297 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
298 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
299 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
300 - PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
301 PTREGSCALL stub32_execve, sys32_execve, %rcx
302 PTREGSCALL stub32_fork, sys_fork, %rdi
303 PTREGSCALL stub32_clone, sys32_clone, %rdx
304 PTREGSCALL stub32_vfork, sys_vfork, %rdi
305 PTREGSCALL stub32_iopl, sys_iopl, %rsi
306 - PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
307
308 ENTRY(ia32_ptregs_common)
309 popq %r11
310 @@ -415,7 +520,7 @@ ia32_sys_call_table:
311 .quad sys_ssetmask
312 .quad sys_setreuid16 /* 70 */
313 .quad sys_setregid16
314 - .quad stub32_sigsuspend
315 + .quad sys32_sigsuspend
316 .quad compat_sys_sigpending
317 .quad sys_sethostname
318 .quad compat_sys_setrlimit /* 75 */
319 @@ -522,7 +627,7 @@ ia32_sys_call_table:
320 .quad sys32_rt_sigpending
321 .quad compat_sys_rt_sigtimedwait
322 .quad sys32_rt_sigqueueinfo
323 - .quad stub32_rt_sigsuspend
324 + .quad sys_rt_sigsuspend
325 .quad sys32_pread /* 180 */
326 .quad sys32_pwrite
327 .quad sys_chown16
328 @@ -670,4 +775,10 @@ ia32_sys_call_table:
329 .quad sys32_fallocate
330 .quad compat_sys_timerfd_settime /* 325 */
331 .quad compat_sys_timerfd_gettime
332 + .quad compat_sys_signalfd4
333 + .quad sys_eventfd2
334 + .quad sys_epoll_create1
335 + .quad sys_dup3 /* 330 */
336 + .quad sys_pipe2
337 + .quad sys_inotify_init1
338 ia32_syscall_end:
339 Index: head-2008-12-01/arch/x86/kernel/Makefile
340 ===================================================================
341 --- head-2008-12-01.orig/arch/x86/kernel/Makefile 2008-12-01 11:44:55.000000000 +0100
342 +++ head-2008-12-01/arch/x86/kernel/Makefile 2008-12-01 11:49:07.000000000 +0100
343 @@ -120,9 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
344
345 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
346
347 - obj-$(CONFIG_XEN) += nmi_64.o
348 + obj-$(CONFIG_XEN) += nmi.o
349 time_64-$(CONFIG_XEN) += time_32.o
350 endif
351
352 -disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
353 - pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
354 +disabled-obj-$(CONFIG_XEN) := bios_uv.o early-quirks.o hpet.o i8253.o \
355 + i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \
356 + tlb_$(BITS).o tsc.o tsc_sync.o vsmp_64.o
357 Index: head-2008-12-01/arch/x86/kernel/acpi/boot.c
358 ===================================================================
359 --- head-2008-12-01.orig/arch/x86/kernel/acpi/boot.c 2008-12-01 11:44:55.000000000 +0100
360 +++ head-2008-12-01/arch/x86/kernel/acpi/boot.c 2008-12-01 11:49:07.000000000 +0100
361 @@ -951,7 +951,9 @@ void __init mp_register_ioapic(int id, u
362 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
363 mp_ioapics[idx].mp_apicaddr = address;
364
365 +#ifndef CONFIG_XEN
366 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
367 +#endif
368 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
369 #ifdef CONFIG_X86_32
370 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
371 @@ -1108,7 +1110,7 @@ int mp_register_gsi(u32 gsi, int trigger
372 {
373 int ioapic;
374 int ioapic_pin;
375 -#ifdef CONFIG_X86_32
376 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
377 #define MAX_GSI_NUM 4096
378 #define IRQ_COMPRESSION_START 64
379
380 @@ -1156,7 +1158,7 @@ int mp_register_gsi(u32 gsi, int trigger
381 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
382 pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
383 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
384 -#ifdef CONFIG_X86_32
385 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
386 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
387 #else
388 return gsi;
389 @@ -1164,7 +1166,7 @@ int mp_register_gsi(u32 gsi, int trigger
390 }
391
392 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
393 -#ifdef CONFIG_X86_32
394 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
395 /*
396 * For GSI >= 64, use IRQ compression
397 */
398 Index: head-2008-12-01/arch/x86/kernel/acpi/sleep-xen.c
399 ===================================================================
400 --- head-2008-12-01.orig/arch/x86/kernel/acpi/sleep-xen.c 2008-12-01 11:44:55.000000000 +0100
401 +++ head-2008-12-01/arch/x86/kernel/acpi/sleep-xen.c 2008-12-01 11:49:07.000000000 +0100
402 @@ -9,6 +9,7 @@
403 #include <linux/bootmem.h>
404 #include <linux/dmi.h>
405 #include <linux/cpumask.h>
406 +#include <asm/segment.h>
407
408 #include "realmode/wakeup.h"
409 #include "sleep.h"
410 @@ -20,7 +21,7 @@ unsigned long acpi_realmode_flags;
411 /* address in low memory of the wakeup routine. */
412 static unsigned long acpi_realmode;
413
414 -#ifdef CONFIG_64BIT
415 +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
416 static char temp_stack[10240];
417 #endif
418 #endif
419 @@ -54,18 +55,27 @@ int acpi_save_state_mem(void)
420 header->video_mode = saved_video_mode;
421
422 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
423 +
424 + /*
425 + * Set up the wakeup GDT. We set these up as Big Real Mode,
426 + * that is, with limits set to 4 GB. At least the Lenovo
427 + * Thinkpad X61 is known to need this for the video BIOS
428 + * initialization quirk to work; this is likely to also
429 + * be the case for other laptops or integrated video devices.
430 + */
431 +
432 /* GDT[0]: GDT self-pointer */
433 header->wakeup_gdt[0] =
434 (u64)(sizeof(header->wakeup_gdt) - 1) +
435 ((u64)(acpi_wakeup_address +
436 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
437 << 16);
438 - /* GDT[1]: real-mode-like code segment */
439 - header->wakeup_gdt[1] = (0x009bULL << 40) +
440 - ((u64)acpi_wakeup_address << 16) + 0xffff;
441 - /* GDT[2]: real-mode-like data segment */
442 - header->wakeup_gdt[2] = (0x0093ULL << 40) +
443 - ((u64)acpi_wakeup_address << 16) + 0xffff;
444 + /* GDT[1]: big real mode-like code segment */
445 + header->wakeup_gdt[1] =
446 + GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
447 + /* GDT[2]: big real mode-like data segment */
448 + header->wakeup_gdt[2] =
449 + GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
450
451 #ifndef CONFIG_64BIT
452 store_gdt((struct desc_ptr *)&header->pmode_gdt);
453 @@ -79,7 +89,7 @@ int acpi_save_state_mem(void)
454 #endif /* !CONFIG_64BIT */
455
456 header->pmode_cr0 = read_cr0();
457 - header->pmode_cr4 = read_cr4();
458 + header->pmode_cr4 = read_cr4_safe();
459 header->realmode_flags = acpi_realmode_flags;
460 header->real_magic = 0x12345678;
461
462 @@ -89,7 +99,9 @@ int acpi_save_state_mem(void)
463 saved_magic = 0x12345678;
464 #else /* CONFIG_64BIT */
465 header->trampoline_segment = setup_trampoline() >> 4;
466 - init_rsp = (unsigned long)temp_stack + 4096;
467 +#ifdef CONFIG_SMP
468 + stack_start.sp = temp_stack + 4096;
469 +#endif
470 initial_code = (unsigned long)wakeup_long64;
471 saved_magic = 0x123456789abcdef0;
472 #endif /* CONFIG_64BIT */
473 @@ -145,6 +157,12 @@ static int __init acpi_sleep_setup(char
474 acpi_realmode_flags |= 2;
475 if (strncmp(str, "s3_beep", 7) == 0)
476 acpi_realmode_flags |= 4;
477 +#ifdef CONFIG_HIBERNATION
478 + if (strncmp(str, "s4_nohwsig", 10) == 0)
479 + acpi_no_s4_hw_signature();
480 +#endif
481 + if (strncmp(str, "old_ordering", 12) == 0)
482 + acpi_old_suspend_ordering();
483 str = strchr(str, ',');
484 if (str != NULL)
485 str += strspn(str, ", \t");
486 Index: head-2008-12-01/arch/x86/kernel/apic_32-xen.c
487 ===================================================================
488 --- head-2008-12-01.orig/arch/x86/kernel/apic_32-xen.c 2008-12-01 11:37:10.000000000 +0100
489 +++ head-2008-12-01/arch/x86/kernel/apic_32-xen.c 2008-12-01 11:49:07.000000000 +0100
490 @@ -59,7 +59,10 @@ static cpumask_t timer_bcast_ipi;
491 /*
492 * Debug level, exported for io_apic.c
493 */
494 -int apic_verbosity;
495 +unsigned int apic_verbosity;
496 +
497 +/* Have we found an MP table */
498 +int smp_found_config;
499
500 #ifndef CONFIG_XEN
501 static int modern_apic(void)
502 Index: head-2008-12-01/arch/x86/kernel/apic_64-xen.c
503 ===================================================================
504 --- head-2008-12-01.orig/arch/x86/kernel/apic_64-xen.c 2008-12-01 11:37:10.000000000 +0100
505 +++ head-2008-12-01/arch/x86/kernel/apic_64-xen.c 2008-12-01 11:49:07.000000000 +0100
506 @@ -39,7 +39,10 @@ int disable_apic;
507 /*
508 * Debug level, exported for io_apic.c
509 */
510 -int apic_verbosity;
511 +unsigned int apic_verbosity;
512 +
513 +/* Have we found an MP table */
514 +int smp_found_config;
515
516 /*
517 * The guts of the apic timer interrupt
518 Index: head-2008-12-01/arch/x86/kernel/asm-offsets_64.c
519 ===================================================================
520 --- head-2008-12-01.orig/arch/x86/kernel/asm-offsets_64.c 2008-12-03 15:48:43.000000000 +0100
521 +++ head-2008-12-01/arch/x86/kernel/asm-offsets_64.c 2008-12-01 11:49:07.000000000 +0100
522 @@ -138,7 +138,7 @@ int main(void)
523
524 BLANK();
525 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
526 -#ifdef CONFIG_XEN
527 +#ifdef CONFIG_PARAVIRT_XEN
528 BLANK();
529 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
530 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
531 Index: head-2008-12-01/arch/x86/kernel/cpu/amd_64.c
532 ===================================================================
533 --- head-2008-12-01.orig/arch/x86/kernel/cpu/amd_64.c 2008-12-03 15:48:43.000000000 +0100
534 +++ head-2008-12-01/arch/x86/kernel/cpu/amd_64.c 2008-12-01 11:49:07.000000000 +0100
535 @@ -193,6 +193,7 @@ static void __cpuinit init_amd(struct cp
536 fam10h_check_enable_mmcfg();
537 }
538
539 +#ifndef CONFIG_XEN
540 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
541 unsigned long long tseg;
542
543 @@ -211,6 +212,7 @@ static void __cpuinit init_amd(struct cp
544 set_memory_4k((unsigned long)__va(tseg), 1);
545 }
546 }
547 +#endif
548 }
549
550 static struct cpu_dev amd_cpu_dev __cpuinitdata = {
551 Index: head-2008-12-01/arch/x86/kernel/cpu/bugs_64.c
552 ===================================================================
553 --- head-2008-12-01.orig/arch/x86/kernel/cpu/bugs_64.c 2008-12-03 15:48:43.000000000 +0100
554 +++ head-2008-12-01/arch/x86/kernel/cpu/bugs_64.c 2008-12-01 11:49:07.000000000 +0100
555 @@ -20,6 +20,7 @@ void __init check_bugs(void)
556 #endif
557 alternative_instructions();
558
559 +#ifndef CONFIG_XEN
560 /*
561 * Make sure the first 2MB area is not mapped by huge pages
562 * There are typically fixed size MTRRs in there and overlapping
563 @@ -30,4 +31,5 @@ void __init check_bugs(void)
564 */
565 if (!direct_gbpages)
566 set_memory_4k((unsigned long)__va(0), 1);
567 +#endif
568 }
569 Index: head-2008-12-01/arch/x86/kernel/cpu/common-xen.c
570 ===================================================================
571 --- head-2008-12-01.orig/arch/x86/kernel/cpu/common-xen.c 2008-12-01 11:44:55.000000000 +0100
572 +++ head-2008-12-01/arch/x86/kernel/cpu/common-xen.c 2008-12-01 11:49:07.000000000 +0100
573 @@ -13,6 +13,7 @@
574 #include <asm/mtrr.h>
575 #include <asm/mce.h>
576 #include <asm/pat.h>
577 +#include <asm/asm.h>
578 #ifdef CONFIG_X86_LOCAL_APIC
579 #include <asm/mpspec.h>
580 #include <asm/apic.h>
581 @@ -341,11 +342,24 @@ static void __init early_cpu_detect(void
582
583 get_cpu_vendor(c, 1);
584
585 + early_get_cap(c);
586 +
587 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
588 cpu_devs[c->x86_vendor]->c_early_init)
589 cpu_devs[c->x86_vendor]->c_early_init(c);
590 +}
591
592 - early_get_cap(c);
593 +/*
594 + * The NOPL instruction is supposed to exist on all CPUs with
595 + * family >= 6; unfortunately, that's not true in practice because
596 + * of early VIA chips and (more importantly) broken virtualizers that
597 + * are not easy to detect. In the latter case it doesn't even *fail*
598 + * reliably, so probing for it doesn't even work. Disable it completely
599 + * unless we can find a reliable way to detect all the broken cases.
600 + */
601 +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
602 +{
603 + clear_cpu_cap(c, X86_FEATURE_NOPL);
604 }
605
606 static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
607 @@ -402,8 +416,8 @@ static void __cpuinit generic_identify(s
608 }
609
610 init_scattered_cpuid_features(c);
611 + detect_nopl(c);
612 }
613 -
614 }
615
616 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
617 @@ -434,7 +448,7 @@ __setup("serialnumber", x86_serial_nr_se
618 /*
619 * This does the hard work of actually picking apart the CPU stuff...
620 */
621 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
622 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
623 {
624 int i;
625
626 @@ -448,6 +462,8 @@ void __cpuinit identify_cpu(struct cpuin
627 c->x86_max_cores = 1;
628 c->x86_clflush_size = 32;
629 memset(&c->x86_capability, 0, sizeof c->x86_capability);
630 + if (boot_cpu_has(X86_FEATURE_SYSCALL32))
631 + set_cpu_cap(c, X86_FEATURE_SYSCALL32);
632
633 if (!have_cpuid_p()) {
634 /*
635 Index: head-2008-12-01/arch/x86/kernel/cpu/common_64-xen.c
636 ===================================================================
637 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
638 +++ head-2008-12-01/arch/x86/kernel/cpu/common_64-xen.c 2008-12-01 11:49:07.000000000 +0100
639 @@ -0,0 +1,771 @@
640 +#include <linux/init.h>
641 +#include <linux/kernel.h>
642 +#include <linux/sched.h>
643 +#include <linux/string.h>
644 +#include <linux/bootmem.h>
645 +#include <linux/bitops.h>
646 +#include <linux/module.h>
647 +#include <linux/kgdb.h>
648 +#include <linux/topology.h>
649 +#include <linux/delay.h>
650 +#include <linux/smp.h>
651 +#include <linux/percpu.h>
652 +#include <asm/i387.h>
653 +#include <asm/msr.h>
654 +#include <asm/io.h>
655 +#include <asm/linkage.h>
656 +#include <asm/mmu_context.h>
657 +#include <asm/mtrr.h>
658 +#include <asm/mce.h>
659 +#include <asm/pat.h>
660 +#include <asm/asm.h>
661 +#include <asm/numa.h>
662 +#ifdef CONFIG_X86_LOCAL_APIC
663 +#include <asm/mpspec.h>
664 +#include <asm/apic.h>
665 +#include <mach_apic.h>
666 +#elif defined(CONFIG_XEN)
667 +#include <mach_apic.h>
668 +#endif
669 +#include <asm/pda.h>
670 +#include <asm/pgtable.h>
671 +#include <asm/processor.h>
672 +#include <asm/desc.h>
673 +#include <asm/atomic.h>
674 +#include <asm/proto.h>
675 +#include <asm/sections.h>
676 +#include <asm/setup.h>
677 +#include <asm/genapic.h>
678 +
679 +#include "cpu.h"
680 +
681 +/* We need valid kernel segments for data and code in long mode too
682 + * IRET will check the segment types kkeil 2000/10/28
683 + * Also sysret mandates a special GDT layout
684 + */
685 +/* The TLS descriptors are currently at a different place compared to i386.
686 + Hopefully nobody expects them at a fixed place (Wine?) */
687 +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
688 + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
689 + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
690 + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
691 + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
692 + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
693 + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
694 +} };
695 +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
696 +
697 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
698 +
699 +/* Current gdt points %fs at the "master" per-cpu area: after this,
700 + * it's on the real one. */
701 +void switch_to_new_gdt(void)
702 +{
703 +#ifndef CONFIG_XEN
704 + struct desc_ptr gdt_descr;
705 +
706 + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
707 + gdt_descr.size = GDT_SIZE - 1;
708 + load_gdt(&gdt_descr);
709 +#else
710 + void *va, *gdt_addr = get_cpu_gdt_table(smp_processor_id());
711 + unsigned long frames[16];
712 + unsigned int f = 0;
713 +
714 + for (va = gdt_addr; va < gdt_addr + GDT_SIZE; va += PAGE_SIZE) {
715 + frames[f++] = virt_to_mfn(va);
716 + make_page_readonly(va, XENFEAT_writable_descriptor_tables);
717 + }
718 + if (HYPERVISOR_set_gdt(frames, GDT_SIZE / sizeof(struct desc_struct)))
719 + BUG();
720 +#endif
721 +}
722 +
723 +struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
724 +
725 +static void __cpuinit default_init(struct cpuinfo_x86 *c)
726 +{
727 + display_cacheinfo(c);
728 +}
729 +
730 +static struct cpu_dev __cpuinitdata default_cpu = {
731 + .c_init = default_init,
732 + .c_vendor = "Unknown",
733 +};
734 +static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
735 +
736 +int __cpuinit get_model_name(struct cpuinfo_x86 *c)
737 +{
738 + unsigned int *v;
739 +
740 + if (c->extended_cpuid_level < 0x80000004)
741 + return 0;
742 +
743 + v = (unsigned int *) c->x86_model_id;
744 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
745 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
746 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
747 + c->x86_model_id[48] = 0;
748 + return 1;
749 +}
750 +
751 +
752 +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
753 +{
754 + unsigned int n, dummy, ebx, ecx, edx;
755 +
756 + n = c->extended_cpuid_level;
757 +
758 + if (n >= 0x80000005) {
759 + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
760 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
761 + "D cache %dK (%d bytes/line)\n",
762 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
763 + c->x86_cache_size = (ecx>>24) + (edx>>24);
764 + /* On K8 L1 TLB is inclusive, so don't count it */
765 + c->x86_tlbsize = 0;
766 + }
767 +
768 + if (n >= 0x80000006) {
769 + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
770 + ecx = cpuid_ecx(0x80000006);
771 + c->x86_cache_size = ecx >> 16;
772 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
773 +
774 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
775 + c->x86_cache_size, ecx & 0xFF);
776 + }
777 +}
778 +
779 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
780 +{
781 +#ifdef CONFIG_SMP
782 + u32 eax, ebx, ecx, edx;
783 + int index_msb, core_bits;
784 +
785 + cpuid(1, &eax, &ebx, &ecx, &edx);
786 +
787 +
788 + if (!cpu_has(c, X86_FEATURE_HT))
789 + return;
790 + if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
791 + goto out;
792 +
793 + smp_num_siblings = (ebx & 0xff0000) >> 16;
794 +
795 + if (smp_num_siblings == 1) {
796 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
797 + } else if (smp_num_siblings > 1) {
798 +
799 + if (smp_num_siblings > NR_CPUS) {
800 + printk(KERN_WARNING "CPU: Unsupported number of "
801 + "siblings %d", smp_num_siblings);
802 + smp_num_siblings = 1;
803 + return;
804 + }
805 +
806 + index_msb = get_count_order(smp_num_siblings);
807 + c->phys_proc_id = phys_pkg_id(index_msb);
808 +
809 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
810 +
811 + index_msb = get_count_order(smp_num_siblings);
812 +
813 + core_bits = get_count_order(c->x86_max_cores);
814 +
815 + c->cpu_core_id = phys_pkg_id(index_msb) &
816 + ((1 << core_bits) - 1);
817 + }
818 +out:
819 + if ((c->x86_max_cores * smp_num_siblings) > 1) {
820 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
821 + c->phys_proc_id);
822 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
823 + c->cpu_core_id);
824 + }
825 +
826 +#endif
827 +}
828 +
829 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
830 +{
831 + char *v = c->x86_vendor_id;
832 + int i;
833 + static int printed;
834 +
835 + for (i = 0; i < X86_VENDOR_NUM; i++) {
836 + if (cpu_devs[i]) {
837 + if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
838 + (cpu_devs[i]->c_ident[1] &&
839 + !strcmp(v, cpu_devs[i]->c_ident[1]))) {
840 + c->x86_vendor = i;
841 + this_cpu = cpu_devs[i];
842 + return;
843 + }
844 + }
845 + }
846 + if (!printed) {
847 + printed++;
848 + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
849 + printk(KERN_ERR "CPU: Your system may be unstable.\n");
850 + }
851 + c->x86_vendor = X86_VENDOR_UNKNOWN;
852 +}
853 +
854 +static void __init early_cpu_support_print(void)
855 +{
856 + int i,j;
857 + struct cpu_dev *cpu_devx;
858 +
859 + printk("KERNEL supported cpus:\n");
860 + for (i = 0; i < X86_VENDOR_NUM; i++) {
861 + cpu_devx = cpu_devs[i];
862 + if (!cpu_devx)
863 + continue;
864 + for (j = 0; j < 2; j++) {
865 + if (!cpu_devx->c_ident[j])
866 + continue;
867 + printk(" %s %s\n", cpu_devx->c_vendor,
868 + cpu_devx->c_ident[j]);
869 + }
870 + }
871 +}
872 +
873 +/*
874 + * The NOPL instruction is supposed to exist on all CPUs with
875 + * family >= 6, unfortunately, that's not true in practice because
876 + * of early VIA chips and (more importantly) broken virtualizers that
877 + * are not easy to detect. Hence, probe for it based on first
878 + * principles.
879 + *
880 + * Note: no 64-bit chip is known to lack these, but put the code here
881 + * for consistency with 32 bits, and to make it utterly trivial to
882 + * diagnose the problem should it ever surface.
883 + */
884 +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
885 +{
886 + const u32 nopl_signature = 0x888c53b1; /* Random number */
887 + u32 has_nopl = nopl_signature;
888 +
889 + clear_cpu_cap(c, X86_FEATURE_NOPL);
890 + if (c->x86 >= 6) {
891 + asm volatile("\n"
892 + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
893 + "2:\n"
894 + " .section .fixup,\"ax\"\n"
895 + "3: xor %0,%0\n"
896 + " jmp 2b\n"
897 + " .previous\n"
898 + _ASM_EXTABLE(1b,3b)
899 + : "+a" (has_nopl));
900 +
901 + if (has_nopl == nopl_signature)
902 + set_cpu_cap(c, X86_FEATURE_NOPL);
903 + }
904 +}
905 +
906 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
907 +
908 +void __init early_cpu_init(void)
909 +{
910 + struct cpu_vendor_dev *cvdev;
911 +
912 + for (cvdev = __x86cpuvendor_start ;
913 + cvdev < __x86cpuvendor_end ;
914 + cvdev++)
915 + cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
916 + early_cpu_support_print();
917 + early_identify_cpu(&boot_cpu_data);
918 +}
919 +
920 +/* Do some early cpuid on the boot CPU to get some parameter that are
921 + needed before check_bugs. Everything advanced is in identify_cpu
922 + below. */
923 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
924 +{
925 + u32 tfms, xlvl;
926 +
927 + c->loops_per_jiffy = loops_per_jiffy;
928 + c->x86_cache_size = -1;
929 + c->x86_vendor = X86_VENDOR_UNKNOWN;
930 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
931 + c->x86_vendor_id[0] = '\0'; /* Unset */
932 + c->x86_model_id[0] = '\0'; /* Unset */
933 + c->x86_clflush_size = 64;
934 + c->x86_cache_alignment = c->x86_clflush_size;
935 + c->x86_max_cores = 1;
936 + c->x86_coreid_bits = 0;
937 + c->extended_cpuid_level = 0;
938 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
939 +
940 + /* Get vendor name */
941 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
942 + (unsigned int *)&c->x86_vendor_id[0],
943 + (unsigned int *)&c->x86_vendor_id[8],
944 + (unsigned int *)&c->x86_vendor_id[4]);
945 +
946 + get_cpu_vendor(c);
947 +
948 + /* Initialize the standard set of capabilities */
949 + /* Note that the vendor-specific code below might override */
950 +
951 + /* Intel-defined flags: level 0x00000001 */
952 + if (c->cpuid_level >= 0x00000001) {
953 + __u32 misc;
954 + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
955 + &c->x86_capability[0]);
956 + c->x86 = (tfms >> 8) & 0xf;
957 + c->x86_model = (tfms >> 4) & 0xf;
958 + c->x86_mask = tfms & 0xf;
959 + if (c->x86 == 0xf)
960 + c->x86 += (tfms >> 20) & 0xff;
961 + if (c->x86 >= 0x6)
962 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
963 + if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
964 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
965 + } else {
966 + /* Have CPUID level 0 only - unheard of */
967 + c->x86 = 4;
968 + }
969 +
970 + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
971 +#ifdef CONFIG_SMP
972 + c->phys_proc_id = c->initial_apicid;
973 +#endif
974 + /* AMD-defined flags: level 0x80000001 */
975 + xlvl = cpuid_eax(0x80000000);
976 + c->extended_cpuid_level = xlvl;
977 + if ((xlvl & 0xffff0000) == 0x80000000) {
978 + if (xlvl >= 0x80000001) {
979 + c->x86_capability[1] = cpuid_edx(0x80000001);
980 + c->x86_capability[6] = cpuid_ecx(0x80000001);
981 + }
982 + if (xlvl >= 0x80000004)
983 + get_model_name(c); /* Default name */
984 + }
985 +
986 + /* Transmeta-defined flags: level 0x80860001 */
987 + xlvl = cpuid_eax(0x80860000);
988 + if ((xlvl & 0xffff0000) == 0x80860000) {
989 + /* Don't set x86_cpuid_level here for now to not confuse. */
990 + if (xlvl >= 0x80860001)
991 + c->x86_capability[2] = cpuid_edx(0x80860001);
992 + }
993 +
994 + if (c->extended_cpuid_level >= 0x80000007)
995 + c->x86_power = cpuid_edx(0x80000007);
996 +
997 + if (c->extended_cpuid_level >= 0x80000008) {
998 + u32 eax = cpuid_eax(0x80000008);
999 +
1000 + c->x86_virt_bits = (eax >> 8) & 0xff;
1001 + c->x86_phys_bits = eax & 0xff;
1002 + }
1003 +
1004 + detect_nopl(c);
1005 +
1006 + if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
1007 + cpu_devs[c->x86_vendor]->c_early_init)
1008 + cpu_devs[c->x86_vendor]->c_early_init(c);
1009 +
1010 + validate_pat_support(c);
1011 +}
1012 +
1013 +/*
1014 + * This does the hard work of actually picking apart the CPU stuff...
1015 + */
1016 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
1017 +{
1018 + int i;
1019 +
1020 + early_identify_cpu(c);
1021 +
1022 + init_scattered_cpuid_features(c);
1023 +
1024 + c->apicid = phys_pkg_id(0);
1025 +
1026 + /*
1027 + * Vendor-specific initialization. In this section we
1028 + * canonicalize the feature flags, meaning if there are
1029 + * features a certain CPU supports which CPUID doesn't
1030 + * tell us, CPUID claiming incorrect flags, or other bugs,
1031 + * we handle them here.
1032 + *
1033 + * At the end of this section, c->x86_capability better
1034 + * indicate the features this CPU genuinely supports!
1035 + */
1036 + if (this_cpu->c_init)
1037 + this_cpu->c_init(c);
1038 +
1039 + detect_ht(c);
1040 +
1041 + /*
1042 + * On SMP, boot_cpu_data holds the common feature set between
1043 + * all CPUs; so make sure that we indicate which features are
1044 + * common between the CPUs. The first time this routine gets
1045 + * executed, c == &boot_cpu_data.
1046 + */
1047 + if (c != &boot_cpu_data) {
1048 + /* AND the already accumulated flags with these */
1049 + for (i = 0; i < NCAPINTS; i++)
1050 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1051 + }
1052 +
1053 + /* Clear all flags overriden by options */
1054 + for (i = 0; i < NCAPINTS; i++)
1055 + c->x86_capability[i] &= ~cleared_cpu_caps[i];
1056 +
1057 +#ifdef CONFIG_X86_MCE
1058 + mcheck_init(c);
1059 +#endif
1060 + select_idle_routine(c);
1061 +
1062 +#ifdef CONFIG_NUMA
1063 + numa_add_cpu(smp_processor_id());
1064 +#endif
1065 +
1066 +}
1067 +
1068 +void __cpuinit identify_boot_cpu(void)
1069 +{
1070 + identify_cpu(&boot_cpu_data);
1071 +}
1072 +
1073 +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1074 +{
1075 + BUG_ON(c == &boot_cpu_data);
1076 + identify_cpu(c);
1077 + mtrr_ap_init();
1078 +}
1079 +
1080 +static __init int setup_noclflush(char *arg)
1081 +{
1082 + setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1083 + return 1;
1084 +}
1085 +__setup("noclflush", setup_noclflush);
1086 +
1087 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1088 +{
1089 + if (c->x86_model_id[0])
1090 + printk(KERN_CONT "%s", c->x86_model_id);
1091 +
1092 + if (c->x86_mask || c->cpuid_level >= 0)
1093 + printk(KERN_CONT " stepping %02x\n", c->x86_mask);
1094 + else
1095 + printk(KERN_CONT "\n");
1096 +}
1097 +
1098 +static __init int setup_disablecpuid(char *arg)
1099 +{
1100 + int bit;
1101 + if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1102 + setup_clear_cpu_cap(bit);
1103 + else
1104 + return 0;
1105 + return 1;
1106 +}
1107 +__setup("clearcpuid=", setup_disablecpuid);
1108 +
1109 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1110 +
1111 +struct x8664_pda **_cpu_pda __read_mostly;
1112 +EXPORT_SYMBOL(_cpu_pda);
1113 +
1114 +#ifndef CONFIG_X86_NO_IDT
1115 +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
1116 +#endif
1117 +
1118 +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
1119 +
1120 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
1121 +EXPORT_SYMBOL_GPL(__supported_pte_mask);
1122 +
1123 +static int do_not_nx __cpuinitdata;
1124 +
1125 +/* noexec=on|off
1126 +Control non executable mappings for 64bit processes.
1127 +
1128 +on Enable(default)
1129 +off Disable
1130 +*/
1131 +static int __init nonx_setup(char *str)
1132 +{
1133 + if (!str)
1134 + return -EINVAL;
1135 + if (!strncmp(str, "on", 2)) {
1136 + __supported_pte_mask |= _PAGE_NX;
1137 + do_not_nx = 0;
1138 + } else if (!strncmp(str, "off", 3)) {
1139 + do_not_nx = 1;
1140 + __supported_pte_mask &= ~_PAGE_NX;
1141 + }
1142 + return 0;
1143 +}
1144 +early_param("noexec", nonx_setup);
1145 +
1146 +int force_personality32;
1147 +
1148 +/* noexec32=on|off
1149 +Control non executable heap for 32bit processes.
1150 +To control the stack too use noexec=off
1151 +
1152 +on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
1153 +off PROT_READ implies PROT_EXEC
1154 +*/
1155 +static int __init nonx32_setup(char *str)
1156 +{
1157 + if (!strcmp(str, "on"))
1158 + force_personality32 &= ~READ_IMPLIES_EXEC;
1159 + else if (!strcmp(str, "off"))
1160 + force_personality32 |= READ_IMPLIES_EXEC;
1161 + return 1;
1162 +}
1163 +__setup("noexec32=", nonx32_setup);
1164 +
1165 +static void __init_refok switch_pt(int cpu)
1166 +{
1167 +#ifdef CONFIG_XEN
1168 + if (cpu == 0)
1169 + xen_init_pt();
1170 + xen_pt_switch(__pa_symbol(init_level4_pgt));
1171 + xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
1172 +#endif
1173 +}
1174 +
1175 +void pda_init(int cpu)
1176 +{
1177 + struct x8664_pda *pda = cpu_pda(cpu);
1178 +
1179 + /* Setup up data that may be needed in __get_free_pages early */
1180 + loadsegment(fs, 0);
1181 + loadsegment(gs, 0);
1182 +#ifndef CONFIG_XEN
1183 + /* Memory clobbers used to order PDA accessed */
1184 + mb();
1185 + wrmsrl(MSR_GS_BASE, pda);
1186 + mb();
1187 +#else
1188 + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
1189 + (unsigned long)pda))
1190 + BUG();
1191 +#endif
1192 +
1193 + pda->cpunumber = cpu;
1194 + pda->irqcount = -1;
1195 + pda->kernelstack = (unsigned long)stack_thread_info() -
1196 + PDA_STACKOFFSET + THREAD_SIZE;
1197 + pda->active_mm = &init_mm;
1198 + pda->mmu_state = 0;
1199 +
1200 + if (cpu == 0) {
1201 + /* others are initialized in smpboot.c */
1202 + pda->pcurrent = &init_task;
1203 + pda->irqstackptr = boot_cpu_stack;
1204 + pda->irqstackptr += IRQSTACKSIZE - 64;
1205 + } else {
1206 + if (!pda->irqstackptr) {
1207 + pda->irqstackptr = (char *)
1208 + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
1209 + if (!pda->irqstackptr)
1210 + panic("cannot allocate irqstack for cpu %d",
1211 + cpu);
1212 + pda->irqstackptr += IRQSTACKSIZE - 64;
1213 + }
1214 +
1215 + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
1216 + pda->nodenumber = cpu_to_node(cpu);
1217 + }
1218 +
1219 + switch_pt(cpu);
1220 +}
1221 +
1222 +#ifndef CONFIG_X86_NO_TSS
1223 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
1224 + DEBUG_STKSZ] __page_aligned_bss;
1225 +#endif
1226 +
1227 +extern asmlinkage void ignore_sysret(void);
1228 +
1229 +void __cpuinit syscall_init(void)
1230 +{
1231 +#ifndef CONFIG_XEN
1232 + /*
1233 + * LSTAR and STAR live in a bit strange symbiosis.
1234 + * They both write to the same internal register. STAR allows to
1235 + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
1236 + */
1237 + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
1238 + wrmsrl(MSR_LSTAR, system_call);
1239 + wrmsrl(MSR_CSTAR, ignore_sysret);
1240 +
1241 + /* Flags to clear on syscall */
1242 + wrmsrl(MSR_SYSCALL_MASK,
1243 + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1244 +#endif
1245 +#ifdef CONFIG_IA32_EMULATION
1246 + syscall32_cpu_init();
1247 +#else
1248 + static /*const*/ struct callback_register __cpuinitdata cstar = {
1249 + .type = CALLBACKTYPE_syscall32,
1250 + .address = (unsigned long)ignore_sysret
1251 + };
1252 +
1253 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
1254 + printk(KERN_WARN "Unable to register CSTAR callback\n");
1255 +#endif
1256 +}
1257 +
1258 +void __cpuinit check_efer(void)
1259 +{
1260 + unsigned long efer;
1261 +
1262 + rdmsrl(MSR_EFER, efer);
1263 + if (!(efer & EFER_NX) || do_not_nx)
1264 + __supported_pte_mask &= ~_PAGE_NX;
1265 +}
1266 +
1267 +unsigned long kernel_eflags;
1268 +
1269 +#ifndef CONFIG_X86_NO_TSS
1270 +/*
1271 + * Copies of the original ist values from the tss are only accessed during
1272 + * debugging, no special alignment required.
1273 + */
1274 +DEFINE_PER_CPU(struct orig_ist, orig_ist);
1275 +#endif
1276 +
1277 +/*
1278 + * cpu_init() initializes state that is per-CPU. Some data is already
1279 + * initialized (naturally) in the bootstrap process, such as the GDT
1280 + * and IDT. We reload them nevertheless, this function acts as a
1281 + * 'CPU state barrier', nothing should get across.
1282 + * A lot of state is already set up in PDA init.
1283 + */
1284 +void __cpuinit cpu_init(void)
1285 +{
1286 + int cpu = stack_smp_processor_id();
1287 +#ifndef CONFIG_X86_NO_TSS
1288 + struct tss_struct *t = &per_cpu(init_tss, cpu);
1289 + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
1290 + unsigned long v;
1291 + char *estacks = NULL;
1292 + int i;
1293 +#endif
1294 + struct task_struct *me;
1295 +
1296 + /* CPU 0 is initialised in head64.c */
1297 + if (cpu != 0)
1298 + pda_init(cpu);
1299 +#ifndef CONFIG_X86_NO_TSS
1300 + else
1301 + estacks = boot_exception_stacks;
1302 +#endif
1303 +
1304 + me = current;
1305 +
1306 + if (cpu_test_and_set(cpu, cpu_initialized))
1307 + panic("CPU#%d already initialized!\n", cpu);
1308 +
1309 + printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1310 +
1311 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1312 +
1313 + /*
1314 + * Initialize the per-CPU GDT with the boot GDT,
1315 + * and set up the GDT descriptor:
1316 + */
1317 +
1318 + switch_to_new_gdt();
1319 +#ifndef CONFIG_X86_NO_IDT
1320 + load_idt((const struct desc_ptr *)&idt_descr);
1321 +#endif
1322 +
1323 + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
1324 + syscall_init();
1325 +
1326 + wrmsrl(MSR_FS_BASE, 0);
1327 + wrmsrl(MSR_KERNEL_GS_BASE, 0);
1328 + barrier();
1329 +
1330 + check_efer();
1331 +
1332 +#ifndef CONFIG_X86_NO_TSS
1333 + /*
1334 + * set up and load the per-CPU TSS
1335 + */
1336 + if (!orig_ist->ist[0]) {
1337 + static const unsigned int order[N_EXCEPTION_STACKS] = {
1338 + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1339 + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1340 + };
1341 + for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1342 + if (cpu) {
1343 + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1344 + if (!estacks)
1345 + panic("Cannot allocate exception "
1346 + "stack %ld %d\n", v, cpu);
1347 + }
1348 + estacks += PAGE_SIZE << order[v];
1349 + orig_ist->ist[v] = t->x86_tss.ist[v] =
1350 + (unsigned long)estacks;
1351 + }
1352 + }
1353 +
1354 + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1355 + /*
1356 + * <= is required because the CPU will access up to
1357 + * 8 bits beyond the end of the IO permission bitmap.
1358 + */
1359 + for (i = 0; i <= IO_BITMAP_LONGS; i++)
1360 + t->io_bitmap[i] = ~0UL;
1361 +#endif
1362 +
1363 + atomic_inc(&init_mm.mm_count);
1364 + me->active_mm = &init_mm;
1365 + if (me->mm)
1366 + BUG();
1367 + enter_lazy_tlb(&init_mm, me);
1368 +
1369 + load_sp0(t, &current->thread);
1370 +#ifndef CONFIG_X86_NO_TSS
1371 + set_tss_desc(cpu, t);
1372 + load_TR_desc();
1373 +#endif
1374 + load_LDT(&init_mm.context);
1375 +
1376 +#ifdef CONFIG_KGDB
1377 + /*
1378 + * If the kgdb is connected no debug regs should be altered. This
1379 + * is only applicable when KGDB and a KGDB I/O module are built
1380 + * into the kernel and you are using early debugging with
1381 + * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1382 + */
1383 + if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1384 + arch_kgdb_ops.correct_hw_break();
1385 + else {
1386 +#endif
1387 + /*
1388 + * Clear all 6 debug registers:
1389 + */
1390 +
1391 + set_debugreg(0UL, 0);
1392 + set_debugreg(0UL, 1);
1393 + set_debugreg(0UL, 2);
1394 + set_debugreg(0UL, 3);
1395 + set_debugreg(0UL, 6);
1396 + set_debugreg(0UL, 7);
1397 +#ifdef CONFIG_KGDB
1398 + /* If the kgdb is connected no debug regs should be altered. */
1399 + }
1400 +#endif
1401 +
1402 + fpu_init();
1403 +
1404 + asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
1405 + if (raw_irqs_disabled())
1406 + kernel_eflags &= ~X86_EFLAGS_IF;
1407 +
1408 + if (is_uv_system())
1409 + uv_cpu_init();
1410 +}
1411 Index: head-2008-12-01/arch/x86/kernel/e820-xen.c
1412 ===================================================================
1413 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
1414 +++ head-2008-12-01/arch/x86/kernel/e820-xen.c 2008-12-01 11:49:07.000000000 +0100
1415 @@ -0,0 +1,1470 @@
1416 +/*
1417 + * Handle the memory map.
1418 + * The functions here do the job until bootmem takes over.
1419 + *
1420 + * Getting sanitize_e820_map() in sync with i386 version by applying change:
1421 + * - Provisions for empty E820 memory regions (reported by certain BIOSes).
1422 + * Alex Achenbach <xela@slit.de>, December 2002.
1423 + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
1424 + *
1425 + */
1426 +#include <linux/kernel.h>
1427 +#include <linux/types.h>
1428 +#include <linux/init.h>
1429 +#include <linux/bootmem.h>
1430 +#include <linux/ioport.h>
1431 +#include <linux/string.h>
1432 +#include <linux/kexec.h>
1433 +#include <linux/module.h>
1434 +#include <linux/mm.h>
1435 +#include <linux/pfn.h>
1436 +#include <linux/suspend.h>
1437 +#include <linux/firmware-map.h>
1438 +
1439 +#include <asm/pgtable.h>
1440 +#include <asm/page.h>
1441 +#include <asm/e820.h>
1442 +#include <asm/proto.h>
1443 +#include <asm/setup.h>
1444 +#include <xen/interface/memory.h>
1445 +
1446 +/*
1447 + * The e820 map is the map that gets modified e.g. with command line parameters
1448 + * and that is also registered with modifications in the kernel resource tree
1449 + * with the iomem_resource as parent.
1450 + *
1451 + * The e820_saved is directly saved after the BIOS-provided memory map is
1452 + * copied. It doesn't get modified afterwards. It's registered for the
1453 + * /sys/firmware/memmap interface.
1454 + *
1455 + * That memory map is not modified and is used as base for kexec. The kexec'd
1456 + * kernel should get the same memory map as the firmware provides. Then the
1457 + * user can e.g. boot the original kernel with mem=1G while still booting the
1458 + * next kernel with full memory.
1459 + */
1460 +struct e820map e820;
1461 +struct e820map e820_saved;
1462 +#ifdef CONFIG_XEN
1463 +static struct e820map machine_e820;
1464 +#endif
1465 +
1466 +/* For PCI or other memory-mapped resources */
1467 +unsigned long pci_mem_start = 0xaeedbabe;
1468 +#ifdef CONFIG_PCI
1469 +EXPORT_SYMBOL(pci_mem_start);
1470 +#endif
1471 +
1472 +/*
1473 + * This function checks if any part of the range <start,end> is mapped
1474 + * with type.
1475 + */
1476 +int
1477 +e820_any_mapped(u64 start, u64 end, unsigned type)
1478 +{
1479 + int i;
1480 +
1481 +#ifndef CONFIG_XEN
1482 + for (i = 0; i < e820.nr_map; i++) {
1483 + struct e820entry *ei = &e820.map[i];
1484 +#else
1485 + if (!is_initial_xendomain())
1486 + return 0;
1487 + for (i = 0; i < machine_e820.nr_map; ++i) {
1488 + const struct e820entry *ei = &machine_e820.map[i];
1489 +#endif
1490 +
1491 + if (type && ei->type != type)
1492 + continue;
1493 + if (ei->addr >= end || ei->addr + ei->size <= start)
1494 + continue;
1495 + return 1;
1496 + }
1497 + return 0;
1498 +}
1499 +EXPORT_SYMBOL_GPL(e820_any_mapped);
1500 +
1501 +/*
1502 + * This function checks if the entire range <start,end> is mapped with type.
1503 + *
1504 + * Note: this function only works correct if the e820 table is sorted and
1505 + * not-overlapping, which is the case
1506 + */
1507 +int __init e820_all_mapped(u64 start, u64 end, unsigned type)
1508 +{
1509 + int i;
1510 +
1511 +#ifndef CONFIG_XEN
1512 + for (i = 0; i < e820.nr_map; i++) {
1513 + struct e820entry *ei = &e820.map[i];
1514 +#else
1515 + if (!is_initial_xendomain())
1516 + return 0;
1517 + for (i = 0; i < machine_e820.nr_map; ++i) {
1518 + const struct e820entry *ei = &machine_e820.map[i];
1519 +#endif
1520 +
1521 + if (type && ei->type != type)
1522 + continue;
1523 + /* is the region (part) in overlap with the current region ?*/
1524 + if (ei->addr >= end || ei->addr + ei->size <= start)
1525 + continue;
1526 +
1527 + /* if the region is at the beginning of <start,end> we move
1528 + * start to the end of the region since it's ok until there
1529 + */
1530 + if (ei->addr <= start)
1531 + start = ei->addr + ei->size;
1532 + /*
1533 + * if start is now at or beyond end, we're done, full
1534 + * coverage
1535 + */
1536 + if (start >= end)
1537 + return 1;
1538 + }
1539 + return 0;
1540 +}
1541 +
1542 +/*
1543 + * Add a memory region to the kernel e820 map.
1544 + */
1545 +void __init e820_add_region(u64 start, u64 size, int type)
1546 +{
1547 + int x = e820.nr_map;
1548 +
1549 + if (x == ARRAY_SIZE(e820.map)) {
1550 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1551 + return;
1552 + }
1553 +
1554 + e820.map[x].addr = start;
1555 + e820.map[x].size = size;
1556 + e820.map[x].type = type;
1557 + e820.nr_map++;
1558 +}
1559 +
1560 +void __init e820_print_map(char *who)
1561 +{
1562 + int i;
1563 +
1564 + for (i = 0; i < e820.nr_map; i++) {
1565 + printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1566 + (unsigned long long) e820.map[i].addr,
1567 + (unsigned long long)
1568 + (e820.map[i].addr + e820.map[i].size));
1569 + switch (e820.map[i].type) {
1570 + case E820_RAM:
1571 + case E820_RESERVED_KERN:
1572 + printk(KERN_CONT "(usable)\n");
1573 + break;
1574 + case E820_RESERVED:
1575 + printk(KERN_CONT "(reserved)\n");
1576 + break;
1577 + case E820_ACPI:
1578 + printk(KERN_CONT "(ACPI data)\n");
1579 + break;
1580 + case E820_NVS:
1581 + printk(KERN_CONT "(ACPI NVS)\n");
1582 + break;
1583 + default:
1584 + printk(KERN_CONT "type %u\n", e820.map[i].type);
1585 + break;
1586 + }
1587 + }
1588 +}
1589 +
1590 +/*
1591 + * Sanitize the BIOS e820 map.
1592 + *
1593 + * Some e820 responses include overlapping entries. The following
1594 + * replaces the original e820 map with a new one, removing overlaps,
1595 + * and resolving conflicting memory types in favor of highest
1596 + * numbered type.
1597 + *
1598 + * The input parameter biosmap points to an array of 'struct
1599 + * e820entry' which on entry has elements in the range [0, *pnr_map)
1600 + * valid, and which has space for up to max_nr_map entries.
1601 + * On return, the resulting sanitized e820 map entries will be in
1602 + * overwritten in the same location, starting at biosmap.
1603 + *
1604 + * The integer pointed to by pnr_map must be valid on entry (the
1605 + * current number of valid entries located at biosmap) and will
1606 + * be updated on return, with the new number of valid entries
1607 + * (something no more than max_nr_map.)
1608 + *
1609 + * The return value from sanitize_e820_map() is zero if it
1610 + * successfully 'sanitized' the map entries passed in, and is -1
1611 + * if it did nothing, which can happen if either of (1) it was
1612 + * only passed one map entry, or (2) any of the input map entries
1613 + * were invalid (start + size < start, meaning that the size was
1614 + * so big the described memory range wrapped around through zero.)
1615 + *
1616 + * Visually we're performing the following
1617 + * (1,2,3,4 = memory types)...
1618 + *
1619 + * Sample memory map (w/overlaps):
1620 + * ____22__________________
1621 + * ______________________4_
1622 + * ____1111________________
1623 + * _44_____________________
1624 + * 11111111________________
1625 + * ____________________33__
1626 + * ___________44___________
1627 + * __________33333_________
1628 + * ______________22________
1629 + * ___________________2222_
1630 + * _________111111111______
1631 + * _____________________11_
1632 + * _________________4______
1633 + *
1634 + * Sanitized equivalent (no overlap):
1635 + * 1_______________________
1636 + * _44_____________________
1637 + * ___1____________________
1638 + * ____22__________________
1639 + * ______11________________
1640 + * _________1______________
1641 + * __________3_____________
1642 + * ___________44___________
1643 + * _____________33_________
1644 + * _______________2________
1645 + * ________________1_______
1646 + * _________________4______
1647 + * ___________________2____
1648 + * ____________________33__
1649 + * ______________________4_
1650 + */
1651 +
1652 +int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
1653 + int *pnr_map)
1654 +{
1655 + struct change_member {
1656 + struct e820entry *pbios; /* pointer to original bios entry */
1657 + unsigned long long addr; /* address for this change point */
1658 + };
1659 + static struct change_member change_point_list[2*E820_X_MAX] __initdata;
1660 + static struct change_member *change_point[2*E820_X_MAX] __initdata;
1661 + static struct e820entry *overlap_list[E820_X_MAX] __initdata;
1662 + static struct e820entry new_bios[E820_X_MAX] __initdata;
1663 + struct change_member *change_tmp;
1664 + unsigned long current_type, last_type;
1665 + unsigned long long last_addr;
1666 + int chgidx, still_changing;
1667 + int overlap_entries;
1668 + int new_bios_entry;
1669 + int old_nr, new_nr, chg_nr;
1670 + int i;
1671 +
1672 + /* if there's only one memory region, don't bother */
1673 +#ifdef CONFIG_XEN
1674 + if (*pnr_map == 1)
1675 + return 0;
1676 +#endif
1677 + if (*pnr_map < 2)
1678 + return -1;
1679 +
1680 + old_nr = *pnr_map;
1681 + BUG_ON(old_nr > max_nr_map);
1682 +
1683 + /* bail out if we find any unreasonable addresses in bios map */
1684 + for (i = 0; i < old_nr; i++)
1685 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
1686 + return -1;
1687 +
1688 + /* create pointers for initial change-point information (for sorting) */
1689 + for (i = 0; i < 2 * old_nr; i++)
1690 + change_point[i] = &change_point_list[i];
1691 +
1692 + /* record all known change-points (starting and ending addresses),
1693 + omitting those that are for empty memory regions */
1694 + chgidx = 0;
1695 + for (i = 0; i < old_nr; i++) {
1696 + if (biosmap[i].size != 0) {
1697 + change_point[chgidx]->addr = biosmap[i].addr;
1698 + change_point[chgidx++]->pbios = &biosmap[i];
1699 + change_point[chgidx]->addr = biosmap[i].addr +
1700 + biosmap[i].size;
1701 + change_point[chgidx++]->pbios = &biosmap[i];
1702 + }
1703 + }
1704 + chg_nr = chgidx;
1705 +
1706 + /* sort change-point list by memory addresses (low -> high) */
1707 + still_changing = 1;
1708 + while (still_changing) {
1709 + still_changing = 0;
1710 + for (i = 1; i < chg_nr; i++) {
1711 + unsigned long long curaddr, lastaddr;
1712 + unsigned long long curpbaddr, lastpbaddr;
1713 +
1714 + curaddr = change_point[i]->addr;
1715 + lastaddr = change_point[i - 1]->addr;
1716 + curpbaddr = change_point[i]->pbios->addr;
1717 + lastpbaddr = change_point[i - 1]->pbios->addr;
1718 +
1719 + /*
1720 + * swap entries, when:
1721 + *
1722 + * curaddr > lastaddr or
1723 + * curaddr == lastaddr and curaddr == curpbaddr and
1724 + * lastaddr != lastpbaddr
1725 + */
1726 + if (curaddr < lastaddr ||
1727 + (curaddr == lastaddr && curaddr == curpbaddr &&
1728 + lastaddr != lastpbaddr)) {
1729 + change_tmp = change_point[i];
1730 + change_point[i] = change_point[i-1];
1731 + change_point[i-1] = change_tmp;
1732 + still_changing = 1;
1733 + }
1734 + }
1735 + }
1736 +
1737 + /* create a new bios memory map, removing overlaps */
1738 + overlap_entries = 0; /* number of entries in the overlap table */
1739 + new_bios_entry = 0; /* index for creating new bios map entries */
1740 + last_type = 0; /* start with undefined memory type */
1741 + last_addr = 0; /* start with 0 as last starting address */
1742 +
1743 + /* loop through change-points, determining effect on the new bios map */
1744 + for (chgidx = 0; chgidx < chg_nr; chgidx++) {
1745 + /* keep track of all overlapping bios entries */
1746 + if (change_point[chgidx]->addr ==
1747 + change_point[chgidx]->pbios->addr) {
1748 + /*
1749 + * add map entry to overlap list (> 1 entry
1750 + * implies an overlap)
1751 + */
1752 + overlap_list[overlap_entries++] =
1753 + change_point[chgidx]->pbios;
1754 + } else {
1755 + /*
1756 + * remove entry from list (order independent,
1757 + * so swap with last)
1758 + */
1759 + for (i = 0; i < overlap_entries; i++) {
1760 + if (overlap_list[i] ==
1761 + change_point[chgidx]->pbios)
1762 + overlap_list[i] =
1763 + overlap_list[overlap_entries-1];
1764 + }
1765 + overlap_entries--;
1766 + }
1767 + /*
1768 + * if there are overlapping entries, decide which
1769 + * "type" to use (larger value takes precedence --
1770 + * 1=usable, 2,3,4,4+=unusable)
1771 + */
1772 + current_type = 0;
1773 + for (i = 0; i < overlap_entries; i++)
1774 + if (overlap_list[i]->type > current_type)
1775 + current_type = overlap_list[i]->type;
1776 + /*
1777 + * continue building up new bios map based on this
1778 + * information
1779 + */
1780 + if (current_type != last_type) {
1781 + if (last_type != 0) {
1782 + new_bios[new_bios_entry].size =
1783 + change_point[chgidx]->addr - last_addr;
1784 + /*
1785 + * move forward only if the new size
1786 + * was non-zero
1787 + */
1788 + if (new_bios[new_bios_entry].size != 0)
1789 + /*
1790 + * no more space left for new
1791 + * bios entries ?
1792 + */
1793 + if (++new_bios_entry >= max_nr_map)
1794 + break;
1795 + }
1796 + if (current_type != 0) {
1797 + new_bios[new_bios_entry].addr =
1798 + change_point[chgidx]->addr;
1799 + new_bios[new_bios_entry].type = current_type;
1800 + last_addr = change_point[chgidx]->addr;
1801 + }
1802 + last_type = current_type;
1803 + }
1804 + }
1805 + /* retain count for new bios entries */
1806 + new_nr = new_bios_entry;
1807 +
1808 + /* copy new bios mapping into original location */
1809 + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
1810 + *pnr_map = new_nr;
1811 +
1812 + return 0;
1813 +}
1814 +
1815 +static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
1816 +{
1817 + while (nr_map) {
1818 + u64 start = biosmap->addr;
1819 + u64 size = biosmap->size;
1820 + u64 end = start + size;
1821 + u32 type = biosmap->type;
1822 +
1823 + /* Overflow in 64 bits? Ignore the memory map. */
1824 + if (start > end)
1825 + return -1;
1826 +
1827 + e820_add_region(start, size, type);
1828 +
1829 + biosmap++;
1830 + nr_map--;
1831 + }
1832 + return 0;
1833 +}
1834 +
1835 +/*
1836 + * Copy the BIOS e820 map into a safe place.
1837 + *
1838 + * Sanity-check it while we're at it..
1839 + *
1840 + * If we're lucky and live on a modern system, the setup code
1841 + * will have given us a memory map that we can use to properly
1842 + * set up memory. If we aren't, we'll fake a memory map.
1843 + */
1844 +static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
1845 +{
1846 +#ifndef CONFIG_XEN
1847 + /* Only one memory region (or negative)? Ignore it */
1848 + if (nr_map < 2)
1849 + return -1;
1850 +#else
1851 + BUG_ON(nr_map < 1);
1852 +#endif
1853 +
1854 + return __append_e820_map(biosmap, nr_map);
1855 +}
1856 +
1857 +static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
1858 + u64 size, unsigned old_type,
1859 + unsigned new_type)
1860 +{
1861 + int i;
1862 + u64 real_updated_size = 0;
1863 +
1864 + BUG_ON(old_type == new_type);
1865 +
1866 + if (size > (ULLONG_MAX - start))
1867 + size = ULLONG_MAX - start;
1868 +
1869 + for (i = 0; i < e820.nr_map; i++) {
1870 + struct e820entry *ei = &e820x->map[i];
1871 + u64 final_start, final_end;
1872 + if (ei->type != old_type)
1873 + continue;
1874 + /* totally covered? */
1875 + if (ei->addr >= start &&
1876 + (ei->addr + ei->size) <= (start + size)) {
1877 + ei->type = new_type;
1878 + real_updated_size += ei->size;
1879 + continue;
1880 + }
1881 + /* partially covered */
1882 + final_start = max(start, ei->addr);
1883 + final_end = min(start + size, ei->addr + ei->size);
1884 + if (final_start >= final_end)
1885 + continue;
1886 + e820_add_region(final_start, final_end - final_start,
1887 + new_type);
1888 + real_updated_size += final_end - final_start;
1889 +
1890 + ei->size -= final_end - final_start;
1891 + if (ei->addr < final_start)
1892 + continue;
1893 + ei->addr = final_end;
1894 + }
1895 + return real_updated_size;
1896 +}
1897 +
1898 +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
1899 + unsigned new_type)
1900 +{
1901 + return e820_update_range_map(&e820, start, size, old_type, new_type);
1902 +}
1903 +
1904 +static u64 __init e820_update_range_saved(u64 start, u64 size,
1905 + unsigned old_type, unsigned new_type)
1906 +{
1907 + return e820_update_range_map(&e820_saved, start, size, old_type,
1908 + new_type);
1909 +}
1910 +
1911 +/* make e820 not cover the range */
1912 +u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
1913 + int checktype)
1914 +{
1915 + int i;
1916 + u64 real_removed_size = 0;
1917 +
1918 + if (size > (ULLONG_MAX - start))
1919 + size = ULLONG_MAX - start;
1920 +
1921 + for (i = 0; i < e820.nr_map; i++) {
1922 + struct e820entry *ei = &e820.map[i];
1923 + u64 final_start, final_end;
1924 +
1925 + if (checktype && ei->type != old_type)
1926 + continue;
1927 + /* totally covered? */
1928 + if (ei->addr >= start &&
1929 + (ei->addr + ei->size) <= (start + size)) {
1930 + real_removed_size += ei->size;
1931 + memset(ei, 0, sizeof(struct e820entry));
1932 + continue;
1933 + }
1934 + /* partially covered */
1935 + final_start = max(start, ei->addr);
1936 + final_end = min(start + size, ei->addr + ei->size);
1937 + if (final_start >= final_end)
1938 + continue;
1939 + real_removed_size += final_end - final_start;
1940 +
1941 + ei->size -= final_end - final_start;
1942 + if (ei->addr < final_start)
1943 + continue;
1944 + ei->addr = final_end;
1945 + }
1946 + return real_removed_size;
1947 +}
1948 +
1949 +void __init update_e820(void)
1950 +{
1951 + int nr_map;
1952 +
1953 + nr_map = e820.nr_map;
1954 + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
1955 + return;
1956 + e820.nr_map = nr_map;
1957 + printk(KERN_INFO "modified physical RAM map:\n");
1958 + e820_print_map("modified");
1959 +}
1960 +static void __init update_e820_saved(void)
1961 +{
1962 + int nr_map;
1963 +
1964 + nr_map = e820_saved.nr_map;
1965 + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
1966 + return;
1967 + e820_saved.nr_map = nr_map;
1968 +}
1969 +
1970 +#ifdef CONFIG_XEN
1971 +#define e820 machine_e820
1972 +#endif
1973 +
1974 +#define MAX_GAP_END 0x100000000ull
1975 +/*
1976 + * Search for a gap in the e820 memory space from start_addr to end_addr.
1977 + */
1978 +__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
1979 + unsigned long start_addr, unsigned long long end_addr)
1980 +{
1981 + unsigned long long last;
1982 + int i = e820.nr_map;
1983 + int found = 0;
1984 +
1985 + last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
1986 +#ifdef CONFIG_X86_64
1987 + if (start_addr >= MAX_GAP_END)
1988 + last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
1989 +#endif
1990 +
1991 + while (--i >= 0) {
1992 + unsigned long long start = e820.map[i].addr;
1993 + unsigned long long end = start + e820.map[i].size;
1994 +
1995 + if (end < start_addr)
1996 + continue;
1997 +
1998 + /*
1999 + * Since "last" is at most 4GB, we know we'll
2000 + * fit in 32 bits if this condition is true
2001 + */
2002 + if (last > end) {
2003 + unsigned long gap = last - end;
2004 +
2005 + if (gap >= *gapsize) {
2006 + *gapsize = gap;
2007 + *gapstart = end;
2008 + found = 1;
2009 + }
2010 + }
2011 + if (start < last)
2012 + last = start;
2013 + }
2014 + return found;
2015 +}
2016 +
2017 +/*
2018 + * Search for the biggest gap in the low 32 bits of the e820
2019 + * memory space. We pass this space to PCI to assign MMIO resources
2020 + * for hotplug or unconfigured devices in.
2021 + * Hopefully the BIOS left enough space.
2022 + */
2023 +__init void e820_setup_gap(void)
2024 +{
2025 + unsigned long gapstart, gapsize, round;
2026 + int found;
2027 +
2028 + gapstart = 0x10000000;
2029 + gapsize = 0x400000;
2030 + found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
2031 +
2032 +#ifdef CONFIG_X86_64
2033 + if (!found) {
2034 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2035 + "address range\n"
2036 + KERN_ERR "PCI: Unassigned devices with 32bit resource "
2037 + "registers may break!\n");
2038 + found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
2039 + BUG_ON(!found);
2040 + }
2041 +#endif
2042 +
2043 + /*
2044 + * See how much we want to round up: start off with
2045 + * rounding to the next 1MB area.
2046 + */
2047 + round = 0x100000;
2048 + while ((gapsize >> 4) > round)
2049 + round += round;
2050 + /* Fun with two's complement */
2051 + pci_mem_start = (gapstart + round) & -round;
2052 +
2053 + printk(KERN_INFO
2054 + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2055 + pci_mem_start, gapstart, gapsize);
2056 +}
2057 +
2058 +#undef e820
2059 +
2060 +#ifndef CONFIG_XEN
2061 +/**
2062 + * Because of the size limitation of struct boot_params, only the first
2063 + * 128 E820 memory entries are passed to the kernel via
2064 + * boot_params.e820_map; the rest are passed via the SETUP_E820_EXT node
2065 + * of the setup_data linked list, which is parsed here.
2066 + */
2067 +void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
2068 +{
2069 + u32 map_len;
2070 + int entries;
2071 + struct e820entry *extmap;
2072 +
2073 + entries = sdata->len / sizeof(struct e820entry);
2074 + map_len = sdata->len + sizeof(struct setup_data);
2075 + if (map_len > PAGE_SIZE)
2076 + sdata = early_ioremap(pa_data, map_len);
2077 + extmap = (struct e820entry *)(sdata->data);
2078 + __append_e820_map(extmap, entries);
2079 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
2080 + if (map_len > PAGE_SIZE)
2081 + early_iounmap(sdata, map_len);
2082 + printk(KERN_INFO "extended physical RAM map:\n");
2083 + e820_print_map("extended");
2084 +}
2085 +
2086 +#if defined(CONFIG_X86_64) || \
2087 + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
2088 +/**
2089 + * Find the ranges of physical addresses that do not correspond to
2090 + * e820 RAM areas and mark the corresponding pages as nosave for
2091 + * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
2092 + *
2093 + * This function requires the e820 map to be sorted and without any
2094 + * overlapping entries and assumes the first e820 area to be RAM.
2095 + */
2096 +void __init e820_mark_nosave_regions(unsigned long limit_pfn)
2097 +{
2098 + int i;
2099 + unsigned long pfn;
2100 +
2101 + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
2102 + for (i = 1; i < e820.nr_map; i++) {
2103 + struct e820entry *ei = &e820.map[i];
2104 +
2105 + if (pfn < PFN_UP(ei->addr))
2106 + register_nosave_region(pfn, PFN_UP(ei->addr));
2107 +
2108 + pfn = PFN_DOWN(ei->addr + ei->size);
2109 + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
2110 + register_nosave_region(PFN_UP(ei->addr), pfn);
2111 +
2112 + if (pfn >= limit_pfn)
2113 + break;
2114 + }
2115 +}
2116 +#endif
2117 +#endif
2118 +
2119 +/*
2120 + * Early reserved memory areas.
2121 + */
2122 +#define MAX_EARLY_RES 20
2123 +
2124 +struct early_res {
2125 + u64 start, end;
2126 + char name[16];
2127 + char overlap_ok;
2128 +};
2129 +static struct early_res early_res[MAX_EARLY_RES] __initdata = {
2130 +#ifndef CONFIG_XEN
2131 + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
2132 +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
2133 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
2134 +#endif
2135 +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
2136 + /*
2137 + * But first pinch a few for the stack/trampoline stuff
2138 + * FIXME: Don't need the extra page at 4K, but need to fix
2139 + * trampoline before removing it. (see the GDT stuff)
2140 + */
2141 + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
2142 + /*
2143 + * Has to be in very low memory so we can execute
2144 + * real-mode AP code.
2145 + */
2146 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
2147 +#endif
2148 +#endif
2149 + {}
2150 +};
2151 +
2152 +static int __init find_overlapped_early(u64 start, u64 end)
2153 +{
2154 + int i;
2155 + struct early_res *r;
2156 +
2157 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2158 + r = &early_res[i];
2159 + if (end > r->start && start < r->end)
2160 + break;
2161 + }
2162 +
2163 + return i;
2164 +}
2165 +
2166 +/*
2167 + * Drop the i-th range from the early reservation map,
2168 + * by copying any higher ranges down one over it, and
2169 + * clearing what had been the last slot.
2170 + */
2171 +static void __init drop_range(int i)
2172 +{
2173 + int j;
2174 +
2175 + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
2176 + ;
2177 +
2178 + memmove(&early_res[i], &early_res[i + 1],
2179 + (j - 1 - i) * sizeof(struct early_res));
2180 +
2181 + early_res[j - 1].end = 0;
2182 +}
2183 +
2184 +/*
2185 + * Split any existing ranges that:
2186 + * 1) are marked 'overlap_ok', and
2187 + * 2) overlap with the stated range [start, end)
2188 + * into whatever portion (if any) of the existing range is entirely
2189 + * below or entirely above the stated range. Drop the portion
2190 + * of the existing range that overlaps with the stated range,
2191 + * which will allow the caller of this routine to then add that
2192 + * stated range without conflicting with any existing range.
2193 + */
2194 +static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
2195 +{
2196 + int i;
2197 + struct early_res *r;
2198 + u64 lower_start, lower_end;
2199 + u64 upper_start, upper_end;
2200 + char name[16];
2201 +
2202 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2203 + r = &early_res[i];
2204 +
2205 + /* Continue past non-overlapping ranges */
2206 + if (end <= r->start || start >= r->end)
2207 + continue;
2208 +
2209 + /*
2210 + * Leave non-ok overlaps as is; let caller
2211 + * panic "Overlapping early reservations"
2212 + * when it hits this overlap.
2213 + */
2214 + if (!r->overlap_ok)
2215 + return;
2216 +
2217 + /*
2218 + * We have an ok overlap. We will drop it from the early
2219 + * reservation map, and add back in any non-overlapping
2220 + * portions (lower or upper) as separate, overlap_ok,
2221 + * non-overlapping ranges.
2222 + */
2223 +
2224 + /* 1. Note any non-overlapping (lower or upper) ranges. */
2225 + strncpy(name, r->name, sizeof(name) - 1);
2226 +
2227 + lower_start = lower_end = 0;
2228 + upper_start = upper_end = 0;
2229 + if (r->start < start) {
2230 + lower_start = r->start;
2231 + lower_end = start;
2232 + }
2233 + if (r->end > end) {
2234 + upper_start = end;
2235 + upper_end = r->end;
2236 + }
2237 +
2238 + /* 2. Drop the original ok overlapping range */
2239 + drop_range(i);
2240 +
2241 + i--; /* resume for-loop on copied down entry */
2242 +
2243 + /* 3. Add back in any non-overlapping ranges. */
2244 + if (lower_end)
2245 + reserve_early_overlap_ok(lower_start, lower_end, name);
2246 + if (upper_end)
2247 + reserve_early_overlap_ok(upper_start, upper_end, name);
2248 + }
2249 +}
2250 +
2251 +static void __init __reserve_early(u64 start, u64 end, char *name,
2252 + int overlap_ok)
2253 +{
2254 + int i;
2255 + struct early_res *r;
2256 +
2257 + i = find_overlapped_early(start, end);
2258 + if (i >= MAX_EARLY_RES)
2259 + panic("Too many early reservations");
2260 + r = &early_res[i];
2261 + if (r->end)
2262 + panic("Overlapping early reservations "
2263 + "%llx-%llx %s to %llx-%llx %s\n",
2264 + start, end - 1, name?name:"", r->start,
2265 + r->end - 1, r->name);
2266 + r->start = start;
2267 + r->end = end;
2268 + r->overlap_ok = overlap_ok;
2269 + if (name)
2270 + strncpy(r->name, name, sizeof(r->name) - 1);
2271 +}
2272 +
2273 +/*
2274 + * A few early reservations come here.
2275 + *
2276 + * The 'overlap_ok' in the name of this routine does -not- mean it
2277 + * is ok for these reservations to overlap an earlier reservation.
2278 + * Rather it means that it is ok for subsequent reservations to
2279 + * overlap this one.
2280 + *
2281 + * Use this entry point to reserve early ranges when you are doing
2282 + * so out of "Paranoia", reserving perhaps more memory than you need,
2283 + * just in case, and don't mind a subsequent overlapping reservation
2284 + * that is known to be needed.
2285 + *
2286 + * The drop_overlaps_that_are_ok() call here isn't really needed.
2287 + * It would be needed if we had two colliding 'overlap_ok'
2288 + * reservations, so that the second such would not panic on the
2289 + * overlap with the first. We don't have any such as of this
2290 + * writing, but might as well tolerate such if it happens in
2291 + * the future.
2292 + */
2293 +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
2294 +{
2295 + drop_overlaps_that_are_ok(start, end);
2296 + __reserve_early(start, end, name, 1);
2297 +}
2298 +
2299 +/*
2300 + * Most early reservations come here.
2301 + *
2302 + * We first have drop_overlaps_that_are_ok() drop any pre-existing
2303 + * 'overlap_ok' ranges, so that we can then reserve this memory
2304 + * range without risk of panic'ing on an overlapping overlap_ok
2305 + * early reservation.
2306 + */
2307 +void __init reserve_early(u64 start, u64 end, char *name)
2308 +{
2309 + drop_overlaps_that_are_ok(start, end);
2310 + __reserve_early(start, end, name, 0);
2311 +}
2312 +
2313 +void __init free_early(u64 start, u64 end)
2314 +{
2315 + struct early_res *r;
2316 + int i;
2317 +
2318 + i = find_overlapped_early(start, end);
2319 + r = &early_res[i];
2320 + if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
2321 + panic("free_early on not reserved area: %llx-%llx!",
2322 + start, end - 1);
2323 +
2324 + drop_range(i);
2325 +}
2326 +
2327 +void __init early_res_to_bootmem(u64 start, u64 end)
2328 +{
2329 + int i, count;
2330 + u64 final_start, final_end;
2331 +
2332 + count = 0;
2333 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
2334 + count++;
2335 +
2336 + printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
2337 + count, start, end);
2338 + for (i = 0; i < count; i++) {
2339 + struct early_res *r = &early_res[i];
2340 + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
2341 + r->start, r->end, r->name);
2342 + final_start = max(start, r->start);
2343 + final_end = min(end, r->end);
2344 + if (final_start >= final_end) {
2345 + printk(KERN_CONT "\n");
2346 + continue;
2347 + }
2348 + printk(KERN_CONT " ==> [%010llx - %010llx]\n",
2349 + final_start, final_end);
2350 + reserve_bootmem_generic(final_start, final_end - final_start,
2351 + BOOTMEM_DEFAULT);
2352 + }
2353 +}
2354 +
2355 +/* Check for already reserved areas */
2356 +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
2357 +{
2358 + int i;
2359 + u64 addr = *addrp;
2360 + int changed = 0;
2361 + struct early_res *r;
2362 +again:
2363 + i = find_overlapped_early(addr, addr + size);
2364 + r = &early_res[i];
2365 + if (i < MAX_EARLY_RES && r->end) {
2366 + *addrp = addr = round_up(r->end, align);
2367 + changed = 1;
2368 + goto again;
2369 + }
2370 + return changed;
2371 +}
2372 +
2373 +/* Check for already reserved areas */
2374 +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
2375 +{
2376 + int i;
2377 + u64 addr = *addrp, last;
2378 + u64 size = *sizep;
2379 + int changed = 0;
2380 +again:
2381 + last = addr + size;
2382 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2383 + struct early_res *r = &early_res[i];
2384 + if (last > r->start && addr < r->start) {
2385 + size = r->start - addr;
2386 + changed = 1;
2387 + goto again;
2388 + }
2389 + if (last > r->end && addr < r->end) {
2390 + addr = round_up(r->end, align);
2391 + size = last - addr;
2392 + changed = 1;
2393 + goto again;
2394 + }
2395 + if (last <= r->end && addr >= r->start) {
2396 + (*sizep)++;
2397 + return 0;
2398 + }
2399 + }
2400 + if (changed) {
2401 + *addrp = addr;
2402 + *sizep = size;
2403 + }
2404 + return changed;
2405 +}
2406 +
2407 +/*
2408 + * Find a free area with specified alignment in a specific range.
2409 + */
2410 +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
2411 +{
2412 + int i;
2413 +
2414 + for (i = 0; i < e820.nr_map; i++) {
2415 + struct e820entry *ei = &e820.map[i];
2416 + u64 addr, last;
2417 + u64 ei_last;
2418 +
2419 + if (ei->type != E820_RAM)
2420 + continue;
2421 + addr = round_up(ei->addr, align);
2422 + ei_last = ei->addr + ei->size;
2423 + if (addr < start)
2424 + addr = round_up(start, align);
2425 + if (addr >= ei_last)
2426 + continue;
2427 + while (bad_addr(&addr, size, align) && addr+size <= ei_last)
2428 + ;
2429 + last = addr + size;
2430 + if (last > ei_last)
2431 + continue;
2432 + if (last > end)
2433 + continue;
2434 + return addr;
2435 + }
2436 + return -1ULL;
2437 +}
2438 +
2439 +/*
2440 + * Find next free range after *start
2441 + */
2442 +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
2443 +{
2444 + int i;
2445 +
2446 + for (i = 0; i < e820.nr_map; i++) {
2447 + struct e820entry *ei = &e820.map[i];
2448 + u64 addr, last;
2449 + u64 ei_last;
2450 +
2451 + if (ei->type != E820_RAM)
2452 + continue;
2453 + addr = round_up(ei->addr, align);
2454 + ei_last = ei->addr + ei->size;
2455 + if (addr < start)
2456 + addr = round_up(start, align);
2457 + if (addr >= ei_last)
2458 + continue;
2459 + *sizep = ei_last - addr;
2460 + while (bad_addr_size(&addr, sizep, align) &&
2461 + addr + *sizep <= ei_last)
2462 + ;
2463 + last = addr + *sizep;
2464 + if (last > ei_last)
2465 + continue;
2466 + return addr;
2467 + }
2468 + return -1UL;
2469 +
2470 +}
2471 +
2472 +/*
2473 + * Pre-allocate 4K and reserve it in the e820 map.
2474 + */
2475 +u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
2476 +{
2477 + u64 size = 0;
2478 + u64 addr;
2479 + u64 start;
2480 +
2481 + start = startt;
2482 + while (size < sizet)
2483 + start = find_e820_area_size(start, &size, align);
2484 +
2485 + if (size < sizet)
2486 + return 0;
2487 +
2488 + addr = round_down(start + size - sizet, align);
2489 + e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
2490 + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
2491 + printk(KERN_INFO "update e820 for early_reserve_e820\n");
2492 + update_e820();
2493 + update_e820_saved();
2494 +
2495 + return addr;
2496 +}
2497 +
2498 +#ifdef CONFIG_X86_32
2499 +# ifdef CONFIG_X86_PAE
2500 +# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT))
2501 +# else
2502 +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
2503 +# endif
2504 +#else /* CONFIG_X86_32 */
2505 +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
2506 +#endif
2507 +
2508 +/*
2509 + * Find the highest page frame number we have available
2510 + */
2511 +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
2512 +{
2513 + int i;
2514 + unsigned long last_pfn = 0;
2515 + unsigned long max_arch_pfn = MAX_ARCH_PFN;
2516 +
2517 + for (i = 0; i < e820.nr_map; i++) {
2518 + struct e820entry *ei = &e820.map[i];
2519 + unsigned long start_pfn;
2520 + unsigned long end_pfn;
2521 +
2522 + if (ei->type != type)
2523 + continue;
2524 +
2525 + start_pfn = ei->addr >> PAGE_SHIFT;
2526 + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
2527 +
2528 + if (start_pfn >= limit_pfn)
2529 + continue;
2530 + if (end_pfn > limit_pfn) {
2531 + last_pfn = limit_pfn;
2532 + break;
2533 + }
2534 + if (end_pfn > last_pfn)
2535 + last_pfn = end_pfn;
2536 + }
2537 +
2538 + if (last_pfn > max_arch_pfn)
2539 + last_pfn = max_arch_pfn;
2540 +
2541 + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
2542 + last_pfn, max_arch_pfn);
2543 + return last_pfn;
2544 +}
2545 +unsigned long __init e820_end_of_ram_pfn(void)
2546 +{
2547 + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
2548 +}
2549 +
2550 +unsigned long __init e820_end_of_low_ram_pfn(void)
2551 +{
2552 + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
2553 +}
2554 +/*
2555 + * Finds an active region in the address range from start_pfn to last_pfn and
2556 + * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
2557 + */
2558 +int __init e820_find_active_region(const struct e820entry *ei,
2559 + unsigned long start_pfn,
2560 + unsigned long last_pfn,
2561 + unsigned long *ei_startpfn,
2562 + unsigned long *ei_endpfn)
2563 +{
2564 + u64 align = PAGE_SIZE;
2565 +
2566 + *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
2567 + *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
2568 +
2569 + /* Skip map entries smaller than a page */
2570 + if (*ei_startpfn >= *ei_endpfn)
2571 + return 0;
2572 +
2573 + /* Skip if map is outside the node */
2574 + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
2575 + *ei_startpfn >= last_pfn)
2576 + return 0;
2577 +
2578 + /* Check for overlaps */
2579 + if (*ei_startpfn < start_pfn)
2580 + *ei_startpfn = start_pfn;
2581 + if (*ei_endpfn > last_pfn)
2582 + *ei_endpfn = last_pfn;
2583 +
2584 + return 1;
2585 +}
2586 +
2587 +/* Walk the e820 map and register active regions within a node */
2588 +void __init e820_register_active_regions(int nid, unsigned long start_pfn,
2589 + unsigned long last_pfn)
2590 +{
2591 + unsigned long ei_startpfn;
2592 + unsigned long ei_endpfn;
2593 + int i;
2594 +
2595 + for (i = 0; i < e820.nr_map; i++)
2596 + if (e820_find_active_region(&e820.map[i],
2597 + start_pfn, last_pfn,
2598 + &ei_startpfn, &ei_endpfn))
2599 + add_active_range(nid, ei_startpfn, ei_endpfn);
2600 +}
2601 +
2602 +/*
2603 + * Find the hole size (in bytes) in the memory range.
2604 + * @start: starting address of the memory range to scan
2605 + * @end: ending address of the memory range to scan
2606 + */
2607 +u64 __init e820_hole_size(u64 start, u64 end)
2608 +{
2609 + unsigned long start_pfn = start >> PAGE_SHIFT;
2610 + unsigned long last_pfn = end >> PAGE_SHIFT;
2611 + unsigned long ei_startpfn, ei_endpfn, ram = 0;
2612 + int i;
2613 +
2614 + for (i = 0; i < e820.nr_map; i++) {
2615 + if (e820_find_active_region(&e820.map[i],
2616 + start_pfn, last_pfn,
2617 + &ei_startpfn, &ei_endpfn))
2618 + ram += ei_endpfn - ei_startpfn;
2619 + }
2620 + return end - start - ((u64)ram << PAGE_SHIFT);
2621 +}
2622 +
2623 +static void early_panic(char *msg)
2624 +{
2625 + early_printk(msg);
2626 + panic(msg);
2627 +}
2628 +
2629 +static int userdef __initdata;
2630 +
2631 +/* "mem=nopentium" disables the 4MB page tables. */
2632 +static int __init parse_memopt(char *p)
2633 +{
2634 + u64 mem_size, current_end;
2635 + unsigned int i;
2636 +
2637 + if (!p)
2638 + return -EINVAL;
2639 +
2640 +#ifdef CONFIG_X86_32
2641 + if (!strcmp(p, "nopentium")) {
2642 + setup_clear_cpu_cap(X86_FEATURE_PSE);
2643 + return 0;
2644 + }
2645 +#endif
2646 +
2647 + userdef = 1;
2648 + mem_size = memparse(p, &p);
2649 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2650 +
2651 + i = e820.nr_map - 1;
2652 + current_end = e820.map[i].addr + e820.map[i].size;
2653 + if (current_end < mem_size) {
2654 + /*
2655 + * The e820 map ends before our requested size so
2656 + * extend the final entry to the requested address.
2657 + */
2658 + if (e820.map[i].type == E820_RAM)
2659 + e820.map[i].size = mem_size - e820.map[i].addr;
2660 + else
2661 + e820_add_region(current_end, mem_size - current_end, E820_RAM);
2662 + }
2663 +
2664 + return 0;
2665 +}
2666 +early_param("mem", parse_memopt);
2667 +
2668 +#ifndef CONFIG_XEN
2669 +static int __init parse_memmap_opt(char *p)
2670 +{
2671 + char *oldp;
2672 + u64 start_at, mem_size;
2673 +
2674 + if (!p)
2675 + return -EINVAL;
2676 +
2677 + if (!strncmp(p, "exactmap", 8)) {
2678 +#ifdef CONFIG_CRASH_DUMP
2679 + /*
2680 + * If we are doing a crash dump, we still need to know
2681 + * the real mem size before original memory map is
2682 + * reset.
2683 + */
2684 + saved_max_pfn = e820_end_of_ram_pfn();
2685 +#endif
2686 + e820.nr_map = 0;
2687 + userdef = 1;
2688 + return 0;
2689 + }
2690 +
2691 + oldp = p;
2692 + mem_size = memparse(p, &p);
2693 + if (p == oldp)
2694 + return -EINVAL;
2695 +
2696 + userdef = 1;
2697 + if (*p == '@') {
2698 + start_at = memparse(p+1, &p);
2699 + e820_add_region(start_at, mem_size, E820_RAM);
2700 + } else if (*p == '#') {
2701 + start_at = memparse(p+1, &p);
2702 + e820_add_region(start_at, mem_size, E820_ACPI);
2703 + } else if (*p == '$') {
2704 + start_at = memparse(p+1, &p);
2705 + e820_add_region(start_at, mem_size, E820_RESERVED);
2706 + } else
2707 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2708 +
2709 + return *p == '\0' ? 0 : -EINVAL;
2710 +}
2711 +early_param("memmap", parse_memmap_opt);
2712 +
2713 +void __init finish_e820_parsing(void)
2714 +{
2715 + if (userdef) {
2716 + int nr = e820.nr_map;
2717 +
2718 + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
2719 + early_panic("Invalid user supplied memory map");
2720 + e820.nr_map = nr;
2721 +
2722 + printk(KERN_INFO "user-defined physical RAM map:\n");
2723 + e820_print_map("user");
2724 + }
2725 +}
2726 +#endif
2727 +
2728 +static inline const char *e820_type_to_string(int e820_type)
2729 +{
2730 + switch (e820_type) {
2731 + case E820_RESERVED_KERN:
2732 + case E820_RAM: return "System RAM";
2733 + case E820_ACPI: return "ACPI Tables";
2734 + case E820_NVS: return "ACPI Non-volatile Storage";
2735 + default: return "reserved";
2736 + }
2737 +}
2738 +
2739 +#ifdef CONFIG_XEN
2740 +#define e820 machine_e820
2741 +#endif
2742 +
2743 +/*
2744 + * Mark e820 reserved areas as busy for the resource manager.
2745 + */
2746 +void __init e820_reserve_resources(void)
2747 +{
2748 + int i;
2749 + struct resource *res;
2750 + u64 end;
2751 +
2752 + res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
2753 + for (i = 0; i < e820.nr_map; i++) {
2754 + end = e820.map[i].addr + e820.map[i].size - 1;
2755 +#ifndef CONFIG_RESOURCES_64BIT
2756 + if (end > 0x100000000ULL) {
2757 + res++;
2758 + continue;
2759 + }
2760 +#endif
2761 + res->name = e820_type_to_string(e820.map[i].type);
2762 + res->start = e820.map[i].addr;
2763 + res->end = end;
2764 +
2765 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
2766 + insert_resource(&iomem_resource, res);
2767 + res++;
2768 + }
2769 +
2770 + for (i = 0; i < e820_saved.nr_map; i++) {
2771 + struct e820entry *entry = &e820_saved.map[i];
2772 + firmware_map_add_early(entry->addr,
2773 + entry->addr + entry->size - 1,
2774 + e820_type_to_string(entry->type));
2775 + }
2776 +}
2777 +
2778 +#undef e820
2779 +
2780 +#ifndef CONFIG_XEN
2781 +char *__init default_machine_specific_memory_setup(void)
2782 +{
2783 + char *who = "BIOS-e820";
2784 + int new_nr;
2785 + /*
2786 + * Try to copy the BIOS-supplied E820-map.
2787 + *
2788 + * Otherwise fake a memory map; one section from 0k->640k,
2789 + * the next section from 1mb->appropriate_mem_k
2790 + */
2791 + new_nr = boot_params.e820_entries;
2792 + sanitize_e820_map(boot_params.e820_map,
2793 + ARRAY_SIZE(boot_params.e820_map),
2794 + &new_nr);
2795 + boot_params.e820_entries = new_nr;
2796 + if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
2797 + < 0) {
2798 + u64 mem_size;
2799 +
2800 + /* compare results from other methods and take the greater */
2801 + if (boot_params.alt_mem_k
2802 + < boot_params.screen_info.ext_mem_k) {
2803 + mem_size = boot_params.screen_info.ext_mem_k;
2804 + who = "BIOS-88";
2805 + } else {
2806 + mem_size = boot_params.alt_mem_k;
2807 + who = "BIOS-e801";
2808 + }
2809 +
2810 + e820.nr_map = 0;
2811 + e820_add_region(0, LOWMEMSIZE(), E820_RAM);
2812 + e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
2813 + }
2814 +
2815 + /* In case someone cares... */
2816 + return who;
2817 +}
2818 +
2819 +char *__init __attribute__((weak)) machine_specific_memory_setup(void)
2820 +{
2821 + if (x86_quirks->arch_memory_setup) {
2822 + char *who = x86_quirks->arch_memory_setup();
2823 +
2824 + if (who)
2825 + return who;
2826 + }
2827 + return default_machine_specific_memory_setup();
2828 +}
2829 +#endif
2830 +
2831 +char * __init memory_setup(void)
2832 +{
2833 + int rc, nr_map;
2834 + struct xen_memory_map memmap;
2835 + /*
2836 + * This is rather large for a stack variable but this early in
2837 + * the boot process we know we have plenty of slack space.
2838 + */
2839 + struct e820entry map[E820MAX];
2840 +
2841 + memmap.nr_entries = E820MAX;
2842 + set_xen_guest_handle(memmap.buffer, map);
2843 +
2844 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
2845 + if (rc == -ENOSYS) {
2846 + memmap.nr_entries = 1;
2847 + map[0].addr = 0ULL;
2848 + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
2849 + /* 8MB slack (to balance backend allocations). */
2850 + map[0].size += 8ULL << 20;
2851 + map[0].type = E820_RAM;
2852 + rc = 0;
2853 + }
2854 + BUG_ON(rc);
2855 +
2856 + nr_map = memmap.nr_entries;
2857 + sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
2858 +
2859 + if (append_e820_map(map, nr_map) < 0)
2860 + BUG();
2861 +
2862 +#ifdef CONFIG_XEN
2863 + if (is_initial_xendomain()) {
2864 + memmap.nr_entries = E820MAX;
2865 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
2866 +
2867 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
2868 + BUG();
2869 + machine_e820.nr_map = memmap.nr_entries;
2870 + } else
2871 + machine_e820 = e820;
2872 +#endif
2873 +
2874 + return "Xen";
2875 +}
2876 +
2877 +void __init setup_memory_map(void)
2878 +{
2879 + char *who;
2880 +
2881 + who = memory_setup();
2882 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
2883 + printk(KERN_INFO "Xen-provided physical RAM map:\n");
2884 + e820_print_map(who);
2885 +}
2886 Index: head-2008-12-01/arch/x86/kernel/e820_32-xen.c
2887 ===================================================================
2888 --- head-2008-12-01.orig/arch/x86/kernel/e820_32-xen.c 2008-12-01 11:44:55.000000000 +0100
2889 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
2890 @@ -1,873 +0,0 @@
2891 -#include <linux/kernel.h>
2892 -#include <linux/types.h>
2893 -#include <linux/init.h>
2894 -#include <linux/bootmem.h>
2895 -#include <linux/ioport.h>
2896 -#include <linux/string.h>
2897 -#include <linux/kexec.h>
2898 -#include <linux/module.h>
2899 -#include <linux/mm.h>
2900 -#include <linux/pfn.h>
2901 -#include <linux/uaccess.h>
2902 -#include <linux/suspend.h>
2903 -
2904 -#include <asm/pgtable.h>
2905 -#include <asm/page.h>
2906 -#include <asm/e820.h>
2907 -#include <asm/setup.h>
2908 -#include <xen/interface/memory.h>
2909 -
2910 -struct e820map e820;
2911 -struct change_member {
2912 - struct e820entry *pbios; /* pointer to original bios entry */
2913 - unsigned long long addr; /* address for this change point */
2914 -};
2915 -static struct change_member change_point_list[2*E820MAX] __initdata;
2916 -static struct change_member *change_point[2*E820MAX] __initdata;
2917 -static struct e820entry *overlap_list[E820MAX] __initdata;
2918 -static struct e820entry new_bios[E820MAX] __initdata;
2919 -/* For PCI or other memory-mapped resources */
2920 -unsigned long pci_mem_start = 0x10000000;
2921 -#ifdef CONFIG_PCI
2922 -EXPORT_SYMBOL(pci_mem_start);
2923 -#endif
2924 -extern int user_defined_memmap;
2925 -
2926 -static struct resource system_rom_resource = {
2927 - .name = "System ROM",
2928 - .start = 0xf0000,
2929 - .end = 0xfffff,
2930 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2931 -};
2932 -
2933 -static struct resource extension_rom_resource = {
2934 - .name = "Extension ROM",
2935 - .start = 0xe0000,
2936 - .end = 0xeffff,
2937 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2938 -};
2939 -
2940 -static struct resource adapter_rom_resources[] = { {
2941 - .name = "Adapter ROM",
2942 - .start = 0xc8000,
2943 - .end = 0,
2944 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2945 -}, {
2946 - .name = "Adapter ROM",
2947 - .start = 0,
2948 - .end = 0,
2949 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2950 -}, {
2951 - .name = "Adapter ROM",
2952 - .start = 0,
2953 - .end = 0,
2954 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2955 -}, {
2956 - .name = "Adapter ROM",
2957 - .start = 0,
2958 - .end = 0,
2959 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2960 -}, {
2961 - .name = "Adapter ROM",
2962 - .start = 0,
2963 - .end = 0,
2964 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2965 -}, {
2966 - .name = "Adapter ROM",
2967 - .start = 0,
2968 - .end = 0,
2969 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2970 -} };
2971 -
2972 -static struct resource video_rom_resource = {
2973 - .name = "Video ROM",
2974 - .start = 0xc0000,
2975 - .end = 0xc7fff,
2976 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2977 -};
2978 -
2979 -#define ROMSIGNATURE 0xaa55
2980 -
2981 -static int __init romsignature(const unsigned char *rom)
2982 -{
2983 - const unsigned short * const ptr = (const unsigned short *)rom;
2984 - unsigned short sig;
2985 -
2986 - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
2987 -}
2988 -
2989 -static int __init romchecksum(const unsigned char *rom, unsigned long length)
2990 -{
2991 - unsigned char sum, c;
2992 -
2993 - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
2994 - sum += c;
2995 - return !length && !sum;
2996 -}
2997 -
2998 -static void __init probe_roms(void)
2999 -{
3000 - const unsigned char *rom;
3001 - unsigned long start, length, upper;
3002 - unsigned char c;
3003 - int i;
3004 -
3005 -#ifdef CONFIG_XEN
3006 - /* Nothing to do if not running in dom0. */
3007 - if (!is_initial_xendomain())
3008 - return;
3009 -#endif
3010 -
3011 - /* video rom */
3012 - upper = adapter_rom_resources[0].start;
3013 - for (start = video_rom_resource.start; start < upper; start += 2048) {
3014 - rom = isa_bus_to_virt(start);
3015 - if (!romsignature(rom))
3016 - continue;
3017 -
3018 - video_rom_resource.start = start;
3019 -
3020 - if (probe_kernel_address(rom + 2, c) != 0)
3021 - continue;
3022 -
3023 - /* 0 < length <= 0x7f * 512, historically */
3024 - length = c * 512;
3025 -
3026 - /* if checksum okay, trust length byte */
3027 - if (length && romchecksum(rom, length))
3028 - video_rom_resource.end = start + length - 1;
3029 -
3030 - request_resource(&iomem_resource, &video_rom_resource);
3031 - break;
3032 - }
3033 -
3034 - start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
3035 - if (start < upper)
3036 - start = upper;
3037 -
3038 - /* system rom */
3039 - request_resource(&iomem_resource, &system_rom_resource);
3040 - upper = system_rom_resource.start;
3041 -
3042 - /* check for extension rom (ignore length byte!) */
3043 - rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
3044 - if (romsignature(rom)) {
3045 - length = extension_rom_resource.end - extension_rom_resource.start + 1;
3046 - if (romchecksum(rom, length)) {
3047 - request_resource(&iomem_resource, &extension_rom_resource);
3048 - upper = extension_rom_resource.start;
3049 - }
3050 - }
3051 -
3052 - /* check for adapter roms on 2k boundaries */
3053 - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
3054 - rom = isa_bus_to_virt(start);
3055 - if (!romsignature(rom))
3056 - continue;
3057 -
3058 - if (probe_kernel_address(rom + 2, c) != 0)
3059 - continue;
3060 -
3061 - /* 0 < length <= 0x7f * 512, historically */
3062 - length = c * 512;
3063 -
3064 - /* but accept any length that fits if checksum okay */
3065 - if (!length || start + length > upper || !romchecksum(rom, length))
3066 - continue;
3067 -
3068 - adapter_rom_resources[i].start = start;
3069 - adapter_rom_resources[i].end = start + length - 1;
3070 - request_resource(&iomem_resource, &adapter_rom_resources[i]);
3071 -
3072 - start = adapter_rom_resources[i++].end & ~2047UL;
3073 - }
3074 -}
3075 -
3076 -#ifdef CONFIG_XEN
3077 -static struct e820map machine_e820;
3078 -#define e820 machine_e820
3079 -#endif
3080 -
3081 -/*
3082 - * Request address space for all standard RAM and ROM resources
3083 - * and also for regions reported as reserved by the e820.
3084 - */
3085 -void __init init_iomem_resources(struct resource *code_resource,
3086 - struct resource *data_resource,
3087 - struct resource *bss_resource)
3088 -{
3089 - int i;
3090 -
3091 - probe_roms();
3092 - for (i = 0; i < e820.nr_map; i++) {
3093 - struct resource *res;
3094 -#ifndef CONFIG_RESOURCES_64BIT
3095 - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
3096 - continue;
3097 -#endif
3098 - res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
3099 - switch (e820.map[i].type) {
3100 - case E820_RAM: res->name = "System RAM"; break;
3101 - case E820_ACPI: res->name = "ACPI Tables"; break;
3102 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
3103 - default: res->name = "reserved";
3104 - }
3105 - res->start = e820.map[i].addr;
3106 - res->end = res->start + e820.map[i].size - 1;
3107 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3108 - if (request_resource(&iomem_resource, res)) {
3109 - kfree(res);
3110 - continue;
3111 - }
3112 - if (e820.map[i].type == E820_RAM) {
3113 - /*
3114 - * We don't know which RAM region contains kernel data,
3115 - * so we try it repeatedly and let the resource manager
3116 - * test it.
3117 - */
3118 -#ifndef CONFIG_XEN
3119 - request_resource(res, code_resource);
3120 - request_resource(res, data_resource);
3121 - request_resource(res, bss_resource);
3122 -#endif
3123 -#ifdef CONFIG_KEXEC
3124 - if (crashk_res.start != crashk_res.end)
3125 - request_resource(res, &crashk_res);
3126 -#ifdef CONFIG_XEN
3127 - xen_machine_kexec_register_resources(res);
3128 -#endif
3129 -#endif
3130 - }
3131 - }
3132 -}
3133 -
3134 -#undef e820
3135 -
3136 -#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
3137 -/**
3138 - * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
3139 - * correspond to e820 RAM areas and mark the corresponding pages as nosave for
3140 - * hibernation.
3141 - *
3142 - * This function requires the e820 map to be sorted and without any
3143 - * overlapping entries and assumes the first e820 area to be RAM.
3144 - */
3145 -void __init e820_mark_nosave_regions(void)
3146 -{
3147 - int i;
3148 - unsigned long pfn;
3149 -
3150 - pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
3151 - for (i = 1; i < e820.nr_map; i++) {
3152 - struct e820entry *ei = &e820.map[i];
3153 -
3154 - if (pfn < PFN_UP(ei->addr))
3155 - register_nosave_region(pfn, PFN_UP(ei->addr));
3156 -
3157 - pfn = PFN_DOWN(ei->addr + ei->size);
3158 - if (ei->type != E820_RAM)
3159 - register_nosave_region(PFN_UP(ei->addr), pfn);
3160 -
3161 - if (pfn >= max_low_pfn)
3162 - break;
3163 - }
3164 -}
3165 -#endif
3166 -
3167 -void __init add_memory_region(unsigned long long start,
3168 - unsigned long long size, int type)
3169 -{
3170 - int x;
3171 -
3172 - x = e820.nr_map;
3173 -
3174 - if (x == E820MAX) {
3175 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
3176 - return;
3177 - }
3178 -
3179 - e820.map[x].addr = start;
3180 - e820.map[x].size = size;
3181 - e820.map[x].type = type;
3182 - e820.nr_map++;
3183 -} /* add_memory_region */
3184 -
3185 -/*
3186 - * Sanitize the BIOS e820 map.
3187 - *
3188 - * Some e820 responses include overlapping entries. The following
3189 - * replaces the original e820 map with a new one, removing overlaps.
3190 - *
3191 - */
3192 -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
3193 -{
3194 - struct change_member *change_tmp;
3195 - unsigned long current_type, last_type;
3196 - unsigned long long last_addr;
3197 - int chgidx, still_changing;
3198 - int overlap_entries;
3199 - int new_bios_entry;
3200 - int old_nr, new_nr, chg_nr;
3201 - int i;
3202 -
3203 - /*
3204 - Visually we're performing the following (1,2,3,4 = memory types)...
3205 -
3206 - Sample memory map (w/overlaps):
3207 - ____22__________________
3208 - ______________________4_
3209 - ____1111________________
3210 - _44_____________________
3211 - 11111111________________
3212 - ____________________33__
3213 - ___________44___________
3214 - __________33333_________
3215 - ______________22________
3216 - ___________________2222_
3217 - _________111111111______
3218 - _____________________11_
3219 - _________________4______
3220 -
3221 - Sanitized equivalent (no overlap):
3222 - 1_______________________
3223 - _44_____________________
3224 - ___1____________________
3225 - ____22__________________
3226 - ______11________________
3227 - _________1______________
3228 - __________3_____________
3229 - ___________44___________
3230 - _____________33_________
3231 - _______________2________
3232 - ________________1_______
3233 - _________________4______
3234 - ___________________2____
3235 - ____________________33__
3236 - ______________________4_
3237 - */
3238 - /* if there's only one memory region, don't bother */
3239 - if (*pnr_map < 2) {
3240 - return -1;
3241 - }
3242 -
3243 - old_nr = *pnr_map;
3244 -
3245 - /* bail out if we find any unreasonable addresses in bios map */
3246 - for (i=0; i<old_nr; i++)
3247 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
3248 - return -1;
3249 - }
3250 -
3251 - /* create pointers for initial change-point information (for sorting) */
3252 - for (i=0; i < 2*old_nr; i++)
3253 - change_point[i] = &change_point_list[i];
3254 -
3255 - /* record all known change-points (starting and ending addresses),
3256 - omitting those that are for empty memory regions */
3257 - chgidx = 0;
3258 - for (i=0; i < old_nr; i++) {
3259 - if (biosmap[i].size != 0) {
3260 - change_point[chgidx]->addr = biosmap[i].addr;
3261 - change_point[chgidx++]->pbios = &biosmap[i];
3262 - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
3263 - change_point[chgidx++]->pbios = &biosmap[i];
3264 - }
3265 - }
3266 - chg_nr = chgidx; /* true number of change-points */
3267 -
3268 - /* sort change-point list by memory addresses (low -> high) */
3269 - still_changing = 1;
3270 - while (still_changing) {
3271 - still_changing = 0;
3272 - for (i=1; i < chg_nr; i++) {
3273 - /* if <current_addr> > <last_addr>, swap */
3274 - /* or, if current=<start_addr> & last=<end_addr>, swap */
3275 - if ((change_point[i]->addr < change_point[i-1]->addr) ||
3276 - ((change_point[i]->addr == change_point[i-1]->addr) &&
3277 - (change_point[i]->addr == change_point[i]->pbios->addr) &&
3278 - (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
3279 - )
3280 - {
3281 - change_tmp = change_point[i];
3282 - change_point[i] = change_point[i-1];
3283 - change_point[i-1] = change_tmp;
3284 - still_changing=1;
3285 - }
3286 - }
3287 - }
3288 -
3289 - /* create a new bios memory map, removing overlaps */
3290 - overlap_entries=0; /* number of entries in the overlap table */
3291 - new_bios_entry=0; /* index for creating new bios map entries */
3292 - last_type = 0; /* start with undefined memory type */
3293 - last_addr = 0; /* start with 0 as last starting address */
3294 - /* loop through change-points, determining affect on the new bios map */
3295 - for (chgidx=0; chgidx < chg_nr; chgidx++)
3296 - {
3297 - /* keep track of all overlapping bios entries */
3298 - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
3299 - {
3300 - /* add map entry to overlap list (> 1 entry implies an overlap) */
3301 - overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
3302 - }
3303 - else
3304 - {
3305 - /* remove entry from list (order independent, so swap with last) */
3306 - for (i=0; i<overlap_entries; i++)
3307 - {
3308 - if (overlap_list[i] == change_point[chgidx]->pbios)
3309 - overlap_list[i] = overlap_list[overlap_entries-1];
3310 - }
3311 - overlap_entries--;
3312 - }
3313 - /* if there are overlapping entries, decide which "type" to use */
3314 - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
3315 - current_type = 0;
3316 - for (i=0; i<overlap_entries; i++)
3317 - if (overlap_list[i]->type > current_type)
3318 - current_type = overlap_list[i]->type;
3319 - /* continue building up new bios map based on this information */
3320 - if (current_type != last_type) {
3321 - if (last_type != 0) {
3322 - new_bios[new_bios_entry].size =
3323 - change_point[chgidx]->addr - last_addr;
3324 - /* move forward only if the new size was non-zero */
3325 - if (new_bios[new_bios_entry].size != 0)
3326 - if (++new_bios_entry >= E820MAX)
3327 - break; /* no more space left for new bios entries */
3328 - }
3329 - if (current_type != 0) {
3330 - new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
3331 - new_bios[new_bios_entry].type = current_type;
3332 - last_addr=change_point[chgidx]->addr;
3333 - }
3334 - last_type = current_type;
3335 - }
3336 - }
3337 - new_nr = new_bios_entry; /* retain count for new bios entries */
3338 -
3339 - /* copy new bios mapping into original location */
3340 - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
3341 - *pnr_map = new_nr;
3342 -
3343 - return 0;
3344 -}
3345 -
3346 -/*
3347 - * Copy the BIOS e820 map into a safe place.
3348 - *
3349 - * Sanity-check it while we're at it..
3350 - *
3351 - * If we're lucky and live on a modern system, the setup code
3352 - * will have given us a memory map that we can use to properly
3353 - * set up memory. If we aren't, we'll fake a memory map.
3354 - *
3355 - * We check to see that the memory map contains at least 2 elements
3356 - * before we'll use it, because the detection code in setup.S may
3357 - * not be perfect and most every PC known to man has two memory
3358 - * regions: one from 0 to 640k, and one from 1mb up. (The IBM
3359 - * thinkpad 560x, for example, does not cooperate with the memory
3360 - * detection code.)
3361 - */
3362 -int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
3363 -{
3364 -#ifndef CONFIG_XEN
3365 - /* Only one memory region (or negative)? Ignore it */
3366 - if (nr_map < 2)
3367 - return -1;
3368 -#else
3369 - BUG_ON(nr_map < 1);
3370 -#endif
3371 -
3372 - do {
3373 - u64 start = biosmap->addr;
3374 - u64 size = biosmap->size;
3375 - u64 end = start + size;
3376 - u32 type = biosmap->type;
3377 -
3378 - /* Overflow in 64 bits? Ignore the memory map. */
3379 - if (start > end)
3380 - return -1;
3381 -
3382 - add_memory_region(start, size, type);
3383 - } while (biosmap++, --nr_map);
3384 -
3385 -#ifdef CONFIG_XEN
3386 - if (is_initial_xendomain()) {
3387 - struct xen_memory_map memmap;
3388 -
3389 - memmap.nr_entries = E820MAX;
3390 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3391 -
3392 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3393 - BUG();
3394 - machine_e820.nr_map = memmap.nr_entries;
3395 - } else
3396 - machine_e820 = e820;
3397 -#endif
3398 -
3399 - return 0;
3400 -}
3401 -
3402 -/*
3403 - * Find the highest page frame number we have available
3404 - */
3405 -void __init propagate_e820_map(void)
3406 -{
3407 - int i;
3408 -
3409 - max_pfn = 0;
3410 -
3411 - for (i = 0; i < e820.nr_map; i++) {
3412 - unsigned long start, end;
3413 - /* RAM? */
3414 - if (e820.map[i].type != E820_RAM)
3415 - continue;
3416 - start = PFN_UP(e820.map[i].addr);
3417 - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3418 - if (start >= end)
3419 - continue;
3420 - if (end > max_pfn)
3421 - max_pfn = end;
3422 - memory_present(0, start, end);
3423 - }
3424 -}
3425 -
3426 -/*
3427 - * Register fully available low RAM pages with the bootmem allocator.
3428 - */
3429 -void __init register_bootmem_low_pages(unsigned long max_low_pfn)
3430 -{
3431 - int i;
3432 -
3433 - for (i = 0; i < e820.nr_map; i++) {
3434 - unsigned long curr_pfn, last_pfn, size;
3435 - /*
3436 - * Reserve usable low memory
3437 - */
3438 - if (e820.map[i].type != E820_RAM)
3439 - continue;
3440 - /*
3441 - * We are rounding up the start address of usable memory:
3442 - */
3443 - curr_pfn = PFN_UP(e820.map[i].addr);
3444 - if (curr_pfn >= max_low_pfn)
3445 - continue;
3446 - /*
3447 - * ... and at the end of the usable range downwards:
3448 - */
3449 - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3450 -
3451 -#ifdef CONFIG_XEN
3452 - /*
3453 - * Truncate to the number of actual pages currently
3454 - * present.
3455 - */
3456 - if (last_pfn > xen_start_info->nr_pages)
3457 - last_pfn = xen_start_info->nr_pages;
3458 -#endif
3459 -
3460 - if (last_pfn > max_low_pfn)
3461 - last_pfn = max_low_pfn;
3462 -
3463 - /*
3464 - * .. finally, did all the rounding and playing
3465 - * around just make the area go away?
3466 - */
3467 - if (last_pfn <= curr_pfn)
3468 - continue;
3469 -
3470 - size = last_pfn - curr_pfn;
3471 - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
3472 - }
3473 -}
3474 -
3475 -void __init e820_register_memory(void)
3476 -{
3477 - unsigned long gapstart, gapsize, round;
3478 - unsigned long long last;
3479 - int i;
3480 -
3481 -#ifdef CONFIG_XEN
3482 - if (is_initial_xendomain()) {
3483 - struct xen_memory_map memmap;
3484 -
3485 - memmap.nr_entries = E820MAX;
3486 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3487 -
3488 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3489 - BUG();
3490 - machine_e820.nr_map = memmap.nr_entries;
3491 - }
3492 - else
3493 - machine_e820 = e820;
3494 -#define e820 machine_e820
3495 -#endif
3496 -
3497 - /*
3498 - * Search for the biggest gap in the low 32 bits of the e820
3499 - * memory space.
3500 - */
3501 - last = 0x100000000ull;
3502 - gapstart = 0x10000000;
3503 - gapsize = 0x400000;
3504 - i = e820.nr_map;
3505 - while (--i >= 0) {
3506 - unsigned long long start = e820.map[i].addr;
3507 - unsigned long long end = start + e820.map[i].size;
3508 -
3509 - /*
3510 - * Since "last" is at most 4GB, we know we'll
3511 - * fit in 32 bits if this condition is true
3512 - */
3513 - if (last > end) {
3514 - unsigned long gap = last - end;
3515 -
3516 - if (gap > gapsize) {
3517 - gapsize = gap;
3518 - gapstart = end;
3519 - }
3520 - }
3521 - if (start < last)
3522 - last = start;
3523 - }
3524 -#undef e820
3525 -
3526 - /*
3527 - * See how much we want to round up: start off with
3528 - * rounding to the next 1MB area.
3529 - */
3530 - round = 0x100000;
3531 - while ((gapsize >> 4) > round)
3532 - round += round;
3533 - /* Fun with two's complement */
3534 - pci_mem_start = (gapstart + round) & -round;
3535 -
3536 - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
3537 - pci_mem_start, gapstart, gapsize);
3538 -}
3539 -
3540 -void __init print_memory_map(char *who)
3541 -{
3542 - int i;
3543 -
3544 - for (i = 0; i < e820.nr_map; i++) {
3545 - printk(" %s: %016Lx - %016Lx ", who,
3546 - e820.map[i].addr,
3547 - e820.map[i].addr + e820.map[i].size);
3548 - switch (e820.map[i].type) {
3549 - case E820_RAM: printk("(usable)\n");
3550 - break;
3551 - case E820_RESERVED:
3552 - printk("(reserved)\n");
3553 - break;
3554 - case E820_ACPI:
3555 - printk("(ACPI data)\n");
3556 - break;
3557 - case E820_NVS:
3558 - printk("(ACPI NVS)\n");
3559 - break;
3560 - default: printk("type %u\n", e820.map[i].type);
3561 - break;
3562 - }
3563 - }
3564 -}
3565 -
3566 -void __init limit_regions(unsigned long long size)
3567 -{
3568 - unsigned long long current_addr = 0;
3569 - int i;
3570 -
3571 - print_memory_map("limit_regions start");
3572 - for (i = 0; i < e820.nr_map; i++) {
3573 - current_addr = e820.map[i].addr + e820.map[i].size;
3574 - if (current_addr < size)
3575 - continue;
3576 -
3577 - if (e820.map[i].type != E820_RAM)
3578 - continue;
3579 -
3580 - if (e820.map[i].addr >= size) {
3581 - /*
3582 - * This region starts past the end of the
3583 - * requested size, skip it completely.
3584 - */
3585 - e820.nr_map = i;
3586 - } else {
3587 - e820.nr_map = i + 1;
3588 - e820.map[i].size -= current_addr - size;
3589 - }
3590 - print_memory_map("limit_regions endfor");
3591 - return;
3592 - }
3593 -#ifdef CONFIG_XEN
3594 - if (current_addr < size) {
3595 - /*
3596 - * The e820 map finished before our requested size so
3597 - * extend the final entry to the requested address.
3598 - */
3599 - --i;
3600 - if (e820.map[i].type == E820_RAM)
3601 - e820.map[i].size -= current_addr - size;
3602 - else
3603 - add_memory_region(current_addr, size - current_addr, E820_RAM);
3604 - }
3605 -#endif
3606 - print_memory_map("limit_regions endfunc");
3607 -}
3608 -
3609 -/*
3610 - * This function checks if any part of the range <start,end> is mapped
3611 - * with type.
3612 - */
3613 -int
3614 -e820_any_mapped(u64 start, u64 end, unsigned type)
3615 -{
3616 - int i;
3617 -
3618 -#ifndef CONFIG_XEN
3619 - for (i = 0; i < e820.nr_map; i++) {
3620 - const struct e820entry *ei = &e820.map[i];
3621 -#else
3622 - if (!is_initial_xendomain())
3623 - return 0;
3624 - for (i = 0; i < machine_e820.nr_map; ++i) {
3625 - const struct e820entry *ei = &machine_e820.map[i];
3626 -#endif
3627 -
3628 - if (type && ei->type != type)
3629 - continue;
3630 - if (ei->addr >= end || ei->addr + ei->size <= start)
3631 - continue;
3632 - return 1;
3633 - }
3634 - return 0;
3635 -}
3636 -EXPORT_SYMBOL_GPL(e820_any_mapped);
3637 -
3638 - /*
3639 - * This function checks if the entire range <start,end> is mapped with type.
3640 - *
3641 - * Note: this function only works correct if the e820 table is sorted and
3642 - * not-overlapping, which is the case
3643 - */
3644 -int __init
3645 -e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
3646 -{
3647 - u64 start = s;
3648 - u64 end = e;
3649 - int i;
3650 -
3651 -#ifndef CONFIG_XEN
3652 - for (i = 0; i < e820.nr_map; i++) {
3653 - struct e820entry *ei = &e820.map[i];
3654 -#else
3655 - if (!is_initial_xendomain())
3656 - return 0;
3657 - for (i = 0; i < machine_e820.nr_map; ++i) {
3658 - const struct e820entry *ei = &machine_e820.map[i];
3659 -#endif
3660 -
3661 - if (type && ei->type != type)
3662 - continue;
3663 - /* is the region (part) in overlap with the current region ?*/
3664 - if (ei->addr >= end || ei->addr + ei->size <= start)
3665 - continue;
3666 - /* if the region is at the beginning of <start,end> we move
3667 - * start to the end of the region since it's ok until there
3668 - */
3669 - if (ei->addr <= start)
3670 - start = ei->addr + ei->size;
3671 - /* if start is now at or beyond end, we're done, full
3672 - * coverage */
3673 - if (start >= end)
3674 - return 1; /* we're done */
3675 - }
3676 - return 0;
3677 -}
3678 -
3679 -static int __init parse_memmap(char *arg)
3680 -{
3681 - if (!arg)
3682 - return -EINVAL;
3683 -
3684 - if (strcmp(arg, "exactmap") == 0) {
3685 -#ifdef CONFIG_CRASH_DUMP
3686 - /* If we are doing a crash dump, we
3687 - * still need to know the real mem
3688 - * size before original memory map is
3689 - * reset.
3690 - */
3691 - propagate_e820_map();
3692 - saved_max_pfn = max_pfn;
3693 -#endif
3694 - e820.nr_map = 0;
3695 - user_defined_memmap = 1;
3696 - } else {
3697 - /* If the user specifies memory size, we
3698 - * limit the BIOS-provided memory map to
3699 - * that size. exactmap can be used to specify
3700 - * the exact map. mem=number can be used to
3701 - * trim the existing memory map.
3702 - */
3703 - unsigned long long start_at, mem_size;
3704 -
3705 - mem_size = memparse(arg, &arg);
3706 - if (*arg == '@') {
3707 - start_at = memparse(arg+1, &arg);
3708 - add_memory_region(start_at, mem_size, E820_RAM);
3709 - } else if (*arg == '#') {
3710 - start_at = memparse(arg+1, &arg);
3711 - add_memory_region(start_at, mem_size, E820_ACPI);
3712 - } else if (*arg == '$') {
3713 - start_at = memparse(arg+1, &arg);
3714 - add_memory_region(start_at, mem_size, E820_RESERVED);
3715 - } else {
3716 - limit_regions(mem_size);
3717 - user_defined_memmap = 1;
3718 - }
3719 - }
3720 - return 0;
3721 -}
3722 -early_param("memmap", parse_memmap);
3723 -
3724 -#ifndef CONFIG_XEN
3725 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
3726 - unsigned new_type)
3727 -{
3728 - int i;
3729 -
3730 - BUG_ON(old_type == new_type);
3731 -
3732 - for (i = 0; i < e820.nr_map; i++) {
3733 - struct e820entry *ei = &e820.map[i];
3734 - u64 final_start, final_end;
3735 - if (ei->type != old_type)
3736 - continue;
3737 - /* totally covered? */
3738 - if (ei->addr >= start && ei->size <= size) {
3739 - ei->type = new_type;
3740 - continue;
3741 - }
3742 - /* partially covered */
3743 - final_start = max(start, ei->addr);
3744 - final_end = min(start + size, ei->addr + ei->size);
3745 - if (final_start >= final_end)
3746 - continue;
3747 - add_memory_region(final_start, final_end - final_start,
3748 - new_type);
3749 - }
3750 -}
3751 -
3752 -void __init update_e820(void)
3753 -{
3754 - u8 nr_map;
3755 -
3756 - nr_map = e820.nr_map;
3757 - if (sanitize_e820_map(e820.map, &nr_map))
3758 - return;
3759 - e820.nr_map = nr_map;
3760 - printk(KERN_INFO "modified physical RAM map:\n");
3761 - print_memory_map("modified");
3762 -}
3763 -#endif
3764 Index: head-2008-12-01/arch/x86/kernel/e820_64-xen.c
3765 ===================================================================
3766 --- head-2008-12-01.orig/arch/x86/kernel/e820_64-xen.c 2008-12-01 11:44:55.000000000 +0100
3767 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3768 @@ -1,1045 +0,0 @@
3769 -/*
3770 - * Handle the memory map.
3771 - * The functions here do the job until bootmem takes over.
3772 - *
3773 - * Getting sanitize_e820_map() in sync with i386 version by applying change:
3774 - * - Provisions for empty E820 memory regions (reported by certain BIOSes).
3775 - * Alex Achenbach <xela@slit.de>, December 2002.
3776 - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
3777 - *
3778 - */
3779 -#include <linux/kernel.h>
3780 -#include <linux/types.h>
3781 -#include <linux/init.h>
3782 -#include <linux/bootmem.h>
3783 -#include <linux/ioport.h>
3784 -#include <linux/string.h>
3785 -#include <linux/kexec.h>
3786 -#include <linux/module.h>
3787 -#include <linux/mm.h>
3788 -#include <linux/suspend.h>
3789 -#include <linux/pfn.h>
3790 -
3791 -#include <asm/pgtable.h>
3792 -#include <asm/page.h>
3793 -#include <asm/e820.h>
3794 -#include <asm/proto.h>
3795 -#include <asm/setup.h>
3796 -#include <asm/sections.h>
3797 -#include <asm/kdebug.h>
3798 -#include <xen/interface/memory.h>
3799 -
3800 -struct e820map e820 __initdata;
3801 -#ifdef CONFIG_XEN
3802 -struct e820map machine_e820;
3803 -#endif
3804 -
3805 -/*
3806 - * PFN of last memory page.
3807 - */
3808 -unsigned long end_pfn;
3809 -
3810 -/*
3811 - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
3812 - * The direct mapping extends to max_pfn_mapped, so that we can directly access
3813 - * apertures, ACPI and other tables without having to play with fixmaps.
3814 - */
3815 -unsigned long max_pfn_mapped;
3816 -
3817 -/*
3818 - * Last pfn which the user wants to use.
3819 - */
3820 -static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
3821 -
3822 -/*
3823 - * Early reserved memory areas.
3824 - */
3825 -#define MAX_EARLY_RES 20
3826 -
3827 -struct early_res {
3828 - unsigned long start, end;
3829 - char name[16];
3830 -};
3831 -static struct early_res early_res[MAX_EARLY_RES] __initdata = {
3832 -#ifndef CONFIG_XEN
3833 - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
3834 -#ifdef CONFIG_X86_TRAMPOLINE
3835 - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
3836 -#endif
3837 -#endif
3838 - {}
3839 -};
3840 -
3841 -void __init reserve_early(unsigned long start, unsigned long end, char *name)
3842 -{
3843 - int i;
3844 - struct early_res *r;
3845 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3846 - r = &early_res[i];
3847 - if (end > r->start && start < r->end)
3848 - panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
3849 - start, end - 1, name?name:"", r->start, r->end - 1, r->name);
3850 - }
3851 - if (i >= MAX_EARLY_RES)
3852 - panic("Too many early reservations");
3853 - r = &early_res[i];
3854 - r->start = start;
3855 - r->end = end;
3856 - if (name)
3857 - strncpy(r->name, name, sizeof(r->name) - 1);
3858 -}
3859 -
3860 -void __init free_early(unsigned long start, unsigned long end)
3861 -{
3862 - struct early_res *r;
3863 - int i, j;
3864 -
3865 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3866 - r = &early_res[i];
3867 - if (start == r->start && end == r->end)
3868 - break;
3869 - }
3870 - if (i >= MAX_EARLY_RES || !early_res[i].end)
3871 - panic("free_early on not reserved area: %lx-%lx!", start, end);
3872 -
3873 - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
3874 - ;
3875 -
3876 - memmove(&early_res[i], &early_res[i + 1],
3877 - (j - 1 - i) * sizeof(struct early_res));
3878 -
3879 - early_res[j - 1].end = 0;
3880 -}
3881 -
3882 -void __init early_res_to_bootmem(unsigned long start, unsigned long end)
3883 -{
3884 - int i;
3885 - unsigned long final_start, final_end;
3886 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3887 - struct early_res *r = &early_res[i];
3888 - final_start = max(start, r->start);
3889 - final_end = min(end, r->end);
3890 - if (final_start >= final_end)
3891 - continue;
3892 - printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
3893 - final_start, final_end - 1, r->name);
3894 - reserve_bootmem_generic(final_start, final_end - final_start);
3895 - }
3896 -}
3897 -
3898 -/* Check for already reserved areas */
3899 -static inline int __init
3900 -bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
3901 -{
3902 - int i;
3903 - unsigned long addr = *addrp, last;
3904 - int changed = 0;
3905 -again:
3906 - last = addr + size;
3907 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3908 - struct early_res *r = &early_res[i];
3909 - if (last >= r->start && addr < r->end) {
3910 - *addrp = addr = round_up(r->end, align);
3911 - changed = 1;
3912 - goto again;
3913 - }
3914 - }
3915 - return changed;
3916 -}
3917 -
3918 -/* Check for already reserved areas */
3919 -static inline int __init
3920 -bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
3921 -{
3922 - int i;
3923 - unsigned long addr = *addrp, last;
3924 - unsigned long size = *sizep;
3925 - int changed = 0;
3926 -again:
3927 - last = addr + size;
3928 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3929 - struct early_res *r = &early_res[i];
3930 - if (last > r->start && addr < r->start) {
3931 - size = r->start - addr;
3932 - changed = 1;
3933 - goto again;
3934 - }
3935 - if (last > r->end && addr < r->end) {
3936 - addr = round_up(r->end, align);
3937 - size = last - addr;
3938 - changed = 1;
3939 - goto again;
3940 - }
3941 - if (last <= r->end && addr >= r->start) {
3942 - (*sizep)++;
3943 - return 0;
3944 - }
3945 - }
3946 - if (changed) {
3947 - *addrp = addr;
3948 - *sizep = size;
3949 - }
3950 - return changed;
3951 -}
3952 -/*
3953 - * This function checks if any part of the range <start,end> is mapped
3954 - * with type.
3955 - */
3956 -int
3957 -e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
3958 -{
3959 - int i;
3960 -
3961 -#ifndef CONFIG_XEN
3962 - for (i = 0; i < e820.nr_map; i++) {
3963 - struct e820entry *ei = &e820.map[i];
3964 -#else
3965 - if (!is_initial_xendomain())
3966 - return 0;
3967 - for (i = 0; i < machine_e820.nr_map; i++) {
3968 - const struct e820entry *ei = &machine_e820.map[i];
3969 -#endif
3970 -
3971 - if (type && ei->type != type)
3972 - continue;
3973 - if (ei->addr >= end || ei->addr + ei->size <= start)
3974 - continue;
3975 - return 1;
3976 - }
3977 - return 0;
3978 -}
3979 -EXPORT_SYMBOL_GPL(e820_any_mapped);
3980 -
3981 -/*
3982 - * This function checks if the entire range <start,end> is mapped with type.
3983 - *
3984 - * Note: this function only works correct if the e820 table is sorted and
3985 - * not-overlapping, which is the case
3986 - */
3987 -int __init e820_all_mapped(unsigned long start, unsigned long end,
3988 - unsigned type)
3989 -{
3990 - int i;
3991 -
3992 -#ifndef CONFIG_XEN
3993 - for (i = 0; i < e820.nr_map; i++) {
3994 - struct e820entry *ei = &e820.map[i];
3995 -#else
3996 - if (!is_initial_xendomain())
3997 - return 0;
3998 - for (i = 0; i < machine_e820.nr_map; i++) {
3999 - const struct e820entry *ei = &machine_e820.map[i];
4000 -#endif
4001 -
4002 - if (type && ei->type != type)
4003 - continue;
4004 - /* is the region (part) in overlap with the current region ?*/
4005 - if (ei->addr >= end || ei->addr + ei->size <= start)
4006 - continue;
4007 -
4008 - /* if the region is at the beginning of <start,end> we move
4009 - * start to the end of the region since it's ok until there
4010 - */
4011 - if (ei->addr <= start)
4012 - start = ei->addr + ei->size;
4013 - /*
4014 - * if start is now at or beyond end, we're done, full
4015 - * coverage
4016 - */
4017 - if (start >= end)
4018 - return 1;
4019 - }
4020 - return 0;
4021 -}
4022 -
4023 -/*
4024 - * Find a free area with specified alignment in a specific range.
4025 - */
4026 -unsigned long __init find_e820_area(unsigned long start, unsigned long end,
4027 - unsigned long size, unsigned long align)
4028 -{
4029 - int i;
4030 -
4031 - for (i = 0; i < e820.nr_map; i++) {
4032 - struct e820entry *ei = &e820.map[i];
4033 - unsigned long addr, last;
4034 - unsigned long ei_last;
4035 -
4036 - if (ei->type != E820_RAM)
4037 - continue;
4038 - addr = round_up(ei->addr, align);
4039 - ei_last = ei->addr + ei->size;
4040 - if (addr < start)
4041 - addr = round_up(start, align);
4042 - if (addr >= ei_last)
4043 - continue;
4044 - while (bad_addr(&addr, size, align) && addr+size <= ei_last)
4045 - ;
4046 - last = addr + size;
4047 - if (last > ei_last)
4048 - continue;
4049 - if (last > end)
4050 - continue;
4051 - return addr;
4052 - }
4053 - return -1UL;
4054 -}
4055 -
4056 -/*
4057 - * Find next free range after *start
4058 - */
4059 -unsigned long __init find_e820_area_size(unsigned long start,
4060 - unsigned long *sizep,
4061 - unsigned long align)
4062 -{
4063 - int i;
4064 -
4065 - for (i = 0; i < e820.nr_map; i++) {
4066 - struct e820entry *ei = &e820.map[i];
4067 - unsigned long addr, last;
4068 - unsigned long ei_last;
4069 -
4070 - if (ei->type != E820_RAM)
4071 - continue;
4072 - addr = round_up(ei->addr, align);
4073 - ei_last = ei->addr + ei->size;
4074 - if (addr < start)
4075 - addr = round_up(start, align);
4076 - if (addr >= ei_last)
4077 - continue;
4078 - *sizep = ei_last - addr;
4079 - while (bad_addr_size(&addr, sizep, align) &&
4080 - addr + *sizep <= ei_last)
4081 - ;
4082 - last = addr + *sizep;
4083 - if (last > ei_last)
4084 - continue;
4085 - return addr;
4086 - }
4087 - return -1UL;
4088 -
4089 -}
4090 -/*
4091 - * Find the highest page frame number we have available
4092 - */
4093 -unsigned long __init e820_end_of_ram(void)
4094 -{
4095 - unsigned long end_pfn;
4096 -
4097 - end_pfn = find_max_pfn_with_active_regions();
4098 -
4099 - if (end_pfn > max_pfn_mapped)
4100 - max_pfn_mapped = end_pfn;
4101 - if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
4102 - max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
4103 - if (end_pfn > end_user_pfn)
4104 - end_pfn = end_user_pfn;
4105 - if (end_pfn > max_pfn_mapped)
4106 - end_pfn = max_pfn_mapped;
4107 -
4108 - printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
4109 - return end_pfn;
4110 -}
4111 -
4112 -/*
4113 - * Mark e820 reserved areas as busy for the resource manager.
4114 - */
4115 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
4116 -{
4117 - int i;
4118 - struct resource *res;
4119 -
4120 - res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
4121 - for (i = 0; i < nr_map; i++) {
4122 - switch (e820[i].type) {
4123 - case E820_RAM: res->name = "System RAM"; break;
4124 - case E820_ACPI: res->name = "ACPI Tables"; break;
4125 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
4126 - default: res->name = "reserved";
4127 - }
4128 - res->start = e820[i].addr;
4129 - res->end = res->start + e820[i].size - 1;
4130 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4131 - insert_resource(&iomem_resource, res);
4132 - res++;
4133 - }
4134 -}
4135 -
4136 -#ifndef CONFIG_XEN
4137 -/*
4138 - * Find the ranges of physical addresses that do not correspond to
4139 - * e820 RAM areas and mark the corresponding pages as nosave for software
4140 - * suspend and suspend to RAM.
4141 - *
4142 - * This function requires the e820 map to be sorted and without any
4143 - * overlapping entries and assumes the first e820 area to be RAM.
4144 - */
4145 -void __init e820_mark_nosave_regions(void)
4146 -{
4147 - int i;
4148 - unsigned long paddr;
4149 -
4150 - paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
4151 - for (i = 1; i < e820.nr_map; i++) {
4152 - struct e820entry *ei = &e820.map[i];
4153 -
4154 - if (paddr < ei->addr)
4155 - register_nosave_region(PFN_DOWN(paddr),
4156 - PFN_UP(ei->addr));
4157 -
4158 - paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
4159 - if (ei->type != E820_RAM)
4160 - register_nosave_region(PFN_UP(ei->addr),
4161 - PFN_DOWN(paddr));
4162 -
4163 - if (paddr >= (end_pfn << PAGE_SHIFT))
4164 - break;
4165 - }
4166 -}
4167 -#endif
4168 -
4169 -/*
4170 - * Finds an active region in the address range from start_pfn to end_pfn and
4171 - * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
4172 - */
4173 -static int __init e820_find_active_region(const struct e820entry *ei,
4174 - unsigned long start_pfn,
4175 - unsigned long end_pfn,
4176 - unsigned long *ei_startpfn,
4177 - unsigned long *ei_endpfn)
4178 -{
4179 - *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
4180 - *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
4181 -
4182 - /* Skip map entries smaller than a page */
4183 - if (*ei_startpfn >= *ei_endpfn)
4184 - return 0;
4185 -
4186 - /* Check if max_pfn_mapped should be updated */
4187 - if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
4188 - max_pfn_mapped = *ei_endpfn;
4189 -
4190 - /* Skip if map is outside the node */
4191 - if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
4192 - *ei_startpfn >= end_pfn)
4193 - return 0;
4194 -
4195 - /* Check for overlaps */
4196 - if (*ei_startpfn < start_pfn)
4197 - *ei_startpfn = start_pfn;
4198 - if (*ei_endpfn > end_pfn)
4199 - *ei_endpfn = end_pfn;
4200 -
4201 - /* Obey end_user_pfn to save on memmap */
4202 - if (*ei_startpfn >= end_user_pfn)
4203 - return 0;
4204 - if (*ei_endpfn > end_user_pfn)
4205 - *ei_endpfn = end_user_pfn;
4206 -
4207 - return 1;
4208 -}
4209 -
4210 -/* Walk the e820 map and register active regions within a node */
4211 -void __init
4212 -e820_register_active_regions(int nid, unsigned long start_pfn,
4213 - unsigned long end_pfn)
4214 -{
4215 - unsigned long ei_startpfn;
4216 - unsigned long ei_endpfn;
4217 - int i;
4218 -
4219 - for (i = 0; i < e820.nr_map; i++)
4220 - if (e820_find_active_region(&e820.map[i],
4221 - start_pfn, end_pfn,
4222 - &ei_startpfn, &ei_endpfn))
4223 - add_active_range(nid, ei_startpfn, ei_endpfn);
4224 -}
4225 -
4226 -/*
4227 - * Add a memory region to the kernel e820 map.
4228 - */
4229 -void __init add_memory_region(unsigned long start, unsigned long size, int type)
4230 -{
4231 - int x = e820.nr_map;
4232 -
4233 - if (x == E820MAX) {
4234 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
4235 - return;
4236 - }
4237 -
4238 - e820.map[x].addr = start;
4239 - e820.map[x].size = size;
4240 - e820.map[x].type = type;
4241 - e820.nr_map++;
4242 -}
4243 -
4244 -/*
4245 - * Find the hole size (in bytes) in the memory range.
4246 - * @start: starting address of the memory range to scan
4247 - * @end: ending address of the memory range to scan
4248 - */
4249 -unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
4250 -{
4251 - unsigned long start_pfn = start >> PAGE_SHIFT;
4252 - unsigned long end_pfn = end >> PAGE_SHIFT;
4253 - unsigned long ei_startpfn, ei_endpfn, ram = 0;
4254 - int i;
4255 -
4256 - for (i = 0; i < e820.nr_map; i++) {
4257 - if (e820_find_active_region(&e820.map[i],
4258 - start_pfn, end_pfn,
4259 - &ei_startpfn, &ei_endpfn))
4260 - ram += ei_endpfn - ei_startpfn;
4261 - }
4262 - return end - start - (ram << PAGE_SHIFT);
4263 -}
4264 -
4265 -static void __init e820_print_map(char *who)
4266 -{
4267 - int i;
4268 -
4269 - for (i = 0; i < e820.nr_map; i++) {
4270 - printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
4271 - (unsigned long long) e820.map[i].addr,
4272 - (unsigned long long)
4273 - (e820.map[i].addr + e820.map[i].size));
4274 - switch (e820.map[i].type) {
4275 - case E820_RAM:
4276 - printk(KERN_CONT "(usable)\n");
4277 - break;
4278 - case E820_RESERVED:
4279 - printk(KERN_CONT "(reserved)\n");
4280 - break;
4281 - case E820_ACPI:
4282 - printk(KERN_CONT "(ACPI data)\n");
4283 - break;
4284 - case E820_NVS:
4285 - printk(KERN_CONT "(ACPI NVS)\n");
4286 - break;
4287 - default:
4288 - printk(KERN_CONT "type %u\n", e820.map[i].type);
4289 - break;
4290 - }
4291 - }
4292 -}
4293 -
4294 -/*
4295 - * Sanitize the BIOS e820 map.
4296 - *
4297 - * Some e820 responses include overlapping entries. The following
4298 - * replaces the original e820 map with a new one, removing overlaps.
4299 - *
4300 - */
4301 -static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
4302 -{
4303 - struct change_member {
4304 - struct e820entry *pbios; /* pointer to original bios entry */
4305 - unsigned long long addr; /* address for this change point */
4306 - };
4307 - static struct change_member change_point_list[2*E820MAX] __initdata;
4308 - static struct change_member *change_point[2*E820MAX] __initdata;
4309 - static struct e820entry *overlap_list[E820MAX] __initdata;
4310 - static struct e820entry new_bios[E820MAX] __initdata;
4311 - struct change_member *change_tmp;
4312 - unsigned long current_type, last_type;
4313 - unsigned long long last_addr;
4314 - int chgidx, still_changing;
4315 - int overlap_entries;
4316 - int new_bios_entry;
4317 - int old_nr, new_nr, chg_nr;
4318 - int i;
4319 -
4320 - /*
4321 - Visually we're performing the following
4322 - (1,2,3,4 = memory types)...
4323 -
4324 - Sample memory map (w/overlaps):
4325 - ____22__________________
4326 - ______________________4_
4327 - ____1111________________
4328 - _44_____________________
4329 - 11111111________________
4330 - ____________________33__
4331 - ___________44___________
4332 - __________33333_________
4333 - ______________22________
4334 - ___________________2222_
4335 - _________111111111______
4336 - _____________________11_
4337 - _________________4______
4338 -
4339 - Sanitized equivalent (no overlap):
4340 - 1_______________________
4341 - _44_____________________
4342 - ___1____________________
4343 - ____22__________________
4344 - ______11________________
4345 - _________1______________
4346 - __________3_____________
4347 - ___________44___________
4348 - _____________33_________
4349 - _______________2________
4350 - ________________1_______
4351 - _________________4______
4352 - ___________________2____
4353 - ____________________33__
4354 - ______________________4_
4355 - */
4356 -
4357 - /* if there's only one memory region, don't bother */
4358 - if (*pnr_map < 2)
4359 - return -1;
4360 -
4361 - old_nr = *pnr_map;
4362 -
4363 - /* bail out if we find any unreasonable addresses in bios map */
4364 - for (i = 0; i < old_nr; i++)
4365 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
4366 - return -1;
4367 -
4368 - /* create pointers for initial change-point information (for sorting) */
4369 - for (i = 0; i < 2 * old_nr; i++)
4370 - change_point[i] = &change_point_list[i];
4371 -
4372 - /* record all known change-points (starting and ending addresses),
4373 - omitting those that are for empty memory regions */
4374 - chgidx = 0;
4375 - for (i = 0; i < old_nr; i++) {
4376 - if (biosmap[i].size != 0) {
4377 - change_point[chgidx]->addr = biosmap[i].addr;
4378 - change_point[chgidx++]->pbios = &biosmap[i];
4379 - change_point[chgidx]->addr = biosmap[i].addr +
4380 - biosmap[i].size;
4381 - change_point[chgidx++]->pbios = &biosmap[i];
4382 - }
4383 - }
4384 - chg_nr = chgidx;
4385 -
4386 - /* sort change-point list by memory addresses (low -> high) */
4387 - still_changing = 1;
4388 - while (still_changing) {
4389 - still_changing = 0;
4390 - for (i = 1; i < chg_nr; i++) {
4391 - unsigned long long curaddr, lastaddr;
4392 - unsigned long long curpbaddr, lastpbaddr;
4393 -
4394 - curaddr = change_point[i]->addr;
4395 - lastaddr = change_point[i - 1]->addr;
4396 - curpbaddr = change_point[i]->pbios->addr;
4397 - lastpbaddr = change_point[i - 1]->pbios->addr;
4398 -
4399 - /*
4400 - * swap entries, when:
4401 - *
4402 - * curaddr > lastaddr or
4403 - * curaddr == lastaddr and curaddr == curpbaddr and
4404 - * lastaddr != lastpbaddr
4405 - */
4406 - if (curaddr < lastaddr ||
4407 - (curaddr == lastaddr && curaddr == curpbaddr &&
4408 - lastaddr != lastpbaddr)) {
4409 - change_tmp = change_point[i];
4410 - change_point[i] = change_point[i-1];
4411 - change_point[i-1] = change_tmp;
4412 - still_changing = 1;
4413 - }
4414 - }
4415 - }
4416 -
4417 - /* create a new bios memory map, removing overlaps */
4418 - overlap_entries = 0; /* number of entries in the overlap table */
4419 - new_bios_entry = 0; /* index for creating new bios map entries */
4420 - last_type = 0; /* start with undefined memory type */
4421 - last_addr = 0; /* start with 0 as last starting address */
4422 -
4423 - /* loop through change-points, determining affect on the new bios map */
4424 - for (chgidx = 0; chgidx < chg_nr; chgidx++) {
4425 - /* keep track of all overlapping bios entries */
4426 - if (change_point[chgidx]->addr ==
4427 - change_point[chgidx]->pbios->addr) {
4428 - /*
4429 - * add map entry to overlap list (> 1 entry
4430 - * implies an overlap)
4431 - */
4432 - overlap_list[overlap_entries++] =
4433 - change_point[chgidx]->pbios;
4434 - } else {
4435 - /*
4436 - * remove entry from list (order independent,
4437 - * so swap with last)
4438 - */
4439 - for (i = 0; i < overlap_entries; i++) {
4440 - if (overlap_list[i] ==
4441 - change_point[chgidx]->pbios)
4442 - overlap_list[i] =
4443 - overlap_list[overlap_entries-1];
4444 - }
4445 - overlap_entries--;
4446 - }
4447 - /*
4448 - * if there are overlapping entries, decide which
4449 - * "type" to use (larger value takes precedence --
4450 - * 1=usable, 2,3,4,4+=unusable)
4451 - */
4452 - current_type = 0;
4453 - for (i = 0; i < overlap_entries; i++)
4454 - if (overlap_list[i]->type > current_type)
4455 - current_type = overlap_list[i]->type;
4456 - /*
4457 - * continue building up new bios map based on this
4458 - * information
4459 - */
4460 - if (current_type != last_type) {
4461 - if (last_type != 0) {
4462 - new_bios[new_bios_entry].size =
4463 - change_point[chgidx]->addr - last_addr;
4464 - /*
4465 - * move forward only if the new size
4466 - * was non-zero
4467 - */
4468 - if (new_bios[new_bios_entry].size != 0)
4469 - /*
4470 - * no more space left for new
4471 - * bios entries ?
4472 - */
4473 - if (++new_bios_entry >= E820MAX)
4474 - break;
4475 - }
4476 - if (current_type != 0) {
4477 - new_bios[new_bios_entry].addr =
4478 - change_point[chgidx]->addr;
4479 - new_bios[new_bios_entry].type = current_type;
4480 - last_addr = change_point[chgidx]->addr;
4481 - }
4482 - last_type = current_type;
4483 - }
4484 - }
4485 - /* retain count for new bios entries */
4486 - new_nr = new_bios_entry;
4487 -
4488 - /* copy new bios mapping into original location */
4489 - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
4490 - *pnr_map = new_nr;
4491 -
4492 - return 0;
4493 -}
4494 -
4495 -/*
4496 - * Copy the BIOS e820 map into a safe place.
4497 - *
4498 - * Sanity-check it while we're at it..
4499 - *
4500 - * If we're lucky and live on a modern system, the setup code
4501 - * will have given us a memory map that we can use to properly
4502 - * set up memory. If we aren't, we'll fake a memory map.
4503 - */
4504 -static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
4505 -{
4506 -#ifndef CONFIG_XEN
4507 - /* Only one memory region (or negative)? Ignore it */
4508 - if (nr_map < 2)
4509 - return -1;
4510 -#else
4511 - BUG_ON(nr_map < 1);
4512 -#endif
4513 -
4514 - do {
4515 - u64 start = biosmap->addr;
4516 - u64 size = biosmap->size;
4517 - u64 end = start + size;
4518 - u32 type = biosmap->type;
4519 -
4520 - /* Overflow in 64 bits? Ignore the memory map. */
4521 - if (start > end)
4522 - return -1;
4523 -
4524 - add_memory_region(start, size, type);
4525 - } while (biosmap++, --nr_map);
4526 -
4527 -#ifdef CONFIG_XEN
4528 - if (is_initial_xendomain()) {
4529 - struct xen_memory_map memmap;
4530 -
4531 - memmap.nr_entries = E820MAX;
4532 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
4533 -
4534 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
4535 - BUG();
4536 - machine_e820.nr_map = memmap.nr_entries;
4537 - } else
4538 - machine_e820 = e820;
4539 -#endif
4540 -
4541 - return 0;
4542 -}
4543 -
4544 -static void early_panic(char *msg)
4545 -{
4546 - early_printk(msg);
4547 - panic(msg);
4548 -}
4549 -
4550 -/* We're not void only for x86 32-bit compat */
4551 -char * __init machine_specific_memory_setup(void)
4552 -{
4553 -#ifndef CONFIG_XEN
4554 - char *who = "BIOS-e820";
4555 - /*
4556 - * Try to copy the BIOS-supplied E820-map.
4557 - *
4558 - * Otherwise fake a memory map; one section from 0k->640k,
4559 - * the next section from 1mb->appropriate_mem_k
4560 - */
4561 - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
4562 - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
4563 - early_panic("Cannot find a valid memory map");
4564 -#else /* CONFIG_XEN */
4565 - char *who = "Xen";
4566 - int rc;
4567 - struct xen_memory_map memmap;
4568 - /*
4569 - * This is rather large for a stack variable but this early in
4570 - * the boot process we know we have plenty slack space.
4571 - */
4572 - struct e820entry map[E820MAX];
4573 -
4574 - memmap.nr_entries = E820MAX;
4575 - set_xen_guest_handle(memmap.buffer, map);
4576 -
4577 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
4578 - if ( rc == -ENOSYS ) {
4579 - memmap.nr_entries = 1;
4580 - map[0].addr = 0ULL;
4581 - map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
4582 - /* 8MB slack (to balance backend allocations). */
4583 - map[0].size += 8 << 20;
4584 - map[0].type = E820_RAM;
4585 - rc = 0;
4586 - }
4587 - BUG_ON(rc);
4588 -
4589 - sanitize_e820_map(map, (char *)&memmap.nr_entries);
4590 -
4591 - if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
4592 - early_panic("Cannot find a valid memory map");
4593 -#endif
4594 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
4595 - e820_print_map(who);
4596 -
4597 - /* In case someone cares... */
4598 - return who;
4599 -}
4600 -
4601 -static int __init parse_memopt(char *p)
4602 -{
4603 - int i;
4604 - unsigned long current_end;
4605 - unsigned long end;
4606 -
4607 - if (!p)
4608 - return -EINVAL;
4609 - end_user_pfn = memparse(p, &p);
4610 - end_user_pfn >>= PAGE_SHIFT;
4611 -
4612 - end = end_user_pfn<<PAGE_SHIFT;
4613 - i = e820.nr_map-1;
4614 - current_end = e820.map[i].addr + e820.map[i].size;
4615 -
4616 - if (current_end < end) {
4617 - /*
4618 - * The e820 map ends before our requested size so
4619 - * extend the final entry to the requested address.
4620 - */
4621 - if (e820.map[i].type == E820_RAM)
4622 - e820.map[i].size = end - e820.map[i].addr;
4623 - else
4624 - add_memory_region(current_end, end - current_end, E820_RAM);
4625 - }
4626 -
4627 - return 0;
4628 -}
4629 -early_param("mem", parse_memopt);
4630 -
4631 -static int userdef __initdata;
4632 -
4633 -static int __init parse_memmap_opt(char *p)
4634 -{
4635 - char *oldp;
4636 - unsigned long long start_at, mem_size;
4637 -
4638 - if (!strcmp(p, "exactmap")) {
4639 -#ifdef CONFIG_CRASH_DUMP
4640 - /*
4641 - * If we are doing a crash dump, we still need to know
4642 - * the real mem size before original memory map is
4643 - * reset.
4644 - */
4645 - e820_register_active_regions(0, 0, -1UL);
4646 - saved_max_pfn = e820_end_of_ram();
4647 - remove_all_active_ranges();
4648 -#endif
4649 - max_pfn_mapped = 0;
4650 - e820.nr_map = 0;
4651 - userdef = 1;
4652 - return 0;
4653 - }
4654 -
4655 - oldp = p;
4656 - mem_size = memparse(p, &p);
4657 - if (p == oldp)
4658 - return -EINVAL;
4659 -
4660 - userdef = 1;
4661 - if (*p == '@') {
4662 - start_at = memparse(p+1, &p);
4663 - add_memory_region(start_at, mem_size, E820_RAM);
4664 - } else if (*p == '#') {
4665 - start_at = memparse(p+1, &p);
4666 - add_memory_region(start_at, mem_size, E820_ACPI);
4667 - } else if (*p == '$') {
4668 - start_at = memparse(p+1, &p);
4669 - add_memory_region(start_at, mem_size, E820_RESERVED);
4670 - } else {
4671 - end_user_pfn = (mem_size >> PAGE_SHIFT);
4672 - }
4673 - return *p == '\0' ? 0 : -EINVAL;
4674 -}
4675 -early_param("memmap", parse_memmap_opt);
4676 -
4677 -void __init finish_e820_parsing(void)
4678 -{
4679 - if (userdef) {
4680 - char nr = e820.nr_map;
4681 -
4682 - if (sanitize_e820_map(e820.map, &nr) < 0)
4683 - early_panic("Invalid user supplied memory map");
4684 - e820.nr_map = nr;
4685 -
4686 - printk(KERN_INFO "user-defined physical RAM map:\n");
4687 - e820_print_map("user");
4688 - }
4689 -}
4690 -
4691 -#ifndef CONFIG_XEN
4692 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
4693 - unsigned new_type)
4694 -{
4695 - int i;
4696 -
4697 - BUG_ON(old_type == new_type);
4698 -
4699 - for (i = 0; i < e820.nr_map; i++) {
4700 - struct e820entry *ei = &e820.map[i];
4701 - u64 final_start, final_end;
4702 - if (ei->type != old_type)
4703 - continue;
4704 - /* totally covered? */
4705 - if (ei->addr >= start && ei->size <= size) {
4706 - ei->type = new_type;
4707 - continue;
4708 - }
4709 - /* partially covered */
4710 - final_start = max(start, ei->addr);
4711 - final_end = min(start + size, ei->addr + ei->size);
4712 - if (final_start >= final_end)
4713 - continue;
4714 - add_memory_region(final_start, final_end - final_start,
4715 - new_type);
4716 - }
4717 -}
4718 -
4719 -void __init update_e820(void)
4720 -{
4721 - u8 nr_map;
4722 -
4723 - nr_map = e820.nr_map;
4724 - if (sanitize_e820_map(e820.map, &nr_map))
4725 - return;
4726 - e820.nr_map = nr_map;
4727 - printk(KERN_INFO "modified physical RAM map:\n");
4728 - e820_print_map("modified");
4729 -}
4730 -#endif
4731 -
4732 -unsigned long pci_mem_start = 0xaeedbabe;
4733 -EXPORT_SYMBOL(pci_mem_start);
4734 -
4735 -/*
4736 - * Search for the biggest gap in the low 32 bits of the e820
4737 - * memory space. We pass this space to PCI to assign MMIO resources
4738 - * for hotplug or unconfigured devices in.
4739 - * Hopefully the BIOS let enough space left.
4740 - */
4741 -__init void e820_setup_gap(struct e820entry *e820, int nr_map)
4742 -{
4743 - unsigned long gapstart, gapsize, round;
4744 - unsigned long last;
4745 - int i;
4746 - int found = 0;
4747 -
4748 - last = 0x100000000ull;
4749 - gapstart = 0x10000000;
4750 - gapsize = 0x400000;
4751 - i = nr_map;
4752 - while (--i >= 0) {
4753 - unsigned long long start = e820[i].addr;
4754 - unsigned long long end = start + e820[i].size;
4755 -
4756 - /*
4757 - * Since "last" is at most 4GB, we know we'll
4758 - * fit in 32 bits if this condition is true
4759 - */
4760 - if (last > end) {
4761 - unsigned long gap = last - end;
4762 -
4763 - if (gap > gapsize) {
4764 - gapsize = gap;
4765 - gapstart = end;
4766 - found = 1;
4767 - }
4768 - }
4769 - if (start < last)
4770 - last = start;
4771 - }
4772 -
4773 - if (!found) {
4774 - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
4775 - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
4776 - "address range\n"
4777 - KERN_ERR "PCI: Unassigned devices with 32bit resource "
4778 - "registers may break!\n");
4779 - }
4780 -
4781 - /*
4782 - * See how much we want to round up: start off with
4783 - * rounding to the next 1MB area.
4784 - */
4785 - round = 0x100000;
4786 - while ((gapsize >> 4) > round)
4787 - round += round;
4788 - /* Fun with two's complement */
4789 - pci_mem_start = (gapstart + round) & -round;
4790 -
4791 - printk(KERN_INFO
4792 - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
4793 - pci_mem_start, gapstart, gapsize);
4794 -}
4795 -
4796 -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
4797 -{
4798 - int i;
4799 -
4800 - if (slot < 0 || slot >= e820.nr_map)
4801 - return -1;
4802 - for (i = slot; i < e820.nr_map; i++) {
4803 - if (e820.map[i].type != E820_RAM)
4804 - continue;
4805 - break;
4806 - }
4807 - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
4808 - return -1;
4809 - *addr = e820.map[i].addr;
4810 - *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
4811 - max_pfn << PAGE_SHIFT) - *addr;
4812 - return i + 1;
4813 -}
4814 Index: head-2008-12-01/arch/x86/kernel/early_printk-xen.c
4815 ===================================================================
4816 --- head-2008-12-01.orig/arch/x86/kernel/early_printk-xen.c 2008-12-01 11:44:55.000000000 +0100
4817 +++ head-2008-12-01/arch/x86/kernel/early_printk-xen.c 2008-12-01 11:49:07.000000000 +0100
4818 @@ -225,7 +225,7 @@ static struct console simnow_console = {
4819 static struct console *early_console = &early_vga_console;
4820 static int early_console_initialized;
4821
4822 -void early_printk(const char *fmt, ...)
4823 +asmlinkage void early_printk(const char *fmt, ...)
4824 {
4825 char buf[512];
4826 int n;
4827 Index: head-2008-12-01/arch/x86/kernel/entry_32-xen.S
4828 ===================================================================
4829 --- head-2008-12-01.orig/arch/x86/kernel/entry_32-xen.S 2008-12-01 11:44:55.000000000 +0100
4830 +++ head-2008-12-01/arch/x86/kernel/entry_32-xen.S 2008-12-01 11:49:07.000000000 +0100
4831 @@ -51,15 +51,26 @@
4832 #include <asm/percpu.h>
4833 #include <asm/dwarf2.h>
4834 #include <asm/processor-flags.h>
4835 -#include "irq_vectors.h"
4836 +#include <asm/ftrace.h>
4837 +#include <asm/irq_vectors.h>
4838 #include <xen/interface/xen.h>
4839
4840 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
4841 +#include <linux/elf-em.h>
4842 +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
4843 +#define __AUDIT_ARCH_LE 0x40000000
4844 +
4845 +#ifndef CONFIG_AUDITSYSCALL
4846 +#define sysenter_audit syscall_trace_entry
4847 +#define sysexit_audit syscall_exit_work
4848 +#endif
4849 +
4850 /*
4851 * We use macros for low-level operations which need to be overridden
4852 * for paravirtualization. The following will never clobber any registers:
4853 * INTERRUPT_RETURN (aka. "iret")
4854 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
4855 - * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
4856 + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
4857 *
4858 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
4859 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
4860 @@ -277,11 +288,6 @@ END(resume_kernel)
4861 #endif
4862 CFI_ENDPROC
4863
4864 - .macro test_tif ti_reg # system call tracing in operation / emulation
4865 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
4866 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
4867 - .endm
4868 -
4869 /* SYSENTER_RETURN points to after the "sysenter" instruction in
4870 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
4871
4872 @@ -338,8 +344,9 @@ sysenter_past_esp:
4873 .previous
4874
4875 GET_THREAD_INFO(%ebp)
4876 - test_tif %ebp
4877 - jnz syscall_trace_entry
4878 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4879 + jnz sysenter_audit
4880 +sysenter_do_call:
4881 cmpl $(nr_syscalls), %eax
4882 jae syscall_badsys
4883 call *sys_call_table(,%eax,4)
4884 @@ -349,14 +356,54 @@ sysenter_past_esp:
4885 TRACE_IRQS_OFF
4886 movl TI_flags(%ebp), %ecx
4887 testw $_TIF_ALLWORK_MASK, %cx
4888 - jne syscall_exit_work
4889 + jne sysexit_audit
4890 +sysenter_exit:
4891 /* if something modifies registers it must also disable sysexit */
4892 movl PT_EIP(%esp), %edx
4893 movl PT_OLDESP(%esp), %ecx
4894 xorl %ebp,%ebp
4895 TRACE_IRQS_ON
4896 1: mov PT_FS(%esp), %fs
4897 - ENABLE_INTERRUPTS_SYSCALL_RET
4898 + ENABLE_INTERRUPTS_SYSEXIT
4899 +
4900 +#ifdef CONFIG_AUDITSYSCALL
4901 +sysenter_audit:
4902 + testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
4903 + jnz syscall_trace_entry
4904 + addl $4,%esp
4905 + CFI_ADJUST_CFA_OFFSET -4
4906 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
4907 + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
4908 + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
4909 + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
4910 + movl %eax,%edx /* 2nd arg: syscall number */
4911 + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
4912 + call audit_syscall_entry
4913 + pushl %ebx
4914 + CFI_ADJUST_CFA_OFFSET 4
4915 + movl PT_EAX(%esp),%eax /* reload syscall number */
4916 + jmp sysenter_do_call
4917 +
4918 +sysexit_audit:
4919 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4920 + jne syscall_exit_work
4921 + TRACE_IRQS_ON
4922 + ENABLE_INTERRUPTS(CLBR_ANY)
4923 + movl %eax,%edx /* second arg, syscall return value */
4924 + cmpl $0,%eax /* is it < 0? */
4925 + setl %al /* 1 if so, 0 if not */
4926 + movzbl %al,%eax /* zero-extend that */
4927 + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
4928 + call audit_syscall_exit
4929 + DISABLE_INTERRUPTS(CLBR_ANY)
4930 + TRACE_IRQS_OFF
4931 + movl TI_flags(%ebp), %ecx
4932 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4933 + jne syscall_exit_work
4934 + movl PT_EAX(%esp),%eax /* reload syscall return value */
4935 + jmp sysenter_exit
4936 +#endif
4937 +
4938 CFI_ENDPROC
4939 .pushsection .fixup,"ax"
4940 2: movl $0,PT_FS(%esp)
4941 @@ -400,7 +447,7 @@ ENTRY(system_call)
4942 CFI_ADJUST_CFA_OFFSET 4
4943 SAVE_ALL
4944 GET_THREAD_INFO(%ebp)
4945 - test_tif %ebp
4946 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4947 jnz syscall_trace_entry
4948 cmpl $(nr_syscalls), %eax
4949 jae syscall_badsys
4950 @@ -413,10 +460,6 @@ syscall_exit:
4951 # setting need_resched or sigpending
4952 # between sampling and the iret
4953 TRACE_IRQS_OFF
4954 - testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
4955 - jz no_singlestep
4956 - orl $_TIF_SINGLESTEP,TI_flags(%ebp)
4957 -no_singlestep:
4958 movl TI_flags(%ebp), %ecx
4959 testw $_TIF_ALLWORK_MASK, %cx # current->work
4960 jne syscall_exit_work
4961 @@ -588,12 +631,8 @@ END(work_pending)
4962 syscall_trace_entry:
4963 movl $-ENOSYS,PT_EAX(%esp)
4964 movl %esp, %eax
4965 - xorl %edx,%edx
4966 - call do_syscall_trace
4967 - cmpl $0, %eax
4968 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
4969 - # so must skip actual syscall
4970 - movl PT_ORIG_EAX(%esp), %eax
4971 + call syscall_trace_enter
4972 + /* What it returned is what we'll actually use. */
4973 cmpl $(nr_syscalls), %eax
4974 jnae syscall_call
4975 jmp syscall_exit
4976 @@ -602,14 +641,13 @@ END(syscall_trace_entry)
4977 # perform syscall exit tracing
4978 ALIGN
4979 syscall_exit_work:
4980 - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
4981 + testb $_TIF_WORK_SYSCALL_EXIT, %cl
4982 jz work_pending
4983 TRACE_IRQS_ON
4984 - ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
4985 + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
4986 # schedule() instead
4987 movl %esp, %eax
4988 - movl $1, %edx
4989 - call do_syscall_trace
4990 + call syscall_trace_leave
4991 jmp resume_userspace
4992 END(syscall_exit_work)
4993 CFI_ENDPROC
4994 @@ -1109,10 +1147,10 @@ ENTRY(native_iret)
4995 .previous
4996 END(native_iret)
4997
4998 -ENTRY(native_irq_enable_syscall_ret)
4999 +ENTRY(native_irq_enable_sysexit)
5000 sti
5001 sysexit
5002 -END(native_irq_enable_syscall_ret)
5003 +END(native_irq_enable_sysexit)
5004 #endif
5005
5006 KPROBE_ENTRY(int3)
5007 @@ -1261,6 +1299,77 @@ ENTRY(kernel_thread_helper)
5008 CFI_ENDPROC
5009 ENDPROC(kernel_thread_helper)
5010
5011 +#ifdef CONFIG_FTRACE
5012 +#ifdef CONFIG_DYNAMIC_FTRACE
5013 +
5014 +ENTRY(mcount)
5015 + pushl %eax
5016 + pushl %ecx
5017 + pushl %edx
5018 + movl 0xc(%esp), %eax
5019 + subl $MCOUNT_INSN_SIZE, %eax
5020 +
5021 +.globl mcount_call
5022 +mcount_call:
5023 + call ftrace_stub
5024 +
5025 + popl %edx
5026 + popl %ecx
5027 + popl %eax
5028 +
5029 + ret
5030 +END(mcount)
5031 +
5032 +ENTRY(ftrace_caller)
5033 + pushl %eax
5034 + pushl %ecx
5035 + pushl %edx
5036 + movl 0xc(%esp), %eax
5037 + movl 0x4(%ebp), %edx
5038 + subl $MCOUNT_INSN_SIZE, %eax
5039 +
5040 +.globl ftrace_call
5041 +ftrace_call:
5042 + call ftrace_stub
5043 +
5044 + popl %edx
5045 + popl %ecx
5046 + popl %eax
5047 +
5048 +.globl ftrace_stub
5049 +ftrace_stub:
5050 + ret
5051 +END(ftrace_caller)
5052 +
5053 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5054 +
5055 +ENTRY(mcount)
5056 + cmpl $ftrace_stub, ftrace_trace_function
5057 + jnz trace
5058 +.globl ftrace_stub
5059 +ftrace_stub:
5060 + ret
5061 +
5062 + /* taken from glibc */
5063 +trace:
5064 + pushl %eax
5065 + pushl %ecx
5066 + pushl %edx
5067 + movl 0xc(%esp), %eax
5068 + movl 0x4(%ebp), %edx
5069 + subl $MCOUNT_INSN_SIZE, %eax
5070 +
5071 + call *ftrace_trace_function
5072 +
5073 + popl %edx
5074 + popl %ecx
5075 + popl %eax
5076 +
5077 + jmp ftrace_stub
5078 +END(mcount)
5079 +#endif /* CONFIG_DYNAMIC_FTRACE */
5080 +#endif /* CONFIG_FTRACE */
5081 +
5082 #include <asm/alternative-asm.h>
5083
5084 # pv syscall call handler stub
5085 @@ -1286,7 +1395,7 @@ ENTRY(ia32pv_cstar_target)
5086 .previous
5087 SAVE_ALL
5088 GET_THREAD_INFO(%ebp)
5089 - test_tif %ebp
5090 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
5091 jnz cstar_trace_entry
5092 cmpl $nr_syscalls,%eax
5093 jae cstar_badsys
5094 @@ -1320,29 +1429,21 @@ cstar_trace_entry:
5095 btl %eax,cstar_special
5096 jc .Lcstar_trace_special
5097 1: movl %esp,%eax
5098 - xorl %edx,%edx
5099 LOCK_PREFIX
5100 orl $_TIF_CSTAR,TI_flags(%ebp)
5101 - call do_syscall_trace
5102 + call syscall_trace_enter
5103 LOCK_PREFIX
5104 andl $~_TIF_CSTAR,TI_flags(%ebp)
5105 - testl %eax,%eax
5106 - jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
5107 - # so must skip actual syscall
5108 - movl PT_ORIG_EAX(%esp),%eax
5109 + /* What it returned is what we'll actually use. */
5110 cmpl $nr_syscalls,%eax
5111 jb .Lcstar_call
5112 jmp .Lcstar_exit
5113 .Lcstar_trace_special:
5114 movl PT_ECX(%esp),%ecx
5115 movl %esp,%eax
5116 - xorl %edx,%edx
5117 movl %ecx,PT_EBP(%esp) # put user EBP back in place
5118 - call do_syscall_trace
5119 - testl %eax,%eax
5120 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5121 - # so must skip actual syscall
5122 - movl PT_ORIG_EAX(%esp),%eax
5123 + call syscall_trace_enter
5124 + /* What it returned is what we'll actually use. */
5125 cmpl $nr_syscalls,%eax
5126 jb syscall_call
5127 jmp syscall_exit
5128 Index: head-2008-12-01/arch/x86/kernel/entry_64.S
5129 ===================================================================
5130 --- head-2008-12-01.orig/arch/x86/kernel/entry_64.S 2008-12-03 15:48:43.000000000 +0100
5131 +++ head-2008-12-01/arch/x86/kernel/entry_64.S 2008-12-01 11:49:07.000000000 +0100
5132 @@ -1409,7 +1409,7 @@ ENTRY(arch_unwind_init_running)
5133 ENDPROC(arch_unwind_init_running)
5134 #endif
5135
5136 -#ifdef CONFIG_XEN
5137 +#ifdef CONFIG_PARAVIRT_XEN
5138 ENTRY(xen_hypervisor_callback)
5139 zeroentry xen_do_hypervisor_callback
5140 END(xen_hypervisor_callback)
5141 @@ -1507,4 +1507,4 @@ ENTRY(xen_failsafe_callback)
5142 CFI_ENDPROC
5143 END(xen_failsafe_callback)
5144
5145 -#endif /* CONFIG_XEN */
5146 +#endif /* CONFIG_PARAVIRT_XEN */
5147 Index: head-2008-12-01/arch/x86/kernel/entry_64-xen.S
5148 ===================================================================
5149 --- head-2008-12-01.orig/arch/x86/kernel/entry_64-xen.S 2008-12-01 11:44:55.000000000 +0100
5150 +++ head-2008-12-01/arch/x86/kernel/entry_64-xen.S 2008-12-01 11:49:07.000000000 +0100
5151 @@ -53,19 +53,130 @@
5152 #include <asm/hw_irq.h>
5153 #include <asm/page.h>
5154 #include <asm/irqflags.h>
5155 +#include <asm/ftrace.h>
5156 #include <asm/errno.h>
5157 #include <xen/interface/xen.h>
5158 #include <xen/interface/features.h>
5159
5160 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
5161 +#include <linux/elf-em.h>
5162 +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
5163 +#define __AUDIT_ARCH_64BIT 0x80000000
5164 +#define __AUDIT_ARCH_LE 0x40000000
5165 +
5166 .code64
5167
5168 +#ifdef CONFIG_FTRACE
5169 +#ifdef CONFIG_DYNAMIC_FTRACE
5170 +ENTRY(mcount)
5171 +
5172 + subq $0x38, %rsp
5173 + movq %rax, (%rsp)
5174 + movq %rcx, 8(%rsp)
5175 + movq %rdx, 16(%rsp)
5176 + movq %rsi, 24(%rsp)
5177 + movq %rdi, 32(%rsp)
5178 + movq %r8, 40(%rsp)
5179 + movq %r9, 48(%rsp)
5180 +
5181 + movq 0x38(%rsp), %rdi
5182 + subq $MCOUNT_INSN_SIZE, %rdi
5183 +
5184 +.globl mcount_call
5185 +mcount_call:
5186 + call ftrace_stub
5187 +
5188 + movq 48(%rsp), %r9
5189 + movq 40(%rsp), %r8
5190 + movq 32(%rsp), %rdi
5191 + movq 24(%rsp), %rsi
5192 + movq 16(%rsp), %rdx
5193 + movq 8(%rsp), %rcx
5194 + movq (%rsp), %rax
5195 + addq $0x38, %rsp
5196 +
5197 + retq
5198 +END(mcount)
5199 +
5200 +ENTRY(ftrace_caller)
5201 +
5202 + /* taken from glibc */
5203 + subq $0x38, %rsp
5204 + movq %rax, (%rsp)
5205 + movq %rcx, 8(%rsp)
5206 + movq %rdx, 16(%rsp)
5207 + movq %rsi, 24(%rsp)
5208 + movq %rdi, 32(%rsp)
5209 + movq %r8, 40(%rsp)
5210 + movq %r9, 48(%rsp)
5211 +
5212 + movq 0x38(%rsp), %rdi
5213 + movq 8(%rbp), %rsi
5214 + subq $MCOUNT_INSN_SIZE, %rdi
5215 +
5216 +.globl ftrace_call
5217 +ftrace_call:
5218 + call ftrace_stub
5219 +
5220 + movq 48(%rsp), %r9
5221 + movq 40(%rsp), %r8
5222 + movq 32(%rsp), %rdi
5223 + movq 24(%rsp), %rsi
5224 + movq 16(%rsp), %rdx
5225 + movq 8(%rsp), %rcx
5226 + movq (%rsp), %rax
5227 + addq $0x38, %rsp
5228 +
5229 +.globl ftrace_stub
5230 +ftrace_stub:
5231 + retq
5232 +END(ftrace_caller)
5233 +
5234 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5235 +ENTRY(mcount)
5236 + cmpq $ftrace_stub, ftrace_trace_function
5237 + jnz trace
5238 +.globl ftrace_stub
5239 +ftrace_stub:
5240 + retq
5241 +
5242 +trace:
5243 + /* taken from glibc */
5244 + subq $0x38, %rsp
5245 + movq %rax, (%rsp)
5246 + movq %rcx, 8(%rsp)
5247 + movq %rdx, 16(%rsp)
5248 + movq %rsi, 24(%rsp)
5249 + movq %rdi, 32(%rsp)
5250 + movq %r8, 40(%rsp)
5251 + movq %r9, 48(%rsp)
5252 +
5253 + movq 0x38(%rsp), %rdi
5254 + movq 8(%rbp), %rsi
5255 + subq $MCOUNT_INSN_SIZE, %rdi
5256 +
5257 + call *ftrace_trace_function
5258 +
5259 + movq 48(%rsp), %r9
5260 + movq 40(%rsp), %r8
5261 + movq 32(%rsp), %rdi
5262 + movq 24(%rsp), %rsi
5263 + movq 16(%rsp), %rdx
5264 + movq 8(%rsp), %rcx
5265 + movq (%rsp), %rax
5266 + addq $0x38, %rsp
5267 +
5268 + jmp ftrace_stub
5269 +END(mcount)
5270 +#endif /* CONFIG_DYNAMIC_FTRACE */
5271 +#endif /* CONFIG_FTRACE */
5272 +
5273 #ifndef CONFIG_PREEMPT
5274 #define retint_kernel retint_restore_args
5275 #endif
5276
5277 #ifdef CONFIG_PARAVIRT
5278 -ENTRY(native_irq_enable_syscall_ret)
5279 - movq %gs:pda_oldrsp,%rsp
5280 +ENTRY(native_usergs_sysret64)
5281 swapgs
5282 sysretq
5283 #endif /* CONFIG_PARAVIRT */
5284 @@ -102,7 +213,7 @@ NMI_MASK = 0x80000000
5285 .macro FAKE_STACK_FRAME child_rip
5286 /* push in order ss, rsp, eflags, cs, rip */
5287 xorl %eax, %eax
5288 - pushq %rax /* ss */
5289 + pushq $__KERNEL_DS /* ss */
5290 CFI_ADJUST_CFA_OFFSET 8
5291 /*CFI_REL_OFFSET ss,0*/
5292 pushq %rax /* rsp */
5293 @@ -197,13 +308,13 @@ ENTRY(ret_from_fork)
5294 CFI_ADJUST_CFA_OFFSET -4
5295 call schedule_tail
5296 GET_THREAD_INFO(%rcx)
5297 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
5298 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5299 jnz rff_trace
5300 rff_action:
5301 RESTORE_REST
5302 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
5303 je int_ret_from_sys_call
5304 - testl $_TIF_IA32,threadinfo_flags(%rcx)
5305 + testl $_TIF_IA32,TI_flags(%rcx)
5306 jnz int_ret_from_sys_call
5307 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
5308 jmp ret_from_sys_call
5309 @@ -265,8 +376,9 @@ ENTRY(system_call)
5310 SAVE_ARGS -8,0
5311 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
5312 GET_THREAD_INFO(%rcx)
5313 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
5314 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
5315 jnz tracesys
5316 +system_call_fastpath:
5317 cmpq $__NR_syscall_max,%rax
5318 ja badsys
5319 movq %r10,%rcx
5320 @@ -284,7 +396,7 @@ sysret_check:
5321 GET_THREAD_INFO(%rcx)
5322 DISABLE_INTERRUPTS(CLBR_NONE)
5323 TRACE_IRQS_OFF
5324 - movl threadinfo_flags(%rcx),%edx
5325 + movl TI_flags(%rcx),%edx
5326 andl %edi,%edx
5327 jnz sysret_careful
5328 CFI_REMEMBER_STATE
5329 @@ -315,16 +427,16 @@ sysret_careful:
5330 sysret_signal:
5331 TRACE_IRQS_ON
5332 ENABLE_INTERRUPTS(CLBR_NONE)
5333 - testl $_TIF_DO_NOTIFY_MASK,%edx
5334 - jz 1f
5335 -
5336 - /* Really a signal */
5337 +#ifdef CONFIG_AUDITSYSCALL
5338 + bt $TIF_SYSCALL_AUDIT,%edx
5339 + jc sysret_audit
5340 +#endif
5341 /* edx: work flags (arg3) */
5342 leaq do_notify_resume(%rip),%rax
5343 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
5344 xorl %esi,%esi # oldset -> arg2
5345 call ptregscall_common
5346 -1: movl $_TIF_NEED_RESCHED,%edi
5347 + movl $_TIF_WORK_MASK,%edi
5348 /* Use IRET because user could have changed frame. This
5349 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
5350 DISABLE_INTERRUPTS(CLBR_NONE)
5351 @@ -335,14 +447,56 @@ badsys:
5352 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
5353 jmp ret_from_sys_call
5354
5355 +#ifdef CONFIG_AUDITSYSCALL
5356 + /*
5357 + * Fast path for syscall audit without full syscall trace.
5358 + * We just call audit_syscall_entry() directly, and then
5359 + * jump back to the normal fast path.
5360 + */
5361 +auditsys:
5362 + movq %r10,%r9 /* 6th arg: 4th syscall arg */
5363 + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
5364 + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
5365 + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
5366 + movq %rax,%rsi /* 2nd arg: syscall number */
5367 + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
5368 + call audit_syscall_entry
5369 + LOAD_ARGS 0 /* reload call-clobbered registers */
5370 + jmp system_call_fastpath
5371 +
5372 + /*
5373 + * Return fast path for syscall audit. Call audit_syscall_exit()
5374 + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
5375 + * masked off.
5376 + */
5377 +sysret_audit:
5378 + movq %rax,%rsi /* second arg, syscall return value */
5379 + cmpq $0,%rax /* is it < 0? */
5380 + setl %al /* 1 if so, 0 if not */
5381 + movzbl %al,%edi /* zero-extend that into %edi */
5382 + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
5383 + call audit_syscall_exit
5384 + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
5385 + jmp sysret_check
5386 +#endif /* CONFIG_AUDITSYSCALL */
5387 +
5388 /* Do syscall tracing */
5389 tracesys:
5390 +#ifdef CONFIG_AUDITSYSCALL
5391 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5392 + jz auditsys
5393 +#endif
5394 SAVE_REST
5395 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
5396 FIXUP_TOP_OF_STACK %rdi
5397 movq %rsp,%rdi
5398 call syscall_trace_enter
5399 - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
5400 + /*
5401 + * Reload arg registers from stack in case ptrace changed them.
5402 + * We don't reload %rax because syscall_trace_enter() returned
5403 + * the value it wants us to use in the table lookup.
5404 + */
5405 + LOAD_ARGS ARGOFFSET, 1
5406 RESTORE_REST
5407 cmpq $__NR_syscall_max,%rax
5408 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
5409 @@ -356,6 +510,7 @@ tracesys:
5410 * Has correct top of stack, but partial stack frame.
5411 */
5412 .globl int_ret_from_sys_call
5413 + .globl int_with_check
5414 int_ret_from_sys_call:
5415 DISABLE_INTERRUPTS(CLBR_NONE)
5416 TRACE_IRQS_OFF
5417 @@ -370,10 +525,10 @@ int_ret_from_sys_call:
5418 int_with_check:
5419 LOCKDEP_SYS_EXIT_IRQ
5420 GET_THREAD_INFO(%rcx)
5421 - movl threadinfo_flags(%rcx),%edx
5422 + movl TI_flags(%rcx),%edx
5423 andl %edi,%edx
5424 jnz int_careful
5425 - andl $~TS_COMPAT,threadinfo_status(%rcx)
5426 + andl $~TS_COMPAT,TI_status(%rcx)
5427 jmp retint_restore_args
5428
5429 /* Either reschedule or signal or syscall exit tracking needed. */
5430 @@ -399,7 +554,7 @@ int_very_careful:
5431 ENABLE_INTERRUPTS(CLBR_NONE)
5432 SAVE_REST
5433 /* Check for syscall exit trace */
5434 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
5435 + testl $_TIF_WORK_SYSCALL_EXIT,%edx
5436 jz int_signal
5437 pushq %rdi
5438 CFI_ADJUST_CFA_OFFSET 8
5439 @@ -407,7 +562,7 @@ int_very_careful:
5440 call syscall_trace_leave
5441 popq %rdi
5442 CFI_ADJUST_CFA_OFFSET -8
5443 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
5444 + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
5445 jmp int_restore_rest
5446
5447 int_signal:
5448 @@ -416,7 +571,7 @@ int_signal:
5449 movq %rsp,%rdi # &ptregs -> arg1
5450 xorl %esi,%esi # oldset -> arg2
5451 call do_notify_resume
5452 -1: movl $_TIF_NEED_RESCHED,%edi
5453 +1: movl $_TIF_WORK_MASK,%edi
5454 int_restore_rest:
5455 RESTORE_REST
5456 DISABLE_INTERRUPTS(CLBR_NONE)
5457 @@ -443,7 +598,6 @@ END(\label)
5458 PTREGSCALL stub_clone, sys_clone, %r8
5459 PTREGSCALL stub_fork, sys_fork, %rdi
5460 PTREGSCALL stub_vfork, sys_vfork, %rdi
5461 - PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
5462 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
5463 PTREGSCALL stub_iopl, sys_iopl, %rsi
5464
5465 @@ -517,10 +671,12 @@ END(stub_rt_sigreturn)
5466 *
5467 */
5468
5469 -retint_check:
5470 +retint_with_reschedule:
5471 CFI_DEFAULT_STACK adj=1
5472 + movl $_TIF_WORK_MASK,%edi
5473 +retint_check:
5474 LOCKDEP_SYS_EXIT_IRQ
5475 - movl threadinfo_flags(%rcx),%edx
5476 + movl TI_flags(%rcx),%edx
5477 andl %edi,%edx
5478 CFI_REMEMBER_STATE
5479 jnz retint_careful
5480 @@ -565,17 +721,16 @@ retint_signal:
5481 RESTORE_REST
5482 DISABLE_INTERRUPTS(CLBR_NONE)
5483 TRACE_IRQS_OFF
5484 - movl $_TIF_NEED_RESCHED,%edi
5485 GET_THREAD_INFO(%rcx)
5486 - jmp retint_check
5487 + jmp retint_with_reschedule
5488
5489 #ifdef CONFIG_PREEMPT
5490 /* Returning to kernel space. Check if we need preemption */
5491 /* rcx: threadinfo. interrupts off. */
5492 ENTRY(retint_kernel)
5493 - cmpl $0,threadinfo_preempt_count(%rcx)
5494 + cmpl $0,TI_preempt_count(%rcx)
5495 jnz retint_restore_args
5496 - bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
5497 + bt $TIF_NEED_RESCHED,TI_flags(%rcx)
5498 jnc retint_restore_args
5499 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
5500 jnc retint_restore_args
5501 @@ -630,6 +785,9 @@ END(invalidate_interrupt\num)
5502 ENTRY(call_function_interrupt)
5503 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
5504 END(call_function_interrupt)
5505 +ENTRY(call_function_single_interrupt)
5506 + apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
5507 +END(call_function_single_interrupt)
5508 ENTRY(irq_move_cleanup_interrupt)
5509 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
5510 END(irq_move_cleanup_interrupt)
5511 @@ -639,6 +797,10 @@ ENTRY(apic_timer_interrupt)
5512 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
5513 END(apic_timer_interrupt)
5514
5515 +ENTRY(uv_bau_message_intr1)
5516 + apicinterrupt 220,uv_bau_message_interrupt
5517 +END(uv_bau_message_intr1)
5518 +
5519 ENTRY(error_interrupt)
5520 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
5521 END(error_interrupt)
5522 @@ -752,7 +914,7 @@ paranoid_restore\trace:
5523 jmp irq_return
5524 paranoid_userspace\trace:
5525 GET_THREAD_INFO(%rcx)
5526 - movl threadinfo_flags(%rcx),%ebx
5527 + movl TI_flags(%rcx),%ebx
5528 andl $_TIF_WORK_MASK,%ebx
5529 jz paranoid_swapgs\trace
5530 movq %rsp,%rdi /* &pt_regs */
5531 @@ -849,7 +1011,7 @@ error_exit:
5532 testb $3,CS-ARGOFFSET(%rsp)
5533 jz retint_kernel
5534 LOCKDEP_SYS_EXIT_IRQ
5535 - movl threadinfo_flags(%rcx),%edx
5536 + movl TI_flags(%rcx),%edx
5537 movl $_TIF_WORK_MASK,%edi
5538 andl %edi,%edx
5539 jnz retint_careful
5540 @@ -871,11 +1033,11 @@ error_kernelspace:
5541 iret run with kernel gs again, so don't set the user space flag.
5542 B stepping K8s sometimes report an truncated RIP for IRET
5543 exceptions returning to compat mode. Check for these here too. */
5544 - leaq irq_return(%rip),%rbp
5545 - cmpq %rbp,RIP(%rsp)
5546 + leaq irq_return(%rip),%rcx
5547 + cmpq %rcx,RIP(%rsp)
5548 je error_swapgs
5549 - movl %ebp,%ebp /* zero extend */
5550 - cmpq %rbp,RIP(%rsp)
5551 + movl %ecx,%ecx /* zero extend */
5552 + cmpq %rcx,RIP(%rsp)
5553 je error_swapgs
5554 cmpq $gs_change,RIP(%rsp)
5555 je error_swapgs
5556 @@ -1121,6 +1283,7 @@ END(device_not_available)
5557 /* runs on exception stack */
5558 KPROBE_ENTRY(debug)
5559 /* INTR_FRAME
5560 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5561 pushq $0
5562 CFI_ADJUST_CFA_OFFSET 8 */
5563 zeroentry do_debug
5564 @@ -1148,6 +1311,7 @@ END(do_nmi_callback)
5565
5566 KPROBE_ENTRY(int3)
5567 /* INTR_FRAME
5568 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5569 pushq $0
5570 CFI_ADJUST_CFA_OFFSET 8 */
5571 zeroentry do_int3
5572 @@ -1171,14 +1335,11 @@ ENTRY(coprocessor_segment_overrun)
5573 zeroentry do_coprocessor_segment_overrun
5574 END(coprocessor_segment_overrun)
5575
5576 -ENTRY(reserved)
5577 - zeroentry do_reserved
5578 -END(reserved)
5579 -
5580 #if 0
5581 /* runs on exception stack */
5582 ENTRY(double_fault)
5583 XCPT_FRAME
5584 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5585 paranoidentry do_double_fault
5586 jmp paranoid_exit1
5587 CFI_ENDPROC
5588 @@ -1196,6 +1357,7 @@ END(segment_not_present)
5589 /* runs on exception stack */
5590 ENTRY(stack_segment)
5591 /* XCPT_FRAME
5592 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5593 paranoidentry do_stack_segment */
5594 errorentry do_stack_segment
5595 /* jmp paranoid_exit1
5596 @@ -1222,6 +1384,7 @@ END(spurious_interrupt_bug)
5597 /* runs on exception stack */
5598 ENTRY(machine_check)
5599 INTR_FRAME
5600 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5601 pushq $0
5602 CFI_ADJUST_CFA_OFFSET 8
5603 paranoidentry do_machine_check
5604 Index: head-2008-12-01/arch/x86/kernel/genapic_64-xen.c
5605 ===================================================================
5606 --- head-2008-12-01.orig/arch/x86/kernel/genapic_64-xen.c 2008-12-01 11:44:55.000000000 +0100
5607 +++ head-2008-12-01/arch/x86/kernel/genapic_64-xen.c 2008-12-01 11:49:07.000000000 +0100
5608 @@ -58,7 +58,7 @@ void __init setup_apic_routing(void)
5609 else
5610 #endif
5611
5612 - if (num_possible_cpus() <= 8)
5613 + if (max_physical_apicid < 8)
5614 genapic = &apic_flat;
5615 else
5616 genapic = &apic_physflat;
5617 @@ -121,4 +121,5 @@ int is_uv_system(void)
5618 {
5619 return uv_system_type != UV_NONE;
5620 }
5621 +EXPORT_SYMBOL_GPL(is_uv_system);
5622 #endif
5623 Index: head-2008-12-01/arch/x86/kernel/genapic_xen_64.c
5624 ===================================================================
5625 --- head-2008-12-01.orig/arch/x86/kernel/genapic_xen_64.c 2008-12-01 11:44:55.000000000 +0100
5626 +++ head-2008-12-01/arch/x86/kernel/genapic_xen_64.c 2008-12-01 11:49:07.000000000 +0100
5627 @@ -43,7 +43,7 @@ void xen_send_IPI_shortcut(unsigned int
5628 __send_IPI_one(smp_processor_id(), vector);
5629 break;
5630 case APIC_DEST_ALLBUT:
5631 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5632 + for_each_possible_cpu(cpu) {
5633 if (cpu == smp_processor_id())
5634 continue;
5635 if (cpu_isset(cpu, cpu_online_map)) {
5636 @@ -52,7 +52,7 @@ void xen_send_IPI_shortcut(unsigned int
5637 }
5638 break;
5639 case APIC_DEST_ALLINC:
5640 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5641 + for_each_possible_cpu(cpu) {
5642 if (cpu_isset(cpu, cpu_online_map)) {
5643 __send_IPI_one(cpu, vector);
5644 }
5645 @@ -81,8 +81,6 @@ static cpumask_t xen_vector_allocation_d
5646 */
5647 static void xen_init_apic_ldr(void)
5648 {
5649 - Dprintk("%s\n", __FUNCTION__);
5650 - return;
5651 }
5652
5653 static void xen_send_IPI_allbutself(int vector)
5654 @@ -92,14 +90,12 @@ static void xen_send_IPI_allbutself(int
5655 * we get an APIC send error if we try to broadcast.
5656 * thus we have to avoid sending IPIs in this case.
5657 */
5658 - Dprintk("%s\n", __FUNCTION__);
5659 if (num_online_cpus() > 1)
5660 xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
5661 }
5662
5663 static void xen_send_IPI_all(int vector)
5664 {
5665 - Dprintk("%s\n", __FUNCTION__);
5666 xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
5667 }
5668
5669 @@ -109,11 +105,10 @@ static void xen_send_IPI_mask(cpumask_t
5670 unsigned int cpu;
5671 unsigned long flags;
5672
5673 - Dprintk("%s\n", __FUNCTION__);
5674 local_irq_save(flags);
5675 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
5676
5677 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5678 + for_each_possible_cpu(cpu) {
5679 if (cpu_isset(cpu, cpumask)) {
5680 __send_IPI_one(cpu, vector);
5681 }
5682 @@ -125,14 +120,12 @@ static void xen_send_IPI_mask(cpumask_t
5683 static int xen_apic_id_registered(void)
5684 {
5685 /* better be set */
5686 - Dprintk("%s\n", __FUNCTION__);
5687 return physid_isset(smp_processor_id(), phys_cpu_present_map);
5688 }
5689 #endif
5690
5691 static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
5692 {
5693 - Dprintk("%s\n", __FUNCTION__);
5694 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
5695 }
5696
5697 @@ -140,7 +133,6 @@ static unsigned int phys_pkg_id(int inde
5698 {
5699 u32 ebx;
5700
5701 - Dprintk("%s\n", __FUNCTION__);
5702 ebx = cpuid_ebx(1);
5703 return ((ebx >> 24) & 0xFF) >> index_msb;
5704 }
5705 Index: head-2008-12-01/arch/x86/kernel/head-xen.c
5706 ===================================================================
5707 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5708 +++ head-2008-12-01/arch/x86/kernel/head-xen.c 2008-12-01 11:49:07.000000000 +0100
5709 @@ -0,0 +1,57 @@
5710 +#include <linux/kernel.h>
5711 +#include <linux/init.h>
5712 +
5713 +#include <asm/setup.h>
5714 +#include <asm/bios_ebda.h>
5715 +
5716 +#define BIOS_LOWMEM_KILOBYTES 0x413
5717 +
5718 +/*
5719 + * The BIOS places the EBDA/XBDA at the top of conventional
5720 + * memory, and usually decreases the reported amount of
5721 + * conventional memory (int 0x12) too. This also contains a
5722 + * workaround for Dell systems that neglect to reserve EBDA.
5723 + * The same workaround also avoids a problem with the AMD768MPX
5724 + * chipset: reserve a page before VGA to prevent PCI prefetch
5725 + * into it (errata #56). Usually the page is reserved anyways,
5726 + * unless you have no PS/2 mouse plugged in.
5727 + */
5728 +void __init reserve_ebda_region(void)
5729 +{
5730 +#ifndef CONFIG_XEN
5731 + unsigned int lowmem, ebda_addr;
5732 +
5733 + /* To determine the position of the EBDA and the */
5734 + /* end of conventional memory, we need to look at */
5735 + /* the BIOS data area. In a paravirtual environment */
5736 + /* that area is absent. We'll just have to assume */
5737 + /* that the paravirt case can handle memory setup */
5738 + /* correctly, without our help. */
5739 + if (paravirt_enabled())
5740 + return;
5741 +
5742 + /* end of low (conventional) memory */
5743 + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5744 + lowmem <<= 10;
5745 +
5746 + /* start of EBDA area */
5747 + ebda_addr = get_bios_ebda();
5748 +
5749 + /* Fixup: bios puts an EBDA in the top 64K segment */
5750 + /* of conventional memory, but does not adjust lowmem. */
5751 + if ((lowmem - ebda_addr) <= 0x10000)
5752 + lowmem = ebda_addr;
5753 +
5754 + /* Fixup: bios does not report an EBDA at all. */
5755 + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5756 + if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5757 + lowmem = 0x9f000;
5758 +
5759 + /* Paranoia: should never happen, but... */
5760 + if ((lowmem == 0) || (lowmem >= 0x100000))
5761 + lowmem = 0x9f000;
5762 +
5763 + /* reserve all memory between lowmem and the 1MB mark */
5764 + reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
5765 +#endif
5766 +}
5767 Index: head-2008-12-01/arch/x86/kernel/head32-xen.c
5768 ===================================================================
5769 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5770 +++ head-2008-12-01/arch/x86/kernel/head32-xen.c 2008-12-01 11:49:07.000000000 +0100
5771 @@ -0,0 +1,57 @@
5772 +/*
5773 + * linux/arch/i386/kernel/head32.c -- prepare to run common code
5774 + *
5775 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5776 + * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
5777 + */
5778 +
5779 +#include <linux/init.h>
5780 +#include <linux/start_kernel.h>
5781 +
5782 +#include <asm/setup.h>
5783 +#include <asm/sections.h>
5784 +#include <asm/e820.h>
5785 +#include <asm/bios_ebda.h>
5786 +
5787 +void __init i386_start_kernel(void)
5788 +{
5789 + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
5790 +
5791 +#ifndef CONFIG_XEN
5792 +#ifdef CONFIG_BLK_DEV_INITRD
5793 + /* Reserve INITRD */
5794 + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
5795 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
5796 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
5797 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
5798 + reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
5799 + }
5800 +#endif
5801 + reserve_early(init_pg_tables_start, init_pg_tables_end,
5802 + "INIT_PG_TABLE");
5803 +#else
5804 + reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
5805 + __pa(xen_start_info->pt_base)
5806 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
5807 + "Xen provided");
5808 +
5809 + {
5810 + int max_cmdline;
5811 +
5812 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
5813 + max_cmdline = COMMAND_LINE_SIZE;
5814 + memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
5815 + boot_command_line[max_cmdline-1] = '\0';
5816 + }
5817 +#endif
5818 +
5819 + reserve_ebda_region();
5820 +
5821 + /*
5822 + * At this point everything still needed from the boot loader
5823 + * or BIOS or kernel text should be early reserved or marked not
5824 + * RAM in e820. All other memory is free game.
5825 + */
5826 +
5827 + start_kernel();
5828 +}
5829 Index: head-2008-12-01/arch/x86/kernel/head64-xen.c
5830 ===================================================================
5831 --- head-2008-12-01.orig/arch/x86/kernel/head64-xen.c 2008-12-01 11:44:55.000000000 +0100
5832 +++ head-2008-12-01/arch/x86/kernel/head64-xen.c 2008-12-01 11:49:07.000000000 +0100
5833 @@ -32,7 +32,26 @@
5834 #include <asm/e820.h>
5835 #include <asm/bios_ebda.h>
5836
5837 -unsigned long start_pfn;
5838 +/* boot cpu pda */
5839 +static struct x8664_pda _boot_cpu_pda __read_mostly;
5840 +
5841 +#ifdef CONFIG_SMP
5842 +/*
5843 + * We install an empty cpu_pda pointer table to indicate to early users
5844 + * (numa_set_node) that the cpu_pda pointer table for cpus other than
5845 + * the boot cpu is not yet setup.
5846 + */
5847 +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
5848 +#else
5849 +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
5850 +#endif
5851 +
5852 +void __init x86_64_init_pda(void)
5853 +{
5854 + _cpu_pda = __cpu_pda;
5855 + cpu_pda(0) = &_boot_cpu_pda;
5856 + pda_init(0);
5857 +}
5858
5859 #ifndef CONFIG_XEN
5860 static void __init zap_identity_mappings(void)
5861 @@ -77,83 +96,10 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
5862 unsigned int machine_to_phys_order;
5863 EXPORT_SYMBOL(machine_to_phys_order);
5864
5865 -#define BIOS_LOWMEM_KILOBYTES 0x413
5866 -
5867 -/*
5868 - * The BIOS places the EBDA/XBDA at the top of conventional
5869 - * memory, and usually decreases the reported amount of
5870 - * conventional memory (int 0x12) too. This also contains a
5871 - * workaround for Dell systems that neglect to reserve EBDA.
5872 - * The same workaround also avoids a problem with the AMD768MPX
5873 - * chipset: reserve a page before VGA to prevent PCI prefetch
5874 - * into it (errata #56). Usually the page is reserved anyways,
5875 - * unless you have no PS/2 mouse plugged in.
5876 - */
5877 -static void __init reserve_ebda_region(void)
5878 -{
5879 -#ifndef CONFIG_XEN
5880 - unsigned int lowmem, ebda_addr;
5881 -
5882 - /* To determine the position of the EBDA and the */
5883 - /* end of conventional memory, we need to look at */
5884 - /* the BIOS data area. In a paravirtual environment */
5885 - /* that area is absent. We'll just have to assume */
5886 - /* that the paravirt case can handle memory setup */
5887 - /* correctly, without our help. */
5888 - if (paravirt_enabled())
5889 - return;
5890 -
5891 - /* end of low (conventional) memory */
5892 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5893 - lowmem <<= 10;
5894 -
5895 - /* start of EBDA area */
5896 - ebda_addr = get_bios_ebda();
5897 -
5898 - /* Fixup: bios puts an EBDA in the top 64K segment */
5899 - /* of conventional memory, but does not adjust lowmem. */
5900 - if ((lowmem - ebda_addr) <= 0x10000)
5901 - lowmem = ebda_addr;
5902 -
5903 - /* Fixup: bios does not report an EBDA at all. */
5904 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5905 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5906 - lowmem = 0x9f000;
5907 -
5908 - /* Paranoia: should never happen, but... */
5909 - if ((lowmem == 0) || (lowmem >= 0x100000))
5910 - lowmem = 0x9f000;
5911 -
5912 - /* reserve all memory between lowmem and the 1MB mark */
5913 - reserve_early(lowmem, 0x100000, "BIOS reserved");
5914 -#endif
5915 -}
5916 -
5917 -static void __init reserve_setup_data(void)
5918 -{
5919 -#ifndef CONFIG_XEN
5920 - struct setup_data *data;
5921 - unsigned long pa_data;
5922 - char buf[32];
5923 -
5924 - if (boot_params.hdr.version < 0x0209)
5925 - return;
5926 - pa_data = boot_params.hdr.setup_data;
5927 - while (pa_data) {
5928 - data = early_ioremap(pa_data, sizeof(*data));
5929 - sprintf(buf, "setup data %x", data->type);
5930 - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
5931 - pa_data = data->next;
5932 - early_iounmap(data, sizeof(*data));
5933 - }
5934 -#endif
5935 -}
5936 -
5937 void __init x86_64_start_kernel(char * real_mode_data)
5938 {
5939 struct xen_machphys_mapping mapping;
5940 unsigned long machine_to_phys_nr_ents;
5941 - int i;
5942
5943 /*
5944 * Build-time sanity checks on the kernel image and module
5945 @@ -167,6 +113,7 @@ void __init x86_64_start_kernel(char * r
5946 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
5947 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
5948 (__START_KERNEL & PGDIR_MASK)));
5949 + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
5950
5951 xen_setup_features();
5952
5953 @@ -174,8 +121,6 @@ void __init x86_64_start_kernel(char * r
5954 if (!xen_feature(XENFEAT_auto_translated_physmap))
5955 phys_to_machine_mapping =
5956 (unsigned long *)xen_start_info->mfn_list;
5957 - start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
5958 - xen_start_info->nr_pt_frames;
5959
5960 machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
5961 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
5962 @@ -208,19 +153,23 @@ void __init x86_64_start_kernel(char * r
5963
5964 early_printk("Kernel alive\n");
5965
5966 - for (i = 0; i < NR_CPUS; i++)
5967 - cpu_pda(i) = &boot_cpu_pda[i];
5968 + x86_64_init_pda();
5969
5970 - pda_init(0);
5971 + early_printk("Kernel really alive\n");
5972 +
5973 + x86_64_start_reservations(real_mode_data);
5974 +}
5975 +
5976 +void __init x86_64_start_reservations(char *real_mode_data)
5977 +{
5978 copy_bootdata(__va(real_mode_data));
5979
5980 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
5981
5982 reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
5983 - start_pfn << PAGE_SHIFT, "Xen provided");
5984 -
5985 - reserve_ebda_region();
5986 - reserve_setup_data();
5987 + __pa(xen_start_info->pt_base)
5988 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
5989 + "Xen provided");
5990
5991 /*
5992 * At this point everything still needed from the boot loader
5993 Index: head-2008-12-01/arch/x86/kernel/head_64-xen.S
5994 ===================================================================
5995 --- head-2008-12-01.orig/arch/x86/kernel/head_64-xen.S 2008-12-01 11:36:47.000000000 +0100
5996 +++ head-2008-12-01/arch/x86/kernel/head_64-xen.S 2008-12-01 11:49:07.000000000 +0100
5997 @@ -95,53 +95,6 @@ NEXT_PAGE(hypercall_page)
5998
5999 #undef NEXT_PAGE
6000
6001 - .data
6002 -
6003 - .align 16
6004 - .globl cpu_gdt_descr
6005 -cpu_gdt_descr:
6006 - .word gdt_end-cpu_gdt_table-1
6007 -gdt:
6008 - .quad cpu_gdt_table
6009 -#ifdef CONFIG_SMP
6010 - .rept NR_CPUS-1
6011 - .word 0
6012 - .quad 0
6013 - .endr
6014 -#endif
6015 -
6016 -/* We need valid kernel segments for data and code in long mode too
6017 - * IRET will check the segment types kkeil 2000/10/28
6018 - * Also sysret mandates a special GDT layout
6019 - */
6020 -
6021 - .section .data.page_aligned, "aw"
6022 - .align PAGE_SIZE
6023 -
6024 -/* The TLS descriptors are currently at a different place compared to i386.
6025 - Hopefully nobody expects them at a fixed place (Wine?) */
6026 -
6027 -ENTRY(cpu_gdt_table)
6028 - .quad 0x0000000000000000 /* NULL descriptor */
6029 - .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
6030 - .quad 0x00af9b000000ffff /* __KERNEL_CS */
6031 - .quad 0x00cf93000000ffff /* __KERNEL_DS */
6032 - .quad 0x00cffb000000ffff /* __USER32_CS */
6033 - .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
6034 - .quad 0x00affb000000ffff /* __USER_CS */
6035 - .quad 0x0 /* unused */
6036 - .quad 0,0 /* TSS */
6037 - .quad 0,0 /* LDT */
6038 - .quad 0,0,0 /* three TLS descriptors */
6039 - .quad 0x0000f40000000000 /* node/CPU stored in limit */
6040 -gdt_end:
6041 - /* asm/segment.h:GDT_ENTRIES must match this */
6042 - /* This should be a multiple of the cache line size */
6043 - /* GDTs of other CPUs are now dynamically allocated */
6044 -
6045 - /* zero the remaining page */
6046 - .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
6047 -
6048 .section .bss.page_aligned, "aw", @nobits
6049 .align PAGE_SIZE
6050 ENTRY(empty_zero_page)
6051 Index: head-2008-12-01/arch/x86/kernel/io_apic_32-xen.c
6052 ===================================================================
6053 --- head-2008-12-01.orig/arch/x86/kernel/io_apic_32-xen.c 2008-12-01 11:44:55.000000000 +0100
6054 +++ head-2008-12-01/arch/x86/kernel/io_apic_32-xen.c 2008-12-01 11:49:07.000000000 +0100
6055 @@ -25,6 +25,7 @@
6056 #include <linux/init.h>
6057 #include <linux/delay.h>
6058 #include <linux/sched.h>
6059 +#include <linux/bootmem.h>
6060 #include <linux/mc146818rtc.h>
6061 #include <linux/compiler.h>
6062 #include <linux/acpi.h>
6063 @@ -75,7 +76,7 @@ static struct { int pin, apic; } ioapic_
6064 static DEFINE_SPINLOCK(ioapic_lock);
6065 static DEFINE_SPINLOCK(vector_lock);
6066
6067 -int timer_over_8254 __initdata = 1;
6068 +int timer_through_8259 __initdata;
6069
6070 /*
6071 * Is the SiS APIC rmw bug present ?
6072 @@ -89,15 +90,21 @@ int sis_apic_bug = -1;
6073 int nr_ioapic_registers[MAX_IO_APICS];
6074
6075 /* I/O APIC entries */
6076 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6077 +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
6078 int nr_ioapics;
6079
6080 /* MP IRQ source entries */
6081 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6082 +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6083
6084 /* # of MP IRQ source entries */
6085 int mp_irq_entries;
6086
6087 +#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
6088 +int mp_bus_id_to_type[MAX_MP_BUSSES];
6089 +#endif
6090 +
6091 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
6092 +
6093 static int disable_timer_pin_1 __initdata;
6094
6095 /*
6096 @@ -128,7 +135,7 @@ struct io_apic {
6097 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
6098 {
6099 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
6100 - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
6101 + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
6102 }
6103 #endif
6104
6105 @@ -142,7 +149,7 @@ static inline unsigned int io_apic_read(
6106 struct physdev_apic apic_op;
6107 int ret;
6108
6109 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6110 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6111 apic_op.reg = reg;
6112 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
6113 if (ret)
6114 @@ -160,7 +167,7 @@ static inline void io_apic_write(unsigne
6115 #else
6116 struct physdev_apic apic_op;
6117
6118 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6119 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6120 apic_op.reg = reg;
6121 apic_op.value = value;
6122 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
6123 @@ -288,7 +295,7 @@ static void __init replace_pin_at_irq(un
6124 }
6125 }
6126
6127 -static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
6128 +static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
6129 {
6130 struct irq_pin_list *entry = irq_2_pin + irq;
6131 unsigned int pin, reg;
6132 @@ -308,30 +315,32 @@ static void __modify_IO_APIC_irq (unsign
6133 }
6134
6135 /* mask = 1 */
6136 -static void __mask_IO_APIC_irq (unsigned int irq)
6137 +static void __mask_IO_APIC_irq(unsigned int irq)
6138 {
6139 - __modify_IO_APIC_irq(irq, 0x00010000, 0);
6140 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
6141 }
6142
6143 /* mask = 0 */
6144 -static void __unmask_IO_APIC_irq (unsigned int irq)
6145 +static void __unmask_IO_APIC_irq(unsigned int irq)
6146 {
6147 - __modify_IO_APIC_irq(irq, 0, 0x00010000);
6148 + __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
6149 }
6150
6151 /* mask = 1, trigger = 0 */
6152 -static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
6153 +static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
6154 {
6155 - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
6156 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
6157 + IO_APIC_REDIR_LEVEL_TRIGGER);
6158 }
6159
6160 /* mask = 0, trigger = 1 */
6161 -static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
6162 +static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
6163 {
6164 - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
6165 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
6166 + IO_APIC_REDIR_MASKED);
6167 }
6168
6169 -static void mask_IO_APIC_irq (unsigned int irq)
6170 +static void mask_IO_APIC_irq(unsigned int irq)
6171 {
6172 unsigned long flags;
6173
6174 @@ -340,7 +349,7 @@ static void mask_IO_APIC_irq (unsigned i
6175 spin_unlock_irqrestore(&ioapic_lock, flags);
6176 }
6177
6178 -static void unmask_IO_APIC_irq (unsigned int irq)
6179 +static void unmask_IO_APIC_irq(unsigned int irq)
6180 {
6181 unsigned long flags;
6182
6183 @@ -352,7 +361,7 @@ static void unmask_IO_APIC_irq (unsigned
6184 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
6185 {
6186 struct IO_APIC_route_entry entry;
6187 -
6188 +
6189 /* Check delivery_mode to be sure we're not clearing an SMI pin */
6190 entry = ioapic_read_entry(apic, pin);
6191 if (entry.delivery_mode == dest_SMI)
6192 @@ -364,7 +373,7 @@ static void clear_IO_APIC_pin(unsigned i
6193 ioapic_mask_entry(apic, pin);
6194 }
6195
6196 -static void clear_IO_APIC (void)
6197 +static void clear_IO_APIC(void)
6198 {
6199 int apic, pin;
6200
6201 @@ -381,7 +390,7 @@ static void set_ioapic_affinity_irq(unsi
6202 struct irq_pin_list *entry = irq_2_pin + irq;
6203 unsigned int apicid_value;
6204 cpumask_t tmp;
6205 -
6206 +
6207 cpus_and(tmp, cpumask, cpu_online_map);
6208 if (cpus_empty(tmp))
6209 tmp = TARGET_CPUS;
6210 @@ -410,7 +419,7 @@ static void set_ioapic_affinity_irq(unsi
6211 # include <linux/kernel_stat.h> /* kstat */
6212 # include <linux/slab.h> /* kmalloc() */
6213 # include <linux/timer.h>
6214 -
6215 +
6216 #define IRQBALANCE_CHECK_ARCH -999
6217 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
6218 #define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
6219 @@ -422,14 +431,14 @@ static int physical_balance __read_mostl
6220 static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
6221
6222 static struct irq_cpu_info {
6223 - unsigned long * last_irq;
6224 - unsigned long * irq_delta;
6225 + unsigned long *last_irq;
6226 + unsigned long *irq_delta;
6227 unsigned long irq;
6228 } irq_cpu_data[NR_CPUS];
6229
6230 #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
6231 -#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
6232 -#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
6233 +#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
6234 +#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
6235
6236 #define IDLE_ENOUGH(cpu,now) \
6237 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
6238 @@ -468,8 +477,8 @@ inside:
6239 if (cpu == -1)
6240 cpu = NR_CPUS-1;
6241 }
6242 - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
6243 - (search_idle && !IDLE_ENOUGH(cpu,now)));
6244 + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
6245 + (search_idle && !IDLE_ENOUGH(cpu, now)));
6246
6247 return cpu;
6248 }
6249 @@ -479,15 +488,14 @@ static inline void balance_irq(int cpu,
6250 unsigned long now = jiffies;
6251 cpumask_t allowed_mask;
6252 unsigned int new_cpu;
6253 -
6254 +
6255 if (irqbalance_disabled)
6256 - return;
6257 + return;
6258
6259 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
6260 new_cpu = move(cpu, allowed_mask, now, 1);
6261 - if (cpu != new_cpu) {
6262 + if (cpu != new_cpu)
6263 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
6264 - }
6265 }
6266
6267 static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
6268 @@ -499,14 +507,14 @@ static inline void rotate_irqs_among_cpu
6269 if (!irq_desc[j].action)
6270 continue;
6271 /* Is it a significant load ? */
6272 - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
6273 + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
6274 useful_load_threshold)
6275 continue;
6276 balance_irq(i, j);
6277 }
6278 }
6279 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6280 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6281 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6282 return;
6283 }
6284
6285 @@ -535,22 +543,22 @@ static void do_irq_balance(void)
6286 /* Is this an active IRQ or balancing disabled ? */
6287 if (!irq_desc[j].action || irq_balancing_disabled(j))
6288 continue;
6289 - if ( package_index == i )
6290 - IRQ_DELTA(package_index,j) = 0;
6291 + if (package_index == i)
6292 + IRQ_DELTA(package_index, j) = 0;
6293 /* Determine the total count per processor per IRQ */
6294 value_now = (unsigned long) kstat_cpu(i).irqs[j];
6295
6296 /* Determine the activity per processor per IRQ */
6297 - delta = value_now - LAST_CPU_IRQ(i,j);
6298 + delta = value_now - LAST_CPU_IRQ(i, j);
6299
6300 /* Update last_cpu_irq[][] for the next time */
6301 - LAST_CPU_IRQ(i,j) = value_now;
6302 + LAST_CPU_IRQ(i, j) = value_now;
6303
6304 /* Ignore IRQs whose rate is less than the clock */
6305 if (delta < useful_load_threshold)
6306 continue;
6307 /* update the load for the processor or package total */
6308 - IRQ_DELTA(package_index,j) += delta;
6309 + IRQ_DELTA(package_index, j) += delta;
6310
6311 /* Keep track of the higher numbered sibling as well */
6312 if (i != package_index)
6313 @@ -576,7 +584,8 @@ static void do_irq_balance(void)
6314 max_cpu_irq = ULONG_MAX;
6315
6316 tryanothercpu:
6317 - /* Look for heaviest loaded processor.
6318 + /*
6319 + * Look for heaviest loaded processor.
6320 * We may come back to get the next heaviest loaded processor.
6321 * Skip processors with trivial loads.
6322 */
6323 @@ -585,7 +594,7 @@ tryanothercpu:
6324 for_each_online_cpu(i) {
6325 if (i != CPU_TO_PACKAGEINDEX(i))
6326 continue;
6327 - if (max_cpu_irq <= CPU_IRQ(i))
6328 + if (max_cpu_irq <= CPU_IRQ(i))
6329 continue;
6330 if (tmp_cpu_irq < CPU_IRQ(i)) {
6331 tmp_cpu_irq = CPU_IRQ(i);
6332 @@ -594,8 +603,9 @@ tryanothercpu:
6333 }
6334
6335 if (tmp_loaded == -1) {
6336 - /* In the case of small number of heavy interrupt sources,
6337 - * loading some of the cpus too much. We use Ingo's original
6338 + /*
6339 + * In the case of small number of heavy interrupt sources,
6340 + * loading some of the cpus too much. We use Ingo's original
6341 * approach to rotate them around.
6342 */
6343 if (!first_attempt && imbalance >= useful_load_threshold) {
6344 @@ -604,13 +614,14 @@ tryanothercpu:
6345 }
6346 goto not_worth_the_effort;
6347 }
6348 -
6349 +
6350 first_attempt = 0; /* heaviest search */
6351 max_cpu_irq = tmp_cpu_irq; /* load */
6352 max_loaded = tmp_loaded; /* processor */
6353 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
6354 -
6355 - /* if imbalance is less than approx 10% of max load, then
6356 +
6357 + /*
6358 + * if imbalance is less than approx 10% of max load, then
6359 * observe diminishing returns action. - quit
6360 */
6361 if (imbalance < (max_cpu_irq >> 3))
6362 @@ -626,26 +637,25 @@ tryanotherirq:
6363 /* Is this an active IRQ? */
6364 if (!irq_desc[j].action)
6365 continue;
6366 - if (imbalance <= IRQ_DELTA(max_loaded,j))
6367 + if (imbalance <= IRQ_DELTA(max_loaded, j))
6368 continue;
6369 /* Try to find the IRQ that is closest to the imbalance
6370 * without going over.
6371 */
6372 - if (move_this_load < IRQ_DELTA(max_loaded,j)) {
6373 - move_this_load = IRQ_DELTA(max_loaded,j);
6374 + if (move_this_load < IRQ_DELTA(max_loaded, j)) {
6375 + move_this_load = IRQ_DELTA(max_loaded, j);
6376 selected_irq = j;
6377 }
6378 }
6379 - if (selected_irq == -1) {
6380 + if (selected_irq == -1)
6381 goto tryanothercpu;
6382 - }
6383
6384 imbalance = move_this_load;
6385 -
6386 +
6387 /* For physical_balance case, we accumulated both load
6388 * values in the one of the siblings cpu_irq[],
6389 * to use the same code for physical and logical processors
6390 - * as much as possible.
6391 + * as much as possible.
6392 *
6393 * NOTE: the cpu_irq[] array holds the sum of the load for
6394 * sibling A and sibling B in the slot for the lowest numbered
6395 @@ -674,11 +684,11 @@ tryanotherirq:
6396 /* mark for change destination */
6397 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
6398
6399 - /* Since we made a change, come back sooner to
6400 + /* Since we made a change, come back sooner to
6401 * check for more variation.
6402 */
6403 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6404 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6405 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6406 return;
6407 }
6408 goto tryanotherirq;
6409 @@ -689,7 +699,7 @@ not_worth_the_effort:
6410 * upward
6411 */
6412 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
6413 - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6414 + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6415 return;
6416 }
6417
6418 @@ -728,13 +738,13 @@ static int __init balanced_irq_init(void
6419 cpumask_t tmp;
6420
6421 cpus_shift_right(tmp, cpu_online_map, 2);
6422 - c = &boot_cpu_data;
6423 + c = &boot_cpu_data;
6424 /* When not overwritten by the command line ask subarchitecture. */
6425 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
6426 irqbalance_disabled = NO_BALANCE_IRQ;
6427 if (irqbalance_disabled)
6428 return 0;
6429 -
6430 +
6431 /* disable irqbalance completely if there is only one processor online */
6432 if (num_online_cpus() < 2) {
6433 irqbalance_disabled = 1;
6434 @@ -748,16 +758,14 @@ static int __init balanced_irq_init(void
6435 physical_balance = 1;
6436
6437 for_each_online_cpu(i) {
6438 - irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6439 - irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6440 + irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6441 + irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6442 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
6443 printk(KERN_ERR "balanced_irq_init: out of memory");
6444 goto failed;
6445 }
6446 - memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
6447 - memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
6448 }
6449 -
6450 +
6451 printk(KERN_INFO "Starting balanced_irq\n");
6452 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
6453 return 0;
6454 @@ -799,7 +807,7 @@ void send_IPI_self(int vector)
6455 /*
6456 * Send the IPI. The write to APIC_ICR fires this off.
6457 */
6458 - apic_write_around(APIC_ICR, cfg);
6459 + apic_write(APIC_ICR, cfg);
6460 #endif
6461 }
6462 #endif /* !CONFIG_SMP */
6463 @@ -853,10 +861,10 @@ static int find_irq_entry(int apic, int
6464 int i;
6465
6466 for (i = 0; i < mp_irq_entries; i++)
6467 - if (mp_irqs[i].mpc_irqtype == type &&
6468 - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
6469 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
6470 - mp_irqs[i].mpc_dstirq == pin)
6471 + if (mp_irqs[i].mp_irqtype == type &&
6472 + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
6473 + mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
6474 + mp_irqs[i].mp_dstirq == pin)
6475 return i;
6476
6477 return -1;
6478 @@ -871,13 +879,13 @@ static int __init find_isa_irq_pin(int i
6479 int i;
6480
6481 for (i = 0; i < mp_irq_entries; i++) {
6482 - int lbus = mp_irqs[i].mpc_srcbus;
6483 + int lbus = mp_irqs[i].mp_srcbus;
6484
6485 if (test_bit(lbus, mp_bus_not_pci) &&
6486 - (mp_irqs[i].mpc_irqtype == type) &&
6487 - (mp_irqs[i].mpc_srcbusirq == irq))
6488 + (mp_irqs[i].mp_irqtype == type) &&
6489 + (mp_irqs[i].mp_srcbusirq == irq))
6490
6491 - return mp_irqs[i].mpc_dstirq;
6492 + return mp_irqs[i].mp_dstirq;
6493 }
6494 return -1;
6495 }
6496 @@ -887,17 +895,17 @@ static int __init find_isa_irq_apic(int
6497 int i;
6498
6499 for (i = 0; i < mp_irq_entries; i++) {
6500 - int lbus = mp_irqs[i].mpc_srcbus;
6501 + int lbus = mp_irqs[i].mp_srcbus;
6502
6503 if (test_bit(lbus, mp_bus_not_pci) &&
6504 - (mp_irqs[i].mpc_irqtype == type) &&
6505 - (mp_irqs[i].mpc_srcbusirq == irq))
6506 + (mp_irqs[i].mp_irqtype == type) &&
6507 + (mp_irqs[i].mp_srcbusirq == irq))
6508 break;
6509 }
6510 if (i < mp_irq_entries) {
6511 int apic;
6512 - for(apic = 0; apic < nr_ioapics; apic++) {
6513 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
6514 + for (apic = 0; apic < nr_ioapics; apic++) {
6515 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
6516 return apic;
6517 }
6518 }
6519 @@ -918,28 +926,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6520
6521 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
6522 "slot:%d, pin:%d.\n", bus, slot, pin);
6523 - if (mp_bus_id_to_pci_bus[bus] == -1) {
6524 + if (test_bit(bus, mp_bus_not_pci)) {
6525 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
6526 return -1;
6527 }
6528 for (i = 0; i < mp_irq_entries; i++) {
6529 - int lbus = mp_irqs[i].mpc_srcbus;
6530 + int lbus = mp_irqs[i].mp_srcbus;
6531
6532 for (apic = 0; apic < nr_ioapics; apic++)
6533 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
6534 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
6535 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
6536 + mp_irqs[i].mp_dstapic == MP_APIC_ALL)
6537 break;
6538
6539 if (!test_bit(lbus, mp_bus_not_pci) &&
6540 - !mp_irqs[i].mpc_irqtype &&
6541 + !mp_irqs[i].mp_irqtype &&
6542 (bus == lbus) &&
6543 - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
6544 - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
6545 + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
6546 + int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
6547
6548 if (!(apic || IO_APIC_IRQ(irq)))
6549 continue;
6550
6551 - if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
6552 + if (pin == (mp_irqs[i].mp_srcbusirq & 3))
6553 return irq;
6554 /*
6555 * Use the first all-but-pin matching entry as a
6556 @@ -954,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6557 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
6558
6559 /*
6560 - * This function currently is only a helper for the i386 smp boot process where
6561 + * This function currently is only a helper for the i386 smp boot process where
6562 * we need to reprogram the ioredtbls to cater for the cpus which have come online
6563 * so mask in all cases should simply be TARGET_CPUS
6564 */
6565 @@ -1008,7 +1016,7 @@ static int EISA_ELCR(unsigned int irq)
6566 * EISA conforming in the MP table, that means its trigger type must
6567 * be read in from the ELCR */
6568
6569 -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
6570 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
6571 #define default_EISA_polarity(idx) default_ISA_polarity(idx)
6572
6573 /* PCI interrupts are always polarity one level triggered,
6574 @@ -1025,118 +1033,115 @@ static int EISA_ELCR(unsigned int irq)
6575
6576 static int MPBIOS_polarity(int idx)
6577 {
6578 - int bus = mp_irqs[idx].mpc_srcbus;
6579 + int bus = mp_irqs[idx].mp_srcbus;
6580 int polarity;
6581
6582 /*
6583 * Determine IRQ line polarity (high active or low active):
6584 */
6585 - switch (mp_irqs[idx].mpc_irqflag & 3)
6586 + switch (mp_irqs[idx].mp_irqflag & 3) {
6587 + case 0: /* conforms, ie. bus-type dependent polarity */
6588 {
6589 - case 0: /* conforms, ie. bus-type dependent polarity */
6590 - {
6591 - polarity = test_bit(bus, mp_bus_not_pci)?
6592 - default_ISA_polarity(idx):
6593 - default_PCI_polarity(idx);
6594 - break;
6595 - }
6596 - case 1: /* high active */
6597 - {
6598 - polarity = 0;
6599 - break;
6600 - }
6601 - case 2: /* reserved */
6602 - {
6603 - printk(KERN_WARNING "broken BIOS!!\n");
6604 - polarity = 1;
6605 - break;
6606 - }
6607 - case 3: /* low active */
6608 - {
6609 - polarity = 1;
6610 - break;
6611 - }
6612 - default: /* invalid */
6613 - {
6614 - printk(KERN_WARNING "broken BIOS!!\n");
6615 - polarity = 1;
6616 - break;
6617 - }
6618 + polarity = test_bit(bus, mp_bus_not_pci)?
6619 + default_ISA_polarity(idx):
6620 + default_PCI_polarity(idx);
6621 + break;
6622 + }
6623 + case 1: /* high active */
6624 + {
6625 + polarity = 0;
6626 + break;
6627 + }
6628 + case 2: /* reserved */
6629 + {
6630 + printk(KERN_WARNING "broken BIOS!!\n");
6631 + polarity = 1;
6632 + break;
6633 + }
6634 + case 3: /* low active */
6635 + {
6636 + polarity = 1;
6637 + break;
6638 + }
6639 + default: /* invalid */
6640 + {
6641 + printk(KERN_WARNING "broken BIOS!!\n");
6642 + polarity = 1;
6643 + break;
6644 + }
6645 }
6646 return polarity;
6647 }
6648
6649 static int MPBIOS_trigger(int idx)
6650 {
6651 - int bus = mp_irqs[idx].mpc_srcbus;
6652 + int bus = mp_irqs[idx].mp_srcbus;
6653 int trigger;
6654
6655 /*
6656 * Determine IRQ trigger mode (edge or level sensitive):
6657 */
6658 - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
6659 + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
6660 + case 0: /* conforms, ie. bus-type dependent */
6661 {
6662 - case 0: /* conforms, ie. bus-type dependent */
6663 - {
6664 - trigger = test_bit(bus, mp_bus_not_pci)?
6665 - default_ISA_trigger(idx):
6666 - default_PCI_trigger(idx);
6667 + trigger = test_bit(bus, mp_bus_not_pci)?
6668 + default_ISA_trigger(idx):
6669 + default_PCI_trigger(idx);
6670 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
6671 - switch (mp_bus_id_to_type[bus])
6672 - {
6673 - case MP_BUS_ISA: /* ISA pin */
6674 - {
6675 - /* set before the switch */
6676 - break;
6677 - }
6678 - case MP_BUS_EISA: /* EISA pin */
6679 - {
6680 - trigger = default_EISA_trigger(idx);
6681 - break;
6682 - }
6683 - case MP_BUS_PCI: /* PCI pin */
6684 - {
6685 - /* set before the switch */
6686 - break;
6687 - }
6688 - case MP_BUS_MCA: /* MCA pin */
6689 - {
6690 - trigger = default_MCA_trigger(idx);
6691 - break;
6692 - }
6693 - default:
6694 - {
6695 - printk(KERN_WARNING "broken BIOS!!\n");
6696 - trigger = 1;
6697 - break;
6698 - }
6699 - }
6700 -#endif
6701 + switch (mp_bus_id_to_type[bus]) {
6702 + case MP_BUS_ISA: /* ISA pin */
6703 + {
6704 + /* set before the switch */
6705 break;
6706 }
6707 - case 1: /* edge */
6708 + case MP_BUS_EISA: /* EISA pin */
6709 {
6710 - trigger = 0;
6711 + trigger = default_EISA_trigger(idx);
6712 break;
6713 }
6714 - case 2: /* reserved */
6715 + case MP_BUS_PCI: /* PCI pin */
6716 {
6717 - printk(KERN_WARNING "broken BIOS!!\n");
6718 - trigger = 1;
6719 + /* set before the switch */
6720 break;
6721 }
6722 - case 3: /* level */
6723 + case MP_BUS_MCA: /* MCA pin */
6724 {
6725 - trigger = 1;
6726 + trigger = default_MCA_trigger(idx);
6727 break;
6728 }
6729 - default: /* invalid */
6730 + default:
6731 {
6732 printk(KERN_WARNING "broken BIOS!!\n");
6733 - trigger = 0;
6734 + trigger = 1;
6735 break;
6736 }
6737 }
6738 +#endif
6739 + break;
6740 + }
6741 + case 1: /* edge */
6742 + {
6743 + trigger = 0;
6744 + break;
6745 + }
6746 + case 2: /* reserved */
6747 + {
6748 + printk(KERN_WARNING "broken BIOS!!\n");
6749 + trigger = 1;
6750 + break;
6751 + }
6752 + case 3: /* level */
6753 + {
6754 + trigger = 1;
6755 + break;
6756 + }
6757 + default: /* invalid */
6758 + {
6759 + printk(KERN_WARNING "broken BIOS!!\n");
6760 + trigger = 0;
6761 + break;
6762 + }
6763 + }
6764 return trigger;
6765 }
6766
6767 @@ -1153,16 +1158,16 @@ static inline int irq_trigger(int idx)
6768 static int pin_2_irq(int idx, int apic, int pin)
6769 {
6770 int irq, i;
6771 - int bus = mp_irqs[idx].mpc_srcbus;
6772 + int bus = mp_irqs[idx].mp_srcbus;
6773
6774 /*
6775 * Debugging check, we are in big trouble if this message pops up!
6776 */
6777 - if (mp_irqs[idx].mpc_dstirq != pin)
6778 + if (mp_irqs[idx].mp_dstirq != pin)
6779 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
6780
6781 if (test_bit(bus, mp_bus_not_pci))
6782 - irq = mp_irqs[idx].mpc_srcbusirq;
6783 + irq = mp_irqs[idx].mp_srcbusirq;
6784 else {
6785 /*
6786 * PCI IRQs are mapped in order
6787 @@ -1204,8 +1209,8 @@ static inline int IO_APIC_irq_trigger(in
6788
6789 for (apic = 0; apic < nr_ioapics; apic++) {
6790 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6791 - idx = find_irq_entry(apic,pin,mp_INT);
6792 - if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
6793 + idx = find_irq_entry(apic, pin, mp_INT);
6794 + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
6795 return irq_trigger(idx);
6796 }
6797 }
6798 @@ -1291,25 +1296,25 @@ static void __init setup_IO_APIC_irqs(vo
6799 /*
6800 * add it to the IO-APIC irq-routing table:
6801 */
6802 - memset(&entry,0,sizeof(entry));
6803 + memset(&entry, 0, sizeof(entry));
6804
6805 entry.delivery_mode = INT_DELIVERY_MODE;
6806 entry.dest_mode = INT_DEST_MODE;
6807 entry.mask = 0; /* enable IRQ */
6808 - entry.dest.logical.logical_dest =
6809 + entry.dest.logical.logical_dest =
6810 cpu_mask_to_apicid(TARGET_CPUS);
6811
6812 - idx = find_irq_entry(apic,pin,mp_INT);
6813 + idx = find_irq_entry(apic, pin, mp_INT);
6814 if (idx == -1) {
6815 if (first_notcon) {
6816 apic_printk(APIC_VERBOSE, KERN_DEBUG
6817 " IO-APIC (apicid-pin) %d-%d",
6818 - mp_ioapics[apic].mpc_apicid,
6819 + mp_ioapics[apic].mp_apicid,
6820 pin);
6821 first_notcon = 0;
6822 } else
6823 apic_printk(APIC_VERBOSE, ", %d-%d",
6824 - mp_ioapics[apic].mpc_apicid, pin);
6825 + mp_ioapics[apic].mp_apicid, pin);
6826 continue;
6827 }
6828
6829 @@ -1343,7 +1348,7 @@ static void __init setup_IO_APIC_irqs(vo
6830 vector = assign_irq_vector(irq);
6831 entry.vector = vector;
6832 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
6833 -
6834 +
6835 if (!apic && (irq < 16))
6836 disable_8259A_irq(irq);
6837 }
6838 @@ -1355,27 +1360,23 @@ static void __init setup_IO_APIC_irqs(vo
6839 apic_printk(APIC_VERBOSE, " not connected.\n");
6840 }
6841
6842 +#ifndef CONFIG_XEN
6843 /*
6844 - * Set up the 8259A-master output pin:
6845 + * Set up the timer pin, possibly with the 8259A-master behind.
6846 */
6847 -#ifndef CONFIG_XEN
6848 -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
6849 +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
6850 + int vector)
6851 {
6852 struct IO_APIC_route_entry entry;
6853
6854 - memset(&entry,0,sizeof(entry));
6855 -
6856 - disable_8259A_irq(0);
6857 -
6858 - /* mask LVT0 */
6859 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6860 + memset(&entry, 0, sizeof(entry));
6861
6862 /*
6863 * We use logical delivery to get the timer IRQ
6864 * to the first CPU.
6865 */
6866 entry.dest_mode = INT_DEST_MODE;
6867 - entry.mask = 0; /* unmask IRQ now */
6868 + entry.mask = 1; /* mask IRQ now */
6869 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6870 entry.delivery_mode = INT_DELIVERY_MODE;
6871 entry.polarity = 0;
6872 @@ -1384,17 +1385,14 @@ static void __init setup_ExtINT_IRQ0_pin
6873
6874 /*
6875 * The timer IRQ doesn't have to know that behind the
6876 - * scene we have a 8259A-master in AEOI mode ...
6877 + * scene we may have a 8259A-master in AEOI mode ...
6878 */
6879 - irq_desc[0].chip = &ioapic_chip;
6880 - set_irq_handler(0, handle_edge_irq);
6881 + ioapic_register_intr(0, vector, IOAPIC_EDGE);
6882
6883 /*
6884 * Add it to the IO-APIC irq-routing table:
6885 */
6886 ioapic_write_entry(apic, pin, entry);
6887 -
6888 - enable_8259A_irq(0);
6889 }
6890
6891 void __init print_IO_APIC(void)
6892 @@ -1409,10 +1407,10 @@ void __init print_IO_APIC(void)
6893 if (apic_verbosity == APIC_QUIET)
6894 return;
6895
6896 - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6897 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6898 for (i = 0; i < nr_ioapics; i++)
6899 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
6900 - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
6901 + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
6902
6903 /*
6904 * We are a bit conservative about what we expect. We have to
6905 @@ -1431,7 +1429,7 @@ void __init print_IO_APIC(void)
6906 reg_03.raw = io_apic_read(apic, 3);
6907 spin_unlock_irqrestore(&ioapic_lock, flags);
6908
6909 - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
6910 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
6911 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
6912 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
6913 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
6914 @@ -1512,7 +1510,7 @@ void __init print_IO_APIC(void)
6915 return;
6916 }
6917
6918 -static void print_APIC_bitfield (int base)
6919 +static void print_APIC_bitfield(int base)
6920 {
6921 unsigned int v;
6922 int i, j;
6923 @@ -1533,7 +1531,7 @@ static void print_APIC_bitfield (int bas
6924 }
6925 }
6926
6927 -void /*__init*/ print_local_APIC(void * dummy)
6928 +void /*__init*/ print_local_APIC(void *dummy)
6929 {
6930 unsigned int v, ver, maxlvt;
6931
6932 @@ -1542,6 +1540,7 @@ void /*__init*/ print_local_APIC(void *
6933
6934 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
6935 smp_processor_id(), hard_smp_processor_id());
6936 + v = apic_read(APIC_ID);
6937 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
6938 GET_APIC_ID(read_apic_id()));
6939 v = apic_read(APIC_LVR);
6940 @@ -1616,9 +1615,9 @@ void /*__init*/ print_local_APIC(void *
6941 printk("\n");
6942 }
6943
6944 -void print_all_local_APICs (void)
6945 +void print_all_local_APICs(void)
6946 {
6947 - on_each_cpu(print_local_APIC, NULL, 1, 1);
6948 + on_each_cpu(print_local_APIC, NULL, 1);
6949 }
6950
6951 void /*__init*/ print_PIC(void)
6952 @@ -1639,11 +1638,11 @@ void /*__init*/ print_PIC(void)
6953 v = inb(0xa0) << 8 | inb(0x20);
6954 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
6955
6956 - outb(0x0b,0xa0);
6957 - outb(0x0b,0x20);
6958 + outb(0x0b, 0xa0);
6959 + outb(0x0b, 0x20);
6960 v = inb(0xa0) << 8 | inb(0x20);
6961 - outb(0x0a,0xa0);
6962 - outb(0x0a,0x20);
6963 + outb(0x0a, 0xa0);
6964 + outb(0x0a, 0x20);
6965
6966 spin_unlock_irqrestore(&i8259A_lock, flags);
6967
6968 @@ -1652,6 +1651,8 @@ void /*__init*/ print_PIC(void)
6969 v = inb(0x4d1) << 8 | inb(0x4d0);
6970 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
6971 }
6972 +#else
6973 +void __init print_IO_APIC(void) {}
6974 #endif /* !CONFIG_XEN */
6975
6976 static void __init enable_IO_APIC(void)
6977 @@ -1681,7 +1682,7 @@ static void __init enable_IO_APIC(void)
6978 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
6979 }
6980 #ifndef CONFIG_XEN
6981 - for(apic = 0; apic < nr_ioapics; apic++) {
6982 + for (apic = 0; apic < nr_ioapics; apic++) {
6983 int pin;
6984 /* See if any of the pins is in ExtINT mode */
6985 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6986 @@ -1774,7 +1775,7 @@ void disable_IO_APIC(void)
6987 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
6988 */
6989
6990 -#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
6991 +#ifndef CONFIG_XEN
6992 static void __init setup_ioapic_ids_from_mpc(void)
6993 {
6994 union IO_APIC_reg_00 reg_00;
6995 @@ -1784,6 +1785,11 @@ static void __init setup_ioapic_ids_from
6996 unsigned char old_id;
6997 unsigned long flags;
6998
6999 +#ifdef CONFIG_X86_NUMAQ
7000 + if (found_numaq)
7001 + return;
7002 +#endif
7003 +
7004 /*
7005 * Don't check I/O APIC IDs for xAPIC systems. They have
7006 * no meaning without the serial APIC bus.
7007 @@ -1806,15 +1812,15 @@ static void __init setup_ioapic_ids_from
7008 spin_lock_irqsave(&ioapic_lock, flags);
7009 reg_00.raw = io_apic_read(apic, 0);
7010 spin_unlock_irqrestore(&ioapic_lock, flags);
7011 -
7012 - old_id = mp_ioapics[apic].mpc_apicid;
7013
7014 - if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
7015 + old_id = mp_ioapics[apic].mp_apicid;
7016 +
7017 + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
7018 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
7019 - apic, mp_ioapics[apic].mpc_apicid);
7020 + apic, mp_ioapics[apic].mp_apicid);
7021 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7022 reg_00.bits.ID);
7023 - mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
7024 + mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
7025 }
7026
7027 /*
7028 @@ -1823,9 +1829,9 @@ static void __init setup_ioapic_ids_from
7029 * 'stuck on smp_invalidate_needed IPI wait' messages.
7030 */
7031 if (check_apicid_used(phys_id_present_map,
7032 - mp_ioapics[apic].mpc_apicid)) {
7033 + mp_ioapics[apic].mp_apicid)) {
7034 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
7035 - apic, mp_ioapics[apic].mpc_apicid);
7036 + apic, mp_ioapics[apic].mp_apicid);
7037 for (i = 0; i < get_physical_broadcast(); i++)
7038 if (!physid_isset(i, phys_id_present_map))
7039 break;
7040 @@ -1834,13 +1840,13 @@ static void __init setup_ioapic_ids_from
7041 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7042 i);
7043 physid_set(i, phys_id_present_map);
7044 - mp_ioapics[apic].mpc_apicid = i;
7045 + mp_ioapics[apic].mp_apicid = i;
7046 } else {
7047 physid_mask_t tmp;
7048 - tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
7049 + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
7050 apic_printk(APIC_VERBOSE, "Setting %d in the "
7051 "phys_id_present_map\n",
7052 - mp_ioapics[apic].mpc_apicid);
7053 + mp_ioapics[apic].mp_apicid);
7054 physids_or(phys_id_present_map, phys_id_present_map, tmp);
7055 }
7056
7057 @@ -1849,21 +1855,21 @@ static void __init setup_ioapic_ids_from
7058 * We need to adjust the IRQ routing table
7059 * if the ID changed.
7060 */
7061 - if (old_id != mp_ioapics[apic].mpc_apicid)
7062 + if (old_id != mp_ioapics[apic].mp_apicid)
7063 for (i = 0; i < mp_irq_entries; i++)
7064 - if (mp_irqs[i].mpc_dstapic == old_id)
7065 - mp_irqs[i].mpc_dstapic
7066 - = mp_ioapics[apic].mpc_apicid;
7067 + if (mp_irqs[i].mp_dstapic == old_id)
7068 + mp_irqs[i].mp_dstapic
7069 + = mp_ioapics[apic].mp_apicid;
7070
7071 /*
7072 * Read the right value from the MPC table and
7073 * write it into the ID register.
7074 - */
7075 + */
7076 apic_printk(APIC_VERBOSE, KERN_INFO
7077 "...changing IO-APIC physical APIC ID to %d ...",
7078 - mp_ioapics[apic].mpc_apicid);
7079 + mp_ioapics[apic].mp_apicid);
7080
7081 - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
7082 + reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
7083 spin_lock_irqsave(&ioapic_lock, flags);
7084 io_apic_write(apic, 0, reg_00.raw);
7085 spin_unlock_irqrestore(&ioapic_lock, flags);
7086 @@ -1874,17 +1880,13 @@ static void __init setup_ioapic_ids_from
7087 spin_lock_irqsave(&ioapic_lock, flags);
7088 reg_00.raw = io_apic_read(apic, 0);
7089 spin_unlock_irqrestore(&ioapic_lock, flags);
7090 - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
7091 + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
7092 printk("could not set ID!\n");
7093 else
7094 apic_printk(APIC_VERBOSE, " ok.\n");
7095 }
7096 }
7097 -#else
7098 -static void __init setup_ioapic_ids_from_mpc(void) { }
7099 -#endif
7100
7101 -#ifndef CONFIG_XEN
7102 int no_timer_check __initdata;
7103
7104 static int __init notimercheck(char *s)
7105 @@ -2077,45 +2079,53 @@ static inline void init_IO_APIC_traps(vo
7106 * The local APIC irq-chip implementation:
7107 */
7108
7109 -static void ack_apic(unsigned int irq)
7110 +static void ack_lapic_irq(unsigned int irq)
7111 {
7112 ack_APIC_irq();
7113 }
7114
7115 -static void mask_lapic_irq (unsigned int irq)
7116 +static void mask_lapic_irq(unsigned int irq)
7117 {
7118 unsigned long v;
7119
7120 v = apic_read(APIC_LVT0);
7121 - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
7122 + apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
7123 }
7124
7125 -static void unmask_lapic_irq (unsigned int irq)
7126 +static void unmask_lapic_irq(unsigned int irq)
7127 {
7128 unsigned long v;
7129
7130 v = apic_read(APIC_LVT0);
7131 - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
7132 + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
7133 }
7134
7135 static struct irq_chip lapic_chip __read_mostly = {
7136 - .name = "local-APIC-edge",
7137 + .name = "local-APIC",
7138 .mask = mask_lapic_irq,
7139 .unmask = unmask_lapic_irq,
7140 - .eoi = ack_apic,
7141 + .ack = ack_lapic_irq,
7142 };
7143
7144 +static void lapic_register_intr(int irq, int vector)
7145 +{
7146 + irq_desc[irq].status &= ~IRQ_LEVEL;
7147 + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
7148 + "edge");
7149 + set_intr_gate(vector, interrupt[irq]);
7150 +}
7151 +
7152 static void __init setup_nmi(void)
7153 {
7154 /*
7155 - * Dirty trick to enable the NMI watchdog ...
7156 + * Dirty trick to enable the NMI watchdog ...
7157 * We put the 8259A master into AEOI mode and
7158 * unmask on all local APICs LVT0 as NMI.
7159 *
7160 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
7161 * is from Maciej W. Rozycki - so we do not have to EOI from
7162 * the NMI handler or the timer interrupt.
7163 - */
7164 + */
7165 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
7166
7167 enable_NMI_through_LVT0();
7168 @@ -2191,11 +2201,16 @@ static inline void __init unlock_ExtINT_
7169 static inline void __init check_timer(void)
7170 {
7171 int apic1, pin1, apic2, pin2;
7172 + int no_pin1 = 0;
7173 int vector;
7174 + unsigned int ver;
7175 unsigned long flags;
7176
7177 local_irq_save(flags);
7178
7179 + ver = apic_read(APIC_LVR);
7180 + ver = GET_APIC_VERSION(ver);
7181 +
7182 /*
7183 * get/set the timer IRQ vector:
7184 */
7185 @@ -2204,34 +2219,54 @@ static inline void __init check_timer(vo
7186 set_intr_gate(vector, interrupt[0]);
7187
7188 /*
7189 - * Subtle, code in do_timer_interrupt() expects an AEOI
7190 - * mode for the 8259A whenever interrupts are routed
7191 - * through I/O APICs. Also IRQ0 has to be enabled in
7192 - * the 8259A which implies the virtual wire has to be
7193 - * disabled in the local APIC.
7194 + * As IRQ0 is to be enabled in the 8259A, the virtual
7195 + * wire has to be disabled in the local APIC. Also
7196 + * timer interrupts need to be acknowledged manually in
7197 + * the 8259A for the i82489DX when using the NMI
7198 + * watchdog as that APIC treats NMIs as level-triggered.
7199 + * The AEOI mode will finish them in the 8259A
7200 + * automatically.
7201 */
7202 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7203 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7204 init_8259A(1);
7205 - timer_ack = 1;
7206 - if (timer_over_8254 > 0)
7207 - enable_8259A_irq(0);
7208 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
7209
7210 pin1 = find_isa_irq_pin(0, mp_INT);
7211 apic1 = find_isa_irq_apic(0, mp_INT);
7212 pin2 = ioapic_i8259.pin;
7213 apic2 = ioapic_i8259.apic;
7214
7215 - printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
7216 - vector, apic1, pin1, apic2, pin2);
7217 + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
7218 + "apic1=%d pin1=%d apic2=%d pin2=%d\n",
7219 + vector, apic1, pin1, apic2, pin2);
7220 +
7221 + /*
7222 + * Some BIOS writers are clueless and report the ExtINTA
7223 + * I/O APIC input from the cascaded 8259A as the timer
7224 + * interrupt input. So just in case, if only one pin
7225 + * was found above, try it both directly and through the
7226 + * 8259A.
7227 + */
7228 + if (pin1 == -1) {
7229 + pin1 = pin2;
7230 + apic1 = apic2;
7231 + no_pin1 = 1;
7232 + } else if (pin2 == -1) {
7233 + pin2 = pin1;
7234 + apic2 = apic1;
7235 + }
7236
7237 if (pin1 != -1) {
7238 /*
7239 * Ok, does IRQ0 through the IOAPIC work?
7240 */
7241 + if (no_pin1) {
7242 + add_pin_to_irq(0, apic1, pin1);
7243 + setup_timer_IRQ0_pin(apic1, pin1, vector);
7244 + }
7245 unmask_IO_APIC_irq(0);
7246 if (timer_irq_works()) {
7247 if (nmi_watchdog == NMI_IO_APIC) {
7248 - disable_8259A_irq(0);
7249 setup_nmi();
7250 enable_8259A_irq(0);
7251 }
7252 @@ -2240,71 +2275,77 @@ static inline void __init check_timer(vo
7253 goto out;
7254 }
7255 clear_IO_APIC_pin(apic1, pin1);
7256 - printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
7257 - "IO-APIC\n");
7258 - }
7259 -
7260 - printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
7261 - if (pin2 != -1) {
7262 - printk("\n..... (found pin %d) ...", pin2);
7263 + if (!no_pin1)
7264 + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
7265 + "8254 timer not connected to IO-APIC\n");
7266 +
7267 + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
7268 + "(IRQ0) through the 8259A ...\n");
7269 + apic_printk(APIC_QUIET, KERN_INFO
7270 + "..... (found apic %d pin %d) ...\n", apic2, pin2);
7271 /*
7272 * legacy devices should be connected to IO APIC #0
7273 */
7274 - setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
7275 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7276 + setup_timer_IRQ0_pin(apic2, pin2, vector);
7277 + unmask_IO_APIC_irq(0);
7278 + enable_8259A_irq(0);
7279 if (timer_irq_works()) {
7280 - printk("works.\n");
7281 - if (pin1 != -1)
7282 - replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7283 - else
7284 - add_pin_to_irq(0, apic2, pin2);
7285 + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
7286 + timer_through_8259 = 1;
7287 if (nmi_watchdog == NMI_IO_APIC) {
7288 + disable_8259A_irq(0);
7289 setup_nmi();
7290 + enable_8259A_irq(0);
7291 }
7292 goto out;
7293 }
7294 /*
7295 * Cleanup, just in case ...
7296 */
7297 + disable_8259A_irq(0);
7298 clear_IO_APIC_pin(apic2, pin2);
7299 + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
7300 }
7301 - printk(" failed.\n");
7302
7303 if (nmi_watchdog == NMI_IO_APIC) {
7304 - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
7305 - nmi_watchdog = 0;
7306 + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
7307 + "through the IO-APIC - disabling NMI Watchdog!\n");
7308 + nmi_watchdog = NMI_NONE;
7309 }
7310 + timer_ack = 0;
7311
7312 - printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
7313 + apic_printk(APIC_QUIET, KERN_INFO
7314 + "...trying to set up timer as Virtual Wire IRQ...\n");
7315
7316 - disable_8259A_irq(0);
7317 - set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
7318 - "fasteoi");
7319 - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7320 + lapic_register_intr(0, vector);
7321 + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7322 enable_8259A_irq(0);
7323
7324 if (timer_irq_works()) {
7325 - printk(" works.\n");
7326 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7327 goto out;
7328 }
7329 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7330 - printk(" failed.\n");
7331 + disable_8259A_irq(0);
7332 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7333 + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
7334
7335 - printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
7336 + apic_printk(APIC_QUIET, KERN_INFO
7337 + "...trying to set up timer as ExtINT IRQ...\n");
7338
7339 - timer_ack = 0;
7340 init_8259A(0);
7341 make_8259A_irq(0);
7342 - apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
7343 + apic_write(APIC_LVT0, APIC_DM_EXTINT);
7344
7345 unlock_ExtINT_logic();
7346
7347 if (timer_irq_works()) {
7348 - printk(" works.\n");
7349 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7350 goto out;
7351 }
7352 - printk(" failed :(.\n");
7353 + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
7354 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
7355 - "report. Then try booting with the 'noapic' option");
7356 + "report. Then try booting with the 'noapic' option.\n");
7357 out:
7358 local_irq_restore(flags);
7359 }
7360 @@ -2314,11 +2355,21 @@ int timer_uses_ioapic_pin_0 = 0;
7361 #endif
7362
7363 /*
7364 - *
7365 - * IRQ's that are handled by the PIC in the MPS IOAPIC case.
7366 - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
7367 - * Linux doesn't really care, as it's not actually used
7368 - * for any interrupt handling anyway.
7369 + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
7370 + * to devices. However there may be an I/O APIC pin available for
7371 + * this interrupt regardless. The pin may be left unconnected, but
7372 + * typically it will be reused as an ExtINT cascade interrupt for
7373 + * the master 8259A. In the MPS case such a pin will normally be
7374 + * reported as an ExtINT interrupt in the MP table. With ACPI
7375 + * there is no provision for ExtINT interrupts, and in the absence
7376 + * of an override it would be treated as an ordinary ISA I/O APIC
7377 + * interrupt, that is edge-triggered and unmasked by default. We
7378 + * used to do this, but it caused problems on some systems because
7379 + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
7380 + * the same ExtINT cascade interrupt to drive the local APIC of the
7381 + * bootstrap processor. Therefore we refrain from routing IRQ2 to
7382 + * the I/O APIC in all cases now. No actual device should request
7383 + * it anyway. --macro
7384 */
7385 #define PIC_IRQS (1 << PIC_CASCADE_IR)
7386
7387 @@ -2328,25 +2379,22 @@ void __init setup_IO_APIC(void)
7388 int i;
7389
7390 /* Reserve all the system vectors. */
7391 - for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
7392 + for (i = first_system_vector; i < NR_VECTORS; i++)
7393 set_bit(i, used_vectors);
7394 #endif
7395
7396 enable_IO_APIC();
7397
7398 - if (acpi_ioapic)
7399 - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
7400 - else
7401 - io_apic_irqs = ~PIC_IRQS;
7402 + io_apic_irqs = ~PIC_IRQS;
7403
7404 printk("ENABLING IO-APIC IRQs\n");
7405
7406 +#ifndef CONFIG_XEN
7407 /*
7408 * Set up IO-APIC IRQ routing.
7409 */
7410 if (!acpi_ioapic)
7411 setup_ioapic_ids_from_mpc();
7412 -#ifndef CONFIG_XEN
7413 sync_Arb_IDs();
7414 #endif
7415 setup_IO_APIC_irqs();
7416 @@ -2356,28 +2404,14 @@ void __init setup_IO_APIC(void)
7417 print_IO_APIC();
7418 }
7419
7420 -static int __init setup_disable_8254_timer(char *s)
7421 -{
7422 - timer_over_8254 = -1;
7423 - return 1;
7424 -}
7425 -static int __init setup_enable_8254_timer(char *s)
7426 -{
7427 - timer_over_8254 = 2;
7428 - return 1;
7429 -}
7430 -
7431 -__setup("disable_8254_timer", setup_disable_8254_timer);
7432 -__setup("enable_8254_timer", setup_enable_8254_timer);
7433 -
7434 /*
7435 * Called after all the initialization is done. If we didnt find any
7436 * APIC bugs then we can allow the modify fast path
7437 */
7438 -
7439 +
7440 static int __init io_apic_bug_finalize(void)
7441 {
7442 - if(sis_apic_bug == -1)
7443 + if (sis_apic_bug == -1)
7444 sis_apic_bug = 0;
7445 if (is_initial_xendomain()) {
7446 struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
7447 @@ -2396,17 +2430,17 @@ struct sysfs_ioapic_data {
7448 struct sys_device dev;
7449 struct IO_APIC_route_entry entry[0];
7450 };
7451 -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
7452 +static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
7453
7454 static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
7455 {
7456 struct IO_APIC_route_entry *entry;
7457 struct sysfs_ioapic_data *data;
7458 int i;
7459 -
7460 +
7461 data = container_of(dev, struct sysfs_ioapic_data, dev);
7462 entry = data->entry;
7463 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7464 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7465 entry[i] = ioapic_read_entry(dev->id, i);
7466
7467 return 0;
7468 @@ -2419,18 +2453,18 @@ static int ioapic_resume(struct sys_devi
7469 unsigned long flags;
7470 union IO_APIC_reg_00 reg_00;
7471 int i;
7472 -
7473 +
7474 data = container_of(dev, struct sysfs_ioapic_data, dev);
7475 entry = data->entry;
7476
7477 spin_lock_irqsave(&ioapic_lock, flags);
7478 reg_00.raw = io_apic_read(dev->id, 0);
7479 - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
7480 - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
7481 + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
7482 + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
7483 io_apic_write(dev->id, 0, reg_00.raw);
7484 }
7485 spin_unlock_irqrestore(&ioapic_lock, flags);
7486 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7487 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7488 ioapic_write_entry(dev->id, i, entry[i]);
7489
7490 return 0;
7491 @@ -2444,24 +2478,23 @@ static struct sysdev_class ioapic_sysdev
7492
7493 static int __init ioapic_init_sysfs(void)
7494 {
7495 - struct sys_device * dev;
7496 + struct sys_device *dev;
7497 int i, size, error = 0;
7498
7499 error = sysdev_class_register(&ioapic_sysdev_class);
7500 if (error)
7501 return error;
7502
7503 - for (i = 0; i < nr_ioapics; i++ ) {
7504 - size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7505 + for (i = 0; i < nr_ioapics; i++) {
7506 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7507 * sizeof(struct IO_APIC_route_entry);
7508 - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
7509 + mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
7510 if (!mp_ioapic_data[i]) {
7511 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
7512 continue;
7513 }
7514 - memset(mp_ioapic_data[i], 0, size);
7515 dev = &mp_ioapic_data[i]->dev;
7516 - dev->id = i;
7517 + dev->id = i;
7518 dev->cls = &ioapic_sysdev_class;
7519 error = sysdev_register(dev);
7520 if (error) {
7521 @@ -2538,7 +2571,7 @@ static int msi_compose_msg(struct pci_de
7522 msg->address_lo =
7523 MSI_ADDR_BASE_LO |
7524 ((INT_DEST_MODE == 0) ?
7525 - MSI_ADDR_DEST_MODE_PHYSICAL:
7526 +MSI_ADDR_DEST_MODE_PHYSICAL:
7527 MSI_ADDR_DEST_MODE_LOGICAL) |
7528 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7529 MSI_ADDR_REDIRECTION_CPU:
7530 @@ -2549,7 +2582,7 @@ static int msi_compose_msg(struct pci_de
7531 MSI_DATA_TRIGGER_EDGE |
7532 MSI_DATA_LEVEL_ASSERT |
7533 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7534 - MSI_DATA_DELIVERY_FIXED:
7535 +MSI_DATA_DELIVERY_FIXED:
7536 MSI_DATA_DELIVERY_LOWPRI) |
7537 MSI_DATA_VECTOR(vector);
7538 }
7539 @@ -2720,12 +2753,12 @@ int arch_setup_ht_irq(unsigned int irq,
7540 #endif /* CONFIG_HT_IRQ */
7541
7542 /* --------------------------------------------------------------------------
7543 - ACPI-based IOAPIC Configuration
7544 + ACPI-based IOAPIC Configuration
7545 -------------------------------------------------------------------------- */
7546
7547 #ifdef CONFIG_ACPI
7548
7549 -int __init io_apic_get_unique_id (int ioapic, int apic_id)
7550 +int __init io_apic_get_unique_id(int ioapic, int apic_id)
7551 {
7552 #ifndef CONFIG_XEN
7553 union IO_APIC_reg_00 reg_00;
7554 @@ -2735,10 +2768,10 @@ int __init io_apic_get_unique_id (int io
7555 int i = 0;
7556
7557 /*
7558 - * The P4 platform supports up to 256 APIC IDs on two separate APIC
7559 - * buses (one for LAPICs, one for IOAPICs), where predecessors only
7560 + * The P4 platform supports up to 256 APIC IDs on two separate APIC
7561 + * buses (one for LAPICs, one for IOAPICs), where predecessors only
7562 * supports up to 16 on one shared APIC bus.
7563 - *
7564 + *
7565 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
7566 * advantage of new APIC bus architecture.
7567 */
7568 @@ -2757,7 +2790,7 @@ int __init io_apic_get_unique_id (int io
7569 }
7570
7571 /*
7572 - * Every APIC in a system must have a unique ID or we get lots of nice
7573 + * Every APIC in a system must have a unique ID or we get lots of nice
7574 * 'stuck on smp_invalidate_needed IPI wait' messages.
7575 */
7576 if (check_apicid_used(apic_id_map, apic_id)) {
7577 @@ -2774,7 +2807,7 @@ int __init io_apic_get_unique_id (int io
7578 "trying %d\n", ioapic, apic_id, i);
7579
7580 apic_id = i;
7581 - }
7582 + }
7583
7584 tmp = apicid_to_cpu_present(apic_id);
7585 physids_or(apic_id_map, apic_id_map, tmp);
7586 @@ -2802,7 +2835,7 @@ int __init io_apic_get_unique_id (int io
7587 }
7588
7589
7590 -int __init io_apic_get_version (int ioapic)
7591 +int __init io_apic_get_version(int ioapic)
7592 {
7593 union IO_APIC_reg_01 reg_01;
7594 unsigned long flags;
7595 @@ -2815,7 +2848,7 @@ int __init io_apic_get_version (int ioap
7596 }
7597
7598
7599 -int __init io_apic_get_redir_entries (int ioapic)
7600 +int __init io_apic_get_redir_entries(int ioapic)
7601 {
7602 union IO_APIC_reg_01 reg_01;
7603 unsigned long flags;
7604 @@ -2828,7 +2861,7 @@ int __init io_apic_get_redir_entries (in
7605 }
7606
7607
7608 -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
7609 +int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
7610 {
7611 struct IO_APIC_route_entry entry;
7612
7613 @@ -2844,7 +2877,7 @@ int io_apic_set_pci_routing (int ioapic,
7614 * corresponding device driver registers for this IRQ.
7615 */
7616
7617 - memset(&entry,0,sizeof(entry));
7618 + memset(&entry, 0, sizeof(entry));
7619
7620 entry.delivery_mode = INT_DELIVERY_MODE;
7621 entry.dest_mode = INT_DEST_MODE;
7622 @@ -2863,7 +2896,7 @@ int io_apic_set_pci_routing (int ioapic,
7623
7624 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
7625 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
7626 - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
7627 + mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
7628 edge_level, active_high_low);
7629
7630 ioapic_register_intr(irq, entry.vector, edge_level);
7631 @@ -2884,8 +2917,8 @@ int acpi_get_override_irq(int bus_irq, i
7632 return -1;
7633
7634 for (i = 0; i < mp_irq_entries; i++)
7635 - if (mp_irqs[i].mpc_irqtype == mp_INT &&
7636 - mp_irqs[i].mpc_srcbusirq == bus_irq)
7637 + if (mp_irqs[i].mp_irqtype == mp_INT &&
7638 + mp_irqs[i].mp_srcbusirq == bus_irq)
7639 break;
7640 if (i >= mp_irq_entries)
7641 return -1;
7642 @@ -2918,3 +2951,35 @@ static int __init parse_noapic(char *arg
7643 return 0;
7644 }
7645 early_param("noapic", parse_noapic);
7646 +
7647 +#ifndef CONFIG_XEN
7648 +void __init ioapic_init_mappings(void)
7649 +{
7650 + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
7651 + int i;
7652 +
7653 + for (i = 0; i < nr_ioapics; i++) {
7654 + if (smp_found_config) {
7655 + ioapic_phys = mp_ioapics[i].mp_apicaddr;
7656 + if (!ioapic_phys) {
7657 + printk(KERN_ERR
7658 + "WARNING: bogus zero IO-APIC "
7659 + "address found in MPTABLE, "
7660 + "disabling IO/APIC support!\n");
7661 + smp_found_config = 0;
7662 + skip_ioapic_setup = 1;
7663 + goto fake_ioapic_page;
7664 + }
7665 + } else {
7666 +fake_ioapic_page:
7667 + ioapic_phys = (unsigned long)
7668 + alloc_bootmem_pages(PAGE_SIZE);
7669 + ioapic_phys = __pa(ioapic_phys);
7670 + }
7671 + set_fixmap_nocache(idx, ioapic_phys);
7672 + printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
7673 + __fix_to_virt(idx), ioapic_phys);
7674 + idx++;
7675 + }
7676 +}
7677 +#endif
7678 Index: head-2008-12-01/arch/x86/kernel/io_apic_64-xen.c
7679 ===================================================================
7680 --- head-2008-12-01.orig/arch/x86/kernel/io_apic_64-xen.c 2008-12-01 11:44:55.000000000 +0100
7681 +++ head-2008-12-01/arch/x86/kernel/io_apic_64-xen.c 2008-12-01 11:49:07.000000000 +0100
7682 @@ -45,6 +45,7 @@
7683 #include <asm/proto.h>
7684 #include <asm/acpi.h>
7685 #include <asm/dma.h>
7686 +#include <asm/i8259.h>
7687 #include <asm/nmi.h>
7688 #include <asm/msidef.h>
7689 #include <asm/hypertransport.h>
7690 @@ -63,10 +64,16 @@ struct irq_cfg {
7691 };
7692
7693 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
7694 -struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7695 +static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7696
7697 static int assign_irq_vector(int irq, cpumask_t mask);
7698
7699 +#ifndef CONFIG_XEN
7700 +int first_system_vector = 0xfe;
7701 +
7702 +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
7703 +#endif
7704 +
7705 #define __apicdebuginit __init
7706
7707 int sis_apic_bug; /* not actually supported, dummy for compile */
7708 @@ -89,14 +96,14 @@ unsigned long io_apic_irqs;
7709
7710 #define clear_IO_APIC() ((void)0)
7711 #else
7712 -int timer_over_8254 __initdata = 1;
7713 +int timer_through_8259 __initdata;
7714
7715 /* Where if anywhere is the i8259 connect in external int mode */
7716 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
7717 #endif
7718
7719 static DEFINE_SPINLOCK(ioapic_lock);
7720 -DEFINE_SPINLOCK(vector_lock);
7721 +static DEFINE_SPINLOCK(vector_lock);
7722
7723 /*
7724 * # of IRQ routing registers
7725 @@ -104,15 +111,17 @@ DEFINE_SPINLOCK(vector_lock);
7726 int nr_ioapic_registers[MAX_IO_APICS];
7727
7728 /* I/O APIC entries */
7729 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
7730 +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
7731 int nr_ioapics;
7732
7733 /* MP IRQ source entries */
7734 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7735 +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7736
7737 /* # of MP IRQ source entries */
7738 int mp_irq_entries;
7739
7740 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
7741 +
7742 /*
7743 * Rough estimation of how many shared IRQs there are, can
7744 * be changed anytime.
7745 @@ -141,7 +150,7 @@ struct io_apic {
7746 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
7747 {
7748 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
7749 - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
7750 + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
7751 }
7752 #endif
7753
7754 @@ -155,7 +164,7 @@ static inline unsigned int io_apic_read(
7755 struct physdev_apic apic_op;
7756 int ret;
7757
7758 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7759 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7760 apic_op.reg = reg;
7761 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
7762 if (ret)
7763 @@ -173,7 +182,7 @@ static inline void io_apic_write(unsigne
7764 #else
7765 struct physdev_apic apic_op;
7766
7767 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7768 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7769 apic_op.reg = reg;
7770 apic_op.value = value;
7771 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
7772 @@ -209,7 +218,7 @@ static bool io_apic_level_ack_pending(un
7773 break;
7774 reg = io_apic_read(entry->apic, 0x10 + pin*2);
7775 /* Is the remote IRR bit set? */
7776 - if ((reg >> 14) & 1) {
7777 + if (reg & IO_APIC_REDIR_REMOTE_IRR) {
7778 spin_unlock_irqrestore(&ioapic_lock, flags);
7779 return true;
7780 }
7781 @@ -308,7 +317,7 @@ static void __target_IO_APIC_irq(unsigne
7782 break;
7783 io_apic_write(apic, 0x11 + pin*2, dest);
7784 reg = io_apic_read(apic, 0x10 + pin*2);
7785 - reg &= ~0x000000ff;
7786 + reg &= ~IO_APIC_REDIR_VECTOR_MASK;
7787 reg |= vector;
7788 io_apic_modify(apic, reg);
7789 if (!entry->next)
7790 @@ -372,6 +381,26 @@ static void add_pin_to_irq(unsigned int
7791 }
7792
7793 #ifndef CONFIG_XEN
7794 +/*
7795 + * Reroute an IRQ to a different pin.
7796 + */
7797 +static void __init replace_pin_at_irq(unsigned int irq,
7798 + int oldapic, int oldpin,
7799 + int newapic, int newpin)
7800 +{
7801 + struct irq_pin_list *entry = irq_2_pin + irq;
7802 +
7803 + while (1) {
7804 + if (entry->apic == oldapic && entry->pin == oldpin) {
7805 + entry->apic = newapic;
7806 + entry->pin = newpin;
7807 + }
7808 + if (!entry->next)
7809 + break;
7810 + entry = irq_2_pin + entry->next;
7811 + }
7812 +}
7813 +
7814 #define __DO_ACTION(R, ACTION, FINAL) \
7815 \
7816 { \
7817 @@ -399,10 +428,11 @@ static void add_pin_to_irq(unsigned int
7818 static void name##_IO_APIC_irq (unsigned int irq) \
7819 __DO_ACTION(R, ACTION, FINAL)
7820
7821 -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
7822 - /* mask = 1 */
7823 -DO_ACTION( __unmask, 0, &= 0xfffeffff, )
7824 - /* mask = 0 */
7825 +/* mask = 1 */
7826 +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
7827 +
7828 +/* mask = 0 */
7829 +DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
7830
7831 static void mask_IO_APIC_irq (unsigned int irq)
7832 {
7833 @@ -465,22 +495,6 @@ static int __init disable_timer_pin_setu
7834 }
7835 __setup("disable_timer_pin_1", disable_timer_pin_setup);
7836
7837 -#ifndef CONFIG_XEN
7838 -static int __init setup_disable_8254_timer(char *s)
7839 -{
7840 - timer_over_8254 = -1;
7841 - return 1;
7842 -}
7843 -static int __init setup_enable_8254_timer(char *s)
7844 -{
7845 - timer_over_8254 = 2;
7846 - return 1;
7847 -}
7848 -
7849 -__setup("disable_8254_timer", setup_disable_8254_timer);
7850 -__setup("enable_8254_timer", setup_enable_8254_timer);
7851 -#endif /* !CONFIG_XEN */
7852 -
7853
7854 /*
7855 * Find the IRQ entry number of a certain pin.
7856 @@ -490,10 +504,10 @@ static int find_irq_entry(int apic, int
7857 int i;
7858
7859 for (i = 0; i < mp_irq_entries; i++)
7860 - if (mp_irqs[i].mpc_irqtype == type &&
7861 - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
7862 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
7863 - mp_irqs[i].mpc_dstirq == pin)
7864 + if (mp_irqs[i].mp_irqtype == type &&
7865 + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
7866 + mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
7867 + mp_irqs[i].mp_dstirq == pin)
7868 return i;
7869
7870 return -1;
7871 @@ -508,13 +522,13 @@ static int __init find_isa_irq_pin(int i
7872 int i;
7873
7874 for (i = 0; i < mp_irq_entries; i++) {
7875 - int lbus = mp_irqs[i].mpc_srcbus;
7876 + int lbus = mp_irqs[i].mp_srcbus;
7877
7878 if (test_bit(lbus, mp_bus_not_pci) &&
7879 - (mp_irqs[i].mpc_irqtype == type) &&
7880 - (mp_irqs[i].mpc_srcbusirq == irq))
7881 + (mp_irqs[i].mp_irqtype == type) &&
7882 + (mp_irqs[i].mp_srcbusirq == irq))
7883
7884 - return mp_irqs[i].mpc_dstirq;
7885 + return mp_irqs[i].mp_dstirq;
7886 }
7887 return -1;
7888 }
7889 @@ -524,17 +538,17 @@ static int __init find_isa_irq_apic(int
7890 int i;
7891
7892 for (i = 0; i < mp_irq_entries; i++) {
7893 - int lbus = mp_irqs[i].mpc_srcbus;
7894 + int lbus = mp_irqs[i].mp_srcbus;
7895
7896 if (test_bit(lbus, mp_bus_not_pci) &&
7897 - (mp_irqs[i].mpc_irqtype == type) &&
7898 - (mp_irqs[i].mpc_srcbusirq == irq))
7899 + (mp_irqs[i].mp_irqtype == type) &&
7900 + (mp_irqs[i].mp_srcbusirq == irq))
7901 break;
7902 }
7903 if (i < mp_irq_entries) {
7904 int apic;
7905 for(apic = 0; apic < nr_ioapics; apic++) {
7906 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
7907 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
7908 return apic;
7909 }
7910 }
7911 @@ -555,28 +569,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7912
7913 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
7914 bus, slot, pin);
7915 - if (mp_bus_id_to_pci_bus[bus] == -1) {
7916 + if (test_bit(bus, mp_bus_not_pci)) {
7917 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
7918 return -1;
7919 }
7920 for (i = 0; i < mp_irq_entries; i++) {
7921 - int lbus = mp_irqs[i].mpc_srcbus;
7922 + int lbus = mp_irqs[i].mp_srcbus;
7923
7924 for (apic = 0; apic < nr_ioapics; apic++)
7925 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
7926 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
7927 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
7928 + mp_irqs[i].mp_dstapic == MP_APIC_ALL)
7929 break;
7930
7931 if (!test_bit(lbus, mp_bus_not_pci) &&
7932 - !mp_irqs[i].mpc_irqtype &&
7933 + !mp_irqs[i].mp_irqtype &&
7934 (bus == lbus) &&
7935 - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
7936 - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
7937 + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
7938 + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
7939
7940 if (!(apic || IO_APIC_IRQ(irq)))
7941 continue;
7942
7943 - if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
7944 + if (pin == (mp_irqs[i].mp_srcbusirq & 3))
7945 return irq;
7946 /*
7947 * Use the first all-but-pin matching entry as a
7948 @@ -604,13 +618,13 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7949
7950 static int MPBIOS_polarity(int idx)
7951 {
7952 - int bus = mp_irqs[idx].mpc_srcbus;
7953 + int bus = mp_irqs[idx].mp_srcbus;
7954 int polarity;
7955
7956 /*
7957 * Determine IRQ line polarity (high active or low active):
7958 */
7959 - switch (mp_irqs[idx].mpc_irqflag & 3)
7960 + switch (mp_irqs[idx].mp_irqflag & 3)
7961 {
7962 case 0: /* conforms, ie. bus-type dependent polarity */
7963 if (test_bit(bus, mp_bus_not_pci))
7964 @@ -646,13 +660,13 @@ static int MPBIOS_polarity(int idx)
7965
7966 static int MPBIOS_trigger(int idx)
7967 {
7968 - int bus = mp_irqs[idx].mpc_srcbus;
7969 + int bus = mp_irqs[idx].mp_srcbus;
7970 int trigger;
7971
7972 /*
7973 * Determine IRQ trigger mode (edge or level sensitive):
7974 */
7975 - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
7976 + switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
7977 {
7978 case 0: /* conforms, ie. bus-type dependent */
7979 if (test_bit(bus, mp_bus_not_pci))
7980 @@ -699,16 +713,16 @@ static inline int irq_trigger(int idx)
7981 static int pin_2_irq(int idx, int apic, int pin)
7982 {
7983 int irq, i;
7984 - int bus = mp_irqs[idx].mpc_srcbus;
7985 + int bus = mp_irqs[idx].mp_srcbus;
7986
7987 /*
7988 * Debugging check, we are in big trouble if this message pops up!
7989 */
7990 - if (mp_irqs[idx].mpc_dstirq != pin)
7991 + if (mp_irqs[idx].mp_dstirq != pin)
7992 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
7993
7994 if (test_bit(bus, mp_bus_not_pci)) {
7995 - irq = mp_irqs[idx].mpc_srcbusirq;
7996 + irq = mp_irqs[idx].mp_srcbusirq;
7997 } else {
7998 /*
7999 * PCI IRQs are mapped in order
8000 @@ -722,6 +736,19 @@ static int pin_2_irq(int idx, int apic,
8001 return irq;
8002 }
8003
8004 +void lock_vector_lock(void)
8005 +{
8006 + /* Used to the online set of cpus does not change
8007 + * during assign_irq_vector.
8008 + */
8009 + spin_lock(&vector_lock);
8010 +}
8011 +
8012 +void unlock_vector_lock(void)
8013 +{
8014 + spin_unlock(&vector_lock);
8015 +}
8016 +
8017 static int __assign_irq_vector(int irq, cpumask_t mask)
8018 {
8019 struct physdev_irq irq_op;
8020 @@ -773,7 +800,7 @@ static void __clear_irq_vector(int irq)
8021
8022 vector = cfg->vector;
8023 cpus_and(mask, cfg->domain, cpu_online_map);
8024 - for_each_cpu_mask(cpu, mask)
8025 + for_each_cpu_mask_nr(cpu, mask)
8026 per_cpu(vector_irq, cpu)[vector] = -1;
8027
8028 cfg->vector = 0;
8029 @@ -842,7 +869,7 @@ static void setup_IO_APIC_irq(int apic,
8030 apic_printk(APIC_VERBOSE,KERN_DEBUG
8031 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
8032 "IRQ %d Mode:%i Active:%i)\n",
8033 - apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
8034 + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
8035 irq, trigger, polarity);
8036
8037 /*
8038 @@ -883,10 +910,10 @@ static void __init setup_IO_APIC_irqs(vo
8039 idx = find_irq_entry(apic,pin,mp_INT);
8040 if (idx == -1) {
8041 if (first_notcon) {
8042 - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8043 + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
8044 first_notcon = 0;
8045 } else
8046 - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8047 + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
8048 continue;
8049 }
8050 if (!first_notcon) {
8051 @@ -908,26 +935,21 @@ static void __init setup_IO_APIC_irqs(vo
8052
8053 #ifndef CONFIG_XEN
8054 /*
8055 - * Set up the 8259A-master output pin as broadcast to all
8056 - * CPUs.
8057 + * Set up the timer pin, possibly with the 8259A-master behind.
8058 */
8059 -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
8060 +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
8061 + int vector)
8062 {
8063 struct IO_APIC_route_entry entry;
8064
8065 memset(&entry, 0, sizeof(entry));
8066
8067 - disable_8259A_irq(0);
8068 -
8069 - /* mask LVT0 */
8070 - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8071 -
8072 /*
8073 * We use logical delivery to get the timer IRQ
8074 * to the first CPU.
8075 */
8076 entry.dest_mode = INT_DEST_MODE;
8077 - entry.mask = 0; /* unmask IRQ now */
8078 + entry.mask = 1; /* mask IRQ now */
8079 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
8080 entry.delivery_mode = INT_DELIVERY_MODE;
8081 entry.polarity = 0;
8082 @@ -936,7 +958,7 @@ static void __init setup_ExtINT_IRQ0_pin
8083
8084 /*
8085 * The timer IRQ doesn't have to know that behind the
8086 - * scene we have a 8259A-master in AEOI mode ...
8087 + * scene we may have a 8259A-master in AEOI mode ...
8088 */
8089 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
8090
8091 @@ -944,8 +966,6 @@ static void __init setup_ExtINT_IRQ0_pin
8092 * Add it to the IO-APIC irq-routing table:
8093 */
8094 ioapic_write_entry(apic, pin, entry);
8095 -
8096 - enable_8259A_irq(0);
8097 }
8098
8099 void __apicdebuginit print_IO_APIC(void)
8100 @@ -962,7 +982,7 @@ void __apicdebuginit print_IO_APIC(void)
8101 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
8102 for (i = 0; i < nr_ioapics; i++)
8103 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
8104 - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
8105 + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
8106
8107 /*
8108 * We are a bit conservative about what we expect. We have to
8109 @@ -980,7 +1000,7 @@ void __apicdebuginit print_IO_APIC(void)
8110 spin_unlock_irqrestore(&ioapic_lock, flags);
8111
8112 printk("\n");
8113 - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
8114 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
8115 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
8116 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
8117
8118 @@ -1072,6 +1092,7 @@ void __apicdebuginit print_local_APIC(vo
8119
8120 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
8121 smp_processor_id(), hard_smp_processor_id());
8122 + v = apic_read(APIC_ID);
8123 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
8124 v = apic_read(APIC_LVR);
8125 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
8126 @@ -1141,7 +1162,7 @@ void __apicdebuginit print_local_APIC(vo
8127
8128 void print_all_local_APICs (void)
8129 {
8130 - on_each_cpu(print_local_APIC, NULL, 1, 1);
8131 + on_each_cpu(print_local_APIC, NULL, 1);
8132 }
8133
8134 void __apicdebuginit print_PIC(void)
8135 @@ -1175,6 +1196,8 @@ void __apicdebuginit print_PIC(void)
8136 v = inb(0x4d1) << 8 | inb(0x4d0);
8137 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
8138 }
8139 +#else
8140 +void __apicdebuginit print_IO_APIC(void) {}
8141 #endif /* !CONFIG_XEN */
8142
8143 void __init enable_IO_APIC(void)
8144 @@ -1359,12 +1382,10 @@ static unsigned int startup_ioapic_irq(u
8145 static int ioapic_retrigger_irq(unsigned int irq)
8146 {
8147 struct irq_cfg *cfg = &irq_cfg[irq];
8148 - cpumask_t mask;
8149 unsigned long flags;
8150
8151 spin_lock_irqsave(&vector_lock, flags);
8152 - mask = cpumask_of_cpu(first_cpu(cfg->domain));
8153 - send_IPI_mask(mask, cfg->vector);
8154 + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
8155 spin_unlock_irqrestore(&vector_lock, flags);
8156
8157 return 1;
8158 @@ -1545,7 +1566,7 @@ static inline void init_IO_APIC_traps(vo
8159 }
8160
8161 #ifndef CONFIG_XEN
8162 -static void enable_lapic_irq (unsigned int irq)
8163 +static void unmask_lapic_irq(unsigned int irq)
8164 {
8165 unsigned long v;
8166
8167 @@ -1553,7 +1574,7 @@ static void enable_lapic_irq (unsigned i
8168 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
8169 }
8170
8171 -static void disable_lapic_irq (unsigned int irq)
8172 +static void mask_lapic_irq(unsigned int irq)
8173 {
8174 unsigned long v;
8175
8176 @@ -1566,19 +1587,20 @@ static void ack_lapic_irq (unsigned int
8177 ack_APIC_irq();
8178 }
8179
8180 -static void end_lapic_irq (unsigned int i) { /* nothing */ }
8181 -
8182 -static struct hw_interrupt_type lapic_irq_type __read_mostly = {
8183 - .name = "local-APIC",
8184 - .typename = "local-APIC-edge",
8185 - .startup = NULL, /* startup_irq() not used for IRQ0 */
8186 - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
8187 - .enable = enable_lapic_irq,
8188 - .disable = disable_lapic_irq,
8189 - .ack = ack_lapic_irq,
8190 - .end = end_lapic_irq,
8191 +static struct irq_chip lapic_chip __read_mostly = {
8192 + .name = "local-APIC",
8193 + .mask = mask_lapic_irq,
8194 + .unmask = unmask_lapic_irq,
8195 + .ack = ack_lapic_irq,
8196 };
8197
8198 +static void lapic_register_intr(int irq)
8199 +{
8200 + irq_desc[irq].status &= ~IRQ_LEVEL;
8201 + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
8202 + "edge");
8203 +}
8204 +
8205 static void __init setup_nmi(void)
8206 {
8207 /*
8208 @@ -1664,6 +1686,7 @@ static inline void __init check_timer(vo
8209 struct irq_cfg *cfg = irq_cfg + 0;
8210 int apic1, pin1, apic2, pin2;
8211 unsigned long flags;
8212 + int no_pin1 = 0;
8213
8214 local_irq_save(flags);
8215
8216 @@ -1674,34 +1697,48 @@ static inline void __init check_timer(vo
8217 assign_irq_vector(0, TARGET_CPUS);
8218
8219 /*
8220 - * Subtle, code in do_timer_interrupt() expects an AEOI
8221 - * mode for the 8259A whenever interrupts are routed
8222 - * through I/O APICs. Also IRQ0 has to be enabled in
8223 - * the 8259A which implies the virtual wire has to be
8224 - * disabled in the local APIC.
8225 + * As IRQ0 is to be enabled in the 8259A, the virtual
8226 + * wire has to be disabled in the local APIC.
8227 */
8228 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8229 init_8259A(1);
8230 - if (timer_over_8254 > 0)
8231 - enable_8259A_irq(0);
8232
8233 pin1 = find_isa_irq_pin(0, mp_INT);
8234 apic1 = find_isa_irq_apic(0, mp_INT);
8235 pin2 = ioapic_i8259.pin;
8236 apic2 = ioapic_i8259.apic;
8237
8238 - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
8239 - cfg->vector, apic1, pin1, apic2, pin2);
8240 + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
8241 + "apic1=%d pin1=%d apic2=%d pin2=%d\n",
8242 + cfg->vector, apic1, pin1, apic2, pin2);
8243 +
8244 + /*
8245 + * Some BIOS writers are clueless and report the ExtINTA
8246 + * I/O APIC input from the cascaded 8259A as the timer
8247 + * interrupt input. So just in case, if only one pin
8248 + * was found above, try it both directly and through the
8249 + * 8259A.
8250 + */
8251 + if (pin1 == -1) {
8252 + pin1 = pin2;
8253 + apic1 = apic2;
8254 + no_pin1 = 1;
8255 + } else if (pin2 == -1) {
8256 + pin2 = pin1;
8257 + apic2 = apic1;
8258 + }
8259
8260 if (pin1 != -1) {
8261 /*
8262 * Ok, does IRQ0 through the IOAPIC work?
8263 */
8264 + if (no_pin1) {
8265 + add_pin_to_irq(0, apic1, pin1);
8266 + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
8267 + }
8268 unmask_IO_APIC_irq(0);
8269 if (!no_timer_check && timer_irq_works()) {
8270 - nmi_watchdog_default();
8271 if (nmi_watchdog == NMI_IO_APIC) {
8272 - disable_8259A_irq(0);
8273 setup_nmi();
8274 enable_8259A_irq(0);
8275 }
8276 @@ -1710,54 +1747,62 @@ static inline void __init check_timer(vo
8277 goto out;
8278 }
8279 clear_IO_APIC_pin(apic1, pin1);
8280 - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
8281 - "connected to IO-APIC\n");
8282 - }
8283 -
8284 - apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
8285 - "through the 8259A ... ");
8286 - if (pin2 != -1) {
8287 - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
8288 - apic2, pin2);
8289 + if (!no_pin1)
8290 + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
8291 + "8254 timer not connected to IO-APIC\n");
8292 +
8293 + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
8294 + "(IRQ0) through the 8259A ...\n");
8295 + apic_printk(APIC_QUIET, KERN_INFO
8296 + "..... (found apic %d pin %d) ...\n", apic2, pin2);
8297 /*
8298 * legacy devices should be connected to IO APIC #0
8299 */
8300 - setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
8301 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
8302 + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
8303 + unmask_IO_APIC_irq(0);
8304 + enable_8259A_irq(0);
8305 if (timer_irq_works()) {
8306 - apic_printk(APIC_VERBOSE," works.\n");
8307 - nmi_watchdog_default();
8308 + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
8309 + timer_through_8259 = 1;
8310 if (nmi_watchdog == NMI_IO_APIC) {
8311 + disable_8259A_irq(0);
8312 setup_nmi();
8313 + enable_8259A_irq(0);
8314 }
8315 goto out;
8316 }
8317 /*
8318 * Cleanup, just in case ...
8319 */
8320 + disable_8259A_irq(0);
8321 clear_IO_APIC_pin(apic2, pin2);
8322 + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
8323 }
8324 - apic_printk(APIC_VERBOSE," failed.\n");
8325
8326 if (nmi_watchdog == NMI_IO_APIC) {
8327 - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
8328 - nmi_watchdog = 0;
8329 + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
8330 + "through the IO-APIC - disabling NMI Watchdog!\n");
8331 + nmi_watchdog = NMI_NONE;
8332 }
8333
8334 - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
8335 + apic_printk(APIC_QUIET, KERN_INFO
8336 + "...trying to set up timer as Virtual Wire IRQ...\n");
8337
8338 - disable_8259A_irq(0);
8339 - irq_desc[0].chip = &lapic_irq_type;
8340 + lapic_register_intr(0);
8341 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
8342 enable_8259A_irq(0);
8343
8344 if (timer_irq_works()) {
8345 - apic_printk(APIC_VERBOSE," works.\n");
8346 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8347 goto out;
8348 }
8349 + disable_8259A_irq(0);
8350 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
8351 - apic_printk(APIC_VERBOSE," failed.\n");
8352 + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
8353
8354 - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
8355 + apic_printk(APIC_QUIET, KERN_INFO
8356 + "...trying to set up timer as ExtINT IRQ...\n");
8357
8358 init_8259A(0);
8359 make_8259A_irq(0);
8360 @@ -1766,11 +1811,12 @@ static inline void __init check_timer(vo
8361 unlock_ExtINT_logic();
8362
8363 if (timer_irq_works()) {
8364 - apic_printk(APIC_VERBOSE," works.\n");
8365 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8366 goto out;
8367 }
8368 - apic_printk(APIC_VERBOSE," failed :(.\n");
8369 - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
8370 + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
8371 + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
8372 + "report. Then try booting with the 'noapic' option.\n");
8373 out:
8374 local_irq_restore(flags);
8375 }
8376 @@ -1788,10 +1834,21 @@ __setup("no_timer_check", notimercheck);
8377
8378 /*
8379 *
8380 - * IRQs that are handled by the PIC in the MPS IOAPIC case.
8381 - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
8382 - * Linux doesn't really care, as it's not actually used
8383 - * for any interrupt handling anyway.
8384 + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
8385 + * to devices. However there may be an I/O APIC pin available for
8386 + * this interrupt regardless. The pin may be left unconnected, but
8387 + * typically it will be reused as an ExtINT cascade interrupt for
8388 + * the master 8259A. In the MPS case such a pin will normally be
8389 + * reported as an ExtINT interrupt in the MP table. With ACPI
8390 + * there is no provision for ExtINT interrupts, and in the absence
8391 + * of an override it would be treated as an ordinary ISA I/O APIC
8392 + * interrupt, that is edge-triggered and unmasked by default. We
8393 + * used to do this, but it caused problems on some systems because
8394 + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
8395 + * the same ExtINT cascade interrupt to drive the local APIC of the
8396 + * bootstrap processor. Therefore we refrain from routing IRQ2 to
8397 + * the I/O APIC in all cases now. No actual device should request
8398 + * it anyway. --macro
8399 */
8400 #define PIC_IRQS (1<<2)
8401
8402 @@ -1799,10 +1856,7 @@ void __init setup_IO_APIC(void)
8403 {
8404 enable_IO_APIC();
8405
8406 - if (acpi_ioapic)
8407 - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
8408 - else
8409 - io_apic_irqs = ~PIC_IRQS;
8410 + io_apic_irqs = ~PIC_IRQS;
8411
8412 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
8413
8414 @@ -1851,8 +1905,8 @@ static int ioapic_resume(struct sys_devi
8415
8416 spin_lock_irqsave(&ioapic_lock, flags);
8417 reg_00.raw = io_apic_read(dev->id, 0);
8418 - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
8419 - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
8420 + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
8421 + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
8422 io_apic_write(dev->id, 0, reg_00.raw);
8423 }
8424 spin_unlock_irqrestore(&ioapic_lock, flags);
8425 @@ -2254,8 +2308,8 @@ int acpi_get_override_irq(int bus_irq, i
8426 return -1;
8427
8428 for (i = 0; i < mp_irq_entries; i++)
8429 - if (mp_irqs[i].mpc_irqtype == mp_INT &&
8430 - mp_irqs[i].mpc_srcbusirq == bus_irq)
8431 + if (mp_irqs[i].mp_irqtype == mp_INT &&
8432 + mp_irqs[i].mp_srcbusirq == bus_irq)
8433 break;
8434 if (i >= mp_irq_entries)
8435 return -1;
8436 @@ -2349,7 +2403,7 @@ void __init ioapic_init_mappings(void)
8437 ioapic_res = ioapic_setup_resources();
8438 for (i = 0; i < nr_ioapics; i++) {
8439 if (smp_found_config) {
8440 - ioapic_phys = mp_ioapics[i].mpc_apicaddr;
8441 + ioapic_phys = mp_ioapics[i].mp_apicaddr;
8442 } else {
8443 ioapic_phys = (unsigned long)
8444 alloc_bootmem_pages(PAGE_SIZE);
8445 Index: head-2008-12-01/arch/x86/kernel/ipi-xen.c
8446 ===================================================================
8447 --- head-2008-12-01.orig/arch/x86/kernel/ipi-xen.c 2008-12-01 11:44:55.000000000 +0100
8448 +++ head-2008-12-01/arch/x86/kernel/ipi-xen.c 2008-12-01 11:49:07.000000000 +0100
8449 @@ -8,7 +8,6 @@
8450 #include <linux/kernel_stat.h>
8451 #include <linux/mc146818rtc.h>
8452 #include <linux/cache.h>
8453 -#include <linux/interrupt.h>
8454 #include <linux/cpu.h>
8455 #include <linux/module.h>
8456
8457 @@ -85,7 +84,7 @@ void __send_IPI_shortcut(unsigned int sh
8458 /*
8459 * Send the IPI. The write to APIC_ICR fires this off.
8460 */
8461 - apic_write_around(APIC_ICR, cfg);
8462 + apic_write(APIC_ICR, cfg);
8463 #else
8464 int cpu;
8465
8466 @@ -132,7 +131,7 @@ static inline void __send_IPI_dest_field
8467 * prepare target chip field
8468 */
8469 cfg = __prepare_ICR2(mask);
8470 - apic_write_around(APIC_ICR2, cfg);
8471 + apic_write(APIC_ICR2, cfg);
8472
8473 /*
8474 * program the ICR
8475 @@ -142,7 +141,7 @@ static inline void __send_IPI_dest_field
8476 /*
8477 * Send the IPI. The write to APIC_ICR fires this off.
8478 */
8479 - apic_write_around(APIC_ICR, cfg);
8480 + apic_write(APIC_ICR, cfg);
8481 }
8482 #endif
8483
8484 Index: head-2008-12-01/arch/x86/kernel/irq_32-xen.c
8485 ===================================================================
8486 --- head-2008-12-01.orig/arch/x86/kernel/irq_32-xen.c 2008-12-01 11:44:55.000000000 +0100
8487 +++ head-2008-12-01/arch/x86/kernel/irq_32-xen.c 2008-12-01 11:49:07.000000000 +0100
8488 @@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
8489 #endif
8490 }
8491
8492 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
8493 +/* Debugging check for stack overflow: is there less than 1KB free? */
8494 +static int check_stack_overflow(void)
8495 +{
8496 + long sp;
8497 +
8498 + __asm__ __volatile__("andl %%esp,%0" :
8499 + "=r" (sp) : "0" (THREAD_SIZE - 1));
8500 +
8501 + return sp < (sizeof(struct thread_info) + STACK_WARN);
8502 +}
8503 +
8504 +static void print_stack_overflow(void)
8505 +{
8506 + printk(KERN_WARNING "low stack detected by irq handler\n");
8507 + dump_stack();
8508 +}
8509 +
8510 +#else
8511 +static inline int check_stack_overflow(void) { return 0; }
8512 +static inline void print_stack_overflow(void) { }
8513 +#endif
8514 +
8515 #ifdef CONFIG_4KSTACKS
8516 /*
8517 * per-CPU IRQ handling contexts (thread information and stack)
8518 @@ -59,48 +82,26 @@ union irq_ctx {
8519
8520 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
8521 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
8522 -#endif
8523 -
8524 -/*
8525 - * do_IRQ handles all normal device IRQ's (the special
8526 - * SMP cross-CPU interrupts have their own specific
8527 - * handlers).
8528 - */
8529 -unsigned int do_IRQ(struct pt_regs *regs)
8530 -{
8531 - struct pt_regs *old_regs;
8532 - /* high bit used in ret_from_ code */
8533 - int irq = ~regs->orig_ax;
8534 - struct irq_desc *desc = irq_desc + irq;
8535 -#ifdef CONFIG_4KSTACKS
8536 - union irq_ctx *curctx, *irqctx;
8537 - u32 *isp;
8538 -#endif
8539
8540 - if (unlikely((unsigned)irq >= NR_IRQS)) {
8541 - printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8542 - __func__, irq);
8543 - BUG();
8544 - }
8545 +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8546 +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8547
8548 - old_regs = set_irq_regs(regs);
8549 - /*irq_enter();*/
8550 -#ifdef CONFIG_DEBUG_STACKOVERFLOW
8551 - /* Debugging check for stack overflow: is there less than 1KB free? */
8552 - {
8553 - long sp;
8554 -
8555 - __asm__ __volatile__("andl %%esp,%0" :
8556 - "=r" (sp) : "0" (THREAD_SIZE - 1));
8557 - if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
8558 - printk("do_IRQ: stack overflow: %ld\n",
8559 - sp - sizeof(struct thread_info));
8560 - dump_stack();
8561 - }
8562 - }
8563 -#endif
8564 +static void call_on_stack(void *func, void *stack)
8565 +{
8566 + asm volatile("xchgl %%ebx,%%esp \n"
8567 + "call *%%edi \n"
8568 + "movl %%ebx,%%esp \n"
8569 + : "=b" (stack)
8570 + : "0" (stack),
8571 + "D"(func)
8572 + : "memory", "cc", "edx", "ecx", "eax");
8573 +}
8574
8575 -#ifdef CONFIG_4KSTACKS
8576 +static inline int
8577 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
8578 +{
8579 + union irq_ctx *curctx, *irqctx;
8580 + u32 *isp, arg1, arg2;
8581
8582 curctx = (union irq_ctx *) current_thread_info();
8583 irqctx = hardirq_ctx[smp_processor_id()];
8584 @@ -111,52 +112,39 @@ unsigned int do_IRQ(struct pt_regs *regs
8585 * handler) we can't do that and just have to keep using the
8586 * current stack (which is the irq stack already after all)
8587 */
8588 - if (curctx != irqctx) {
8589 - int arg1, arg2, bx;
8590 -
8591 - /* build the stack frame on the IRQ stack */
8592 - isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8593 - irqctx->tinfo.task = curctx->tinfo.task;
8594 - irqctx->tinfo.previous_esp = current_stack_pointer;
8595 + if (unlikely(curctx == irqctx))
8596 + return 0;
8597
8598 - /*
8599 - * Copy the softirq bits in preempt_count so that the
8600 - * softirq checks work in the hardirq context.
8601 - */
8602 - irqctx->tinfo.preempt_count =
8603 - (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8604 - (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8605 -
8606 - asm volatile(
8607 - " xchgl %%ebx,%%esp \n"
8608 - " call *%%edi \n"
8609 - " movl %%ebx,%%esp \n"
8610 - : "=a" (arg1), "=d" (arg2), "=b" (bx)
8611 - : "0" (irq), "1" (desc), "2" (isp),
8612 - "D" (desc->handle_irq)
8613 - : "memory", "cc", "ecx"
8614 - );
8615 - } else
8616 -#endif
8617 - desc->handle_irq(irq, desc);
8618 + /* build the stack frame on the IRQ stack */
8619 + isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
8620 + irqctx->tinfo.task = curctx->tinfo.task;
8621 + irqctx->tinfo.previous_esp = current_stack_pointer;
8622
8623 - /*irq_exit();*/
8624 - set_irq_regs(old_regs);
8625 + /*
8626 + * Copy the softirq bits in preempt_count so that the
8627 + * softirq checks work in the hardirq context.
8628 + */
8629 + irqctx->tinfo.preempt_count =
8630 + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8631 + (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8632 +
8633 + if (unlikely(overflow))
8634 + call_on_stack(print_stack_overflow, isp);
8635 +
8636 + asm volatile("xchgl %%ebx,%%esp \n"
8637 + "call *%%edi \n"
8638 + "movl %%ebx,%%esp \n"
8639 + : "=a" (arg1), "=d" (arg2), "=b" (isp)
8640 + : "0" (irq), "1" (desc), "2" (isp),
8641 + "D" (desc->handle_irq)
8642 + : "memory", "cc", "ecx");
8643 return 1;
8644 }
8645
8646 -#ifdef CONFIG_4KSTACKS
8647 -
8648 -static char softirq_stack[NR_CPUS * THREAD_SIZE]
8649 - __attribute__((__section__(".bss.page_aligned")));
8650 -
8651 -static char hardirq_stack[NR_CPUS * THREAD_SIZE]
8652 - __attribute__((__section__(".bss.page_aligned")));
8653 -
8654 /*
8655 * allocate per-cpu stacks for hardirq and for softirq processing
8656 */
8657 -void irq_ctx_init(int cpu)
8658 +void __cpuinit irq_ctx_init(int cpu)
8659 {
8660 union irq_ctx *irqctx;
8661
8662 @@ -164,25 +152,25 @@ void irq_ctx_init(int cpu)
8663 return;
8664
8665 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
8666 - irqctx->tinfo.task = NULL;
8667 - irqctx->tinfo.exec_domain = NULL;
8668 - irqctx->tinfo.cpu = cpu;
8669 - irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8670 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8671 + irqctx->tinfo.task = NULL;
8672 + irqctx->tinfo.exec_domain = NULL;
8673 + irqctx->tinfo.cpu = cpu;
8674 + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8675 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8676
8677 hardirq_ctx[cpu] = irqctx;
8678
8679 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
8680 - irqctx->tinfo.task = NULL;
8681 - irqctx->tinfo.exec_domain = NULL;
8682 - irqctx->tinfo.cpu = cpu;
8683 - irqctx->tinfo.preempt_count = 0;
8684 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8685 + irqctx->tinfo.task = NULL;
8686 + irqctx->tinfo.exec_domain = NULL;
8687 + irqctx->tinfo.cpu = cpu;
8688 + irqctx->tinfo.preempt_count = 0;
8689 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8690
8691 softirq_ctx[cpu] = irqctx;
8692
8693 - printk("CPU %u irqstacks, hard=%p soft=%p\n",
8694 - cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8695 + printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
8696 + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8697 }
8698
8699 void irq_ctx_exit(int cpu)
8700 @@ -211,25 +199,56 @@ asmlinkage void do_softirq(void)
8701 /* build the stack frame on the softirq stack */
8702 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8703
8704 - asm volatile(
8705 - " xchgl %%ebx,%%esp \n"
8706 - " call __do_softirq \n"
8707 - " movl %%ebx,%%esp \n"
8708 - : "=b"(isp)
8709 - : "0"(isp)
8710 - : "memory", "cc", "edx", "ecx", "eax"
8711 - );
8712 + call_on_stack(__do_softirq, isp);
8713 /*
8714 * Shouldnt happen, we returned above if in_interrupt():
8715 - */
8716 + */
8717 WARN_ON_ONCE(softirq_count());
8718 }
8719
8720 local_irq_restore(flags);
8721 }
8722 +
8723 +#else
8724 +static inline int
8725 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
8726 #endif
8727
8728 /*
8729 + * do_IRQ handles all normal device IRQ's (the special
8730 + * SMP cross-CPU interrupts have their own specific
8731 + * handlers).
8732 + */
8733 +unsigned int do_IRQ(struct pt_regs *regs)
8734 +{
8735 + struct pt_regs *old_regs;
8736 + /* high bit used in ret_from_ code */
8737 + int overflow, irq = ~regs->orig_ax;
8738 + struct irq_desc *desc = irq_desc + irq;
8739 +
8740 + if (unlikely((unsigned)irq >= NR_IRQS)) {
8741 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8742 + __func__, irq);
8743 + BUG();
8744 + }
8745 +
8746 + old_regs = set_irq_regs(regs);
8747 + /*irq_enter();*/
8748 +
8749 + overflow = check_stack_overflow();
8750 +
8751 + if (!execute_on_irq_stack(overflow, desc, irq)) {
8752 + if (unlikely(overflow))
8753 + print_stack_overflow();
8754 + desc->handle_irq(irq, desc);
8755 + }
8756 +
8757 + /*irq_exit();*/
8758 + set_irq_regs(old_regs);
8759 + return 1;
8760 +}
8761 +
8762 +/*
8763 * Interrupt statistics:
8764 */
8765
8766 @@ -337,6 +356,42 @@ skip:
8767 return 0;
8768 }
8769
8770 +/*
8771 + * /proc/stat helpers
8772 + */
8773 +u64 arch_irq_stat_cpu(unsigned int cpu)
8774 +{
8775 + u64 sum = nmi_count(cpu);
8776 +
8777 +#ifdef CONFIG_X86_LOCAL_APIC
8778 + sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
8779 +#endif
8780 +#ifdef CONFIG_SMP
8781 + sum += per_cpu(irq_stat, cpu).irq_resched_count;
8782 + sum += per_cpu(irq_stat, cpu).irq_call_count;
8783 +#ifndef CONFIG_XEN
8784 + sum += per_cpu(irq_stat, cpu).irq_tlb_count;
8785 +#endif
8786 +#endif
8787 +#ifdef CONFIG_X86_MCE
8788 + sum += per_cpu(irq_stat, cpu).irq_thermal_count;
8789 +#endif
8790 +#ifdef CONFIG_X86_LOCAL_APIC
8791 + sum += per_cpu(irq_stat, cpu).irq_spurious_count;
8792 +#endif
8793 + return sum;
8794 +}
8795 +
8796 +u64 arch_irq_stat(void)
8797 +{
8798 + u64 sum = atomic_read(&irq_err_count);
8799 +
8800 +#ifdef CONFIG_X86_IO_APIC
8801 + sum += atomic_read(&irq_mis_count);
8802 +#endif
8803 + return sum;
8804 +}
8805 +
8806 #ifdef CONFIG_HOTPLUG_CPU
8807
8808 void fixup_irqs(cpumask_t map)
8809 Index: head-2008-12-01/arch/x86/kernel/irq_64-xen.c
8810 ===================================================================
8811 --- head-2008-12-01.orig/arch/x86/kernel/irq_64-xen.c 2008-12-01 11:37:10.000000000 +0100
8812 +++ head-2008-12-01/arch/x86/kernel/irq_64-xen.c 2008-12-01 11:49:07.000000000 +0100
8813 @@ -163,6 +163,34 @@ skip:
8814 }
8815
8816 /*
8817 + * /proc/stat helpers
8818 + */
8819 +u64 arch_irq_stat_cpu(unsigned int cpu)
8820 +{
8821 + u64 sum = cpu_pda(cpu)->__nmi_count;
8822 +
8823 + sum += cpu_pda(cpu)->apic_timer_irqs;
8824 +#ifdef CONFIG_SMP
8825 + sum += cpu_pda(cpu)->irq_resched_count;
8826 + sum += cpu_pda(cpu)->irq_call_count;
8827 +#ifndef CONFIG_XEN
8828 + sum += cpu_pda(cpu)->irq_tlb_count;
8829 +#endif
8830 +#endif
8831 +#ifdef CONFIG_X86_MCE
8832 + sum += cpu_pda(cpu)->irq_thermal_count;
8833 + sum += cpu_pda(cpu)->irq_threshold_count;
8834 +#endif
8835 + sum += cpu_pda(cpu)->irq_spurious_count;
8836 + return sum;
8837 +}
8838 +
8839 +u64 arch_irq_stat(void)
8840 +{
8841 + return atomic_read(&irq_err_count);
8842 +}
8843 +
8844 +/*
8845 * do_IRQ handles all normal device IRQ's (the special
8846 * SMP cross-CPU interrupts have their own specific
8847 * handlers).
8848 Index: head-2008-12-01/arch/x86/kernel/ldt-xen.c
8849 ===================================================================
8850 --- head-2008-12-01.orig/arch/x86/kernel/ldt-xen.c 2008-12-01 11:37:10.000000000 +0100
8851 +++ head-2008-12-01/arch/x86/kernel/ldt-xen.c 2008-12-01 11:49:07.000000000 +0100
8852 @@ -20,9 +20,9 @@
8853 #include <asm/mmu_context.h>
8854
8855 #ifdef CONFIG_SMP
8856 -static void flush_ldt(void *null)
8857 +static void flush_ldt(void *current_mm)
8858 {
8859 - if (current->active_mm)
8860 + if (current->active_mm == current_mm)
8861 load_LDT(&current->active_mm->context);
8862 }
8863 #endif
8864 @@ -62,8 +62,6 @@ static int alloc_ldt(mm_context_t *pc, i
8865
8866 if (reload) {
8867 #ifdef CONFIG_SMP
8868 - cpumask_t mask;
8869 -
8870 preempt_disable();
8871 #endif
8872 make_pages_readonly(newldt,
8873 @@ -71,9 +69,9 @@ static int alloc_ldt(mm_context_t *pc, i
8874 XENFEAT_writable_descriptor_tables);
8875 load_LDT(pc);
8876 #ifdef CONFIG_SMP
8877 - mask = cpumask_of_cpu(smp_processor_id());
8878 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
8879 - smp_call_function(flush_ldt, NULL, 1, 1);
8880 + if (!cpus_equal(current->mm->cpu_vm_mask,
8881 + cpumask_of_cpu(smp_processor_id())))
8882 + smp_call_function(flush_ldt, current->mm, 1);
8883 preempt_enable();
8884 #endif
8885 }
8886 Index: head-2008-12-01/arch/x86/kernel/microcode-xen.c
8887 ===================================================================
8888 --- head-2008-12-01.orig/arch/x86/kernel/microcode-xen.c 2008-12-01 11:44:55.000000000 +0100
8889 +++ head-2008-12-01/arch/x86/kernel/microcode-xen.c 2008-12-01 11:49:07.000000000 +0100
8890 @@ -5,13 +5,14 @@
8891 * 2006 Shaohua Li <shaohua.li@intel.com>
8892 *
8893 * This driver allows to upgrade microcode on Intel processors
8894 - * belonging to IA-32 family - PentiumPro, Pentium II,
8895 + * belonging to IA-32 family - PentiumPro, Pentium II,
8896 * Pentium III, Xeon, Pentium 4, etc.
8897 *
8898 - * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
8899 - * Order Number 245472 or free download from:
8900 - *
8901 - * http://developer.intel.com/design/pentium4/manuals/245472.htm
8902 + * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
8903 + * Software Developer's Manual
8904 + * Order Number 253668 or free download from:
8905 + *
8906 + * http://developer.intel.com/design/pentium4/manuals/253668.htm
8907 *
8908 * For more information, go to http://www.urbanmyth.org/microcode
8909 *
8910 @@ -26,6 +27,7 @@
8911 #include <linux/kernel.h>
8912 #include <linux/init.h>
8913 #include <linux/sched.h>
8914 +#include <linux/smp_lock.h>
8915 #include <linux/cpumask.h>
8916 #include <linux/module.h>
8917 #include <linux/slab.h>
8918 @@ -86,6 +88,7 @@ static int do_microcode_update (const vo
8919
8920 static int microcode_open (struct inode *unused1, struct file *unused2)
8921 {
8922 + cycle_kernel_lock();
8923 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
8924 }
8925
8926 @@ -162,7 +165,7 @@ static int request_microcode(void)
8927 c->x86, c->x86_model, c->x86_mask);
8928 error = request_firmware(&firmware, name, &microcode_pdev->dev);
8929 if (error) {
8930 - pr_debug("microcode: ucode data file %s load failed\n", name);
8931 + pr_debug("microcode: data file %s load failed\n", name);
8932 return error;
8933 }
8934
8935 @@ -183,6 +186,9 @@ static int __init microcode_init (void)
8936 {
8937 int error;
8938
8939 + printk(KERN_INFO
8940 + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8941 +
8942 error = microcode_dev_init();
8943 if (error)
8944 return error;
8945 @@ -195,8 +201,6 @@ static int __init microcode_init (void)
8946
8947 request_microcode();
8948
8949 - printk(KERN_INFO
8950 - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8951 return 0;
8952 }
8953
8954 Index: head-2008-12-01/arch/x86/kernel/mpparse-xen.c
8955 ===================================================================
8956 --- head-2008-12-01.orig/arch/x86/kernel/mpparse-xen.c 2008-12-01 11:44:55.000000000 +0100
8957 +++ head-2008-12-01/arch/x86/kernel/mpparse-xen.c 2008-12-01 11:49:07.000000000 +0100
8958 @@ -25,6 +25,9 @@
8959 #include <asm/proto.h>
8960 #include <asm/acpi.h>
8961 #include <asm/bios_ebda.h>
8962 +#include <asm/e820.h>
8963 +#include <asm/trampoline.h>
8964 +#include <asm/setup.h>
8965
8966 #include <mach_apic.h>
8967 #ifdef CONFIG_X86_32
8968 @@ -32,28 +35,6 @@
8969 #include <mach_mpparse.h>
8970 #endif
8971
8972 -/* Have we found an MP table */
8973 -int smp_found_config;
8974 -
8975 -/*
8976 - * Various Linux-internal data structures created from the
8977 - * MP-table.
8978 - */
8979 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
8980 -int mp_bus_id_to_type[MAX_MP_BUSSES];
8981 -#endif
8982 -
8983 -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
8984 -int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
8985 -
8986 -static int mp_current_pci_id;
8987 -
8988 -int pic_mode;
8989 -
8990 -/*
8991 - * Intel MP BIOS table parsing routines:
8992 - */
8993 -
8994 /*
8995 * Checksum an MP configuration block.
8996 */
8997 @@ -68,20 +49,8 @@ static int __init mpf_checksum(unsigned
8998 return sum & 0xFF;
8999 }
9000
9001 -#ifdef CONFIG_X86_NUMAQ
9002 -/*
9003 - * Have to match translation table entries to main table entries by counter
9004 - * hence the mpc_record variable .... can't see a less disgusting way of
9005 - * doing this ....
9006 - */
9007 -
9008 -static int mpc_record;
9009 -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
9010 - __cpuinitdata;
9011 -#endif
9012 -
9013 #ifndef CONFIG_XEN
9014 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
9015 +static void __init MP_processor_info(struct mpc_config_processor *m)
9016 {
9017 int apicid;
9018 char *bootup_cpu = "";
9019 @@ -90,11 +59,12 @@ static void __cpuinit MP_processor_info(
9020 disabled_cpus++;
9021 return;
9022 }
9023 -#ifdef CONFIG_X86_NUMAQ
9024 - apicid = mpc_apic_id(m, translation_table[mpc_record]);
9025 -#else
9026 - apicid = m->mpc_apicid;
9027 -#endif
9028 +
9029 + if (x86_quirks->mpc_apic_id)
9030 + apicid = x86_quirks->mpc_apic_id(m);
9031 + else
9032 + apicid = m->mpc_apicid;
9033 +
9034 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
9035 bootup_cpu = " (Bootup-CPU)";
9036 boot_cpu_physical_apicid = m->mpc_apicid;
9037 @@ -104,24 +74,23 @@ static void __cpuinit MP_processor_info(
9038 generic_processor_info(apicid, m->mpc_apicver);
9039 }
9040 #else
9041 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
9042 +static void __init MP_processor_info(struct mpc_config_processor *m)
9043 {
9044 num_processors++;
9045 }
9046 #endif /* CONFIG_XEN */
9047
9048 +#ifdef CONFIG_X86_IO_APIC
9049 static void __init MP_bus_info(struct mpc_config_bus *m)
9050 {
9051 char str[7];
9052 -
9053 memcpy(str, m->mpc_bustype, 6);
9054 str[6] = 0;
9055
9056 -#ifdef CONFIG_X86_NUMAQ
9057 - mpc_oem_bus_info(m, str, translation_table[mpc_record]);
9058 -#else
9059 - Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
9060 -#endif
9061 + if (x86_quirks->mpc_oem_bus_info)
9062 + x86_quirks->mpc_oem_bus_info(m, str);
9063 + else
9064 + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
9065
9066 #if MAX_MP_BUSSES < 256
9067 if (m->mpc_busid >= MAX_MP_BUSSES) {
9068 @@ -138,12 +107,10 @@ static void __init MP_bus_info(struct mp
9069 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
9070 #endif
9071 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
9072 -#ifdef CONFIG_X86_NUMAQ
9073 - mpc_oem_pci_bus(m, translation_table[mpc_record]);
9074 -#endif
9075 + if (x86_quirks->mpc_oem_pci_bus)
9076 + x86_quirks->mpc_oem_pci_bus(m);
9077 +
9078 clear_bit(m->mpc_busid, mp_bus_not_pci);
9079 - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
9080 - mp_current_pci_id++;
9081 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
9082 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
9083 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
9084 @@ -154,6 +121,7 @@ static void __init MP_bus_info(struct mp
9085 } else
9086 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
9087 }
9088 +#endif
9089
9090 #ifdef CONFIG_X86_IO_APIC
9091
9092 @@ -183,117 +151,111 @@ static void __init MP_ioapic_info(struct
9093 if (bad_ioapic(m->mpc_apicaddr))
9094 return;
9095
9096 - mp_ioapics[nr_ioapics] = *m;
9097 + mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
9098 + mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
9099 + mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
9100 + mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
9101 + mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
9102 nr_ioapics++;
9103 }
9104
9105 -static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9106 +static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
9107 {
9108 - mp_irqs[mp_irq_entries] = *m;
9109 - Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
9110 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9111 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9112 m->mpc_irqtype, m->mpc_irqflag & 3,
9113 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
9114 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
9115 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
9116 - panic("Max # of irq sources exceeded!!\n");
9117 }
9118
9119 -#endif
9120 -
9121 -static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9122 +static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
9123 {
9124 - Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
9125 - " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9126 - m->mpc_irqtype, m->mpc_irqflag & 3,
9127 - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9128 - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9129 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9130 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9131 + mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
9132 + (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
9133 + mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
9134 }
9135
9136 -#ifdef CONFIG_X86_NUMAQ
9137 -static void __init MP_translation_info(struct mpc_config_translation *m)
9138 +static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
9139 + struct mp_config_intsrc *mp_irq)
9140 {
9141 - printk(KERN_INFO
9142 - "Translation: record %d, type %d, quad %d, global %d, local %d\n",
9143 - mpc_record, m->trans_type, m->trans_quad, m->trans_global,
9144 - m->trans_local);
9145 + mp_irq->mp_dstapic = m->mpc_dstapic;
9146 + mp_irq->mp_type = m->mpc_type;
9147 + mp_irq->mp_irqtype = m->mpc_irqtype;
9148 + mp_irq->mp_irqflag = m->mpc_irqflag;
9149 + mp_irq->mp_srcbus = m->mpc_srcbus;
9150 + mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
9151 + mp_irq->mp_dstirq = m->mpc_dstirq;
9152 +}
9153
9154 - if (mpc_record >= MAX_MPC_ENTRY)
9155 - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
9156 - else
9157 - translation_table[mpc_record] = m; /* stash this for later */
9158 - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
9159 - node_set_online(m->trans_quad);
9160 +static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
9161 + struct mpc_config_intsrc *m)
9162 +{
9163 + m->mpc_dstapic = mp_irq->mp_dstapic;
9164 + m->mpc_type = mp_irq->mp_type;
9165 + m->mpc_irqtype = mp_irq->mp_irqtype;
9166 + m->mpc_irqflag = mp_irq->mp_irqflag;
9167 + m->mpc_srcbus = mp_irq->mp_srcbus;
9168 + m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
9169 + m->mpc_dstirq = mp_irq->mp_dstirq;
9170 }
9171
9172 -/*
9173 - * Read/parse the MPC oem tables
9174 - */
9175 +static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
9176 + struct mpc_config_intsrc *m)
9177 +{
9178 + if (mp_irq->mp_dstapic != m->mpc_dstapic)
9179 + return 1;
9180 + if (mp_irq->mp_type != m->mpc_type)
9181 + return 2;
9182 + if (mp_irq->mp_irqtype != m->mpc_irqtype)
9183 + return 3;
9184 + if (mp_irq->mp_irqflag != m->mpc_irqflag)
9185 + return 4;
9186 + if (mp_irq->mp_srcbus != m->mpc_srcbus)
9187 + return 5;
9188 + if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
9189 + return 6;
9190 + if (mp_irq->mp_dstirq != m->mpc_dstirq)
9191 + return 7;
9192 +
9193 + return 0;
9194 +}
9195
9196 -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
9197 - unsigned short oemsize)
9198 +static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9199 {
9200 - int count = sizeof(*oemtable); /* the header size */
9201 - unsigned char *oemptr = ((unsigned char *)oemtable) + count;
9202 + int i;
9203
9204 - mpc_record = 0;
9205 - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
9206 - oemtable);
9207 - if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
9208 - printk(KERN_WARNING
9209 - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
9210 - oemtable->oem_signature[0], oemtable->oem_signature[1],
9211 - oemtable->oem_signature[2], oemtable->oem_signature[3]);
9212 - return;
9213 - }
9214 - if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
9215 - printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
9216 - return;
9217 - }
9218 - while (count < oemtable->oem_length) {
9219 - switch (*oemptr) {
9220 - case MP_TRANSLATION:
9221 - {
9222 - struct mpc_config_translation *m =
9223 - (struct mpc_config_translation *)oemptr;
9224 - MP_translation_info(m);
9225 - oemptr += sizeof(*m);
9226 - count += sizeof(*m);
9227 - ++mpc_record;
9228 - break;
9229 - }
9230 - default:
9231 - {
9232 - printk(KERN_WARNING
9233 - "Unrecognised OEM table entry type! - %d\n",
9234 - (int)*oemptr);
9235 - return;
9236 - }
9237 - }
9238 + print_MP_intsrc_info(m);
9239 +
9240 + for (i = 0; i < mp_irq_entries; i++) {
9241 + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
9242 + return;
9243 }
9244 +
9245 + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
9246 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
9247 + panic("Max # of irq sources exceeded!!\n");
9248 }
9249
9250 -static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
9251 - char *productid)
9252 +#endif
9253 +
9254 +static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9255 {
9256 - if (strncmp(oem, "IBM NUMA", 8))
9257 - printk("Warning! May not be a NUMA-Q system!\n");
9258 - if (mpc->mpc_oemptr)
9259 - smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
9260 - mpc->mpc_oemsize);
9261 + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
9262 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9263 + m->mpc_irqtype, m->mpc_irqflag & 3,
9264 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9265 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9266 }
9267 -#endif /* CONFIG_X86_NUMAQ */
9268
9269 /*
9270 * Read/parse the MPC
9271 */
9272
9273 -static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9274 +static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
9275 + char *str)
9276 {
9277 - char str[16];
9278 - char oem[10];
9279 - int count = sizeof(*mpc);
9280 - unsigned char *mpt = ((unsigned char *)mpc) + count;
9281
9282 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
9283 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
9284 @@ -316,19 +278,41 @@ static int __init smp_read_mpc(struct mp
9285 }
9286 memcpy(oem, mpc->mpc_oem, 8);
9287 oem[8] = 0;
9288 - printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
9289 + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
9290
9291 memcpy(str, mpc->mpc_productid, 12);
9292 str[12] = 0;
9293 - printk("Product ID: %s ", str);
9294
9295 -#ifdef CONFIG_X86_32
9296 - mps_oem_check(mpc, oem, str);
9297 -#endif
9298 - printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
9299 + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
9300
9301 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
9302
9303 + return 1;
9304 +}
9305 +
9306 +static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9307 +{
9308 + char str[16];
9309 + char oem[10];
9310 +
9311 + int count = sizeof(*mpc);
9312 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9313 +
9314 + if (!smp_check_mpc(mpc, oem, str))
9315 + return 0;
9316 +
9317 +#ifdef CONFIG_X86_32
9318 + /*
9319 + * need to make sure summit and es7000's mps_oem_check is safe to be
9320 + * called early via genericarch 's mps_oem_check
9321 + */
9322 + if (early) {
9323 +#ifdef CONFIG_X86_NUMAQ
9324 + numaq_mps_oem_check(mpc, oem, str);
9325 +#endif
9326 + } else
9327 + mps_oem_check(mpc, oem, str);
9328 +#endif
9329 /* save the local APIC address, it might be non-default */
9330 if (!acpi_lapic)
9331 mp_lapic_addr = mpc->mpc_lapic;
9332 @@ -336,12 +320,17 @@ static int __init smp_read_mpc(struct mp
9333 if (early)
9334 return 1;
9335
9336 + if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
9337 + struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
9338 + x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
9339 + }
9340 +
9341 /*
9342 * Now process the configuration blocks.
9343 */
9344 -#ifdef CONFIG_X86_NUMAQ
9345 - mpc_record = 0;
9346 -#endif
9347 + if (x86_quirks->mpc_record)
9348 + *x86_quirks->mpc_record = 0;
9349 +
9350 while (count < mpc->mpc_length) {
9351 switch (*mpt) {
9352 case MP_PROCESSOR:
9353 @@ -359,7 +348,9 @@ static int __init smp_read_mpc(struct mp
9354 {
9355 struct mpc_config_bus *m =
9356 (struct mpc_config_bus *)mpt;
9357 +#ifdef CONFIG_X86_IO_APIC
9358 MP_bus_info(m);
9359 +#endif
9360 mpt += sizeof(*m);
9361 count += sizeof(*m);
9362 break;
9363 @@ -405,10 +396,14 @@ static int __init smp_read_mpc(struct mp
9364 count = mpc->mpc_length;
9365 break;
9366 }
9367 -#ifdef CONFIG_X86_NUMAQ
9368 - ++mpc_record;
9369 -#endif
9370 + if (x86_quirks->mpc_record)
9371 + (*x86_quirks->mpc_record)++;
9372 }
9373 +
9374 +#ifdef CONFIG_X86_GENERICARCH
9375 + generic_bigsmp_probe();
9376 +#endif
9377 +
9378 setup_apic_routing();
9379 if (!num_processors)
9380 printk(KERN_ERR "MPTABLE: no processors registered!\n");
9381 @@ -434,7 +429,7 @@ static void __init construct_default_ioi
9382 intsrc.mpc_type = MP_INTSRC;
9383 intsrc.mpc_irqflag = 0; /* conforming */
9384 intsrc.mpc_srcbus = 0;
9385 - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
9386 + intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
9387
9388 intsrc.mpc_irqtype = mp_INT;
9389
9390 @@ -495,40 +490,11 @@ static void __init construct_default_ioi
9391 MP_intsrc_info(&intsrc);
9392 }
9393
9394 -#endif
9395
9396 -static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9397 +static void __init construct_ioapic_table(int mpc_default_type)
9398 {
9399 - struct mpc_config_processor processor;
9400 - struct mpc_config_bus bus;
9401 -#ifdef CONFIG_X86_IO_APIC
9402 struct mpc_config_ioapic ioapic;
9403 -#endif
9404 - struct mpc_config_lintsrc lintsrc;
9405 - int linttypes[2] = { mp_ExtINT, mp_NMI };
9406 - int i;
9407 -
9408 - /*
9409 - * local APIC has default address
9410 - */
9411 - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9412 -
9413 - /*
9414 - * 2 CPUs, numbered 0 & 1.
9415 - */
9416 - processor.mpc_type = MP_PROCESSOR;
9417 - /* Either an integrated APIC or a discrete 82489DX. */
9418 - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9419 - processor.mpc_cpuflag = CPU_ENABLED;
9420 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9421 - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9422 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9423 - processor.mpc_reserved[0] = 0;
9424 - processor.mpc_reserved[1] = 0;
9425 - for (i = 0; i < 2; i++) {
9426 - processor.mpc_apicid = i;
9427 - MP_processor_info(&processor);
9428 - }
9429 + struct mpc_config_bus bus;
9430
9431 bus.mpc_type = MP_BUS;
9432 bus.mpc_busid = 0;
9433 @@ -557,7 +523,6 @@ static inline void __init construct_defa
9434 MP_bus_info(&bus);
9435 }
9436
9437 -#ifdef CONFIG_X86_IO_APIC
9438 ioapic.mpc_type = MP_IOAPIC;
9439 ioapic.mpc_apicid = 2;
9440 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9441 @@ -569,7 +534,42 @@ static inline void __init construct_defa
9442 * We set up most of the low 16 IO-APIC pins according to MPS rules.
9443 */
9444 construct_default_ioirq_mptable(mpc_default_type);
9445 +}
9446 +#else
9447 +static inline void __init construct_ioapic_table(int mpc_default_type) { }
9448 #endif
9449 +
9450 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9451 +{
9452 + struct mpc_config_processor processor;
9453 + struct mpc_config_lintsrc lintsrc;
9454 + int linttypes[2] = { mp_ExtINT, mp_NMI };
9455 + int i;
9456 +
9457 + /*
9458 + * local APIC has default address
9459 + */
9460 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9461 +
9462 + /*
9463 + * 2 CPUs, numbered 0 & 1.
9464 + */
9465 + processor.mpc_type = MP_PROCESSOR;
9466 + /* Either an integrated APIC or a discrete 82489DX. */
9467 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9468 + processor.mpc_cpuflag = CPU_ENABLED;
9469 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9470 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9471 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9472 + processor.mpc_reserved[0] = 0;
9473 + processor.mpc_reserved[1] = 0;
9474 + for (i = 0; i < 2; i++) {
9475 + processor.mpc_apicid = i;
9476 + MP_processor_info(&processor);
9477 + }
9478 +
9479 + construct_ioapic_table(mpc_default_type);
9480 +
9481 lintsrc.mpc_type = MP_LINTSRC;
9482 lintsrc.mpc_irqflag = 0; /* conforming */
9483 lintsrc.mpc_srcbusid = 0;
9484 @@ -587,10 +587,14 @@ static struct intel_mp_floating *mpf_fou
9485 /*
9486 * Scan the memory blocks for an SMP configuration block.
9487 */
9488 -static void __init __get_smp_config(unsigned early)
9489 +static void __init __get_smp_config(unsigned int early)
9490 {
9491 struct intel_mp_floating *mpf = mpf_found;
9492
9493 + if (x86_quirks->mach_get_smp_config) {
9494 + if (x86_quirks->mach_get_smp_config(early))
9495 + return;
9496 + }
9497 if (acpi_lapic && early)
9498 return;
9499 /*
9500 @@ -607,7 +611,7 @@ static void __init __get_smp_config(unsi
9501
9502 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
9503 mpf->mpf_specification);
9504 -#ifdef CONFIG_X86_32
9505 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
9506 if (mpf->mpf_feature2 & (1 << 7)) {
9507 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
9508 pic_mode = 1;
9509 @@ -639,7 +643,9 @@ static void __init __get_smp_config(unsi
9510 * override the defaults.
9511 */
9512 if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
9513 +#ifdef CONFIG_X86_LOCAL_APIC
9514 smp_found_config = 0;
9515 +#endif
9516 printk(KERN_ERR
9517 "BIOS bug, MP table errors detected!...\n");
9518 printk(KERN_ERR "... disabling SMP support. "
9519 @@ -696,7 +702,8 @@ static int __init smp_scan_config(unsign
9520 unsigned int *bp = isa_bus_to_virt(base);
9521 struct intel_mp_floating *mpf;
9522
9523 - Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
9524 + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
9525 + bp, length);
9526 BUILD_BUG_ON(sizeof(*mpf) != 16);
9527
9528 while (length > 0) {
9529 @@ -706,16 +713,22 @@ static int __init smp_scan_config(unsign
9530 !mpf_checksum((unsigned char *)bp, 16) &&
9531 ((mpf->mpf_specification == 1)
9532 || (mpf->mpf_specification == 4))) {
9533 -
9534 +#ifdef CONFIG_X86_LOCAL_APIC
9535 smp_found_config = 1;
9536 +#endif
9537 mpf_found = mpf;
9538 -#ifdef CONFIG_X86_32
9539 +
9540 #ifndef CONFIG_XEN
9541 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9542 mpf, virt_to_phys(mpf));
9543 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
9544 +
9545 + if (!reserve)
9546 + return 1;
9547 + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
9548 BOOTMEM_DEFAULT);
9549 if (mpf->mpf_physptr) {
9550 + unsigned long size = PAGE_SIZE;
9551 +#ifdef CONFIG_X86_32
9552 /*
9553 * We cannot access to MPC table to compute
9554 * table size yet, as only few megabytes from
9555 @@ -725,27 +738,18 @@ static int __init smp_scan_config(unsign
9556 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
9557 * in reserve_bootmem.
9558 */
9559 - unsigned long size = PAGE_SIZE;
9560 unsigned long end = max_low_pfn * PAGE_SIZE;
9561 if (mpf->mpf_physptr + size > end)
9562 size = end - mpf->mpf_physptr;
9563 - reserve_bootmem(mpf->mpf_physptr, size,
9564 +#endif
9565 + reserve_bootmem_generic(mpf->mpf_physptr, size,
9566 BOOTMEM_DEFAULT);
9567 }
9568 #else
9569 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9570 mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
9571 #endif
9572 -#elif !defined(CONFIG_XEN)
9573 - if (!reserve)
9574 - return 1;
9575 -
9576 - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
9577 - if (mpf->mpf_physptr)
9578 - reserve_bootmem_generic(mpf->mpf_physptr,
9579 - PAGE_SIZE);
9580 -#endif
9581 - return 1;
9582 + return 1;
9583 }
9584 bp += 4;
9585 length -= 16;
9586 @@ -753,10 +757,15 @@ static int __init smp_scan_config(unsign
9587 return 0;
9588 }
9589
9590 -static void __init __find_smp_config(unsigned reserve)
9591 +static void __init __find_smp_config(unsigned int reserve)
9592 {
9593 #ifndef CONFIG_XEN
9594 unsigned int address;
9595 +
9596 + if (x86_quirks->mach_find_smp_config) {
9597 + if (x86_quirks->mach_find_smp_config(reserve))
9598 + return;
9599 + }
9600 #endif
9601
9602 /*
9603 @@ -805,300 +814,301 @@ void __init find_smp_config(void)
9604 __find_smp_config(1);
9605 }
9606
9607 -/* --------------------------------------------------------------------------
9608 - ACPI-based MP Configuration
9609 - -------------------------------------------------------------------------- */
9610 -
9611 -/*
9612 - * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
9613 - */
9614 -int es7000_plat;
9615 -
9616 -#ifdef CONFIG_ACPI
9617 +#ifdef CONFIG_X86_IO_APIC
9618 +static u8 __initdata irq_used[MAX_IRQ_SOURCES];
9619
9620 -#ifdef CONFIG_X86_IO_APIC
9621 +static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
9622 +{
9623 + int i;
9624
9625 -#define MP_ISA_BUS 0
9626 + if (m->mpc_irqtype != mp_INT)
9627 + return 0;
9628
9629 -extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
9630 + if (m->mpc_irqflag != 0x0f)
9631 + return 0;
9632
9633 -static int mp_find_ioapic(int gsi)
9634 -{
9635 - int i = 0;
9636 + /* not legacy */
9637
9638 - /* Find the IOAPIC that manages this GSI. */
9639 - for (i = 0; i < nr_ioapics; i++) {
9640 - if ((gsi >= mp_ioapic_routing[i].gsi_base)
9641 - && (gsi <= mp_ioapic_routing[i].gsi_end))
9642 - return i;
9643 + for (i = 0; i < mp_irq_entries; i++) {
9644 + if (mp_irqs[i].mp_irqtype != mp_INT)
9645 + continue;
9646 +
9647 + if (mp_irqs[i].mp_irqflag != 0x0f)
9648 + continue;
9649 +
9650 + if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
9651 + continue;
9652 + if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
9653 + continue;
9654 + if (irq_used[i]) {
9655 + /* already claimed */
9656 + return -2;
9657 + }
9658 + irq_used[i] = 1;
9659 + return i;
9660 }
9661
9662 - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
9663 + /* not found */
9664 return -1;
9665 }
9666
9667 -static u8 __init uniq_ioapic_id(u8 id)
9668 -{
9669 -#ifdef CONFIG_X86_32
9670 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
9671 - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
9672 - return io_apic_get_unique_id(nr_ioapics, id);
9673 - else
9674 - return id;
9675 -#else
9676 - int i;
9677 - DECLARE_BITMAP(used, 256);
9678 - bitmap_zero(used, 256);
9679 - for (i = 0; i < nr_ioapics; i++) {
9680 - struct mpc_config_ioapic *ia = &mp_ioapics[i];
9681 - __set_bit(ia->mpc_apicid, used);
9682 - }
9683 - if (!test_bit(id, used))
9684 - return id;
9685 - return find_first_zero_bit(used, 256);
9686 +#define SPARE_SLOT_NUM 20
9687 +
9688 +static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
9689 #endif
9690 -}
9691
9692 -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
9693 +static int __init replace_intsrc_all(struct mp_config_table *mpc,
9694 + unsigned long mpc_new_phys,
9695 + unsigned long mpc_new_length)
9696 {
9697 - int idx = 0;
9698 -
9699 - if (bad_ioapic(address))
9700 - return;
9701 +#ifdef CONFIG_X86_IO_APIC
9702 + int i;
9703 + int nr_m_spare = 0;
9704 +#endif
9705
9706 - idx = nr_ioapics;
9707 + int count = sizeof(*mpc);
9708 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9709
9710 - mp_ioapics[idx].mpc_type = MP_IOAPIC;
9711 - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
9712 - mp_ioapics[idx].mpc_apicaddr = address;
9713 + printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
9714 + while (count < mpc->mpc_length) {
9715 + switch (*mpt) {
9716 + case MP_PROCESSOR:
9717 + {
9718 + struct mpc_config_processor *m =
9719 + (struct mpc_config_processor *)mpt;
9720 + mpt += sizeof(*m);
9721 + count += sizeof(*m);
9722 + break;
9723 + }
9724 + case MP_BUS:
9725 + {
9726 + struct mpc_config_bus *m =
9727 + (struct mpc_config_bus *)mpt;
9728 + mpt += sizeof(*m);
9729 + count += sizeof(*m);
9730 + break;
9731 + }
9732 + case MP_IOAPIC:
9733 + {
9734 + mpt += sizeof(struct mpc_config_ioapic);
9735 + count += sizeof(struct mpc_config_ioapic);
9736 + break;
9737 + }
9738 + case MP_INTSRC:
9739 + {
9740 +#ifdef CONFIG_X86_IO_APIC
9741 + struct mpc_config_intsrc *m =
9742 + (struct mpc_config_intsrc *)mpt;
9743
9744 -#ifndef CONFIG_XEN
9745 - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
9746 + printk(KERN_INFO "OLD ");
9747 + print_MP_intsrc_info(m);
9748 + i = get_MP_intsrc_index(m);
9749 + if (i > 0) {
9750 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9751 + printk(KERN_INFO "NEW ");
9752 + print_mp_irq_info(&mp_irqs[i]);
9753 + } else if (!i) {
9754 + /* legacy, do nothing */
9755 + } else if (nr_m_spare < SPARE_SLOT_NUM) {
9756 + /*
9757 + * not found (-1), or duplicated (-2)
9758 + * are invalid entries,
9759 + * we need to use the slot later
9760 + */
9761 + m_spare[nr_m_spare] = m;
9762 + nr_m_spare++;
9763 + }
9764 #endif
9765 - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
9766 -#ifdef CONFIG_X86_32
9767 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
9768 -#else
9769 - mp_ioapics[idx].mpc_apicver = 0;
9770 + mpt += sizeof(struct mpc_config_intsrc);
9771 + count += sizeof(struct mpc_config_intsrc);
9772 + break;
9773 + }
9774 + case MP_LINTSRC:
9775 + {
9776 + struct mpc_config_lintsrc *m =
9777 + (struct mpc_config_lintsrc *)mpt;
9778 + mpt += sizeof(*m);
9779 + count += sizeof(*m);
9780 + break;
9781 + }
9782 + default:
9783 + /* wrong mptable */
9784 + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
9785 + printk(KERN_ERR "type %x\n", *mpt);
9786 + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
9787 + 1, mpc, mpc->mpc_length, 1);
9788 + goto out;
9789 + }
9790 + }
9791 +
9792 +#ifdef CONFIG_X86_IO_APIC
9793 + for (i = 0; i < mp_irq_entries; i++) {
9794 + if (irq_used[i])
9795 + continue;
9796 +
9797 + if (mp_irqs[i].mp_irqtype != mp_INT)
9798 + continue;
9799 +
9800 + if (mp_irqs[i].mp_irqflag != 0x0f)
9801 + continue;
9802 +
9803 + if (nr_m_spare > 0) {
9804 + printk(KERN_INFO "*NEW* found ");
9805 + nr_m_spare--;
9806 + assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
9807 + m_spare[nr_m_spare] = NULL;
9808 + } else {
9809 + struct mpc_config_intsrc *m =
9810 + (struct mpc_config_intsrc *)mpt;
9811 + count += sizeof(struct mpc_config_intsrc);
9812 + if (!mpc_new_phys) {
9813 + printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
9814 + } else {
9815 + if (count <= mpc_new_length)
9816 + printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
9817 + else {
9818 + printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
9819 + goto out;
9820 + }
9821 + }
9822 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9823 + mpc->mpc_length = count;
9824 + mpt += sizeof(struct mpc_config_intsrc);
9825 + }
9826 + print_mp_irq_info(&mp_irqs[i]);
9827 + }
9828 #endif
9829 - /*
9830 - * Build basic GSI lookup table to facilitate gsi->io_apic lookups
9831 - * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
9832 - */
9833 - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
9834 - mp_ioapic_routing[idx].gsi_base = gsi_base;
9835 - mp_ioapic_routing[idx].gsi_end = gsi_base +
9836 - io_apic_get_redir_entries(idx);
9837 -
9838 - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
9839 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
9840 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
9841 - mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
9842 +out:
9843 + /* update checksum */
9844 + mpc->mpc_checksum = 0;
9845 + mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
9846 + mpc->mpc_length);
9847
9848 - nr_ioapics++;
9849 + return 0;
9850 }
9851
9852 -void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
9853 -{
9854 - struct mpc_config_intsrc intsrc;
9855 - int ioapic = -1;
9856 - int pin = -1;
9857 -
9858 - /*
9859 - * Convert 'gsi' to 'ioapic.pin'.
9860 - */
9861 - ioapic = mp_find_ioapic(gsi);
9862 - if (ioapic < 0)
9863 - return;
9864 - pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9865 +static int __initdata enable_update_mptable;
9866
9867 - /*
9868 - * TBD: This check is for faulty timer entries, where the override
9869 - * erroneously sets the trigger to level, resulting in a HUGE
9870 - * increase of timer interrupts!
9871 - */
9872 - if ((bus_irq == 0) && (trigger == 3))
9873 - trigger = 1;
9874 +static int __init update_mptable_setup(char *str)
9875 +{
9876 + enable_update_mptable = 1;
9877 + return 0;
9878 +}
9879 +early_param("update_mptable", update_mptable_setup);
9880
9881 - intsrc.mpc_type = MP_INTSRC;
9882 - intsrc.mpc_irqtype = mp_INT;
9883 - intsrc.mpc_irqflag = (trigger << 2) | polarity;
9884 - intsrc.mpc_srcbus = MP_ISA_BUS;
9885 - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
9886 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
9887 - intsrc.mpc_dstirq = pin; /* INTIN# */
9888 +static unsigned long __initdata mpc_new_phys;
9889 +static unsigned long mpc_new_length __initdata = 4096;
9890
9891 - MP_intsrc_info(&intsrc);
9892 +/* alloc_mptable or alloc_mptable=4k */
9893 +static int __initdata alloc_mptable;
9894 +static int __init parse_alloc_mptable_opt(char *p)
9895 +{
9896 + enable_update_mptable = 1;
9897 + alloc_mptable = 1;
9898 + if (!p)
9899 + return 0;
9900 + mpc_new_length = PAGE_SIZE << get_order(memparse(p, &p));
9901 + return 0;
9902 }
9903 +early_param("alloc_mptable", parse_alloc_mptable_opt);
9904
9905 -void __init mp_config_acpi_legacy_irqs(void)
9906 +void __init early_reserve_e820_mpc_new(void)
9907 {
9908 - struct mpc_config_intsrc intsrc;
9909 - int i = 0;
9910 - int ioapic = -1;
9911 -
9912 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9913 - /*
9914 - * Fabricate the legacy ISA bus (bus #31).
9915 - */
9916 - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
9917 + if (enable_update_mptable && alloc_mptable) {
9918 + u64 startt = PAGE_SIZE;
9919 +#ifdef CONFIG_X86_TRAMPOLINE
9920 + startt = TRAMPOLINE_BASE;
9921 #endif
9922 - set_bit(MP_ISA_BUS, mp_bus_not_pci);
9923 - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
9924 -
9925 - /*
9926 - * Older generations of ES7000 have no legacy identity mappings
9927 - */
9928 - if (es7000_plat == 1)
9929 - return;
9930 -
9931 - /*
9932 - * Locate the IOAPIC that manages the ISA IRQs (0-15).
9933 - */
9934 - ioapic = mp_find_ioapic(0);
9935 - if (ioapic < 0)
9936 - return;
9937 -
9938 - intsrc.mpc_type = MP_INTSRC;
9939 - intsrc.mpc_irqflag = 0; /* Conforming */
9940 - intsrc.mpc_srcbus = MP_ISA_BUS;
9941 -#ifdef CONFIG_X86_IO_APIC
9942 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
9943 -#endif
9944 - /*
9945 - * Use the default configuration for the IRQs 0-15. Unless
9946 - * overridden by (MADT) interrupt source override entries.
9947 - */
9948 - for (i = 0; i < 16; i++) {
9949 - int idx;
9950 -
9951 - for (idx = 0; idx < mp_irq_entries; idx++) {
9952 - struct mpc_config_intsrc *irq = mp_irqs + idx;
9953 -
9954 - /* Do we already have a mapping for this ISA IRQ? */
9955 - if (irq->mpc_srcbus == MP_ISA_BUS
9956 - && irq->mpc_srcbusirq == i)
9957 - break;
9958 -
9959 - /* Do we already have a mapping for this IOAPIC pin */
9960 - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
9961 - (irq->mpc_dstirq == i))
9962 - break;
9963 - }
9964 -
9965 - if (idx != mp_irq_entries) {
9966 - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
9967 - continue; /* IRQ already used */
9968 - }
9969 -
9970 - intsrc.mpc_irqtype = mp_INT;
9971 - intsrc.mpc_srcbusirq = i; /* Identity mapped */
9972 - intsrc.mpc_dstirq = i;
9973 -
9974 - MP_intsrc_info(&intsrc);
9975 + mpc_new_phys = early_reserve_e820(startt, mpc_new_length,
9976 + mpc_new_length);
9977 }
9978 }
9979
9980 -int mp_register_gsi(u32 gsi, int triggering, int polarity)
9981 +static int __init update_mp_table(void)
9982 {
9983 - int ioapic;
9984 - int ioapic_pin;
9985 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
9986 -#define MAX_GSI_NUM 4096
9987 -#define IRQ_COMPRESSION_START 64
9988 + char str[16];
9989 + char oem[10];
9990 + struct intel_mp_floating *mpf;
9991 + struct mp_config_table *mpc;
9992 + struct mp_config_table *mpc_new;
9993 +
9994 + if (!enable_update_mptable)
9995 + return 0;
9996 +
9997 + mpf = mpf_found;
9998 + if (!mpf)
9999 + return 0;
10000
10001 - static int pci_irq = IRQ_COMPRESSION_START;
10002 /*
10003 - * Mapping between Global System Interrupts, which
10004 - * represent all possible interrupts, and IRQs
10005 - * assigned to actual devices.
10006 + * Now see if we need to go further.
10007 */
10008 - static int gsi_to_irq[MAX_GSI_NUM];
10009 -#else
10010 + if (mpf->mpf_feature1 != 0)
10011 + return 0;
10012
10013 - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
10014 - return gsi;
10015 -#endif
10016 + if (!mpf->mpf_physptr)
10017 + return 0;
10018
10019 - /* Don't set up the ACPI SCI because it's already set up */
10020 - if (acpi_gbl_FADT.sci_interrupt == gsi)
10021 - return gsi;
10022 + mpc = isa_bus_to_virt(mpf->mpf_physptr);
10023
10024 - ioapic = mp_find_ioapic(gsi);
10025 - if (ioapic < 0) {
10026 - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
10027 - return gsi;
10028 - }
10029 + if (!smp_check_mpc(mpc, oem, str))
10030 + return 0;
10031
10032 - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
10033 + printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
10034 + printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
10035
10036 -#ifndef CONFIG_X86_32
10037 - if (ioapic_renumber_irq)
10038 - gsi = ioapic_renumber_irq(ioapic, gsi);
10039 -#endif
10040 + if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
10041 + mpc_new_phys = 0;
10042 + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
10043 + mpc_new_length);
10044 + }
10045 +
10046 + if (!mpc_new_phys) {
10047 + unsigned char old, new;
10048 + /* check if we can change the postion */
10049 + mpc->mpc_checksum = 0;
10050 + old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10051 + mpc->mpc_checksum = 0xff;
10052 + new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10053 + if (old == new) {
10054 + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
10055 + return 0;
10056 + }
10057 + printk(KERN_INFO "use in-positon replacing\n");
10058 + } else {
10059 + maddr_t mpc_new_bus;
10060
10061 - /*
10062 - * Avoid pin reprogramming. PRTs typically include entries
10063 - * with redundant pin->gsi mappings (but unique PCI devices);
10064 - * we only program the IOAPIC on the first.
10065 - */
10066 - if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
10067 - printk(KERN_ERR "Invalid reference to IOAPIC pin "
10068 - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
10069 - ioapic_pin);
10070 - return gsi;
10071 - }
10072 - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
10073 - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
10074 - mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
10075 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10076 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
10077 -#else
10078 - return gsi;
10079 -#endif
10080 + if (xen_create_contiguous_region((unsigned long)phys_to_virt(mpc_new_phys),
10081 + get_order(mpc_new_length), 32))
10082 + BUG();
10083 + mpc_new_bus = phys_to_machine(mpc_new_phys);
10084 + mpf->mpf_physptr = mpc_new_bus;
10085 + mpc_new = phys_to_virt(mpc_new_phys);
10086 + memcpy(mpc_new, mpc, mpc->mpc_length);
10087 + mpc = mpc_new;
10088 + /* check if we can modify that */
10089 + if (mpc_new_bus - mpf->mpf_physptr) {
10090 + struct intel_mp_floating *mpf_new;
10091 + /* steal 16 bytes from [0, 1k) */
10092 + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
10093 + mpf_new = isa_bus_to_virt(0x400 - 16);
10094 + memcpy(mpf_new, mpf, 16);
10095 + mpf = mpf_new;
10096 + mpf->mpf_physptr = mpc_new_bus;
10097 + }
10098 + mpf->mpf_checksum = 0;
10099 + mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
10100 + printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
10101 }
10102
10103 - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
10104 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10105 /*
10106 - * For GSI >= 64, use IRQ compression
10107 + * only replace the one with mp_INT and
10108 + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
10109 + * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
10110 + * may need pci=routeirq for all coverage
10111 */
10112 - if ((gsi >= IRQ_COMPRESSION_START)
10113 - && (triggering == ACPI_LEVEL_SENSITIVE)) {
10114 - /*
10115 - * For PCI devices assign IRQs in order, avoiding gaps
10116 - * due to unused I/O APIC pins.
10117 - */
10118 - int irq = gsi;
10119 - if (gsi < MAX_GSI_NUM) {
10120 - /*
10121 - * Retain the VIA chipset work-around (gsi > 15), but
10122 - * avoid a problem where the 8254 timer (IRQ0) is setup
10123 - * via an override (so it's not on pin 0 of the ioapic),
10124 - * and at the same time, the pin 0 interrupt is a PCI
10125 - * type. The gsi > 15 test could cause these two pins
10126 - * to be shared as IRQ0, and they are not shareable.
10127 - * So test for this condition, and if necessary, avoid
10128 - * the pin collision.
10129 - */
10130 - gsi = pci_irq++;
10131 - /*
10132 - * Don't assign IRQ used by ACPI SCI
10133 - */
10134 - if (gsi == acpi_gbl_FADT.sci_interrupt)
10135 - gsi = pci_irq++;
10136 - gsi_to_irq[irq] = gsi;
10137 - } else {
10138 - printk(KERN_ERR "GSI %u is too high\n", gsi);
10139 - return gsi;
10140 - }
10141 - }
10142 -#endif
10143 - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
10144 - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
10145 - polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
10146 - return gsi;
10147 + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
10148 +
10149 + return 0;
10150 }
10151
10152 -#endif /* CONFIG_X86_IO_APIC */
10153 -#endif /* CONFIG_ACPI */
10154 +late_initcall(update_mp_table);
10155 Index: head-2008-12-01/arch/x86/kernel/nmi.c
10156 ===================================================================
10157 --- head-2008-12-01.orig/arch/x86/kernel/nmi.c 2008-12-03 15:48:43.000000000 +0100
10158 +++ head-2008-12-01/arch/x86/kernel/nmi.c 2008-12-01 11:49:07.000000000 +0100
10159 @@ -27,7 +27,9 @@
10160 #include <linux/kdebug.h>
10161 #include <linux/smp.h>
10162
10163 +#ifndef CONFIG_XEN
10164 #include <asm/i8259.h>
10165 +#endif
10166 #include <asm/io_apic.h>
10167 #include <asm/smp.h>
10168 #include <asm/nmi.h>
10169 @@ -179,8 +181,10 @@ int __init check_nmi_watchdog(void)
10170 kfree(prev_nmi_count);
10171 return 0;
10172 error:
10173 +#ifndef CONFIG_XEN
10174 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
10175 disable_8259A_irq(0);
10176 +#endif
10177 #ifdef CONFIG_X86_32
10178 timer_ack = 0;
10179 #endif
10180 Index: head-2008-12-01/arch/x86/kernel/pci-dma-xen.c
10181 ===================================================================
10182 --- head-2008-12-01.orig/arch/x86/kernel/pci-dma-xen.c 2008-12-01 11:44:55.000000000 +0100
10183 +++ head-2008-12-01/arch/x86/kernel/pci-dma-xen.c 2008-12-01 11:49:07.000000000 +0100
10184 @@ -5,13 +5,13 @@
10185
10186 #include <asm/proto.h>
10187 #include <asm/dma.h>
10188 -#include <asm/gart.h>
10189 +#include <asm/iommu.h>
10190 #include <asm/calgary.h>
10191 +#include <asm/amd_iommu.h>
10192
10193 -int forbid_dac __read_mostly;
10194 -EXPORT_SYMBOL(forbid_dac);
10195 +static int forbid_dac __read_mostly;
10196
10197 -const struct dma_mapping_ops *dma_ops;
10198 +struct dma_mapping_ops *dma_ops;
10199 EXPORT_SYMBOL(dma_ops);
10200
10201 static int iommu_sac_force __read_mostly;
10202 @@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_si
10203 void __init dma32_reserve_bootmem(void)
10204 {
10205 unsigned long size, align;
10206 - if (end_pfn <= MAX_DMA32_PFN)
10207 + if (max_pfn <= MAX_DMA32_PFN)
10208 return;
10209
10210 + /*
10211 + * check aperture_64.c allocate_aperture() for reason about
10212 + * using 512M as goal
10213 + */
10214 align = 64ULL<<20;
10215 size = round_up(dma32_bootmem_size, align);
10216 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
10217 - __pa(MAX_DMA_ADDRESS));
10218 + 512ULL<<20);
10219 if (dma32_bootmem_ptr)
10220 dma32_bootmem_size = size;
10221 else
10222 @@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void)
10223 }
10224 static void __init dma32_free_bootmem(void)
10225 {
10226 - int node;
10227
10228 - if (end_pfn <= MAX_DMA32_PFN)
10229 + if (max_pfn <= MAX_DMA32_PFN)
10230 return;
10231
10232 if (!dma32_bootmem_ptr)
10233 return;
10234
10235 - for_each_online_node(node)
10236 - free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
10237 - dma32_bootmem_size);
10238 + free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
10239
10240 dma32_bootmem_ptr = NULL;
10241 dma32_bootmem_size = 0;
10242 @@ -107,7 +108,7 @@ static void __init dma32_free_bootmem(vo
10243 #define dma32_free_bootmem() ((void)0)
10244 #endif
10245
10246 -static const struct dma_mapping_ops swiotlb_dma_ops = {
10247 +static struct dma_mapping_ops swiotlb_dma_ops = {
10248 .mapping_error = swiotlb_dma_mapping_error,
10249 .map_single = swiotlb_map_single_phys,
10250 .unmap_single = swiotlb_unmap_single,
10251 @@ -130,25 +131,31 @@ void __init pci_iommu_alloc(void)
10252 * The order of these functions is important for
10253 * fall-back/fail-over reasons
10254 */
10255 -#ifdef CONFIG_GART_IOMMU
10256 gart_iommu_hole_init();
10257 -#endif
10258
10259 -#ifdef CONFIG_CALGARY_IOMMU
10260 detect_calgary();
10261 -#endif
10262
10263 detect_intel_iommu();
10264
10265 -#ifdef CONFIG_SWIOTLB
10266 + amd_iommu_detect();
10267 +
10268 swiotlb_init();
10269 if (swiotlb) {
10270 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
10271 dma_ops = &swiotlb_dma_ops;
10272 }
10273 -#endif
10274 }
10275
10276 +#ifndef CONFIG_XEN
10277 +unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
10278 +{
10279 + unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
10280 +
10281 + return size >> PAGE_SHIFT;
10282 +}
10283 +EXPORT_SYMBOL(iommu_num_pages);
10284 +#endif
10285 +
10286 /*
10287 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
10288 * documentation.
10289 @@ -201,9 +208,7 @@ static __init int iommu_setup(char *p)
10290 swiotlb = 1;
10291 #endif
10292
10293 -#ifdef CONFIG_GART_IOMMU
10294 gart_parse_options(p);
10295 -#endif
10296
10297 #ifdef CONFIG_CALGARY_IOMMU
10298 if (!strncmp(p, "calgary", 7))
10299 @@ -245,136 +250,19 @@ int range_straddles_page_boundary(paddr_
10300 !check_pages_physically_contiguous(pfn, offset, size));
10301 }
10302
10303 -#ifdef CONFIG_X86_32
10304 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
10305 - dma_addr_t device_addr, size_t size, int flags)
10306 -{
10307 - void __iomem *mem_base = NULL;
10308 - int pages = size >> PAGE_SHIFT;
10309 - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
10310 -
10311 - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
10312 - goto out;
10313 - if (!size)
10314 - goto out;
10315 - if (dev->dma_mem)
10316 - goto out;
10317 -
10318 - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
10319 -
10320 - mem_base = ioremap(bus_addr, size);
10321 - if (!mem_base)
10322 - goto out;
10323 -
10324 - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
10325 - if (!dev->dma_mem)
10326 - goto out;
10327 - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
10328 - if (!dev->dma_mem->bitmap)
10329 - goto free1_out;
10330 -
10331 - dev->dma_mem->virt_base = mem_base;
10332 - dev->dma_mem->device_base = device_addr;
10333 - dev->dma_mem->size = pages;
10334 - dev->dma_mem->flags = flags;
10335 -
10336 - if (flags & DMA_MEMORY_MAP)
10337 - return DMA_MEMORY_MAP;
10338 -
10339 - return DMA_MEMORY_IO;
10340 -
10341 - free1_out:
10342 - kfree(dev->dma_mem);
10343 - out:
10344 - if (mem_base)
10345 - iounmap(mem_base);
10346 - return 0;
10347 -}
10348 -EXPORT_SYMBOL(dma_declare_coherent_memory);
10349 -
10350 -void dma_release_declared_memory(struct device *dev)
10351 -{
10352 - struct dma_coherent_mem *mem = dev->dma_mem;
10353 -
10354 - if (!mem)
10355 - return;
10356 - dev->dma_mem = NULL;
10357 - iounmap(mem->virt_base);
10358 - kfree(mem->bitmap);
10359 - kfree(mem);
10360 -}
10361 -EXPORT_SYMBOL(dma_release_declared_memory);
10362 -
10363 -void *dma_mark_declared_memory_occupied(struct device *dev,
10364 - dma_addr_t device_addr, size_t size)
10365 -{
10366 - struct dma_coherent_mem *mem = dev->dma_mem;
10367 - int pos, err;
10368 - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
10369 -
10370 - pages >>= PAGE_SHIFT;
10371 -
10372 - if (!mem)
10373 - return ERR_PTR(-EINVAL);
10374 -
10375 - pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
10376 - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
10377 - if (err != 0)
10378 - return ERR_PTR(err);
10379 - return mem->virt_base + (pos << PAGE_SHIFT);
10380 -}
10381 -EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
10382 -
10383 -static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
10384 - dma_addr_t *dma_handle, void **ret)
10385 -{
10386 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10387 - int order = get_order(size);
10388 -
10389 - if (mem) {
10390 - int page = bitmap_find_free_region(mem->bitmap, mem->size,
10391 - order);
10392 - if (page >= 0) {
10393 - *dma_handle = mem->device_base + (page << PAGE_SHIFT);
10394 - *ret = mem->virt_base + (page << PAGE_SHIFT);
10395 - memset(*ret, 0, size);
10396 - }
10397 - if (mem->flags & DMA_MEMORY_EXCLUSIVE)
10398 - *ret = NULL;
10399 - }
10400 - return (mem != NULL);
10401 -}
10402 -
10403 -static int dma_release_coherent(struct device *dev, int order, void *vaddr)
10404 -{
10405 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10406 -
10407 - if (mem && vaddr >= mem->virt_base && vaddr <
10408 - (mem->virt_base + (mem->size << PAGE_SHIFT))) {
10409 - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
10410 -
10411 - bitmap_release_region(mem->bitmap, page, order);
10412 - return 1;
10413 - }
10414 - return 0;
10415 -}
10416 -#else
10417 -#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
10418 -#define dma_release_coherent(dev, order, vaddr) (0)
10419 -#endif /* CONFIG_X86_32 */
10420 -
10421 int dma_supported(struct device *dev, u64 mask)
10422 {
10423 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10424 +
10425 #ifdef CONFIG_PCI
10426 if (mask > 0xffffffff && forbid_dac > 0) {
10427 - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
10428 - dev->bus_id);
10429 + dev_info(dev, "PCI: Disallowing DAC for device\n");
10430 return 0;
10431 }
10432 #endif
10433
10434 - if (dma_ops->dma_supported)
10435 - return dma_ops->dma_supported(dev, mask);
10436 + if (ops->dma_supported)
10437 + return ops->dma_supported(dev, mask);
10438
10439 /* Copied from i386. Doesn't make much sense, because it will
10440 only work for pci_alloc_coherent.
10441 @@ -395,8 +283,7 @@ int dma_supported(struct device *dev, u6
10442 type. Normally this doesn't make any difference, but gives
10443 more gentle handling of IOMMU overflow. */
10444 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
10445 - printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
10446 - dev->bus_id, mask);
10447 + dev_info(dev, "Force SAC with mask %Lx\n", mask);
10448 return 0;
10449 }
10450
10451 @@ -422,6 +309,9 @@ void *
10452 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
10453 gfp_t gfp)
10454 {
10455 +#ifndef CONFIG_XEN
10456 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10457 +#endif
10458 void *memory = NULL;
10459 struct page *page;
10460 unsigned long dma_mask = 0;
10461 @@ -431,7 +321,7 @@ dma_alloc_coherent(struct device *dev, s
10462 /* ignore region specifiers */
10463 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
10464
10465 - if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
10466 + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
10467 return memory;
10468
10469 if (!dev) {
10470 @@ -491,8 +381,8 @@ dma_alloc_coherent(struct device *dev, s
10471 /* Let low level make its own zone decisions */
10472 gfp &= ~(GFP_DMA32|GFP_DMA);
10473
10474 - if (dma_ops->alloc_coherent)
10475 - return dma_ops->alloc_coherent(dev, size,
10476 + if (ops->alloc_coherent)
10477 + return ops->alloc_coherent(dev, size,
10478 dma_handle, gfp);
10479 return NULL;
10480 }
10481 @@ -504,14 +394,14 @@ dma_alloc_coherent(struct device *dev, s
10482 }
10483 }
10484
10485 - if (dma_ops->alloc_coherent) {
10486 + if (ops->alloc_coherent) {
10487 free_pages((unsigned long)memory, order);
10488 gfp &= ~(GFP_DMA|GFP_DMA32);
10489 - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
10490 + return ops->alloc_coherent(dev, size, dma_handle, gfp);
10491 }
10492
10493 - if (dma_ops->map_simple) {
10494 - *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
10495 + if (ops->map_simple) {
10496 + *dma_handle = ops->map_simple(dev, virt_to_bus(memory),
10497 size,
10498 PCI_DMA_BIDIRECTIONAL);
10499 if (*dma_handle != bad_dma_address)
10500 @@ -542,13 +432,17 @@ EXPORT_SYMBOL(dma_alloc_coherent);
10501 void dma_free_coherent(struct device *dev, size_t size,
10502 void *vaddr, dma_addr_t bus)
10503 {
10504 +#ifndef CONFIG_XEN
10505 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10506 +#endif
10507 +
10508 int order = get_order(size);
10509 WARN_ON(irqs_disabled()); /* for portability */
10510 - if (dma_release_coherent(dev, order, vaddr))
10511 + if (dma_release_from_coherent(dev, order, vaddr))
10512 return;
10513 #ifndef CONFIG_XEN
10514 - if (dma_ops->unmap_single)
10515 - dma_ops->unmap_single(dev, bus, size, 0);
10516 + if (ops->unmap_single)
10517 + ops->unmap_single(dev, bus, size, 0);
10518 #endif
10519 xen_destroy_contiguous_region((unsigned long)vaddr, order);
10520 free_pages((unsigned long)vaddr, order);
10521 @@ -557,15 +451,13 @@ EXPORT_SYMBOL(dma_free_coherent);
10522
10523 static int __init pci_iommu_init(void)
10524 {
10525 -#ifdef CONFIG_CALGARY_IOMMU
10526 calgary_iommu_init();
10527 -#endif
10528
10529 intel_iommu_init();
10530
10531 -#ifdef CONFIG_GART_IOMMU
10532 + amd_iommu_init();
10533 +
10534 gart_iommu_init();
10535 -#endif
10536
10537 no_iommu_init();
10538 return 0;
10539 Index: head-2008-12-01/arch/x86/kernel/pci-nommu-xen.c
10540 ===================================================================
10541 --- head-2008-12-01.orig/arch/x86/kernel/pci-nommu-xen.c 2008-12-01 11:44:55.000000000 +0100
10542 +++ head-2008-12-01/arch/x86/kernel/pci-nommu-xen.c 2008-12-01 11:49:07.000000000 +0100
10543 @@ -79,18 +79,12 @@ gnttab_unmap_single(struct device *dev,
10544 gnttab_dma_unmap_page(dma_addr);
10545 }
10546
10547 -static int nommu_mapping_error(dma_addr_t dma_addr)
10548 -{
10549 - return (dma_addr == bad_dma_address);
10550 -}
10551 -
10552 -static const struct dma_mapping_ops nommu_dma_ops = {
10553 +static struct dma_mapping_ops nommu_dma_ops = {
10554 .map_single = gnttab_map_single,
10555 .unmap_single = gnttab_unmap_single,
10556 .map_sg = gnttab_map_sg,
10557 .unmap_sg = gnttab_unmap_sg,
10558 .dma_supported = swiotlb_dma_supported,
10559 - .mapping_error = nommu_mapping_error
10560 };
10561
10562 void __init no_iommu_init(void)
10563 Index: head-2008-12-01/arch/x86/kernel/probe_roms_32.c
10564 ===================================================================
10565 --- head-2008-12-01.orig/arch/x86/kernel/probe_roms_32.c 2008-12-03 15:48:43.000000000 +0100
10566 +++ head-2008-12-01/arch/x86/kernel/probe_roms_32.c 2008-12-01 11:49:07.000000000 +0100
10567 @@ -99,6 +99,11 @@ void __init probe_roms(void)
10568 unsigned char c;
10569 int i;
10570
10571 +#ifdef CONFIG_XEN
10572 + if (!is_initial_xendomain())
10573 + return;
10574 +#endif
10575 +
10576 /* video rom */
10577 upper = adapter_rom_resources[0].start;
10578 for (start = video_rom_resource.start; start < upper; start += 2048) {
10579 @@ -131,7 +136,7 @@ void __init probe_roms(void)
10580 upper = system_rom_resource.start;
10581
10582 /* check for extension rom (ignore length byte!) */
10583 - rom = isa_bus_to_virt(extension_rom_resource.start);
10584 + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
10585 if (romsignature(rom)) {
10586 length = extension_rom_resource.end - extension_rom_resource.start + 1;
10587 if (romchecksum(rom, length)) {
10588 Index: head-2008-12-01/arch/x86/kernel/process-xen.c
10589 ===================================================================
10590 --- head-2008-12-01.orig/arch/x86/kernel/process-xen.c 2008-12-01 11:44:55.000000000 +0100
10591 +++ head-2008-12-01/arch/x86/kernel/process-xen.c 2008-12-01 11:49:07.000000000 +0100
10592 @@ -6,6 +6,13 @@
10593 #include <linux/sched.h>
10594 #include <linux/module.h>
10595 #include <linux/pm.h>
10596 +#include <linux/clockchips.h>
10597 +#include <asm/system.h>
10598 +
10599 +unsigned long idle_halt;
10600 +EXPORT_SYMBOL(idle_halt);
10601 +unsigned long idle_nomwait;
10602 +EXPORT_SYMBOL(idle_nomwait);
10603
10604 struct kmem_cache *task_xstate_cachep;
10605
10606 @@ -45,6 +52,70 @@ void arch_task_cache_init(void)
10607 SLAB_PANIC, NULL);
10608 }
10609
10610 +/*
10611 + * Idle related variables and functions
10612 + */
10613 +unsigned long boot_option_idle_override = 0;
10614 +EXPORT_SYMBOL(boot_option_idle_override);
10615 +
10616 +/*
10617 + * Powermanagement idle function, if any..
10618 + */
10619 +void (*pm_idle)(void);
10620 +EXPORT_SYMBOL(pm_idle);
10621 +
10622 +#ifdef CONFIG_X86_32
10623 +/*
10624 + * This halt magic was a workaround for ancient floppy DMA
10625 + * wreckage. It should be safe to remove.
10626 + */
10627 +static int hlt_counter;
10628 +void disable_hlt(void)
10629 +{
10630 + hlt_counter++;
10631 +}
10632 +EXPORT_SYMBOL(disable_hlt);
10633 +
10634 +void enable_hlt(void)
10635 +{
10636 + hlt_counter--;
10637 +}
10638 +EXPORT_SYMBOL(enable_hlt);
10639 +
10640 +static inline int hlt_use_halt(void)
10641 +{
10642 + return (!hlt_counter && boot_cpu_data.hlt_works_ok);
10643 +}
10644 +#else
10645 +static inline int hlt_use_halt(void)
10646 +{
10647 + return 1;
10648 +}
10649 +#endif
10650 +
10651 +/*
10652 + * We use this if we don't have any better
10653 + * idle routine..
10654 + */
10655 +void xen_idle(void)
10656 +{
10657 + current_thread_info()->status &= ~TS_POLLING;
10658 + /*
10659 + * TS_POLLING-cleared state must be visible before we
10660 + * test NEED_RESCHED:
10661 + */
10662 + smp_mb();
10663 +
10664 + if (!need_resched())
10665 + safe_halt(); /* enables interrupts racelessly */
10666 + else
10667 + local_irq_enable();
10668 + current_thread_info()->status |= TS_POLLING;
10669 +}
10670 +#ifdef CONFIG_APM_MODULE
10671 +EXPORT_SYMBOL(default_idle);
10672 +#endif
10673 +
10674 static void do_nothing(void *unused)
10675 {
10676 }
10677 @@ -61,7 +132,7 @@ void cpu_idle_wait(void)
10678 {
10679 smp_mb();
10680 /* kick all the CPUs so that they exit out of pm_idle */
10681 - smp_call_function(do_nothing, NULL, 0, 1);
10682 + smp_call_function(do_nothing, NULL, 1);
10683 }
10684 EXPORT_SYMBOL_GPL(cpu_idle_wait);
10685
10686 @@ -125,60 +196,175 @@ static void poll_idle(void)
10687 *
10688 * idle=mwait overrides this decision and forces the usage of mwait.
10689 */
10690 +static int __cpuinitdata force_mwait;
10691 +
10692 +#define MWAIT_INFO 0x05
10693 +#define MWAIT_ECX_EXTENDED_INFO 0x01
10694 +#define MWAIT_EDX_C1 0xf0
10695 +
10696 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
10697 {
10698 + u32 eax, ebx, ecx, edx;
10699 +
10700 if (force_mwait)
10701 return 1;
10702
10703 - if (c->x86_vendor == X86_VENDOR_AMD) {
10704 - switch(c->x86) {
10705 - case 0x10:
10706 - case 0x11:
10707 - return 0;
10708 + if (c->cpuid_level < MWAIT_INFO)
10709 + return 0;
10710 +
10711 + cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
10712 + /* Check whether EDX has extended info about MWAIT */
10713 + if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
10714 + return 1;
10715 +
10716 + /*
10717 + * edx enumerates MONITOR/MWAIT extensions. Check whether
10718 + * C1 supports MWAIT
10719 + */
10720 + return (edx & MWAIT_EDX_C1);
10721 +}
10722 +
10723 +/*
10724 + * Check for AMD CPUs, which have potentially C1E support
10725 + */
10726 +static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
10727 +{
10728 + if (c->x86_vendor != X86_VENDOR_AMD)
10729 + return 0;
10730 +
10731 + if (c->x86 < 0x0F)
10732 + return 0;
10733 +
10734 + /* Family 0x0f models < rev F do not have C1E */
10735 + if (c->x86 == 0x0f && c->x86_model < 0x40)
10736 + return 0;
10737 +
10738 + return 1;
10739 +}
10740 +
10741 +static cpumask_t c1e_mask = CPU_MASK_NONE;
10742 +static int c1e_detected;
10743 +
10744 +void c1e_remove_cpu(int cpu)
10745 +{
10746 + cpu_clear(cpu, c1e_mask);
10747 +}
10748 +
10749 +/*
10750 + * C1E aware idle routine. We check for C1E active in the interrupt
10751 + * pending message MSR. If we detect C1E, then we handle it the same
10752 + * way as C3 power states (local apic timer and TSC stop)
10753 + */
10754 +static void c1e_idle(void)
10755 +{
10756 + if (need_resched())
10757 + return;
10758 +
10759 + if (!c1e_detected) {
10760 + u32 lo, hi;
10761 +
10762 + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
10763 + if (lo & K8_INTP_C1E_ACTIVE_MASK) {
10764 + c1e_detected = 1;
10765 + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
10766 + mark_tsc_unstable("TSC halt in AMD C1E");
10767 + printk(KERN_INFO "System has AMD C1E enabled\n");
10768 + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
10769 }
10770 }
10771 - return 1;
10772 +
10773 + if (c1e_detected) {
10774 + int cpu = smp_processor_id();
10775 +
10776 + if (!cpu_isset(cpu, c1e_mask)) {
10777 + cpu_set(cpu, c1e_mask);
10778 + /*
10779 + * Force broadcast so ACPI cannot interfere. Needs
10780 + * to run with interrupts enabled as it uses
10781 + * smp_call_function.
10782 + */
10783 + local_irq_enable();
10784 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
10785 + &cpu);
10786 + printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
10787 + cpu);
10788 + local_irq_disable();
10789 + }
10790 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
10791 +
10792 + default_idle();
10793 +
10794 + /*
10795 + * The switch back from broadcast mode needs to be
10796 + * called with interrupts disabled.
10797 + */
10798 + local_irq_disable();
10799 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
10800 + local_irq_enable();
10801 + } else
10802 + default_idle();
10803 }
10804 #endif
10805
10806 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
10807 {
10808 #ifndef CONFIG_XEN
10809 - static int selected;
10810 -
10811 - if (selected)
10812 - return;
10813 #ifdef CONFIG_X86_SMP
10814 if (pm_idle == poll_idle && smp_num_siblings > 1) {
10815 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
10816 " performance may degrade.\n");
10817 }
10818 #endif
10819 + if (pm_idle)
10820 + return;
10821 +
10822 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
10823 /*
10824 - * Skip, if setup has overridden idle.
10825 * One CPU supports mwait => All CPUs supports mwait
10826 */
10827 - if (!pm_idle) {
10828 - printk(KERN_INFO "using mwait in idle threads.\n");
10829 - pm_idle = mwait_idle;
10830 - }
10831 - }
10832 - selected = 1;
10833 + printk(KERN_INFO "using mwait in idle threads.\n");
10834 + pm_idle = mwait_idle;
10835 + } else if (check_c1e_idle(c)) {
10836 + printk(KERN_INFO "using C1E aware idle routine\n");
10837 + pm_idle = c1e_idle;
10838 + } else
10839 + pm_idle = default_idle;
10840 #endif
10841 }
10842
10843 static int __init idle_setup(char *str)
10844 {
10845 + if (!str)
10846 + return -EINVAL;
10847 +
10848 if (!strcmp(str, "poll")) {
10849 printk("using polling idle threads.\n");
10850 pm_idle = poll_idle;
10851 - }
10852 #ifndef CONFIG_XEN
10853 - else if (!strcmp(str, "mwait"))
10854 + } else if (!strcmp(str, "mwait"))
10855 force_mwait = 1;
10856 + else if (!strcmp(str, "halt")) {
10857 + /*
10858 + * When the boot option of idle=halt is added, halt is
10859 + * forced to be used for CPU idle. In such case CPU C2/C3
10860 + * won't be used again.
10861 + * To continue to load the CPU idle driver, don't touch
10862 + * the boot_option_idle_override.
10863 + */
10864 + pm_idle = default_idle;
10865 + idle_halt = 1;
10866 + return 0;
10867 + } else if (!strcmp(str, "nomwait")) {
10868 + /*
10869 + * If the boot option of "idle=nomwait" is added,
10870 + * it means that mwait will be disabled for CPU C2/C3
10871 + * states. In such case it won't touch the variable
10872 + * of boot_option_idle_override.
10873 + */
10874 + idle_nomwait = 1;
10875 + return 0;
10876 #endif
10877 - else
10878 + } else
10879 return -1;
10880
10881 boot_option_idle_override = 1;
10882 Index: head-2008-12-01/arch/x86/kernel/process_32-xen.c
10883 ===================================================================
10884 --- head-2008-12-01.orig/arch/x86/kernel/process_32-xen.c 2008-12-01 11:44:55.000000000 +0100
10885 +++ head-2008-12-01/arch/x86/kernel/process_32-xen.c 2008-12-01 11:49:07.000000000 +0100
10886 @@ -59,15 +59,11 @@
10887 #include <asm/tlbflush.h>
10888 #include <asm/cpu.h>
10889 #include <asm/kdebug.h>
10890 +#include <asm/idle.h>
10891
10892 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
10893 asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
10894
10895 -static int hlt_counter;
10896 -
10897 -unsigned long boot_option_idle_override = 0;
10898 -EXPORT_SYMBOL(boot_option_idle_override);
10899 -
10900 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
10901 EXPORT_PER_CPU_SYMBOL(current_task);
10902
10903 @@ -82,46 +78,27 @@ unsigned long thread_saved_pc(struct tas
10904 return ((unsigned long *)tsk->thread.sp)[3];
10905 }
10906
10907 -/*
10908 - * Powermanagement idle function, if any..
10909 - */
10910 -void (*pm_idle)(void);
10911 -EXPORT_SYMBOL(pm_idle);
10912 +#ifdef CONFIG_HOTPLUG_CPU
10913 +#ifndef CONFIG_XEN
10914 +#include <asm/nmi.h>
10915
10916 -void disable_hlt(void)
10917 +static void cpu_exit_clear(void)
10918 {
10919 - hlt_counter++;
10920 -}
10921 + int cpu = raw_smp_processor_id();
10922
10923 -EXPORT_SYMBOL(disable_hlt);
10924 -
10925 -void enable_hlt(void)
10926 -{
10927 - hlt_counter--;
10928 -}
10929 + idle_task_exit();
10930
10931 -EXPORT_SYMBOL(enable_hlt);
10932 + cpu_uninit();
10933 + irq_ctx_exit(cpu);
10934
10935 -static void xen_idle(void)
10936 -{
10937 - current_thread_info()->status &= ~TS_POLLING;
10938 - /*
10939 - * TS_POLLING-cleared state must be visible before we
10940 - * test NEED_RESCHED:
10941 - */
10942 - smp_mb();
10943 + cpu_clear(cpu, cpu_callout_map);
10944 + cpu_clear(cpu, cpu_callin_map);
10945
10946 - if (!need_resched())
10947 - safe_halt(); /* enables interrupts racelessly */
10948 - else
10949 - local_irq_enable();
10950 - current_thread_info()->status |= TS_POLLING;
10951 + numa_remove_cpu(cpu);
10952 + c1e_remove_cpu(cpu);
10953 }
10954 -#ifdef CONFIG_APM_MODULE
10955 -EXPORT_SYMBOL(default_idle);
10956 #endif
10957
10958 -#ifdef CONFIG_HOTPLUG_CPU
10959 static inline void play_dead(void)
10960 {
10961 idle_task_exit();
10962 @@ -152,13 +129,11 @@ void cpu_idle(void)
10963
10964 /* endless idle loop with no priority at all */
10965 while (1) {
10966 - tick_nohz_stop_sched_tick();
10967 + tick_nohz_stop_sched_tick(1);
10968 while (!need_resched()) {
10969 - void (*idle)(void);
10970
10971 check_pgt_cache();
10972 rmb();
10973 - idle = xen_idle; /* no alternatives */
10974
10975 if (rcu_pending(cpu))
10976 rcu_check_callbacks(cpu, 0);
10977 @@ -168,7 +143,10 @@ void cpu_idle(void)
10978
10979 local_irq_disable();
10980 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
10981 - idle();
10982 + /* Don't trace irqs off for idle */
10983 + stop_critical_timings();
10984 + xen_idle();
10985 + start_critical_timings();
10986 }
10987 tick_nohz_restart_sched_tick();
10988 preempt_enable_no_resched();
10989 Index: head-2008-12-01/arch/x86/kernel/process_64-xen.c
10990 ===================================================================
10991 --- head-2008-12-01.orig/arch/x86/kernel/process_64-xen.c 2008-12-01 11:44:55.000000000 +0100
10992 +++ head-2008-12-01/arch/x86/kernel/process_64-xen.c 2008-12-01 11:49:07.000000000 +0100
10993 @@ -65,15 +65,6 @@ asmlinkage extern void ret_from_fork(voi
10994
10995 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
10996
10997 -unsigned long boot_option_idle_override = 0;
10998 -EXPORT_SYMBOL(boot_option_idle_override);
10999 -
11000 -/*
11001 - * Powermanagement idle function, if any..
11002 - */
11003 -void (*pm_idle)(void);
11004 -EXPORT_SYMBOL(pm_idle);
11005 -
11006 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
11007
11008 void idle_notifier_register(struct notifier_block *n)
11009 @@ -103,25 +94,13 @@ void exit_idle(void)
11010 __exit_idle();
11011 }
11012
11013 -static void xen_idle(void)
11014 -{
11015 - current_thread_info()->status &= ~TS_POLLING;
11016 - /*
11017 - * TS_POLLING-cleared state must be visible before we
11018 - * test NEED_RESCHED:
11019 - */
11020 - smp_mb();
11021 - if (!need_resched())
11022 - safe_halt(); /* enables interrupts racelessly */
11023 - else
11024 - local_irq_enable();
11025 - current_thread_info()->status |= TS_POLLING;
11026 -}
11027 -
11028 #ifdef CONFIG_HOTPLUG_CPU
11029 static inline void play_dead(void)
11030 {
11031 idle_task_exit();
11032 +#ifndef CONFIG_XEN
11033 + c1e_remove_cpu(raw_smp_processor_id());
11034 +#endif
11035 local_irq_disable();
11036 cpu_clear(smp_processor_id(), cpu_initialized);
11037 preempt_enable_no_resched();
11038 @@ -146,12 +125,11 @@ void cpu_idle(void)
11039 current_thread_info()->status |= TS_POLLING;
11040 /* endless idle loop with no priority at all */
11041 while (1) {
11042 - tick_nohz_stop_sched_tick();
11043 + tick_nohz_stop_sched_tick(1);
11044 while (!need_resched()) {
11045 - void (*idle)(void);
11046
11047 rmb();
11048 - idle = xen_idle; /* no alternatives */
11049 +
11050 if (cpu_is_offline(smp_processor_id()))
11051 play_dead();
11052 /*
11053 @@ -161,7 +139,10 @@ void cpu_idle(void)
11054 */
11055 local_irq_disable();
11056 enter_idle();
11057 - idle();
11058 + /* Don't trace irqs off for idle */
11059 + stop_critical_timings();
11060 + xen_idle();
11061 + start_critical_timings();
11062 /* In many cases the interrupt that ended idle
11063 has already called exit_idle. But some idle
11064 loops can be woken up without interrupt. */
11065 @@ -271,7 +252,7 @@ void exit_thread(void)
11066 }
11067 }
11068
11069 -void load_gs_index(unsigned gs)
11070 +void xen_load_gs_index(unsigned gs)
11071 {
11072 WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
11073 }
11074 @@ -373,10 +354,10 @@ int copy_thread(int nr, unsigned long cl
11075 p->thread.fs = me->thread.fs;
11076 p->thread.gs = me->thread.gs;
11077
11078 - asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
11079 - asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
11080 - asm("mov %%es,%0" : "=m" (p->thread.es));
11081 - asm("mov %%ds,%0" : "=m" (p->thread.ds));
11082 + savesegment(gs, p->thread.gsindex);
11083 + savesegment(fs, p->thread.fsindex);
11084 + savesegment(es, p->thread.es);
11085 + savesegment(ds, p->thread.ds);
11086
11087 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
11088 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
11089 @@ -417,7 +398,9 @@ out:
11090 void
11091 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
11092 {
11093 - asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
11094 + loadsegment(fs, 0);
11095 + loadsegment(es, 0);
11096 + loadsegment(ds, 0);
11097 load_gs_index(0);
11098 regs->ip = new_ip;
11099 regs->sp = new_sp;
11100 @@ -557,8 +540,8 @@ static inline void __switch_to_xtra(stru
11101 struct task_struct *
11102 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
11103 {
11104 - struct thread_struct *prev = &prev_p->thread,
11105 - *next = &next_p->thread;
11106 + struct thread_struct *prev = &prev_p->thread;
11107 + struct thread_struct *next = &next_p->thread;
11108 int cpu = smp_processor_id();
11109 #ifndef CONFIG_X86_NO_TSS
11110 struct tss_struct *tss = &per_cpu(init_tss, cpu);
11111 @@ -661,12 +644,25 @@ __switch_to(struct task_struct *prev_p,
11112 */
11113 if (unlikely(next->es))
11114 loadsegment(es, next->es);
11115 -
11116 +
11117 if (unlikely(next->ds))
11118 loadsegment(ds, next->ds);
11119
11120 + /*
11121 + * Leave lazy mode, flushing any hypercalls made here.
11122 + * This must be done before restoring TLS segments so
11123 + * the GDT and LDT are properly updated, and must be
11124 + * done before math_state_restore, so the TS bit is up
11125 + * to date.
11126 + */
11127 + arch_leave_lazy_cpu_mode();
11128 +
11129 /*
11130 * Switch FS and GS.
11131 + *
11132 + * Segment register != 0 always requires a reload. Also
11133 + * reload when it has changed. When prev process used 64bit
11134 + * base always reload to avoid an information leak.
11135 */
11136 if (unlikely(next->fsindex))
11137 loadsegment(fs, next->fsindex);
11138 @@ -687,7 +683,8 @@ __switch_to(struct task_struct *prev_p,
11139 write_pda(oldrsp, next->usersp);
11140 write_pda(pcurrent, next_p);
11141 write_pda(kernelstack,
11142 - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
11143 + (unsigned long)task_stack_page(next_p) +
11144 + THREAD_SIZE - PDA_STACKOFFSET);
11145 #ifdef CONFIG_CC_STACKPROTECTOR
11146 write_pda(stack_canary, next_p->stack_canary);
11147
11148 @@ -848,7 +845,7 @@ long do_arch_prctl(struct task_struct *t
11149 set_32bit_tls(task, FS_TLS, addr);
11150 if (doit) {
11151 load_TLS(&task->thread, cpu);
11152 - asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
11153 + loadsegment(fs, FS_TLS_SEL);
11154 }
11155 task->thread.fsindex = FS_TLS_SEL;
11156 task->thread.fs = 0;
11157 @@ -858,7 +855,7 @@ long do_arch_prctl(struct task_struct *t
11158 if (doit) {
11159 /* set the selector to 0 to not confuse
11160 __switch_to */
11161 - asm volatile("movl %0,%%fs" :: "r" (0));
11162 + loadsegment(fs, 0);
11163 ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
11164 addr);
11165 }
11166 @@ -882,7 +879,7 @@ long do_arch_prctl(struct task_struct *t
11167 if (task->thread.gsindex == GS_TLS_SEL)
11168 base = read_32bit_tls(task, GS_TLS);
11169 else if (doit) {
11170 - asm("movl %%gs,%0" : "=r" (gsindex));
11171 + savesegment(gs, gsindex);
11172 if (gsindex)
11173 rdmsrl(MSR_KERNEL_GS_BASE, base);
11174 else
11175 Index: head-2008-12-01/arch/x86/kernel/quirks-xen.c
11176 ===================================================================
11177 --- head-2008-12-01.orig/arch/x86/kernel/quirks-xen.c 2008-12-01 11:37:10.000000000 +0100
11178 +++ head-2008-12-01/arch/x86/kernel/quirks-xen.c 2008-12-01 11:49:07.000000000 +0100
11179 @@ -63,6 +63,7 @@ static enum {
11180 ICH_FORCE_HPET_RESUME,
11181 VT8237_FORCE_HPET_RESUME,
11182 NVIDIA_FORCE_HPET_RESUME,
11183 + ATI_FORCE_HPET_RESUME,
11184 } force_hpet_resume_type;
11185
11186 static void __iomem *rcba_base;
11187 @@ -156,6 +157,8 @@ static void ich_force_enable_hpet(struct
11188
11189 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
11190 ich_force_enable_hpet);
11191 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
11192 + ich_force_enable_hpet);
11193 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
11194 ich_force_enable_hpet);
11195 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
11196 @@ -172,6 +175,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
11197
11198 static struct pci_dev *cached_dev;
11199
11200 +static void hpet_print_force_info(void)
11201 +{
11202 + printk(KERN_INFO "HPET not enabled in BIOS. "
11203 + "You might try hpet=force boot option\n");
11204 +}
11205 +
11206 static void old_ich_force_hpet_resume(void)
11207 {
11208 u32 val;
11209 @@ -251,8 +260,12 @@ static void old_ich_force_enable_hpet_us
11210 {
11211 if (hpet_force_user)
11212 old_ich_force_enable_hpet(dev);
11213 + else
11214 + hpet_print_force_info();
11215 }
11216
11217 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
11218 + old_ich_force_enable_hpet_user);
11219 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
11220 old_ich_force_enable_hpet_user);
11221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
11222 @@ -288,9 +301,14 @@ static void vt8237_force_enable_hpet(str
11223 {
11224 u32 uninitialized_var(val);
11225
11226 - if (!hpet_force_user || hpet_address || force_hpet_address)
11227 + if (hpet_address || force_hpet_address)
11228 return;
11229
11230 + if (!hpet_force_user) {
11231 + hpet_print_force_info();
11232 + return;
11233 + }
11234 +
11235 pci_read_config_dword(dev, 0x68, &val);
11236 /*
11237 * Bit 7 is HPET enable bit.
11238 @@ -328,6 +346,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_V
11239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
11240 vt8237_force_enable_hpet);
11241
11242 +static void ati_force_hpet_resume(void)
11243 +{
11244 + pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
11245 + printk(KERN_DEBUG "Force enabled HPET at resume\n");
11246 +}
11247 +
11248 +static void ati_force_enable_hpet(struct pci_dev *dev)
11249 +{
11250 + u32 uninitialized_var(val);
11251 +
11252 + if (hpet_address || force_hpet_address)
11253 + return;
11254 +
11255 + if (!hpet_force_user) {
11256 + hpet_print_force_info();
11257 + return;
11258 + }
11259 +
11260 + pci_write_config_dword(dev, 0x14, 0xfed00000);
11261 + pci_read_config_dword(dev, 0x14, &val);
11262 + force_hpet_address = val;
11263 + force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
11264 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
11265 + force_hpet_address);
11266 + cached_dev = dev;
11267 + return;
11268 +}
11269 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
11270 + ati_force_enable_hpet);
11271 +
11272 /*
11273 * Undocumented chipset feature taken from LinuxBIOS.
11274 */
11275 @@ -341,8 +389,13 @@ static void nvidia_force_enable_hpet(str
11276 {
11277 u32 uninitialized_var(val);
11278
11279 - if (!hpet_force_user || hpet_address || force_hpet_address)
11280 + if (hpet_address || force_hpet_address)
11281 + return;
11282 +
11283 + if (!hpet_force_user) {
11284 + hpet_print_force_info();
11285 return;
11286 + }
11287
11288 pci_write_config_dword(dev, 0x44, 0xfed00001);
11289 pci_read_config_dword(dev, 0x44, &val);
11290 @@ -395,6 +448,9 @@ void force_hpet_resume(void)
11291 case NVIDIA_FORCE_HPET_RESUME:
11292 nvidia_force_hpet_resume();
11293 return;
11294 + case ATI_FORCE_HPET_RESUME:
11295 + ati_force_hpet_resume();
11296 + return;
11297 default:
11298 break;
11299 }
11300 Index: head-2008-12-01/arch/x86/kernel/setup-xen.c
11301 ===================================================================
11302 --- head-2008-12-01.orig/arch/x86/kernel/setup-xen.c 2008-12-01 11:44:55.000000000 +0100
11303 +++ head-2008-12-01/arch/x86/kernel/setup-xen.c 2008-12-01 11:49:07.000000000 +0100
11304 @@ -1,141 +1,1147 @@
11305 -#include <linux/kernel.h>
11306 +/*
11307 + * Copyright (C) 1995 Linus Torvalds
11308 + *
11309 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11310 + *
11311 + * Memory region support
11312 + * David Parsons <orc@pell.chi.il.us>, July-August 1999
11313 + *
11314 + * Added E820 sanitization routine (removes overlapping memory regions);
11315 + * Brian Moyle <bmoyle@mvista.com>, February 2001
11316 + *
11317 + * Moved CPU detection code to cpu/${cpu}.c
11318 + * Patrick Mochel <mochel@osdl.org>, March 2002
11319 + *
11320 + * Provisions for empty E820 memory regions (reported by certain BIOSes).
11321 + * Alex Achenbach <xela@slit.de>, December 2002.
11322 + *
11323 + */
11324 +
11325 +/*
11326 + * This file handles the architecture-dependent parts of initialization
11327 + */
11328 +
11329 +#include <linux/sched.h>
11330 +#include <linux/mm.h>
11331 +#include <linux/mmzone.h>
11332 +#include <linux/screen_info.h>
11333 +#include <linux/ioport.h>
11334 +#include <linux/acpi.h>
11335 +#include <linux/apm_bios.h>
11336 +#include <linux/initrd.h>
11337 +#include <linux/bootmem.h>
11338 +#include <linux/seq_file.h>
11339 +#include <linux/console.h>
11340 +#include <linux/mca.h>
11341 +#include <linux/root_dev.h>
11342 +#include <linux/highmem.h>
11343 #include <linux/module.h>
11344 +#include <linux/efi.h>
11345 #include <linux/init.h>
11346 -#include <linux/bootmem.h>
11347 +#include <linux/edd.h>
11348 +#include <linux/iscsi_ibft.h>
11349 +#include <linux/nodemask.h>
11350 +#include <linux/kexec.h>
11351 +#include <linux/dmi.h>
11352 +#include <linux/pfn.h>
11353 +#include <linux/pci.h>
11354 +#include <asm/pci-direct.h>
11355 +#include <linux/init_ohci1394_dma.h>
11356 +#include <linux/kvm_para.h>
11357 +
11358 +#include <linux/errno.h>
11359 +#include <linux/kernel.h>
11360 +#include <linux/stddef.h>
11361 +#include <linux/unistd.h>
11362 +#include <linux/ptrace.h>
11363 +#include <linux/slab.h>
11364 +#include <linux/user.h>
11365 +#include <linux/delay.h>
11366 +
11367 +#include <linux/kallsyms.h>
11368 +#include <linux/cpufreq.h>
11369 +#include <linux/dma-mapping.h>
11370 +#include <linux/ctype.h>
11371 +#include <linux/uaccess.h>
11372 +
11373 #include <linux/percpu.h>
11374 -#include <asm/smp.h>
11375 -#include <asm/percpu.h>
11376 +#include <linux/crash_dump.h>
11377 +
11378 +#include <video/edid.h>
11379 +
11380 +#include <asm/mtrr.h>
11381 +#include <asm/apic.h>
11382 +#include <asm/e820.h>
11383 +#include <asm/mpspec.h>
11384 +#include <asm/setup.h>
11385 +#include <asm/arch_hooks.h>
11386 +#include <asm/efi.h>
11387 #include <asm/sections.h>
11388 +#include <asm/dmi.h>
11389 +#include <asm/io_apic.h>
11390 +#include <asm/ist.h>
11391 +#include <asm/vmi.h>
11392 +#include <setup_arch.h>
11393 +#include <asm/bios_ebda.h>
11394 +#include <asm/cacheflush.h>
11395 #include <asm/processor.h>
11396 -#include <asm/setup.h>
11397 +#include <asm/bugs.h>
11398 +
11399 +#include <asm/system.h>
11400 +#include <asm/vsyscall.h>
11401 +#include <asm/smp.h>
11402 +#include <asm/desc.h>
11403 +#include <asm/dma.h>
11404 +#include <asm/iommu.h>
11405 +#include <asm/mmu_context.h>
11406 +#include <asm/proto.h>
11407 +
11408 +#include <mach_apic.h>
11409 +#include <asm/paravirt.h>
11410 +
11411 +#include <asm/percpu.h>
11412 #include <asm/topology.h>
11413 -#include <asm/mpspec.h>
11414 #include <asm/apicdef.h>
11415 +#ifdef CONFIG_X86_64
11416 +#include <asm/numa_64.h>
11417 +#endif
11418 +
11419 +#ifdef CONFIG_XEN
11420 +#include <asm/hypervisor.h>
11421 +#include <xen/interface/kexec.h>
11422 +#include <xen/interface/memory.h>
11423 +#include <xen/interface/nmi.h>
11424 +#include <xen/interface/physdev.h>
11425 +#include <xen/features.h>
11426 +#include <xen/firmware.h>
11427 +#include <xen/xencons.h>
11428 +
11429 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
11430 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
11431
11432 -#ifdef CONFIG_X86_LOCAL_APIC
11433 -unsigned int num_processors;
11434 -unsigned disabled_cpus __cpuinitdata;
11435 -/* Processor that is doing the boot up */
11436 -unsigned int boot_cpu_physical_apicid = -1U;
11437 -EXPORT_SYMBOL(boot_cpu_physical_apicid);
11438 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
11439 +static struct notifier_block xen_panic_block = {
11440 + xen_panic_event, NULL, 0 /* try to go last */
11441 +};
11442
11443 -DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
11444 -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
11445 +unsigned long *phys_to_machine_mapping;
11446 +EXPORT_SYMBOL(phys_to_machine_mapping);
11447
11448 -/* Bitmask of physically existing CPUs */
11449 -physid_mask_t phys_cpu_present_map;
11450 +unsigned long *pfn_to_mfn_frame_list_list,
11451 +#ifdef CONFIG_X86_64
11452 + *pfn_to_mfn_frame_list[512];
11453 +#else
11454 + *pfn_to_mfn_frame_list[128];
11455 +#endif
11456 +
11457 +/* Raw start-of-day parameters from the hypervisor. */
11458 +start_info_t *xen_start_info;
11459 +EXPORT_SYMBOL(xen_start_info);
11460 +#endif
11461 +
11462 +#ifndef ARCH_SETUP
11463 +#define ARCH_SETUP
11464 +#endif
11465 +
11466 +#ifndef CONFIG_XEN
11467 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
11468 +struct boot_params __initdata boot_params;
11469 +#else
11470 +struct boot_params boot_params;
11471 +#endif
11472 #endif
11473
11474 -#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
11475 /*
11476 - * Copy data used in early init routines from the initial arrays to the
11477 - * per cpu data areas. These arrays then become expendable and the
11478 - * *_early_ptr's are zeroed indicating that the static arrays are gone.
11479 + * Machine setup..
11480 */
11481 -static void __init setup_per_cpu_maps(void)
11482 +static struct resource data_resource = {
11483 + .name = "Kernel data",
11484 + .start = 0,
11485 + .end = 0,
11486 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11487 +};
11488 +
11489 +static struct resource code_resource = {
11490 + .name = "Kernel code",
11491 + .start = 0,
11492 + .end = 0,
11493 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11494 +};
11495 +
11496 +static struct resource bss_resource = {
11497 + .name = "Kernel bss",
11498 + .start = 0,
11499 + .end = 0,
11500 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11501 +};
11502 +
11503 +
11504 +#ifdef CONFIG_X86_32
11505 +#ifndef CONFIG_XEN
11506 +/* This value is set up by the early boot code to point to the value
11507 + immediately after the boot time page tables. It contains a *physical*
11508 + address, and must not be in the .bss segment! */
11509 +unsigned long init_pg_tables_start __initdata = ~0UL;
11510 +unsigned long init_pg_tables_end __initdata = ~0UL;
11511 +#endif
11512 +
11513 +static struct resource video_ram_resource = {
11514 + .name = "Video RAM area",
11515 + .start = 0xa0000,
11516 + .end = 0xbffff,
11517 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11518 +};
11519 +
11520 +/* cpu data as detected by the assembly code in head.S */
11521 +struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11522 +/* common cpu data for all cpus */
11523 +struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11524 +EXPORT_SYMBOL(boot_cpu_data);
11525 +#ifndef CONFIG_XEN
11526 +static void set_mca_bus(int x)
11527 {
11528 +#ifdef CONFIG_MCA
11529 + MCA_bus = x;
11530 +#endif
11531 +}
11532 +
11533 +unsigned int def_to_bigsmp;
11534 +
11535 +/* for MCA, but anyone else can use it if they want */
11536 +unsigned int machine_id;
11537 +unsigned int machine_submodel_id;
11538 +unsigned int BIOS_revision;
11539 +
11540 +struct apm_info apm_info;
11541 +EXPORT_SYMBOL(apm_info);
11542 +#endif
11543 +
11544 +#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
11545 +struct ist_info ist_info;
11546 +EXPORT_SYMBOL(ist_info);
11547 +#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
11548 +struct ist_info ist_info;
11549 +#endif
11550 +
11551 +#else
11552 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
11553 +EXPORT_SYMBOL(boot_cpu_data);
11554 +#endif
11555 +
11556 +
11557 +#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
11558 +unsigned long mmu_cr4_features;
11559 +#else
11560 +unsigned long mmu_cr4_features = X86_CR4_PAE;
11561 +#endif
11562 +
11563 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
11564 +int bootloader_type;
11565 +
11566 +/*
11567 + * Early DMI memory
11568 + */
11569 +int dmi_alloc_index;
11570 +char dmi_alloc_data[DMI_MAX_DATA];
11571 +
11572 +/*
11573 + * Setup options
11574 + */
11575 +struct screen_info screen_info;
11576 +EXPORT_SYMBOL(screen_info);
11577 +struct edid_info edid_info;
11578 +EXPORT_SYMBOL_GPL(edid_info);
11579 +
11580 +extern int root_mountflags;
11581 +
11582 +unsigned long saved_video_mode;
11583 +
11584 +#define RAMDISK_IMAGE_START_MASK 0x07FF
11585 +#define RAMDISK_PROMPT_FLAG 0x8000
11586 +#define RAMDISK_LOAD_FLAG 0x4000
11587 +
11588 +static char __initdata command_line[COMMAND_LINE_SIZE];
11589 +
11590 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
11591 +struct edd edd;
11592 +#ifdef CONFIG_EDD_MODULE
11593 +EXPORT_SYMBOL(edd);
11594 +#endif
11595 #ifndef CONFIG_XEN
11596 - int cpu;
11597 +/**
11598 + * copy_edd() - Copy the BIOS EDD information
11599 + * from boot_params into a safe place.
11600 + *
11601 + */
11602 +static inline void copy_edd(void)
11603 +{
11604 + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
11605 + sizeof(edd.mbr_signature));
11606 + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
11607 + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
11608 + edd.edd_info_nr = boot_params.eddbuf_entries;
11609 +}
11610 +#endif
11611 +#else
11612 +static inline void copy_edd(void)
11613 +{
11614 +}
11615 +#endif
11616 +
11617 +#ifdef CONFIG_BLK_DEV_INITRD
11618 +
11619 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11620 +
11621 +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
11622 +static void __init relocate_initrd(void)
11623 +{
11624 +
11625 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11626 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11627 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11628 + u64 ramdisk_here;
11629 + unsigned long slop, clen, mapaddr;
11630 + char *p, *q;
11631 +
11632 + /* We need to move the initrd down into lowmem */
11633 + ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
11634 + PAGE_SIZE);
11635 +
11636 + if (ramdisk_here == -1ULL)
11637 + panic("Cannot find place for new RAMDISK of size %lld\n",
11638 + ramdisk_size);
11639 +
11640 + /* Note: this includes all the lowmem currently occupied by
11641 + the initrd, we rely on that fact to keep the data intact. */
11642 + reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
11643 + "NEW RAMDISK");
11644 + initrd_start = ramdisk_here + PAGE_OFFSET;
11645 + initrd_end = initrd_start + ramdisk_size;
11646 + printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
11647 + ramdisk_here, ramdisk_here + ramdisk_size);
11648 +
11649 + q = (char *)initrd_start;
11650 +
11651 + /* Copy any lowmem portion of the initrd */
11652 + if (ramdisk_image < end_of_lowmem) {
11653 + clen = end_of_lowmem - ramdisk_image;
11654 + p = (char *)__va(ramdisk_image);
11655 + memcpy(q, p, clen);
11656 + q += clen;
11657 + ramdisk_image += clen;
11658 + ramdisk_size -= clen;
11659 + }
11660 +
11661 + /* Copy the highmem portion of the initrd */
11662 + while (ramdisk_size) {
11663 + slop = ramdisk_image & ~PAGE_MASK;
11664 + clen = ramdisk_size;
11665 + if (clen > MAX_MAP_CHUNK-slop)
11666 + clen = MAX_MAP_CHUNK-slop;
11667 + mapaddr = ramdisk_image & PAGE_MASK;
11668 + p = early_ioremap(mapaddr, clen+slop);
11669 + memcpy(q, p+slop, clen);
11670 + early_iounmap(p, clen+slop);
11671 + q += clen;
11672 + ramdisk_image += clen;
11673 + ramdisk_size -= clen;
11674 + }
11675 + /* high pages is not converted by early_res_to_bootmem */
11676 + ramdisk_image = boot_params.hdr.ramdisk_image;
11677 + ramdisk_size = boot_params.hdr.ramdisk_size;
11678 + printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
11679 + " %08llx - %08llx\n",
11680 + ramdisk_image, ramdisk_image + ramdisk_size - 1,
11681 + ramdisk_here, ramdisk_here + ramdisk_size - 1);
11682 +}
11683 +#endif
11684 +
11685 +static void __init reserve_initrd(void)
11686 +{
11687 +#ifndef CONFIG_XEN
11688 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11689 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11690 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
11691 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11692
11693 - for_each_possible_cpu(cpu) {
11694 - per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
11695 - per_cpu(x86_bios_cpu_apicid, cpu) =
11696 - x86_bios_cpu_apicid_init[cpu];
11697 -#ifdef CONFIG_NUMA
11698 - per_cpu(x86_cpu_to_node_map, cpu) =
11699 - x86_cpu_to_node_map_init[cpu];
11700 + if (!boot_params.hdr.type_of_loader ||
11701 + !ramdisk_image || !ramdisk_size)
11702 + return; /* No initrd provided by bootloader */
11703 +#else
11704 + unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
11705 + unsigned long ramdisk_size = xen_start_info->mod_len;
11706 + unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
11707 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11708 +
11709 + if (!xen_start_info->mod_start || !ramdisk_size)
11710 + return; /* No initrd provided by bootloader */
11711 #endif
11712 +
11713 + initrd_start = 0;
11714 +
11715 + if (ramdisk_size >= (end_of_lowmem>>1)) {
11716 + free_early(ramdisk_image, ramdisk_end);
11717 + printk(KERN_ERR "initrd too large to handle, "
11718 + "disabling initrd\n");
11719 + return;
11720 }
11721
11722 - /* indicate the early static arrays will soon be gone */
11723 - x86_cpu_to_apicid_early_ptr = NULL;
11724 - x86_bios_cpu_apicid_early_ptr = NULL;
11725 -#ifdef CONFIG_NUMA
11726 - x86_cpu_to_node_map_early_ptr = NULL;
11727 + printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
11728 + ramdisk_end);
11729 +
11730 +
11731 + if (ramdisk_end <= end_of_lowmem) {
11732 + /* All in lowmem, easy case */
11733 + /*
11734 + * don't need to reserve again, already reserved early
11735 + * in i386_start_kernel
11736 + */
11737 + initrd_start = ramdisk_image + PAGE_OFFSET;
11738 + initrd_end = initrd_start + ramdisk_size;
11739 +#ifdef CONFIG_X86_64_XEN
11740 + initrd_below_start_ok = 1;
11741 #endif
11742 + return;
11743 + }
11744 +
11745 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11746 + relocate_initrd();
11747 +#else
11748 + printk(KERN_ERR "initrd extends beyond end of memory "
11749 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
11750 + ramdisk_end, end_of_lowmem);
11751 + initrd_start = 0;
11752 #endif
11753 + free_early(ramdisk_image, ramdisk_end);
11754 }
11755 +#else
11756 +static void __init reserve_initrd(void)
11757 +{
11758 +}
11759 +#endif /* CONFIG_BLK_DEV_INITRD */
11760
11761 -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
11762 -cpumask_t *cpumask_of_cpu_map __read_mostly;
11763 -EXPORT_SYMBOL(cpumask_of_cpu_map);
11764 +static void __init parse_setup_data(void)
11765 +{
11766 +#ifndef CONFIG_XEN
11767 + struct setup_data *data;
11768 + u64 pa_data;
11769 +
11770 + if (boot_params.hdr.version < 0x0209)
11771 + return;
11772 + pa_data = boot_params.hdr.setup_data;
11773 + while (pa_data) {
11774 + data = early_ioremap(pa_data, PAGE_SIZE);
11775 + switch (data->type) {
11776 + case SETUP_E820_EXT:
11777 + parse_e820_ext(data, pa_data);
11778 + break;
11779 + default:
11780 + break;
11781 + }
11782 + pa_data = data->next;
11783 + early_iounmap(data, PAGE_SIZE);
11784 + }
11785 +#endif
11786 +}
11787
11788 -/* requires nr_cpu_ids to be initialized */
11789 -static void __init setup_cpumask_of_cpu(void)
11790 +static void __init e820_reserve_setup_data(void)
11791 {
11792 - int i;
11793 +#ifndef CONFIG_XEN
11794 + struct setup_data *data;
11795 + u64 pa_data;
11796 + int found = 0;
11797
11798 - /* alloc_bootmem zeroes memory */
11799 - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
11800 - for (i = 0; i < nr_cpu_ids; i++)
11801 - cpu_set(i, cpumask_of_cpu_map[i]);
11802 + if (boot_params.hdr.version < 0x0209)
11803 + return;
11804 + pa_data = boot_params.hdr.setup_data;
11805 + while (pa_data) {
11806 + data = early_ioremap(pa_data, sizeof(*data));
11807 + e820_update_range(pa_data, sizeof(*data)+data->len,
11808 + E820_RAM, E820_RESERVED_KERN);
11809 + found = 1;
11810 + pa_data = data->next;
11811 + early_iounmap(data, sizeof(*data));
11812 + }
11813 + if (!found)
11814 + return;
11815 +
11816 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
11817 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
11818 + printk(KERN_INFO "extended physical RAM map:\n");
11819 + e820_print_map("reserve setup_data");
11820 +#endif
11821 }
11822 -#else
11823 -static inline void setup_cpumask_of_cpu(void) { }
11824 +
11825 +static void __init reserve_early_setup_data(void)
11826 +{
11827 +#ifndef CONFIG_XEN
11828 + struct setup_data *data;
11829 + u64 pa_data;
11830 + char buf[32];
11831 +
11832 + if (boot_params.hdr.version < 0x0209)
11833 + return;
11834 + pa_data = boot_params.hdr.setup_data;
11835 + while (pa_data) {
11836 + data = early_ioremap(pa_data, sizeof(*data));
11837 + sprintf(buf, "setup data %x", data->type);
11838 + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
11839 + pa_data = data->next;
11840 + early_iounmap(data, sizeof(*data));
11841 + }
11842 #endif
11843 +}
11844
11845 -#ifdef CONFIG_X86_32
11846 /*
11847 - * Great future not-so-futuristic plan: make i386 and x86_64 do it
11848 - * the same way
11849 + * --------- Crashkernel reservation ------------------------------
11850 + */
11851 +
11852 +#ifdef CONFIG_KEXEC
11853 +
11854 +#ifndef CONFIG_XEN
11855 +/**
11856 + * Reserve @size bytes of crashkernel memory at any suitable offset.
11857 + *
11858 + * @size: Size of the crashkernel memory to reserve.
11859 + * Returns the base address on success, and -1ULL on failure.
11860 + */
11861 +unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
11862 +{
11863 + const unsigned long long alignment = 16<<20; /* 16M */
11864 + unsigned long long start = 0LL;
11865 +
11866 + while (1) {
11867 + int ret;
11868 +
11869 + start = find_e820_area(start, ULONG_MAX, size, alignment);
11870 + if (start == -1ULL)
11871 + return start;
11872 +
11873 + /* try to reserve it */
11874 + ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
11875 + if (ret >= 0)
11876 + return start;
11877 +
11878 + start += alignment;
11879 + }
11880 +}
11881 +
11882 +static inline unsigned long long get_total_mem(void)
11883 +{
11884 + unsigned long long total;
11885 +
11886 + total = max_low_pfn - min_low_pfn;
11887 +#ifdef CONFIG_HIGHMEM
11888 + total += highend_pfn - highstart_pfn;
11889 +#endif
11890 +
11891 + return total << PAGE_SHIFT;
11892 +}
11893 +
11894 +static void __init reserve_crashkernel(void)
11895 +{
11896 + unsigned long long total_mem;
11897 + unsigned long long crash_size, crash_base;
11898 + int ret;
11899 +
11900 + total_mem = get_total_mem();
11901 +
11902 + ret = parse_crashkernel(boot_command_line, total_mem,
11903 + &crash_size, &crash_base);
11904 + if (ret != 0 || crash_size <= 0)
11905 + return;
11906 +
11907 + /* 0 means: find the address automatically */
11908 + if (crash_base <= 0) {
11909 + crash_base = find_and_reserve_crashkernel(crash_size);
11910 + if (crash_base == -1ULL) {
11911 + pr_info("crashkernel reservation failed. "
11912 + "No suitable area found.\n");
11913 + return;
11914 + }
11915 + } else {
11916 + ret = reserve_bootmem_generic(crash_base, crash_size,
11917 + BOOTMEM_EXCLUSIVE);
11918 + if (ret < 0) {
11919 + pr_info("crashkernel reservation failed - "
11920 + "memory is in use\n");
11921 + return;
11922 + }
11923 + }
11924 +
11925 + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
11926 + "for crashkernel (System RAM: %ldMB)\n",
11927 + (unsigned long)(crash_size >> 20),
11928 + (unsigned long)(crash_base >> 20),
11929 + (unsigned long)(total_mem >> 20));
11930 +
11931 + crashk_res.start = crash_base;
11932 + crashk_res.end = crash_base + crash_size - 1;
11933 + insert_resource(&iomem_resource, &crashk_res);
11934 +}
11935 +#else
11936 +#define reserve_crashkernel xen_machine_kexec_setup_resources
11937 +#endif
11938 +#else
11939 +static void __init reserve_crashkernel(void)
11940 +{
11941 +}
11942 +#endif
11943 +
11944 +static struct resource standard_io_resources[] = {
11945 + { .name = "dma1", .start = 0x00, .end = 0x1f,
11946 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11947 + { .name = "pic1", .start = 0x20, .end = 0x21,
11948 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11949 + { .name = "timer0", .start = 0x40, .end = 0x43,
11950 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11951 + { .name = "timer1", .start = 0x50, .end = 0x53,
11952 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11953 + { .name = "keyboard", .start = 0x60, .end = 0x60,
11954 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11955 + { .name = "keyboard", .start = 0x64, .end = 0x64,
11956 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11957 + { .name = "dma page reg", .start = 0x80, .end = 0x8f,
11958 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11959 + { .name = "pic2", .start = 0xa0, .end = 0xa1,
11960 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11961 + { .name = "dma2", .start = 0xc0, .end = 0xdf,
11962 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11963 + { .name = "fpu", .start = 0xf0, .end = 0xff,
11964 + .flags = IORESOURCE_BUSY | IORESOURCE_IO }
11965 +};
11966 +
11967 +static void __init reserve_standard_io_resources(void)
11968 +{
11969 + int i;
11970 +
11971 + /* Nothing to do if not running in dom0. */
11972 + if (!is_initial_xendomain())
11973 + return;
11974 +
11975 + /* request I/O space for devices used on all i[345]86 PCs */
11976 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
11977 + request_resource(&ioport_resource, &standard_io_resources[i]);
11978 +
11979 +}
11980 +
11981 +#ifdef CONFIG_PROC_VMCORE
11982 +/* elfcorehdr= specifies the location of elf core header
11983 + * stored by the crashed kernel. This option will be passed
11984 + * by kexec loader to the capture kernel.
11985 */
11986 -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
11987 -EXPORT_SYMBOL(__per_cpu_offset);
11988 +static int __init setup_elfcorehdr(char *arg)
11989 +{
11990 + char *end;
11991 + if (!arg)
11992 + return -EINVAL;
11993 + elfcorehdr_addr = memparse(arg, &end);
11994 + return end > arg ? 0 : -EINVAL;
11995 +}
11996 +early_param("elfcorehdr", setup_elfcorehdr);
11997 #endif
11998
11999 +static struct x86_quirks default_x86_quirks __initdata;
12000 +
12001 +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
12002 +
12003 /*
12004 - * Great future plan:
12005 - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
12006 - * Always point %gs to its beginning
12007 + * Determine if we were loaded by an EFI loader. If so, then we have also been
12008 + * passed the efi memmap, systab, etc., so we should use these data structures
12009 + * for initialization. Note, the efi init code path is determined by the
12010 + * global efi_enabled. This allows the same kernel image to be used on existing
12011 + * systems (with a traditional BIOS) as well as on EFI systems.
12012 */
12013 -void __init setup_per_cpu_areas(void)
12014 +/*
12015 + * setup_arch - architecture-specific boot-time initializations
12016 + *
12017 + * Note: On x86_64, fixmaps are ready for use even before this is called.
12018 + */
12019 +
12020 +void __init setup_arch(char **cmdline_p)
12021 {
12022 - int i, highest_cpu = 0;
12023 - unsigned long size;
12024 +#ifdef CONFIG_XEN
12025 + unsigned int i;
12026 + unsigned long p2m_pages;
12027 + struct physdev_set_iopl set_iopl;
12028
12029 -#ifdef CONFIG_HOTPLUG_CPU
12030 - prefill_possible_map();
12031 +#ifdef CONFIG_X86_32
12032 + /* Force a quick death if the kernel panics (not domain 0). */
12033 + extern int panic_timeout;
12034 + if (!panic_timeout && !is_initial_xendomain())
12035 + panic_timeout = 1;
12036 #endif
12037
12038 - /* Copy section for each CPU (we discard the original) */
12039 - size = PERCPU_ENOUGH_ROOM;
12040 - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
12041 - size);
12042 -
12043 - for_each_possible_cpu(i) {
12044 - char *ptr;
12045 -#ifndef CONFIG_NEED_MULTIPLE_NODES
12046 - ptr = alloc_bootmem_pages(size);
12047 -#else
12048 - int node = early_cpu_to_node(i);
12049 - if (!node_online(node) || !NODE_DATA(node)) {
12050 - ptr = alloc_bootmem_pages(size);
12051 - printk(KERN_INFO
12052 - "cpu %d has no node or node-local memory\n", i);
12053 - }
12054 - else
12055 - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
12056 + /* Register a call for panic conditions. */
12057 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
12058 +
12059 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12060 + VMASST_TYPE_writable_pagetables));
12061 +#ifdef CONFIG_X86_32
12062 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12063 + VMASST_TYPE_4gb_segments));
12064 +#endif
12065 +#endif /* CONFIG_XEN */
12066 +
12067 +#ifdef CONFIG_X86_32
12068 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
12069 + visws_early_detect();
12070 + pre_setup_arch_hook();
12071 +#else
12072 + printk(KERN_INFO "Command line: %s\n", boot_command_line);
12073 +#endif
12074 +
12075 + early_cpu_init();
12076 + early_ioremap_init();
12077 +
12078 +#ifndef CONFIG_XEN
12079 + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
12080 + screen_info = boot_params.screen_info;
12081 + edid_info = boot_params.edid_info;
12082 +#ifdef CONFIG_X86_32
12083 + apm_info.bios = boot_params.apm_bios_info;
12084 + ist_info = boot_params.ist_info;
12085 + if (boot_params.sys_desc_table.length != 0) {
12086 + set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
12087 + machine_id = boot_params.sys_desc_table.table[0];
12088 + machine_submodel_id = boot_params.sys_desc_table.table[1];
12089 + BIOS_revision = boot_params.sys_desc_table.table[2];
12090 + }
12091 +#endif
12092 + saved_video_mode = boot_params.hdr.vid_mode;
12093 + bootloader_type = boot_params.hdr.type_of_loader;
12094 +
12095 +#ifdef CONFIG_BLK_DEV_RAM
12096 + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
12097 + rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
12098 + rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
12099 +#endif
12100 +#ifdef CONFIG_EFI
12101 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
12102 +#ifdef CONFIG_X86_32
12103 + "EL32",
12104 +#else
12105 + "EL64",
12106 +#endif
12107 + 4)) {
12108 + efi_enabled = 1;
12109 + efi_reserve_early();
12110 + }
12111 +#endif
12112 +#else /* CONFIG_XEN */
12113 +#ifdef CONFIG_X86_32
12114 + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
12115 + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
12116 + */
12117 + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
12118 +#else
12119 + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
12120 +#endif
12121 + if (is_initial_xendomain()) {
12122 + const struct dom0_vga_console_info *info =
12123 + (void *)((char *)xen_start_info +
12124 + xen_start_info->console.dom0.info_off);
12125 +
12126 + dom0_init_screen_info(info,
12127 + xen_start_info->console.dom0.info_size);
12128 + xen_start_info->console.domU.mfn = 0;
12129 + xen_start_info->console.domU.evtchn = 0;
12130 + } else
12131 + screen_info.orig_video_isVGA = 0;
12132 + copy_edid();
12133 +#endif /* CONFIG_XEN */
12134 +
12135 + ARCH_SETUP
12136 +
12137 + setup_memory_map();
12138 + parse_setup_data();
12139 + /* update the e820_saved too */
12140 + e820_reserve_setup_data();
12141 +
12142 + copy_edd();
12143 +
12144 +#ifndef CONFIG_XEN
12145 + if (!boot_params.hdr.root_flags)
12146 + root_mountflags &= ~MS_RDONLY;
12147 #endif
12148 - if (!ptr)
12149 - panic("Cannot allocate cpu data for CPU %d\n", i);
12150 + init_mm.start_code = (unsigned long) _text;
12151 + init_mm.end_code = (unsigned long) _etext;
12152 + init_mm.end_data = (unsigned long) _edata;
12153 +#ifdef CONFIG_X86_32
12154 +#ifndef CONFIG_XEN
12155 + init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
12156 +#else
12157 + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
12158 + xen_start_info->nr_pt_frames) << PAGE_SHIFT;
12159 +#endif
12160 +#else
12161 + init_mm.brk = (unsigned long) &_end;
12162 +#endif
12163 +
12164 + code_resource.start = virt_to_phys(_text);
12165 + code_resource.end = virt_to_phys(_etext)-1;
12166 + data_resource.start = virt_to_phys(_etext);
12167 + data_resource.end = virt_to_phys(_edata)-1;
12168 + bss_resource.start = virt_to_phys(&__bss_start);
12169 + bss_resource.end = virt_to_phys(&__bss_stop)-1;
12170 +
12171 + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
12172 + *cmdline_p = command_line;
12173 +
12174 + parse_early_param();
12175 +
12176 #ifdef CONFIG_X86_64
12177 - cpu_pda(i)->data_offset = ptr - __per_cpu_start;
12178 + check_efer();
12179 +#endif
12180 +
12181 +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
12182 + /*
12183 + * Must be before kernel pagetables are setup
12184 + * or fixmap area is touched.
12185 + */
12186 + vmi_init();
12187 +#endif
12188 +
12189 + /* after early param, so could get panic from serial */
12190 + reserve_early_setup_data();
12191 +
12192 + if (acpi_mps_check()) {
12193 +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
12194 + disable_apic = 1;
12195 +#endif
12196 + setup_clear_cpu_cap(X86_FEATURE_APIC);
12197 + }
12198 +
12199 +#ifdef CONFIG_PCI
12200 + if (pci_early_dump_regs)
12201 + early_dump_pci_devices();
12202 +#endif
12203 +
12204 + finish_e820_parsing();
12205 +
12206 +#ifdef CONFIG_X86_32
12207 + probe_roms();
12208 +#endif
12209 +
12210 +#ifndef CONFIG_XEN
12211 + /* after parse_early_param, so could debug it */
12212 + insert_resource(&iomem_resource, &code_resource);
12213 + insert_resource(&iomem_resource, &data_resource);
12214 + insert_resource(&iomem_resource, &bss_resource);
12215 +
12216 + if (efi_enabled)
12217 + efi_init();
12218 +
12219 +#ifdef CONFIG_X86_32
12220 + if (ppro_with_ram_bug()) {
12221 + e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
12222 + E820_RESERVED);
12223 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
12224 + printk(KERN_INFO "fixed physical RAM map:\n");
12225 + e820_print_map("bad_ppro");
12226 + }
12227 #else
12228 - __per_cpu_offset[i] = ptr - __per_cpu_start;
12229 + early_gart_iommu_check();
12230 #endif
12231 - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
12232 +#endif /* CONFIG_XEN */
12233
12234 - highest_cpu = i;
12235 + /*
12236 + * partially used pages are not usable - thus
12237 + * we are rounding upwards:
12238 + */
12239 + max_pfn = e820_end_of_ram_pfn();
12240 +
12241 + /* preallocate 4k for mptable mpc */
12242 + early_reserve_e820_mpc_new();
12243 + /* update e820 for memory not covered by WB MTRRs */
12244 + mtrr_bp_init();
12245 +#ifndef CONFIG_XEN
12246 + if (mtrr_trim_uncached_memory(max_pfn))
12247 + max_pfn = e820_end_of_ram_pfn();
12248 +#endif
12249 +
12250 +#ifdef CONFIG_X86_32
12251 + /* max_low_pfn get updated here */
12252 + find_low_pfn_range();
12253 +#else
12254 + num_physpages = max_pfn;
12255 + max_mapnr = max_pfn;
12256 +
12257 +
12258 + /* How many end-of-memory variables you have, grandma! */
12259 + /* need this before calling reserve_initrd */
12260 + if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
12261 + max_low_pfn = e820_end_of_low_ram_pfn();
12262 + else
12263 + max_low_pfn = max_pfn;
12264 +
12265 + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
12266 +#endif
12267 +
12268 + /* max_pfn_mapped is updated here */
12269 +#ifdef CONFIG_X86_64_XEN
12270 + /*
12271 + * Due to the way initial table space gets calculated on Xen, we have
12272 + * to call init_memory_mapping() with the larger end address first.
12273 + */
12274 + if (max_pfn > max_low_pfn)
12275 + max_pfn_mapped = init_memory_mapping(1UL<<32,
12276 + max_pfn<<PAGE_SHIFT);
12277 + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
12278 + if (max_pfn > max_low_pfn)
12279 + /* can we preserve max_low_pfn ?*/
12280 + max_low_pfn = max_pfn;
12281 + else
12282 + max_pfn_mapped = max_low_pfn_mapped;
12283 +#else
12284 + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
12285 + max_pfn_mapped = max_low_pfn_mapped;
12286 +
12287 +#ifdef CONFIG_X86_64
12288 + if (max_pfn > max_low_pfn) {
12289 + max_pfn_mapped = init_memory_mapping(1UL<<32,
12290 + max_pfn<<PAGE_SHIFT);
12291 + /* can we preserve max_low_pfn ?*/
12292 + max_low_pfn = max_pfn;
12293 }
12294 +#endif
12295 +#endif
12296
12297 - nr_cpu_ids = highest_cpu + 1;
12298 - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
12299 + /*
12300 + * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
12301 + */
12302
12303 - /* Setup percpu data maps */
12304 - setup_per_cpu_maps();
12305 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
12306 + if (init_ohci1394_dma_early)
12307 + init_ohci1394_dma_on_all_controllers();
12308 +#endif
12309
12310 - /* Setup cpumask_of_cpu map */
12311 - setup_cpumask_of_cpu();
12312 -}
12313 + reserve_initrd();
12314 +
12315 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12316 + vsmp_init();
12317 +#endif
12318 +
12319 + if (is_initial_xendomain())
12320 + dmi_scan_machine();
12321 +
12322 + io_delay_init();
12323 +
12324 +#ifdef CONFIG_ACPI
12325 + if (!is_initial_xendomain()) {
12326 + printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
12327 + disable_acpi();
12328 + }
12329 +#endif
12330 +
12331 + /*
12332 + * Parse the ACPI tables for possible boot-time SMP configuration.
12333 + */
12334 + acpi_boot_table_init();
12335 +
12336 +#ifdef CONFIG_ACPI_NUMA
12337 + /*
12338 + * Parse SRAT to discover nodes.
12339 + */
12340 + acpi_numa_init();
12341 +#endif
12342 +
12343 + initmem_init(0, max_pfn);
12344
12345 +#ifdef CONFIG_ACPI_SLEEP
12346 + /*
12347 + * Reserve low memory region for sleep support.
12348 + */
12349 + acpi_reserve_bootmem();
12350 #endif
12351 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
12352 + /*
12353 + * Find and reserve possible boot-time SMP configuration:
12354 + */
12355 + find_smp_config();
12356 +#endif
12357 + reserve_crashkernel();
12358 +
12359 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12360 + /*
12361 + * dma32_reserve_bootmem() allocates bootmem which may conflict
12362 + * with the crashkernel command line, so do that after
12363 + * reserve_crashkernel()
12364 + */
12365 + dma32_reserve_bootmem();
12366 +#endif
12367 +
12368 + reserve_ibft_region();
12369 +
12370 +#ifdef CONFIG_KVM_CLOCK
12371 + kvmclock_init();
12372 +#endif
12373 +
12374 + xen_pagetable_setup_start(swapper_pg_dir);
12375 + paging_init();
12376 + xen_pagetable_setup_done(swapper_pg_dir);
12377 + paravirt_post_allocator_init();
12378 +
12379 +#ifdef CONFIG_X86_64
12380 + map_vsyscall();
12381 +#endif
12382 +
12383 +#ifdef CONFIG_XEN
12384 + p2m_pages = max_pfn;
12385 + if (xen_start_info->nr_pages > max_pfn) {
12386 + /*
12387 + * the max_pfn was shrunk (probably by mem= or highmem=
12388 + * kernel parameter); shrink reservation with the HV
12389 + */
12390 + struct xen_memory_reservation reservation = {
12391 + .address_bits = 0,
12392 + .extent_order = 0,
12393 + .domid = DOMID_SELF
12394 + };
12395 + unsigned int difference;
12396 + int ret;
12397 +
12398 + difference = xen_start_info->nr_pages - max_pfn;
12399 +
12400 + set_xen_guest_handle(reservation.extent_start,
12401 + ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
12402 + reservation.nr_extents = difference;
12403 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
12404 + &reservation);
12405 + BUG_ON(ret != difference);
12406 + }
12407 + else if (max_pfn > xen_start_info->nr_pages)
12408 + p2m_pages = xen_start_info->nr_pages;
12409 +
12410 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
12411 + unsigned long i, j;
12412 + unsigned int k, fpp;
12413 +
12414 + /* Make sure we have a large enough P->M table. */
12415 + phys_to_machine_mapping = alloc_bootmem_pages(
12416 + max_pfn * sizeof(unsigned long));
12417 + memset(phys_to_machine_mapping, ~0,
12418 + max_pfn * sizeof(unsigned long));
12419 + memcpy(phys_to_machine_mapping,
12420 + (unsigned long *)xen_start_info->mfn_list,
12421 + p2m_pages * sizeof(unsigned long));
12422 + free_bootmem(
12423 + __pa(xen_start_info->mfn_list),
12424 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
12425 + sizeof(unsigned long))));
12426 +
12427 + /*
12428 + * Initialise the list of the frames that specify the list of
12429 + * frames that make up the p2m table. Used by save/restore.
12430 + */
12431 + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
12432 +
12433 + fpp = PAGE_SIZE/sizeof(unsigned long);
12434 + for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
12435 + if (j == fpp)
12436 + j = 0;
12437 + if (j == 0) {
12438 + k++;
12439 + BUG_ON(k>=ARRAY_SIZE(pfn_to_mfn_frame_list));
12440 + pfn_to_mfn_frame_list[k] =
12441 + alloc_bootmem_pages(PAGE_SIZE);
12442 + pfn_to_mfn_frame_list_list[k] =
12443 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
12444 + }
12445 + pfn_to_mfn_frame_list[k][j] =
12446 + virt_to_mfn(&phys_to_machine_mapping[i]);
12447 + }
12448 + HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
12449 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
12450 + virt_to_mfn(pfn_to_mfn_frame_list_list);
12451 + }
12452 +
12453 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
12454 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
12455 + if (i != 4 && request_dma(i, "xen") != 0)
12456 + BUG();
12457 +#endif /* CONFIG_XEN */
12458 +
12459 +#ifdef CONFIG_X86_GENERICARCH
12460 + generic_apic_probe();
12461 +#endif
12462 +
12463 +#ifndef CONFIG_XEN
12464 + early_quirks();
12465 +#endif
12466 +
12467 + /*
12468 + * Read APIC and some other early information from ACPI tables.
12469 + */
12470 + acpi_boot_init();
12471 +
12472 +#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
12473 + /*
12474 + * get boot-time SMP configuration:
12475 + */
12476 + if (smp_found_config)
12477 + get_smp_config();
12478 +#endif
12479 +
12480 + prefill_possible_map();
12481 +#ifdef CONFIG_X86_64
12482 + init_cpu_to_node();
12483 +#endif
12484 +
12485 +#ifndef CONFIG_XEN
12486 + init_apic_mappings();
12487 + ioapic_init_mappings();
12488 +
12489 + kvm_guest_init();
12490 +
12491 + e820_reserve_resources();
12492 + e820_mark_nosave_regions(max_low_pfn);
12493 +#else
12494 + if (is_initial_xendomain())
12495 + e820_reserve_resources();
12496 +#endif
12497 +
12498 +#ifdef CONFIG_X86_32
12499 + request_resource(&iomem_resource, &video_ram_resource);
12500 +#endif
12501 + reserve_standard_io_resources();
12502 +
12503 +#ifndef CONFIG_XEN
12504 + e820_setup_gap();
12505 +
12506 +#ifdef CONFIG_VT
12507 +#if defined(CONFIG_VGA_CONSOLE)
12508 + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
12509 + conswitchp = &vga_con;
12510 +#elif defined(CONFIG_DUMMY_CONSOLE)
12511 + conswitchp = &dummy_con;
12512 +#endif
12513 +#endif
12514 +#else /* CONFIG_XEN */
12515 + if (is_initial_xendomain())
12516 + e820_setup_gap();
12517 +
12518 + set_iopl.iopl = 1;
12519 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
12520 +
12521 +#ifdef CONFIG_VT
12522 +#ifdef CONFIG_DUMMY_CONSOLE
12523 + conswitchp = &dummy_con;
12524 +#endif
12525 +#ifdef CONFIG_VGA_CONSOLE
12526 + if (is_initial_xendomain())
12527 + conswitchp = &vga_con;
12528 +#endif
12529 +#endif
12530 +#endif /* CONFIG_XEN */
12531 +}
12532 +
12533 +#ifdef CONFIG_XEN
12534 +static int
12535 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
12536 +{
12537 + HYPERVISOR_shutdown(SHUTDOWN_crash);
12538 + /* we're never actually going to get here... */
12539 + return NOTIFY_DONE;
12540 +}
12541 +#endif /* CONFIG_XEN */
12542 Index: head-2008-12-01/arch/x86/kernel/setup64-xen.c
12543 ===================================================================
12544 --- head-2008-12-01.orig/arch/x86/kernel/setup64-xen.c 2008-12-01 11:44:55.000000000 +0100
12545 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12546 @@ -1,370 +0,0 @@
12547 -/*
12548 - * X86-64 specific CPU setup.
12549 - * Copyright (C) 1995 Linus Torvalds
12550 - * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
12551 - * See setup.c for older changelog.
12552 - *
12553 - * Jun Nakajima <jun.nakajima@intel.com>
12554 - * Modified for Xen
12555 - *
12556 - */
12557 -#include <linux/init.h>
12558 -#include <linux/kernel.h>
12559 -#include <linux/sched.h>
12560 -#include <linux/string.h>
12561 -#include <linux/bootmem.h>
12562 -#include <linux/bitops.h>
12563 -#include <linux/module.h>
12564 -#include <linux/kgdb.h>
12565 -#include <asm/pda.h>
12566 -#include <asm/pgtable.h>
12567 -#include <asm/processor.h>
12568 -#include <asm/desc.h>
12569 -#include <asm/atomic.h>
12570 -#include <asm/mmu_context.h>
12571 -#include <asm/smp.h>
12572 -#include <asm/i387.h>
12573 -#include <asm/percpu.h>
12574 -#include <asm/proto.h>
12575 -#include <asm/sections.h>
12576 -#include <asm/setup.h>
12577 -#include <asm/genapic.h>
12578 -#ifdef CONFIG_XEN
12579 -#include <asm/hypervisor.h>
12580 -#endif
12581 -
12582 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
12583 -struct boot_params __initdata boot_params;
12584 -#else
12585 -struct boot_params boot_params;
12586 -#endif
12587 -
12588 -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
12589 -
12590 -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
12591 -EXPORT_SYMBOL(_cpu_pda);
12592 -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
12593 -
12594 -#ifndef CONFIG_X86_NO_IDT
12595 -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
12596 -#endif
12597 -
12598 -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
12599 -
12600 -unsigned long __supported_pte_mask __read_mostly = ~0UL;
12601 -EXPORT_SYMBOL(__supported_pte_mask);
12602 -
12603 -static int do_not_nx __cpuinitdata = 0;
12604 -
12605 -/* noexec=on|off
12606 -Control non executable mappings for 64bit processes.
12607 -
12608 -on Enable(default)
12609 -off Disable
12610 -*/
12611 -static int __init nonx_setup(char *str)
12612 -{
12613 - if (!str)
12614 - return -EINVAL;
12615 - if (!strncmp(str, "on", 2)) {
12616 - __supported_pte_mask |= _PAGE_NX;
12617 - do_not_nx = 0;
12618 - } else if (!strncmp(str, "off", 3)) {
12619 - do_not_nx = 1;
12620 - __supported_pte_mask &= ~_PAGE_NX;
12621 - }
12622 - return 0;
12623 -}
12624 -early_param("noexec", nonx_setup);
12625 -
12626 -int force_personality32 = 0;
12627 -
12628 -/* noexec32=on|off
12629 -Control non executable heap for 32bit processes.
12630 -To control the stack too use noexec=off
12631 -
12632 -on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
12633 -off PROT_READ implies PROT_EXEC
12634 -*/
12635 -static int __init nonx32_setup(char *str)
12636 -{
12637 - if (!strcmp(str, "on"))
12638 - force_personality32 &= ~READ_IMPLIES_EXEC;
12639 - else if (!strcmp(str, "off"))
12640 - force_personality32 |= READ_IMPLIES_EXEC;
12641 - return 1;
12642 -}
12643 -__setup("noexec32=", nonx32_setup);
12644 -
12645 -#ifdef CONFIG_XEN
12646 -static void __init_refok switch_pt(int cpu)
12647 -{
12648 - if (cpu == 0)
12649 - xen_init_pt();
12650 - xen_pt_switch(__pa_symbol(init_level4_pgt));
12651 - xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
12652 -}
12653 -#define switch_pt() switch_pt(cpu)
12654 -
12655 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12656 -{
12657 - unsigned long frames[16];
12658 - unsigned long va;
12659 - int f;
12660 -
12661 - for (va = gdt_descr->address, f = 0;
12662 - va < gdt_descr->address + gdt_descr->size;
12663 - va += PAGE_SIZE, f++) {
12664 - frames[f] = virt_to_mfn(va);
12665 - make_page_readonly(
12666 - (void *)va, XENFEAT_writable_descriptor_tables);
12667 - }
12668 - if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
12669 - sizeof (struct desc_struct)))
12670 - BUG();
12671 -}
12672 -#else
12673 -static void switch_pt(void)
12674 -{
12675 - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
12676 -}
12677 -
12678 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12679 -{
12680 - load_gdt(gdt_descr);
12681 - load_idt(idt_descr);
12682 -}
12683 -#endif
12684 -
12685 -void pda_init(int cpu)
12686 -{
12687 - struct x8664_pda *pda = cpu_pda(cpu);
12688 -
12689 - /* Setup up data that may be needed in __get_free_pages early */
12690 - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
12691 -#ifndef CONFIG_XEN
12692 - /* Memory clobbers used to order PDA accessed */
12693 - mb();
12694 - wrmsrl(MSR_GS_BASE, pda);
12695 - mb();
12696 -#else
12697 - if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
12698 - (unsigned long)pda))
12699 - BUG();
12700 -#endif
12701 - pda->cpunumber = cpu;
12702 - pda->irqcount = -1;
12703 - pda->kernelstack =
12704 - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
12705 - pda->active_mm = &init_mm;
12706 - pda->mmu_state = 0;
12707 -
12708 - if (cpu == 0) {
12709 - /* others are initialized in smpboot.c */
12710 - pda->pcurrent = &init_task;
12711 - pda->irqstackptr = boot_cpu_stack;
12712 - } else {
12713 - pda->irqstackptr = (char *)
12714 - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
12715 - if (!pda->irqstackptr)
12716 - panic("cannot allocate irqstack for cpu %d", cpu);
12717 - }
12718 -
12719 - switch_pt();
12720 -
12721 - pda->irqstackptr += IRQSTACKSIZE-64;
12722 -}
12723 -
12724 -#ifndef CONFIG_X86_NO_TSS
12725 -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
12726 -__attribute__((section(".bss.page_aligned")));
12727 -#endif
12728 -
12729 -extern asmlinkage void ignore_sysret(void);
12730 -
12731 -/* May not be marked __init: used by software suspend */
12732 -void syscall_init(void)
12733 -{
12734 -#ifndef CONFIG_XEN
12735 - /*
12736 - * LSTAR and STAR live in a bit strange symbiosis.
12737 - * They both write to the same internal register. STAR allows to set CS/DS
12738 - * but only a 32bit target. LSTAR sets the 64bit rip.
12739 - */
12740 - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
12741 - wrmsrl(MSR_LSTAR, system_call);
12742 - wrmsrl(MSR_CSTAR, ignore_sysret);
12743 -
12744 - /* Flags to clear on syscall */
12745 - wrmsrl(MSR_SYSCALL_MASK,
12746 - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
12747 -#endif
12748 -#ifdef CONFIG_IA32_EMULATION
12749 - syscall32_cpu_init ();
12750 -#else
12751 - {
12752 - static const struct callback_register cstar = {
12753 - .type = CALLBACKTYPE_syscall32,
12754 - .address = (unsigned long)ignore_sysret
12755 - };
12756 - if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
12757 - printk(KERN_WARN "Unable to register CSTAR callback\n");
12758 - }
12759 -#endif
12760 -}
12761 -
12762 -void __cpuinit check_efer(void)
12763 -{
12764 - unsigned long efer;
12765 -
12766 - rdmsrl(MSR_EFER, efer);
12767 - if (!(efer & EFER_NX) || do_not_nx) {
12768 - __supported_pte_mask &= ~_PAGE_NX;
12769 - }
12770 -}
12771 -
12772 -unsigned long kernel_eflags;
12773 -
12774 -#ifndef CONFIG_X86_NO_TSS
12775 -/*
12776 - * Copies of the original ist values from the tss are only accessed during
12777 - * debugging, no special alignment required.
12778 - */
12779 -DEFINE_PER_CPU(struct orig_ist, orig_ist);
12780 -#endif
12781 -
12782 -/*
12783 - * cpu_init() initializes state that is per-CPU. Some data is already
12784 - * initialized (naturally) in the bootstrap process, such as the GDT
12785 - * and IDT. We reload them nevertheless, this function acts as a
12786 - * 'CPU state barrier', nothing should get across.
12787 - * A lot of state is already set up in PDA init.
12788 - */
12789 -void __cpuinit cpu_init (void)
12790 -{
12791 - int cpu = stack_smp_processor_id();
12792 -#ifndef CONFIG_X86_NO_TSS
12793 - struct tss_struct *t = &per_cpu(init_tss, cpu);
12794 - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
12795 - unsigned long v;
12796 - char *estacks = NULL;
12797 - unsigned i;
12798 -#endif
12799 - struct task_struct *me;
12800 -
12801 - /* CPU 0 is initialised in head64.c */
12802 - if (cpu != 0) {
12803 - pda_init(cpu);
12804 - }
12805 -#ifndef CONFIG_X86_NO_TSS
12806 - else
12807 - estacks = boot_exception_stacks;
12808 -#endif
12809 -
12810 - me = current;
12811 -
12812 - if (cpu_test_and_set(cpu, cpu_initialized))
12813 - panic("CPU#%d already initialized!\n", cpu);
12814 -
12815 - printk("Initializing CPU#%d\n", cpu);
12816 -
12817 - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
12818 -
12819 - /*
12820 - * Initialize the per-CPU GDT with the boot GDT,
12821 - * and set up the GDT descriptor:
12822 - */
12823 -#ifndef CONFIG_XEN
12824 - if (cpu)
12825 - memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
12826 -#endif
12827 -
12828 - cpu_gdt_descr[cpu].size = GDT_SIZE;
12829 - cpu_gdt_init(&cpu_gdt_descr[cpu]);
12830 -
12831 - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
12832 - syscall_init();
12833 -
12834 - wrmsrl(MSR_FS_BASE, 0);
12835 - wrmsrl(MSR_KERNEL_GS_BASE, 0);
12836 - barrier();
12837 -
12838 - check_efer();
12839 -
12840 -#ifndef CONFIG_X86_NO_TSS
12841 - /*
12842 - * set up and load the per-CPU TSS
12843 - */
12844 - for (v = 0; v < N_EXCEPTION_STACKS; v++) {
12845 - static const unsigned int order[N_EXCEPTION_STACKS] = {
12846 - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
12847 - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
12848 - };
12849 - if (cpu) {
12850 - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
12851 - if (!estacks)
12852 - panic("Cannot allocate exception stack %ld %d\n",
12853 - v, cpu);
12854 - }
12855 - estacks += PAGE_SIZE << order[v];
12856 - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
12857 - }
12858 -
12859 - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
12860 - /*
12861 - * <= is required because the CPU will access up to
12862 - * 8 bits beyond the end of the IO permission bitmap.
12863 - */
12864 - for (i = 0; i <= IO_BITMAP_LONGS; i++)
12865 - t->io_bitmap[i] = ~0UL;
12866 -#endif
12867 -
12868 - atomic_inc(&init_mm.mm_count);
12869 - me->active_mm = &init_mm;
12870 - if (me->mm)
12871 - BUG();
12872 - enter_lazy_tlb(&init_mm, me);
12873 -
12874 -#ifndef CONFIG_X86_NO_TSS
12875 - set_tss_desc(cpu, t);
12876 -#endif
12877 -#ifndef CONFIG_XEN
12878 - load_TR_desc();
12879 -#endif
12880 - load_LDT(&init_mm.context);
12881 -
12882 -#ifdef CONFIG_KGDB
12883 - /*
12884 - * If the kgdb is connected no debug regs should be altered. This
12885 - * is only applicable when KGDB and a KGDB I/O module are built
12886 - * into the kernel and you are using early debugging with
12887 - * kgdbwait. KGDB will control the kernel HW breakpoint registers.
12888 - */
12889 - if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
12890 - arch_kgdb_ops.correct_hw_break();
12891 - else {
12892 -#endif
12893 - /*
12894 - * Clear all 6 debug registers:
12895 - */
12896 -
12897 - set_debugreg(0UL, 0);
12898 - set_debugreg(0UL, 1);
12899 - set_debugreg(0UL, 2);
12900 - set_debugreg(0UL, 3);
12901 - set_debugreg(0UL, 6);
12902 - set_debugreg(0UL, 7);
12903 -#ifdef CONFIG_KGDB
12904 - /* If the kgdb is connected no debug regs should be altered. */
12905 - }
12906 -#endif
12907 -
12908 - fpu_init();
12909 -
12910 - asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
12911 - if (raw_irqs_disabled())
12912 - kernel_eflags &= ~X86_EFLAGS_IF;
12913 -
12914 - if (is_uv_system())
12915 - uv_cpu_init();
12916 -}
12917 Index: head-2008-12-01/arch/x86/kernel/setup_32-xen.c
12918 ===================================================================
12919 --- head-2008-12-01.orig/arch/x86/kernel/setup_32-xen.c 2008-12-01 11:44:55.000000000 +0100
12920 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12921 @@ -1,1151 +0,0 @@
12922 -/*
12923 - * Copyright (C) 1995 Linus Torvalds
12924 - *
12925 - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
12926 - *
12927 - * Memory region support
12928 - * David Parsons <orc@pell.chi.il.us>, July-August 1999
12929 - *
12930 - * Added E820 sanitization routine (removes overlapping memory regions);
12931 - * Brian Moyle <bmoyle@mvista.com>, February 2001
12932 - *
12933 - * Moved CPU detection code to cpu/${cpu}.c
12934 - * Patrick Mochel <mochel@osdl.org>, March 2002
12935 - *
12936 - * Provisions for empty E820 memory regions (reported by certain BIOSes).
12937 - * Alex Achenbach <xela@slit.de>, December 2002.
12938 - *
12939 - */
12940 -
12941 -/*
12942 - * This file handles the architecture-dependent parts of initialization
12943 - */
12944 -
12945 -#include <linux/sched.h>
12946 -#include <linux/mm.h>
12947 -#include <linux/mmzone.h>
12948 -#include <linux/screen_info.h>
12949 -#include <linux/ioport.h>
12950 -#include <linux/acpi.h>
12951 -#include <linux/apm_bios.h>
12952 -#include <linux/initrd.h>
12953 -#include <linux/bootmem.h>
12954 -#include <linux/seq_file.h>
12955 -#include <linux/console.h>
12956 -#include <linux/mca.h>
12957 -#include <linux/root_dev.h>
12958 -#include <linux/highmem.h>
12959 -#include <linux/module.h>
12960 -#include <linux/efi.h>
12961 -#include <linux/init.h>
12962 -#include <linux/edd.h>
12963 -#include <linux/iscsi_ibft.h>
12964 -#include <linux/nodemask.h>
12965 -#include <linux/kernel.h>
12966 -#include <linux/percpu.h>
12967 -#include <linux/notifier.h>
12968 -#include <linux/kexec.h>
12969 -#include <linux/crash_dump.h>
12970 -#include <linux/dmi.h>
12971 -#include <linux/pfn.h>
12972 -#include <linux/pci.h>
12973 -#include <linux/init_ohci1394_dma.h>
12974 -#include <linux/kvm_para.h>
12975 -
12976 -#include <video/edid.h>
12977 -
12978 -#include <asm/mtrr.h>
12979 -#include <asm/apic.h>
12980 -#include <asm/e820.h>
12981 -#include <asm/mpspec.h>
12982 -#include <asm/mmzone.h>
12983 -#include <asm/setup.h>
12984 -#include <asm/arch_hooks.h>
12985 -#include <asm/sections.h>
12986 -#include <asm/io_apic.h>
12987 -#include <asm/ist.h>
12988 -#include <asm/io.h>
12989 -#include <asm/hypervisor.h>
12990 -#include <xen/interface/physdev.h>
12991 -#include <xen/interface/memory.h>
12992 -#include <xen/features.h>
12993 -#include <xen/firmware.h>
12994 -#include <xen/xencons.h>
12995 -#include <setup_arch.h>
12996 -#include <asm/bios_ebda.h>
12997 -#include <asm/cacheflush.h>
12998 -#include <asm/processor.h>
12999 -
13000 -#ifdef CONFIG_XEN
13001 -#include <xen/interface/kexec.h>
13002 -#endif
13003 -
13004 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
13005 -static struct notifier_block xen_panic_block = {
13006 - xen_panic_event, NULL, 0 /* try to go last */
13007 -};
13008 -
13009 -/*
13010 - * Machine setup..
13011 - */
13012 -static struct resource data_resource = {
13013 - .name = "Kernel data",
13014 - .start = 0,
13015 - .end = 0,
13016 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13017 -};
13018 -
13019 -static struct resource code_resource = {
13020 - .name = "Kernel code",
13021 - .start = 0,
13022 - .end = 0,
13023 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13024 -};
13025 -
13026 -static struct resource bss_resource = {
13027 - .name = "Kernel bss",
13028 - .start = 0,
13029 - .end = 0,
13030 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13031 -};
13032 -
13033 -static struct resource video_ram_resource = {
13034 - .name = "Video RAM area",
13035 - .start = 0xa0000,
13036 - .end = 0xbffff,
13037 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13038 -};
13039 -
13040 -static struct resource standard_io_resources[] = { {
13041 - .name = "dma1",
13042 - .start = 0x0000,
13043 - .end = 0x001f,
13044 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13045 -}, {
13046 - .name = "pic1",
13047 - .start = 0x0020,
13048 - .end = 0x0021,
13049 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13050 -}, {
13051 - .name = "timer0",
13052 - .start = 0x0040,
13053 - .end = 0x0043,
13054 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13055 -}, {
13056 - .name = "timer1",
13057 - .start = 0x0050,
13058 - .end = 0x0053,
13059 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13060 -}, {
13061 - .name = "keyboard",
13062 - .start = 0x0060,
13063 - .end = 0x0060,
13064 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13065 -}, {
13066 - .name = "keyboard",
13067 - .start = 0x0064,
13068 - .end = 0x0064,
13069 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13070 -}, {
13071 - .name = "dma page reg",
13072 - .start = 0x0080,
13073 - .end = 0x008f,
13074 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13075 -}, {
13076 - .name = "pic2",
13077 - .start = 0x00a0,
13078 - .end = 0x00a1,
13079 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13080 -}, {
13081 - .name = "dma2",
13082 - .start = 0x00c0,
13083 - .end = 0x00df,
13084 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13085 -}, {
13086 - .name = "fpu",
13087 - .start = 0x00f0,
13088 - .end = 0x00ff,
13089 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13090 -} };
13091 -
13092 -/* cpu data as detected by the assembly code in head.S */
13093 -struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13094 -/* common cpu data for all cpus */
13095 -struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13096 -EXPORT_SYMBOL(boot_cpu_data);
13097 -
13098 -unsigned int def_to_bigsmp;
13099 -
13100 -#ifndef CONFIG_X86_PAE
13101 -unsigned long mmu_cr4_features;
13102 -#else
13103 -unsigned long mmu_cr4_features = X86_CR4_PAE;
13104 -#endif
13105 -
13106 -/* for MCA, but anyone else can use it if they want */
13107 -unsigned int machine_id;
13108 -unsigned int machine_submodel_id;
13109 -unsigned int BIOS_revision;
13110 -
13111 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
13112 -int bootloader_type;
13113 -
13114 -/* user-defined highmem size */
13115 -static unsigned int highmem_pages = -1;
13116 -
13117 -/*
13118 - * Setup options
13119 - */
13120 -struct screen_info screen_info;
13121 -EXPORT_SYMBOL(screen_info);
13122 -struct apm_info apm_info;
13123 -EXPORT_SYMBOL(apm_info);
13124 -struct edid_info edid_info;
13125 -EXPORT_SYMBOL_GPL(edid_info);
13126 -#ifndef CONFIG_XEN
13127 -#define copy_edid() (edid_info = boot_params.edid_info)
13128 -#endif
13129 -struct ist_info ist_info;
13130 -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
13131 - defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
13132 -EXPORT_SYMBOL(ist_info);
13133 -#endif
13134 -
13135 -extern void early_cpu_init(void);
13136 -extern int root_mountflags;
13137 -
13138 -unsigned long saved_video_mode;
13139 -
13140 -#define RAMDISK_IMAGE_START_MASK 0x07FF
13141 -#define RAMDISK_PROMPT_FLAG 0x8000
13142 -#define RAMDISK_LOAD_FLAG 0x4000
13143 -
13144 -static char __initdata command_line[COMMAND_LINE_SIZE];
13145 -
13146 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
13147 -struct boot_params __initdata boot_params;
13148 -#else
13149 -struct boot_params boot_params;
13150 -#endif
13151 -
13152 -/*
13153 - * Point at the empty zero page to start with. We map the real shared_info
13154 - * page as soon as fixmap is up and running.
13155 - */
13156 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
13157 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
13158 -
13159 -unsigned long *phys_to_machine_mapping;
13160 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
13161 -EXPORT_SYMBOL(phys_to_machine_mapping);
13162 -
13163 -/* Raw start-of-day parameters from the hypervisor. */
13164 -start_info_t *xen_start_info;
13165 -EXPORT_SYMBOL(xen_start_info);
13166 -
13167 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
13168 -struct edd edd;
13169 -#ifdef CONFIG_EDD_MODULE
13170 -EXPORT_SYMBOL(edd);
13171 -#endif
13172 -#ifndef CONFIG_XEN
13173 -/**
13174 - * copy_edd() - Copy the BIOS EDD information
13175 - * from boot_params into a safe place.
13176 - *
13177 - */
13178 -static inline void copy_edd(void)
13179 -{
13180 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
13181 - sizeof(edd.mbr_signature));
13182 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
13183 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
13184 - edd.edd_info_nr = boot_params.eddbuf_entries;
13185 -}
13186 -#endif
13187 -#else
13188 -static inline void copy_edd(void)
13189 -{
13190 -}
13191 -#endif
13192 -
13193 -int __initdata user_defined_memmap;
13194 -
13195 -/*
13196 - * "mem=nopentium" disables the 4MB page tables.
13197 - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
13198 - * to <mem>, overriding the bios size.
13199 - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
13200 - * <start> to <start>+<mem>, overriding the bios size.
13201 - *
13202 - * HPA tells me bootloaders need to parse mem=, so no new
13203 - * option should be mem= [also see Documentation/i386/boot.txt]
13204 - */
13205 -static int __init parse_mem(char *arg)
13206 -{
13207 - if (!arg)
13208 - return -EINVAL;
13209 -
13210 - if (strcmp(arg, "nopentium") == 0) {
13211 - setup_clear_cpu_cap(X86_FEATURE_PSE);
13212 - } else {
13213 - /* If the user specifies memory size, we
13214 - * limit the BIOS-provided memory map to
13215 - * that size. exactmap can be used to specify
13216 - * the exact map. mem=number can be used to
13217 - * trim the existing memory map.
13218 - */
13219 - unsigned long long mem_size;
13220 -
13221 - mem_size = memparse(arg, &arg);
13222 - limit_regions(mem_size);
13223 - user_defined_memmap = 1;
13224 - }
13225 - return 0;
13226 -}
13227 -early_param("mem", parse_mem);
13228 -
13229 -#ifdef CONFIG_PROC_VMCORE
13230 -/* elfcorehdr= specifies the location of elf core header
13231 - * stored by the crashed kernel.
13232 - */
13233 -static int __init parse_elfcorehdr(char *arg)
13234 -{
13235 - if (!arg)
13236 - return -EINVAL;
13237 -
13238 - elfcorehdr_addr = memparse(arg, &arg);
13239 - return 0;
13240 -}
13241 -early_param("elfcorehdr", parse_elfcorehdr);
13242 -#endif /* CONFIG_PROC_VMCORE */
13243 -
13244 -/*
13245 - * highmem=size forces highmem to be exactly 'size' bytes.
13246 - * This works even on boxes that have no highmem otherwise.
13247 - * This also works to reduce highmem size on bigger boxes.
13248 - */
13249 -static int __init parse_highmem(char *arg)
13250 -{
13251 - if (!arg)
13252 - return -EINVAL;
13253 -
13254 - highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
13255 - return 0;
13256 -}
13257 -early_param("highmem", parse_highmem);
13258 -
13259 -/*
13260 - * vmalloc=size forces the vmalloc area to be exactly 'size'
13261 - * bytes. This can be used to increase (or decrease) the
13262 - * vmalloc area - the default is 128m.
13263 - */
13264 -static int __init parse_vmalloc(char *arg)
13265 -{
13266 - if (!arg)
13267 - return -EINVAL;
13268 -
13269 - __VMALLOC_RESERVE = memparse(arg, &arg);
13270 - return 0;
13271 -}
13272 -early_param("vmalloc", parse_vmalloc);
13273 -
13274 -#ifndef CONFIG_XEN
13275 -/*
13276 - * reservetop=size reserves a hole at the top of the kernel address space which
13277 - * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
13278 - * so relocating the fixmap can be done before paging initialization.
13279 - */
13280 -static int __init parse_reservetop(char *arg)
13281 -{
13282 - unsigned long address;
13283 -
13284 - if (!arg)
13285 - return -EINVAL;
13286 -
13287 - address = memparse(arg, &arg);
13288 - reserve_top_address(address);
13289 - return 0;
13290 -}
13291 -early_param("reservetop", parse_reservetop);
13292 -#endif
13293 -
13294 -/*
13295 - * Determine low and high memory ranges:
13296 - */
13297 -unsigned long __init find_max_low_pfn(void)
13298 -{
13299 - unsigned long max_low_pfn;
13300 -
13301 - max_low_pfn = max_pfn;
13302 - if (max_low_pfn > MAXMEM_PFN) {
13303 - if (highmem_pages == -1)
13304 - highmem_pages = max_pfn - MAXMEM_PFN;
13305 - if (highmem_pages + MAXMEM_PFN < max_pfn)
13306 - max_pfn = MAXMEM_PFN + highmem_pages;
13307 - if (highmem_pages + MAXMEM_PFN > max_pfn) {
13308 - printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
13309 - highmem_pages = 0;
13310 - }
13311 - max_low_pfn = MAXMEM_PFN;
13312 -#ifndef CONFIG_HIGHMEM
13313 - /* Maximum memory usable is what is directly addressable */
13314 - printk(KERN_WARNING "Warning only %ldMB will be used.\n",
13315 - MAXMEM>>20);
13316 - if (max_pfn > MAX_NONPAE_PFN)
13317 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13318 - else
13319 - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
13320 - max_pfn = MAXMEM_PFN;
13321 -#else /* !CONFIG_HIGHMEM */
13322 -#ifndef CONFIG_HIGHMEM64G
13323 - if (max_pfn > MAX_NONPAE_PFN) {
13324 - max_pfn = MAX_NONPAE_PFN;
13325 - printk(KERN_WARNING "Warning only 4GB will be used.\n");
13326 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13327 - }
13328 -#endif /* !CONFIG_HIGHMEM64G */
13329 -#endif /* !CONFIG_HIGHMEM */
13330 - } else {
13331 - if (highmem_pages == -1)
13332 - highmem_pages = 0;
13333 -#ifdef CONFIG_HIGHMEM
13334 - if (highmem_pages >= max_pfn) {
13335 - printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
13336 - highmem_pages = 0;
13337 - }
13338 - if (highmem_pages) {
13339 - if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
13340 - printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
13341 - highmem_pages = 0;
13342 - }
13343 - max_low_pfn -= highmem_pages;
13344 - }
13345 -#else
13346 - if (highmem_pages)
13347 - printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
13348 -#endif
13349 - }
13350 - return max_low_pfn;
13351 -}
13352 -
13353 -#ifndef CONFIG_XEN
13354 -#define BIOS_LOWMEM_KILOBYTES 0x413
13355 -
13356 -/*
13357 - * The BIOS places the EBDA/XBDA at the top of conventional
13358 - * memory, and usually decreases the reported amount of
13359 - * conventional memory (int 0x12) too. This also contains a
13360 - * workaround for Dell systems that neglect to reserve EBDA.
13361 - * The same workaround also avoids a problem with the AMD768MPX
13362 - * chipset: reserve a page before VGA to prevent PCI prefetch
13363 - * into it (errata #56). Usually the page is reserved anyways,
13364 - * unless you have no PS/2 mouse plugged in.
13365 - */
13366 -static void __init reserve_ebda_region(void)
13367 -{
13368 - unsigned int lowmem, ebda_addr;
13369 -
13370 - /* To determine the position of the EBDA and the */
13371 - /* end of conventional memory, we need to look at */
13372 - /* the BIOS data area. In a paravirtual environment */
13373 - /* that area is absent. We'll just have to assume */
13374 - /* that the paravirt case can handle memory setup */
13375 - /* correctly, without our help. */
13376 - if (paravirt_enabled())
13377 - return;
13378 -
13379 - /* end of low (conventional) memory */
13380 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
13381 - lowmem <<= 10;
13382 -
13383 - /* start of EBDA area */
13384 - ebda_addr = get_bios_ebda();
13385 -
13386 - /* Fixup: bios puts an EBDA in the top 64K segment */
13387 - /* of conventional memory, but does not adjust lowmem. */
13388 - if ((lowmem - ebda_addr) <= 0x10000)
13389 - lowmem = ebda_addr;
13390 -
13391 - /* Fixup: bios does not report an EBDA at all. */
13392 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
13393 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
13394 - lowmem = 0x9f000;
13395 -
13396 - /* Paranoia: should never happen, but... */
13397 - if ((lowmem == 0) || (lowmem >= 0x100000))
13398 - lowmem = 0x9f000;
13399 -
13400 - /* reserve all memory between lowmem and the 1MB mark */
13401 - reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
13402 -}
13403 -#endif
13404 -
13405 -#ifndef CONFIG_NEED_MULTIPLE_NODES
13406 -static void __init setup_bootmem_allocator(void);
13407 -static unsigned long __init setup_memory(void)
13408 -{
13409 - /*
13410 - * partially used pages are not usable - thus
13411 - * we are rounding upwards:
13412 - */
13413 - min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
13414 - xen_start_info->nr_pt_frames;
13415 -
13416 - max_low_pfn = find_max_low_pfn();
13417 -
13418 -#ifdef CONFIG_HIGHMEM
13419 - highstart_pfn = highend_pfn = max_pfn;
13420 - if (max_pfn > max_low_pfn) {
13421 - highstart_pfn = max_low_pfn;
13422 - }
13423 - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
13424 - pages_to_mb(highend_pfn - highstart_pfn));
13425 - num_physpages = highend_pfn;
13426 - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
13427 -#else
13428 - num_physpages = max_low_pfn;
13429 - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
13430 -#endif
13431 -#ifdef CONFIG_FLATMEM
13432 - max_mapnr = num_physpages;
13433 -#endif
13434 - printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
13435 - pages_to_mb(max_low_pfn));
13436 -
13437 - setup_bootmem_allocator();
13438 -
13439 - return max_low_pfn;
13440 -}
13441 -
13442 -static void __init zone_sizes_init(void)
13443 -{
13444 - unsigned long max_zone_pfns[MAX_NR_ZONES];
13445 - memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
13446 - max_zone_pfns[ZONE_DMA] =
13447 - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
13448 - max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
13449 -#ifdef CONFIG_HIGHMEM
13450 - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
13451 - add_active_range(0, 0, highend_pfn);
13452 -#else
13453 - add_active_range(0, 0, max_low_pfn);
13454 -#endif
13455 -
13456 - free_area_init_nodes(max_zone_pfns);
13457 -}
13458 -#else
13459 -extern unsigned long __init setup_memory(void);
13460 -extern void zone_sizes_init(void);
13461 -#endif /* !CONFIG_NEED_MULTIPLE_NODES */
13462 -
13463 -static inline unsigned long long get_total_mem(void)
13464 -{
13465 - unsigned long long total;
13466 -
13467 - total = max_low_pfn - min_low_pfn;
13468 -#ifdef CONFIG_HIGHMEM
13469 - total += highend_pfn - highstart_pfn;
13470 -#endif
13471 -
13472 - return total << PAGE_SHIFT;
13473 -}
13474 -
13475 -#ifdef CONFIG_KEXEC
13476 -#ifndef CONFIG_XEN
13477 -static void __init reserve_crashkernel(void)
13478 -{
13479 - unsigned long long total_mem;
13480 - unsigned long long crash_size, crash_base;
13481 - int ret;
13482 -
13483 - total_mem = get_total_mem();
13484 -
13485 - ret = parse_crashkernel(boot_command_line, total_mem,
13486 - &crash_size, &crash_base);
13487 - if (ret == 0 && crash_size > 0) {
13488 - if (crash_base > 0) {
13489 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
13490 - "for crashkernel (System RAM: %ldMB)\n",
13491 - (unsigned long)(crash_size >> 20),
13492 - (unsigned long)(crash_base >> 20),
13493 - (unsigned long)(total_mem >> 20));
13494 -
13495 - if (reserve_bootmem(crash_base, crash_size,
13496 - BOOTMEM_EXCLUSIVE) < 0) {
13497 - printk(KERN_INFO "crashkernel reservation "
13498 - "failed - memory is in use\n");
13499 - return;
13500 - }
13501 -
13502 - crashk_res.start = crash_base;
13503 - crashk_res.end = crash_base + crash_size - 1;
13504 - } else
13505 - printk(KERN_INFO "crashkernel reservation failed - "
13506 - "you have to specify a base address\n");
13507 - }
13508 -}
13509 -#else
13510 -#define reserve_crashkernel xen_machine_kexec_setup_resources
13511 -#endif
13512 -#else
13513 -static inline void __init reserve_crashkernel(void)
13514 -{}
13515 -#endif
13516 -
13517 -#ifdef CONFIG_BLK_DEV_INITRD
13518 -
13519 -static bool do_relocate_initrd = false;
13520 -
13521 -static void __init reserve_initrd(void)
13522 -{
13523 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
13524 - unsigned long ramdisk_size = xen_start_info->mod_len;
13525 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
13526 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13527 - unsigned long ramdisk_here;
13528 -
13529 - initrd_start = 0;
13530 -
13531 - if (!xen_start_info->mod_start || !ramdisk_size)
13532 - return; /* No initrd provided by bootloader */
13533 -
13534 - if (ramdisk_end < ramdisk_image) {
13535 - printk(KERN_ERR "initrd wraps around end of memory, "
13536 - "disabling initrd\n");
13537 - return;
13538 - }
13539 - if (ramdisk_size >= end_of_lowmem/2) {
13540 - printk(KERN_ERR "initrd too large to handle, "
13541 - "disabling initrd\n");
13542 - return;
13543 - }
13544 - if (ramdisk_end <= end_of_lowmem) {
13545 - /* All in lowmem, easy case */
13546 - reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
13547 - initrd_start = ramdisk_image + PAGE_OFFSET;
13548 - initrd_end = initrd_start+ramdisk_size;
13549 - return;
13550 - }
13551 -
13552 - /* We need to move the initrd down into lowmem */
13553 - ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
13554 -
13555 - /* Note: this includes all the lowmem currently occupied by
13556 - the initrd, we rely on that fact to keep the data intact. */
13557 - reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
13558 - initrd_start = ramdisk_here + PAGE_OFFSET;
13559 - initrd_end = initrd_start + ramdisk_size;
13560 -
13561 - do_relocate_initrd = true;
13562 -}
13563 -
13564 -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
13565 -
13566 -static void __init relocate_initrd(void)
13567 -{
13568 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
13569 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
13570 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13571 - unsigned long ramdisk_here;
13572 - unsigned long slop, clen, mapaddr;
13573 - char *p, *q;
13574 -
13575 - if (!do_relocate_initrd)
13576 - return;
13577 -
13578 - ramdisk_here = initrd_start - PAGE_OFFSET;
13579 -
13580 - q = (char *)initrd_start;
13581 -
13582 - /* Copy any lowmem portion of the initrd */
13583 - if (ramdisk_image < end_of_lowmem) {
13584 - clen = end_of_lowmem - ramdisk_image;
13585 - p = (char *)__va(ramdisk_image);
13586 - memcpy(q, p, clen);
13587 - q += clen;
13588 - ramdisk_image += clen;
13589 - ramdisk_size -= clen;
13590 - }
13591 -
13592 - /* Copy the highmem portion of the initrd */
13593 - while (ramdisk_size) {
13594 - slop = ramdisk_image & ~PAGE_MASK;
13595 - clen = ramdisk_size;
13596 - if (clen > MAX_MAP_CHUNK-slop)
13597 - clen = MAX_MAP_CHUNK-slop;
13598 - mapaddr = ramdisk_image & PAGE_MASK;
13599 - p = early_ioremap(mapaddr, clen+slop);
13600 - memcpy(q, p+slop, clen);
13601 - early_iounmap(p, clen+slop);
13602 - q += clen;
13603 - ramdisk_image += clen;
13604 - ramdisk_size -= clen;
13605 - }
13606 -}
13607 -
13608 -#endif /* CONFIG_BLK_DEV_INITRD */
13609 -
13610 -void __init setup_bootmem_allocator(void)
13611 -{
13612 - unsigned long bootmap_size;
13613 - /*
13614 - * Initialize the boot-time allocator (with low memory only):
13615 - */
13616 - bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
13617 -
13618 - register_bootmem_low_pages(max_low_pfn);
13619 -
13620 - /*
13621 - * Reserve the bootmem bitmap itself as well. We do this in two
13622 - * steps (first step was init_bootmem()) because this catches
13623 - * the (very unlikely) case of us accidentally initializing the
13624 - * bootmem allocator with an invalid RAM area.
13625 - */
13626 - reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
13627 - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
13628 - BOOTMEM_DEFAULT);
13629 -
13630 -#ifndef CONFIG_XEN
13631 - /*
13632 - * reserve physical page 0 - it's a special BIOS page on many boxes,
13633 - * enabling clean reboots, SMP operation, laptop functions.
13634 - */
13635 - reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
13636 -
13637 - /* reserve EBDA region */
13638 - reserve_ebda_region();
13639 -
13640 -#ifdef CONFIG_SMP
13641 - /*
13642 - * But first pinch a few for the stack/trampoline stuff
13643 - * FIXME: Don't need the extra page at 4K, but need to fix
13644 - * trampoline before removing it. (see the GDT stuff)
13645 - */
13646 - reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
13647 -#endif
13648 -#ifdef CONFIG_ACPI_SLEEP
13649 - /*
13650 - * Reserve low memory region for sleep support.
13651 - */
13652 - acpi_reserve_bootmem();
13653 -#endif
13654 -#endif /* !CONFIG_XEN */
13655 -
13656 -#ifdef CONFIG_BLK_DEV_INITRD
13657 - reserve_initrd();
13658 -#endif
13659 - numa_kva_reserve();
13660 - reserve_crashkernel();
13661 -
13662 - reserve_ibft_region();
13663 -}
13664 -
13665 -/*
13666 - * The node 0 pgdat is initialized before all of these because
13667 - * it's needed for bootmem. node>0 pgdats have their virtual
13668 - * space allocated before the pagetables are in place to access
13669 - * them, so they can't be cleared then.
13670 - *
13671 - * This should all compile down to nothing when NUMA is off.
13672 - */
13673 -static void __init remapped_pgdat_init(void)
13674 -{
13675 - int nid;
13676 -
13677 - for_each_online_node(nid) {
13678 - if (nid != 0)
13679 - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
13680 - }
13681 -}
13682 -
13683 -#ifdef CONFIG_MCA
13684 -static void set_mca_bus(int x)
13685 -{
13686 - MCA_bus = x;
13687 -}
13688 -#else
13689 -static void set_mca_bus(int x) { }
13690 -#endif
13691 -
13692 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
13693 -char * __init __attribute__((weak)) memory_setup(void)
13694 -{
13695 - return machine_specific_memory_setup();
13696 -}
13697 -
13698 -#ifdef CONFIG_NUMA
13699 -/*
13700 - * In the golden day, when everything among i386 and x86_64 will be
13701 - * integrated, this will not live here
13702 - */
13703 -void *x86_cpu_to_node_map_early_ptr;
13704 -int x86_cpu_to_node_map_init[NR_CPUS] = {
13705 - [0 ... NR_CPUS-1] = NUMA_NO_NODE
13706 -};
13707 -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
13708 -#endif
13709 -
13710 -/*
13711 - * Determine if we were loaded by an EFI loader. If so, then we have also been
13712 - * passed the efi memmap, systab, etc., so we should use these data structures
13713 - * for initialization. Note, the efi init code path is determined by the
13714 - * global efi_enabled. This allows the same kernel image to be used on existing
13715 - * systems (with a traditional BIOS) as well as on EFI systems.
13716 - */
13717 -void __init setup_arch(char **cmdline_p)
13718 -{
13719 - int i, j, k, fpp;
13720 - struct physdev_set_iopl set_iopl;
13721 - unsigned long max_low_pfn;
13722 - unsigned long p2m_pages;
13723 -
13724 - /* Force a quick death if the kernel panics (not domain 0). */
13725 - extern int panic_timeout;
13726 - if (!panic_timeout && !is_initial_xendomain())
13727 - panic_timeout = 1;
13728 -
13729 - /* Register a call for panic conditions. */
13730 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
13731 -
13732 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13733 - VMASST_TYPE_4gb_segments));
13734 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13735 - VMASST_TYPE_writable_pagetables));
13736 -
13737 - memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
13738 - pre_setup_arch_hook();
13739 - early_cpu_init();
13740 - early_ioremap_init();
13741 -#ifdef CONFIG_SMP
13742 - prefill_possible_map();
13743 -#endif
13744 -
13745 -#ifdef CONFIG_EFI
13746 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
13747 - "EL32", 4))
13748 - efi_enabled = 1;
13749 -#endif
13750 -
13751 - /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
13752 - properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
13753 - */
13754 - ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
13755 - screen_info = boot_params.screen_info;
13756 - copy_edid();
13757 - apm_info.bios = boot_params.apm_bios_info;
13758 - ist_info = boot_params.ist_info;
13759 - saved_video_mode = boot_params.hdr.vid_mode;
13760 - if( boot_params.sys_desc_table.length != 0 ) {
13761 - set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
13762 - machine_id = boot_params.sys_desc_table.table[0];
13763 - machine_submodel_id = boot_params.sys_desc_table.table[1];
13764 - BIOS_revision = boot_params.sys_desc_table.table[2];
13765 - }
13766 - bootloader_type = boot_params.hdr.type_of_loader;
13767 -
13768 - if (is_initial_xendomain()) {
13769 - const struct dom0_vga_console_info *info =
13770 - (void *)((char *)xen_start_info +
13771 - xen_start_info->console.dom0.info_off);
13772 -
13773 - dom0_init_screen_info(info,
13774 - xen_start_info->console.dom0.info_size);
13775 - xen_start_info->console.domU.mfn = 0;
13776 - xen_start_info->console.domU.evtchn = 0;
13777 - } else
13778 - screen_info.orig_video_isVGA = 0;
13779 -
13780 -#ifdef CONFIG_BLK_DEV_RAM
13781 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
13782 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
13783 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
13784 -#endif
13785 -
13786 - ARCH_SETUP
13787 -
13788 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
13789 - print_memory_map(memory_setup());
13790 -
13791 - copy_edd();
13792 -
13793 - if (!boot_params.hdr.root_flags)
13794 - root_mountflags &= ~MS_RDONLY;
13795 - init_mm.start_code = (unsigned long) _text;
13796 - init_mm.end_code = (unsigned long) _etext;
13797 - init_mm.end_data = (unsigned long) _edata;
13798 - init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
13799 - xen_start_info->nr_pt_frames) << PAGE_SHIFT;
13800 -
13801 - code_resource.start = virt_to_phys(_text);
13802 - code_resource.end = virt_to_phys(_etext)-1;
13803 - data_resource.start = virt_to_phys(_etext);
13804 - data_resource.end = virt_to_phys(_edata)-1;
13805 - bss_resource.start = virt_to_phys(&__bss_start);
13806 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
13807 -
13808 - if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
13809 - i = COMMAND_LINE_SIZE;
13810 - memcpy(boot_command_line, xen_start_info->cmd_line, i);
13811 - boot_command_line[i - 1] = '\0';
13812 - parse_early_param();
13813 -
13814 - if (user_defined_memmap) {
13815 - printk(KERN_INFO "user-defined physical RAM map:\n");
13816 - print_memory_map("user");
13817 - }
13818 -
13819 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
13820 - *cmdline_p = command_line;
13821 -
13822 - if (efi_enabled)
13823 - efi_init();
13824 -
13825 - /* update e820 for memory not covered by WB MTRRs */
13826 - propagate_e820_map();
13827 - mtrr_bp_init();
13828 -#ifndef CONFIG_XEN
13829 - if (mtrr_trim_uncached_memory(max_pfn))
13830 - propagate_e820_map();
13831 -#endif
13832 -
13833 - max_low_pfn = setup_memory();
13834 -
13835 -#ifdef CONFIG_KVM_CLOCK
13836 - kvmclock_init();
13837 -#endif
13838 -
13839 -#ifdef CONFIG_VMI
13840 - /*
13841 - * Must be after max_low_pfn is determined, and before kernel
13842 - * pagetables are setup.
13843 - */
13844 - vmi_init();
13845 -#endif
13846 - kvm_guest_init();
13847 -
13848 - /*
13849 - * NOTE: before this point _nobody_ is allowed to allocate
13850 - * any memory using the bootmem allocator. Although the
13851 - * allocator is now initialised only the first 8Mb of the kernel
13852 - * virtual address space has been mapped. All allocations before
13853 - * paging_init() has completed must use the alloc_bootmem_low_pages()
13854 - * variant (which allocates DMA'able memory) and care must be taken
13855 - * not to exceed the 8Mb limit.
13856 - */
13857 -
13858 -#ifdef CONFIG_SMP
13859 - smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
13860 -#endif
13861 - paging_init();
13862 -
13863 - /*
13864 - * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
13865 - */
13866 -
13867 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
13868 - if (init_ohci1394_dma_early)
13869 - init_ohci1394_dma_on_all_controllers();
13870 -#endif
13871 -
13872 - remapped_pgdat_init();
13873 - sparse_init();
13874 - zone_sizes_init();
13875 -
13876 -#ifdef CONFIG_X86_FIND_SMP_CONFIG
13877 - /*
13878 - * Find and reserve possible boot-time SMP configuration:
13879 - */
13880 - find_smp_config();
13881 -#endif
13882 -
13883 - p2m_pages = max_pfn;
13884 - if (xen_start_info->nr_pages > max_pfn) {
13885 - /*
13886 - * the max_pfn was shrunk (probably by mem= or highmem=
13887 - * kernel parameter); shrink reservation with the HV
13888 - */
13889 - struct xen_memory_reservation reservation = {
13890 - .address_bits = 0,
13891 - .extent_order = 0,
13892 - .domid = DOMID_SELF
13893 - };
13894 - unsigned int difference;
13895 - int ret;
13896 -
13897 - difference = xen_start_info->nr_pages - max_pfn;
13898 -
13899 - set_xen_guest_handle(reservation.extent_start,
13900 - ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
13901 - reservation.nr_extents = difference;
13902 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
13903 - &reservation);
13904 - BUG_ON (ret != difference);
13905 - }
13906 - else if (max_pfn > xen_start_info->nr_pages)
13907 - p2m_pages = xen_start_info->nr_pages;
13908 -
13909 - /* Make sure we have a correctly sized P->M table. */
13910 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
13911 - phys_to_machine_mapping = alloc_bootmem_low_pages(
13912 - max_pfn * sizeof(unsigned long));
13913 - memset(phys_to_machine_mapping, ~0,
13914 - max_pfn * sizeof(unsigned long));
13915 - memcpy(phys_to_machine_mapping,
13916 - (unsigned long *)xen_start_info->mfn_list,
13917 - p2m_pages * sizeof(unsigned long));
13918 - free_bootmem(
13919 - __pa(xen_start_info->mfn_list),
13920 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
13921 - sizeof(unsigned long))));
13922 -
13923 - /*
13924 - * Initialise the list of the frames that specify the list of
13925 - * frames that make up the p2m table. Used by save/restore
13926 - */
13927 - pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
13928 -
13929 - fpp = PAGE_SIZE/sizeof(unsigned long);
13930 - for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
13931 - if ((j % fpp) == 0) {
13932 - k++;
13933 - BUG_ON(k>=16);
13934 - pfn_to_mfn_frame_list[k] =
13935 - alloc_bootmem_low_pages(PAGE_SIZE);
13936 - pfn_to_mfn_frame_list_list[k] =
13937 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
13938 - j=0;
13939 - }
13940 - pfn_to_mfn_frame_list[k][j] =
13941 - virt_to_mfn(&phys_to_machine_mapping[i]);
13942 - }
13943 - HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
13944 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
13945 - virt_to_mfn(pfn_to_mfn_frame_list_list);
13946 - }
13947 -
13948 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
13949 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
13950 - if (i != 4 && request_dma(i, "xen") != 0)
13951 - BUG();
13952 -
13953 - /*
13954 - * NOTE: at this point the bootmem allocator is fully available.
13955 - */
13956 -
13957 -#ifdef CONFIG_BLK_DEV_INITRD
13958 - relocate_initrd();
13959 -#endif
13960 -
13961 - paravirt_post_allocator_init();
13962 -
13963 - if (is_initial_xendomain())
13964 - dmi_scan_machine();
13965 -
13966 - io_delay_init();
13967 -
13968 -#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
13969 - /*
13970 - * setup to use the early static init tables during kernel startup
13971 - * X86_SMP will exclude sub-arches that don't deal well with it.
13972 - */
13973 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
13974 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
13975 -#ifdef CONFIG_NUMA
13976 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
13977 -#endif
13978 -#endif
13979 -
13980 -#ifdef CONFIG_X86_GENERICARCH
13981 - generic_apic_probe();
13982 -#endif
13983 -
13984 - set_iopl.iopl = 1;
13985 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
13986 -
13987 -#ifdef CONFIG_ACPI
13988 - if (!is_initial_xendomain()) {
13989 - printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
13990 - acpi_disabled = 1;
13991 - acpi_ht = 0;
13992 - }
13993 -
13994 - /*
13995 - * Parse the ACPI tables for possible boot-time SMP configuration.
13996 - */
13997 - acpi_boot_table_init();
13998 -#endif
13999 -
14000 -#ifndef CONFIG_XEN
14001 - early_quirks();
14002 -#endif
14003 -
14004 -#ifdef CONFIG_ACPI
14005 - acpi_boot_init();
14006 -
14007 -#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
14008 - if (def_to_bigsmp)
14009 - printk(KERN_WARNING "More than 8 CPUs detected and "
14010 - "CONFIG_X86_PC cannot handle it.\nUse "
14011 - "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
14012 -#endif
14013 -#endif
14014 -#ifdef CONFIG_X86_LOCAL_APIC
14015 - if (smp_found_config)
14016 - get_smp_config();
14017 -#endif
14018 -
14019 - e820_register_memory();
14020 - e820_mark_nosave_regions();
14021 -
14022 - if (is_initial_xendomain()) {
14023 -#ifdef CONFIG_VT
14024 -#if defined(CONFIG_VGA_CONSOLE)
14025 - if (!efi_enabled ||
14026 - (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14027 - conswitchp = &vga_con;
14028 -#elif defined(CONFIG_DUMMY_CONSOLE)
14029 - conswitchp = &dummy_con;
14030 -#endif
14031 -#endif
14032 - } else {
14033 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14034 - conswitchp = &dummy_con;
14035 -#endif
14036 - }
14037 -}
14038 -
14039 -static int
14040 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14041 -{
14042 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14043 - /* we're never actually going to get here... */
14044 - return NOTIFY_DONE;
14045 -}
14046 -
14047 -/*
14048 - * Request address space for all standard resources
14049 - *
14050 - * This is called just before pcibios_init(), which is also a
14051 - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
14052 - */
14053 -static int __init request_standard_resources(void)
14054 -{
14055 - int i;
14056 -
14057 - /* Nothing to do if not running in dom0. */
14058 - if (!is_initial_xendomain())
14059 - return 0;
14060 -
14061 - printk(KERN_INFO "Setting up standard PCI resources\n");
14062 - init_iomem_resources(&code_resource, &data_resource, &bss_resource);
14063 -
14064 - request_resource(&iomem_resource, &video_ram_resource);
14065 -
14066 - /* request I/O space for devices used on all i[345]86 PCs */
14067 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14068 - request_resource(&ioport_resource, &standard_io_resources[i]);
14069 - return 0;
14070 -}
14071 -
14072 -subsys_initcall(request_standard_resources);
14073 Index: head-2008-12-01/arch/x86/kernel/setup_64-xen.c
14074 ===================================================================
14075 --- head-2008-12-01.orig/arch/x86/kernel/setup_64-xen.c 2008-12-01 11:44:55.000000000 +0100
14076 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
14077 @@ -1,1433 +0,0 @@
14078 -/*
14079 - * Copyright (C) 1995 Linus Torvalds
14080 - */
14081 -
14082 -/*
14083 - * This file handles the architecture-dependent parts of initialization
14084 - */
14085 -
14086 -#include <linux/errno.h>
14087 -#include <linux/sched.h>
14088 -#include <linux/kernel.h>
14089 -#include <linux/mm.h>
14090 -#include <linux/stddef.h>
14091 -#include <linux/unistd.h>
14092 -#include <linux/ptrace.h>
14093 -#include <linux/slab.h>
14094 -#include <linux/user.h>
14095 -#include <linux/screen_info.h>
14096 -#include <linux/ioport.h>
14097 -#include <linux/delay.h>
14098 -#include <linux/init.h>
14099 -#include <linux/initrd.h>
14100 -#include <linux/highmem.h>
14101 -#include <linux/bootmem.h>
14102 -#include <linux/module.h>
14103 -#include <asm/processor.h>
14104 -#include <linux/console.h>
14105 -#include <linux/seq_file.h>
14106 -#include <linux/crash_dump.h>
14107 -#include <linux/root_dev.h>
14108 -#include <linux/pci.h>
14109 -#include <asm/pci-direct.h>
14110 -#include <linux/efi.h>
14111 -#include <linux/acpi.h>
14112 -#include <linux/kallsyms.h>
14113 -#include <linux/edd.h>
14114 -#include <linux/iscsi_ibft.h>
14115 -#include <linux/mmzone.h>
14116 -#include <linux/kexec.h>
14117 -#include <linux/cpufreq.h>
14118 -#include <linux/dmi.h>
14119 -#include <linux/dma-mapping.h>
14120 -#include <linux/ctype.h>
14121 -#include <linux/sort.h>
14122 -#include <linux/uaccess.h>
14123 -#include <linux/init_ohci1394_dma.h>
14124 -#include <linux/kvm_para.h>
14125 -
14126 -#include <asm/mtrr.h>
14127 -#include <asm/uaccess.h>
14128 -#include <asm/system.h>
14129 -#include <asm/vsyscall.h>
14130 -#include <asm/io.h>
14131 -#include <asm/smp.h>
14132 -#include <asm/msr.h>
14133 -#include <asm/desc.h>
14134 -#include <video/edid.h>
14135 -#include <asm/e820.h>
14136 -#include <asm/dma.h>
14137 -#include <asm/gart.h>
14138 -#include <asm/mpspec.h>
14139 -#include <asm/mmu_context.h>
14140 -#include <asm/proto.h>
14141 -#include <asm/setup.h>
14142 -#include <asm/numa.h>
14143 -#include <asm/sections.h>
14144 -#include <asm/dmi.h>
14145 -#include <asm/cacheflush.h>
14146 -#include <asm/mce.h>
14147 -#include <asm/ds.h>
14148 -#include <asm/topology.h>
14149 -#include <asm/pat.h>
14150 -
14151 -#include <mach_apic.h>
14152 -#ifdef CONFIG_XEN
14153 -#include <linux/percpu.h>
14154 -#include <xen/interface/physdev.h>
14155 -#include "setup_arch_pre.h"
14156 -#include <asm/hypervisor.h>
14157 -#include <xen/interface/nmi.h>
14158 -#include <xen/features.h>
14159 -#include <xen/firmware.h>
14160 -#include <xen/xencons.h>
14161 -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
14162 -#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
14163 -#include <asm/mach-xen/setup_arch_post.h>
14164 -#include <xen/interface/memory.h>
14165 -
14166 -#ifdef CONFIG_XEN
14167 -#include <xen/interface/kexec.h>
14168 -#endif
14169 -
14170 -extern unsigned long start_pfn;
14171 -extern struct edid_info edid_info;
14172 -
14173 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
14174 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
14175 -
14176 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
14177 -static struct notifier_block xen_panic_block = {
14178 - xen_panic_event, NULL, 0 /* try to go last */
14179 -};
14180 -
14181 -unsigned long *phys_to_machine_mapping;
14182 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
14183 -
14184 -EXPORT_SYMBOL(phys_to_machine_mapping);
14185 -
14186 -DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
14187 -DEFINE_PER_CPU(int, nr_multicall_ents);
14188 -
14189 -/* Raw start-of-day parameters from the hypervisor. */
14190 -start_info_t *xen_start_info;
14191 -EXPORT_SYMBOL(xen_start_info);
14192 -#endif
14193 -
14194 -/*
14195 - * Machine setup..
14196 - */
14197 -
14198 -struct cpuinfo_x86 boot_cpu_data __read_mostly;
14199 -EXPORT_SYMBOL(boot_cpu_data);
14200 -
14201 -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
14202 -
14203 -unsigned long mmu_cr4_features;
14204 -
14205 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
14206 -int bootloader_type;
14207 -
14208 -unsigned long saved_video_mode;
14209 -
14210 -int force_mwait __cpuinitdata;
14211 -
14212 -/*
14213 - * Early DMI memory
14214 - */
14215 -int dmi_alloc_index;
14216 -char dmi_alloc_data[DMI_MAX_DATA];
14217 -
14218 -/*
14219 - * Setup options
14220 - */
14221 -struct screen_info screen_info;
14222 -EXPORT_SYMBOL(screen_info);
14223 -struct sys_desc_table_struct {
14224 - unsigned short length;
14225 - unsigned char table[0];
14226 -};
14227 -
14228 -struct edid_info edid_info;
14229 -EXPORT_SYMBOL_GPL(edid_info);
14230 -
14231 -extern int root_mountflags;
14232 -
14233 -char __initdata command_line[COMMAND_LINE_SIZE];
14234 -
14235 -static struct resource standard_io_resources[] = {
14236 - { .name = "dma1", .start = 0x00, .end = 0x1f,
14237 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14238 - { .name = "pic1", .start = 0x20, .end = 0x21,
14239 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14240 - { .name = "timer0", .start = 0x40, .end = 0x43,
14241 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14242 - { .name = "timer1", .start = 0x50, .end = 0x53,
14243 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14244 - { .name = "keyboard", .start = 0x60, .end = 0x60,
14245 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14246 - { .name = "keyboard", .start = 0x64, .end = 0x64,
14247 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14248 - { .name = "dma page reg", .start = 0x80, .end = 0x8f,
14249 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14250 - { .name = "pic2", .start = 0xa0, .end = 0xa1,
14251 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14252 - { .name = "dma2", .start = 0xc0, .end = 0xdf,
14253 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14254 - { .name = "fpu", .start = 0xf0, .end = 0xff,
14255 - .flags = IORESOURCE_BUSY | IORESOURCE_IO }
14256 -};
14257 -
14258 -#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
14259 -
14260 -static struct resource data_resource = {
14261 - .name = "Kernel data",
14262 - .start = 0,
14263 - .end = 0,
14264 - .flags = IORESOURCE_RAM,
14265 -};
14266 -static struct resource code_resource = {
14267 - .name = "Kernel code",
14268 - .start = 0,
14269 - .end = 0,
14270 - .flags = IORESOURCE_RAM,
14271 -};
14272 -static struct resource bss_resource = {
14273 - .name = "Kernel bss",
14274 - .start = 0,
14275 - .end = 0,
14276 - .flags = IORESOURCE_RAM,
14277 -};
14278 -
14279 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
14280 -
14281 -#ifdef CONFIG_PROC_VMCORE
14282 -/* elfcorehdr= specifies the location of elf core header
14283 - * stored by the crashed kernel. This option will be passed
14284 - * by kexec loader to the capture kernel.
14285 - */
14286 -static int __init setup_elfcorehdr(char *arg)
14287 -{
14288 - char *end;
14289 - if (!arg)
14290 - return -EINVAL;
14291 - elfcorehdr_addr = memparse(arg, &end);
14292 - return end > arg ? 0 : -EINVAL;
14293 -}
14294 -early_param("elfcorehdr", setup_elfcorehdr);
14295 -#endif
14296 -
14297 -#ifndef CONFIG_NUMA
14298 -static void __init
14299 -contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
14300 -{
14301 - unsigned long bootmap_size, bootmap;
14302 -
14303 - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
14304 - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
14305 - PAGE_SIZE);
14306 - if (bootmap == -1L)
14307 - panic("Cannot find bootmem map of size %ld\n", bootmap_size);
14308 - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
14309 - e820_register_active_regions(0, start_pfn, end_pfn);
14310 -#ifdef CONFIG_XEN
14311 - free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
14312 - early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
14313 -#else
14314 - free_bootmem_with_active_regions(0, end_pfn);
14315 - early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
14316 -#endif
14317 - reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
14318 -}
14319 -#endif
14320 -
14321 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
14322 -struct edd edd;
14323 -#ifdef CONFIG_EDD_MODULE
14324 -EXPORT_SYMBOL(edd);
14325 -#endif
14326 -#ifndef CONFIG_XEN
14327 -/**
14328 - * copy_edd() - Copy the BIOS EDD information
14329 - * from boot_params into a safe place.
14330 - *
14331 - */
14332 -static inline void copy_edd(void)
14333 -{
14334 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
14335 - sizeof(edd.mbr_signature));
14336 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
14337 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
14338 - edd.edd_info_nr = boot_params.eddbuf_entries;
14339 -}
14340 -#endif
14341 -#else
14342 -static inline void copy_edd(void)
14343 -{
14344 -}
14345 -#endif
14346 -
14347 -#ifdef CONFIG_KEXEC
14348 -#ifndef CONFIG_XEN
14349 -static void __init reserve_crashkernel(void)
14350 -{
14351 - unsigned long long total_mem;
14352 - unsigned long long crash_size, crash_base;
14353 - int ret;
14354 -
14355 - total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
14356 -
14357 - ret = parse_crashkernel(boot_command_line, total_mem,
14358 - &crash_size, &crash_base);
14359 - if (ret == 0 && crash_size) {
14360 - if (crash_base <= 0) {
14361 - printk(KERN_INFO "crashkernel reservation failed - "
14362 - "you have to specify a base address\n");
14363 - return;
14364 - }
14365 -
14366 - if (reserve_bootmem(crash_base, crash_size,
14367 - BOOTMEM_EXCLUSIVE) < 0) {
14368 - printk(KERN_INFO "crashkernel reservation failed - "
14369 - "memory is in use\n");
14370 - return;
14371 - }
14372 -
14373 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
14374 - "for crashkernel (System RAM: %ldMB)\n",
14375 - (unsigned long)(crash_size >> 20),
14376 - (unsigned long)(crash_base >> 20),
14377 - (unsigned long)(total_mem >> 20));
14378 - crashk_res.start = crash_base;
14379 - crashk_res.end = crash_base + crash_size - 1;
14380 - insert_resource(&iomem_resource, &crashk_res);
14381 - }
14382 -}
14383 -#else
14384 -#define reserve_crashkernel xen_machine_kexec_setup_resources
14385 -#endif
14386 -#else
14387 -static inline void __init reserve_crashkernel(void)
14388 -{}
14389 -#endif
14390 -
14391 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
14392 -void __attribute__((weak)) __init memory_setup(void)
14393 -{
14394 - machine_specific_memory_setup();
14395 -}
14396 -
14397 -static void __init parse_setup_data(void)
14398 -{
14399 - struct setup_data *data;
14400 - unsigned long pa_data;
14401 -
14402 - if (boot_params.hdr.version < 0x0209)
14403 - return;
14404 - pa_data = boot_params.hdr.setup_data;
14405 - while (pa_data) {
14406 - data = early_ioremap(pa_data, PAGE_SIZE);
14407 - switch (data->type) {
14408 - default:
14409 - break;
14410 - }
14411 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
14412 - free_early(pa_data, pa_data+sizeof(*data)+data->len);
14413 -#endif
14414 - pa_data = data->next;
14415 - early_iounmap(data, PAGE_SIZE);
14416 - }
14417 -}
14418 -
14419 -#ifdef CONFIG_PCI_MMCONFIG
14420 -extern void __cpuinit fam10h_check_enable_mmcfg(void);
14421 -extern void __init check_enable_amd_mmconf_dmi(void);
14422 -#else
14423 -void __cpuinit fam10h_check_enable_mmcfg(void)
14424 -{
14425 -}
14426 -void __init check_enable_amd_mmconf_dmi(void)
14427 -{
14428 -}
14429 -#endif
14430 -
14431 -/*
14432 - * setup_arch - architecture-specific boot-time initializations
14433 - *
14434 - * Note: On x86_64, fixmaps are ready for use even before this is called.
14435 - */
14436 -void __init setup_arch(char **cmdline_p)
14437 -{
14438 - unsigned i;
14439 -
14440 -#ifdef CONFIG_XEN
14441 - extern struct e820map machine_e820;
14442 -
14443 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14444 -
14445 - /* Register a call for panic conditions. */
14446 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
14447 -
14448 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
14449 - VMASST_TYPE_writable_pagetables));
14450 -
14451 - early_ioremap_init();
14452 -
14453 - ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
14454 - screen_info = boot_params.screen_info;
14455 -
14456 - if (is_initial_xendomain()) {
14457 - const struct dom0_vga_console_info *info =
14458 - (void *)((char *)xen_start_info +
14459 - xen_start_info->console.dom0.info_off);
14460 -
14461 - dom0_init_screen_info(info,
14462 - xen_start_info->console.dom0.info_size);
14463 - xen_start_info->console.domU.mfn = 0;
14464 - xen_start_info->console.domU.evtchn = 0;
14465 - } else
14466 - screen_info.orig_video_isVGA = 0;
14467 -
14468 - copy_edid();
14469 -#else
14470 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14471 -
14472 - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
14473 - screen_info = boot_params.screen_info;
14474 - edid_info = boot_params.edid_info;
14475 -#endif /* !CONFIG_XEN */
14476 - saved_video_mode = boot_params.hdr.vid_mode;
14477 - bootloader_type = boot_params.hdr.type_of_loader;
14478 -
14479 -#ifdef CONFIG_BLK_DEV_RAM
14480 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
14481 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
14482 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
14483 -#endif
14484 -#ifdef CONFIG_EFI
14485 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
14486 - "EL64", 4))
14487 - efi_enabled = 1;
14488 -#endif
14489 -
14490 - ARCH_SETUP
14491 -
14492 - memory_setup();
14493 - copy_edd();
14494 -
14495 - if (!boot_params.hdr.root_flags)
14496 - root_mountflags &= ~MS_RDONLY;
14497 - init_mm.start_code = (unsigned long) &_text;
14498 - init_mm.end_code = (unsigned long) &_etext;
14499 - init_mm.end_data = (unsigned long) &_edata;
14500 - init_mm.brk = (unsigned long) &_end;
14501 -
14502 - code_resource.start = virt_to_phys(&_text);
14503 - code_resource.end = virt_to_phys(&_etext)-1;
14504 - data_resource.start = virt_to_phys(&_etext);
14505 - data_resource.end = virt_to_phys(&_edata)-1;
14506 - bss_resource.start = virt_to_phys(&__bss_start);
14507 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
14508 -
14509 - early_identify_cpu(&boot_cpu_data);
14510 -
14511 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
14512 - *cmdline_p = command_line;
14513 -
14514 - parse_setup_data();
14515 -
14516 - parse_early_param();
14517 -
14518 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
14519 - if (init_ohci1394_dma_early)
14520 - init_ohci1394_dma_on_all_controllers();
14521 -#endif
14522 -
14523 - finish_e820_parsing();
14524 -
14525 -#ifndef CONFIG_XEN
14526 - /* after parse_early_param, so could debug it */
14527 - insert_resource(&iomem_resource, &code_resource);
14528 - insert_resource(&iomem_resource, &data_resource);
14529 - insert_resource(&iomem_resource, &bss_resource);
14530 -#endif
14531 -
14532 - early_gart_iommu_check();
14533 -
14534 - e820_register_active_regions(0, 0, -1UL);
14535 - /*
14536 - * partially used pages are not usable - thus
14537 - * we are rounding upwards:
14538 - */
14539 - end_pfn = e820_end_of_ram();
14540 - /* update e820 for memory not covered by WB MTRRs */
14541 - mtrr_bp_init();
14542 -#ifndef CONFIG_XEN
14543 - if (mtrr_trim_uncached_memory(end_pfn)) {
14544 - e820_register_active_regions(0, 0, -1UL);
14545 - end_pfn = e820_end_of_ram();
14546 - }
14547 -#endif
14548 -
14549 - num_physpages = end_pfn;
14550 - max_mapnr = end_pfn;
14551 -
14552 - check_efer();
14553 -
14554 - max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
14555 - if (efi_enabled)
14556 - efi_init();
14557 -
14558 -#ifndef CONFIG_XEN
14559 - vsmp_init();
14560 -#endif
14561 -
14562 - if (is_initial_xendomain())
14563 - dmi_scan_machine();
14564 -
14565 - io_delay_init();
14566 -
14567 -#ifdef CONFIG_KVM_CLOCK
14568 - kvmclock_init();
14569 -#endif
14570 -
14571 -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
14572 - /* setup to use the early static init tables during kernel startup */
14573 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
14574 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
14575 -#ifdef CONFIG_NUMA
14576 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
14577 -#endif
14578 -#endif
14579 -
14580 - /* How many end-of-memory variables you have, grandma! */
14581 - max_low_pfn = end_pfn;
14582 - max_pfn = end_pfn;
14583 - high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
14584 -
14585 - /* Remove active ranges so rediscovery with NUMA-awareness happens */
14586 - remove_all_active_ranges();
14587 -
14588 -#ifdef CONFIG_ACPI_NUMA
14589 - /*
14590 - * Parse SRAT to discover nodes.
14591 - */
14592 - acpi_numa_init();
14593 -#endif
14594 -
14595 -#ifdef CONFIG_NUMA
14596 - numa_initmem_init(0, end_pfn);
14597 -#else
14598 - contig_initmem_init(0, end_pfn);
14599 -#endif
14600 -
14601 -#ifndef CONFIG_XEN
14602 - dma32_reserve_bootmem();
14603 -
14604 -#ifdef CONFIG_ACPI_SLEEP
14605 - /*
14606 - * Reserve low memory region for sleep support.
14607 - */
14608 - acpi_reserve_bootmem();
14609 -#endif
14610 -
14611 - if (efi_enabled)
14612 - efi_reserve_bootmem();
14613 -#endif
14614 -
14615 -#ifdef CONFIG_BLK_DEV_INITRD
14616 -#ifdef CONFIG_XEN
14617 - if (xen_start_info->mod_start) {
14618 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
14619 - unsigned long ramdisk_size = xen_start_info->mod_len;
14620 -#else
14621 - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
14622 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
14623 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
14624 -#endif
14625 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
14626 - unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
14627 -
14628 - if (ramdisk_end <= end_of_mem) {
14629 - /*
14630 - * don't need to reserve again, already reserved early
14631 - * in x86_64_start_kernel, and early_res_to_bootmem
14632 - * convert that to reserved in bootmem
14633 - */
14634 - initrd_start = ramdisk_image + PAGE_OFFSET;
14635 - initrd_end = initrd_start+ramdisk_size;
14636 -#ifdef CONFIG_XEN
14637 - initrd_below_start_ok = 1;
14638 -#endif
14639 - } else {
14640 - free_bootmem(ramdisk_image, ramdisk_size);
14641 - printk(KERN_ERR "initrd extends beyond end of memory "
14642 - "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
14643 - ramdisk_end, end_of_mem);
14644 - initrd_start = 0;
14645 - }
14646 - }
14647 -#endif
14648 - reserve_crashkernel();
14649 -
14650 - reserve_ibft_region();
14651 -
14652 - paging_init();
14653 - map_vsyscall();
14654 -#ifdef CONFIG_X86_LOCAL_APIC
14655 - /*
14656 - * Find and reserve possible boot-time SMP configuration:
14657 - */
14658 - find_smp_config();
14659 -#endif
14660 -#ifdef CONFIG_XEN
14661 - {
14662 - int i, j, k, fpp;
14663 - unsigned long p2m_pages;
14664 -
14665 - p2m_pages = end_pfn;
14666 - if (xen_start_info->nr_pages > end_pfn) {
14667 - /*
14668 - * the end_pfn was shrunk (probably by mem= or highmem=
14669 - * kernel parameter); shrink reservation with the HV
14670 - */
14671 - struct xen_memory_reservation reservation = {
14672 - .address_bits = 0,
14673 - .extent_order = 0,
14674 - .domid = DOMID_SELF
14675 - };
14676 - unsigned int difference;
14677 - int ret;
14678 -
14679 - difference = xen_start_info->nr_pages - end_pfn;
14680 -
14681 - set_xen_guest_handle(reservation.extent_start,
14682 - ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
14683 - reservation.nr_extents = difference;
14684 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
14685 - &reservation);
14686 - BUG_ON (ret != difference);
14687 - }
14688 - else if (end_pfn > xen_start_info->nr_pages)
14689 - p2m_pages = xen_start_info->nr_pages;
14690 -
14691 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
14692 - /* Make sure we have a large enough P->M table. */
14693 - phys_to_machine_mapping = alloc_bootmem_pages(
14694 - end_pfn * sizeof(unsigned long));
14695 - memset(phys_to_machine_mapping, ~0,
14696 - end_pfn * sizeof(unsigned long));
14697 - memcpy(phys_to_machine_mapping,
14698 - (unsigned long *)xen_start_info->mfn_list,
14699 - p2m_pages * sizeof(unsigned long));
14700 - free_bootmem(
14701 - __pa(xen_start_info->mfn_list),
14702 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
14703 - sizeof(unsigned long))));
14704 -
14705 - /*
14706 - * Initialise the list of the frames that specify the
14707 - * list of frames that make up the p2m table. Used by
14708 - * save/restore.
14709 - */
14710 - pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
14711 -
14712 - fpp = PAGE_SIZE/sizeof(unsigned long);
14713 - for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
14714 - if ((j % fpp) == 0) {
14715 - k++;
14716 - BUG_ON(k>=fpp);
14717 - pfn_to_mfn_frame_list[k] =
14718 - alloc_bootmem_pages(PAGE_SIZE);
14719 - pfn_to_mfn_frame_list_list[k] =
14720 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
14721 - j=0;
14722 - }
14723 - pfn_to_mfn_frame_list[k][j] =
14724 - virt_to_mfn(&phys_to_machine_mapping[i]);
14725 - }
14726 - HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
14727 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
14728 - virt_to_mfn(pfn_to_mfn_frame_list_list);
14729 - }
14730 -
14731 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
14732 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
14733 - if (i != 4 && request_dma(i, "xen") != 0)
14734 - BUG();
14735 - }
14736 -
14737 -#ifdef CONFIG_ACPI
14738 - if (!is_initial_xendomain()) {
14739 - acpi_disabled = 1;
14740 - acpi_ht = 0;
14741 - }
14742 -#endif
14743 -#endif
14744 -
14745 -#ifndef CONFIG_XEN
14746 - early_quirks();
14747 -#endif
14748 -
14749 -#ifdef CONFIG_ACPI
14750 - /*
14751 - * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
14752 - * Call this early for SRAT node setup.
14753 - */
14754 - acpi_boot_table_init();
14755 -
14756 - /*
14757 - * Read APIC and some other early information from ACPI tables.
14758 - */
14759 - acpi_boot_init();
14760 -#endif
14761 -
14762 - init_cpu_to_node();
14763 -
14764 -#ifdef CONFIG_X86_LOCAL_APIC
14765 - /*
14766 - * get boot-time SMP configuration:
14767 - */
14768 - if (smp_found_config)
14769 - get_smp_config();
14770 -#ifndef CONFIG_XEN
14771 - init_apic_mappings();
14772 - ioapic_init_mappings();
14773 -#endif
14774 -#endif
14775 -#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
14776 - prefill_possible_map();
14777 -#endif
14778 -
14779 - kvm_guest_init();
14780 -
14781 - /*
14782 - * We trust e820 completely. No explicit ROM probing in memory.
14783 - */
14784 -#ifdef CONFIG_XEN
14785 - if (is_initial_xendomain())
14786 - e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
14787 -#else
14788 - e820_reserve_resources(e820.map, e820.nr_map);
14789 - e820_mark_nosave_regions();
14790 -#endif
14791 -
14792 - /* request I/O space for devices used on all i[345]86 PCs */
14793 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14794 - request_resource(&ioport_resource, &standard_io_resources[i]);
14795 -
14796 -#ifdef CONFIG_XEN
14797 - if (is_initial_xendomain())
14798 - e820_setup_gap(machine_e820.map, machine_e820.nr_map);
14799 -#else
14800 - e820_setup_gap(e820.map, e820.nr_map);
14801 -#endif
14802 -
14803 -#ifdef CONFIG_XEN
14804 - {
14805 - struct physdev_set_iopl set_iopl;
14806 -
14807 - set_iopl.iopl = 1;
14808 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
14809 -
14810 - if (is_initial_xendomain()) {
14811 -#ifdef CONFIG_VT
14812 -#if defined(CONFIG_VGA_CONSOLE)
14813 - conswitchp = &vga_con;
14814 -#elif defined(CONFIG_DUMMY_CONSOLE)
14815 - conswitchp = &dummy_con;
14816 -#endif
14817 -#endif
14818 - } else {
14819 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14820 - conswitchp = &dummy_con;
14821 -#endif
14822 - }
14823 - }
14824 -#else /* CONFIG_XEN */
14825 -
14826 -#ifdef CONFIG_VT
14827 -#if defined(CONFIG_VGA_CONSOLE)
14828 - if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14829 - conswitchp = &vga_con;
14830 -#elif defined(CONFIG_DUMMY_CONSOLE)
14831 - conswitchp = &dummy_con;
14832 -#endif
14833 -#endif
14834 -
14835 -#endif /* !CONFIG_XEN */
14836 -
14837 - /* do this before identify_cpu for boot cpu */
14838 - check_enable_amd_mmconf_dmi();
14839 -}
14840 -
14841 -#ifdef CONFIG_XEN
14842 -static int
14843 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14844 -{
14845 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14846 - /* we're never actually going to get here... */
14847 - return NOTIFY_DONE;
14848 -}
14849 -#endif /* !CONFIG_XEN */
14850 -
14851 -
14852 -static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
14853 -{
14854 - unsigned int *v;
14855 -
14856 - if (c->extended_cpuid_level < 0x80000004)
14857 - return 0;
14858 -
14859 - v = (unsigned int *) c->x86_model_id;
14860 - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
14861 - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
14862 - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
14863 - c->x86_model_id[48] = 0;
14864 - return 1;
14865 -}
14866 -
14867 -
14868 -static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
14869 -{
14870 - unsigned int n, dummy, eax, ebx, ecx, edx;
14871 -
14872 - n = c->extended_cpuid_level;
14873 -
14874 - if (n >= 0x80000005) {
14875 - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
14876 - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
14877 - "D cache %dK (%d bytes/line)\n",
14878 - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
14879 - c->x86_cache_size = (ecx>>24) + (edx>>24);
14880 - /* On K8 L1 TLB is inclusive, so don't count it */
14881 - c->x86_tlbsize = 0;
14882 - }
14883 -
14884 - if (n >= 0x80000006) {
14885 - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
14886 - ecx = cpuid_ecx(0x80000006);
14887 - c->x86_cache_size = ecx >> 16;
14888 - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
14889 -
14890 - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
14891 - c->x86_cache_size, ecx & 0xFF);
14892 - }
14893 - if (n >= 0x80000008) {
14894 - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
14895 - c->x86_virt_bits = (eax >> 8) & 0xff;
14896 - c->x86_phys_bits = eax & 0xff;
14897 - }
14898 -}
14899 -
14900 -#ifdef CONFIG_NUMA
14901 -static int __cpuinit nearby_node(int apicid)
14902 -{
14903 - int i, node;
14904 -
14905 - for (i = apicid - 1; i >= 0; i--) {
14906 - node = apicid_to_node[i];
14907 - if (node != NUMA_NO_NODE && node_online(node))
14908 - return node;
14909 - }
14910 - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
14911 - node = apicid_to_node[i];
14912 - if (node != NUMA_NO_NODE && node_online(node))
14913 - return node;
14914 - }
14915 - return first_node(node_online_map); /* Shouldn't happen */
14916 -}
14917 -#endif
14918 -
14919 -/*
14920 - * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
14921 - * Assumes number of cores is a power of two.
14922 - */
14923 -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
14924 -{
14925 -#ifdef CONFIG_SMP
14926 - unsigned bits;
14927 -#ifdef CONFIG_NUMA
14928 - int cpu = smp_processor_id();
14929 - int node = 0;
14930 - unsigned apicid = hard_smp_processor_id();
14931 -#endif
14932 - bits = c->x86_coreid_bits;
14933 -
14934 - /* Low order bits define the core id (index of core in socket) */
14935 - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
14936 - /* Convert the initial APIC ID into the socket ID */
14937 - c->phys_proc_id = c->initial_apicid >> bits;
14938 -
14939 -#ifdef CONFIG_NUMA
14940 - node = c->phys_proc_id;
14941 - if (apicid_to_node[apicid] != NUMA_NO_NODE)
14942 - node = apicid_to_node[apicid];
14943 - if (!node_online(node)) {
14944 - /* Two possibilities here:
14945 - - The CPU is missing memory and no node was created.
14946 - In that case try picking one from a nearby CPU
14947 - - The APIC IDs differ from the HyperTransport node IDs
14948 - which the K8 northbridge parsing fills in.
14949 - Assume they are all increased by a constant offset,
14950 - but in the same order as the HT nodeids.
14951 - If that doesn't result in a usable node fall back to the
14952 - path for the previous case. */
14953 -
14954 - int ht_nodeid = c->initial_apicid;
14955 -
14956 - if (ht_nodeid >= 0 &&
14957 - apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
14958 - node = apicid_to_node[ht_nodeid];
14959 - /* Pick a nearby node */
14960 - if (!node_online(node))
14961 - node = nearby_node(apicid);
14962 - }
14963 - numa_set_node(cpu, node);
14964 -
14965 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
14966 -#endif
14967 -#endif
14968 -}
14969 -
14970 -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
14971 -{
14972 -#ifdef CONFIG_SMP
14973 - unsigned bits, ecx;
14974 -
14975 - /* Multi core CPU? */
14976 - if (c->extended_cpuid_level < 0x80000008)
14977 - return;
14978 -
14979 - ecx = cpuid_ecx(0x80000008);
14980 -
14981 - c->x86_max_cores = (ecx & 0xff) + 1;
14982 -
14983 - /* CPU telling us the core id bits shift? */
14984 - bits = (ecx >> 12) & 0xF;
14985 -
14986 - /* Otherwise recompute */
14987 - if (bits == 0) {
14988 - while ((1 << bits) < c->x86_max_cores)
14989 - bits++;
14990 - }
14991 -
14992 - c->x86_coreid_bits = bits;
14993 -
14994 -#endif
14995 -}
14996 -
14997 -#define ENABLE_C1E_MASK 0x18000000
14998 -#define CPUID_PROCESSOR_SIGNATURE 1
14999 -#define CPUID_XFAM 0x0ff00000
15000 -#define CPUID_XFAM_K8 0x00000000
15001 -#define CPUID_XFAM_10H 0x00100000
15002 -#define CPUID_XFAM_11H 0x00200000
15003 -#define CPUID_XMOD 0x000f0000
15004 -#define CPUID_XMOD_REV_F 0x00040000
15005 -
15006 -#ifndef CONFIG_XEN
15007 -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
15008 -static __cpuinit int amd_apic_timer_broken(void)
15009 -{
15010 - u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
15011 -
15012 - switch (eax & CPUID_XFAM) {
15013 - case CPUID_XFAM_K8:
15014 - if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
15015 - break;
15016 - case CPUID_XFAM_10H:
15017 - case CPUID_XFAM_11H:
15018 - rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
15019 - if (lo & ENABLE_C1E_MASK)
15020 - return 1;
15021 - break;
15022 - default:
15023 - /* err on the side of caution */
15024 - return 1;
15025 - }
15026 - return 0;
15027 -}
15028 -#endif
15029 -
15030 -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
15031 -{
15032 - early_init_amd_mc(c);
15033 -
15034 - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
15035 - if (c->x86_power & (1<<8))
15036 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15037 -}
15038 -
15039 -static void __cpuinit init_amd(struct cpuinfo_x86 *c)
15040 -{
15041 - unsigned level;
15042 -
15043 -#ifdef CONFIG_SMP
15044 - unsigned long value;
15045 -
15046 - /*
15047 - * Disable TLB flush filter by setting HWCR.FFDIS on K8
15048 - * bit 6 of msr C001_0015
15049 - *
15050 - * Errata 63 for SH-B3 steppings
15051 - * Errata 122 for all steppings (F+ have it disabled by default)
15052 - */
15053 - if (c->x86 == 15) {
15054 - rdmsrl(MSR_K8_HWCR, value);
15055 - value |= 1 << 6;
15056 - wrmsrl(MSR_K8_HWCR, value);
15057 - }
15058 -#endif
15059 -
15060 - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
15061 - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
15062 - clear_cpu_cap(c, 0*32+31);
15063 -
15064 - /* On C+ stepping K8 rep microcode works well for copy/memset */
15065 - level = cpuid_eax(1);
15066 - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
15067 - level >= 0x0f58))
15068 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15069 - if (c->x86 == 0x10 || c->x86 == 0x11)
15070 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15071 -
15072 - /* Enable workaround for FXSAVE leak */
15073 - if (c->x86 >= 6)
15074 - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
15075 -
15076 - level = get_model_name(c);
15077 - if (!level) {
15078 - switch (c->x86) {
15079 - case 15:
15080 - /* Should distinguish Models here, but this is only
15081 - a fallback anyways. */
15082 - strcpy(c->x86_model_id, "Hammer");
15083 - break;
15084 - }
15085 - }
15086 - display_cacheinfo(c);
15087 -
15088 - /* Multi core CPU? */
15089 - if (c->extended_cpuid_level >= 0x80000008)
15090 - amd_detect_cmp(c);
15091 -
15092 - if (c->extended_cpuid_level >= 0x80000006 &&
15093 - (cpuid_edx(0x80000006) & 0xf000))
15094 - num_cache_leaves = 4;
15095 - else
15096 - num_cache_leaves = 3;
15097 -
15098 - if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
15099 - set_cpu_cap(c, X86_FEATURE_K8);
15100 -
15101 - /* MFENCE stops RDTSC speculation */
15102 - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
15103 -
15104 - if (c->x86 == 0x10)
15105 - fam10h_check_enable_mmcfg();
15106 -
15107 -#ifndef CONFIG_XEN
15108 - if (amd_apic_timer_broken())
15109 - disable_apic_timer = 1;
15110 -
15111 - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
15112 - unsigned long long tseg;
15113 -
15114 - /*
15115 - * Split up direct mapping around the TSEG SMM area.
15116 - * Don't do it for gbpages because there seems very little
15117 - * benefit in doing so.
15118 - */
15119 - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
15120 - (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
15121 - set_memory_4k((unsigned long)__va(tseg), 1);
15122 - }
15123 -#endif
15124 -}
15125 -
15126 -void __cpuinit detect_ht(struct cpuinfo_x86 *c)
15127 -{
15128 -#ifdef CONFIG_SMP
15129 - u32 eax, ebx, ecx, edx;
15130 - int index_msb, core_bits;
15131 -
15132 - cpuid(1, &eax, &ebx, &ecx, &edx);
15133 -
15134 -
15135 - if (!cpu_has(c, X86_FEATURE_HT))
15136 - return;
15137 - if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
15138 - goto out;
15139 -
15140 - smp_num_siblings = (ebx & 0xff0000) >> 16;
15141 -
15142 - if (smp_num_siblings == 1) {
15143 - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
15144 - } else if (smp_num_siblings > 1) {
15145 -
15146 - if (smp_num_siblings > NR_CPUS) {
15147 - printk(KERN_WARNING "CPU: Unsupported number of "
15148 - "siblings %d", smp_num_siblings);
15149 - smp_num_siblings = 1;
15150 - return;
15151 - }
15152 -
15153 - index_msb = get_count_order(smp_num_siblings);
15154 - c->phys_proc_id = phys_pkg_id(index_msb);
15155 -
15156 - smp_num_siblings = smp_num_siblings / c->x86_max_cores;
15157 -
15158 - index_msb = get_count_order(smp_num_siblings);
15159 -
15160 - core_bits = get_count_order(c->x86_max_cores);
15161 -
15162 - c->cpu_core_id = phys_pkg_id(index_msb) &
15163 - ((1 << core_bits) - 1);
15164 - }
15165 -out:
15166 - if ((c->x86_max_cores * smp_num_siblings) > 1) {
15167 - printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
15168 - c->phys_proc_id);
15169 - printk(KERN_INFO "CPU: Processor Core ID: %d\n",
15170 - c->cpu_core_id);
15171 - }
15172 -
15173 -#endif
15174 -}
15175 -
15176 -/*
15177 - * find out the number of processor cores on the die
15178 - */
15179 -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
15180 -{
15181 - unsigned int eax, t;
15182 -
15183 - if (c->cpuid_level < 4)
15184 - return 1;
15185 -
15186 - cpuid_count(4, 0, &eax, &t, &t, &t);
15187 -
15188 - if (eax & 0x1f)
15189 - return ((eax >> 26) + 1);
15190 - else
15191 - return 1;
15192 -}
15193 -
15194 -static void __cpuinit srat_detect_node(void)
15195 -{
15196 -#ifdef CONFIG_NUMA
15197 - unsigned node;
15198 - int cpu = smp_processor_id();
15199 - int apicid = hard_smp_processor_id();
15200 -
15201 - /* Don't do the funky fallback heuristics the AMD version employs
15202 - for now. */
15203 - node = apicid_to_node[apicid];
15204 - if (node == NUMA_NO_NODE || !node_online(node))
15205 - node = first_node(node_online_map);
15206 - numa_set_node(cpu, node);
15207 -
15208 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
15209 -#endif
15210 -}
15211 -
15212 -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
15213 -{
15214 - if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
15215 - (c->x86 == 0x6 && c->x86_model >= 0x0e))
15216 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15217 -}
15218 -
15219 -static void __cpuinit init_intel(struct cpuinfo_x86 *c)
15220 -{
15221 - /* Cache sizes */
15222 - unsigned n;
15223 -
15224 - init_intel_cacheinfo(c);
15225 - if (c->cpuid_level > 9) {
15226 - unsigned eax = cpuid_eax(10);
15227 - /* Check for version and the number of counters */
15228 - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
15229 - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
15230 - }
15231 -
15232 - if (cpu_has_ds) {
15233 - unsigned int l1, l2;
15234 - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
15235 - if (!(l1 & (1<<11)))
15236 - set_cpu_cap(c, X86_FEATURE_BTS);
15237 - if (!(l1 & (1<<12)))
15238 - set_cpu_cap(c, X86_FEATURE_PEBS);
15239 - }
15240 -
15241 -
15242 - if (cpu_has_bts)
15243 - ds_init_intel(c);
15244 -
15245 - n = c->extended_cpuid_level;
15246 - if (n >= 0x80000008) {
15247 - unsigned eax = cpuid_eax(0x80000008);
15248 - c->x86_virt_bits = (eax >> 8) & 0xff;
15249 - c->x86_phys_bits = eax & 0xff;
15250 - /* CPUID workaround for Intel 0F34 CPU */
15251 - if (c->x86_vendor == X86_VENDOR_INTEL &&
15252 - c->x86 == 0xF && c->x86_model == 0x3 &&
15253 - c->x86_mask == 0x4)
15254 - c->x86_phys_bits = 36;
15255 - }
15256 -
15257 - if (c->x86 == 15)
15258 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15259 - if (c->x86 == 6)
15260 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15261 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15262 - c->x86_max_cores = intel_num_cpu_cores(c);
15263 -
15264 - srat_detect_node();
15265 -}
15266 -
15267 -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
15268 -{
15269 - if (c->x86 == 0x6 && c->x86_model >= 0xf)
15270 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15271 -}
15272 -
15273 -static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
15274 -{
15275 - /* Cache sizes */
15276 - unsigned n;
15277 -
15278 - n = c->extended_cpuid_level;
15279 - if (n >= 0x80000008) {
15280 - unsigned eax = cpuid_eax(0x80000008);
15281 - c->x86_virt_bits = (eax >> 8) & 0xff;
15282 - c->x86_phys_bits = eax & 0xff;
15283 - }
15284 -
15285 - if (c->x86 == 0x6 && c->x86_model >= 0xf) {
15286 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15287 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15288 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15289 - }
15290 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15291 -}
15292 -
15293 -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
15294 -{
15295 - char *v = c->x86_vendor_id;
15296 -
15297 - if (!strcmp(v, "AuthenticAMD"))
15298 - c->x86_vendor = X86_VENDOR_AMD;
15299 - else if (!strcmp(v, "GenuineIntel"))
15300 - c->x86_vendor = X86_VENDOR_INTEL;
15301 - else if (!strcmp(v, "CentaurHauls"))
15302 - c->x86_vendor = X86_VENDOR_CENTAUR;
15303 - else
15304 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15305 -}
15306 -
15307 -/* Do some early cpuid on the boot CPU to get some parameter that are
15308 - needed before check_bugs. Everything advanced is in identify_cpu
15309 - below. */
15310 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
15311 -{
15312 - u32 tfms, xlvl;
15313 -
15314 - c->loops_per_jiffy = loops_per_jiffy;
15315 - c->x86_cache_size = -1;
15316 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15317 - c->x86_model = c->x86_mask = 0; /* So far unknown... */
15318 - c->x86_vendor_id[0] = '\0'; /* Unset */
15319 - c->x86_model_id[0] = '\0'; /* Unset */
15320 - c->x86_clflush_size = 64;
15321 - c->x86_cache_alignment = c->x86_clflush_size;
15322 - c->x86_max_cores = 1;
15323 - c->x86_coreid_bits = 0;
15324 - c->extended_cpuid_level = 0;
15325 - memset(&c->x86_capability, 0, sizeof c->x86_capability);
15326 -
15327 - /* Get vendor name */
15328 - cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
15329 - (unsigned int *)&c->x86_vendor_id[0],
15330 - (unsigned int *)&c->x86_vendor_id[8],
15331 - (unsigned int *)&c->x86_vendor_id[4]);
15332 -
15333 - get_cpu_vendor(c);
15334 -
15335 - /* Initialize the standard set of capabilities */
15336 - /* Note that the vendor-specific code below might override */
15337 -
15338 - /* Intel-defined flags: level 0x00000001 */
15339 - if (c->cpuid_level >= 0x00000001) {
15340 - __u32 misc;
15341 - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
15342 - &c->x86_capability[0]);
15343 - c->x86 = (tfms >> 8) & 0xf;
15344 - c->x86_model = (tfms >> 4) & 0xf;
15345 - c->x86_mask = tfms & 0xf;
15346 - if (c->x86 == 0xf)
15347 - c->x86 += (tfms >> 20) & 0xff;
15348 - if (c->x86 >= 0x6)
15349 - c->x86_model += ((tfms >> 16) & 0xF) << 4;
15350 - if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
15351 - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
15352 - } else {
15353 - /* Have CPUID level 0 only - unheard of */
15354 - c->x86 = 4;
15355 - }
15356 -
15357 - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
15358 -#ifdef CONFIG_SMP
15359 - c->phys_proc_id = c->initial_apicid;
15360 -#endif
15361 - /* AMD-defined flags: level 0x80000001 */
15362 - xlvl = cpuid_eax(0x80000000);
15363 - c->extended_cpuid_level = xlvl;
15364 - if ((xlvl & 0xffff0000) == 0x80000000) {
15365 - if (xlvl >= 0x80000001) {
15366 - c->x86_capability[1] = cpuid_edx(0x80000001);
15367 - c->x86_capability[6] = cpuid_ecx(0x80000001);
15368 - }
15369 - if (xlvl >= 0x80000004)
15370 - get_model_name(c); /* Default name */
15371 - }
15372 -
15373 - /* Transmeta-defined flags: level 0x80860001 */
15374 - xlvl = cpuid_eax(0x80860000);
15375 - if ((xlvl & 0xffff0000) == 0x80860000) {
15376 - /* Don't set x86_cpuid_level here for now to not confuse. */
15377 - if (xlvl >= 0x80860001)
15378 - c->x86_capability[2] = cpuid_edx(0x80860001);
15379 - }
15380 -
15381 - c->extended_cpuid_level = cpuid_eax(0x80000000);
15382 - if (c->extended_cpuid_level >= 0x80000007)
15383 - c->x86_power = cpuid_edx(0x80000007);
15384 -
15385 - switch (c->x86_vendor) {
15386 - case X86_VENDOR_AMD:
15387 - early_init_amd(c);
15388 - break;
15389 - case X86_VENDOR_INTEL:
15390 - early_init_intel(c);
15391 - break;
15392 - case X86_VENDOR_CENTAUR:
15393 - early_init_centaur(c);
15394 - break;
15395 - }
15396 -
15397 - validate_pat_support(c);
15398 -}
15399 -
15400 -/*
15401 - * This does the hard work of actually picking apart the CPU stuff...
15402 - */
15403 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
15404 -{
15405 - int i;
15406 -
15407 - early_identify_cpu(c);
15408 -
15409 - init_scattered_cpuid_features(c);
15410 -
15411 - c->apicid = phys_pkg_id(0);
15412 -
15413 - /*
15414 - * Vendor-specific initialization. In this section we
15415 - * canonicalize the feature flags, meaning if there are
15416 - * features a certain CPU supports which CPUID doesn't
15417 - * tell us, CPUID claiming incorrect flags, or other bugs,
15418 - * we handle them here.
15419 - *
15420 - * At the end of this section, c->x86_capability better
15421 - * indicate the features this CPU genuinely supports!
15422 - */
15423 - switch (c->x86_vendor) {
15424 - case X86_VENDOR_AMD:
15425 - init_amd(c);
15426 - break;
15427 -
15428 - case X86_VENDOR_INTEL:
15429 - init_intel(c);
15430 - break;
15431 -
15432 - case X86_VENDOR_CENTAUR:
15433 - init_centaur(c);
15434 - break;
15435 -
15436 - case X86_VENDOR_UNKNOWN:
15437 - default:
15438 - display_cacheinfo(c);
15439 - break;
15440 - }
15441 -
15442 - detect_ht(c);
15443 -
15444 - /*
15445 - * On SMP, boot_cpu_data holds the common feature set between
15446 - * all CPUs; so make sure that we indicate which features are
15447 - * common between the CPUs. The first time this routine gets
15448 - * executed, c == &boot_cpu_data.
15449 - */
15450 - if (c != &boot_cpu_data) {
15451 - /* AND the already accumulated flags with these */
15452 - for (i = 0; i < NCAPINTS; i++)
15453 - boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
15454 - }
15455 -
15456 - /* Clear all flags overriden by options */
15457 - for (i = 0; i < NCAPINTS; i++)
15458 - c->x86_capability[i] &= ~cleared_cpu_caps[i];
15459 -
15460 -#ifdef CONFIG_X86_MCE
15461 - mcheck_init(c);
15462 -#endif
15463 - select_idle_routine(c);
15464 -
15465 -#ifdef CONFIG_NUMA
15466 - numa_add_cpu(smp_processor_id());
15467 -#endif
15468 -
15469 -}
15470 -
15471 -void __cpuinit identify_boot_cpu(void)
15472 -{
15473 - identify_cpu(&boot_cpu_data);
15474 -}
15475 -
15476 -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
15477 -{
15478 - BUG_ON(c == &boot_cpu_data);
15479 - identify_cpu(c);
15480 - mtrr_ap_init();
15481 -}
15482 -
15483 -static __init int setup_noclflush(char *arg)
15484 -{
15485 - setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
15486 - return 1;
15487 -}
15488 -__setup("noclflush", setup_noclflush);
15489 -
15490 -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
15491 -{
15492 - if (c->x86_model_id[0])
15493 - printk(KERN_CONT "%s", c->x86_model_id);
15494 -
15495 - if (c->x86_mask || c->cpuid_level >= 0)
15496 - printk(KERN_CONT " stepping %02x\n", c->x86_mask);
15497 - else
15498 - printk(KERN_CONT "\n");
15499 -}
15500 -
15501 -static __init int setup_disablecpuid(char *arg)
15502 -{
15503 - int bit;
15504 - if (get_option(&arg, &bit) && bit < NCAPINTS*32)
15505 - setup_clear_cpu_cap(bit);
15506 - else
15507 - return 0;
15508 - return 1;
15509 -}
15510 -__setup("clearcpuid=", setup_disablecpuid);
15511 Index: head-2008-12-01/arch/x86/kernel/setup_percpu-xen.c
15512 ===================================================================
15513 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15514 +++ head-2008-12-01/arch/x86/kernel/setup_percpu-xen.c 2008-12-01 11:49:07.000000000 +0100
15515 @@ -0,0 +1,385 @@
15516 +#include <linux/kernel.h>
15517 +#include <linux/module.h>
15518 +#include <linux/init.h>
15519 +#include <linux/bootmem.h>
15520 +#include <linux/percpu.h>
15521 +#include <linux/kexec.h>
15522 +#include <linux/crash_dump.h>
15523 +#include <asm/smp.h>
15524 +#include <asm/percpu.h>
15525 +#include <asm/sections.h>
15526 +#include <asm/processor.h>
15527 +#include <asm/setup.h>
15528 +#include <asm/topology.h>
15529 +#include <asm/mpspec.h>
15530 +#include <asm/apicdef.h>
15531 +#include <asm/highmem.h>
15532 +
15533 +#ifdef CONFIG_X86_LOCAL_APIC
15534 +unsigned int num_processors;
15535 +unsigned disabled_cpus __cpuinitdata;
15536 +/* Processor that is doing the boot up */
15537 +unsigned int boot_cpu_physical_apicid = -1U;
15538 +unsigned int max_physical_apicid;
15539 +EXPORT_SYMBOL(boot_cpu_physical_apicid);
15540 +
15541 +/* Bitmask of physically existing CPUs */
15542 +physid_mask_t phys_cpu_present_map;
15543 +#endif
15544 +
15545 +/* map cpu index to physical APIC ID */
15546 +#ifndef CONFIG_XEN
15547 +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
15548 +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
15549 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15550 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
15551 +#else
15552 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
15553 +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15554 +#endif
15555 +
15556 +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
15557 +#define X86_64_NUMA 1
15558 +
15559 +/* map cpu index to node index */
15560 +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
15561 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
15562 +
15563 +/* which logical CPUs are on which nodes */
15564 +cpumask_t *node_to_cpumask_map;
15565 +EXPORT_SYMBOL(node_to_cpumask_map);
15566 +
15567 +/* setup node_to_cpumask_map */
15568 +static void __init setup_node_to_cpumask_map(void);
15569 +
15570 +#else
15571 +static inline void setup_node_to_cpumask_map(void) { }
15572 +#endif
15573 +
15574 +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
15575 +/*
15576 + * Copy data used in early init routines from the initial arrays to the
15577 + * per cpu data areas. These arrays then become expendable and the
15578 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
15579 + */
15580 +static void __init setup_per_cpu_maps(void)
15581 +{
15582 +#ifndef CONFIG_XEN
15583 + int cpu;
15584 +
15585 + for_each_possible_cpu(cpu) {
15586 + per_cpu(x86_cpu_to_apicid, cpu) =
15587 + early_per_cpu_map(x86_cpu_to_apicid, cpu);
15588 + per_cpu(x86_bios_cpu_apicid, cpu) =
15589 + early_per_cpu_map(x86_bios_cpu_apicid, cpu);
15590 +#ifdef X86_64_NUMA
15591 + per_cpu(x86_cpu_to_node_map, cpu) =
15592 + early_per_cpu_map(x86_cpu_to_node_map, cpu);
15593 +#endif
15594 + }
15595 +
15596 + /* indicate the early static arrays will soon be gone */
15597 + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
15598 + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
15599 +#ifdef X86_64_NUMA
15600 + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
15601 +#endif
15602 +#endif
15603 +}
15604 +
15605 +#ifdef CONFIG_X86_32
15606 +/*
15607 + * Great future not-so-futuristic plan: make i386 and x86_64 do it
15608 + * the same way
15609 + */
15610 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
15611 +EXPORT_SYMBOL(__per_cpu_offset);
15612 +static inline void setup_cpu_pda_map(void) { }
15613 +
15614 +#elif !defined(CONFIG_SMP)
15615 +static inline void setup_cpu_pda_map(void) { }
15616 +
15617 +#else /* CONFIG_SMP && CONFIG_X86_64 */
15618 +
15619 +/*
15620 + * Allocate cpu_pda pointer table and array via alloc_bootmem.
15621 + */
15622 +static void __init setup_cpu_pda_map(void)
15623 +{
15624 + char *pda;
15625 + struct x8664_pda **new_cpu_pda;
15626 + unsigned long size;
15627 + int cpu;
15628 +
15629 + size = roundup(sizeof(struct x8664_pda), cache_line_size());
15630 +
15631 + /* allocate cpu_pda array and pointer table */
15632 + {
15633 + unsigned long tsize = nr_cpu_ids * sizeof(void *);
15634 + unsigned long asize = size * (nr_cpu_ids - 1);
15635 +
15636 + tsize = roundup(tsize, cache_line_size());
15637 + new_cpu_pda = alloc_bootmem(tsize + asize);
15638 + pda = (char *)new_cpu_pda + tsize;
15639 + }
15640 +
15641 + /* initialize pointer table to static pda's */
15642 + for_each_possible_cpu(cpu) {
15643 + if (cpu == 0) {
15644 + /* leave boot cpu pda in place */
15645 + new_cpu_pda[0] = cpu_pda(0);
15646 + continue;
15647 + }
15648 + new_cpu_pda[cpu] = (struct x8664_pda *)pda;
15649 + new_cpu_pda[cpu]->in_bootmem = 1;
15650 + pda += size;
15651 + }
15652 +
15653 + /* point to new pointer table */
15654 + _cpu_pda = new_cpu_pda;
15655 +}
15656 +#endif
15657 +
15658 +/*
15659 + * Great future plan:
15660 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
15661 + * Always point %gs to its beginning
15662 + */
15663 +void __init setup_per_cpu_areas(void)
15664 +{
15665 + ssize_t size = PERCPU_ENOUGH_ROOM;
15666 + char *ptr;
15667 + int cpu;
15668 +
15669 + /* Setup cpu_pda map */
15670 + setup_cpu_pda_map();
15671 +
15672 + /* Copy section for each CPU (we discard the original) */
15673 + size = PERCPU_ENOUGH_ROOM;
15674 + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
15675 + size);
15676 +
15677 + for_each_possible_cpu(cpu) {
15678 +#ifndef CONFIG_NEED_MULTIPLE_NODES
15679 + ptr = alloc_bootmem_pages(size);
15680 +#else
15681 + int node = early_cpu_to_node(cpu);
15682 + if (!node_online(node) || !NODE_DATA(node)) {
15683 + ptr = alloc_bootmem_pages(size);
15684 + printk(KERN_INFO
15685 + "cpu %d has no node %d or node-local memory\n",
15686 + cpu, node);
15687 + }
15688 + else
15689 + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
15690 +#endif
15691 + per_cpu_offset(cpu) = ptr - __per_cpu_start;
15692 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
15693 +
15694 + }
15695 +
15696 + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
15697 + NR_CPUS, nr_cpu_ids, nr_node_ids);
15698 +
15699 + /* Setup percpu data maps */
15700 + setup_per_cpu_maps();
15701 +
15702 + /* Setup node to cpumask map */
15703 + setup_node_to_cpumask_map();
15704 +}
15705 +
15706 +#endif
15707 +
15708 +#ifdef X86_64_NUMA
15709 +
15710 +/*
15711 + * Allocate node_to_cpumask_map based on number of available nodes
15712 + * Requires node_possible_map to be valid.
15713 + *
15714 + * Note: node_to_cpumask() is not valid until after this is done.
15715 + */
15716 +static void __init setup_node_to_cpumask_map(void)
15717 +{
15718 + unsigned int node, num = 0;
15719 + cpumask_t *map;
15720 +
15721 + /* setup nr_node_ids if not done yet */
15722 + if (nr_node_ids == MAX_NUMNODES) {
15723 + for_each_node_mask(node, node_possible_map)
15724 + num = node;
15725 + nr_node_ids = num + 1;
15726 + }
15727 +
15728 + /* allocate the map */
15729 + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
15730 +
15731 + pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
15732 + map, nr_node_ids);
15733 +
15734 + /* node_to_cpumask() will now work */
15735 + node_to_cpumask_map = map;
15736 +}
15737 +
15738 +void __cpuinit numa_set_node(int cpu, int node)
15739 +{
15740 + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
15741 +
15742 + if (cpu_pda(cpu) && node != NUMA_NO_NODE)
15743 + cpu_pda(cpu)->nodenumber = node;
15744 +
15745 + if (cpu_to_node_map)
15746 + cpu_to_node_map[cpu] = node;
15747 +
15748 + else if (per_cpu_offset(cpu))
15749 + per_cpu(x86_cpu_to_node_map, cpu) = node;
15750 +
15751 + else
15752 + pr_debug("Setting node for non-present cpu %d\n", cpu);
15753 +}
15754 +
15755 +void __cpuinit numa_clear_node(int cpu)
15756 +{
15757 + numa_set_node(cpu, NUMA_NO_NODE);
15758 +}
15759 +
15760 +#ifndef CONFIG_DEBUG_PER_CPU_MAPS
15761 +
15762 +void __cpuinit numa_add_cpu(int cpu)
15763 +{
15764 + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
15765 +}
15766 +
15767 +void __cpuinit numa_remove_cpu(int cpu)
15768 +{
15769 + cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
15770 +}
15771 +
15772 +#else /* CONFIG_DEBUG_PER_CPU_MAPS */
15773 +
15774 +/*
15775 + * --------- debug versions of the numa functions ---------
15776 + */
15777 +static void __cpuinit numa_set_cpumask(int cpu, int enable)
15778 +{
15779 + int node = cpu_to_node(cpu);
15780 + cpumask_t *mask;
15781 + char buf[64];
15782 +
15783 + if (node_to_cpumask_map == NULL) {
15784 + printk(KERN_ERR "node_to_cpumask_map NULL\n");
15785 + dump_stack();
15786 + return;
15787 + }
15788 +
15789 + mask = &node_to_cpumask_map[node];
15790 + if (enable)
15791 + cpu_set(cpu, *mask);
15792 + else
15793 + cpu_clear(cpu, *mask);
15794 +
15795 + cpulist_scnprintf(buf, sizeof(buf), *mask);
15796 + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
15797 + enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
15798 + }
15799 +
15800 +void __cpuinit numa_add_cpu(int cpu)
15801 +{
15802 + numa_set_cpumask(cpu, 1);
15803 +}
15804 +
15805 +void __cpuinit numa_remove_cpu(int cpu)
15806 +{
15807 + numa_set_cpumask(cpu, 0);
15808 +}
15809 +
15810 +int cpu_to_node(int cpu)
15811 +{
15812 + if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
15813 + printk(KERN_WARNING
15814 + "cpu_to_node(%d): usage too early!\n", cpu);
15815 + dump_stack();
15816 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15817 + }
15818 + return per_cpu(x86_cpu_to_node_map, cpu);
15819 +}
15820 +EXPORT_SYMBOL(cpu_to_node);
15821 +
15822 +/*
15823 + * Same function as cpu_to_node() but used if called before the
15824 + * per_cpu areas are setup.
15825 + */
15826 +int early_cpu_to_node(int cpu)
15827 +{
15828 + if (early_per_cpu_ptr(x86_cpu_to_node_map))
15829 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15830 +
15831 + if (!per_cpu_offset(cpu)) {
15832 + printk(KERN_WARNING
15833 + "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
15834 + dump_stack();
15835 + return NUMA_NO_NODE;
15836 + }
15837 + return per_cpu(x86_cpu_to_node_map, cpu);
15838 +}
15839 +
15840 +
15841 +/* empty cpumask */
15842 +static const cpumask_t cpu_mask_none;
15843 +
15844 +/*
15845 + * Returns a pointer to the bitmask of CPUs on Node 'node'.
15846 + */
15847 +const cpumask_t *_node_to_cpumask_ptr(int node)
15848 +{
15849 + if (node_to_cpumask_map == NULL) {
15850 + printk(KERN_WARNING
15851 + "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
15852 + node);
15853 + dump_stack();
15854 + return (const cpumask_t *)&cpu_online_map;
15855 + }
15856 + if (node >= nr_node_ids) {
15857 + printk(KERN_WARNING
15858 + "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
15859 + node, nr_node_ids);
15860 + dump_stack();
15861 + return &cpu_mask_none;
15862 + }
15863 + return &node_to_cpumask_map[node];
15864 +}
15865 +EXPORT_SYMBOL(_node_to_cpumask_ptr);
15866 +
15867 +/*
15868 + * Returns a bitmask of CPUs on Node 'node'.
15869 + *
15870 + * Side note: this function creates the returned cpumask on the stack
15871 + * so with a high NR_CPUS count, excessive stack space is used. The
15872 + * node_to_cpumask_ptr function should be used whenever possible.
15873 + */
15874 +cpumask_t node_to_cpumask(int node)
15875 +{
15876 + if (node_to_cpumask_map == NULL) {
15877 + printk(KERN_WARNING
15878 + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
15879 + dump_stack();
15880 + return cpu_online_map;
15881 + }
15882 + if (node >= nr_node_ids) {
15883 + printk(KERN_WARNING
15884 + "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
15885 + node, nr_node_ids);
15886 + dump_stack();
15887 + return cpu_mask_none;
15888 + }
15889 + return node_to_cpumask_map[node];
15890 +}
15891 +EXPORT_SYMBOL(node_to_cpumask);
15892 +
15893 +/*
15894 + * --------- end of debug versions of the numa functions ---------
15895 + */
15896 +
15897 +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
15898 +
15899 +#endif /* X86_64_NUMA */
15900 +
15901 Index: head-2008-12-01/arch/x86/kernel/smp-xen.c
15902 ===================================================================
15903 --- head-2008-12-01.orig/arch/x86/kernel/smp-xen.c 2008-12-01 11:44:55.000000000 +0100
15904 +++ head-2008-12-01/arch/x86/kernel/smp-xen.c 2008-12-01 11:49:07.000000000 +0100
15905 @@ -121,132 +121,14 @@ void xen_smp_send_reschedule(int cpu)
15906 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
15907 }
15908
15909 -/*
15910 - * Structure and data for smp_call_function(). This is designed to minimise
15911 - * static memory requirements. It also looks cleaner.
15912 - */
15913 -static DEFINE_SPINLOCK(call_lock);
15914 -
15915 -struct call_data_struct {
15916 - void (*func) (void *info);
15917 - void *info;
15918 - atomic_t started;
15919 - atomic_t finished;
15920 - int wait;
15921 -};
15922 -
15923 -void lock_ipi_call_lock(void)
15924 +void xen_send_call_func_single_ipi(int cpu)
15925 {
15926 - spin_lock_irq(&call_lock);
15927 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR);
15928 }
15929
15930 -void unlock_ipi_call_lock(void)
15931 +void xen_send_call_func_ipi(cpumask_t mask)
15932 {
15933 - spin_unlock_irq(&call_lock);
15934 -}
15935 -
15936 -static struct call_data_struct *call_data;
15937 -
15938 -static void __smp_call_function(void (*func) (void *info), void *info,
15939 - int nonatomic, int wait)
15940 -{
15941 - struct call_data_struct data;
15942 - int cpus = num_online_cpus() - 1;
15943 -
15944 - if (!cpus)
15945 - return;
15946 -
15947 - data.func = func;
15948 - data.info = info;
15949 - atomic_set(&data.started, 0);
15950 - data.wait = wait;
15951 - if (wait)
15952 - atomic_set(&data.finished, 0);
15953 -
15954 - call_data = &data;
15955 - mb();
15956 -
15957 - /* Send a message to all other CPUs and wait for them to respond */
15958 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
15959 -
15960 - /* Wait for response */
15961 - while (atomic_read(&data.started) != cpus)
15962 - cpu_relax();
15963 -
15964 - if (wait)
15965 - while (atomic_read(&data.finished) != cpus)
15966 - cpu_relax();
15967 -}
15968 -
15969 -
15970 -/**
15971 - * smp_call_function_mask(): Run a function on a set of other CPUs.
15972 - * @mask: The set of cpus to run on. Must not include the current cpu.
15973 - * @func: The function to run. This must be fast and non-blocking.
15974 - * @info: An arbitrary pointer to pass to the function.
15975 - * @wait: If true, wait (atomically) until function has completed on other CPUs.
15976 - *
15977 - * Returns 0 on success, else a negative status code.
15978 - *
15979 - * If @wait is true, then returns once @func has returned; otherwise
15980 - * it returns just before the target cpu calls @func.
15981 - *
15982 - * You must not call this function with disabled interrupts or from a
15983 - * hardware interrupt handler or from a bottom half handler.
15984 - */
15985 -int
15986 -xen_smp_call_function_mask(cpumask_t mask,
15987 - void (*func)(void *), void *info,
15988 - int wait)
15989 -{
15990 - struct call_data_struct data;
15991 - cpumask_t allbutself;
15992 - int cpus;
15993 -
15994 - /* Can deadlock when called with interrupts disabled */
15995 - WARN_ON(irqs_disabled());
15996 -
15997 - /* Holding any lock stops cpus from going down. */
15998 - spin_lock(&call_lock);
15999 -
16000 - allbutself = cpu_online_map;
16001 - cpu_clear(smp_processor_id(), allbutself);
16002 -
16003 - cpus_and(mask, mask, allbutself);
16004 - cpus = cpus_weight(mask);
16005 -
16006 - if (!cpus) {
16007 - spin_unlock(&call_lock);
16008 - return 0;
16009 - }
16010 -
16011 - data.func = func;
16012 - data.info = info;
16013 - atomic_set(&data.started, 0);
16014 - data.wait = wait;
16015 - if (wait)
16016 - atomic_set(&data.finished, 0);
16017 -
16018 - call_data = &data;
16019 - wmb();
16020 -
16021 - /* Send a message to other CPUs */
16022 - if (cpus_equal(mask, allbutself) &&
16023 - cpus_equal(cpu_online_map, cpu_callout_map))
16024 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
16025 - else
16026 - send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16027 -
16028 - /* Wait for response */
16029 - while (atomic_read(&data.started) != cpus)
16030 - cpu_relax();
16031 -
16032 - if (wait)
16033 - while (atomic_read(&data.finished) != cpus)
16034 - cpu_relax();
16035 - spin_unlock(&call_lock);
16036 -
16037 - return 0;
16038 + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16039 }
16040
16041 static void stop_this_cpu(void *dummy)
16042 @@ -268,15 +150,10 @@ static void stop_this_cpu(void *dummy)
16043
16044 void xen_smp_send_stop(void)
16045 {
16046 - int nolock;
16047 unsigned long flags;
16048
16049 - /* Don't deadlock on the call lock in panic */
16050 - nolock = !spin_trylock(&call_lock);
16051 + smp_call_function(stop_this_cpu, NULL, 0);
16052 local_irq_save(flags);
16053 - __smp_call_function(stop_this_cpu, NULL, 0, 0);
16054 - if (!nolock)
16055 - spin_unlock(&call_lock);
16056 disable_all_local_evtchn();
16057 local_irq_restore(flags);
16058 }
16059 @@ -298,21 +175,8 @@ irqreturn_t smp_reschedule_interrupt(int
16060
16061 irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
16062 {
16063 - void (*func) (void *info) = call_data->func;
16064 - void *info = call_data->info;
16065 - int wait = call_data->wait;
16066 -
16067 - /*
16068 - * Notify initiating CPU that I've grabbed the data and am
16069 - * about to execute the function
16070 - */
16071 - mb();
16072 - atomic_inc(&call_data->started);
16073 - /*
16074 - * At this point the info structure may be out of scope unless wait==1
16075 - */
16076 irq_enter();
16077 - (*func)(info);
16078 + generic_smp_call_function_interrupt();
16079 #ifdef CONFIG_X86_32
16080 __get_cpu_var(irq_stat).irq_call_count++;
16081 #else
16082 @@ -320,10 +184,19 @@ irqreturn_t smp_call_function_interrupt(
16083 #endif
16084 irq_exit();
16085
16086 - if (wait) {
16087 - mb();
16088 - atomic_inc(&call_data->finished);
16089 - }
16090 + return IRQ_HANDLED;
16091 +}
16092 +
16093 +irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id)
16094 +{
16095 + irq_enter();
16096 + generic_smp_call_function_single_interrupt();
16097 +#ifdef CONFIG_X86_32
16098 + __get_cpu_var(irq_stat).irq_call_count++;
16099 +#else
16100 + add_pda(irq_call_count, 1);
16101 +#endif
16102 + irq_exit();
16103
16104 return IRQ_HANDLED;
16105 }
16106 Index: head-2008-12-01/arch/x86/kernel/time_32-xen.c
16107 ===================================================================
16108 --- head-2008-12-01.orig/arch/x86/kernel/time_32-xen.c 2008-12-01 11:44:55.000000000 +0100
16109 +++ head-2008-12-01/arch/x86/kernel/time_32-xen.c 2008-12-01 11:58:30.000000000 +0100
16110 @@ -470,7 +470,7 @@ irqreturn_t timer_interrupt(int irq, voi
16111
16112 /* Keep nmi watchdog up to date */
16113 #ifdef __i386__
16114 - per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
16115 + x86_add_percpu(irq_stat.irq0_irqs, 1);
16116 #else
16117 add_pda(irq0_irqs, 1);
16118 #endif
16119 @@ -748,9 +748,7 @@ void __init time_init(void)
16120
16121 update_wallclock();
16122
16123 -#ifndef CONFIG_X86_64
16124 use_tsc_delay();
16125 -#endif
16126
16127 /* Cannot request_irq() until kmem is initialised. */
16128 late_time_init = setup_cpu0_timer_irq;
16129 @@ -807,7 +805,8 @@ static void stop_hz_timer(void)
16130
16131 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
16132 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
16133 - (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
16134 + (j = get_next_timer_interrupt(jiffies),
16135 + time_before_eq(j, jiffies))) {
16136 cpu_clear(cpu, nohz_cpu_mask);
16137 j = jiffies + 1;
16138 }
16139 Index: head-2008-12-01/arch/x86/kernel/traps_32-xen.c
16140 ===================================================================
16141 --- head-2008-12-01.orig/arch/x86/kernel/traps_32-xen.c 2008-12-01 11:44:55.000000000 +0100
16142 +++ head-2008-12-01/arch/x86/kernel/traps_32-xen.c 2008-12-01 11:49:07.000000000 +0100
16143 @@ -1,5 +1,6 @@
16144 /*
16145 * Copyright (C) 1991, 1992 Linus Torvalds
16146 + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
16147 *
16148 * Pentium III FXSR, SSE support
16149 * Gareth Hughes <gareth@valinux.com>, May 2000
16150 @@ -57,11 +58,10 @@
16151 #include <asm/nmi.h>
16152 #include <asm/smp.h>
16153 #include <asm/io.h>
16154 +#include <asm/traps.h>
16155
16156 #include "mach_traps.h"
16157
16158 -int panic_on_unrecovered_nmi;
16159 -
16160 #ifndef CONFIG_XEN
16161 DECLARE_BITMAP(used_vectors, NR_VECTORS);
16162 EXPORT_SYMBOL_GPL(used_vectors);
16163 @@ -82,43 +82,22 @@ gate_desc idt_table[256]
16164 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
16165 #endif
16166
16167 -asmlinkage void divide_error(void);
16168 -asmlinkage void debug(void);
16169 -asmlinkage void nmi(void);
16170 -asmlinkage void int3(void);
16171 -asmlinkage void overflow(void);
16172 -asmlinkage void bounds(void);
16173 -asmlinkage void invalid_op(void);
16174 -asmlinkage void device_not_available(void);
16175 -asmlinkage void coprocessor_segment_overrun(void);
16176 -asmlinkage void invalid_TSS(void);
16177 -asmlinkage void segment_not_present(void);
16178 -asmlinkage void stack_segment(void);
16179 -asmlinkage void general_protection(void);
16180 -asmlinkage void page_fault(void);
16181 -asmlinkage void coprocessor_error(void);
16182 -asmlinkage void simd_coprocessor_error(void);
16183 -asmlinkage void alignment_check(void);
16184 -#ifndef CONFIG_XEN
16185 -asmlinkage void spurious_interrupt_bug(void);
16186 -#else
16187 -asmlinkage void fixup_4gb_segment(void);
16188 -#endif
16189 -asmlinkage void machine_check(void);
16190 -
16191 +int panic_on_unrecovered_nmi;
16192 int kstack_depth_to_print = 24;
16193 static unsigned int code_bytes = 64;
16194 +static int ignore_nmis;
16195 +static int die_counter;
16196
16197 void printk_address(unsigned long address, int reliable)
16198 {
16199 #ifdef CONFIG_KALLSYMS
16200 - char namebuf[KSYM_NAME_LEN];
16201 unsigned long offset = 0;
16202 unsigned long symsize;
16203 const char *symname;
16204 - char reliab[4] = "";
16205 - char *delim = ":";
16206 char *modname;
16207 + char *delim = ":";
16208 + char namebuf[KSYM_NAME_LEN];
16209 + char reliab[4] = "";
16210
16211 symname = kallsyms_lookup(address, &symsize, &offset,
16212 &modname, namebuf);
16213 @@ -138,22 +117,23 @@ void printk_address(unsigned long addres
16214 #endif
16215 }
16216
16217 -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
16218 +static inline int valid_stack_ptr(struct thread_info *tinfo,
16219 + void *p, unsigned int size)
16220 {
16221 - return p > (void *)tinfo &&
16222 - p <= (void *)tinfo + THREAD_SIZE - size;
16223 + void *t = tinfo;
16224 + return p > t && p <= t + THREAD_SIZE - size;
16225 }
16226
16227 /* The form of the top of the frame on the stack */
16228 struct stack_frame {
16229 - struct stack_frame *next_frame;
16230 - unsigned long return_address;
16231 + struct stack_frame *next_frame;
16232 + unsigned long return_address;
16233 };
16234
16235 static inline unsigned long
16236 print_context_stack(struct thread_info *tinfo,
16237 - unsigned long *stack, unsigned long bp,
16238 - const struct stacktrace_ops *ops, void *data)
16239 + unsigned long *stack, unsigned long bp,
16240 + const struct stacktrace_ops *ops, void *data)
16241 {
16242 struct stack_frame *frame = (struct stack_frame *)bp;
16243
16244 @@ -175,8 +155,6 @@ print_context_stack(struct thread_info *
16245 return bp;
16246 }
16247
16248 -#define MSG(msg) ops->warning(data, msg)
16249 -
16250 void dump_trace(struct task_struct *task, struct pt_regs *regs,
16251 unsigned long *stack, unsigned long bp,
16252 const struct stacktrace_ops *ops, void *data)
16253 @@ -186,7 +164,6 @@ void dump_trace(struct task_struct *task
16254
16255 if (!stack) {
16256 unsigned long dummy;
16257 -
16258 stack = &dummy;
16259 if (task != current)
16260 stack = (unsigned long *)task->thread.sp;
16261 @@ -204,7 +181,7 @@ void dump_trace(struct task_struct *task
16262 }
16263 #endif
16264
16265 - while (1) {
16266 + for (;;) {
16267 struct thread_info *context;
16268
16269 context = (struct thread_info *)
16270 @@ -256,15 +233,15 @@ static void print_trace_address(void *da
16271 }
16272
16273 static const struct stacktrace_ops print_trace_ops = {
16274 - .warning = print_trace_warning,
16275 - .warning_symbol = print_trace_warning_symbol,
16276 - .stack = print_trace_stack,
16277 - .address = print_trace_address,
16278 + .warning = print_trace_warning,
16279 + .warning_symbol = print_trace_warning_symbol,
16280 + .stack = print_trace_stack,
16281 + .address = print_trace_address,
16282 };
16283
16284 static void
16285 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16286 - unsigned long *stack, unsigned long bp, char *log_lvl)
16287 + unsigned long *stack, unsigned long bp, char *log_lvl)
16288 {
16289 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16290 printk("%s =======================\n", log_lvl);
16291 @@ -359,15 +336,14 @@ void show_registers(struct pt_regs *regs
16292 printk(KERN_EMERG "Code: ");
16293
16294 ip = (u8 *)regs->ip - code_prologue;
16295 - if (ip < (u8 *)PAGE_OFFSET ||
16296 - probe_kernel_address(ip, c)) {
16297 + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
16298 /* try starting at EIP */
16299 ip = (u8 *)regs->ip;
16300 code_len = code_len - code_prologue + 1;
16301 }
16302 for (i = 0; i < code_len; i++, ip++) {
16303 if (ip < (u8 *)PAGE_OFFSET ||
16304 - probe_kernel_address(ip, c)) {
16305 + probe_kernel_address(ip, c)) {
16306 printk(" Bad EIP value.");
16307 break;
16308 }
16309 @@ -392,7 +368,53 @@ int is_valid_bugaddr(unsigned long ip)
16310 return ud2 == 0x0b0f;
16311 }
16312
16313 -static int die_counter;
16314 +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
16315 +static int die_owner = -1;
16316 +static unsigned int die_nest_count;
16317 +
16318 +unsigned __kprobes long oops_begin(void)
16319 +{
16320 + unsigned long flags;
16321 +
16322 + oops_enter();
16323 +
16324 + if (die_owner != raw_smp_processor_id()) {
16325 + console_verbose();
16326 + raw_local_irq_save(flags);
16327 + __raw_spin_lock(&die_lock);
16328 + die_owner = smp_processor_id();
16329 + die_nest_count = 0;
16330 + bust_spinlocks(1);
16331 + } else {
16332 + raw_local_irq_save(flags);
16333 + }
16334 + die_nest_count++;
16335 + return flags;
16336 +}
16337 +
16338 +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
16339 +{
16340 + bust_spinlocks(0);
16341 + die_owner = -1;
16342 + add_taint(TAINT_DIE);
16343 + __raw_spin_unlock(&die_lock);
16344 + raw_local_irq_restore(flags);
16345 +
16346 + if (!regs)
16347 + return;
16348 +
16349 + if (kexec_should_crash(current))
16350 + crash_kexec(regs);
16351 +
16352 + if (in_interrupt())
16353 + panic("Fatal exception in interrupt");
16354 +
16355 + if (panic_on_oops)
16356 + panic("Fatal exception");
16357 +
16358 + oops_exit();
16359 + do_exit(signr);
16360 +}
16361
16362 int __kprobes __die(const char *str, struct pt_regs *regs, long err)
16363 {
16364 @@ -410,26 +432,22 @@ int __kprobes __die(const char *str, str
16365 printk("DEBUG_PAGEALLOC");
16366 #endif
16367 printk("\n");
16368 -
16369 if (notify_die(DIE_OOPS, str, regs, err,
16370 - current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
16371 + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
16372 + return 1;
16373
16374 - show_registers(regs);
16375 - /* Executive summary in case the oops scrolled away */
16376 - sp = (unsigned long) (&regs->sp);
16377 - savesegment(ss, ss);
16378 - if (user_mode(regs)) {
16379 - sp = regs->sp;
16380 - ss = regs->ss & 0xffff;
16381 - }
16382 - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16383 - print_symbol("%s", regs->ip);
16384 - printk(" SS:ESP %04x:%08lx\n", ss, sp);
16385 -
16386 - return 0;
16387 - }
16388 -
16389 - return 1;
16390 + show_registers(regs);
16391 + /* Executive summary in case the oops scrolled away */
16392 + sp = (unsigned long) (&regs->sp);
16393 + savesegment(ss, ss);
16394 + if (user_mode(regs)) {
16395 + sp = regs->sp;
16396 + ss = regs->ss & 0xffff;
16397 + }
16398 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16399 + print_symbol("%s", regs->ip);
16400 + printk(" SS:ESP %04x:%08lx\n", ss, sp);
16401 + return 0;
16402 }
16403
16404 /*
16405 @@ -438,31 +456,9 @@ int __kprobes __die(const char *str, str
16406 */
16407 void die(const char *str, struct pt_regs *regs, long err)
16408 {
16409 - static struct {
16410 - raw_spinlock_t lock;
16411 - u32 lock_owner;
16412 - int lock_owner_depth;
16413 - } die = {
16414 - .lock = __RAW_SPIN_LOCK_UNLOCKED,
16415 - .lock_owner = -1,
16416 - .lock_owner_depth = 0
16417 - };
16418 - unsigned long flags;
16419 -
16420 - oops_enter();
16421 + unsigned long flags = oops_begin();
16422
16423 - if (die.lock_owner != raw_smp_processor_id()) {
16424 - console_verbose();
16425 - raw_local_irq_save(flags);
16426 - __raw_spin_lock(&die.lock);
16427 - die.lock_owner = smp_processor_id();
16428 - die.lock_owner_depth = 0;
16429 - bust_spinlocks(1);
16430 - } else {
16431 - raw_local_irq_save(flags);
16432 - }
16433 -
16434 - if (++die.lock_owner_depth < 3) {
16435 + if (die_nest_count < 3) {
16436 report_bug(regs->ip, regs);
16437
16438 if (__die(str, regs, err))
16439 @@ -471,26 +467,7 @@ void die(const char *str, struct pt_regs
16440 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
16441 }
16442
16443 - bust_spinlocks(0);
16444 - die.lock_owner = -1;
16445 - add_taint(TAINT_DIE);
16446 - __raw_spin_unlock(&die.lock);
16447 - raw_local_irq_restore(flags);
16448 -
16449 - if (!regs)
16450 - return;
16451 -
16452 - if (kexec_should_crash(current))
16453 - crash_kexec(regs);
16454 -
16455 - if (in_interrupt())
16456 - panic("Fatal exception in interrupt");
16457 -
16458 - if (panic_on_oops)
16459 - panic("Fatal exception");
16460 -
16461 - oops_exit();
16462 - do_exit(SIGSEGV);
16463 + oops_end(flags, regs, SIGSEGV);
16464 }
16465
16466 static inline void
16467 @@ -554,7 +531,7 @@ void do_##name(struct pt_regs *regs, lon
16468 { \
16469 trace_hardirqs_fixup(); \
16470 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16471 - == NOTIFY_STOP) \
16472 + == NOTIFY_STOP) \
16473 return; \
16474 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
16475 }
16476 @@ -570,7 +547,7 @@ void do_##name(struct pt_regs *regs, lon
16477 info.si_code = sicode; \
16478 info.si_addr = (void __user *)siaddr; \
16479 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16480 - == NOTIFY_STOP) \
16481 + == NOTIFY_STOP) \
16482 return; \
16483 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
16484 }
16485 @@ -579,7 +556,7 @@ void do_##name(struct pt_regs *regs, lon
16486 void do_##name(struct pt_regs *regs, long error_code) \
16487 { \
16488 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16489 - == NOTIFY_STOP) \
16490 + == NOTIFY_STOP) \
16491 return; \
16492 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
16493 }
16494 @@ -594,28 +571,29 @@ void do_##name(struct pt_regs *regs, lon
16495 info.si_addr = (void __user *)siaddr; \
16496 trace_hardirqs_fixup(); \
16497 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16498 - == NOTIFY_STOP) \
16499 + == NOTIFY_STOP) \
16500 return; \
16501 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
16502 }
16503
16504 -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16505 +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16506 #ifndef CONFIG_KPROBES
16507 DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
16508 #endif
16509 DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
16510 DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
16511 -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16512 -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16513 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16514 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16515 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
16516 -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16517 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16518 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16519 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16520 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
16521 DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
16522
16523 -void __kprobes do_general_protection(struct pt_regs * regs,
16524 - long error_code)
16525 +void __kprobes
16526 +do_general_protection(struct pt_regs *regs, long error_code)
16527 {
16528 + struct task_struct *tsk;
16529 struct thread_struct *thread;
16530
16531 thread = &current->thread;
16532 @@ -623,23 +601,24 @@ void __kprobes do_general_protection(str
16533 if (regs->flags & X86_VM_MASK)
16534 goto gp_in_vm86;
16535
16536 + tsk = current;
16537 if (!user_mode(regs))
16538 goto gp_in_kernel;
16539
16540 - current->thread.error_code = error_code;
16541 - current->thread.trap_no = 13;
16542 + tsk->thread.error_code = error_code;
16543 + tsk->thread.trap_no = 13;
16544
16545 - if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
16546 - printk_ratelimit()) {
16547 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
16548 + printk_ratelimit()) {
16549 printk(KERN_INFO
16550 - "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16551 - current->comm, task_pid_nr(current),
16552 - regs->ip, regs->sp, error_code);
16553 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16554 + tsk->comm, task_pid_nr(tsk),
16555 + regs->ip, regs->sp, error_code);
16556 print_vma_addr(" in ", regs->ip);
16557 printk("\n");
16558 }
16559
16560 - force_sig(SIGSEGV, current);
16561 + force_sig(SIGSEGV, tsk);
16562 return;
16563
16564 gp_in_vm86:
16565 @@ -648,14 +627,15 @@ gp_in_vm86:
16566 return;
16567
16568 gp_in_kernel:
16569 - if (!fixup_exception(regs)) {
16570 - current->thread.error_code = error_code;
16571 - current->thread.trap_no = 13;
16572 - if (notify_die(DIE_GPF, "general protection fault", regs,
16573 + if (fixup_exception(regs))
16574 + return;
16575 +
16576 + tsk->thread.error_code = error_code;
16577 + tsk->thread.trap_no = 13;
16578 + if (notify_die(DIE_GPF, "general protection fault", regs,
16579 error_code, 13, SIGSEGV) == NOTIFY_STOP)
16580 - return;
16581 - die("general protection fault", regs, error_code);
16582 - }
16583 + return;
16584 + die("general protection fault", regs, error_code);
16585 }
16586
16587 static notrace __kprobes void
16588 @@ -722,9 +702,9 @@ unknown_nmi_error(unsigned char reason,
16589
16590 static DEFINE_SPINLOCK(nmi_print_lock);
16591
16592 -void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
16593 +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
16594 {
16595 - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16596 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16597 return;
16598
16599 spin_lock(&nmi_print_lock);
16600 @@ -733,10 +713,12 @@ void notrace __kprobes die_nmi(struct pt
16601 * to get a message out:
16602 */
16603 bust_spinlocks(1);
16604 - printk(KERN_EMERG "%s", msg);
16605 + printk(KERN_EMERG "%s", str);
16606 printk(" on CPU%d, ip %08lx, registers:\n",
16607 smp_processor_id(), regs->ip);
16608 show_registers(regs);
16609 + if (do_panic)
16610 + panic("Non maskable interrupt");
16611 console_silent();
16612 spin_unlock(&nmi_print_lock);
16613 bust_spinlocks(0);
16614 @@ -756,14 +738,17 @@ void notrace __kprobes die_nmi(struct pt
16615 static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
16616 {
16617 unsigned char reason = 0;
16618 + int cpu;
16619
16620 - /* Only the BSP gets external NMIs from the system: */
16621 - if (!smp_processor_id())
16622 + cpu = smp_processor_id();
16623 +
16624 + /* Only the BSP gets external NMIs from the system. */
16625 + if (!cpu)
16626 reason = get_nmi_reason();
16627
16628 if (!(reason & 0xc0)) {
16629 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
16630 - == NOTIFY_STOP)
16631 + == NOTIFY_STOP)
16632 return;
16633 #ifdef CONFIG_X86_LOCAL_APIC
16634 /*
16635 @@ -772,7 +757,7 @@ static notrace __kprobes void default_do
16636 */
16637 if (nmi_watchdog_tick(regs, reason))
16638 return;
16639 - if (!do_nmi_callback(regs, smp_processor_id()))
16640 + if (!do_nmi_callback(regs, cpu))
16641 unknown_nmi_error(reason, regs);
16642 #else
16643 unknown_nmi_error(reason, regs);
16644 @@ -782,6 +767,8 @@ static notrace __kprobes void default_do
16645 }
16646 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
16647 return;
16648 +
16649 + /* AK: following checks seem to be broken on modern chipsets. FIXME */
16650 if (reason & 0x80)
16651 mem_parity_error(reason, regs);
16652 if (reason & 0x40)
16653 @@ -793,8 +780,6 @@ static notrace __kprobes void default_do
16654 reassert_nmi();
16655 }
16656
16657 -static int ignore_nmis;
16658 -
16659 notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
16660 {
16661 int cpu;
16662 @@ -879,7 +864,7 @@ void __kprobes do_debug(struct pt_regs *
16663 tsk->thread.debugctlmsr = 0;
16664
16665 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
16666 - SIGTRAP) == NOTIFY_STOP)
16667 + SIGTRAP) == NOTIFY_STOP)
16668 return;
16669 /* It's safe to allow irq's after DR6 has been saved */
16670 if (regs->flags & X86_EFLAGS_IF)
16671 @@ -940,9 +925,8 @@ clear_TF_reenable:
16672 void math_error(void __user *ip)
16673 {
16674 struct task_struct *task;
16675 - unsigned short cwd;
16676 - unsigned short swd;
16677 siginfo_t info;
16678 + unsigned short cwd, swd;
16679
16680 /*
16681 * Save the info for the exception handler and clear the error.
16682 @@ -961,7 +945,7 @@ void math_error(void __user *ip)
16683 * C1 reg you need in case of a stack fault, 0x040 is the stack
16684 * fault bit. We should only be taking one exception at a time,
16685 * so if this combination doesn't produce any single exception,
16686 - * then we have a bad program that isn't syncronizing its FPU usage
16687 + * then we have a bad program that isn't synchronizing its FPU usage
16688 * and it will suffer the consequences since we won't be able to
16689 * fully reproduce the context of the exception
16690 */
16691 @@ -970,7 +954,7 @@ void math_error(void __user *ip)
16692 switch (swd & ~cwd & 0x3f) {
16693 case 0x000: /* No unmasked exception */
16694 return;
16695 - default: /* Multiple exceptions */
16696 + default: /* Multiple exceptions */
16697 break;
16698 case 0x001: /* Invalid Op */
16699 /*
16700 @@ -1006,8 +990,8 @@ void do_coprocessor_error(struct pt_regs
16701 static void simd_math_error(void __user *ip)
16702 {
16703 struct task_struct *task;
16704 - unsigned short mxcsr;
16705 siginfo_t info;
16706 + unsigned short mxcsr;
16707
16708 /*
16709 * Save the info for the exception handler and clear the error.
16710 @@ -1084,7 +1068,7 @@ void do_spurious_interrupt_bug(struct pt
16711
16712 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
16713 {
16714 - struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
16715 + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
16716 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
16717 unsigned long new_kesp = kesp - base;
16718 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
16719 Index: head-2008-12-01/arch/x86/kernel/traps_64-xen.c
16720 ===================================================================
16721 --- head-2008-12-01.orig/arch/x86/kernel/traps_64-xen.c 2008-12-01 11:44:55.000000000 +0100
16722 +++ head-2008-12-01/arch/x86/kernel/traps_64-xen.c 2008-12-01 11:49:07.000000000 +0100
16723 @@ -10,73 +10,56 @@
16724 * 'Traps.c' handles hardware traps and faults after we have saved some
16725 * state in 'entry.S'.
16726 */
16727 -#include <linux/sched.h>
16728 +#include <linux/moduleparam.h>
16729 +#include <linux/interrupt.h>
16730 +#include <linux/kallsyms.h>
16731 +#include <linux/spinlock.h>
16732 +#include <linux/kprobes.h>
16733 +#include <linux/uaccess.h>
16734 +#include <linux/utsname.h>
16735 +#include <linux/kdebug.h>
16736 #include <linux/kernel.h>
16737 +#include <linux/module.h>
16738 +#include <linux/ptrace.h>
16739 #include <linux/string.h>
16740 +#include <linux/unwind.h>
16741 +#include <linux/delay.h>
16742 #include <linux/errno.h>
16743 -#include <linux/ptrace.h>
16744 +#include <linux/kexec.h>
16745 +#include <linux/sched.h>
16746 #include <linux/timer.h>
16747 -#include <linux/mm.h>
16748 #include <linux/init.h>
16749 -#include <linux/delay.h>
16750 -#include <linux/spinlock.h>
16751 -#include <linux/interrupt.h>
16752 -#include <linux/kallsyms.h>
16753 -#include <linux/module.h>
16754 -#include <linux/moduleparam.h>
16755 -#include <linux/nmi.h>
16756 -#include <linux/kprobes.h>
16757 -#include <linux/kexec.h>
16758 -#include <linux/unwind.h>
16759 -#include <linux/uaccess.h>
16760 #include <linux/bug.h>
16761 -#include <linux/kdebug.h>
16762 -#include <linux/utsname.h>
16763 -
16764 -#include <mach_traps.h>
16765 +#include <linux/nmi.h>
16766 +#include <linux/mm.h>
16767
16768 #if defined(CONFIG_EDAC)
16769 #include <linux/edac.h>
16770 #endif
16771
16772 -#include <asm/system.h>
16773 -#include <asm/io.h>
16774 -#include <asm/atomic.h>
16775 +#include <asm/stacktrace.h>
16776 +#include <asm/processor.h>
16777 #include <asm/debugreg.h>
16778 +#include <asm/atomic.h>
16779 +#include <asm/system.h>
16780 +#include <asm/unwind.h>
16781 #include <asm/desc.h>
16782 #include <asm/i387.h>
16783 -#include <asm/processor.h>
16784 -#include <asm/unwind.h>
16785 +#include <asm/nmi.h>
16786 #include <asm/smp.h>
16787 +#include <asm/io.h>
16788 #include <asm/pgalloc.h>
16789 -#include <asm/pda.h>
16790 #include <asm/proto.h>
16791 -#include <asm/nmi.h>
16792 -#include <asm/stacktrace.h>
16793 +#include <asm/pda.h>
16794 +#include <asm/traps.h>
16795
16796 -asmlinkage void divide_error(void);
16797 -asmlinkage void debug(void);
16798 -asmlinkage void nmi(void);
16799 -asmlinkage void int3(void);
16800 -asmlinkage void overflow(void);
16801 -asmlinkage void bounds(void);
16802 -asmlinkage void invalid_op(void);
16803 -asmlinkage void device_not_available(void);
16804 -asmlinkage void double_fault(void);
16805 -asmlinkage void coprocessor_segment_overrun(void);
16806 -asmlinkage void invalid_TSS(void);
16807 -asmlinkage void segment_not_present(void);
16808 -asmlinkage void stack_segment(void);
16809 -asmlinkage void general_protection(void);
16810 -asmlinkage void page_fault(void);
16811 -asmlinkage void coprocessor_error(void);
16812 -asmlinkage void simd_coprocessor_error(void);
16813 -asmlinkage void reserved(void);
16814 -asmlinkage void alignment_check(void);
16815 -asmlinkage void machine_check(void);
16816 -asmlinkage void spurious_interrupt_bug(void);
16817 +#include <mach_traps.h>
16818
16819 +int panic_on_unrecovered_nmi;
16820 +int kstack_depth_to_print = 12;
16821 static unsigned int code_bytes = 64;
16822 +static int ignore_nmis;
16823 +static int die_counter;
16824
16825 static inline void conditional_sti(struct pt_regs *regs)
16826 {
16827 @@ -100,34 +83,9 @@ static inline void preempt_conditional_c
16828 dec_preempt_count();
16829 }
16830
16831 -int kstack_depth_to_print = 12;
16832 -
16833 void printk_address(unsigned long address, int reliable)
16834 {
16835 -#ifdef CONFIG_KALLSYMS
16836 - unsigned long offset = 0, symsize;
16837 - const char *symname;
16838 - char *modname;
16839 - char *delim = ":";
16840 - char namebuf[KSYM_NAME_LEN];
16841 - char reliab[4] = "";
16842 -
16843 - symname = kallsyms_lookup(address, &symsize, &offset,
16844 - &modname, namebuf);
16845 - if (!symname) {
16846 - printk(" [<%016lx>]\n", address);
16847 - return;
16848 - }
16849 - if (!reliable)
16850 - strcpy(reliab, "? ");
16851 -
16852 - if (!modname)
16853 - modname = delim = "";
16854 - printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
16855 - address, reliab, delim, modname, delim, symname, offset, symsize);
16856 -#else
16857 - printk(" [<%016lx>]\n", address);
16858 -#endif
16859 + printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
16860 }
16861
16862 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
16863 @@ -206,8 +164,6 @@ static unsigned long *in_exception_stack
16864 return NULL;
16865 }
16866
16867 -#define MSG(txt) ops->warning(data, txt)
16868 -
16869 /*
16870 * x86-64 can have up to three kernel stacks:
16871 * process stack
16872 @@ -234,11 +190,11 @@ struct stack_frame {
16873 unsigned long return_address;
16874 };
16875
16876 -
16877 -static inline unsigned long print_context_stack(struct thread_info *tinfo,
16878 - unsigned long *stack, unsigned long bp,
16879 - const struct stacktrace_ops *ops, void *data,
16880 - unsigned long *end)
16881 +static inline unsigned long
16882 +print_context_stack(struct thread_info *tinfo,
16883 + unsigned long *stack, unsigned long bp,
16884 + const struct stacktrace_ops *ops, void *data,
16885 + unsigned long *end)
16886 {
16887 struct stack_frame *frame = (struct stack_frame *)bp;
16888
16889 @@ -260,7 +216,7 @@ static inline unsigned long print_contex
16890 return bp;
16891 }
16892
16893 -void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
16894 +void dump_trace(struct task_struct *task, struct pt_regs *regs,
16895 unsigned long *stack, unsigned long bp,
16896 const struct stacktrace_ops *ops, void *data)
16897 {
16898 @@ -269,36 +225,34 @@ void dump_trace(struct task_struct *tsk,
16899 unsigned used = 0;
16900 struct thread_info *tinfo;
16901
16902 - if (!tsk)
16903 - tsk = current;
16904 - tinfo = task_thread_info(tsk);
16905 + if (!task)
16906 + task = current;
16907
16908 if (!stack) {
16909 unsigned long dummy;
16910 stack = &dummy;
16911 - if (tsk && tsk != current)
16912 - stack = (unsigned long *)tsk->thread.sp;
16913 + if (task && task != current)
16914 + stack = (unsigned long *)task->thread.sp;
16915 }
16916
16917 #ifdef CONFIG_FRAME_POINTER
16918 if (!bp) {
16919 - if (tsk == current) {
16920 + if (task == current) {
16921 /* Grab bp right from our regs */
16922 - asm("movq %%rbp, %0" : "=r" (bp):);
16923 + asm("movq %%rbp, %0" : "=r" (bp) :);
16924 } else {
16925 /* bp is the last reg pushed by switch_to */
16926 - bp = *(unsigned long *) tsk->thread.sp;
16927 + bp = *(unsigned long *) task->thread.sp;
16928 }
16929 }
16930 #endif
16931
16932 -
16933 -
16934 /*
16935 * Print function call entries in all stacks, starting at the
16936 * current stack address. If the stacks consist of nested
16937 * exceptions
16938 */
16939 + tinfo = task_thread_info(task);
16940 for (;;) {
16941 char *id;
16942 unsigned long *estack_end;
16943 @@ -383,18 +337,24 @@ static const struct stacktrace_ops print
16944 .address = print_trace_address,
16945 };
16946
16947 -void
16948 -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
16949 - unsigned long bp)
16950 +static void
16951 +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16952 + unsigned long *stack, unsigned long bp, char *log_lvl)
16953 {
16954 printk("\nCall Trace:\n");
16955 - dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
16956 + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16957 printk("\n");
16958 }
16959
16960 +void show_trace(struct task_struct *task, struct pt_regs *regs,
16961 + unsigned long *stack, unsigned long bp)
16962 +{
16963 + show_trace_log_lvl(task, regs, stack, bp, "");
16964 +}
16965 +
16966 static void
16967 -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
16968 - unsigned long bp)
16969 +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
16970 + unsigned long *sp, unsigned long bp, char *log_lvl)
16971 {
16972 unsigned long *stack;
16973 int i;
16974 @@ -406,14 +366,14 @@ _show_stack(struct task_struct *tsk, str
16975 // back trace for this cpu.
16976
16977 if (sp == NULL) {
16978 - if (tsk)
16979 - sp = (unsigned long *)tsk->thread.sp;
16980 + if (task)
16981 + sp = (unsigned long *)task->thread.sp;
16982 else
16983 sp = (unsigned long *)&sp;
16984 }
16985
16986 stack = sp;
16987 - for(i=0; i < kstack_depth_to_print; i++) {
16988 + for (i = 0; i < kstack_depth_to_print; i++) {
16989 if (stack >= irqstack && stack <= irqstack_end) {
16990 if (stack == irqstack_end) {
16991 stack = (unsigned long *) (irqstack_end[-1]);
16992 @@ -428,12 +388,12 @@ _show_stack(struct task_struct *tsk, str
16993 printk(" %016lx", *stack++);
16994 touch_nmi_watchdog();
16995 }
16996 - show_trace(tsk, regs, sp, bp);
16997 + show_trace_log_lvl(task, regs, sp, bp, log_lvl);
16998 }
16999
17000 -void show_stack(struct task_struct *tsk, unsigned long * sp)
17001 +void show_stack(struct task_struct *task, unsigned long *sp)
17002 {
17003 - _show_stack(tsk, NULL, sp, 0);
17004 + show_stack_log_lvl(task, NULL, sp, 0, "");
17005 }
17006
17007 /*
17008 @@ -441,8 +401,8 @@ void show_stack(struct task_struct *tsk,
17009 */
17010 void dump_stack(void)
17011 {
17012 - unsigned long dummy;
17013 unsigned long bp = 0;
17014 + unsigned long stack;
17015
17016 #ifdef CONFIG_FRAME_POINTER
17017 if (!bp)
17018 @@ -454,7 +414,7 @@ void dump_stack(void)
17019 init_utsname()->release,
17020 (int)strcspn(init_utsname()->version, " "),
17021 init_utsname()->version);
17022 - show_trace(NULL, NULL, &dummy, bp);
17023 + show_trace(NULL, NULL, &stack, bp);
17024 }
17025
17026 EXPORT_SYMBOL(dump_stack);
17027 @@ -465,12 +425,8 @@ void show_registers(struct pt_regs *regs
17028 unsigned long sp;
17029 const int cpu = smp_processor_id();
17030 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
17031 - u8 *ip;
17032 - unsigned int code_prologue = code_bytes * 43 / 64;
17033 - unsigned int code_len = code_bytes;
17034
17035 sp = regs->sp;
17036 - ip = (u8 *) regs->ip - code_prologue;
17037 printk("CPU %d ", cpu);
17038 __show_regs(regs);
17039 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
17040 @@ -481,15 +437,22 @@ void show_registers(struct pt_regs *regs
17041 * time of the fault..
17042 */
17043 if (!user_mode(regs)) {
17044 + unsigned int code_prologue = code_bytes * 43 / 64;
17045 + unsigned int code_len = code_bytes;
17046 unsigned char c;
17047 + u8 *ip;
17048 +
17049 printk("Stack: ");
17050 - _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
17051 + show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
17052 + regs->bp, "");
17053 printk("\n");
17054
17055 printk(KERN_EMERG "Code: ");
17056 +
17057 + ip = (u8 *)regs->ip - code_prologue;
17058 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
17059 /* try starting at RIP */
17060 - ip = (u8 *) regs->ip;
17061 + ip = (u8 *)regs->ip;
17062 code_len = code_len - code_prologue + 1;
17063 }
17064 for (i = 0; i < code_len; i++, ip++) {
17065 @@ -505,7 +468,7 @@ void show_registers(struct pt_regs *regs
17066 }
17067 }
17068 printk("\n");
17069 -}
17070 +}
17071
17072 int is_valid_bugaddr(unsigned long ip)
17073 {
17074 @@ -545,7 +508,7 @@ unsigned __kprobes long oops_begin(void)
17075 }
17076
17077 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
17078 -{
17079 +{
17080 die_owner = -1;
17081 bust_spinlocks(0);
17082 die_nest_count--;
17083 @@ -563,10 +526,9 @@ void __kprobes oops_end(unsigned long fl
17084 do_exit(signr);
17085 }
17086
17087 -int __kprobes __die(const char * str, struct pt_regs * regs, long err)
17088 +int __kprobes __die(const char *str, struct pt_regs *regs, long err)
17089 {
17090 - static int die_counter;
17091 - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
17092 + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
17093 #ifdef CONFIG_PREEMPT
17094 printk("PREEMPT ");
17095 #endif
17096 @@ -577,8 +539,10 @@ int __kprobes __die(const char * str, st
17097 printk("DEBUG_PAGEALLOC");
17098 #endif
17099 printk("\n");
17100 - if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17101 + if (notify_die(DIE_OOPS, str, regs, err,
17102 + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17103 return 1;
17104 +
17105 show_registers(regs);
17106 add_taint(TAINT_DIE);
17107 /* Executive summary in case the oops scrolled away */
17108 @@ -590,7 +554,7 @@ int __kprobes __die(const char * str, st
17109 return 0;
17110 }
17111
17112 -void die(const char * str, struct pt_regs * regs, long err)
17113 +void die(const char *str, struct pt_regs *regs, long err)
17114 {
17115 unsigned long flags = oops_begin();
17116
17117 @@ -608,8 +572,7 @@ die_nmi(char *str, struct pt_regs *regs,
17118 {
17119 unsigned long flags;
17120
17121 - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
17122 - NOTIFY_STOP)
17123 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
17124 return;
17125
17126 flags = oops_begin();
17127 @@ -617,7 +580,9 @@ die_nmi(char *str, struct pt_regs *regs,
17128 * We are in trouble anyway, lets at least try
17129 * to get a message out.
17130 */
17131 - printk(str, smp_processor_id());
17132 + printk(KERN_EMERG "%s", str);
17133 + printk(" on CPU%d, ip %08lx, registers:\n",
17134 + smp_processor_id(), regs->ip);
17135 show_registers(regs);
17136 if (kexec_should_crash(current))
17137 crash_kexec(regs);
17138 @@ -630,44 +595,44 @@ die_nmi(char *str, struct pt_regs *regs,
17139 }
17140 #endif
17141
17142 -static void __kprobes do_trap(int trapnr, int signr, char *str,
17143 - struct pt_regs * regs, long error_code,
17144 - siginfo_t *info)
17145 +static void __kprobes
17146 +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
17147 + long error_code, siginfo_t *info)
17148 {
17149 struct task_struct *tsk = current;
17150
17151 - if (user_mode(regs)) {
17152 - /*
17153 - * We want error_code and trap_no set for userspace
17154 - * faults and kernelspace faults which result in
17155 - * die(), but not kernelspace faults which are fixed
17156 - * up. die() gives the process no chance to handle
17157 - * the signal and notice the kernel fault information,
17158 - * so that won't result in polluting the information
17159 - * about previously queued, but not yet delivered,
17160 - * faults. See also do_general_protection below.
17161 - */
17162 - tsk->thread.error_code = error_code;
17163 - tsk->thread.trap_no = trapnr;
17164 + if (!user_mode(regs))
17165 + goto kernel_trap;
17166
17167 - if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17168 - printk_ratelimit()) {
17169 - printk(KERN_INFO
17170 - "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17171 - tsk->comm, tsk->pid, str,
17172 - regs->ip, regs->sp, error_code);
17173 - print_vma_addr(" in ", regs->ip);
17174 - printk("\n");
17175 - }
17176 + /*
17177 + * We want error_code and trap_no set for userspace faults and
17178 + * kernelspace faults which result in die(), but not
17179 + * kernelspace faults which are fixed up. die() gives the
17180 + * process no chance to handle the signal and notice the
17181 + * kernel fault information, so that won't result in polluting
17182 + * the information about previously queued, but not yet
17183 + * delivered, faults. See also do_general_protection below.
17184 + */
17185 + tsk->thread.error_code = error_code;
17186 + tsk->thread.trap_no = trapnr;
17187
17188 - if (info)
17189 - force_sig_info(signr, info, tsk);
17190 - else
17191 - force_sig(signr, tsk);
17192 - return;
17193 + if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17194 + printk_ratelimit()) {
17195 + printk(KERN_INFO
17196 + "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17197 + tsk->comm, tsk->pid, str,
17198 + regs->ip, regs->sp, error_code);
17199 + print_vma_addr(" in ", regs->ip);
17200 + printk("\n");
17201 }
17202
17203 + if (info)
17204 + force_sig_info(signr, info, tsk);
17205 + else
17206 + force_sig(signr, tsk);
17207 + return;
17208
17209 +kernel_trap:
17210 if (!fixup_exception(regs)) {
17211 tsk->thread.error_code = error_code;
17212 tsk->thread.trap_no = trapnr;
17213 @@ -677,41 +642,39 @@ static void __kprobes do_trap(int trapnr
17214 }
17215
17216 #define DO_ERROR(trapnr, signr, str, name) \
17217 -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17218 -{ \
17219 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17220 - == NOTIFY_STOP) \
17221 - return; \
17222 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17223 +{ \
17224 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17225 + == NOTIFY_STOP) \
17226 + return; \
17227 conditional_sti(regs); \
17228 - do_trap(trapnr, signr, str, regs, error_code, NULL); \
17229 + do_trap(trapnr, signr, str, regs, error_code, NULL); \
17230 }
17231
17232 -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17233 -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17234 -{ \
17235 - siginfo_t info; \
17236 - info.si_signo = signr; \
17237 - info.si_errno = 0; \
17238 - info.si_code = sicode; \
17239 - info.si_addr = (void __user *)siaddr; \
17240 - trace_hardirqs_fixup(); \
17241 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17242 - == NOTIFY_STOP) \
17243 - return; \
17244 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17245 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17246 +{ \
17247 + siginfo_t info; \
17248 + info.si_signo = signr; \
17249 + info.si_errno = 0; \
17250 + info.si_code = sicode; \
17251 + info.si_addr = (void __user *)siaddr; \
17252 + trace_hardirqs_fixup(); \
17253 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17254 + == NOTIFY_STOP) \
17255 + return; \
17256 conditional_sti(regs); \
17257 - do_trap(trapnr, signr, str, regs, error_code, &info); \
17258 + do_trap(trapnr, signr, str, regs, error_code, &info); \
17259 }
17260
17261 -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17262 -DO_ERROR( 4, SIGSEGV, "overflow", overflow)
17263 -DO_ERROR( 5, SIGSEGV, "bounds", bounds)
17264 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17265 -DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
17266 -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17267 +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17268 +DO_ERROR(4, SIGSEGV, "overflow", overflow)
17269 +DO_ERROR(5, SIGSEGV, "bounds", bounds)
17270 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17271 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17272 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
17273 -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17274 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17275 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
17276 -DO_ERROR(18, SIGSEGV, "reserved", reserved)
17277
17278 /* Runs on IST stack */
17279 asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
17280 @@ -741,31 +704,34 @@ asmlinkage void do_double_fault(struct p
17281 die(str, regs, error_code);
17282 }
17283
17284 -asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
17285 - long error_code)
17286 +asmlinkage void __kprobes
17287 +do_general_protection(struct pt_regs *regs, long error_code)
17288 {
17289 - struct task_struct *tsk = current;
17290 + struct task_struct *tsk;
17291
17292 conditional_sti(regs);
17293
17294 - if (user_mode(regs)) {
17295 - tsk->thread.error_code = error_code;
17296 - tsk->thread.trap_no = 13;
17297 + tsk = current;
17298 + if (!user_mode(regs))
17299 + goto gp_in_kernel;
17300
17301 - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17302 - printk_ratelimit()) {
17303 - printk(KERN_INFO
17304 - "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17305 - tsk->comm, tsk->pid,
17306 - regs->ip, regs->sp, error_code);
17307 - print_vma_addr(" in ", regs->ip);
17308 - printk("\n");
17309 - }
17310 + tsk->thread.error_code = error_code;
17311 + tsk->thread.trap_no = 13;
17312
17313 - force_sig(SIGSEGV, tsk);
17314 - return;
17315 - }
17316 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17317 + printk_ratelimit()) {
17318 + printk(KERN_INFO
17319 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17320 + tsk->comm, tsk->pid,
17321 + regs->ip, regs->sp, error_code);
17322 + print_vma_addr(" in ", regs->ip);
17323 + printk("\n");
17324 + }
17325
17326 + force_sig(SIGSEGV, tsk);
17327 + return;
17328 +
17329 +gp_in_kernel:
17330 if (fixup_exception(regs))
17331 return;
17332
17333 @@ -778,14 +744,14 @@ asmlinkage void __kprobes do_general_pro
17334 }
17335
17336 static notrace __kprobes void
17337 -mem_parity_error(unsigned char reason, struct pt_regs * regs)
17338 +mem_parity_error(unsigned char reason, struct pt_regs *regs)
17339 {
17340 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
17341 reason);
17342 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
17343
17344 #if defined(CONFIG_EDAC)
17345 - if(edac_handler_set()) {
17346 + if (edac_handler_set()) {
17347 edac_atomic_assert_error();
17348 return;
17349 }
17350 @@ -801,7 +767,7 @@ mem_parity_error(unsigned char reason, s
17351 }
17352
17353 static notrace __kprobes void
17354 -io_check_error(unsigned char reason, struct pt_regs * regs)
17355 +io_check_error(unsigned char reason, struct pt_regs *regs)
17356 {
17357 printk("NMI: IOCK error (debug interrupt?)\n");
17358 show_registers(regs);
17359 @@ -827,14 +793,14 @@ unknown_nmi_error(unsigned char reason,
17360
17361 /* Runs on IST stack. This code must keep interrupts off all the time.
17362 Nested NMIs are prevented by the CPU. */
17363 -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17364 +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17365 {
17366 unsigned char reason = 0;
17367 int cpu;
17368
17369 cpu = smp_processor_id();
17370
17371 - /* Only the BSP gets external NMIs from the system. */
17372 + /* Only the BSP gets external NMIs from the system. */
17373 if (!cpu)
17374 reason = get_nmi_reason();
17375
17376 @@ -847,33 +813,58 @@ asmlinkage notrace __kprobes void defau
17377 * Ok, so this is none of the documented NMI sources,
17378 * so it must be the NMI watchdog.
17379 */
17380 - if (nmi_watchdog_tick(regs,reason))
17381 + if (nmi_watchdog_tick(regs, reason))
17382 return;
17383 #endif
17384 - if (!do_nmi_callback(regs,cpu))
17385 + if (!do_nmi_callback(regs, cpu))
17386 unknown_nmi_error(reason, regs);
17387
17388 return;
17389 }
17390 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
17391 - return;
17392 + return;
17393
17394 /* AK: following checks seem to be broken on modern chipsets. FIXME */
17395 -
17396 if (reason & 0x80)
17397 mem_parity_error(reason, regs);
17398 if (reason & 0x40)
17399 io_check_error(reason, regs);
17400 }
17401
17402 +asmlinkage notrace __kprobes void
17403 +do_nmi(struct pt_regs *regs, long error_code)
17404 +{
17405 + nmi_enter();
17406 +
17407 + add_pda(__nmi_count, 1);
17408 +
17409 + if (!ignore_nmis)
17410 + default_do_nmi(regs);
17411 +
17412 + nmi_exit();
17413 +}
17414 +
17415 +void stop_nmi(void)
17416 +{
17417 + acpi_nmi_disable();
17418 + ignore_nmis++;
17419 +}
17420 +
17421 +void restart_nmi(void)
17422 +{
17423 + ignore_nmis--;
17424 + acpi_nmi_enable();
17425 +}
17426 +
17427 /* runs on IST stack. */
17428 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
17429 {
17430 trace_hardirqs_fixup();
17431
17432 - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
17433 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
17434 + == NOTIFY_STOP)
17435 return;
17436 - }
17437 +
17438 preempt_conditional_sti(regs);
17439 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
17440 preempt_conditional_cli(regs);
17441 @@ -904,8 +895,8 @@ asmlinkage __kprobes struct pt_regs *syn
17442 asmlinkage void __kprobes do_debug(struct pt_regs * regs,
17443 unsigned long error_code)
17444 {
17445 - unsigned long condition;
17446 struct task_struct *tsk = current;
17447 + unsigned long condition;
17448 siginfo_t info;
17449
17450 trace_hardirqs_fixup();
17451 @@ -926,21 +917,19 @@ asmlinkage void __kprobes do_debug(struc
17452
17453 /* Mask out spurious debug traps due to lazy DR7 setting */
17454 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
17455 - if (!tsk->thread.debugreg7) {
17456 + if (!tsk->thread.debugreg7)
17457 goto clear_dr7;
17458 - }
17459 }
17460
17461 tsk->thread.debugreg6 = condition;
17462
17463 -
17464 /*
17465 * Single-stepping through TF: make sure we ignore any events in
17466 * kernel space (but re-enable TF when returning to user mode).
17467 */
17468 if (condition & DR_STEP) {
17469 - if (!user_mode(regs))
17470 - goto clear_TF_reenable;
17471 + if (!user_mode(regs))
17472 + goto clear_TF_reenable;
17473 }
17474
17475 /* Ok, finally something we can handle */
17476 @@ -953,7 +942,7 @@ asmlinkage void __kprobes do_debug(struc
17477 force_sig_info(SIGTRAP, &info, tsk);
17478
17479 clear_dr7:
17480 - set_debugreg(0UL, 7);
17481 + set_debugreg(0, 7);
17482 preempt_conditional_cli(regs);
17483 return;
17484
17485 @@ -961,6 +950,7 @@ clear_TF_reenable:
17486 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
17487 regs->flags &= ~X86_EFLAGS_TF;
17488 preempt_conditional_cli(regs);
17489 + return;
17490 }
17491
17492 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
17493 @@ -983,7 +973,7 @@ static int kernel_math_error(struct pt_r
17494 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
17495 {
17496 void __user *ip = (void __user *)(regs->ip);
17497 - struct task_struct * task;
17498 + struct task_struct *task;
17499 siginfo_t info;
17500 unsigned short cwd, swd;
17501
17502 @@ -1016,30 +1006,30 @@ asmlinkage void do_coprocessor_error(str
17503 cwd = get_fpu_cwd(task);
17504 swd = get_fpu_swd(task);
17505 switch (swd & ~cwd & 0x3f) {
17506 - case 0x000:
17507 - default:
17508 - break;
17509 - case 0x001: /* Invalid Op */
17510 - /*
17511 - * swd & 0x240 == 0x040: Stack Underflow
17512 - * swd & 0x240 == 0x240: Stack Overflow
17513 - * User must clear the SF bit (0x40) if set
17514 - */
17515 - info.si_code = FPE_FLTINV;
17516 - break;
17517 - case 0x002: /* Denormalize */
17518 - case 0x010: /* Underflow */
17519 - info.si_code = FPE_FLTUND;
17520 - break;
17521 - case 0x004: /* Zero Divide */
17522 - info.si_code = FPE_FLTDIV;
17523 - break;
17524 - case 0x008: /* Overflow */
17525 - info.si_code = FPE_FLTOVF;
17526 - break;
17527 - case 0x020: /* Precision */
17528 - info.si_code = FPE_FLTRES;
17529 - break;
17530 + case 0x000: /* No unmasked exception */
17531 + default: /* Multiple exceptions */
17532 + break;
17533 + case 0x001: /* Invalid Op */
17534 + /*
17535 + * swd & 0x240 == 0x040: Stack Underflow
17536 + * swd & 0x240 == 0x240: Stack Overflow
17537 + * User must clear the SF bit (0x40) if set
17538 + */
17539 + info.si_code = FPE_FLTINV;
17540 + break;
17541 + case 0x002: /* Denormalize */
17542 + case 0x010: /* Underflow */
17543 + info.si_code = FPE_FLTUND;
17544 + break;
17545 + case 0x004: /* Zero Divide */
17546 + info.si_code = FPE_FLTDIV;
17547 + break;
17548 + case 0x008: /* Overflow */
17549 + info.si_code = FPE_FLTOVF;
17550 + break;
17551 + case 0x020: /* Precision */
17552 + info.si_code = FPE_FLTRES;
17553 + break;
17554 }
17555 force_sig_info(SIGFPE, &info, task);
17556 }
17557 @@ -1052,7 +1042,7 @@ asmlinkage void bad_intr(void)
17558 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
17559 {
17560 void __user *ip = (void __user *)(regs->ip);
17561 - struct task_struct * task;
17562 + struct task_struct *task;
17563 siginfo_t info;
17564 unsigned short mxcsr;
17565
17566 @@ -1080,25 +1070,25 @@ asmlinkage void do_simd_coprocessor_erro
17567 */
17568 mxcsr = get_fpu_mxcsr(task);
17569 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
17570 - case 0x000:
17571 - default:
17572 - break;
17573 - case 0x001: /* Invalid Op */
17574 - info.si_code = FPE_FLTINV;
17575 - break;
17576 - case 0x002: /* Denormalize */
17577 - case 0x010: /* Underflow */
17578 - info.si_code = FPE_FLTUND;
17579 - break;
17580 - case 0x004: /* Zero Divide */
17581 - info.si_code = FPE_FLTDIV;
17582 - break;
17583 - case 0x008: /* Overflow */
17584 - info.si_code = FPE_FLTOVF;
17585 - break;
17586 - case 0x020: /* Precision */
17587 - info.si_code = FPE_FLTRES;
17588 - break;
17589 + case 0x000:
17590 + default:
17591 + break;
17592 + case 0x001: /* Invalid Op */
17593 + info.si_code = FPE_FLTINV;
17594 + break;
17595 + case 0x002: /* Denormalize */
17596 + case 0x010: /* Underflow */
17597 + info.si_code = FPE_FLTUND;
17598 + break;
17599 + case 0x004: /* Zero Divide */
17600 + info.si_code = FPE_FLTDIV;
17601 + break;
17602 + case 0x008: /* Overflow */
17603 + info.si_code = FPE_FLTOVF;
17604 + break;
17605 + case 0x020: /* Precision */
17606 + info.si_code = FPE_FLTRES;
17607 + break;
17608 }
17609 force_sig_info(SIGFPE, &info, task);
17610 }
17611 @@ -1118,7 +1108,7 @@ asmlinkage void __attribute__((weak)) mc
17612 }
17613
17614 /*
17615 - * 'math_state_restore()' saves the current math information in the
17616 + * 'math_state_restore()' saves the current math information in the
17617 * old math state array, and gets the new ones from the current task
17618 *
17619 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
17620 @@ -1145,7 +1135,14 @@ asmlinkage void math_state_restore(void)
17621
17622 /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
17623
17624 - restore_fpu_checking(&me->thread.xstate->fxsave);
17625 + /*
17626 + * Paranoid restore. send a SIGSEGV if we fail to restore the state.
17627 + */
17628 + if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
17629 + stts();
17630 + force_sig(SIGSEGV, me);
17631 + return;
17632 + }
17633 task_thread_info(me)->status |= TS_USEDFPU;
17634 me->fpu_counter++;
17635 }
17636 @@ -1190,13 +1187,12 @@ void __init trap_init(void)
17637 ret = HYPERVISOR_set_trap_table(trap_table);
17638 if (ret)
17639 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
17640 -
17641 /*
17642 * initialize the per thread extended state:
17643 */
17644 - init_thread_xstate();
17645 + init_thread_xstate();
17646 /*
17647 - * Should be a barrier for any external CPU state.
17648 + * Should be a barrier for any external CPU state:
17649 */
17650 cpu_init();
17651 }
17652 @@ -1212,27 +1208,25 @@ void __cpuinit smp_trap_init(trap_info_t
17653 }
17654 }
17655
17656 -
17657 static int __init oops_setup(char *s)
17658 -{
17659 +{
17660 if (!s)
17661 return -EINVAL;
17662 if (!strcmp(s, "panic"))
17663 panic_on_oops = 1;
17664 return 0;
17665 -}
17666 +}
17667 early_param("oops", oops_setup);
17668
17669 static int __init kstack_setup(char *s)
17670 {
17671 if (!s)
17672 return -EINVAL;
17673 - kstack_depth_to_print = simple_strtoul(s,NULL,0);
17674 + kstack_depth_to_print = simple_strtoul(s, NULL, 0);
17675 return 0;
17676 }
17677 early_param("kstack", kstack_setup);
17678
17679 -
17680 static int __init code_bytes_setup(char *s)
17681 {
17682 code_bytes = simple_strtoul(s, NULL, 0);
17683 Index: head-2008-12-01/arch/x86/kernel/vsyscall_64-xen.c
17684 ===================================================================
17685 --- head-2008-12-01.orig/arch/x86/kernel/vsyscall_64-xen.c 2008-12-01 11:44:55.000000000 +0100
17686 +++ head-2008-12-01/arch/x86/kernel/vsyscall_64-xen.c 2008-12-01 11:49:07.000000000 +0100
17687 @@ -42,7 +42,8 @@
17688 #include <asm/topology.h>
17689 #include <asm/vgtod.h>
17690
17691 -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
17692 +#define __vsyscall(nr) \
17693 + __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
17694 #define __syscall_clobber "r11","cx","memory"
17695
17696 /*
17697 @@ -264,10 +265,7 @@ static void __cpuinit vsyscall_set_cpu(i
17698 d |= cpu;
17699 d |= (node & 0xf) << 12;
17700 d |= (node >> 4) << 48;
17701 - if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
17702 - + GDT_ENTRY_PER_CPU),
17703 - d))
17704 - BUG();
17705 + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
17706 }
17707
17708 static void __cpuinit cpu_vsyscall_init(void *arg)
17709 @@ -281,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_bl
17710 {
17711 long cpu = (long)arg;
17712 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
17713 - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
17714 + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
17715 return NOTIFY_DONE;
17716 }
17717
17718 @@ -311,7 +309,7 @@ static int __init vsyscall_init(void)
17719 #ifdef CONFIG_SYSCTL
17720 register_sysctl_table(kernel_root_table2);
17721 #endif
17722 - on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
17723 + on_each_cpu(cpu_vsyscall_init, NULL, 1);
17724 hotcpu_notifier(cpu_vsyscall_notifier, 0);
17725 return 0;
17726 }
17727 Index: head-2008-12-01/arch/x86/mach-xen/setup.c
17728 ===================================================================
17729 --- head-2008-12-01.orig/arch/x86/mach-xen/setup.c 2008-12-01 11:37:10.000000000 +0100
17730 +++ head-2008-12-01/arch/x86/mach-xen/setup.c 2008-12-01 11:49:07.000000000 +0100
17731 @@ -17,6 +17,8 @@
17732 #include <xen/interface/callback.h>
17733 #include <xen/interface/memory.h>
17734
17735 +#ifdef CONFIG_X86_32
17736 +
17737 #ifdef CONFIG_HOTPLUG_CPU
17738 #define DEFAULT_SEND_IPI (1)
17739 #else
17740 @@ -44,51 +46,6 @@ static int __init print_ipi_mode(void)
17741
17742 late_initcall(print_ipi_mode);
17743
17744 -/**
17745 - * machine_specific_memory_setup - Hook for machine specific memory setup.
17746 - *
17747 - * Description:
17748 - * This is included late in kernel/setup.c so that it can make
17749 - * use of all of the static functions.
17750 - **/
17751 -
17752 -char * __init machine_specific_memory_setup(void)
17753 -{
17754 - int rc;
17755 - struct xen_memory_map memmap;
17756 - /*
17757 - * This is rather large for a stack variable but this early in
17758 - * the boot process we know we have plenty slack space.
17759 - */
17760 - struct e820entry map[E820MAX];
17761 -
17762 - memmap.nr_entries = E820MAX;
17763 - set_xen_guest_handle(memmap.buffer, map);
17764 -
17765 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
17766 - if ( rc == -ENOSYS ) {
17767 - memmap.nr_entries = 1;
17768 - map[0].addr = 0ULL;
17769 - map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
17770 - /* 8MB slack (to balance backend allocations). */
17771 - map[0].size += 8ULL << 20;
17772 - map[0].type = E820_RAM;
17773 - rc = 0;
17774 - }
17775 - BUG_ON(rc);
17776 -
17777 - sanitize_e820_map(map, (char *)&memmap.nr_entries);
17778 -
17779 - BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
17780 -
17781 - return "Xen";
17782 -}
17783 -
17784 -
17785 -extern void hypervisor_callback(void);
17786 -extern void failsafe_callback(void);
17787 -extern void nmi(void);
17788 -
17789 unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
17790 EXPORT_SYMBOL(machine_to_phys_mapping);
17791 unsigned int machine_to_phys_order;
17792 @@ -121,33 +78,66 @@ void __init pre_setup_arch_hook(void)
17793 (unsigned long *)xen_start_info->mfn_list;
17794 }
17795
17796 +#endif /* CONFIG_X86_32 */
17797 +
17798 +extern void hypervisor_callback(void);
17799 +extern void failsafe_callback(void);
17800 +extern void nmi(void);
17801 +
17802 +#ifdef CONFIG_X86_64
17803 +#include <asm/proto.h>
17804 +#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
17805 +#else
17806 +#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
17807 +#endif
17808 +
17809 void __init machine_specific_arch_setup(void)
17810 {
17811 int ret;
17812 static struct callback_register __initdata event = {
17813 .type = CALLBACKTYPE_event,
17814 - .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
17815 + .address = CALLBACK_ADDR(hypervisor_callback)
17816 };
17817 static struct callback_register __initdata failsafe = {
17818 .type = CALLBACKTYPE_failsafe,
17819 - .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
17820 + .address = CALLBACK_ADDR(failsafe_callback)
17821 + };
17822 +#ifdef CONFIG_X86_64
17823 + static struct callback_register __initdata syscall = {
17824 + .type = CALLBACKTYPE_syscall,
17825 + .address = CALLBACK_ADDR(system_call)
17826 };
17827 +#endif
17828 +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17829 static struct callback_register __initdata nmi_cb = {
17830 .type = CALLBACKTYPE_nmi,
17831 - .address = { __KERNEL_CS, (unsigned long)nmi },
17832 + .address = CALLBACK_ADDR(nmi)
17833 };
17834 +#endif
17835
17836 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
17837 if (ret == 0)
17838 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
17839 +#ifdef CONFIG_X86_64
17840 + if (ret == 0)
17841 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
17842 +#endif
17843 #if CONFIG_XEN_COMPAT <= 0x030002
17844 +#ifdef CONFIG_X86_32
17845 if (ret == -ENOSYS)
17846 ret = HYPERVISOR_set_callbacks(
17847 event.address.cs, event.address.eip,
17848 failsafe.address.cs, failsafe.address.eip);
17849 +#else
17850 + ret = HYPERVISOR_set_callbacks(
17851 + event.address,
17852 + failsafe.address,
17853 + syscall.address);
17854 +#endif
17855 #endif
17856 BUG_ON(ret);
17857
17858 +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17859 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
17860 #if CONFIG_XEN_COMPAT <= 0x030002
17861 if (ret == -ENOSYS) {
17862 @@ -158,15 +148,43 @@ void __init machine_specific_arch_setup(
17863 HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
17864 }
17865 #endif
17866 +#endif
17867
17868 +#ifdef CONFIG_X86_32
17869 /* Do an early initialization of the fixmap area */
17870 {
17871 extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
17872 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
17873 pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
17874 pmd_t *pmd = pmd_offset(pud, addr);
17875 + unsigned int i;
17876
17877 make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
17878 set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
17879 +
17880 +#define __FIXADDR_TOP (-PAGE_SIZE)
17881 +#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
17882 + != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
17883 + FIX_BUG_ON(SHARED_INFO);
17884 + FIX_BUG_ON(ISAMAP_BEGIN);
17885 + FIX_BUG_ON(ISAMAP_END);
17886 +#undef __FIXADDR_TOP
17887 + BUG_ON(pte_index(hypervisor_virt_start));
17888 +
17889 + /* Switch to the real shared_info page, and clear the
17890 + * dummy page. */
17891 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
17892 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
17893 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
17894 +
17895 + /* Setup mapping of lower 1st MB */
17896 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
17897 + if (is_initial_xendomain())
17898 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
17899 + else
17900 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
17901 + virt_to_machine(empty_zero_page),
17902 + PAGE_KERNEL_RO);
17903 }
17904 +#endif
17905 }
17906 Index: head-2008-12-01/arch/x86/mm/fault-xen.c
17907 ===================================================================
17908 --- head-2008-12-01.orig/arch/x86/mm/fault-xen.c 2008-12-01 11:44:55.000000000 +0100
17909 +++ head-2008-12-01/arch/x86/mm/fault-xen.c 2008-12-01 11:49:07.000000000 +0100
17910 @@ -10,6 +10,7 @@
17911 #include <linux/string.h>
17912 #include <linux/types.h>
17913 #include <linux/ptrace.h>
17914 +#include <linux/mmiotrace.h>
17915 #include <linux/mman.h>
17916 #include <linux/mm.h>
17917 #include <linux/smp.h>
17918 @@ -49,17 +50,23 @@
17919 #define PF_RSVD (1<<3)
17920 #define PF_INSTR (1<<4)
17921
17922 +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
17923 +{
17924 +#ifdef CONFIG_MMIOTRACE_HOOKS
17925 + if (unlikely(is_kmmio_active()))
17926 + if (kmmio_handler(regs, addr) == 1)
17927 + return -1;
17928 +#endif
17929 + return 0;
17930 +}
17931 +
17932 static inline int notify_page_fault(struct pt_regs *regs)
17933 {
17934 #ifdef CONFIG_KPROBES
17935 int ret = 0;
17936
17937 /* kprobe_running() needs smp_processor_id() */
17938 -#ifdef CONFIG_X86_32
17939 if (!user_mode_vm(regs)) {
17940 -#else
17941 - if (!user_mode(regs)) {
17942 -#endif
17943 preempt_disable();
17944 if (kprobe_running() && kprobe_fault_handler(regs, 14))
17945 ret = 1;
17946 @@ -409,11 +416,7 @@ static void show_fault_oops(struct pt_re
17947 printk(KERN_CONT "NULL pointer dereference");
17948 else
17949 printk(KERN_CONT "paging request");
17950 -#ifdef CONFIG_X86_32
17951 - printk(KERN_CONT " at %08lx\n", address);
17952 -#else
17953 - printk(KERN_CONT " at %016lx\n", address);
17954 -#endif
17955 + printk(KERN_CONT " at %p\n", (void *) address);
17956 printk(KERN_ALERT "IP:");
17957 printk_address(regs->ip, 1);
17958 dump_pagetable(address);
17959 @@ -628,6 +631,8 @@ void __kprobes do_page_fault(struct pt_r
17960
17961 if (notify_page_fault(regs))
17962 return;
17963 + if (unlikely(kmmio_fault(regs, address)))
17964 + return;
17965
17966 /*
17967 * We fault-in kernel-space virtual memory on-demand. The
17968 @@ -832,14 +837,10 @@ bad_area_nosemaphore:
17969 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17970 printk_ratelimit()) {
17971 printk(
17972 -#ifdef CONFIG_X86_32
17973 - "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
17974 -#else
17975 - "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
17976 -#endif
17977 + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
17978 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
17979 - tsk->comm, task_pid_nr(tsk), address, regs->ip,
17980 - regs->sp, error_code);
17981 + tsk->comm, task_pid_nr(tsk), address,
17982 + (void *) regs->ip, (void *) regs->sp, error_code);
17983 print_vma_addr(" in ", regs->ip);
17984 printk("\n");
17985 }
17986 @@ -947,81 +948,45 @@ LIST_HEAD(pgd_list);
17987 void vmalloc_sync_all(void)
17988 {
17989 #ifdef CONFIG_X86_32
17990 - /*
17991 - * Note that races in the updates of insync and start aren't
17992 - * problematic: insync can only get set bits added, and updates to
17993 - * start are only improving performance (without affecting correctness
17994 - * if undone).
17995 - * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
17996 - * This change works just fine with 2-level paging too.
17997 - */
17998 -#define sync_index(a) ((a) >> PMD_SHIFT)
17999 - static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
18000 - static unsigned long start = TASK_SIZE;
18001 - unsigned long address;
18002 + unsigned long address = VMALLOC_START & PGDIR_MASK;
18003
18004 if (SHARED_KERNEL_PMD)
18005 return;
18006
18007 BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
18008 - for (address = start;
18009 - address < hypervisor_virt_start;
18010 - address += PMD_SIZE) {
18011 - if (!test_bit(sync_index(address), insync)) {
18012 - unsigned long flags;
18013 - struct page *page;
18014 -
18015 - spin_lock_irqsave(&pgd_lock, flags);
18016 - /* XEN: failure path assumes non-empty pgd_list. */
18017 - if (unlikely(list_empty(&pgd_list))) {
18018 - spin_unlock_irqrestore(&pgd_lock, flags);
18019 - return;
18020 - }
18021 - list_for_each_entry(page, &pgd_list, lru) {
18022 - if (!vmalloc_sync_one(page_address(page),
18023 - address))
18024 - break;
18025 - }
18026 - spin_unlock_irqrestore(&pgd_lock, flags);
18027 - if (!page)
18028 - set_bit(sync_index(address), insync);
18029 + for (; address < hypervisor_virt_start; address += PMD_SIZE) {
18030 + unsigned long flags;
18031 + struct page *page;
18032 +
18033 + spin_lock_irqsave(&pgd_lock, flags);
18034 + list_for_each_entry(page, &pgd_list, lru) {
18035 + if (!vmalloc_sync_one(page_address(page),
18036 + address))
18037 + break;
18038 }
18039 - if (address == start && test_bit(sync_index(address), insync))
18040 - start = address + PMD_SIZE;
18041 + spin_unlock_irqrestore(&pgd_lock, flags);
18042 }
18043 #else /* CONFIG_X86_64 */
18044 - /*
18045 - * Note that races in the updates of insync and start aren't
18046 - * problematic: insync can only get set bits added, and updates to
18047 - * start are only improving performance (without affecting correctness
18048 - * if undone).
18049 - */
18050 - static DECLARE_BITMAP(insync, PTRS_PER_PGD);
18051 - static unsigned long start = VMALLOC_START & PGDIR_MASK;
18052 + unsigned long start = VMALLOC_START & PGDIR_MASK;
18053 unsigned long address;
18054
18055 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
18056 - if (!test_bit(pgd_index(address), insync)) {
18057 - const pgd_t *pgd_ref = pgd_offset_k(address);
18058 - unsigned long flags;
18059 - struct page *page;
18060 -
18061 - if (pgd_none(*pgd_ref))
18062 - continue;
18063 - spin_lock_irqsave(&pgd_lock, flags);
18064 - list_for_each_entry(page, &pgd_list, lru) {
18065 - pgd_t *pgd;
18066 - pgd = (pgd_t *)page_address(page) + pgd_index(address);
18067 - if (pgd_none(*pgd))
18068 - set_pgd(pgd, *pgd_ref);
18069 - else
18070 - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18071 - }
18072 - spin_unlock_irqrestore(&pgd_lock, flags);
18073 - set_bit(pgd_index(address), insync);
18074 + const pgd_t *pgd_ref = pgd_offset_k(address);
18075 + unsigned long flags;
18076 + struct page *page;
18077 +
18078 + if (pgd_none(*pgd_ref))
18079 + continue;
18080 + spin_lock_irqsave(&pgd_lock, flags);
18081 + list_for_each_entry(page, &pgd_list, lru) {
18082 + pgd_t *pgd;
18083 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
18084 + if (pgd_none(*pgd))
18085 + set_pgd(pgd, *pgd_ref);
18086 + else
18087 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18088 }
18089 - if (address == start)
18090 - start = address + PGDIR_SIZE;
18091 + spin_unlock_irqrestore(&pgd_lock, flags);
18092 }
18093 #endif
18094 }
18095 Index: head-2008-12-01/arch/x86/mm/hypervisor.c
18096 ===================================================================
18097 --- head-2008-12-01.orig/arch/x86/mm/hypervisor.c 2008-12-01 11:37:10.000000000 +0100
18098 +++ head-2008-12-01/arch/x86/mm/hypervisor.c 2008-12-01 11:49:07.000000000 +0100
18099 @@ -837,42 +837,9 @@ int write_ldt_entry(struct desc_struct *
18100 return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
18101 }
18102
18103 -#define MAX_BATCHED_FULL_PTES 32
18104 -
18105 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
18106 - unsigned long addr, unsigned long end, pgprot_t newprot,
18107 - int dirty_accountable)
18108 +int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
18109 + int type)
18110 {
18111 - int rc = 0, i = 0;
18112 - mmu_update_t u[MAX_BATCHED_FULL_PTES];
18113 - pte_t *pte;
18114 - spinlock_t *ptl;
18115 -
18116 - if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
18117 - return 0;
18118 -
18119 - pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
18120 - do {
18121 - if (pte_present(*pte)) {
18122 - pte_t ptent = pte_modify(*pte, newprot);
18123 -
18124 - if (dirty_accountable && pte_dirty(ptent))
18125 - ptent = pte_mkwrite(ptent);
18126 - u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
18127 - | ((unsigned long)pte & ~PAGE_MASK)
18128 - | MMU_PT_UPDATE_PRESERVE_AD;
18129 - u[i].val = __pte_val(ptent);
18130 - if (++i == MAX_BATCHED_FULL_PTES) {
18131 - if ((rc = HYPERVISOR_mmu_update(
18132 - &u[0], i, NULL, DOMID_SELF)) != 0)
18133 - break;
18134 - i = 0;
18135 - }
18136 - }
18137 - } while (pte++, addr += PAGE_SIZE, addr != end);
18138 - if (i)
18139 - rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
18140 - pte_unmap_unlock(pte - 1, ptl);
18141 - BUG_ON(rc && rc != -ENOSYS);
18142 - return !rc;
18143 + maddr_t mach_gp = virt_to_machine(gdt + entry);
18144 + return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
18145 }
18146 Index: head-2008-12-01/arch/x86/mm/init_32-xen.c
18147 ===================================================================
18148 --- head-2008-12-01.orig/arch/x86/mm/init_32-xen.c 2008-12-01 11:44:55.000000000 +0100
18149 +++ head-2008-12-01/arch/x86/mm/init_32-xen.c 2008-12-01 11:49:07.000000000 +0100
18150 @@ -54,6 +54,7 @@
18151
18152 unsigned int __VMALLOC_RESERVE = 128 << 20;
18153
18154 +unsigned long max_low_pfn_mapped;
18155 unsigned long max_pfn_mapped;
18156
18157 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18158 @@ -61,6 +62,27 @@ unsigned long highstart_pfn, highend_pfn
18159
18160 static noinline int do_test_wp_bit(void);
18161
18162 +
18163 +static unsigned long __initdata table_start;
18164 +static unsigned long __initdata table_end;
18165 +static unsigned long __initdata table_top;
18166 +
18167 +static int __initdata after_init_bootmem;
18168 +
18169 +static __init void *alloc_low_page(unsigned long *phys)
18170 +{
18171 + unsigned long pfn = table_end++;
18172 + void *adr;
18173 +
18174 + if (pfn >= table_top)
18175 + panic("alloc_low_page: ran out of memory");
18176 +
18177 + adr = __va(pfn * PAGE_SIZE);
18178 + memset(adr, 0, PAGE_SIZE);
18179 + *phys = pfn * PAGE_SIZE;
18180 + return adr;
18181 +}
18182 +
18183 /*
18184 * Creates a middle page table and puts a pointer to it in the
18185 * given global directory entry. This only returns the gd entry
18186 @@ -72,9 +94,12 @@ static pmd_t * __init one_md_table_init(
18187 pmd_t *pmd_table;
18188
18189 #ifdef CONFIG_X86_PAE
18190 + unsigned long phys;
18191 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
18192 - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18193 -
18194 + if (after_init_bootmem)
18195 + pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18196 + else
18197 + pmd_table = (pmd_t *)alloc_low_page(&phys);
18198 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
18199 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
18200 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
18201 @@ -101,12 +126,16 @@ static pte_t * __init one_page_table_ini
18202 #endif
18203 pte_t *page_table = NULL;
18204
18205 + if (after_init_bootmem) {
18206 #ifdef CONFIG_DEBUG_PAGEALLOC
18207 - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18208 + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18209 #endif
18210 - if (!page_table) {
18211 - page_table =
18212 + if (!page_table)
18213 + page_table =
18214 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18215 + } else {
18216 + unsigned long phys;
18217 + page_table = (pte_t *)alloc_low_page(&phys);
18218 }
18219
18220 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
18221 @@ -167,24 +196,24 @@ static inline int is_kernel_text(unsigne
18222 * of max_low_pfn pages, by creating page tables starting from address
18223 * PAGE_OFFSET:
18224 */
18225 -static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
18226 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
18227 + unsigned long start_pfn,
18228 + unsigned long end_pfn,
18229 + int use_pse)
18230 {
18231 int pgd_idx, pmd_idx, pte_ofs;
18232 unsigned long pfn;
18233 pgd_t *pgd;
18234 pmd_t *pmd;
18235 pte_t *pte;
18236 + unsigned pages_2m = 0, pages_4k = 0;
18237
18238 - unsigned long max_ram_pfn = xen_start_info->nr_pages;
18239 - if (max_ram_pfn > max_low_pfn)
18240 - max_ram_pfn = max_low_pfn;
18241 + if (!cpu_has_pse)
18242 + use_pse = 0;
18243
18244 - pgd_idx = pgd_index(PAGE_OFFSET);
18245 + pfn = start_pfn;
18246 + pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18247 pgd = pgd_base + pgd_idx;
18248 - pfn = 0;
18249 - pmd_idx = pmd_index(PAGE_OFFSET);
18250 - pte_ofs = pte_index(PAGE_OFFSET);
18251 -
18252 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
18253 #ifdef CONFIG_XEN
18254 /*
18255 @@ -198,10 +227,16 @@ static void __init kernel_physical_mappi
18256 #else
18257 pmd = one_md_table_init(pgd);
18258 #endif
18259 - if (pfn >= max_low_pfn)
18260 +
18261 + if (pfn >= end_pfn)
18262 continue;
18263 +#ifdef CONFIG_X86_PAE
18264 + pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18265 pmd += pmd_idx;
18266 - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
18267 +#else
18268 + pmd_idx = 0;
18269 +#endif
18270 + for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
18271 pmd++, pmd_idx++) {
18272 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
18273
18274 @@ -211,13 +246,8 @@ static void __init kernel_physical_mappi
18275 /*
18276 * Map with big pages if possible, otherwise
18277 * create normal page tables:
18278 - *
18279 - * Don't use a large page for the first 2/4MB of memory
18280 - * because there are often fixed size MTRRs in there
18281 - * and overlapping MTRRs into large pages can cause
18282 - * slowdowns.
18283 */
18284 - if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
18285 + if (use_pse) {
18286 unsigned int addr2;
18287 pgprot_t prot = PAGE_KERNEL_LARGE;
18288
18289 @@ -228,49 +258,35 @@ static void __init kernel_physical_mappi
18290 is_kernel_text(addr2))
18291 prot = PAGE_KERNEL_LARGE_EXEC;
18292
18293 + pages_2m++;
18294 set_pmd(pmd, pfn_pmd(pfn, prot));
18295
18296 pfn += PTRS_PER_PTE;
18297 - max_pfn_mapped = pfn;
18298 continue;
18299 }
18300 pte = one_page_table_init(pmd);
18301
18302 - for (pte += pte_ofs;
18303 - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
18304 + pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18305 + pte += pte_ofs;
18306 + for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
18307 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
18308 pgprot_t prot = PAGE_KERNEL;
18309
18310 /* XEN: Only map initial RAM allocation. */
18311 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
18312 + if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
18313 continue;
18314 if (is_kernel_text(addr))
18315 prot = PAGE_KERNEL_EXEC;
18316
18317 + pages_4k++;
18318 set_pte(pte, pfn_pte(pfn, prot));
18319 }
18320 - max_pfn_mapped = pfn;
18321 - pte_ofs = 0;
18322 }
18323 - pmd_idx = 0;
18324 }
18325 + update_page_count(PG_LEVEL_2M, pages_2m);
18326 + update_page_count(PG_LEVEL_4K, pages_4k);
18327 }
18328
18329 -#ifndef CONFIG_XEN
18330 -
18331 -static inline int page_kills_ppro(unsigned long pagenr)
18332 -{
18333 - if (pagenr >= 0x70000 && pagenr <= 0x7003F)
18334 - return 1;
18335 - return 0;
18336 -}
18337 -
18338 -#else
18339 -
18340 -#define page_kills_ppro(p) 0
18341 -
18342 -#endif
18343 -
18344 /*
18345 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
18346 * is valid. The argument is a physical page number.
18347 @@ -331,30 +347,63 @@ static void __init permanent_kmaps_init(
18348 pkmap_page_table = pte;
18349 }
18350
18351 -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
18352 +static void __init add_one_highpage_init(struct page *page, int pfn)
18353 {
18354 - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
18355 - ClearPageReserved(page);
18356 - init_page_count(page);
18357 - if (pfn < xen_start_info->nr_pages)
18358 - __free_page(page);
18359 - totalhigh_pages++;
18360 - } else
18361 - SetPageReserved(page);
18362 + ClearPageReserved(page);
18363 + init_page_count(page);
18364 + if (pfn < xen_start_info->nr_pages)
18365 + __free_page(page);
18366 + totalhigh_pages++;
18367 +}
18368 +
18369 +struct add_highpages_data {
18370 + unsigned long start_pfn;
18371 + unsigned long end_pfn;
18372 +};
18373 +
18374 +static int __init add_highpages_work_fn(unsigned long start_pfn,
18375 + unsigned long end_pfn, void *datax)
18376 +{
18377 + int node_pfn;
18378 + struct page *page;
18379 + unsigned long final_start_pfn, final_end_pfn;
18380 + struct add_highpages_data *data;
18381 +
18382 + data = (struct add_highpages_data *)datax;
18383 +
18384 + final_start_pfn = max(start_pfn, data->start_pfn);
18385 + final_end_pfn = min(end_pfn, data->end_pfn);
18386 + if (final_start_pfn >= final_end_pfn)
18387 + return 0;
18388 +
18389 + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
18390 + node_pfn++) {
18391 + if (!pfn_valid(node_pfn))
18392 + continue;
18393 + page = pfn_to_page(node_pfn);
18394 + add_one_highpage_init(page, node_pfn);
18395 + }
18396 +
18397 + return 0;
18398 +
18399 +}
18400 +
18401 +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
18402 + unsigned long end_pfn)
18403 +{
18404 + struct add_highpages_data data;
18405 +
18406 + data.start_pfn = start_pfn;
18407 + data.end_pfn = end_pfn;
18408 +
18409 + work_with_active_regions(nid, add_highpages_work_fn, &data);
18410 }
18411
18412 #ifndef CONFIG_NUMA
18413 -static void __init set_highmem_pages_init(int bad_ppro)
18414 +static void __init set_highmem_pages_init(void)
18415 {
18416 - int pfn;
18417 + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
18418
18419 - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
18420 - /*
18421 - * Holes under sparsemem might not have no mem_map[]:
18422 - */
18423 - if (pfn_valid(pfn))
18424 - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
18425 - }
18426 totalram_pages += totalhigh_pages;
18427 }
18428 #endif /* !CONFIG_NUMA */
18429 @@ -362,24 +411,11 @@ static void __init set_highmem_pages_ini
18430 #else
18431 # define kmap_init() do { } while (0)
18432 # define permanent_kmaps_init(pgd_base) do { } while (0)
18433 -# define set_highmem_pages_init(bad_ppro) do { } while (0)
18434 +# define set_highmem_pages_init() do { } while (0)
18435 #endif /* CONFIG_HIGHMEM */
18436
18437 -pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
18438 -EXPORT_SYMBOL(__PAGE_KERNEL);
18439 -
18440 -pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
18441 -
18442 pgd_t *swapper_pg_dir;
18443
18444 -static void __init xen_pagetable_setup_start(pgd_t *base)
18445 -{
18446 -}
18447 -
18448 -static void __init xen_pagetable_setup_done(pgd_t *base)
18449 -{
18450 -}
18451 -
18452 /*
18453 * Build a proper pagetable for the kernel mappings. Up until this
18454 * point, we've been running on some set of pagetables constructed by
18455 @@ -399,27 +435,10 @@ static void __init xen_pagetable_setup_d
18456 * be partially populated, and so it avoids stomping on any existing
18457 * mappings.
18458 */
18459 -static void __init pagetable_init(void)
18460 +static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
18461 {
18462 - pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18463 unsigned long vaddr, end;
18464
18465 - xen_pagetable_setup_start(pgd_base);
18466 -
18467 - /* Enable PSE if available */
18468 - if (cpu_has_pse)
18469 - set_in_cr4(X86_CR4_PSE);
18470 -
18471 - /* Enable PGE if available */
18472 - if (cpu_has_pge) {
18473 - set_in_cr4(X86_CR4_PGE);
18474 - __PAGE_KERNEL |= _PAGE_GLOBAL;
18475 - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
18476 - }
18477 -
18478 - kernel_physical_mapping_init(pgd_base);
18479 - remap_numa_kva();
18480 -
18481 /*
18482 * Fixed mappings, only the page table structure has to be
18483 * created - mappings will be set by set_fixmap():
18484 @@ -429,10 +448,13 @@ static void __init pagetable_init(void)
18485 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
18486 page_table_range_init(vaddr, end, pgd_base);
18487 early_ioremap_reset();
18488 +}
18489
18490 - permanent_kmaps_init(pgd_base);
18491 +static void __init pagetable_init(void)
18492 +{
18493 + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18494
18495 - xen_pagetable_setup_done(pgd_base);
18496 + permanent_kmaps_init(pgd_base);
18497 }
18498
18499 #if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
18500 @@ -475,7 +497,7 @@ void zap_low_mappings(void)
18501
18502 int nx_enabled;
18503
18504 -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
18505 +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
18506 EXPORT_SYMBOL_GPL(__supported_pte_mask);
18507
18508 #ifdef CONFIG_X86_PAE
18509 @@ -528,42 +550,369 @@ static void __init set_nx(void)
18510 }
18511 #endif
18512
18513 +/* user-defined highmem size */
18514 +static unsigned int highmem_pages = -1;
18515 +
18516 /*
18517 - * paging_init() sets up the page tables - note that the first 8MB are
18518 - * already mapped by head.S.
18519 - *
18520 - * This routines also unmaps the page at virtual kernel address 0, so
18521 - * that we can trap those pesky NULL-reference errors in the kernel.
18522 + * highmem=size forces highmem to be exactly 'size' bytes.
18523 + * This works even on boxes that have no highmem otherwise.
18524 + * This also works to reduce highmem size on bigger boxes.
18525 */
18526 -void __init paging_init(void)
18527 +static int __init parse_highmem(char *arg)
18528 +{
18529 + if (!arg)
18530 + return -EINVAL;
18531 +
18532 + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
18533 + return 0;
18534 +}
18535 +early_param("highmem", parse_highmem);
18536 +
18537 +/*
18538 + * Determine low and high memory ranges:
18539 + */
18540 +void __init find_low_pfn_range(void)
18541 +{
18542 + /* it could update max_pfn */
18543 +
18544 + /* max_low_pfn is 0, we already have early_res support */
18545 +
18546 + max_low_pfn = max_pfn;
18547 + if (max_low_pfn > MAXMEM_PFN) {
18548 + if (highmem_pages == -1)
18549 + highmem_pages = max_pfn - MAXMEM_PFN;
18550 + if (highmem_pages + MAXMEM_PFN < max_pfn)
18551 + max_pfn = MAXMEM_PFN + highmem_pages;
18552 + if (highmem_pages + MAXMEM_PFN > max_pfn) {
18553 + printk(KERN_WARNING "only %luMB highmem pages "
18554 + "available, ignoring highmem size of %uMB.\n",
18555 + pages_to_mb(max_pfn - MAXMEM_PFN),
18556 + pages_to_mb(highmem_pages));
18557 + highmem_pages = 0;
18558 + }
18559 + max_low_pfn = MAXMEM_PFN;
18560 +#ifndef CONFIG_HIGHMEM
18561 + /* Maximum memory usable is what is directly addressable */
18562 + printk(KERN_WARNING "Warning only %ldMB will be used.\n",
18563 + MAXMEM>>20);
18564 + if (max_pfn > MAX_NONPAE_PFN)
18565 + printk(KERN_WARNING
18566 + "Use a HIGHMEM64G enabled kernel.\n");
18567 + else
18568 + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
18569 + max_pfn = MAXMEM_PFN;
18570 +#else /* !CONFIG_HIGHMEM */
18571 +#ifndef CONFIG_HIGHMEM64G
18572 + if (max_pfn > MAX_NONPAE_PFN) {
18573 + max_pfn = MAX_NONPAE_PFN;
18574 + printk(KERN_WARNING "Warning only 4GB will be used."
18575 + "Use a HIGHMEM64G enabled kernel.\n");
18576 + }
18577 +#endif /* !CONFIG_HIGHMEM64G */
18578 +#endif /* !CONFIG_HIGHMEM */
18579 + } else {
18580 + if (highmem_pages == -1)
18581 + highmem_pages = 0;
18582 +#ifdef CONFIG_HIGHMEM
18583 + if (highmem_pages >= max_pfn) {
18584 + printk(KERN_ERR "highmem size specified (%uMB) is "
18585 + "bigger than pages available (%luMB)!.\n",
18586 + pages_to_mb(highmem_pages),
18587 + pages_to_mb(max_pfn));
18588 + highmem_pages = 0;
18589 + }
18590 + if (highmem_pages) {
18591 + if (max_low_pfn - highmem_pages <
18592 + 64*1024*1024/PAGE_SIZE){
18593 + printk(KERN_ERR "highmem size %uMB results in "
18594 + "smaller than 64MB lowmem, ignoring it.\n"
18595 + , pages_to_mb(highmem_pages));
18596 + highmem_pages = 0;
18597 + }
18598 + max_low_pfn -= highmem_pages;
18599 + }
18600 +#else
18601 + if (highmem_pages)
18602 + printk(KERN_ERR "ignoring highmem size on non-highmem"
18603 + " kernel!\n");
18604 +#endif
18605 + }
18606 +}
18607 +
18608 +#ifndef CONFIG_NEED_MULTIPLE_NODES
18609 +void __init initmem_init(unsigned long start_pfn,
18610 + unsigned long end_pfn)
18611 +{
18612 +#ifdef CONFIG_HIGHMEM
18613 + highstart_pfn = highend_pfn = max_pfn;
18614 + if (max_pfn > max_low_pfn)
18615 + highstart_pfn = max_low_pfn;
18616 + memory_present(0, 0, highend_pfn);
18617 + e820_register_active_regions(0, 0, highend_pfn);
18618 + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
18619 + pages_to_mb(highend_pfn - highstart_pfn));
18620 + num_physpages = highend_pfn;
18621 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
18622 +#else
18623 + memory_present(0, 0, max_low_pfn);
18624 + e820_register_active_regions(0, 0, max_low_pfn);
18625 + num_physpages = max_low_pfn;
18626 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
18627 +#endif
18628 +#ifdef CONFIG_FLATMEM
18629 + max_mapnr = num_physpages;
18630 +#endif
18631 + printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
18632 + pages_to_mb(max_low_pfn));
18633 +
18634 + setup_bootmem_allocator();
18635 +}
18636 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
18637 +
18638 +static void __init zone_sizes_init(void)
18639 +{
18640 + unsigned long max_zone_pfns[MAX_NR_ZONES];
18641 + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
18642 + max_zone_pfns[ZONE_DMA] =
18643 + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
18644 + max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
18645 +#ifdef CONFIG_HIGHMEM
18646 + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
18647 +#endif
18648 +
18649 + free_area_init_nodes(max_zone_pfns);
18650 +}
18651 +
18652 +void __init setup_bootmem_allocator(void)
18653 {
18654 int i;
18655 + unsigned long bootmap_size, bootmap;
18656 + unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
18657 +
18658 + /*
18659 + * Initialize the boot-time allocator (with low memory only):
18660 + */
18661 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
18662 + bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
18663 + max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
18664 + PAGE_SIZE);
18665 + if (bootmap == -1L)
18666 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
18667 + reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
18668 +
18669 + /* don't touch min_low_pfn */
18670 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
18671 + min_low_pfn, end_pfn);
18672 + printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
18673 + max_pfn_mapped<<PAGE_SHIFT);
18674 + printk(KERN_INFO " low ram: %08lx - %08lx\n",
18675 + min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
18676 + printk(KERN_INFO " bootmap %08lx - %08lx\n",
18677 + bootmap, bootmap + bootmap_size);
18678 + for_each_online_node(i)
18679 + free_bootmem_with_active_regions(i, end_pfn);
18680 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
18681 +
18682 + after_init_bootmem = 1;
18683 +}
18684 +
18685 +static unsigned long __init extend_init_mapping(unsigned long tables_space)
18686 +{
18687 + unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
18688 + + xen_start_info->nr_pt_frames;
18689 + unsigned long start = start_pfn, va;
18690 + pgd_t *pgd;
18691 + pud_t *pud;
18692 + pmd_t *pmd;
18693 + pte_t *pte;
18694 +
18695 + /* Kill mapping of low 1MB. */
18696 + for (va = PAGE_OFFSET; va < (unsigned long)&_text; va += PAGE_SIZE)
18697 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
18698 + BUG();
18699 +
18700 + /* Ensure init mappings cover kernel text/data and initial tables. */
18701 + while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
18702 + pgd = pgd_offset_k(va);
18703 + pud = pud_offset(pgd, va);
18704 + pmd = pmd_offset(pud, va);
18705 + if (pmd_none(*pmd)) {
18706 + unsigned long pa = start_pfn++ << PAGE_SHIFT;
18707 +
18708 + memset(__va(pa), 0, PAGE_SIZE);
18709 + make_lowmem_page_readonly(__va(pa),
18710 + XENFEAT_writable_page_tables);
18711 + xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
18712 + }
18713 + pte = pte_offset_kernel(pmd, va);
18714 + if (pte_none(*pte)) {
18715 + pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
18716 +
18717 + if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
18718 + BUG();
18719 + }
18720 + va += PAGE_SIZE;
18721 + }
18722 +
18723 + /* Finally, blow away any spurious initial mappings. */
18724 + while (1) {
18725 + pgd = pgd_offset_k(va);
18726 + pud = pud_offset(pgd, va);
18727 + pmd = pmd_offset(pud, va);
18728 + if (pmd_none(*pmd))
18729 + break;
18730 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
18731 + BUG();
18732 + va += PAGE_SIZE;
18733 + }
18734 +
18735 + if (start_pfn > start)
18736 + reserve_early(start << PAGE_SHIFT,
18737 + start_pfn << PAGE_SHIFT, "INITMAP");
18738 +
18739 + return start_pfn;
18740 +}
18741 +
18742 +static void __init find_early_table_space(unsigned long end)
18743 +{
18744 + unsigned long puds, pmds, ptes, tables;
18745 +
18746 + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
18747 + tables = PAGE_ALIGN(puds * sizeof(pud_t));
18748 +
18749 + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
18750 + tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
18751 +
18752 + if (cpu_has_pse) {
18753 + unsigned long extra;
18754 +
18755 + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
18756 + extra += PMD_SIZE;
18757 + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
18758 + } else
18759 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
18760 +
18761 + tables += PAGE_ALIGN(ptes * sizeof(pte_t));
18762 +
18763 + /* for fixmap */
18764 + tables += PAGE_SIZE
18765 + * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK)
18766 + - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK))
18767 + >> PMD_SHIFT);
18768 +
18769 + table_start = extend_init_mapping(tables);
18770 +
18771 + table_end = table_start;
18772 + table_top = table_start + (tables>>PAGE_SHIFT);
18773 +
18774 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
18775 + end, table_start << PAGE_SHIFT,
18776 + (table_start << PAGE_SHIFT) + tables);
18777 +}
18778 +
18779 +unsigned long __init_refok init_memory_mapping(unsigned long start,
18780 + unsigned long end)
18781 +{
18782 + pgd_t *pgd_base = swapper_pg_dir;
18783 + unsigned long start_pfn, end_pfn;
18784 + unsigned long big_page_start;
18785 +
18786 + /*
18787 + * Find space for the kernel direct mapping tables.
18788 + */
18789 + if (!after_init_bootmem)
18790 + find_early_table_space(end);
18791
18792 #ifdef CONFIG_X86_PAE
18793 set_nx();
18794 if (nx_enabled)
18795 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
18796 #endif
18797 +
18798 + /* Enable PSE if available */
18799 + if (cpu_has_pse)
18800 + set_in_cr4(X86_CR4_PSE);
18801 +
18802 + /* Enable PGE if available */
18803 + if (cpu_has_pge) {
18804 + set_in_cr4(X86_CR4_PGE);
18805 + __supported_pte_mask |= _PAGE_GLOBAL;
18806 + }
18807 +
18808 + /*
18809 + * Don't use a large page for the first 2/4MB of memory
18810 + * because there are often fixed size MTRRs in there
18811 + * and overlapping MTRRs into large pages can cause
18812 + * slowdowns.
18813 + */
18814 + big_page_start = PMD_SIZE;
18815 +
18816 + if (start < big_page_start) {
18817 + start_pfn = start >> PAGE_SHIFT;
18818 + end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
18819 + } else {
18820 + /* head is not big page alignment ? */
18821 + start_pfn = start >> PAGE_SHIFT;
18822 + end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18823 + << (PMD_SHIFT - PAGE_SHIFT);
18824 + }
18825 + if (start_pfn < end_pfn)
18826 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
18827 +
18828 + /* big page range */
18829 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18830 + << (PMD_SHIFT - PAGE_SHIFT);
18831 + if (start_pfn < (big_page_start >> PAGE_SHIFT))
18832 + start_pfn = big_page_start >> PAGE_SHIFT;
18833 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
18834 + if (start_pfn < end_pfn)
18835 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
18836 + cpu_has_pse);
18837 +
18838 + /* tail is not big page alignment ? */
18839 + start_pfn = end_pfn;
18840 + if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
18841 + end_pfn = end >> PAGE_SHIFT;
18842 + if (start_pfn < end_pfn)
18843 + kernel_physical_mapping_init(pgd_base, start_pfn,
18844 + end_pfn, 0);
18845 + }
18846 +
18847 + early_ioremap_page_table_range_init(pgd_base);
18848 +
18849 + __flush_tlb_all();
18850 +
18851 + if (!after_init_bootmem)
18852 + reserve_early(table_start << PAGE_SHIFT,
18853 + table_end << PAGE_SHIFT, "PGTABLE");
18854 +
18855 + if (!after_init_bootmem)
18856 + early_memtest(start, end);
18857 +
18858 + return end >> PAGE_SHIFT;
18859 +}
18860 +
18861 +
18862 +/*
18863 + * paging_init() sets up the page tables - note that the first 8MB are
18864 + * already mapped by head.S.
18865 + *
18866 + * This routines also unmaps the page at virtual kernel address 0, so
18867 + * that we can trap those pesky NULL-reference errors in the kernel.
18868 + */
18869 +void __init paging_init(void)
18870 +{
18871 pagetable_init();
18872
18873 __flush_tlb_all();
18874
18875 kmap_init();
18876
18877 - /* Switch to the real shared_info page, and clear the
18878 - * dummy page. */
18879 - set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
18880 - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
18881 - memset(empty_zero_page, 0, sizeof(empty_zero_page));
18882 -
18883 - /* Setup mapping of lower 1st MB */
18884 - for (i = 0; i < NR_FIX_ISAMAPS; i++)
18885 - if (is_initial_xendomain())
18886 - set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
18887 - else
18888 - __set_fixmap(FIX_ISAMAP_BEGIN - i,
18889 - virt_to_machine(empty_zero_page),
18890 - PAGE_KERNEL_RO);
18891 + /*
18892 + * NOTE: at this point the bootmem allocator is fully available.
18893 + */
18894 + sparse_init();
18895 + zone_sizes_init();
18896 }
18897
18898 /*
18899 @@ -598,7 +947,7 @@ static struct kcore_list kcore_mem, kcor
18900 void __init mem_init(void)
18901 {
18902 int codesize, reservedpages, datasize, initsize;
18903 - int tmp, bad_ppro;
18904 + int tmp;
18905 unsigned long pfn;
18906
18907 pci_iommu_alloc();
18908 @@ -606,19 +955,6 @@ void __init mem_init(void)
18909 #ifdef CONFIG_FLATMEM
18910 BUG_ON(!mem_map);
18911 #endif
18912 - bad_ppro = ppro_with_ram_bug();
18913 -
18914 -#ifdef CONFIG_HIGHMEM
18915 - /* check that fixmap and pkmap do not overlap */
18916 - if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
18917 - printk(KERN_ERR
18918 - "fixmap and kmap areas overlap - this will crash\n");
18919 - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
18920 - PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
18921 - FIXADDR_START);
18922 - BUG();
18923 - }
18924 -#endif
18925 /* this will put all low memory onto the freelists */
18926 totalram_pages += free_all_bootmem();
18927 /* XEN: init and count low-mem pages outside initial allocation. */
18928 @@ -636,7 +972,7 @@ void __init mem_init(void)
18929 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
18930 reservedpages++;
18931
18932 - set_highmem_pages_init(bad_ppro);
18933 + set_highmem_pages_init();
18934
18935 codesize = (unsigned long) &_etext - (unsigned long) &_text;
18936 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
18937 @@ -657,7 +993,6 @@ void __init mem_init(void)
18938 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
18939 );
18940
18941 -#if 1 /* double-sanity-check paranoia */
18942 printk(KERN_INFO "virtual kernel memory layout:\n"
18943 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
18944 #ifdef CONFIG_HIGHMEM
18945 @@ -698,7 +1033,6 @@ void __init mem_init(void)
18946 #endif
18947 BUG_ON(VMALLOC_START > VMALLOC_END);
18948 BUG_ON((unsigned long)high_memory > VMALLOC_START);
18949 -#endif /* double-sanity-check paranoia */
18950
18951 if (boot_cpu_data.wp_works_ok < 0)
18952 test_wp_bit();
18953 @@ -755,6 +1089,8 @@ void mark_rodata_ro(void)
18954 unsigned long start = PFN_ALIGN(_text);
18955 unsigned long size = PFN_ALIGN(_etext) - start;
18956
18957 +#ifndef CONFIG_DYNAMIC_FTRACE
18958 + /* Dynamic tracing modifies the kernel text section */
18959 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
18960 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
18961 size >> 10);
18962 @@ -767,6 +1103,8 @@ void mark_rodata_ro(void)
18963 printk(KERN_INFO "Testing CPA: write protecting again\n");
18964 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
18965 #endif
18966 +#endif /* CONFIG_DYNAMIC_FTRACE */
18967 +
18968 start += size;
18969 size = (unsigned long)__end_rodata - start;
18970 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
18971 @@ -829,3 +1167,9 @@ void free_initrd_mem(unsigned long start
18972 free_init_pages("initrd memory", start, end);
18973 }
18974 #endif
18975 +
18976 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
18977 + int flags)
18978 +{
18979 + return reserve_bootmem(phys, len, flags);
18980 +}
18981 Index: head-2008-12-01/arch/x86/mm/init_64-xen.c
18982 ===================================================================
18983 --- head-2008-12-01.orig/arch/x86/mm/init_64-xen.c 2008-12-01 11:44:55.000000000 +0100
18984 +++ head-2008-12-01/arch/x86/mm/init_64-xen.c 2008-12-01 11:49:07.000000000 +0100
18985 @@ -21,6 +21,7 @@
18986 #include <linux/swap.h>
18987 #include <linux/smp.h>
18988 #include <linux/init.h>
18989 +#include <linux/initrd.h>
18990 #include <linux/pagemap.h>
18991 #include <linux/bootmem.h>
18992 #include <linux/proc_fs.h>
18993 @@ -52,6 +53,14 @@
18994
18995 #include <xen/features.h>
18996
18997 +/*
18998 + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
18999 + * The direct mapping extends to max_pfn_mapped, so that we can directly access
19000 + * apertures, ACPI and other tables without having to play with fixmaps.
19001 + */
19002 +unsigned long max_low_pfn_mapped;
19003 +unsigned long max_pfn_mapped;
19004 +
19005 #if CONFIG_XEN_COMPAT <= 0x030002
19006 unsigned int __kernel_page_user;
19007 EXPORT_SYMBOL(__kernel_page_user);
19008 @@ -60,12 +69,11 @@ EXPORT_SYMBOL(__kernel_page_user);
19009 int after_bootmem;
19010
19011 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
19012 -extern unsigned long start_pfn;
19013
19014 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
19015 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
19016
19017 -int direct_gbpages __meminitdata
19018 +int direct_gbpages
19019 #ifdef CONFIG_DIRECT_GBPAGES
19020 = 1
19021 #endif
19022 @@ -145,55 +153,23 @@ void __meminit early_make_page_readonly(
19023 * around without checking the pgd every time.
19024 */
19025
19026 -void show_mem(void)
19027 -{
19028 - long i, total = 0, reserved = 0;
19029 - long shared = 0, cached = 0;
19030 - struct page *page;
19031 - pg_data_t *pgdat;
19032 -
19033 - printk(KERN_INFO "Mem-info:\n");
19034 - show_free_areas();
19035 - for_each_online_pgdat(pgdat) {
19036 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
19037 - /*
19038 - * This loop can take a while with 256 GB and
19039 - * 4k pages so defer the NMI watchdog:
19040 - */
19041 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
19042 - touch_nmi_watchdog();
19043 -
19044 - if (!pfn_valid(pgdat->node_start_pfn + i))
19045 - continue;
19046 -
19047 - page = pfn_to_page(pgdat->node_start_pfn + i);
19048 - total++;
19049 - if (PageReserved(page))
19050 - reserved++;
19051 - else if (PageSwapCache(page))
19052 - cached++;
19053 - else if (page_count(page))
19054 - shared += page_count(page) - 1;
19055 - }
19056 - }
19057 - printk(KERN_INFO "%lu pages of RAM\n", total);
19058 - printk(KERN_INFO "%lu reserved pages\n", reserved);
19059 - printk(KERN_INFO "%lu pages shared\n", shared);
19060 - printk(KERN_INFO "%lu pages swap cached\n", cached);
19061 -}
19062 -
19063 static unsigned long __meminitdata table_start;
19064 +static unsigned long __meminitdata table_cur;
19065 static unsigned long __meminitdata table_end;
19066
19067 -static __init void *spp_getpage(void)
19068 +/*
19069 + * NOTE: This function is marked __ref because it calls __init function
19070 + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
19071 + */
19072 +static __ref void *spp_getpage(void)
19073 {
19074 void *ptr;
19075
19076 if (after_bootmem)
19077 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
19078 - else if (start_pfn < table_end) {
19079 - ptr = __va(start_pfn << PAGE_SHIFT);
19080 - start_pfn++;
19081 + else if (table_cur < table_end) {
19082 + ptr = __va(table_cur << PAGE_SHIFT);
19083 + table_cur++;
19084 memset(ptr, 0, PAGE_SIZE);
19085 } else
19086 ptr = alloc_bootmem_pages(PAGE_SIZE);
19087 @@ -208,30 +184,18 @@ static __init void *spp_getpage(void)
19088 return ptr;
19089 }
19090
19091 -#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
19092 -#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
19093 -
19094 -static __init void
19095 -set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
19096 +void
19097 +set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
19098 {
19099 - pgd_t *pgd;
19100 pud_t *pud;
19101 pmd_t *pmd;
19102 - pte_t *pte, new_pte;
19103 -
19104 - pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
19105 + pte_t *pte;
19106
19107 - pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
19108 - if (pgd_none(*pgd)) {
19109 - printk(KERN_ERR
19110 - "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19111 - return;
19112 - }
19113 - pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
19114 + pud = pud_page + pud_index(vaddr);
19115 if (pud_none(*pud)) {
19116 pmd = (pmd_t *) spp_getpage();
19117 make_page_readonly(pmd, XENFEAT_writable_page_tables);
19118 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19119 + pud_populate(&init_mm, pud, pmd);
19120 if (pmd != pmd_offset(pud, 0)) {
19121 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19122 pmd, pmd_offset(pud, 0));
19123 @@ -242,19 +206,20 @@ set_pte_phys(unsigned long vaddr, unsign
19124 if (pmd_none(*pmd)) {
19125 pte = (pte_t *) spp_getpage();
19126 make_page_readonly(pte, XENFEAT_writable_page_tables);
19127 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19128 + pmd_populate_kernel(&init_mm, pmd, pte);
19129 if (pte != pte_offset_kernel(pmd, 0)) {
19130 printk(KERN_ERR "PAGETABLE BUG #02!\n");
19131 return;
19132 }
19133 }
19134 - if (pgprot_val(prot))
19135 - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
19136 - else
19137 - new_pte = __pte(0);
19138
19139 pte = pte_offset_kernel(pmd, vaddr);
19140 if (!pte_none(*pte) && __pte_val(new_pte) &&
19141 +#ifdef CONFIG_ACPI
19142 + /* __acpi_map_table() fails to properly call clear_fixmap() */
19143 + (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19144 + vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19145 +#endif
19146 __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19147 pte_ERROR(*pte);
19148 set_pte(pte, new_pte);
19149 @@ -266,15 +231,13 @@ set_pte_phys(unsigned long vaddr, unsign
19150 __flush_tlb_one(vaddr);
19151 }
19152
19153 -static __init void
19154 -set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
19155 +void
19156 +set_pte_vaddr(unsigned long vaddr, pte_t pteval)
19157 {
19158 pgd_t *pgd;
19159 - pud_t *pud;
19160 - pmd_t *pmd;
19161 - pte_t *pte, new_pte;
19162 + pud_t *pud_page;
19163
19164 - pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
19165 + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
19166
19167 pgd = pgd_offset_k(vaddr);
19168 if (pgd_none(*pgd)) {
19169 @@ -282,47 +245,51 @@ set_pte_phys_ma(unsigned long vaddr, uns
19170 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19171 return;
19172 }
19173 - pud = pud_offset(pgd, vaddr);
19174 - if (pud_none(*pud)) {
19175 - pmd = (pmd_t *) spp_getpage();
19176 - make_page_readonly(pmd, XENFEAT_writable_page_tables);
19177 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19178 - if (pmd != pmd_offset(pud, 0)) {
19179 - printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19180 - pmd, pmd_offset(pud, 0));
19181 + pud_page = (pud_t*)pgd_page_vaddr(*pgd);
19182 + set_pte_vaddr_pud(pud_page, vaddr, pteval);
19183 +}
19184 +
19185 +#ifndef CONFIG_XEN
19186 +/*
19187 + * Create large page table mappings for a range of physical addresses.
19188 + */
19189 +static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
19190 + pgprot_t prot)
19191 +{
19192 + pgd_t *pgd;
19193 + pud_t *pud;
19194 + pmd_t *pmd;
19195 +
19196 + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
19197 + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
19198 + pgd = pgd_offset_k((unsigned long)__va(phys));
19199 + if (pgd_none(*pgd)) {
19200 + pud = (pud_t *) spp_getpage();
19201 + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
19202 + _PAGE_USER));
19203 }
19204 - }
19205 - pmd = pmd_offset(pud, vaddr);
19206 - if (pmd_none(*pmd)) {
19207 - pte = (pte_t *) spp_getpage();
19208 - make_page_readonly(pte, XENFEAT_writable_page_tables);
19209 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19210 - if (pte != pte_offset_kernel(pmd, 0)) {
19211 - printk(KERN_ERR "PAGETABLE BUG #02!\n");
19212 - return;
19213 + pud = pud_offset(pgd, (unsigned long)__va(phys));
19214 + if (pud_none(*pud)) {
19215 + pmd = (pmd_t *) spp_getpage();
19216 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
19217 + _PAGE_USER));
19218 }
19219 + pmd = pmd_offset(pud, phys);
19220 + BUG_ON(!pmd_none(*pmd));
19221 + set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
19222 }
19223 - new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
19224 +}
19225
19226 - pte = pte_offset_kernel(pmd, vaddr);
19227 - if (!pte_none(*pte) && __pte_val(new_pte) &&
19228 -#ifdef CONFIG_ACPI
19229 - /* __acpi_map_table() fails to properly call clear_fixmap() */
19230 - (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19231 - vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19232 -#endif
19233 - __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19234 - pte_ERROR(*pte);
19235 - set_pte(pte, new_pte);
19236 +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
19237 +{
19238 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
19239 +}
19240
19241 - /*
19242 - * It's enough to flush this one mapping.
19243 - * (PGE mappings get flushed as well)
19244 - */
19245 - __flush_tlb_one(vaddr);
19246 +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
19247 +{
19248 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
19249 }
19250
19251 -#ifndef CONFIG_XEN
19252 /*
19253 * The head.S code sets up the kernel high mapping:
19254 *
19255 @@ -352,33 +319,9 @@ void __init cleanup_highmap(void)
19256 }
19257 #endif
19258
19259 -/* NOTE: this is meant to be run only at boot */
19260 -void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
19261 -{
19262 - unsigned long address = __fix_to_virt(idx);
19263 -
19264 - if (idx >= __end_of_fixed_addresses) {
19265 - printk(KERN_ERR "Invalid __set_fixmap\n");
19266 - return;
19267 - }
19268 - switch (idx) {
19269 - case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
19270 - set_pte_phys(address, phys, prot, 0);
19271 - set_pte_phys(address, phys, prot, 1);
19272 - break;
19273 - case FIX_EARLYCON_MEM_BASE:
19274 - xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
19275 - pfn_pte_ma(phys >> PAGE_SHIFT, prot));
19276 - break;
19277 - default:
19278 - set_pte_phys_ma(address, phys, prot);
19279 - break;
19280 - }
19281 -}
19282 -
19283 static __meminit void *alloc_static_page(unsigned long *phys)
19284 {
19285 - unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
19286 + unsigned long va = (table_cur << PAGE_SHIFT) + __START_KERNEL_map;
19287
19288 if (after_bootmem) {
19289 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
19290 @@ -387,13 +330,12 @@ static __meminit void *alloc_static_page
19291 return adr;
19292 }
19293
19294 - *phys = start_pfn << PAGE_SHIFT;
19295 - start_pfn++;
19296 - memset((void *)va, 0, PAGE_SIZE);
19297 - return (void *)va;
19298 + BUG_ON(!table_cur);
19299 + *phys = table_cur++ << PAGE_SHIFT;
19300 + return memset((void *)va, 0, PAGE_SIZE);
19301 }
19302
19303 -#define PTE_SIZE PAGE_SIZE
19304 +#define unmap_low_page(p) ((void)(p))
19305
19306 static inline int __meminit make_readonly(unsigned long paddr)
19307 {
19308 @@ -408,7 +350,7 @@ static inline int __meminit make_readonl
19309 /* Make old page tables read-only. */
19310 if (!xen_feature(XENFEAT_writable_page_tables)
19311 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
19312 - && (paddr < (start_pfn << PAGE_SHIFT)))
19313 + && (paddr < (table_cur << PAGE_SHIFT)))
19314 readonly = 1;
19315
19316 /*
19317 @@ -425,118 +367,129 @@ static inline int __meminit make_readonl
19318 return readonly;
19319 }
19320
19321 -#ifndef CONFIG_XEN
19322 -/* Must run before zap_low_mappings */
19323 -__meminit void *early_ioremap(unsigned long addr, unsigned long size)
19324 +static unsigned long __meminit
19325 +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
19326 {
19327 - pmd_t *pmd, *last_pmd;
19328 - unsigned long vaddr;
19329 - int i, pmds;
19330 -
19331 - pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19332 - vaddr = __START_KERNEL_map;
19333 - pmd = level2_kernel_pgt;
19334 - last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
19335 -
19336 - for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
19337 - for (i = 0; i < pmds; i++) {
19338 - if (pmd_present(pmd[i]))
19339 - goto continue_outer_loop;
19340 - }
19341 - vaddr += addr & ~PMD_MASK;
19342 - addr &= PMD_MASK;
19343 + unsigned pages = 0;
19344 + unsigned long last_map_addr = end;
19345 + int i;
19346
19347 - for (i = 0; i < pmds; i++, addr += PMD_SIZE)
19348 - set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
19349 - __flush_tlb_all();
19350 -
19351 - return (void *)vaddr;
19352 -continue_outer_loop:
19353 - ;
19354 + pte_t *pte = pte_page + pte_index(addr);
19355 +
19356 + for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
19357 + unsigned long pteval = addr | __PAGE_KERNEL;
19358 +
19359 + if (addr >= (after_bootmem
19360 + ? end
19361 + : xen_start_info->nr_pages << PAGE_SHIFT))
19362 + break;
19363 +
19364 + if (__pte_val(*pte))
19365 + continue;
19366 +
19367 + if (make_readonly(addr))
19368 + pteval &= ~_PAGE_RW;
19369 + if (0)
19370 + printk(" pte=%p addr=%lx pte=%016lx\n",
19371 + pte, addr, pteval);
19372 + if (!after_bootmem)
19373 + *pte = __pte(pteval & __supported_pte_mask);
19374 + else
19375 + set_pte(pte, __pte(pteval & __supported_pte_mask));
19376 + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
19377 + pages++;
19378 }
19379 - printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
19380 - return NULL;
19381 + update_page_count(PG_LEVEL_4K, pages);
19382 +
19383 + return last_map_addr;
19384 }
19385
19386 -/*
19387 - * To avoid virtual aliases later:
19388 - */
19389 -__meminit void early_iounmap(void *addr, unsigned long size)
19390 +static unsigned long __meminit
19391 +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
19392 {
19393 - unsigned long vaddr;
19394 - pmd_t *pmd;
19395 - int i, pmds;
19396 -
19397 - vaddr = (unsigned long)addr;
19398 - pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19399 - pmd = level2_kernel_pgt + pmd_index(vaddr);
19400 + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
19401
19402 - for (i = 0; i < pmds; i++)
19403 - pmd_clear(pmd + i);
19404 -
19405 - __flush_tlb_all();
19406 + BUG_ON(!after_bootmem);
19407 + return phys_pte_init(pte, address, end);
19408 }
19409 -#endif
19410
19411 static unsigned long __meminit
19412 -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
19413 +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
19414 + unsigned long page_size_mask)
19415 {
19416 + unsigned long pages = 0;
19417 + unsigned long last_map_addr = end;
19418 + unsigned long start = address;
19419 +
19420 int i = pmd_index(address);
19421
19422 - for (; i < PTRS_PER_PMD; i++) {
19423 + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
19424 unsigned long pte_phys;
19425 - pmd_t *pmd = pmd_page + i;
19426 - pte_t *pte, *pte_save;
19427 - int k;
19428 + pmd_t *pmd = pmd_page + pmd_index(address);
19429 + pte_t *pte;
19430
19431 if (address >= end)
19432 break;
19433
19434 if (__pmd_val(*pmd)) {
19435 - address += PMD_SIZE;
19436 + if (!pmd_large(*pmd)) {
19437 + spin_lock(&init_mm.page_table_lock);
19438 + last_map_addr = phys_pte_update(pmd, address,
19439 + end);
19440 + spin_unlock(&init_mm.page_table_lock);
19441 + }
19442 + /* Count entries we're using from level2_ident_pgt */
19443 + if (start == 0)
19444 + pages++;
19445 continue;
19446 }
19447
19448 - pte = alloc_static_page(&pte_phys);
19449 - pte_save = pte;
19450 - for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
19451 - unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
19452 -
19453 - if (address >= (after_bootmem
19454 - ? end
19455 - : xen_start_info->nr_pages << PAGE_SHIFT))
19456 - pteval = 0;
19457 - else if (make_readonly(address))
19458 - pteval &= ~_PAGE_RW;
19459 - set_pte(pte, __pte(pteval & __supported_pte_mask));
19460 + if (page_size_mask & (1<<PG_LEVEL_2M)) {
19461 + pages++;
19462 + spin_lock(&init_mm.page_table_lock);
19463 + set_pte((pte_t *)pmd,
19464 + pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19465 + spin_unlock(&init_mm.page_table_lock);
19466 + last_map_addr = (address & PMD_MASK) + PMD_SIZE;
19467 + continue;
19468 }
19469 +
19470 + pte = alloc_static_page(&pte_phys);
19471 + last_map_addr = phys_pte_init(pte, address, end);
19472 + unmap_low_page(pte);
19473 +
19474 if (!after_bootmem) {
19475 - early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19476 - *pmd = __pmd(pte_phys | _KERNPG_TABLE);
19477 + early_make_page_readonly(pte, XENFEAT_writable_page_tables);
19478 + *pmd = __pmd(pte_phys | _PAGE_TABLE);
19479 } else {
19480 - make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19481 - set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
19482 + make_page_readonly(pte, XENFEAT_writable_page_tables);
19483 + spin_lock(&init_mm.page_table_lock);
19484 + pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
19485 + spin_unlock(&init_mm.page_table_lock);
19486 }
19487 }
19488 - return address;
19489 + update_page_count(PG_LEVEL_2M, pages);
19490 + return last_map_addr;
19491 }
19492
19493 static unsigned long __meminit
19494 -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
19495 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
19496 + unsigned long page_size_mask)
19497 {
19498 pmd_t *pmd = pmd_offset(pud, 0);
19499 unsigned long last_map_addr;
19500
19501 - spin_lock(&init_mm.page_table_lock);
19502 - last_map_addr = phys_pmd_init(pmd, address, end);
19503 - spin_unlock(&init_mm.page_table_lock);
19504 + BUG_ON(!after_bootmem);
19505 + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
19506 __flush_tlb_all();
19507 return last_map_addr;
19508 }
19509
19510 static unsigned long __meminit
19511 -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
19512 +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
19513 + unsigned long page_size_mask)
19514 {
19515 + unsigned long pages = 0;
19516 unsigned long last_map_addr = end;
19517 int i = pud_index(addr);
19518
19519 @@ -550,29 +503,59 @@ phys_pud_init(pud_t *pud_page, unsigned
19520
19521 if (__pud_val(*pud)) {
19522 if (!pud_large(*pud))
19523 - last_map_addr = phys_pmd_update(pud, addr, end);
19524 + last_map_addr = phys_pmd_update(pud, addr, end,
19525 + page_size_mask);
19526 continue;
19527 }
19528
19529 - if (direct_gbpages) {
19530 + if (page_size_mask & (1<<PG_LEVEL_1G)) {
19531 + pages++;
19532 + spin_lock(&init_mm.page_table_lock);
19533 set_pte((pte_t *)pud,
19534 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19535 + spin_unlock(&init_mm.page_table_lock);
19536 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
19537 continue;
19538 }
19539
19540 pmd = alloc_static_page(&pmd_phys);
19541 + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
19542 + unmap_low_page(pmd);
19543
19544 - spin_lock(&init_mm.page_table_lock);
19545 - *pud = __pud(pmd_phys | _KERNPG_TABLE);
19546 - last_map_addr = phys_pmd_init(pmd, addr, end);
19547 - spin_unlock(&init_mm.page_table_lock);
19548 -
19549 - early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
19550 + if (!after_bootmem) {
19551 + early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
19552 + if (page_size_mask & (1 << PG_LEVEL_NUM))
19553 + xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
19554 + else
19555 + *pud = __pud(pmd_phys | _PAGE_TABLE);
19556 + } else {
19557 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
19558 + spin_lock(&init_mm.page_table_lock);
19559 + pud_populate(&init_mm, pud, __va(pmd_phys));
19560 + spin_unlock(&init_mm.page_table_lock);
19561 + }
19562 }
19563 __flush_tlb_all();
19564 + update_page_count(PG_LEVEL_1G, pages);
19565
19566 - return last_map_addr >> PAGE_SHIFT;
19567 + return last_map_addr;
19568 +}
19569 +
19570 +static unsigned long __meminit
19571 +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
19572 + unsigned long page_size_mask)
19573 +{
19574 + pud_t *pud;
19575 +
19576 + if (!after_bootmem) {
19577 + unsigned long addr = __pgd_val(*pgd), *page;
19578 +
19579 + addr_to_page(addr, page);
19580 + pud = (pud_t *)page;
19581 + } else
19582 + pud = (pud_t *)pgd_page_vaddr(*pgd);
19583 +
19584 + return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
19585 }
19586
19587 void __init xen_init_pt(void)
19588 @@ -654,7 +637,7 @@ void __init xen_init_pt(void)
19589 static void __init extend_init_mapping(unsigned long tables_space)
19590 {
19591 unsigned long va = __START_KERNEL_map;
19592 - unsigned long start = start_pfn;
19593 + unsigned long start = table_cur;
19594 unsigned long phys, addr, *pte_page;
19595 pmd_t *pmd;
19596 pte_t *pte, new_pte;
19597 @@ -674,7 +657,7 @@ static void __init extend_init_mapping(u
19598
19599 /* Ensure init mappings cover kernel text/data and initial tables. */
19600 while (va < (__START_KERNEL_map
19601 - + (start_pfn << PAGE_SHIFT)
19602 + + (table_cur << PAGE_SHIFT)
19603 + tables_space)) {
19604 pmd = (pmd_t *)&page[pmd_index(va)];
19605 if (pmd_none(*pmd)) {
19606 @@ -706,9 +689,9 @@ static void __init extend_init_mapping(u
19607 va += PAGE_SIZE;
19608 }
19609
19610 - if (start_pfn > start)
19611 + if (table_cur > start)
19612 reserve_early(start << PAGE_SHIFT,
19613 - start_pfn << PAGE_SHIFT, "INITMAP");
19614 + table_cur << PAGE_SHIFT, "INITMAP");
19615 }
19616
19617 static void __init find_early_table_space(unsigned long end)
19618 @@ -717,23 +700,25 @@ static void __init find_early_table_spac
19619
19620 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
19621 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
19622 - ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
19623 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
19624
19625 tables = round_up(puds * 8, PAGE_SIZE) +
19626 round_up(pmds * 8, PAGE_SIZE) +
19627 round_up(ptes * 8, PAGE_SIZE);
19628
19629 + table_cur = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
19630 + xen_start_info->nr_pt_frames;
19631 +
19632 extend_init_mapping(tables);
19633
19634 - table_start = start_pfn;
19635 + table_start = table_cur;
19636 table_end = table_start + (tables>>PAGE_SHIFT);
19637
19638 - early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
19639 - end, table_start << PAGE_SHIFT,
19640 - (table_start << PAGE_SHIFT) + tables);
19641 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
19642 + end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
19643 }
19644
19645 -static void __init xen_finish_init_mapping(void)
19646 +static void __init xen_finish_init_mapping(bool reserve)
19647 {
19648 unsigned long i, start, end;
19649
19650 @@ -762,7 +747,8 @@ static void __init xen_finish_init_mappi
19651 if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
19652 BUG();
19653
19654 - /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
19655 + /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
19656 + start = table_cur;
19657 table_end = ~0UL;
19658
19659 /*
19660 @@ -789,8 +775,11 @@ static void __init xen_finish_init_mappi
19661 << PAGE_SHIFT,
19662 PAGE_KERNEL_RO);
19663
19664 - /* Disable the 'start_pfn' allocator. */
19665 - table_end = start_pfn;
19666 + /* Disable the 'table_cur' allocator. */
19667 + table_end = table_cur;
19668 + if (reserve && table_cur > start)
19669 + reserve_early(start << PAGE_SHIFT,
19670 + table_cur << PAGE_SHIFT, "FIXMAP");
19671 }
19672
19673 static void __init init_gbpages(void)
19674 @@ -801,126 +790,89 @@ static void __init init_gbpages(void)
19675 direct_gbpages = 0;
19676 }
19677
19678 -#ifdef CONFIG_MEMTEST_BOOTPARAM
19679 -
19680 -static void __init memtest(unsigned long start_phys, unsigned long size,
19681 - unsigned pattern)
19682 +static unsigned long __init kernel_physical_mapping_init(unsigned long start,
19683 + unsigned long end,
19684 + unsigned long page_size_mask)
19685 {
19686 - unsigned long i;
19687 - unsigned long *start;
19688 - unsigned long start_bad;
19689 - unsigned long last_bad;
19690 - unsigned long val;
19691 - unsigned long start_phys_aligned;
19692 - unsigned long count;
19693 - unsigned long incr;
19694 -
19695 - switch (pattern) {
19696 - case 0:
19697 - val = 0UL;
19698 - break;
19699 - case 1:
19700 - val = -1UL;
19701 - break;
19702 - case 2:
19703 - val = 0x5555555555555555UL;
19704 - break;
19705 - case 3:
19706 - val = 0xaaaaaaaaaaaaaaaaUL;
19707 - break;
19708 - default:
19709 - return;
19710 - }
19711
19712 - incr = sizeof(unsigned long);
19713 - start_phys_aligned = ALIGN(start_phys, incr);
19714 - count = (size - (start_phys_aligned - start_phys))/incr;
19715 - start = __va(start_phys_aligned);
19716 - start_bad = 0;
19717 - last_bad = 0;
19718 -
19719 - for (i = 0; i < count; i++)
19720 - start[i] = val;
19721 - for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
19722 - if (*start != val) {
19723 - if (start_phys_aligned == last_bad + incr) {
19724 - last_bad += incr;
19725 - } else {
19726 - if (start_bad) {
19727 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19728 - val, start_bad, last_bad + incr);
19729 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19730 - }
19731 - start_bad = last_bad = start_phys_aligned;
19732 - }
19733 - }
19734 - }
19735 - if (start_bad) {
19736 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19737 - val, start_bad, last_bad + incr);
19738 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19739 - }
19740 -
19741 -}
19742 -
19743 -static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
19744 -
19745 -static int __init parse_memtest(char *arg)
19746 -{
19747 - if (arg)
19748 - memtest_pattern = simple_strtoul(arg, NULL, 0);
19749 - return 0;
19750 -}
19751 + unsigned long next, last_map_addr = end;
19752
19753 -early_param("memtest", parse_memtest);
19754 + start = (unsigned long)__va(start);
19755 + end = (unsigned long)__va(end);
19756
19757 -static void __init early_memtest(unsigned long start, unsigned long end)
19758 -{
19759 - u64 t_start, t_size;
19760 - unsigned pattern;
19761 + for (; start < end; start = next) {
19762 + pgd_t *pgd = pgd_offset_k(start);
19763 + unsigned long pud_phys;
19764 + pud_t *pud;
19765
19766 - if (!memtest_pattern)
19767 - return;
19768 + next = (start + PGDIR_SIZE) & PGDIR_MASK;
19769 + if (next > end)
19770 + next = end;
19771
19772 - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
19773 - for (pattern = 0; pattern < memtest_pattern; pattern++) {
19774 - t_start = start;
19775 - t_size = 0;
19776 - while (t_start < end) {
19777 - t_start = find_e820_area_size(t_start, &t_size, 1);
19778 -
19779 - /* done ? */
19780 - if (t_start >= end)
19781 - break;
19782 - if (t_start + t_size > end)
19783 - t_size = end - t_start;
19784 -
19785 - printk(KERN_CONT "\n %016llx - %016llx pattern %d",
19786 - (unsigned long long)t_start,
19787 - (unsigned long long)t_start + t_size, pattern);
19788 + if (__pgd_val(*pgd)) {
19789 + last_map_addr = phys_pud_update(pgd, __pa(start),
19790 + __pa(end), page_size_mask);
19791 + continue;
19792 + }
19793
19794 - memtest(t_start, t_size, pattern);
19795 + pud = alloc_static_page(&pud_phys);
19796 + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
19797 + page_size_mask);
19798 + unmap_low_page(pud);
19799
19800 - t_start += t_size;
19801 + if(!after_bootmem) {
19802 + early_make_page_readonly(pud, XENFEAT_writable_page_tables);
19803 + xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
19804 + } else {
19805 + make_page_readonly(pud, XENFEAT_writable_page_tables);
19806 + spin_lock(&init_mm.page_table_lock);
19807 + pgd_populate(&init_mm, pgd, __va(pud_phys));
19808 + spin_unlock(&init_mm.page_table_lock);
19809 }
19810 }
19811 - printk(KERN_CONT "\n");
19812 +
19813 + return last_map_addr;
19814 }
19815 -#else
19816 -static void __init early_memtest(unsigned long start, unsigned long end)
19817 +
19818 +struct map_range {
19819 + unsigned long start;
19820 + unsigned long end;
19821 + unsigned page_size_mask;
19822 +};
19823 +
19824 +#define NR_RANGE_MR 5
19825 +
19826 +static int save_mr(struct map_range *mr, int nr_range,
19827 + unsigned long start_pfn, unsigned long end_pfn,
19828 + unsigned long page_size_mask)
19829 {
19830 +
19831 + if (start_pfn < end_pfn) {
19832 + if (nr_range >= NR_RANGE_MR)
19833 + panic("run out of range for init_memory_mapping\n");
19834 + mr[nr_range].start = start_pfn<<PAGE_SHIFT;
19835 + mr[nr_range].end = end_pfn<<PAGE_SHIFT;
19836 + mr[nr_range].page_size_mask = page_size_mask;
19837 + nr_range++;
19838 + }
19839 +
19840 + return nr_range;
19841 }
19842 -#endif
19843
19844 /*
19845 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
19846 * This runs before bootmem is initialized and gets pages directly from
19847 * the physical memory. To access them they are temporarily mapped.
19848 */
19849 -unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
19850 +unsigned long __init_refok init_memory_mapping(unsigned long start,
19851 + unsigned long end)
19852 {
19853 - unsigned long next, last_map_addr = end;
19854 - unsigned long start_phys = start, end_phys = end;
19855 + unsigned long last_map_addr = 0;
19856 + unsigned long page_size_mask = 0;
19857 + unsigned long start_pfn, end_pfn;
19858 + bool first = !table_start;
19859 + struct map_range mr[NR_RANGE_MR];
19860 + int nr_range, i;
19861
19862 printk(KERN_INFO "init_memory_mapping\n");
19863
19864 @@ -931,51 +883,123 @@ unsigned long __init_refok init_memory_m
19865 * memory mapped. Unfortunately this is done currently before the
19866 * nodes are discovered.
19867 */
19868 - if (!after_bootmem) {
19869 + if (!after_bootmem)
19870 init_gbpages();
19871 - find_early_table_space(end);
19872 - }
19873
19874 - start = (unsigned long)__va(start);
19875 - end = (unsigned long)__va(end);
19876 + if (direct_gbpages)
19877 + page_size_mask |= 1 << PG_LEVEL_1G;
19878 + if (cpu_has_pse)
19879 + page_size_mask |= 1 << PG_LEVEL_2M;
19880 +
19881 + memset(mr, 0, sizeof(mr));
19882 + nr_range = 0;
19883 +
19884 + /* head if not big page alignment ?*/
19885 + start_pfn = start >> PAGE_SHIFT;
19886 + end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
19887 + << (PMD_SHIFT - PAGE_SHIFT);
19888 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
19889 +
19890 + /* big page (2M) range*/
19891 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
19892 + << (PMD_SHIFT - PAGE_SHIFT);
19893 + end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
19894 + << (PUD_SHIFT - PAGE_SHIFT);
19895 + if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
19896 + end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
19897 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
19898 + page_size_mask & (1<<PG_LEVEL_2M));
19899 +
19900 + /* big page (1G) range */
19901 + start_pfn = end_pfn;
19902 + end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
19903 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
19904 + page_size_mask &
19905 + ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
19906 +
19907 + /* tail is not big page (1G) alignment */
19908 + start_pfn = end_pfn;
19909 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
19910 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
19911 + page_size_mask & (1<<PG_LEVEL_2M));
19912 +
19913 + /* tail is not big page (2M) alignment */
19914 + start_pfn = end_pfn;
19915 + end_pfn = end>>PAGE_SHIFT;
19916 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
19917 +
19918 + /* try to merge same page size and continuous */
19919 + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
19920 + unsigned long old_start;
19921 + if (mr[i].end != mr[i+1].start ||
19922 + mr[i].page_size_mask != mr[i+1].page_size_mask)
19923 + continue;
19924 + /* move it */
19925 + old_start = mr[i].start;
19926 + memmove(&mr[i], &mr[i+1],
19927 + (nr_range - 1 - i) * sizeof (struct map_range));
19928 + mr[i--].start = old_start;
19929 + nr_range--;
19930 + }
19931
19932 - for (; start < end; start = next) {
19933 - pgd_t *pgd = pgd_offset_k(start);
19934 - unsigned long pud_phys;
19935 - pud_t *pud;
19936 + for (i = 0; i < nr_range; i++)
19937 + printk(KERN_DEBUG " %010lx - %010lx page %s\n",
19938 + mr[i].start, mr[i].end,
19939 + (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
19940 + (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
19941
19942 - if (after_bootmem)
19943 - pud = pud_offset(pgd, start & PGDIR_MASK);
19944 - else
19945 - pud = alloc_static_page(&pud_phys);
19946 - next = start + PGDIR_SIZE;
19947 - if (next > end)
19948 - next = end;
19949 - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
19950 - if (!after_bootmem) {
19951 - early_make_page_readonly(pud, XENFEAT_writable_page_tables);
19952 - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
19953 - }
19954 - }
19955 + if (first)
19956 + find_early_table_space(end);
19957
19958 - if (!after_bootmem) {
19959 - BUG_ON(start_pfn != table_end);
19960 - xen_finish_init_mapping();
19961 + for (i = 0; i < nr_range; i++)
19962 + last_map_addr = kernel_physical_mapping_init(
19963 + mr[i].start, mr[i].end,
19964 + mr[i].page_size_mask);
19965 +
19966 + BUG_ON(table_cur > table_end);
19967 + if (start < (table_start << PAGE_SHIFT)) {
19968 + WARN_ON(table_cur != table_end);
19969 + xen_finish_init_mapping(!first);
19970 }
19971
19972 __flush_tlb_all();
19973
19974 - if (!after_bootmem)
19975 + if (first && table_end > table_start)
19976 reserve_early(table_start << PAGE_SHIFT,
19977 table_end << PAGE_SHIFT, "PGTABLE");
19978
19979 + printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
19980 + last_map_addr, end);
19981 +
19982 if (!after_bootmem)
19983 - early_memtest(start_phys, end_phys);
19984 + early_memtest(start, end);
19985
19986 - return last_map_addr;
19987 + return last_map_addr >> PAGE_SHIFT;
19988 }
19989
19990 #ifndef CONFIG_NUMA
19991 +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
19992 +{
19993 + unsigned long bootmap_size, bootmap;
19994 +
19995 + e820_register_active_regions(0, start_pfn, end_pfn);
19996 +#ifdef CONFIG_XEN
19997 + if (end_pfn > xen_start_info->nr_pages)
19998 + end_pfn = xen_start_info->nr_pages;
19999 +#endif
20000 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
20001 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
20002 + PAGE_SIZE);
20003 + if (bootmap == -1L)
20004 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
20005 + /* don't touch min_low_pfn */
20006 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
20007 + 0, end_pfn);
20008 + free_bootmem_with_active_regions(0, end_pfn);
20009 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
20010 + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
20011 +}
20012 +
20013 void __init paging_init(void)
20014 {
20015 unsigned long max_zone_pfns[MAX_NR_ZONES];
20016 @@ -983,9 +1007,9 @@ void __init paging_init(void)
20017 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
20018 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
20019 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
20020 - max_zone_pfns[ZONE_NORMAL] = end_pfn;
20021 + max_zone_pfns[ZONE_NORMAL] = max_pfn;
20022
20023 - memory_present(0, 0, end_pfn);
20024 + memory_present(0, 0, max_pfn);
20025 sparse_init();
20026 free_area_init_nodes(max_zone_pfns);
20027
20028 @@ -1076,8 +1100,8 @@ void __init mem_init(void)
20029 init_page_count(pfn_to_page(pfn));
20030 totalram_pages++;
20031 }
20032 - reservedpages = end_pfn - totalram_pages -
20033 - absent_pages_in_range(0, end_pfn);
20034 + reservedpages = max_pfn - totalram_pages -
20035 + absent_pages_in_range(0, max_pfn);
20036 after_bootmem = 1;
20037
20038 codesize = (unsigned long) &_etext - (unsigned long) &_text;
20039 @@ -1096,7 +1120,7 @@ void __init mem_init(void)
20040 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
20041 "%ldk reserved, %ldk data, %ldk init)\n",
20042 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
20043 - end_pfn << (PAGE_SHIFT-10),
20044 + max_pfn << (PAGE_SHIFT-10),
20045 codesize >> 10,
20046 reservedpages << (PAGE_SHIFT-10),
20047 datasize >> 10,
20048 @@ -1159,6 +1183,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
20049 void mark_rodata_ro(void)
20050 {
20051 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
20052 + unsigned long rodata_start =
20053 + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20054 +
20055 +#ifdef CONFIG_DYNAMIC_FTRACE
20056 + /* Dynamic tracing modifies the kernel text section */
20057 + start = rodata_start;
20058 +#endif
20059
20060 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
20061 (end - start) >> 10);
20062 @@ -1168,8 +1199,7 @@ void mark_rodata_ro(void)
20063 * The rodata section (but not the kernel text!) should also be
20064 * not-executable.
20065 */
20066 - start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20067 - set_memory_nx(start, (end - start) >> PAGE_SHIFT);
20068 + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
20069
20070 rodata_test();
20071
20072 @@ -1191,24 +1221,26 @@ void free_initrd_mem(unsigned long start
20073 }
20074 #endif
20075
20076 -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
20077 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
20078 + int flags)
20079 {
20080 #ifdef CONFIG_NUMA
20081 int nid, next_nid;
20082 + int ret;
20083 #endif
20084 unsigned long pfn = phys >> PAGE_SHIFT;
20085
20086 - if (pfn >= end_pfn) {
20087 + if (pfn >= max_pfn) {
20088 /*
20089 * This can happen with kdump kernels when accessing
20090 * firmware tables:
20091 */
20092 if (pfn < max_pfn_mapped)
20093 - return;
20094 + return -EFAULT;
20095
20096 - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
20097 + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
20098 phys, len);
20099 - return;
20100 + return -EFAULT;
20101 }
20102
20103 /* Should check here against the e820 map to avoid double free */
20104 @@ -1216,9 +1248,13 @@ void __init reserve_bootmem_generic(unsi
20105 nid = phys_to_nid(phys);
20106 next_nid = phys_to_nid(phys + len - 1);
20107 if (nid == next_nid)
20108 - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
20109 + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
20110 else
20111 - reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20112 + ret = reserve_bootmem(phys, len, flags);
20113 +
20114 + if (ret != 0)
20115 + return ret;
20116 +
20117 #else
20118 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20119 #endif
20120 @@ -1231,6 +1267,8 @@ void __init reserve_bootmem_generic(unsi
20121 set_dma_reserve(dma_reserve);
20122 }
20123 #endif
20124 +
20125 + return 0;
20126 }
20127
20128 int kern_addr_valid(unsigned long addr)
20129 @@ -1335,7 +1373,7 @@ vmemmap_populate(struct page *start_page
20130 pmd_t *pmd;
20131
20132 for (; addr < end; addr = next) {
20133 - next = pmd_addr_end(addr, end);
20134 + void *p = NULL;
20135
20136 pgd = vmemmap_pgd_populate(addr, node);
20137 if (!pgd)
20138 @@ -1345,33 +1383,51 @@ vmemmap_populate(struct page *start_page
20139 if (!pud)
20140 return -ENOMEM;
20141
20142 - pmd = pmd_offset(pud, addr);
20143 - if (pmd_none(*pmd)) {
20144 - pte_t entry;
20145 - void *p;
20146 + if (!cpu_has_pse) {
20147 + next = (addr + PAGE_SIZE) & PAGE_MASK;
20148 + pmd = vmemmap_pmd_populate(pud, addr, node);
20149 +
20150 + if (!pmd)
20151 + return -ENOMEM;
20152 +
20153 + p = vmemmap_pte_populate(pmd, addr, node);
20154
20155 - p = vmemmap_alloc_block(PMD_SIZE, node);
20156 if (!p)
20157 return -ENOMEM;
20158
20159 - entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20160 - PAGE_KERNEL_LARGE);
20161 - set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20162 -
20163 - /* check to see if we have contiguous blocks */
20164 - if (p_end != p || node_start != node) {
20165 - if (p_start)
20166 - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20167 - addr_start, addr_end-1, p_start, p_end-1, node_start);
20168 - addr_start = addr;
20169 - node_start = node;
20170 - p_start = p;
20171 - }
20172 - addr_end = addr + PMD_SIZE;
20173 - p_end = p + PMD_SIZE;
20174 + addr_end = addr + PAGE_SIZE;
20175 + p_end = p + PAGE_SIZE;
20176 } else {
20177 - vmemmap_verify((pte_t *)pmd, node, addr, next);
20178 + next = pmd_addr_end(addr, end);
20179 +
20180 + pmd = pmd_offset(pud, addr);
20181 + if (pmd_none(*pmd)) {
20182 + pte_t entry;
20183 +
20184 + p = vmemmap_alloc_block(PMD_SIZE, node);
20185 + if (!p)
20186 + return -ENOMEM;
20187 +
20188 + entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20189 + PAGE_KERNEL_LARGE);
20190 + set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20191 +
20192 + /* check to see if we have contiguous blocks */
20193 + if (p_end != p || node_start != node) {
20194 + if (p_start)
20195 + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20196 + addr_start, addr_end-1, p_start, p_end-1, node_start);
20197 + addr_start = addr;
20198 + node_start = node;
20199 + p_start = p;
20200 + }
20201 +
20202 + addr_end = addr + PMD_SIZE;
20203 + p_end = p + PMD_SIZE;
20204 + } else
20205 + vmemmap_verify((pte_t *)pmd, node, addr, next);
20206 }
20207 +
20208 }
20209 return 0;
20210 }
20211 Index: head-2008-12-01/arch/x86/mm/ioremap-xen.c
20212 ===================================================================
20213 --- head-2008-12-01.orig/arch/x86/mm/ioremap-xen.c 2008-12-01 11:44:55.000000000 +0100
20214 +++ head-2008-12-01/arch/x86/mm/ioremap-xen.c 2008-12-01 11:49:07.000000000 +0100
20215 @@ -13,6 +13,7 @@
20216 #include <linux/pfn.h>
20217 #include <linux/slab.h>
20218 #include <linux/vmalloc.h>
20219 +#include <linux/mmiotrace.h>
20220
20221 #include <asm/cacheflush.h>
20222 #include <asm/e820.h>
20223 @@ -274,7 +275,8 @@ int ioremap_check_change_attr(unsigned l
20224 for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
20225 unsigned long pfn = mfn_to_local_pfn(mfn);
20226
20227 - if (pfn >= max_pfn_mapped)
20228 + if (pfn >= max_low_pfn_mapped &&
20229 + (pfn < (1UL<<(32 - PAGE_SHIFT)) || pfn >= max_pfn_mapped))
20230 continue;
20231 rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
20232 PAGE_SIZE, prot_val);
20233 @@ -297,11 +299,14 @@ static void __iomem *__ioremap_caller(re
20234 {
20235 unsigned long mfn, offset, vaddr;
20236 resource_size_t last_addr;
20237 + const resource_size_t unaligned_phys_addr = phys_addr;
20238 + const unsigned long unaligned_size = size;
20239 struct vm_struct *area;
20240 unsigned long new_prot_val;
20241 pgprot_t prot;
20242 int retval;
20243 domid_t domid = DOMID_IO;
20244 + void __iomem *ret_addr;
20245
20246 /* Don't allow wraparound or zero size */
20247 last_addr = phys_addr + size - 1;
20248 @@ -318,7 +323,7 @@ static void __iomem *__ioremap_caller(re
20249 /*
20250 * Don't remap the low PCI/ISA area, it's always mapped..
20251 */
20252 - if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
20253 + if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
20254 return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
20255
20256 /*
20257 @@ -342,7 +347,7 @@ static void __iomem *__ioremap_caller(re
20258 phys_addr &= PAGE_MASK;
20259 size = PAGE_ALIGN(last_addr+1) - phys_addr;
20260
20261 - retval = reserve_memtype(phys_addr, phys_addr + size,
20262 + retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
20263 prot_val, &new_prot_val);
20264 if (retval) {
20265 pr_debug("Warning: reserve_memtype returned %d\n", retval);
20266 @@ -410,7 +415,10 @@ static void __iomem *__ioremap_caller(re
20267 return NULL;
20268 }
20269
20270 - return (void __iomem *) (vaddr + offset);
20271 + ret_addr = (void __iomem *) (vaddr + offset);
20272 + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
20273 +
20274 + return ret_addr;
20275 }
20276
20277 /**
20278 @@ -438,7 +446,7 @@ void __iomem *ioremap_nocache(resource_s
20279 {
20280 /*
20281 * Ideally, this should be:
20282 - * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20283 + * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20284 *
20285 * Till we fix all X drivers to use ioremap_wc(), we will use
20286 * UC MINUS.
20287 @@ -462,7 +470,7 @@ EXPORT_SYMBOL(ioremap_nocache);
20288 */
20289 void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
20290 {
20291 - if (pat_wc_enabled)
20292 + if (pat_enabled)
20293 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
20294 __builtin_return_address(0));
20295 else
20296 @@ -502,6 +510,14 @@ static void __iomem *ioremap_default(res
20297 }
20298 #endif
20299
20300 +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
20301 + unsigned long prot_val)
20302 +{
20303 + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
20304 + __builtin_return_address(0));
20305 +}
20306 +EXPORT_SYMBOL(ioremap_prot);
20307 +
20308 /**
20309 * iounmap - Free a IO remapping
20310 * @addr: virtual address from ioremap_*
20311 @@ -526,6 +542,8 @@ void iounmap(volatile void __iomem *addr
20312 addr = (volatile void __iomem *)
20313 (PAGE_MASK & (unsigned long __force)addr);
20314
20315 + mmiotrace_iounmap(addr);
20316 +
20317 /* Use the vm area unlocked, assuming the caller
20318 ensures there isn't another iounmap for the same address
20319 in parallel. Reuse of the virtual address is prevented by
20320 @@ -533,7 +551,7 @@ void iounmap(volatile void __iomem *addr
20321 cpa takes care of the direct mappings. */
20322 read_lock(&vmlist_lock);
20323 for (p = vmlist; p; p = p->next) {
20324 - if (p->addr == addr)
20325 + if (p->addr == (void __force *)addr)
20326 break;
20327 }
20328 read_unlock(&vmlist_lock);
20329 @@ -547,7 +565,7 @@ void iounmap(volatile void __iomem *addr
20330 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
20331
20332 /* Finally remove it */
20333 - o = remove_vm_area((void *)addr);
20334 + o = remove_vm_area((void __force *)addr);
20335 BUG_ON(p != o || o == NULL);
20336 kfree(p);
20337 }
20338 @@ -567,7 +585,7 @@ void *xlate_dev_mem_ptr(unsigned long ph
20339 if (page_is_ram(start >> PAGE_SHIFT))
20340 return __va(phys);
20341
20342 - addr = (void *)ioremap_default(start, PAGE_SIZE);
20343 + addr = (void __force *)ioremap_default(start, PAGE_SIZE);
20344 if (addr)
20345 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
20346
20347 @@ -595,8 +613,7 @@ static int __init early_ioremap_debug_se
20348 early_param("early_ioremap_debug", early_ioremap_debug_setup);
20349
20350 static __initdata int after_paging_init;
20351 -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
20352 - __section(.bss.page_aligned);
20353 +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
20354
20355 #ifdef CONFIG_X86_32
20356 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
20357 @@ -693,10 +710,11 @@ static void __init __early_set_fixmap(en
20358 return;
20359 }
20360 pte = early_ioremap_pte(addr);
20361 +
20362 if (pgprot_val(flags))
20363 set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
20364 else
20365 - pte_clear(NULL, addr, pte);
20366 + pte_clear(&init_mm, addr, pte);
20367 __flush_tlb_one(addr);
20368 }
20369
20370 @@ -724,13 +742,11 @@ static int __init check_early_ioremap_le
20371 {
20372 if (!early_ioremap_nested)
20373 return 0;
20374 -
20375 - printk(KERN_WARNING
20376 + WARN(1, KERN_WARNING
20377 "Debug warning: early ioremap leak of %d areas detected.\n",
20378 - early_ioremap_nested);
20379 + early_ioremap_nested);
20380 printk(KERN_WARNING
20381 - "please boot with early_ioremap_debug and report the dmesg.\n");
20382 - WARN_ON(1);
20383 + "please boot with early_ioremap_debug and report the dmesg.\n");
20384
20385 return 1;
20386 }
20387 Index: head-2008-12-01/arch/x86/mm/pageattr-xen.c
20388 ===================================================================
20389 --- head-2008-12-01.orig/arch/x86/mm/pageattr-xen.c 2008-12-01 12:19:27.000000000 +0100
20390 +++ head-2008-12-01/arch/x86/mm/pageattr-xen.c 2008-12-01 11:49:07.000000000 +0100
20391 @@ -34,6 +34,47 @@ struct cpa_data {
20392 unsigned force_split : 1;
20393 };
20394
20395 +#ifdef CONFIG_PROC_FS
20396 +static unsigned long direct_pages_count[PG_LEVEL_NUM];
20397 +
20398 +void update_page_count(int level, unsigned long pages)
20399 +{
20400 + unsigned long flags;
20401 +
20402 + /* Protect against CPA */
20403 + spin_lock_irqsave(&pgd_lock, flags);
20404 + direct_pages_count[level] += pages;
20405 + spin_unlock_irqrestore(&pgd_lock, flags);
20406 +}
20407 +
20408 +static void split_page_count(int level)
20409 +{
20410 + direct_pages_count[level]--;
20411 + direct_pages_count[level - 1] += PTRS_PER_PTE;
20412 +}
20413 +
20414 +int arch_report_meminfo(char *page)
20415 +{
20416 + int n = sprintf(page, "DirectMap4k: %8lu kB\n",
20417 + direct_pages_count[PG_LEVEL_4K] << 2);
20418 +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
20419 + n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
20420 + direct_pages_count[PG_LEVEL_2M] << 11);
20421 +#else
20422 + n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
20423 + direct_pages_count[PG_LEVEL_2M] << 12);
20424 +#endif
20425 +#ifdef CONFIG_X86_64
20426 + if (direct_gbpages)
20427 + n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
20428 + direct_pages_count[PG_LEVEL_1G] << 20);
20429 +#endif
20430 + return n;
20431 +}
20432 +#else
20433 +static inline void split_page_count(int level) { }
20434 +#endif
20435 +
20436 #ifdef CONFIG_X86_64
20437
20438 static inline unsigned long highmap_start_pfn(void)
20439 @@ -106,7 +147,7 @@ static void cpa_flush_all(unsigned long
20440 {
20441 BUG_ON(irqs_disabled());
20442
20443 - on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
20444 + on_each_cpu(__cpa_flush_all, (void *) cache, 1);
20445 }
20446
20447 static void __cpa_flush_range(void *arg)
20448 @@ -127,7 +168,7 @@ static void cpa_flush_range(unsigned lon
20449 BUG_ON(irqs_disabled());
20450 WARN_ON(PAGE_ALIGN(start) != start);
20451
20452 - on_each_cpu(__cpa_flush_range, NULL, 1, 1);
20453 + on_each_cpu(__cpa_flush_range, NULL, 1);
20454
20455 if (!cache)
20456 return;
20457 @@ -229,6 +270,7 @@ pte_t *lookup_address(unsigned long addr
20458
20459 return pte_offset_kernel(pmd, address);
20460 }
20461 +EXPORT_SYMBOL_GPL(lookup_address);
20462
20463 /*
20464 * Set the new pmd in all the pgds we know about:
20465 @@ -509,6 +551,16 @@ static int split_large_page(pte_t *kpte,
20466 }
20467 #endif
20468
20469 + if (address >= (unsigned long)__va(0) &&
20470 + address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
20471 + split_page_count(level);
20472 +
20473 +#ifdef CONFIG_X86_64
20474 + if (address >= (unsigned long)__va(1UL<<32) &&
20475 + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
20476 + split_page_count(level);
20477 +#endif
20478 +
20479 /*
20480 * Get the target mfn from the original entry:
20481 */
20482 @@ -565,10 +617,9 @@ repeat:
20483 if (!__pte_val(old_pte)) {
20484 if (!primary)
20485 return 0;
20486 - printk(KERN_WARNING "CPA: called for zero pte. "
20487 + WARN(1, KERN_WARNING "CPA: called for zero pte. "
20488 "vaddr = %lx cpa->vaddr = %lx\n", address,
20489 cpa->vaddr);
20490 - WARN_ON(1);
20491 return -EINVAL;
20492 }
20493
20494 @@ -633,15 +684,24 @@ static int cpa_process_alias(struct cpa_
20495 struct cpa_data alias_cpa;
20496 int ret = 0;
20497
20498 - if (cpa->pfn > max_pfn_mapped)
20499 + if (cpa->pfn >= max_pfn_mapped)
20500 return 0;
20501
20502 +#ifdef CONFIG_X86_64
20503 + if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
20504 + return 0;
20505 +#endif
20506 /*
20507 * No need to redo, when the primary call touched the direct
20508 * mapping already:
20509 */
20510 - if (!within(cpa->vaddr, PAGE_OFFSET,
20511 - PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
20512 + if (!(within(cpa->vaddr, PAGE_OFFSET,
20513 + PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
20514 +#ifdef CONFIG_X86_64
20515 + || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
20516 + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
20517 +#endif
20518 + )) {
20519
20520 alias_cpa = *cpa;
20521 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
20522 @@ -809,7 +869,7 @@ int set_memory_uc(unsigned long addr, in
20523 /*
20524 * for now UC MINUS. see comments in ioremap_nocache()
20525 */
20526 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20527 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20528 _PAGE_CACHE_UC_MINUS, NULL))
20529 return -EINVAL;
20530
20531 @@ -825,10 +885,10 @@ int _set_memory_wc(unsigned long addr, i
20532
20533 int set_memory_wc(unsigned long addr, int numpages)
20534 {
20535 - if (!pat_wc_enabled)
20536 + if (!pat_enabled)
20537 return set_memory_uc(addr, numpages);
20538
20539 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20540 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20541 _PAGE_CACHE_WC, NULL))
20542 return -EINVAL;
20543
20544 @@ -844,7 +904,7 @@ int _set_memory_wb(unsigned long addr, i
20545
20546 int set_memory_wb(unsigned long addr, int numpages)
20547 {
20548 - free_memtype(addr, addr + numpages * PAGE_SIZE);
20549 + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
20550
20551 return _set_memory_wb(addr, numpages);
20552 }
20553 Index: head-2008-12-01/arch/x86/mm/pat-xen.c
20554 ===================================================================
20555 --- head-2008-12-01.orig/arch/x86/mm/pat-xen.c 2008-12-01 11:44:55.000000000 +0100
20556 +++ head-2008-12-01/arch/x86/mm/pat-xen.c 2008-12-01 11:49:07.000000000 +0100
20557 @@ -12,6 +12,8 @@
20558 #include <linux/gfp.h>
20559 #include <linux/fs.h>
20560 #include <linux/bootmem.h>
20561 +#include <linux/debugfs.h>
20562 +#include <linux/seq_file.h>
20563
20564 #include <asm/msr.h>
20565 #include <asm/tlbflush.h>
20566 @@ -26,11 +28,11 @@
20567 #include <asm/io.h>
20568
20569 #ifdef CONFIG_X86_PAT
20570 -int __read_mostly pat_wc_enabled = 1;
20571 +int __read_mostly pat_enabled = 1;
20572
20573 void __cpuinit pat_disable(char *reason)
20574 {
20575 - pat_wc_enabled = 0;
20576 + pat_enabled = 0;
20577 printk(KERN_INFO "%s\n", reason);
20578 }
20579
20580 @@ -42,6 +44,19 @@ static int __init nopat(char *str)
20581 early_param("nopat", nopat);
20582 #endif
20583
20584 +
20585 +static int debug_enable;
20586 +static int __init pat_debug_setup(char *str)
20587 +{
20588 + debug_enable = 1;
20589 + return 0;
20590 +}
20591 +__setup("debugpat", pat_debug_setup);
20592 +
20593 +#define dprintk(fmt, arg...) \
20594 + do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
20595 +
20596 +
20597 static u64 __read_mostly boot_pat_state;
20598
20599 enum {
20600 @@ -53,24 +68,25 @@ enum {
20601 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
20602 };
20603
20604 -#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
20605 +#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
20606
20607 void pat_init(void)
20608 {
20609 u64 pat;
20610
20611 - if (!pat_wc_enabled)
20612 + if (!pat_enabled)
20613 return;
20614
20615 /* Paranoia check. */
20616 - if (!cpu_has_pat) {
20617 - printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
20618 + if (!cpu_has_pat && boot_pat_state) {
20619 /*
20620 - * Panic if this happens on the secondary CPU, and we
20621 + * If this happens we are on a secondary CPU, but
20622 * switched to PAT on the boot CPU. We have no way to
20623 * undo PAT.
20624 - */
20625 - BUG_ON(boot_pat_state);
20626 + */
20627 + printk(KERN_ERR "PAT enabled, "
20628 + "but not supported by secondary CPU\n");
20629 + BUG();
20630 }
20631
20632 #ifndef CONFIG_XEN
20633 @@ -87,8 +103,8 @@ void pat_init(void)
20634 * 011 UC _PAGE_CACHE_UC
20635 * PAT bit unused
20636 */
20637 - pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
20638 - PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
20639 + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
20640 + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
20641
20642 /* Boot CPU check */
20643 if (!boot_pat_state)
20644 @@ -113,13 +129,13 @@ void pat_init(void)
20645 static char *cattr_name(unsigned long flags)
20646 {
20647 switch (flags & _PAGE_CACHE_MASK) {
20648 - case _PAGE_CACHE_UC: return "uncached";
20649 - case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20650 - case _PAGE_CACHE_WB: return "write-back";
20651 - case _PAGE_CACHE_WC: return "write-combining";
20652 - case _PAGE_CACHE_WP: return "write-protected";
20653 - case _PAGE_CACHE_WT: return "write-through";
20654 - default: return "broken";
20655 + case _PAGE_CACHE_UC: return "uncached";
20656 + case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20657 + case _PAGE_CACHE_WB: return "write-back";
20658 + case _PAGE_CACHE_WC: return "write-combining";
20659 + case _PAGE_CACHE_WP: return "write-protected";
20660 + case _PAGE_CACHE_WT: return "write-through";
20661 + default: return "broken";
20662 }
20663 }
20664
20665 @@ -157,49 +173,55 @@ static DEFINE_SPINLOCK(memtype_lock); /
20666 * The intersection is based on "Effective Memory Type" tables in IA-32
20667 * SDM vol 3a
20668 */
20669 -static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
20670 - unsigned long *ret_prot)
20671 +static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
20672 {
20673 - unsigned long pat_type;
20674 - u8 mtrr_type;
20675 -
20676 - pat_type = prot & _PAGE_CACHE_MASK;
20677 - prot &= (~_PAGE_CACHE_MASK);
20678 -
20679 - /*
20680 - * We return the PAT request directly for types where PAT takes
20681 - * precedence with respect to MTRR and for UC_MINUS.
20682 - * Consistency checks with other PAT requests is done later
20683 - * while going through memtype list.
20684 - */
20685 - if (pat_type == _PAGE_CACHE_WC) {
20686 - *ret_prot = prot | _PAGE_CACHE_WC;
20687 - return 0;
20688 - } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
20689 - *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
20690 - return 0;
20691 - } else if (pat_type == _PAGE_CACHE_UC) {
20692 - *ret_prot = prot | _PAGE_CACHE_UC;
20693 - return 0;
20694 - }
20695 -
20696 /*
20697 * Look for MTRR hint to get the effective type in case where PAT
20698 * request is for WB.
20699 */
20700 - mtrr_type = mtrr_type_lookup(start, end);
20701 + if (req_type == _PAGE_CACHE_WB) {
20702 + u8 mtrr_type;
20703
20704 - if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
20705 - *ret_prot = prot | _PAGE_CACHE_UC;
20706 - } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
20707 - *ret_prot = prot | _PAGE_CACHE_WC;
20708 - } else {
20709 - *ret_prot = prot | _PAGE_CACHE_WB;
20710 + mtrr_type = mtrr_type_lookup(start, end);
20711 + if (mtrr_type == MTRR_TYPE_UNCACHABLE)
20712 + return _PAGE_CACHE_UC;
20713 + if (mtrr_type == MTRR_TYPE_WRCOMB)
20714 + return _PAGE_CACHE_WC;
20715 + }
20716 +
20717 + return req_type;
20718 +}
20719 +
20720 +static int chk_conflict(struct memtype *new, struct memtype *entry,
20721 + unsigned long *type)
20722 +{
20723 + if (new->type != entry->type) {
20724 + if (type) {
20725 + new->type = entry->type;
20726 + *type = entry->type;
20727 + } else
20728 + goto conflict;
20729 }
20730
20731 + /* check overlaps with more than one entry in the list */
20732 + list_for_each_entry_continue(entry, &memtype_list, nd) {
20733 + if (new->end <= entry->start)
20734 + break;
20735 + else if (new->type != entry->type)
20736 + goto conflict;
20737 + }
20738 return 0;
20739 +
20740 + conflict:
20741 + printk(KERN_INFO "%s:%d conflicting memory types "
20742 + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
20743 + new->end, cattr_name(new->type), cattr_name(entry->type));
20744 + return -EBUSY;
20745 }
20746
20747 +static struct memtype *cached_entry;
20748 +static u64 cached_start;
20749 +
20750 /*
20751 * req_type typically has one of the:
20752 * - _PAGE_CACHE_WB
20753 @@ -210,37 +232,36 @@ static int pat_x_mtrr_type(u64 start, u6
20754 * req_type will have a special case value '-1', when requester want to inherit
20755 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
20756 *
20757 - * If ret_type is NULL, function will return an error if it cannot reserve the
20758 - * region with req_type. If ret_type is non-null, function will return
20759 - * available type in ret_type in case of no error. In case of any error
20760 + * If new_type is NULL, function will return an error if it cannot reserve the
20761 + * region with req_type. If new_type is non-NULL, function will return
20762 + * available type in new_type in case of no error. In case of any error
20763 * it will return a negative return value.
20764 */
20765 int reserve_memtype(u64 start, u64 end, unsigned long req_type,
20766 - unsigned long *ret_type)
20767 + unsigned long *new_type)
20768 {
20769 - struct memtype *new_entry = NULL;
20770 - struct memtype *parse;
20771 + struct memtype *new, *entry;
20772 unsigned long actual_type;
20773 + struct list_head *where;
20774 int err = 0;
20775
20776 - /* Only track when pat_wc_enabled */
20777 - if (!pat_wc_enabled) {
20778 + BUG_ON(start >= end); /* end is exclusive */
20779 +
20780 + if (!pat_enabled) {
20781 /* This is identical to page table setting without PAT */
20782 - if (ret_type) {
20783 - if (req_type == -1) {
20784 - *ret_type = _PAGE_CACHE_WB;
20785 - } else {
20786 - *ret_type = req_type;
20787 - }
20788 + if (new_type) {
20789 + if (req_type == -1)
20790 + *new_type = _PAGE_CACHE_WB;
20791 + else
20792 + *new_type = req_type & _PAGE_CACHE_MASK;
20793 }
20794 return 0;
20795 }
20796
20797 /* Low ISA region is always mapped WB in page table. No need to track */
20798 - if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
20799 - if (ret_type)
20800 - *ret_type = _PAGE_CACHE_WB;
20801 -
20802 + if (is_ISA_range(start, end - 1)) {
20803 + if (new_type)
20804 + *new_type = _PAGE_CACHE_WB;
20805 return 0;
20806 }
20807
20808 @@ -253,206 +274,118 @@ int reserve_memtype(u64 start, u64 end,
20809 */
20810 u8 mtrr_type = mtrr_type_lookup(start, end);
20811
20812 - if (mtrr_type == MTRR_TYPE_WRBACK) {
20813 - req_type = _PAGE_CACHE_WB;
20814 + if (mtrr_type == MTRR_TYPE_WRBACK)
20815 actual_type = _PAGE_CACHE_WB;
20816 - } else {
20817 - req_type = _PAGE_CACHE_UC_MINUS;
20818 + else
20819 actual_type = _PAGE_CACHE_UC_MINUS;
20820 - }
20821 - } else {
20822 - req_type &= _PAGE_CACHE_MASK;
20823 - err = pat_x_mtrr_type(start, end, req_type, &actual_type);
20824 - }
20825 -
20826 - if (err) {
20827 - if (ret_type)
20828 - *ret_type = actual_type;
20829 + } else
20830 + actual_type = pat_x_mtrr_type(start, end,
20831 + req_type & _PAGE_CACHE_MASK);
20832
20833 - return -EINVAL;
20834 - }
20835 -
20836 - new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
20837 - if (!new_entry)
20838 + new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
20839 + if (!new)
20840 return -ENOMEM;
20841
20842 - new_entry->start = start;
20843 - new_entry->end = end;
20844 - new_entry->type = actual_type;
20845 + new->start = start;
20846 + new->end = end;
20847 + new->type = actual_type;
20848
20849 - if (ret_type)
20850 - *ret_type = actual_type;
20851 + if (new_type)
20852 + *new_type = actual_type;
20853
20854 spin_lock(&memtype_lock);
20855
20856 - /* Search for existing mapping that overlaps the current range */
20857 - list_for_each_entry(parse, &memtype_list, nd) {
20858 - struct memtype *saved_ptr;
20859 + if (cached_entry && start >= cached_start)
20860 + entry = cached_entry;
20861 + else
20862 + entry = list_entry(&memtype_list, struct memtype, nd);
20863
20864 - if (parse->start >= end) {
20865 - pr_debug("New Entry\n");
20866 - list_add(&new_entry->nd, parse->nd.prev);
20867 - new_entry = NULL;
20868 + /* Search for existing mapping that overlaps the current range */
20869 + where = NULL;
20870 + list_for_each_entry_continue(entry, &memtype_list, nd) {
20871 + if (end <= entry->start) {
20872 + where = entry->nd.prev;
20873 + cached_entry = list_entry(where, struct memtype, nd);
20874 break;
20875 - }
20876 -
20877 - if (start <= parse->start && end >= parse->start) {
20878 - if (actual_type != parse->type && ret_type) {
20879 - actual_type = parse->type;
20880 - *ret_type = actual_type;
20881 - new_entry->type = actual_type;
20882 - }
20883 -
20884 - if (actual_type != parse->type) {
20885 - printk(
20886 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20887 - current->comm, current->pid,
20888 - start, end,
20889 - cattr_name(actual_type),
20890 - cattr_name(parse->type));
20891 - err = -EBUSY;
20892 - break;
20893 - }
20894 -
20895 - saved_ptr = parse;
20896 - /*
20897 - * Check to see whether the request overlaps more
20898 - * than one entry in the list
20899 - */
20900 - list_for_each_entry_continue(parse, &memtype_list, nd) {
20901 - if (end <= parse->start) {
20902 - break;
20903 - }
20904 -
20905 - if (actual_type != parse->type) {
20906 - printk(
20907 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20908 - current->comm, current->pid,
20909 - start, end,
20910 - cattr_name(actual_type),
20911 - cattr_name(parse->type));
20912 - err = -EBUSY;
20913 - break;
20914 - }
20915 - }
20916 -
20917 - if (err) {
20918 - break;
20919 + } else if (start <= entry->start) { /* end > entry->start */
20920 + err = chk_conflict(new, entry, new_type);
20921 + if (!err) {
20922 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
20923 + entry->start, entry->end);
20924 + where = entry->nd.prev;
20925 + cached_entry = list_entry(where,
20926 + struct memtype, nd);
20927 }
20928 -
20929 - pr_debug("Overlap at 0x%Lx-0x%Lx\n",
20930 - saved_ptr->start, saved_ptr->end);
20931 - /* No conflict. Go ahead and add this new entry */
20932 - list_add(&new_entry->nd, saved_ptr->nd.prev);
20933 - new_entry = NULL;
20934 break;
20935 - }
20936 -
20937 - if (start < parse->end) {
20938 - if (actual_type != parse->type && ret_type) {
20939 - actual_type = parse->type;
20940 - *ret_type = actual_type;
20941 - new_entry->type = actual_type;
20942 - }
20943 -
20944 - if (actual_type != parse->type) {
20945 - printk(
20946 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20947 - current->comm, current->pid,
20948 - start, end,
20949 - cattr_name(actual_type),
20950 - cattr_name(parse->type));
20951 - err = -EBUSY;
20952 - break;
20953 - }
20954 -
20955 - saved_ptr = parse;
20956 - /*
20957 - * Check to see whether the request overlaps more
20958 - * than one entry in the list
20959 - */
20960 - list_for_each_entry_continue(parse, &memtype_list, nd) {
20961 - if (end <= parse->start) {
20962 - break;
20963 - }
20964 -
20965 - if (actual_type != parse->type) {
20966 - printk(
20967 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20968 - current->comm, current->pid,
20969 - start, end,
20970 - cattr_name(actual_type),
20971 - cattr_name(parse->type));
20972 - err = -EBUSY;
20973 - break;
20974 + } else if (start < entry->end) { /* start > entry->start */
20975 + err = chk_conflict(new, entry, new_type);
20976 + if (!err) {
20977 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
20978 + entry->start, entry->end);
20979 + cached_entry = list_entry(entry->nd.prev,
20980 + struct memtype, nd);
20981 +
20982 + /*
20983 + * Move to right position in the linked
20984 + * list to add this new entry
20985 + */
20986 + list_for_each_entry_continue(entry,
20987 + &memtype_list, nd) {
20988 + if (start <= entry->start) {
20989 + where = entry->nd.prev;
20990 + break;
20991 + }
20992 }
20993 }
20994 -
20995 - if (err) {
20996 - break;
20997 - }
20998 -
20999 - pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
21000 - saved_ptr->start, saved_ptr->end);
21001 - /* No conflict. Go ahead and add this new entry */
21002 - list_add(&new_entry->nd, &saved_ptr->nd);
21003 - new_entry = NULL;
21004 break;
21005 }
21006 }
21007
21008 if (err) {
21009 - printk(KERN_INFO
21010 - "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
21011 - start, end, cattr_name(new_entry->type),
21012 - cattr_name(req_type));
21013 - kfree(new_entry);
21014 + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
21015 + "track %s, req %s\n",
21016 + start, end, cattr_name(new->type), cattr_name(req_type));
21017 + kfree(new);
21018 spin_unlock(&memtype_lock);
21019 return err;
21020 }
21021
21022 - if (new_entry) {
21023 - /* No conflict. Not yet added to the list. Add to the tail */
21024 - list_add_tail(&new_entry->nd, &memtype_list);
21025 - pr_debug("New Entry\n");
21026 - }
21027 + cached_start = start;
21028
21029 - if (ret_type) {
21030 - pr_debug(
21031 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21032 - start, end, cattr_name(actual_type),
21033 - cattr_name(req_type), cattr_name(*ret_type));
21034 - } else {
21035 - pr_debug(
21036 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
21037 - start, end, cattr_name(actual_type),
21038 - cattr_name(req_type));
21039 - }
21040 + if (where)
21041 + list_add(&new->nd, where);
21042 + else
21043 + list_add_tail(&new->nd, &memtype_list);
21044
21045 spin_unlock(&memtype_lock);
21046 +
21047 + dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21048 + start, end, cattr_name(new->type), cattr_name(req_type),
21049 + new_type ? cattr_name(*new_type) : "-");
21050 +
21051 return err;
21052 }
21053
21054 int free_memtype(u64 start, u64 end)
21055 {
21056 - struct memtype *ml;
21057 + struct memtype *entry;
21058 int err = -EINVAL;
21059
21060 - /* Only track when pat_wc_enabled */
21061 - if (!pat_wc_enabled) {
21062 + if (!pat_enabled)
21063 return 0;
21064 - }
21065
21066 /* Low ISA region is always mapped WB. No need to track */
21067 - if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
21068 + if (is_ISA_range(start, end - 1))
21069 return 0;
21070 - }
21071
21072 spin_lock(&memtype_lock);
21073 - list_for_each_entry(ml, &memtype_list, nd) {
21074 - if (ml->start == start && ml->end == end) {
21075 - list_del(&ml->nd);
21076 - kfree(ml);
21077 + list_for_each_entry(entry, &memtype_list, nd) {
21078 + if (entry->start == start && entry->end == end) {
21079 + if (cached_entry == entry || cached_start == start)
21080 + cached_entry = NULL;
21081 +
21082 + list_del(&entry->nd);
21083 + kfree(entry);
21084 err = 0;
21085 break;
21086 }
21087 @@ -464,27 +397,19 @@ int free_memtype(u64 start, u64 end)
21088 current->comm, current->pid, start, end);
21089 }
21090
21091 - pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21092 + dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21093 return err;
21094 }
21095
21096
21097 -/*
21098 - * /dev/mem mmap interface. The memtype used for mapping varies:
21099 - * - Use UC for mappings with O_SYNC flag
21100 - * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
21101 - * inherit the memtype from existing mapping.
21102 - * - Else use UC_MINUS memtype (for backward compatibility with existing
21103 - * X drivers.
21104 - */
21105 pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
21106 unsigned long size, pgprot_t vma_prot)
21107 {
21108 return vma_prot;
21109 }
21110
21111 -#ifdef CONFIG_NONPROMISC_DEVMEM
21112 -/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
21113 +#ifdef CONFIG_STRICT_DEVMEM
21114 +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
21115 static inline int range_is_allowed(unsigned long mfn, unsigned long size)
21116 {
21117 return 1;
21118 @@ -508,20 +433,20 @@ static inline int range_is_allowed(unsig
21119 }
21120 return 1;
21121 }
21122 -#endif /* CONFIG_NONPROMISC_DEVMEM */
21123 +#endif /* CONFIG_STRICT_DEVMEM */
21124
21125 int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
21126 unsigned long size, pgprot_t *vma_prot)
21127 {
21128 u64 addr = (u64)mfn << PAGE_SHIFT;
21129 - unsigned long flags = _PAGE_CACHE_UC_MINUS;
21130 + unsigned long flags = -1;
21131 int retval;
21132
21133 if (!range_is_allowed(mfn, size))
21134 return 0;
21135
21136 if (file->f_flags & O_SYNC) {
21137 - flags = _PAGE_CACHE_UC;
21138 + flags = _PAGE_CACHE_UC_MINUS;
21139 }
21140
21141 #ifndef CONFIG_X86_32
21142 @@ -534,25 +459,26 @@ int phys_mem_access_prot_allowed(struct
21143 * caching for the high addresses through the KEN pin, but
21144 * we maintain the tradition of paranoia in this code.
21145 */
21146 - if (!pat_wc_enabled &&
21147 - ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
21148 - test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
21149 - test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
21150 - test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
21151 - (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21152 + if (!pat_enabled &&
21153 + !(boot_cpu_has(X86_FEATURE_MTRR) ||
21154 + boot_cpu_has(X86_FEATURE_K6_MTRR) ||
21155 + boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
21156 + boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
21157 + (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21158 flags = _PAGE_CACHE_UC;
21159 }
21160 #endif
21161 #endif
21162
21163 /*
21164 - * With O_SYNC, we can only take UC mapping. Fail if we cannot.
21165 + * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
21166 + *
21167 * Without O_SYNC, we want to get
21168 * - WB for WB-able memory and no other conflicting mappings
21169 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
21170 * - Inherit from confliting mappings otherwise
21171 */
21172 - if (flags != _PAGE_CACHE_UC_MINUS) {
21173 + if (flags != -1) {
21174 retval = reserve_memtype(addr, addr + size, flags, NULL);
21175 } else {
21176 retval = reserve_memtype(addr, addr + size, -1, &flags);
21177 @@ -600,3 +526,88 @@ void unmap_devmem(unsigned long mfn, uns
21178 free_memtype(addr, addr + size);
21179 }
21180
21181 +#if defined(CONFIG_DEBUG_FS)
21182 +
21183 +/* get Nth element of the linked list */
21184 +static struct memtype *memtype_get_idx(loff_t pos)
21185 +{
21186 + struct memtype *list_node, *print_entry;
21187 + int i = 1;
21188 +
21189 + print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21190 + if (!print_entry)
21191 + return NULL;
21192 +
21193 + spin_lock(&memtype_lock);
21194 + list_for_each_entry(list_node, &memtype_list, nd) {
21195 + if (pos == i) {
21196 + *print_entry = *list_node;
21197 + spin_unlock(&memtype_lock);
21198 + return print_entry;
21199 + }
21200 + ++i;
21201 + }
21202 + spin_unlock(&memtype_lock);
21203 + kfree(print_entry);
21204 + return NULL;
21205 +}
21206 +
21207 +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
21208 +{
21209 + if (*pos == 0) {
21210 + ++*pos;
21211 + seq_printf(seq, "PAT memtype list:\n");
21212 + }
21213 +
21214 + return memtype_get_idx(*pos);
21215 +}
21216 +
21217 +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
21218 +{
21219 + ++*pos;
21220 + return memtype_get_idx(*pos);
21221 +}
21222 +
21223 +static void memtype_seq_stop(struct seq_file *seq, void *v)
21224 +{
21225 +}
21226 +
21227 +static int memtype_seq_show(struct seq_file *seq, void *v)
21228 +{
21229 + struct memtype *print_entry = (struct memtype *)v;
21230 +
21231 + seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
21232 + print_entry->start, print_entry->end);
21233 + kfree(print_entry);
21234 + return 0;
21235 +}
21236 +
21237 +static struct seq_operations memtype_seq_ops = {
21238 + .start = memtype_seq_start,
21239 + .next = memtype_seq_next,
21240 + .stop = memtype_seq_stop,
21241 + .show = memtype_seq_show,
21242 +};
21243 +
21244 +static int memtype_seq_open(struct inode *inode, struct file *file)
21245 +{
21246 + return seq_open(file, &memtype_seq_ops);
21247 +}
21248 +
21249 +static const struct file_operations memtype_fops = {
21250 + .open = memtype_seq_open,
21251 + .read = seq_read,
21252 + .llseek = seq_lseek,
21253 + .release = seq_release,
21254 +};
21255 +
21256 +static int __init pat_memtype_list_init(void)
21257 +{
21258 + debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
21259 + NULL, &memtype_fops);
21260 + return 0;
21261 +}
21262 +
21263 +late_initcall(pat_memtype_list_init);
21264 +
21265 +#endif /* CONFIG_DEBUG_FS */
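
The reworked reserve_memtype() hunk above keeps the memtype list sorted by start address and remembers the most recent insertion point in cached_entry/cached_start, so a run of reservations with increasing addresses does not rescan the list from the head. Below is a minimal userspace sketch of that caching idea on a plain sorted singly linked list; struct range, range_insert() and the sample addresses are invented for the illustration and are not part of the kernel code.

/* Illustrative sketch only, not the kernel implementation. */
#include <stdio.h>
#include <stdlib.h>

struct range {				/* stands in for struct memtype */
	unsigned long long start, end;
	struct range *next;
};

static struct range *head;		/* list kept sorted by 'start' */
static struct range *cached;		/* last insertion point, like cached_entry */
static unsigned long long cached_start;

/* Insert [start, end) keeping the list sorted; start scanning from the
 * cached node when the new range begins at or after it. */
static struct range *range_insert(unsigned long long start, unsigned long long end)
{
	struct range *new = malloc(sizeof(*new));
	struct range **link;

	if (!new)
		return NULL;
	new->start = start;
	new->end = end;

	if (cached && start >= cached_start)
		link = &cached->next;	/* skip everything before the cached node */
	else
		link = &head;

	while (*link && (*link)->start < start)
		link = &(*link)->next;

	new->next = *link;
	*link = new;
	cached = new;
	cached_start = start;
	return new;
}

int main(void)
{
	struct range *r;

	range_insert(0x1000, 0x2000);
	range_insert(0x3000, 0x4000);	/* resumes from the cached position */
	for (r = head; r; r = r->next)
		printf("0x%llx-0x%llx\n", r->start, r->end);
	return 0;
}
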
21266 Index: head-2008-12-01/arch/x86/mm/pgtable-xen.c
21267 ===================================================================
21268 --- head-2008-12-01.orig/arch/x86/mm/pgtable-xen.c 2008-12-01 11:46:22.000000000 +0100
21269 +++ head-2008-12-01/arch/x86/mm/pgtable-xen.c 2008-12-01 11:49:07.000000000 +0100
21270 @@ -4,6 +4,7 @@
21271 #include <asm/pgalloc.h>
21272 #include <asm/pgtable.h>
21273 #include <asm/tlb.h>
21274 +#include <asm/fixmap.h>
21275 #include <asm/hypervisor.h>
21276 #include <asm/mmu_context.h>
21277
21278 @@ -410,15 +411,9 @@ static inline void pgd_list_del(pgd_t *p
21279 static void pgd_ctor(void *p)
21280 {
21281 pgd_t *pgd = p;
21282 - unsigned long flags;
21283
21284 pgd_test_and_unpin(pgd);
21285
21286 - /* Clear usermode parts of PGD */
21287 - memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
21288 -
21289 - spin_lock_irqsave(&pgd_lock, flags);
21290 -
21291 /* If the pgd points to a shared pagetable level (either the
21292 ptes in non-PAE, or shared PMD in PAE), then just copy the
21293 references from swapper_pg_dir. */
21294 @@ -440,13 +435,9 @@ static void pgd_ctor(void *p)
21295 __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
21296 #endif
21297
21298 -#ifndef CONFIG_X86_PAE
21299 /* list required to sync kernel mapping updates */
21300 if (!SHARED_KERNEL_PMD)
21301 pgd_list_add(pgd);
21302 -#endif
21303 -
21304 - spin_unlock_irqrestore(&pgd_lock, flags);
21305 }
21306
21307 static void pgd_dtor(void *pgd)
21308 @@ -475,33 +466,6 @@ static void pgd_dtor(void *pgd)
21309
21310 #ifdef CONFIG_X86_PAE
21311 /*
21312 - * Mop up any pmd pages which may still be attached to the pgd.
21313 - * Normally they will be freed by munmap/exit_mmap, but any pmd we
21314 - * preallocate which never got a corresponding vma will need to be
21315 - * freed manually.
21316 - */
21317 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21318 -{
21319 - int i;
21320 -
21321 - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
21322 - pgd_t pgd = pgdp[i];
21323 -
21324 - if (__pgd_val(pgd) != 0) {
21325 - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21326 -
21327 - pgdp[i] = xen_make_pgd(0);
21328 -
21329 - paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21330 - pmd_free(mm, pmd);
21331 - }
21332 - }
21333 -
21334 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21335 - xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21336 -}
21337 -
21338 -/*
21339 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
21340 * updating the top-level pagetable entries to guarantee the
21341 * processor notices the update. Since this is expensive, and
21342 @@ -512,61 +476,7 @@ static void pgd_mop_up_pmds(struct mm_st
21343 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
21344 * and initialize the kernel pmds here.
21345 */
21346 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21347 -{
21348 - pud_t *pud;
21349 - pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
21350 - unsigned long addr, flags;
21351 - int i;
21352 -
21353 - /*
21354 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
21355 - * allocation). We therefore store virtual addresses of pmds as they
21356 - * do not change across save/restore, and poke the machine addresses
21357 - * into the pgdir under the pgd_lock.
21358 - */
21359 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
21360 - pmds[i] = pmd_alloc_one(mm, addr);
21361 - if (!pmds[i])
21362 - goto out_oom;
21363 - }
21364 -
21365 - spin_lock_irqsave(&pgd_lock, flags);
21366 -
21367 - /* Protect against save/restore: move below 4GB under pgd_lock. */
21368 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21369 - && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21370 - spin_unlock_irqrestore(&pgd_lock, flags);
21371 -out_oom:
21372 - while (i--)
21373 - pmd_free(mm, pmds[i]);
21374 - return 0;
21375 - }
21376 -
21377 - /* Copy kernel pmd contents and write-protect the new pmds. */
21378 - pud = pud_offset(pgd, 0);
21379 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
21380 - i++, pud++, addr += PUD_SIZE) {
21381 - if (i >= KERNEL_PGD_BOUNDARY) {
21382 - memcpy(pmds[i],
21383 - (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21384 - sizeof(pmd_t) * PTRS_PER_PMD);
21385 - make_lowmem_page_readonly(
21386 - pmds[i], XENFEAT_writable_page_tables);
21387 - }
21388 -
21389 - /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21390 - pud_populate(mm, pud, pmds[i]);
21391 - }
21392 -
21393 - /* List required to sync kernel mapping updates and
21394 - * to pin/unpin on save/restore. */
21395 - pgd_list_add(pgd);
21396 -
21397 - spin_unlock_irqrestore(&pgd_lock, flags);
21398 -
21399 - return 1;
21400 -}
21401 +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
21402
21403 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
21404 {
21405 @@ -596,16 +506,97 @@ void pud_populate(struct mm_struct *mm,
21406 xen_tlb_flush();
21407 }
21408 #else /* !CONFIG_X86_PAE */
21409 +
21410 /* No need to prepopulate any pagetable entries in non-PAE modes. */
21411 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21412 +#define PREALLOCATED_PMDS 0
21413 +
21414 +#endif /* CONFIG_X86_PAE */
21415 +
21416 +static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
21417 {
21418 - return 1;
21419 + int i;
21420 +
21421 + if (contig)
21422 + xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
21423 +
21424 + for(i = 0; i < PREALLOCATED_PMDS; i++)
21425 + if (pmds[i])
21426 + pmd_free(mm, pmds[i]);
21427 }
21428
21429 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
21430 +static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
21431 {
21432 + int i;
21433 + bool failed = false;
21434 +
21435 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21436 + pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
21437 + if (pmd == NULL)
21438 + failed = true;
21439 + pmds[i] = pmd;
21440 + }
21441 +
21442 + if (failed) {
21443 + free_pmds(pmds, mm, false);
21444 + return -ENOMEM;
21445 + }
21446 +
21447 + return 0;
21448 +}
21449 +
21450 +/*
21451 + * Mop up any pmd pages which may still be attached to the pgd.
21452 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
21453 + * preallocate which never got a corresponding vma will need to be
21454 + * freed manually.
21455 + */
21456 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21457 +{
21458 + int i;
21459 +
21460 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21461 + pgd_t pgd = pgdp[i];
21462 +
21463 + if (__pgd_val(pgd) != 0) {
21464 + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21465 +
21466 + pgdp[i] = xen_make_pgd(0);
21467 +
21468 + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21469 + pmd_free(mm, pmd);
21470 + }
21471 + }
21472 +
21473 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21474 + xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21475 +}
21476 +
21477 +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
21478 +{
21479 + pud_t *pud;
21480 + unsigned long addr;
21481 + int i;
21482 +
21483 + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
21484 + return;
21485 +
21486 + pud = pud_offset(pgd, 0);
21487 + for (addr = i = 0; i < PREALLOCATED_PMDS;
21488 + i++, pud++, addr += PUD_SIZE) {
21489 + pmd_t *pmd = pmds[i];
21490 +
21491 + if (i >= KERNEL_PGD_BOUNDARY) {
21492 + memcpy(pmd,
21493 + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21494 + sizeof(pmd_t) * PTRS_PER_PMD);
21495 + make_lowmem_page_readonly(
21496 + pmd, XENFEAT_writable_page_tables);
21497 + }
21498 +
21499 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21500 + pud_populate(mm, pud, pmd);
21501 + }
21502 }
21503 -#endif /* CONFIG_X86_PAE */
21504
21505 #ifdef CONFIG_X86_64
21506 /* We allocate two contiguous pages for kernel and user. */
21507 @@ -616,19 +607,52 @@ static void pgd_mop_up_pmds(struct mm_st
21508
21509 pgd_t *pgd_alloc(struct mm_struct *mm)
21510 {
21511 - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21512 + pgd_t *pgd;
21513 + pmd_t *pmds[PREALLOCATED_PMDS];
21514 + unsigned long flags;
21515 +
21516 + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21517 +
21518 + if (pgd == NULL)
21519 + goto out;
21520
21521 - /* so that alloc_pd can use it */
21522 mm->pgd = pgd;
21523 - if (pgd)
21524 - pgd_ctor(pgd);
21525
21526 - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
21527 - free_pages((unsigned long)pgd, PGD_ORDER);
21528 - pgd = NULL;
21529 + if (preallocate_pmds(pmds, mm) != 0)
21530 + goto out_free_pgd;
21531 +
21532 + if (paravirt_pgd_alloc(mm) != 0)
21533 + goto out_free_pmds;
21534 +
21535 + /*
21536 + * Make sure that pre-populating the pmds is atomic with
21537 + * respect to anything walking the pgd_list, so that they
21538 + * never see a partially populated pgd.
21539 + */
21540 + spin_lock_irqsave(&pgd_lock, flags);
21541 +
21542 +#ifdef CONFIG_X86_PAE
21543 + /* Protect against save/restore: move below 4GB under pgd_lock. */
21544 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21545 + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21546 + spin_unlock_irqrestore(&pgd_lock, flags);
21547 + goto out_free_pmds;
21548 }
21549 +#endif
21550 +
21551 + pgd_ctor(pgd);
21552 + pgd_prepopulate_pmd(mm, pgd, pmds);
21553 +
21554 + spin_unlock_irqrestore(&pgd_lock, flags);
21555
21556 return pgd;
21557 +
21558 +out_free_pmds:
21559 + free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
21560 +out_free_pgd:
21561 + free_pages((unsigned long)pgd, PGD_ORDER);
21562 +out:
21563 + return NULL;
21564 }
21565
21566 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
21567 @@ -644,6 +668,7 @@ void pgd_free(struct mm_struct *mm, pgd_
21568 pgd_dtor(pgd);
21569
21570 pgd_mop_up_pmds(mm, pgd);
21571 + paravirt_pgd_free(mm, pgd);
21572 free_pages((unsigned long)pgd, PGD_ORDER);
21573 }
21574
21575 @@ -685,7 +710,7 @@ int ptep_test_and_clear_young(struct vm_
21576
21577 if (pte_young(*ptep))
21578 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
21579 - &ptep->pte);
21580 + (unsigned long *) &ptep->pte);
21581
21582 if (ret)
21583 pte_update(vma->vm_mm, addr, ptep);
21584 @@ -707,3 +732,42 @@ int ptep_clear_flush_young(struct vm_are
21585
21586 return young;
21587 }
21588 +
21589 +int fixmaps_set;
21590 +
21591 +void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21592 +{
21593 + unsigned long address = __fix_to_virt(idx);
21594 + pte_t pte;
21595 +
21596 + if (idx >= __end_of_fixed_addresses) {
21597 + BUG();
21598 + return;
21599 + }
21600 +
21601 + switch (idx) {
21602 +#ifdef CONFIG_X86_64
21603 + extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
21604 +
21605 + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
21606 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21607 + set_pte_vaddr_pud(level3_user_pgt, address, pte);
21608 + break;
21609 + case FIX_EARLYCON_MEM_BASE:
21610 + xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
21611 + pfn_pte_ma(phys >> PAGE_SHIFT, flags));
21612 + fixmaps_set++;
21613 + return;
21614 +#else
21615 + case FIX_WP_TEST:
21616 + case FIX_VDSO:
21617 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21618 + break;
21619 +#endif
21620 + default:
21621 + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21622 + break;
21623 + }
21624 + set_pte_vaddr(address, pte);
21625 + fixmaps_set++;
21626 +}
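
The new pgd_alloc() above preallocates the pmds before taking pgd_lock, builds the pgd under the lock so that, as the added comment says, anything walking pgd_list never sees a partially populated pgd, and unwinds with goto labels on failure. A rough userspace sketch of the same allocate-unlocked, publish-locked, unwind-on-error pattern follows; struct ctx, ctx_alloc() and NPARTS are made-up names, and a pthread mutex stands in for pgd_lock.

/* Illustrative sketch only; this is not kernel code. */
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

#define NPARTS 4

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;

struct ctx {
	void *parts[NPARTS];		/* plays the role of the preallocated pmds */
};

static struct ctx *ctx_alloc(void)
{
	struct ctx *c = calloc(1, sizeof(*c));
	int i;

	if (!c)
		goto out;

	/* do every allocation that may fail before taking the lock */
	for (i = 0; i < NPARTS; i++) {
		c->parts[i] = malloc(64);
		if (!c->parts[i])
			goto out_free_parts;
	}

	/* publish the fully built object in one locked section */
	pthread_mutex_lock(&ctx_lock);
	/* ... link c into a global list here ... */
	pthread_mutex_unlock(&ctx_lock);
	return c;

out_free_parts:
	while (i--)
		free(c->parts[i]);
	free(c);
out:
	return NULL;
}

int main(void)
{
	struct ctx *c = ctx_alloc();

	printf("%s\n", c ? "allocated" : "failed");
	free(c);	/* parts leak in this toy example; it only shows the pattern */
	return 0;
}
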
21627 Index: head-2008-12-01/arch/x86/mm/pgtable_32-xen.c
21628 ===================================================================
21629 --- head-2008-12-01.orig/arch/x86/mm/pgtable_32-xen.c 2008-12-01 11:44:55.000000000 +0100
21630 +++ head-2008-12-01/arch/x86/mm/pgtable_32-xen.c 2008-12-01 11:49:07.000000000 +0100
21631 @@ -25,51 +25,49 @@
21632 #include <xen/features.h>
21633 #include <asm/hypervisor.h>
21634
21635 -void show_mem(void)
21636 +/*
21637 + * Associate a virtual page frame with a given physical page frame
21638 + * and protection flags for that frame.
21639 + */
21640 +void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
21641 {
21642 - int total = 0, reserved = 0;
21643 - int shared = 0, cached = 0;
21644 - int highmem = 0;
21645 - struct page *page;
21646 - pg_data_t *pgdat;
21647 - unsigned long i;
21648 - unsigned long flags;
21649 -
21650 - printk(KERN_INFO "Mem-info:\n");
21651 - show_free_areas();
21652 - for_each_online_pgdat(pgdat) {
21653 - pgdat_resize_lock(pgdat, &flags);
21654 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
21655 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
21656 - touch_nmi_watchdog();
21657 - page = pgdat_page_nr(pgdat, i);
21658 - total++;
21659 - if (PageHighMem(page))
21660 - highmem++;
21661 - if (PageReserved(page))
21662 - reserved++;
21663 - else if (PageSwapCache(page))
21664 - cached++;
21665 - else if (page_count(page))
21666 - shared += page_count(page) - 1;
21667 - }
21668 - pgdat_resize_unlock(pgdat, &flags);
21669 - }
21670 - printk(KERN_INFO "%d pages of RAM\n", total);
21671 - printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
21672 - printk(KERN_INFO "%d reserved pages\n", reserved);
21673 - printk(KERN_INFO "%d pages shared\n", shared);
21674 - printk(KERN_INFO "%d pages swap cached\n", cached);
21675 -
21676 - printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
21677 - printk(KERN_INFO "%lu pages writeback\n",
21678 - global_page_state(NR_WRITEBACK));
21679 - printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
21680 - printk(KERN_INFO "%lu pages slab\n",
21681 - global_page_state(NR_SLAB_RECLAIMABLE) +
21682 - global_page_state(NR_SLAB_UNRECLAIMABLE));
21683 - printk(KERN_INFO "%lu pages pagetables\n",
21684 - global_page_state(NR_PAGETABLE));
21685 +#ifndef CONFIG_XEN
21686 + pgd_t *pgd;
21687 + pud_t *pud;
21688 + pmd_t *pmd;
21689 + pte_t *pte;
21690 +
21691 + pgd = swapper_pg_dir + pgd_index(vaddr);
21692 + if (pgd_none(*pgd)) {
21693 + BUG();
21694 + return;
21695 + }
21696 + pud = pud_offset(pgd, vaddr);
21697 + if (pud_none(*pud)) {
21698 + BUG();
21699 + return;
21700 + }
21701 + pmd = pmd_offset(pud, vaddr);
21702 + if (pmd_none(*pmd)) {
21703 + BUG();
21704 + return;
21705 + }
21706 + pte = pte_offset_kernel(pmd, vaddr);
21707 + if (pte_val(pteval))
21708 + set_pte_present(&init_mm, vaddr, pte, pteval);
21709 + else
21710 + pte_clear(&init_mm, vaddr, pte);
21711 +
21712 + /*
21713 + * It's enough to flush this one mapping.
21714 + * (PGE mappings get flushed as well)
21715 + */
21716 + __flush_tlb_one(vaddr);
21717 +#else
21718 + if (HYPERVISOR_update_va_mapping(vaddr, pteval,
21719 + UVMF_INVLPG|UVMF_ALL))
21720 + BUG();
21721 +#endif
21722 }
21723
21724 /*
21725 @@ -107,35 +105,10 @@ void set_pmd_pfn(unsigned long vaddr, un
21726 __flush_tlb_one(vaddr);
21727 }
21728
21729 -static int fixmaps;
21730 unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
21731 unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
21732 EXPORT_SYMBOL(__FIXADDR_TOP);
21733
21734 -void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21735 -{
21736 - unsigned long address = __fix_to_virt(idx);
21737 - pte_t pte;
21738 -
21739 - if (idx >= __end_of_fixed_addresses) {
21740 - BUG();
21741 - return;
21742 - }
21743 - switch (idx) {
21744 - case FIX_WP_TEST:
21745 - case FIX_VDSO:
21746 - pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21747 - break;
21748 - default:
21749 - pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21750 - break;
21751 - }
21752 - if (HYPERVISOR_update_va_mapping(address, pte,
21753 - UVMF_INVLPG|UVMF_ALL))
21754 - BUG();
21755 - fixmaps++;
21756 -}
21757 -
21758 /**
21759 * reserve_top_address - reserves a hole in the top of kernel address space
21760 * @reserve - size of hole to reserve
21761 @@ -145,13 +118,48 @@ void __set_fixmap (enum fixed_addresses
21762 */
21763 void __init reserve_top_address(unsigned long reserve)
21764 {
21765 - BUG_ON(fixmaps > 0);
21766 + BUG_ON(fixmaps_set > 0);
21767 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
21768 (int)-reserve);
21769 __FIXADDR_TOP = -reserve - PAGE_SIZE;
21770 __VMALLOC_RESERVE += reserve;
21771 }
21772
21773 +/*
21774 + * vmalloc=size forces the vmalloc area to be exactly 'size'
21775 + * bytes. This can be used to increase (or decrease) the
21776 + * vmalloc area - the default is 128m.
21777 + */
21778 +static int __init parse_vmalloc(char *arg)
21779 +{
21780 + if (!arg)
21781 + return -EINVAL;
21782 +
21783 + __VMALLOC_RESERVE = memparse(arg, &arg);
21784 + return 0;
21785 +}
21786 +early_param("vmalloc", parse_vmalloc);
21787 +
21788 +#ifndef CONFIG_XEN
21789 +/*
21790 + * reservetop=size reserves a hole at the top of the kernel address space which
21791 + * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
21792 + * so relocating the fixmap can be done before paging initialization.
21793 + */
21794 +static int __init parse_reservetop(char *arg)
21795 +{
21796 + unsigned long address;
21797 +
21798 + if (!arg)
21799 + return -EINVAL;
21800 +
21801 + address = memparse(arg, &arg);
21802 + reserve_top_address(address);
21803 + return 0;
21804 +}
21805 +early_param("reservetop", parse_reservetop);
21806 +#endif
21807 +
21808 void make_lowmem_page_readonly(void *va, unsigned int feature)
21809 {
21810 pte_t *pte;
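
The parse_vmalloc() and parse_reservetop() early_param handlers added above take a size argument via memparse(), so the vmalloc area can be resized from its 128m default and, on non-Xen kernels, a hole can be reserved at the top of the address space, e.g. by booting with vmalloc=192M. The sketch below is a simplified userspace stand-in for that suffix parsing; parse_size() is an invented name and only handles K/M/G, unlike the kernel's memparse().

/* Simplified stand-in for memparse(); illustrative only. */
#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *s)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': v <<= 10;	/* fall through */
	case 'M': case 'm': v <<= 10;	/* fall through */
	case 'K': case 'k': v <<= 10;
	}
	return v;
}

int main(void)
{
	/* e.g. a command line containing "vmalloc=192M" */
	printf("%llu bytes\n", parse_size("192M"));
	return 0;
}
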
21811 Index: head-2008-12-01/arch/x86/pci/amd_bus.c
21812 ===================================================================
21813 --- head-2008-12-01.orig/arch/x86/pci/amd_bus.c 2008-12-03 15:48:43.000000000 +0100
21814 +++ head-2008-12-01/arch/x86/pci/amd_bus.c 2008-12-01 11:49:07.000000000 +0100
21815 @@ -607,6 +607,14 @@ static int __init pci_io_ecs_init(void)
21816 for_each_online_cpu(cpu)
21817 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
21818 (void *)(long)cpu);
21819 +#ifdef CONFIG_XEN
21820 + {
21821 + u64 reg;
21822 + rdmsrl(MSR_AMD64_NB_CFG, reg);
21823 + if (!(reg & ENABLE_CF8_EXT_CFG))
21824 + return 0;
21825 + }
21826 +#endif
21827 pci_probe |= PCI_HAS_IO_ECS;
21828
21829 return 0;
21830 @@ -614,6 +622,10 @@ static int __init pci_io_ecs_init(void)
21831
21832 static int __init amd_postcore_init(void)
21833 {
21834 +#ifdef CONFIG_XEN
21835 + if (!is_initial_xendomain())
21836 + return 0;
21837 +#endif
21838 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
21839 return 0;
21840
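
The Xen-specific hunk in pci_io_ecs_init() above reads MSR_AMD64_NB_CFG and only advertises PCI_HAS_IO_ECS when ENABLE_CF8_EXT_CFG is already set in that register. A tiny sketch of that gating follows; the bit position and flag value used here are placeholders for the illustration, not the kernel's definitions, and a plain variable stands in for the rdmsrl() result.

/* Minimal illustration of the gating added above; illustrative values only. */
#include <stdint.h>
#include <stdio.h>

#define ENABLE_CF8_EXT_CFG	(1ULL << 46)	/* placeholder bit */
#define PCI_HAS_IO_ECS		0x40000		/* placeholder flag */

int main(void)
{
	uint64_t nb_cfg = ENABLE_CF8_EXT_CFG;	/* pretend rdmsrl() result */
	unsigned int pci_probe = 0;

	if (nb_cfg & ENABLE_CF8_EXT_CFG)
		pci_probe |= PCI_HAS_IO_ECS;

	printf("pci_probe=%#x\n", pci_probe);
	return 0;
}
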
21841 Index: head-2008-12-01/arch/x86/pci/irq-xen.c
21842 ===================================================================
21843 --- head-2008-12-01.orig/arch/x86/pci/irq-xen.c 2008-12-01 11:44:55.000000000 +0100
21844 +++ head-2008-12-01/arch/x86/pci/irq-xen.c 2008-12-01 11:49:07.000000000 +0100
21845 @@ -11,8 +11,8 @@
21846 #include <linux/slab.h>
21847 #include <linux/interrupt.h>
21848 #include <linux/dmi.h>
21849 -#include <asm/io.h>
21850 -#include <asm/smp.h>
21851 +#include <linux/io.h>
21852 +#include <linux/smp.h>
21853 #include <asm/io_apic.h>
21854 #include <linux/irq.h>
21855 #include <linux/acpi.h>
21856 @@ -45,7 +45,8 @@ struct irq_router {
21857 char *name;
21858 u16 vendor, device;
21859 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
21860 - int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
21861 + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
21862 + int new);
21863 };
21864
21865 struct irq_router_handler {
21866 @@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_d
21867 * and perform checksum verification.
21868 */
21869
21870 -static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
21871 +static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
21872 {
21873 struct irq_routing_table *rt;
21874 int i;
21875 @@ -74,10 +75,11 @@ static inline struct irq_routing_table *
21876 rt->size < sizeof(struct irq_routing_table))
21877 return NULL;
21878 sum = 0;
21879 - for (i=0; i < rt->size; i++)
21880 + for (i = 0; i < rt->size; i++)
21881 sum += addr[i];
21882 if (!sum) {
21883 - DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
21884 + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
21885 + rt);
21886 return rt;
21887 }
21888 return NULL;
21889 @@ -104,7 +106,9 @@ static struct irq_routing_table * __init
21890 return rt;
21891 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
21892 }
21893 - for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
21894 + for (addr = (u8 *) isa_bus_to_virt(0xf0000);
21895 + addr < (u8 *) isa_bus_to_virt(0x100000);
21896 + addr += 16) {
21897 rt = pirq_check_routing_table(addr);
21898 if (rt)
21899 return rt;
21900 @@ -126,20 +130,20 @@ static void __init pirq_peer_trick(void)
21901 struct irq_info *e;
21902
21903 memset(busmap, 0, sizeof(busmap));
21904 - for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
21905 + for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
21906 e = &rt->slots[i];
21907 #ifdef DEBUG
21908 {
21909 int j;
21910 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
21911 - for(j=0; j<4; j++)
21912 + for (j = 0; j < 4; j++)
21913 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
21914 DBG("\n");
21915 }
21916 #endif
21917 busmap[e->bus] = 1;
21918 }
21919 - for(i = 1; i < 256; i++) {
21920 + for (i = 1; i < 256; i++) {
21921 int node;
21922 if (!busmap[i] || pci_find_bus(0, i))
21923 continue;
21924 @@ -187,7 +191,8 @@ static unsigned int read_config_nybble(s
21925 return (nr & 1) ? (x >> 4) : (x & 0xf);
21926 }
21927
21928 -static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
21929 +static void write_config_nybble(struct pci_dev *router, unsigned offset,
21930 + unsigned nr, unsigned int val)
21931 {
21932 u8 x;
21933 unsigned reg = offset + (nr >> 1);
21934 @@ -289,7 +294,7 @@ static int pirq_ite_get(struct pci_dev *
21935 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
21936
21937 WARN_ON_ONCE(pirq > 4);
21938 - return read_config_nybble(router,0x43, pirqmap[pirq-1]);
21939 + return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
21940 }
21941
21942 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21943 @@ -318,7 +323,7 @@ static int pirq_opti_set(struct pci_dev
21944
21945 /*
21946 * Cyrix: nibble offset 0x5C
21947 - * 0x5C bits 7:4 is INTB bits 3:0 is INTA
21948 + * 0x5C bits 7:4 is INTB bits 3:0 is INTA
21949 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
21950 */
21951 static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
21952 @@ -354,7 +359,7 @@ static int pirq_cyrix_set(struct pci_dev
21953 * Apparently there are systems implementing PCI routing table using
21954 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
21955 * We try our best to handle both link mappings.
21956 - *
21957 + *
21958 * Currently (2003-05-21) it appears most SiS chipsets follow the
21959 * definition of routing registers from the SiS-5595 southbridge.
21960 * According to the SiS 5595 datasheets the revision id's of the
21961 @@ -374,7 +379,7 @@ static int pirq_cyrix_set(struct pci_dev
21962 *
21963 * 0x62: USBIRQ:
21964 * bit 6 OHCI function disabled (0), enabled (1)
21965 - *
21966 + *
21967 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
21968 *
21969 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
21970 @@ -437,7 +442,7 @@ static int pirq_vlsi_get(struct pci_dev
21971 {
21972 WARN_ON_ONCE(pirq >= 9);
21973 if (pirq > 8) {
21974 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
21975 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
21976 return 0;
21977 }
21978 return read_config_nybble(router, 0x74, pirq-1);
21979 @@ -447,7 +452,7 @@ static int pirq_vlsi_set(struct pci_dev
21980 {
21981 WARN_ON_ONCE(pirq >= 9);
21982 if (pirq > 8) {
21983 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
21984 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
21985 return 0;
21986 }
21987 write_config_nybble(router, 0x74, pirq-1, irq);
21988 @@ -471,7 +476,8 @@ static int pirq_serverworks_get(struct p
21989 return inb(0xc01) & 0xf;
21990 }
21991
21992 -static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21993 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
21994 + int pirq, int irq)
21995 {
21996 outb(pirq, 0xc00);
21997 outb(irq, 0xc01);
21998 @@ -491,22 +497,20 @@ static int pirq_amd756_get(struct pci_de
21999 u8 irq;
22000 irq = 0;
22001 if (pirq <= 4)
22002 - {
22003 irq = read_config_nybble(router, 0x56, pirq - 1);
22004 - }
22005 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
22006 - dev->vendor, dev->device, pirq, irq);
22007 + dev_info(&dev->dev,
22008 + "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
22009 + dev->vendor, dev->device, pirq, irq);
22010 return irq;
22011 }
22012
22013 static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22014 {
22015 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
22016 - dev->vendor, dev->device, pirq, irq);
22017 + dev_info(&dev->dev,
22018 + "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
22019 + dev->vendor, dev->device, pirq, irq);
22020 if (pirq <= 4)
22021 - {
22022 write_config_nybble(router, 0x56, pirq - 1, irq);
22023 - }
22024 return 1;
22025 }
22026
22027 @@ -553,50 +557,51 @@ static __init int intel_router_probe(str
22028 if (pci_dev_present(pirq_440gx))
22029 return 0;
22030
22031 - switch(device)
22032 - {
22033 - case PCI_DEVICE_ID_INTEL_82371FB_0:
22034 - case PCI_DEVICE_ID_INTEL_82371SB_0:
22035 - case PCI_DEVICE_ID_INTEL_82371AB_0:
22036 - case PCI_DEVICE_ID_INTEL_82371MX:
22037 - case PCI_DEVICE_ID_INTEL_82443MX_0:
22038 - case PCI_DEVICE_ID_INTEL_82801AA_0:
22039 - case PCI_DEVICE_ID_INTEL_82801AB_0:
22040 - case PCI_DEVICE_ID_INTEL_82801BA_0:
22041 - case PCI_DEVICE_ID_INTEL_82801BA_10:
22042 - case PCI_DEVICE_ID_INTEL_82801CA_0:
22043 - case PCI_DEVICE_ID_INTEL_82801CA_12:
22044 - case PCI_DEVICE_ID_INTEL_82801DB_0:
22045 - case PCI_DEVICE_ID_INTEL_82801E_0:
22046 - case PCI_DEVICE_ID_INTEL_82801EB_0:
22047 - case PCI_DEVICE_ID_INTEL_ESB_1:
22048 - case PCI_DEVICE_ID_INTEL_ICH6_0:
22049 - case PCI_DEVICE_ID_INTEL_ICH6_1:
22050 - case PCI_DEVICE_ID_INTEL_ICH7_0:
22051 - case PCI_DEVICE_ID_INTEL_ICH7_1:
22052 - case PCI_DEVICE_ID_INTEL_ICH7_30:
22053 - case PCI_DEVICE_ID_INTEL_ICH7_31:
22054 - case PCI_DEVICE_ID_INTEL_ESB2_0:
22055 - case PCI_DEVICE_ID_INTEL_ICH8_0:
22056 - case PCI_DEVICE_ID_INTEL_ICH8_1:
22057 - case PCI_DEVICE_ID_INTEL_ICH8_2:
22058 - case PCI_DEVICE_ID_INTEL_ICH8_3:
22059 - case PCI_DEVICE_ID_INTEL_ICH8_4:
22060 - case PCI_DEVICE_ID_INTEL_ICH9_0:
22061 - case PCI_DEVICE_ID_INTEL_ICH9_1:
22062 - case PCI_DEVICE_ID_INTEL_ICH9_2:
22063 - case PCI_DEVICE_ID_INTEL_ICH9_3:
22064 - case PCI_DEVICE_ID_INTEL_ICH9_4:
22065 - case PCI_DEVICE_ID_INTEL_ICH9_5:
22066 - case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22067 - case PCI_DEVICE_ID_INTEL_ICH10_0:
22068 - case PCI_DEVICE_ID_INTEL_ICH10_1:
22069 - case PCI_DEVICE_ID_INTEL_ICH10_2:
22070 - case PCI_DEVICE_ID_INTEL_ICH10_3:
22071 - r->name = "PIIX/ICH";
22072 - r->get = pirq_piix_get;
22073 - r->set = pirq_piix_set;
22074 - return 1;
22075 + switch (device) {
22076 + case PCI_DEVICE_ID_INTEL_82371FB_0:
22077 + case PCI_DEVICE_ID_INTEL_82371SB_0:
22078 + case PCI_DEVICE_ID_INTEL_82371AB_0:
22079 + case PCI_DEVICE_ID_INTEL_82371MX:
22080 + case PCI_DEVICE_ID_INTEL_82443MX_0:
22081 + case PCI_DEVICE_ID_INTEL_82801AA_0:
22082 + case PCI_DEVICE_ID_INTEL_82801AB_0:
22083 + case PCI_DEVICE_ID_INTEL_82801BA_0:
22084 + case PCI_DEVICE_ID_INTEL_82801BA_10:
22085 + case PCI_DEVICE_ID_INTEL_82801CA_0:
22086 + case PCI_DEVICE_ID_INTEL_82801CA_12:
22087 + case PCI_DEVICE_ID_INTEL_82801DB_0:
22088 + case PCI_DEVICE_ID_INTEL_82801E_0:
22089 + case PCI_DEVICE_ID_INTEL_82801EB_0:
22090 + case PCI_DEVICE_ID_INTEL_ESB_1:
22091 + case PCI_DEVICE_ID_INTEL_ICH6_0:
22092 + case PCI_DEVICE_ID_INTEL_ICH6_1:
22093 + case PCI_DEVICE_ID_INTEL_ICH7_0:
22094 + case PCI_DEVICE_ID_INTEL_ICH7_1:
22095 + case PCI_DEVICE_ID_INTEL_ICH7_30:
22096 + case PCI_DEVICE_ID_INTEL_ICH7_31:
22097 + case PCI_DEVICE_ID_INTEL_ESB2_0:
22098 + case PCI_DEVICE_ID_INTEL_ICH8_0:
22099 + case PCI_DEVICE_ID_INTEL_ICH8_1:
22100 + case PCI_DEVICE_ID_INTEL_ICH8_2:
22101 + case PCI_DEVICE_ID_INTEL_ICH8_3:
22102 + case PCI_DEVICE_ID_INTEL_ICH8_4:
22103 + case PCI_DEVICE_ID_INTEL_ICH9_0:
22104 + case PCI_DEVICE_ID_INTEL_ICH9_1:
22105 + case PCI_DEVICE_ID_INTEL_ICH9_2:
22106 + case PCI_DEVICE_ID_INTEL_ICH9_3:
22107 + case PCI_DEVICE_ID_INTEL_ICH9_4:
22108 + case PCI_DEVICE_ID_INTEL_ICH9_5:
22109 + case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22110 + case PCI_DEVICE_ID_INTEL_ICH10_0:
22111 + case PCI_DEVICE_ID_INTEL_ICH10_1:
22112 + case PCI_DEVICE_ID_INTEL_ICH10_2:
22113 + case PCI_DEVICE_ID_INTEL_ICH10_3:
22114 + case PCI_DEVICE_ID_INTEL_PCH_0:
22115 + case PCI_DEVICE_ID_INTEL_PCH_1:
22116 + r->name = "PIIX/ICH";
22117 + r->get = pirq_piix_get;
22118 + r->set = pirq_piix_set;
22119 + return 1;
22120 }
22121 return 0;
22122 }
22123 @@ -610,7 +615,7 @@ static __init int via_router_probe(struc
22124 * workarounds for some buggy BIOSes
22125 */
22126 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
22127 - switch(router->device) {
22128 + switch (router->device) {
22129 case PCI_DEVICE_ID_VIA_82C686:
22130 /*
22131 * Asus k7m bios wrongly reports 82C686A
22132 @@ -635,7 +640,7 @@ static __init int via_router_probe(struc
22133 }
22134 }
22135
22136 - switch(device) {
22137 + switch (device) {
22138 case PCI_DEVICE_ID_VIA_82C586_0:
22139 r->name = "VIA";
22140 r->get = pirq_via586_get;
22141 @@ -658,28 +663,27 @@ static __init int via_router_probe(struc
22142
22143 static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22144 {
22145 - switch(device)
22146 - {
22147 - case PCI_DEVICE_ID_VLSI_82C534:
22148 - r->name = "VLSI 82C534";
22149 - r->get = pirq_vlsi_get;
22150 - r->set = pirq_vlsi_set;
22151 - return 1;
22152 + switch (device) {
22153 + case PCI_DEVICE_ID_VLSI_82C534:
22154 + r->name = "VLSI 82C534";
22155 + r->get = pirq_vlsi_get;
22156 + r->set = pirq_vlsi_set;
22157 + return 1;
22158 }
22159 return 0;
22160 }
22161
22162
22163 -static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22164 +static __init int serverworks_router_probe(struct irq_router *r,
22165 + struct pci_dev *router, u16 device)
22166 {
22167 - switch(device)
22168 - {
22169 - case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22170 - case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22171 - r->name = "ServerWorks";
22172 - r->get = pirq_serverworks_get;
22173 - r->set = pirq_serverworks_set;
22174 - return 1;
22175 + switch (device) {
22176 + case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22177 + case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22178 + r->name = "ServerWorks";
22179 + r->get = pirq_serverworks_get;
22180 + r->set = pirq_serverworks_set;
22181 + return 1;
22182 }
22183 return 0;
22184 }
22185 @@ -688,7 +692,7 @@ static __init int sis_router_probe(struc
22186 {
22187 if (device != PCI_DEVICE_ID_SI_503)
22188 return 0;
22189 -
22190 +
22191 r->name = "SIS";
22192 r->get = pirq_sis_get;
22193 r->set = pirq_sis_set;
22194 @@ -697,50 +701,45 @@ static __init int sis_router_probe(struc
22195
22196 static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22197 {
22198 - switch(device)
22199 - {
22200 - case PCI_DEVICE_ID_CYRIX_5520:
22201 - r->name = "NatSemi";
22202 - r->get = pirq_cyrix_get;
22203 - r->set = pirq_cyrix_set;
22204 - return 1;
22205 + switch (device) {
22206 + case PCI_DEVICE_ID_CYRIX_5520:
22207 + r->name = "NatSemi";
22208 + r->get = pirq_cyrix_get;
22209 + r->set = pirq_cyrix_set;
22210 + return 1;
22211 }
22212 return 0;
22213 }
22214
22215 static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22216 {
22217 - switch(device)
22218 - {
22219 - case PCI_DEVICE_ID_OPTI_82C700:
22220 - r->name = "OPTI";
22221 - r->get = pirq_opti_get;
22222 - r->set = pirq_opti_set;
22223 - return 1;
22224 + switch (device) {
22225 + case PCI_DEVICE_ID_OPTI_82C700:
22226 + r->name = "OPTI";
22227 + r->get = pirq_opti_get;
22228 + r->set = pirq_opti_set;
22229 + return 1;
22230 }
22231 return 0;
22232 }
22233
22234 static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22235 {
22236 - switch(device)
22237 - {
22238 - case PCI_DEVICE_ID_ITE_IT8330G_0:
22239 - r->name = "ITE";
22240 - r->get = pirq_ite_get;
22241 - r->set = pirq_ite_set;
22242 - return 1;
22243 + switch (device) {
22244 + case PCI_DEVICE_ID_ITE_IT8330G_0:
22245 + r->name = "ITE";
22246 + r->get = pirq_ite_get;
22247 + r->set = pirq_ite_set;
22248 + return 1;
22249 }
22250 return 0;
22251 }
22252
22253 static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22254 {
22255 - switch(device)
22256 - {
22257 + switch (device) {
22258 case PCI_DEVICE_ID_AL_M1533:
22259 case PCI_DEVICE_ID_AL_M1563:
22260 - printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
22261 r->name = "ALI";
22262 r->get = pirq_ali_get;
22263 r->set = pirq_ali_set;
22264 @@ -751,25 +750,24 @@ static __init int ali_router_probe(struc
22265
22266 static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22267 {
22268 - switch(device)
22269 - {
22270 - case PCI_DEVICE_ID_AMD_VIPER_740B:
22271 - r->name = "AMD756";
22272 - break;
22273 - case PCI_DEVICE_ID_AMD_VIPER_7413:
22274 - r->name = "AMD766";
22275 - break;
22276 - case PCI_DEVICE_ID_AMD_VIPER_7443:
22277 - r->name = "AMD768";
22278 - break;
22279 - default:
22280 - return 0;
22281 + switch (device) {
22282 + case PCI_DEVICE_ID_AMD_VIPER_740B:
22283 + r->name = "AMD756";
22284 + break;
22285 + case PCI_DEVICE_ID_AMD_VIPER_7413:
22286 + r->name = "AMD766";
22287 + break;
22288 + case PCI_DEVICE_ID_AMD_VIPER_7443:
22289 + r->name = "AMD768";
22290 + break;
22291 + default:
22292 + return 0;
22293 }
22294 r->get = pirq_amd756_get;
22295 r->set = pirq_amd756_set;
22296 return 1;
22297 }
22298 -
22299 +
22300 static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22301 {
22302 switch (device) {
22303 @@ -811,7 +809,7 @@ static struct pci_dev *pirq_router_dev;
22304 * FIXME: should we have an option to say "generic for
22305 * chipset" ?
22306 */
22307 -
22308 +
22309 static void __init pirq_find_router(struct irq_router *r)
22310 {
22311 struct irq_routing_table *rt = pirq_table;
22312 @@ -830,7 +828,7 @@ static void __init pirq_find_router(stru
22313 r->name = "default";
22314 r->get = NULL;
22315 r->set = NULL;
22316 -
22317 +
22318 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
22319 rt->rtr_vendor, rt->rtr_device);
22320
22321 @@ -841,19 +839,19 @@ static void __init pirq_find_router(stru
22322 return;
22323 }
22324
22325 - for( h = pirq_routers; h->vendor; h++) {
22326 + for (h = pirq_routers; h->vendor; h++) {
22327 /* First look for a router match */
22328 - if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
22329 + if (rt->rtr_vendor == h->vendor &&
22330 + h->probe(r, pirq_router_dev, rt->rtr_device))
22331 break;
22332 /* Fall back to a device match */
22333 - if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
22334 + if (pirq_router_dev->vendor == h->vendor &&
22335 + h->probe(r, pirq_router_dev, pirq_router_dev->device))
22336 break;
22337 }
22338 - printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
22339 - pirq_router.name,
22340 - pirq_router_dev->vendor,
22341 - pirq_router_dev->device,
22342 - pci_name(pirq_router_dev));
22343 + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
22344 + pirq_router.name,
22345 + pirq_router_dev->vendor, pirq_router_dev->device);
22346
22347 /* The device remains referenced for the kernel lifetime */
22348 }
22349 @@ -861,11 +859,13 @@ static void __init pirq_find_router(stru
22350 static struct irq_info *pirq_get_info(struct pci_dev *dev)
22351 {
22352 struct irq_routing_table *rt = pirq_table;
22353 - int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
22354 + int entries = (rt->size - sizeof(struct irq_routing_table)) /
22355 + sizeof(struct irq_info);
22356 struct irq_info *info;
22357
22358 for (info = rt->slots; entries--; info++)
22359 - if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22360 + if (info->bus == dev->bus->number &&
22361 + PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22362 return info;
22363 return NULL;
22364 }
22365 @@ -884,7 +884,7 @@ static int pcibios_lookup_irq(struct pci
22366 /* Find IRQ pin */
22367 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
22368 if (!pin) {
22369 - DBG(KERN_DEBUG " -> no interrupt pin\n");
22370 + dev_dbg(&dev->dev, "no interrupt pin\n");
22371 return 0;
22372 }
22373 pin = pin - 1;
22374 @@ -893,20 +893,21 @@ static int pcibios_lookup_irq(struct pci
22375
22376 if (!pirq_table)
22377 return 0;
22378 -
22379 - DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
22380 +
22381 info = pirq_get_info(dev);
22382 if (!info) {
22383 - DBG(" -> not found in routing table\n" KERN_DEBUG);
22384 + dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
22385 + 'A' + pin);
22386 return 0;
22387 }
22388 pirq = info->irq[pin].link;
22389 mask = info->irq[pin].bitmap;
22390 if (!pirq) {
22391 - DBG(" -> not routed\n" KERN_DEBUG);
22392 + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
22393 return 0;
22394 }
22395 - DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
22396 + dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
22397 + 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
22398 mask &= pcibios_irq_mask;
22399
22400 /* Work around broken HP Pavilion Notebooks which assign USB to
22401 @@ -919,7 +920,8 @@ static int pcibios_lookup_irq(struct pci
22402 }
22403
22404 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
22405 - if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
22406 + if (acer_tm360_irqrouting && dev->irq == 11 &&
22407 + dev->vendor == PCI_VENDOR_ID_O2) {
22408 pirq = 0x68;
22409 mask = 0x400;
22410 dev->irq = r->get(pirq_router_dev, dev, pirq);
22411 @@ -932,51 +934,50 @@ static int pcibios_lookup_irq(struct pci
22412 */
22413 newirq = dev->irq;
22414 if (newirq && !((1 << newirq) & mask)) {
22415 - if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
22416 - else printk("\n" KERN_WARNING
22417 - "PCI: IRQ %i for device %s doesn't match PIRQ mask "
22418 - "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
22419 - pci_name(dev));
22420 + if (pci_probe & PCI_USE_PIRQ_MASK)
22421 + newirq = 0;
22422 + else
22423 + dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
22424 + "%#x; try pci=usepirqmask\n", newirq, mask);
22425 }
22426 if (!newirq && assign) {
22427 for (i = 0; i < 16; i++) {
22428 if (!(mask & (1 << i)))
22429 continue;
22430 - if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
22431 + if (pirq_penalty[i] < pirq_penalty[newirq] &&
22432 + can_request_irq(i, IRQF_SHARED))
22433 newirq = i;
22434 }
22435 }
22436 - DBG(" -> newirq=%d", newirq);
22437 + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
22438
22439 /* Check if it is hardcoded */
22440 if ((pirq & 0xf0) == 0xf0) {
22441 irq = pirq & 0xf;
22442 - DBG(" -> hardcoded IRQ %d\n", irq);
22443 - msg = "Hardcoded";
22444 - } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22445 - ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
22446 - DBG(" -> got IRQ %d\n", irq);
22447 - msg = "Found";
22448 + msg = "hardcoded";
22449 + } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22450 + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
22451 + msg = "found";
22452 eisa_set_level_irq(irq);
22453 - } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22454 - DBG(" -> assigning IRQ %d", newirq);
22455 + } else if (newirq && r->set &&
22456 + (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22457 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
22458 eisa_set_level_irq(newirq);
22459 - DBG(" ... OK\n");
22460 - msg = "Assigned";
22461 + msg = "assigned";
22462 irq = newirq;
22463 }
22464 }
22465
22466 if (!irq) {
22467 - DBG(" ... failed\n");
22468 if (newirq && mask == (1 << newirq)) {
22469 - msg = "Guessed";
22470 + msg = "guessed";
22471 irq = newirq;
22472 - } else
22473 + } else {
22474 + dev_dbg(&dev->dev, "can't route interrupt\n");
22475 return 0;
22476 + }
22477 }
22478 - printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
22479 + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
22480
22481 /* Update IRQ for all devices with the same pirq value */
22482 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
22483 @@ -988,20 +989,25 @@ static int pcibios_lookup_irq(struct pci
22484 if (!info)
22485 continue;
22486 if (info->irq[pin].link == pirq) {
22487 - /* We refuse to override the dev->irq information. Give a warning! */
22488 - if ( dev2->irq && dev2->irq != irq && \
22489 + /*
22490 + * We refuse to override the dev->irq
22491 + * information. Give a warning!
22492 + */
22493 + if (dev2->irq && dev2->irq != irq && \
22494 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
22495 - ((1 << dev2->irq) & mask)) ) {
22496 + ((1 << dev2->irq) & mask))) {
22497 #ifndef CONFIG_PCI_MSI
22498 - printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
22499 - pci_name(dev2), dev2->irq, irq);
22500 + dev_info(&dev2->dev, "IRQ routing conflict: "
22501 + "have IRQ %d, want IRQ %d\n",
22502 + dev2->irq, irq);
22503 #endif
22504 - continue;
22505 - }
22506 + continue;
22507 + }
22508 dev2->irq = irq;
22509 pirq_penalty[irq]++;
22510 if (dev != dev2)
22511 - printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
22512 + dev_info(&dev->dev, "sharing IRQ %d with %s\n",
22513 + irq, pci_name(dev2));
22514 }
22515 }
22516 return 1;
22517 @@ -1015,15 +1021,20 @@ static void __init pcibios_fixup_irqs(vo
22518 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
22519 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
22520 /*
22521 - * If the BIOS has set an out of range IRQ number, just ignore it.
22522 - * Also keep track of which IRQ's are already in use.
22523 + * If the BIOS has set an out of range IRQ number, just
22524 + * ignore it. Also keep track of which IRQ's are
22525 + * already in use.
22526 */
22527 if (dev->irq >= 16) {
22528 - DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
22529 + dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
22530 dev->irq = 0;
22531 }
22532 - /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
22533 - if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
22534 + /*
22535 + * If the IRQ is already assigned to a PCI device,
22536 + * ignore its ISA use penalty
22537 + */
22538 + if (pirq_penalty[dev->irq] >= 100 &&
22539 + pirq_penalty[dev->irq] < 100000)
22540 pirq_penalty[dev->irq] = 0;
22541 pirq_penalty[dev->irq]++;
22542 }
22543 @@ -1035,13 +1046,17 @@ static void __init pcibios_fixup_irqs(vo
22544 /*
22545 * Recalculate IRQ numbers if we use the I/O APIC.
22546 */
22547 - if (io_apic_assign_pci_irqs)
22548 - {
22549 + if (io_apic_assign_pci_irqs) {
22550 int irq;
22551
22552 if (pin) {
22553 - pin--; /* interrupt pins are numbered starting from 1 */
22554 - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
22555 + /*
22556 + * interrupt pins are numbered starting
22557 + * from 1
22558 + */
22559 + pin--;
22560 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
22561 + PCI_SLOT(dev->devfn), pin);
22562 /*
22563 * Busses behind bridges are typically not listed in the MP-table.
22564 * In this case we have to look up the IRQ based on the parent bus,
22565 @@ -1049,18 +1064,18 @@ static void __init pcibios_fixup_irqs(vo
22566 * busses itself so we should get into this branch reliably.
22567 */
22568 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22569 - struct pci_dev * bridge = dev->bus->self;
22570 + struct pci_dev *bridge = dev->bus->self;
22571
22572 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22573 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22574 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22575 PCI_SLOT(bridge->devfn), pin);
22576 if (irq >= 0)
22577 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22578 - pci_name(bridge), 'A' + pin, irq);
22579 + dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
22580 + pci_name(bridge),
22581 + 'A' + pin, irq);
22582 }
22583 if (irq >= 0) {
22584 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22585 - pci_name(dev), 'A' + pin, irq);
22586 + dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
22587 dev->irq = irq;
22588 }
22589 }
22590 @@ -1082,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq
22591 {
22592 if (!broken_hp_bios_irq9) {
22593 broken_hp_bios_irq9 = 1;
22594 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22595 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22596 + d->ident);
22597 }
22598 return 0;
22599 }
22600 @@ -1095,7 +1111,8 @@ static int __init fix_acer_tm360_irqrout
22601 {
22602 if (!acer_tm360_irqrouting) {
22603 acer_tm360_irqrouting = 1;
22604 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22605 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22606 + d->ident);
22607 }
22608 return 0;
22609 }
22610 @@ -1107,7 +1124,8 @@ static struct dmi_system_id __initdata p
22611 .matches = {
22612 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
22613 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
22614 - DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
22615 + DMI_MATCH(DMI_PRODUCT_VERSION,
22616 + "HP Pavilion Notebook Model GE"),
22617 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
22618 },
22619 },
22620 @@ -1122,7 +1140,7 @@ static struct dmi_system_id __initdata p
22621 { }
22622 };
22623
22624 -static int __init pcibios_irq_init(void)
22625 +int __init pcibios_irq_init(void)
22626 {
22627 DBG(KERN_DEBUG "PCI: IRQ init\n");
22628
22629 @@ -1142,11 +1160,14 @@ static int __init pcibios_irq_init(void)
22630 pirq_find_router(&pirq_router);
22631 if (pirq_table->exclusive_irqs) {
22632 int i;
22633 - for (i=0; i<16; i++)
22634 + for (i = 0; i < 16; i++)
22635 if (!(pirq_table->exclusive_irqs & (1 << i)))
22636 pirq_penalty[i] += 100;
22637 }
22638 - /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
22639 + /*
22640 + * If we're using the I/O APIC, avoid using the PCI IRQ
22641 + * routing table
22642 + */
22643 if (io_apic_assign_pci_irqs)
22644 pirq_table = NULL;
22645 }
22646 @@ -1157,9 +1178,6 @@ static int __init pcibios_irq_init(void)
22647 return 0;
22648 }
22649
22650 -subsys_initcall(pcibios_irq_init);
22651 -
22652 -
22653 static void pirq_penalize_isa_irq(int irq, int active)
22654 {
22655 /*
22656 @@ -1193,7 +1211,7 @@ static int pirq_enable_irq(struct pci_de
22657 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
22658 char *msg = "";
22659
22660 - pin--; /* interrupt pins are numbered starting from 1 */
22661 + pin--; /* interrupt pins are numbered starting from 1 */
22662
22663 if (io_apic_assign_pci_irqs) {
22664 int irq;
22665 @@ -1207,35 +1225,41 @@ static int pirq_enable_irq(struct pci_de
22666 */
22667 temp_dev = dev;
22668 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22669 - struct pci_dev * bridge = dev->bus->self;
22670 + struct pci_dev *bridge = dev->bus->self;
22671
22672 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22673 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22674 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22675 PCI_SLOT(bridge->devfn), pin);
22676 if (irq >= 0)
22677 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22678 - pci_name(bridge), 'A' + pin, irq);
22679 + dev_warn(&dev->dev, "using bridge %s "
22680 + "INT %c to get IRQ %d\n",
22681 + pci_name(bridge), 'A' + pin,
22682 + irq);
22683 dev = bridge;
22684 }
22685 dev = temp_dev;
22686 if (irq >= 0) {
22687 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22688 - pci_name(dev), 'A' + pin, irq);
22689 + dev_info(&dev->dev, "PCI->APIC IRQ transform: "
22690 + "INT %c -> IRQ %d\n", 'A' + pin, irq);
22691 dev->irq = irq;
22692 return 0;
22693 } else
22694 - msg = " Probably buggy MP table.";
22695 + msg = "; probably buggy MP table";
22696 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
22697 msg = "";
22698 else
22699 - msg = " Please try using pci=biosirq.";
22700 + msg = "; please try using pci=biosirq";
22701
22702 - /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
22703 - if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
22704 + /*
22705 + * With IDE legacy devices the IRQ lookup failure is not
22706 + * a problem..
22707 + */
22708 + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
22709 + !(dev->class & 0x5))
22710 return 0;
22711
22712 - printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
22713 - 'A' + pin, pci_name(dev), msg);
22714 + dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
22715 + 'A' + pin, msg);
22716 }
22717 return 0;
22718 }
22719 Index: head-2008-12-01/arch/x86/vdso/Makefile
22720 ===================================================================
22721 --- head-2008-12-01.orig/arch/x86/vdso/Makefile 2008-12-01 11:37:10.000000000 +0100
22722 +++ head-2008-12-01/arch/x86/vdso/Makefile 2008-12-01 11:49:07.000000000 +0100
22723 @@ -65,9 +65,7 @@ obj-$(VDSO32-y) += vdso32-syms.lds
22724 vdso32.so-$(VDSO32-y) += int80
22725 vdso32.so-$(CONFIG_COMPAT) += syscall
22726 vdso32.so-$(VDSO32-y) += sysenter
22727 -xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
22728 -xen-vdso32-$(CONFIG_X86_32) += syscall
22729 -vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
22730 +vdso32.so-$(CONFIG_X86_XEN) += syscall
22731
22732 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
22733
22734 Index: head-2008-12-01/arch/x86/vdso/vdso32.S
22735 ===================================================================
22736 --- head-2008-12-01.orig/arch/x86/vdso/vdso32.S 2008-12-01 11:37:10.000000000 +0100
22737 +++ head-2008-12-01/arch/x86/vdso/vdso32.S 2008-12-01 11:49:07.000000000 +0100
22738 @@ -9,7 +9,7 @@ vdso32_int80_end:
22739
22740 .globl vdso32_syscall_start, vdso32_syscall_end
22741 vdso32_syscall_start:
22742 -#ifdef CONFIG_COMPAT
22743 +#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
22744 .incbin "arch/x86/vdso/vdso32-syscall.so"
22745 #endif
22746 vdso32_syscall_end:
22747 @@ -19,16 +19,4 @@ vdso32_sysenter_start:
22748 .incbin "arch/x86/vdso/vdso32-sysenter.so"
22749 vdso32_sysenter_end:
22750
22751 -#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
22752 - .globl vdso32_int80_start, vdso32_int80_end
22753 -vdso32_int80_start:
22754 - .incbin "arch/x86/vdso/vdso32-int80.so"
22755 -vdso32_int80_end:
22756 -#elif defined(CONFIG_X86_XEN)
22757 - .globl vdso32_syscall_start, vdso32_syscall_end
22758 -vdso32_syscall_start:
22759 - .incbin "arch/x86/vdso/vdso32-syscall.so"
22760 -vdso32_syscall_end:
22761 -#endif
22762 -
22763 __FINIT
22764 Index: head-2008-12-01/arch/x86/vdso/vdso32-setup-xen.c
22765 ===================================================================
22766 --- head-2008-12-01.orig/arch/x86/vdso/vdso32-setup-xen.c 2008-12-01 11:44:55.000000000 +0100
22767 +++ head-2008-12-01/arch/x86/vdso/vdso32-setup-xen.c 2008-12-01 11:49:07.000000000 +0100
22768 @@ -195,50 +195,28 @@ static __init void relocate_vdso(Elf32_E
22769 }
22770 }
22771
22772 -/*
22773 - * These symbols are defined by vdso32.S to mark the bounds
22774 - * of the ELF DSO images included therein.
22775 - */
22776 -extern const char vdso32_default_start, vdso32_default_end;
22777 -extern const char vdso32_sysenter_start, vdso32_sysenter_end;
22778 static struct page *vdso32_pages[1];
22779
22780 #ifdef CONFIG_X86_64
22781
22782 -#if CONFIG_XEN_COMPAT < 0x030200
22783 -static int use_int80 = 1;
22784 -#endif
22785 -static int use_sysenter __read_mostly = -1;
22786 -
22787 -#define vdso32_sysenter() (use_sysenter > 0)
22788 +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
22789 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
22790
22791 -/* May not be __init: called during resume */
22792 -void syscall32_cpu_init(void)
22793 +void __cpuinit syscall32_cpu_init(void)
22794 {
22795 - static const struct callback_register cstar = {
22796 + static /*const*/ struct callback_register __cpuinitdata cstar = {
22797 .type = CALLBACKTYPE_syscall32,
22798 .address = (unsigned long)ia32_cstar_target
22799 };
22800 - static const struct callback_register sysenter = {
22801 + static /*const*/ struct callback_register __cpuinitdata sysenter = {
22802 .type = CALLBACKTYPE_sysenter,
22803 .address = (unsigned long)ia32_sysenter_target
22804 };
22805
22806 - if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
22807 - (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
22808 -#if CONFIG_XEN_COMPAT < 0x030200
22809 - return;
22810 - use_int80 = 0;
22811 -#else
22812 - BUG();
22813 -#endif
22814 -
22815 - if (use_sysenter < 0) {
22816 - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
22817 - use_sysenter = 1;
22818 - if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
22819 - use_sysenter = 1;
22820 - }
22821 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
22822 + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
22823 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)
22824 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
22825 }
22826
22827 #define compat_uses_vma 1
22828 @@ -250,6 +228,7 @@ static inline void map_compat_vdso(int m
22829 #else /* CONFIG_X86_32 */
22830
22831 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
22832 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
22833
22834 extern asmlinkage void ia32pv_cstar_target(void);
22835 static /*const*/ struct callback_register __cpuinitdata cstar = {
22836 @@ -265,13 +244,13 @@ void __cpuinit enable_sep_cpu(void)
22837 .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
22838 };
22839
22840 - if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
22841 + if (vdso32_syscall()) {
22842 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
22843 BUG();
22844 return;
22845 }
22846
22847 - if (!boot_cpu_has(X86_FEATURE_SEP))
22848 + if (!vdso32_sysenter())
22849 return;
22850
22851 if (xen_feature(XENFEAT_supervisor_mode_kernel))
22852 @@ -341,34 +320,26 @@ int __init sysenter_setup(void)
22853
22854 #ifdef CONFIG_X86_32
22855 gate_vma_init();
22856 -#endif
22857
22858 -#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
22859 - if (use_int80) {
22860 - extern const char vdso32_int80_start, vdso32_int80_end;
22861 -
22862 - vsyscall = &vdso32_int80_start;
22863 - vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
22864 - } else
22865 -#elif defined(CONFIG_X86_32)
22866 - if (boot_cpu_has(X86_FEATURE_SYSCALL)
22867 - && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
22868 - || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
22869 - setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
22870 - barrier(); /* until clear_bit()'s constraints are correct ... */
22871 if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
22872 - extern const char vdso32_syscall_start, vdso32_syscall_end;
22873 -
22874 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
22875 + && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
22876 + setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
22877 + else {
22878 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
22879 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
22880 + }
22881 + }
22882 +#endif
22883 + if (vdso32_syscall()) {
22884 vsyscall = &vdso32_syscall_start;
22885 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
22886 - } else
22887 -#endif
22888 - if (!vdso32_sysenter()) {
22889 - vsyscall = &vdso32_default_start;
22890 - vsyscall_len = &vdso32_default_end - &vdso32_default_start;
22891 - } else {
22892 + } else if (vdso32_sysenter()) {
22893 vsyscall = &vdso32_sysenter_start;
22894 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
22895 + } else {
22896 + vsyscall = &vdso32_int80_start;
22897 + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
22898 }
22899
22900 memcpy(syscall_page, vsyscall, vsyscall_len);
22901 Index: head-2008-12-01/arch/x86/xen/Kconfig
22902 ===================================================================
22903 --- head-2008-12-01.orig/arch/x86/xen/Kconfig 2008-12-01 11:36:47.000000000 +0100
22904 +++ head-2008-12-01/arch/x86/xen/Kconfig 2008-12-01 11:49:07.000000000 +0100
22905 @@ -17,7 +17,7 @@ config XEN_MAX_DOMAIN_MEMORY
22906 int "Maximum allowed size of a domain in gigabytes"
22907 default 8 if X86_32
22908 default 32 if X86_64
22909 - depends on XEN
22910 + depends on PARAVIRT_XEN
22911 help
22912 The pseudo-physical to machine address array is sized
22913 according to the maximum possible memory size of a Xen
22914 @@ -26,5 +26,5 @@ config XEN_MAX_DOMAIN_MEMORY
22915
22916 config XEN_SAVE_RESTORE
22917 bool
22918 - depends on PM
22919 + depends on PARAVIRT_XEN && PM
22920 default y
22921 \ No newline at end of file
22922 Index: head-2008-12-01/drivers/acpi/processor_core.c
22923 ===================================================================
22924 --- head-2008-12-01.orig/drivers/acpi/processor_core.c 2008-12-01 11:44:55.000000000 +0100
22925 +++ head-2008-12-01/drivers/acpi/processor_core.c 2008-12-01 11:49:07.000000000 +0100
22926 @@ -721,9 +721,11 @@ static int __cpuinit acpi_processor_star
22927 if (result)
22928 goto end;
22929
22930 - sysdev = get_cpu_sysdev(pr->id);
22931 - if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
22932 - return -EFAULT;
22933 + if (pr->id != -1) {
22934 + sysdev = get_cpu_sysdev(pr->id);
22935 + if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
22936 + return -EFAULT;
22937 + }
22938
22939 status = acpi_install_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
22940 acpi_processor_notify, pr);
22941 @@ -895,7 +897,8 @@ static int acpi_processor_remove(struct
22942 status = acpi_remove_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
22943 acpi_processor_notify);
22944
22945 - sysfs_remove_link(&device->dev.kobj, "sysdev");
22946 + if (pr->id != -1)
22947 + sysfs_remove_link(&device->dev.kobj, "sysdev");
22948
22949 acpi_processor_remove_fs(device);
22950
22951 Index: head-2008-12-01/drivers/char/tpm/tpm_vtpm.c
22952 ===================================================================
22953 --- head-2008-12-01.orig/drivers/char/tpm/tpm_vtpm.c 2008-12-03 15:48:43.000000000 +0100
22954 +++ head-2008-12-01/drivers/char/tpm/tpm_vtpm.c 2008-12-01 11:49:07.000000000 +0100
22955 @@ -347,7 +347,7 @@ static int _vtpm_send_queued(struct tpm_
22956 {
22957 int rc;
22958 int error = 0;
22959 - long flags;
22960 + unsigned long flags;
22961 unsigned char buffer[1];
22962 struct vtpm_state *vtpms;
22963 vtpms = (struct vtpm_state *)chip_get_private(chip);
22964 Index: head-2008-12-01/drivers/misc/Kconfig
22965 ===================================================================
22966 --- head-2008-12-01.orig/drivers/misc/Kconfig 2008-12-03 15:48:43.000000000 +0100
22967 +++ head-2008-12-01/drivers/misc/Kconfig 2008-12-01 11:49:07.000000000 +0100
22968 @@ -438,7 +438,7 @@ config ENCLOSURE_SERVICES
22969 config SGI_XP
22970 tristate "Support communication between SGI SSIs"
22971 depends on NET
22972 - depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP
22973 + depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP && !XEN
22974 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
22975 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
22976 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
22977 @@ -465,7 +465,7 @@ config HP_ILO
22978
22979 config SGI_GRU
22980 tristate "SGI GRU driver"
22981 - depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP
22982 + depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP && !XEN
22983 default n
22984 select MMU_NOTIFIER
22985 ---help---
22986 Index: head-2008-12-01/drivers/pci/msi-xen.c
22987 ===================================================================
22988 --- head-2008-12-01.orig/drivers/pci/msi-xen.c 2008-12-01 11:44:55.000000000 +0100
22989 +++ head-2008-12-01/drivers/pci/msi-xen.c 2008-12-01 11:49:07.000000000 +0100
22990 @@ -90,12 +90,10 @@ arch_teardown_msi_irqs(struct pci_dev *d
22991 }
22992 #endif
22993
22994 -static void msi_set_enable(struct pci_dev *dev, int enable)
22995 +static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
22996 {
22997 - int pos;
22998 u16 control;
22999
23000 - pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
23001 if (pos) {
23002 pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
23003 control &= ~PCI_MSI_FLAGS_ENABLE;
23004 @@ -105,6 +103,11 @@ static void msi_set_enable(struct pci_de
23005 }
23006 }
23007
23008 +static void msi_set_enable(struct pci_dev *dev, int enable)
23009 +{
23010 + __msi_set_enable(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), enable);
23011 +}
23012 +
23013 static void msix_set_enable(struct pci_dev *dev, int enable)
23014 {
23015 int pos;
23016 @@ -573,9 +576,8 @@ int pci_enable_msi(struct pci_dev* dev)
23017
23018 /* Check whether driver already requested for MSI-X irqs */
23019 if (dev->msix_enabled) {
23020 - printk(KERN_INFO "PCI: %s: Can't enable MSI. "
23021 - "Device already has MSI-X enabled\n",
23022 - pci_name(dev));
23023 + dev_info(&dev->dev, "can't enable MSI "
23024 + "(MSI-X already enabled)\n");
23025 return -EINVAL;
23026 }
23027
23028 @@ -707,9 +709,8 @@ int pci_enable_msix(struct pci_dev* dev,
23029 temp = dev->irq;
23030 /* Check whether driver already requested for MSI vector */
23031 if (dev->msi_enabled) {
23032 - printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
23033 - "Device already has an MSI irq assigned\n",
23034 - pci_name(dev));
23035 + dev_info(&dev->dev, "can't enable MSI-X "
23036 + "(MSI IRQ already assigned)\n");
23037 return -EINVAL;
23038 }
23039
23040 Index: head-2008-12-01/drivers/pci/quirks.c
23041 ===================================================================
23042 --- head-2008-12-01.orig/drivers/pci/quirks.c 2008-12-03 15:48:43.000000000 +0100
23043 +++ head-2008-12-01/drivers/pci/quirks.c 2008-12-01 11:49:07.000000000 +0100
23044 @@ -42,9 +42,7 @@ static void __devinit quirk_release_reso
23045 /* PCI Host Bridge isn't a target device */
23046 return;
23047 }
23048 - printk(KERN_INFO
23049 - "PCI: Disable device and release resources [%s].\n",
23050 - pci_name(dev));
23051 + dev_info(&dev->dev, "disable device and release resources\n");
23052 pci_disable_device(dev);
23053
23054 for (i=0; i < PCI_NUM_RESOURCES; i++) {
23055 Index: head-2008-12-01/drivers/pci/setup-res.c
23056 ===================================================================
23057 --- head-2008-12-01.orig/drivers/pci/setup-res.c 2008-12-03 15:48:43.000000000 +0100
23058 +++ head-2008-12-01/drivers/pci/setup-res.c 2008-12-01 11:50:17.000000000 +0100
23059 @@ -129,7 +129,7 @@ int pci_claim_resource(struct pci_dev *d
23060 #ifdef CONFIG_PCI_REASSIGN
23061 void pci_disable_bridge_window(struct pci_dev *dev)
23062 {
23063 - printk(KERN_DEBUG "PCI: Disable bridge window on %s\n", pci_name(dev));
23064 + dev_dbg(&dev->dev, "disable bridge window\n");
23065
23066 /* MMIO Base/Limit */
23067 pci_write_config_dword(dev, PCI_MEMORY_BASE, 0x0000fff0);
23068 @@ -190,8 +190,8 @@ int pci_assign_resource(struct pci_dev *
23069 res->flags &= ~IORESOURCE_STARTALIGN;
23070 if (resno < PCI_BRIDGE_RESOURCES) {
23071 #ifdef CONFIG_PCI_REASSIGN
23072 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23073 - "%016llx - %016llx\n", resno, pci_name(dev),
23074 + dev_dbg(&dev->dev, "assign resource(%d) "
23075 + "%016llx - %016llx\n", resno,
23076 (unsigned long long)res->start,
23077 (unsigned long long)res->end);
23078 #endif
23079 @@ -235,8 +235,8 @@ int pci_assign_resource_fixed(struct pci
23080 (unsigned long long)res->end);
23081 } else if (resno < PCI_BRIDGE_RESOURCES) {
23082 #ifdef CONFIG_PCI_REASSIGN
23083 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23084 - "%016llx - %016llx\n", resno, pci_name(dev),
23085 + dev_dbg(&dev->dev, "assign resource(%d) "
23086 + "%016llx - %016llx\n", resno,
23087 (unsigned long long)res->start,
23088 (unsigned long long)res->end);
23089 #endif
23090 Index: head-2008-12-01/drivers/xen/Makefile
23091 ===================================================================
23092 --- head-2008-12-01.orig/drivers/xen/Makefile 2008-12-01 11:44:55.000000000 +0100
23093 +++ head-2008-12-01/drivers/xen/Makefile 2008-12-01 11:49:07.000000000 +0100
23094 @@ -1,4 +1,4 @@
23095 -obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
23096 +obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o
23097 xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
23098 xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
23099
23100 Index: head-2008-12-01/drivers/xen/balloon/sysfs.c
23101 ===================================================================
23102 --- head-2008-12-01.orig/drivers/xen/balloon/sysfs.c 2008-12-01 11:37:10.000000000 +0100
23103 +++ head-2008-12-01/drivers/xen/balloon/sysfs.c 2008-12-01 11:49:07.000000000 +0100
23104 @@ -45,6 +45,7 @@
23105
23106 #define BALLOON_SHOW(name, format, args...) \
23107 static ssize_t show_##name(struct sys_device *dev, \
23108 + struct sysdev_attribute *attr, \
23109 char *buf) \
23110 { \
23111 return sprintf(buf, format, ##args); \
23112 @@ -59,14 +60,15 @@ BALLOON_SHOW(hard_limit_kb,
23113 (bs.hard_limit!=~0UL) ? PAGES2KB(bs.hard_limit) : 0);
23114 BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
23115
23116 -static ssize_t show_target_kb(struct sys_device *dev, char *buf)
23117 +static ssize_t show_target_kb(struct sys_device *dev,
23118 + struct sysdev_attribute *attr, char *buf)
23119 {
23120 return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
23121 }
23122
23123 static ssize_t store_target_kb(struct sys_device *dev,
23124 - const char *buf,
23125 - size_t count)
23126 + struct sysdev_attribute *attr,
23127 + const char *buf, size_t count)
23128 {
23129 char memstring[64], *endchar;
23130 unsigned long long target_bytes;
23131 Index: head-2008-12-01/drivers/xen/blktap/blktap.c
23132 ===================================================================
23133 --- head-2008-12-01.orig/drivers/xen/blktap/blktap.c 2008-12-01 11:44:55.000000000 +0100
23134 +++ head-2008-12-01/drivers/xen/blktap/blktap.c 2008-12-01 11:49:07.000000000 +0100
23135 @@ -54,6 +54,7 @@
23136 #include <linux/gfp.h>
23137 #include <linux/poll.h>
23138 #include <linux/delay.h>
23139 +#include <linux/nsproxy.h>
23140 #include <asm/tlbflush.h>
23141
23142 #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
23143 @@ -483,7 +484,7 @@ found:
23144
23145 if ((class = get_xen_class()) != NULL)
23146 device_create(class, NULL, MKDEV(blktap_major, minor),
23147 - "blktap%d", minor);
23148 + NULL, "blktap%d", minor);
23149 }
23150
23151 out:
23152 @@ -1686,7 +1687,8 @@ static int __init blkif_init(void)
23153 * We only create the device when a request of a new device is
23154 * made.
23155 */
23156 - device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
23157 + device_create(class, NULL, MKDEV(blktap_major, 0), NULL,
23158 + "blktap0");
23159 } else {
23160 /* this is bad, but not fatal */
23161 WPRINTK("blktap: sysfs xen_class not created\n");
23162 Index: head-2008-12-01/drivers/xen/char/mem.c
23163 ===================================================================
23164 --- head-2008-12-01.orig/drivers/xen/char/mem.c 2008-12-01 11:44:55.000000000 +0100
23165 +++ head-2008-12-01/drivers/xen/char/mem.c 2008-12-01 11:49:07.000000000 +0100
23166 @@ -35,7 +35,7 @@ static inline int uncached_access(struct
23167
23168 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
23169 {
23170 -#ifdef CONFIG_NONPROMISC_DEVMEM
23171 +#ifdef CONFIG_STRICT_DEVMEM
23172 u64 from = ((u64)pfn) << PAGE_SHIFT;
23173 u64 to = from + size;
23174 u64 cursor = from;
23175 @@ -172,7 +172,10 @@ static void mmap_mem_close(struct vm_are
23176
23177 static struct vm_operations_struct mmap_mem_ops = {
23178 .open = mmap_mem_open,
23179 - .close = mmap_mem_close
23180 + .close = mmap_mem_close,
23181 +#ifdef CONFIG_HAVE_IOREMAP_PROT
23182 + .access = generic_access_phys
23183 +#endif
23184 };
23185
23186 static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
23187 Index: head-2008-12-01/drivers/xen/console/console.c
23188 ===================================================================
23189 --- head-2008-12-01.orig/drivers/xen/console/console.c 2008-12-01 11:44:55.000000000 +0100
23190 +++ head-2008-12-01/drivers/xen/console/console.c 2008-12-01 11:49:07.000000000 +0100
23191 @@ -416,9 +416,7 @@ static void __xencons_tx_flush(void)
23192
23193 if (work_done && (xencons_tty != NULL)) {
23194 wake_up_interruptible(&xencons_tty->write_wait);
23195 - if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
23196 - (xencons_tty->ldisc.write_wakeup != NULL))
23197 - (xencons_tty->ldisc.write_wakeup)(xencons_tty);
23198 + tty_wakeup(xencons_tty);
23199 }
23200 }
23201
23202 @@ -619,8 +617,8 @@ static void xencons_close(struct tty_str
23203 tty->closing = 1;
23204 tty_wait_until_sent(tty, 0);
23205 tty_driver_flush_buffer(tty);
23206 - if (tty->ldisc.flush_buffer != NULL)
23207 - tty->ldisc.flush_buffer(tty);
23208 + if (tty->ldisc.ops->flush_buffer != NULL)
23209 + tty->ldisc.ops->flush_buffer(tty);
23210 tty->closing = 0;
23211 spin_lock_irqsave(&xencons_lock, flags);
23212 xencons_tty = NULL;
23213 Index: head-2008-12-01/drivers/xen/core/evtchn.c
23214 ===================================================================
23215 --- head-2008-12-01.orig/drivers/xen/core/evtchn.c 2008-12-01 11:37:10.000000000 +0100
23216 +++ head-2008-12-01/drivers/xen/core/evtchn.c 2008-12-03 15:53:53.000000000 +0100
23217 @@ -744,9 +744,9 @@ static struct irq_chip dynirq_chip = {
23218 };
23219
23220 /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
23221 -static int pirq_eoi_does_unmask;
23222 +static bool pirq_eoi_does_unmask;
23223 static DECLARE_BITMAP(pirq_needs_eoi, ALIGN(NR_PIRQS, PAGE_SIZE * 8))
23224 - __attribute__ ((__section__(".bss.page_aligned"), __aligned__(PAGE_SIZE)));
23225 + __page_aligned_bss;
23226
23227 static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
23228 {
23229 @@ -1002,6 +1002,7 @@ void xen_poll_irq(int irq)
23230 BUG();
23231 }
23232
23233 +#ifdef CONFIG_PM_SLEEP
23234 static void restore_cpu_virqs(unsigned int cpu)
23235 {
23236 struct evtchn_bind_virq bind_virq;
23237 @@ -1094,6 +1095,7 @@ void irq_resume(void)
23238 }
23239
23240 }
23241 +#endif
23242
23243 #if defined(CONFIG_X86_IO_APIC)
23244 #define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
23245 @@ -1175,7 +1177,7 @@ void __init xen_init_IRQ(void)
23246 BUG_ON(!bitmap_empty(pirq_needs_eoi, PAGE_SIZE * 8));
23247 eoi_mfn.mfn = virt_to_bus(pirq_needs_eoi) >> PAGE_SHIFT;
23248 if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_mfn, &eoi_mfn) == 0)
23249 - pirq_eoi_does_unmask = 1;
23250 + pirq_eoi_does_unmask = true;
23251
23252 /* No event channels are 'live' right now. */
23253 for (i = 0; i < NR_EVENT_CHANNELS; i++)
23254 Index: head-2008-12-01/drivers/xen/core/gnttab.c
23255 ===================================================================
23256 --- head-2008-12-01.orig/drivers/xen/core/gnttab.c 2008-12-03 15:48:43.000000000 +0100
23257 +++ head-2008-12-01/drivers/xen/core/gnttab.c 2008-12-02 09:26:17.000000000 +0100
23258 @@ -449,6 +449,7 @@ static int map_pte_fn(pte_t *pte, struct
23259 return 0;
23260 }
23261
23262 +#ifdef CONFIG_PM_SLEEP
23263 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
23264 unsigned long addr, void *data)
23265 {
23266 @@ -456,6 +457,7 @@ static int unmap_pte_fn(pte_t *pte, stru
23267 set_pte_at(&init_mm, addr, pte, __pte(0));
23268 return 0;
23269 }
23270 +#endif
23271
23272 void *arch_gnttab_alloc_shared(unsigned long *frames)
23273 {
23274 @@ -633,6 +635,75 @@ void __gnttab_dma_map_page(struct page *
23275 } while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
23276 }
23277
23278 +#ifdef __HAVE_ARCH_PTE_SPECIAL
23279 +
23280 +static unsigned int GNTMAP_pte_special;
23281 +
23282 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
23283 + unsigned int count)
23284 +{
23285 + unsigned int i;
23286 +
23287 + if (unlikely(cmd != GNTTABOP_map_grant_ref))
23288 + count = 0;
23289 +
23290 + for (i = 0; i < count; ++i, ++map) {
23291 + if (!(map->flags & GNTMAP_host_map)
23292 + || !(map->flags & GNTMAP_application_map))
23293 + continue;
23294 + if (GNTMAP_pte_special)
23295 + map->flags |= GNTMAP_pte_special;
23296 + else {
23297 + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
23298 + return true;
23299 + }
23300 + }
23301 +
23302 + return false;
23303 +}
23304 +EXPORT_SYMBOL(gnttab_pre_map_adjust);
23305 +
23306 +#if CONFIG_XEN_COMPAT < 0x030400
23307 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
23308 +{
23309 + unsigned int i;
23310 + int rc = 0;
23311 +
23312 + for (i = 0; i < count && rc == 0; ++i, ++map) {
23313 + pte_t pte;
23314 +
23315 + if (!(map->flags & GNTMAP_host_map)
23316 + || !(map->flags & GNTMAP_application_map))
23317 + continue;
23318 +
23319 +#ifdef CONFIG_X86
23320 + pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
23321 + | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
23322 + | _PAGE_SPECIAL)
23323 + & __supported_pte_mask);
23324 +#else
23325 +#error Architecture not yet supported.
23326 +#endif
23327 + if (!(map->flags & GNTMAP_readonly))
23328 + pte = pte_mkwrite(pte);
23329 +
23330 + if (map->flags & GNTMAP_contains_pte) {
23331 + mmu_update_t u;
23332 +
23333 + u.ptr = map->host_addr;
23334 + u.val = __pte_val(pte);
23335 + rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
23336 + } else
23337 + rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
23338 + }
23339 +
23340 + return rc;
23341 +}
23342 +EXPORT_SYMBOL(gnttab_post_map_adjust);
23343 +#endif
23344 +
23345 +#endif /* __HAVE_ARCH_PTE_SPECIAL */
23346 +
23347 int gnttab_resume(void)
23348 {
23349 if (max_nr_grant_frames() < nr_grant_frames)
23350 @@ -640,6 +711,7 @@ int gnttab_resume(void)
23351 return gnttab_map(0, nr_grant_frames - 1);
23352 }
23353
23354 +#ifdef CONFIG_PM_SLEEP
23355 int gnttab_suspend(void)
23356 {
23357 #ifdef CONFIG_X86
23358 @@ -649,6 +721,7 @@ int gnttab_suspend(void)
23359 #endif
23360 return 0;
23361 }
23362 +#endif
23363
23364 #else /* !CONFIG_XEN */
23365
23366 @@ -759,6 +832,18 @@ int __devinit gnttab_init(void)
23367 gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
23368 gnttab_free_head = NR_RESERVED_ENTRIES;
23369
23370 +#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
23371 + if (!xen_feature(XENFEAT_auto_translated_physmap)
23372 + && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
23373 +#ifdef CONFIG_X86
23374 + GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
23375 + >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
23376 +#else
23377 +#error Architecture not yet supported.
23378 +#endif
23379 + }
23380 +#endif
23381 +
23382 return 0;
23383
23384 ini_nomem:
23385 Index: head-2008-12-01/drivers/xen/core/machine_kexec.c
23386 ===================================================================
23387 --- head-2008-12-01.orig/drivers/xen/core/machine_kexec.c 2008-12-01 11:44:55.000000000 +0100
23388 +++ head-2008-12-01/drivers/xen/core/machine_kexec.c 2008-12-01 11:49:07.000000000 +0100
23389 @@ -90,7 +90,7 @@ void __init xen_machine_kexec_setup_reso
23390 xen_hypervisor_res.start = range.start;
23391 xen_hypervisor_res.end = range.start + range.size - 1;
23392 xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
23393 -#ifdef CONFIG_X86_64
23394 +#ifdef CONFIG_X86
23395 insert_resource(&iomem_resource, &xen_hypervisor_res);
23396 #endif
23397
23398 @@ -105,7 +105,7 @@ void __init xen_machine_kexec_setup_reso
23399 if (range.size) {
23400 crashk_res.start = range.start;
23401 crashk_res.end = range.start + range.size - 1;
23402 -#ifdef CONFIG_X86_64
23403 +#ifdef CONFIG_X86
23404 insert_resource(&iomem_resource, &crashk_res);
23405 #endif
23406 }
23407 @@ -152,7 +152,7 @@ void __init xen_machine_kexec_setup_reso
23408 return;
23409 }
23410
23411 -#ifndef CONFIG_X86_64
23412 +#ifndef CONFIG_X86
23413 void __init xen_machine_kexec_register_resources(struct resource *res)
23414 {
23415 request_resource(res, &xen_hypervisor_res);
23416 Index: head-2008-12-01/drivers/xen/core/machine_reboot.c
23417 ===================================================================
23418 --- head-2008-12-01.orig/drivers/xen/core/machine_reboot.c 2008-12-01 11:44:55.000000000 +0100
23419 +++ head-2008-12-01/drivers/xen/core/machine_reboot.c 2008-12-01 11:49:07.000000000 +0100
23420 @@ -65,6 +65,7 @@ EXPORT_SYMBOL(machine_restart);
23421 EXPORT_SYMBOL(machine_halt);
23422 EXPORT_SYMBOL(machine_power_off);
23423
23424 +#ifdef CONFIG_PM_SLEEP
23425 static void pre_suspend(void)
23426 {
23427 HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
23428 @@ -119,6 +120,7 @@ static void post_suspend(int suspend_can
23429 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
23430 virt_to_mfn(pfn_to_mfn_frame_list_list);
23431 }
23432 +#endif
23433
23434 #else /* !(defined(__i386__) || defined(__x86_64__)) */
23435
23436 @@ -137,6 +139,7 @@ static void post_suspend(int suspend_can
23437
23438 #endif
23439
23440 +#ifdef CONFIG_PM_SLEEP
23441 struct suspend {
23442 int fast_suspend;
23443 void (*resume_notifier)(int);
23444 @@ -230,7 +233,8 @@ int __xen_suspend(int fast_suspend, void
23445
23446 if (fast_suspend) {
23447 xenbus_suspend();
23448 - err = stop_machine_run(take_machine_down, &suspend, 0);
23449 + err = stop_machine(take_machine_down, &suspend,
23450 + &cpumask_of_cpu(0));
23451 if (err < 0)
23452 xenbus_suspend_cancel();
23453 } else {
23454 @@ -253,3 +257,4 @@ int __xen_suspend(int fast_suspend, void
23455
23456 return 0;
23457 }
23458 +#endif
23459 Index: head-2008-12-01/drivers/xen/core/reboot.c
23460 ===================================================================
23461 --- head-2008-12-01.orig/drivers/xen/core/reboot.c 2008-12-01 11:36:47.000000000 +0100
23462 +++ head-2008-12-01/drivers/xen/core/reboot.c 2008-12-01 11:49:07.000000000 +0100
23463 @@ -29,17 +29,12 @@ MODULE_LICENSE("Dual BSD/GPL");
23464 /* Ignore multiple shutdown requests. */
23465 static int shutting_down = SHUTDOWN_INVALID;
23466
23467 -/* Was last suspend request cancelled? */
23468 -static int suspend_cancelled;
23469 -
23470 /* Can we leave APs online when we suspend? */
23471 static int fast_suspend;
23472
23473 static void __shutdown_handler(struct work_struct *unused);
23474 static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
23475
23476 -static int setup_suspend_evtchn(void);
23477 -
23478 int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
23479
23480 static int shutdown_process(void *__unused)
23481 @@ -69,6 +64,13 @@ static int shutdown_process(void *__unus
23482 return 0;
23483 }
23484
23485 +#ifdef CONFIG_PM_SLEEP
23486 +
23487 +static int setup_suspend_evtchn(void);
23488 +
23489 +/* Was last suspend request cancelled? */
23490 +static int suspend_cancelled;
23491 +
23492 static void xen_resume_notifier(int _suspend_cancelled)
23493 {
23494 int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
23495 @@ -118,6 +120,10 @@ static int xen_suspend(void *__unused)
23496 return 0;
23497 }
23498
23499 +#else
23500 +# define xen_suspend NULL
23501 +#endif
23502 +
23503 static void switch_shutdown_state(int new_state)
23504 {
23505 int prev_state, old_state = SHUTDOWN_INVALID;
23506 @@ -194,8 +200,10 @@ static void shutdown_handler(struct xenb
23507 new_state = SHUTDOWN_POWEROFF;
23508 else if (strcmp(str, "reboot") == 0)
23509 ctrl_alt_del();
23510 +#ifdef CONFIG_PM_SLEEP
23511 else if (strcmp(str, "suspend") == 0)
23512 new_state = SHUTDOWN_SUSPEND;
23513 +#endif
23514 else if (strcmp(str, "halt") == 0)
23515 new_state = SHUTDOWN_HALT;
23516 else
23517 @@ -247,6 +255,7 @@ static struct xenbus_watch sysrq_watch =
23518 .callback = sysrq_handler
23519 };
23520
23521 +#ifdef CONFIG_PM_SLEEP
23522 static irqreturn_t suspend_int(int irq, void* dev_id)
23523 {
23524 switch_shutdown_state(SHUTDOWN_SUSPEND);
23525 @@ -274,6 +283,9 @@ static int setup_suspend_evtchn(void)
23526
23527 return 0;
23528 }
23529 +#else
23530 +#define setup_suspend_evtchn() 0
23531 +#endif
23532
23533 static int setup_shutdown_watcher(void)
23534 {
23535 Index: head-2008-12-01/drivers/xen/core/smpboot.c
23536 ===================================================================
23537 --- head-2008-12-01.orig/drivers/xen/core/smpboot.c 2008-12-01 11:44:55.000000000 +0100
23538 +++ head-2008-12-01/drivers/xen/core/smpboot.c 2008-12-01 11:49:07.000000000 +0100
23539 @@ -27,6 +27,7 @@
23540
23541 extern irqreturn_t smp_reschedule_interrupt(int, void *);
23542 extern irqreturn_t smp_call_function_interrupt(int, void *);
23543 +extern irqreturn_t smp_call_function_single_interrupt(int, void *);
23544
23545 extern int local_setup_timer(unsigned int cpu);
23546 extern void local_teardown_timer(unsigned int cpu);
23547 @@ -54,8 +55,10 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
23548
23549 static DEFINE_PER_CPU(int, resched_irq);
23550 static DEFINE_PER_CPU(int, callfunc_irq);
23551 +static DEFINE_PER_CPU(int, call1func_irq);
23552 static char resched_name[NR_CPUS][15];
23553 static char callfunc_name[NR_CPUS][15];
23554 +static char call1func_name[NR_CPUS][15];
23555
23556 #ifdef CONFIG_X86_LOCAL_APIC
23557 #define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
23558 @@ -77,8 +80,10 @@ void __init prefill_possible_map(void)
23559
23560 for (i = 0; i < NR_CPUS; i++) {
23561 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
23562 - if (rc >= 0)
23563 + if (rc >= 0) {
23564 cpu_set(i, cpu_possible_map);
23565 + nr_cpu_ids = i + 1;
23566 + }
23567 }
23568 }
23569
23570 @@ -114,7 +119,8 @@ static int __cpuinit xen_smp_intr_init(u
23571 {
23572 int rc;
23573
23574 - per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
23575 + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) =
23576 + per_cpu(call1func_irq, cpu) = -1;
23577
23578 sprintf(resched_name[cpu], "resched%u", cpu);
23579 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
23580 @@ -138,6 +144,17 @@ static int __cpuinit xen_smp_intr_init(u
23581 goto fail;
23582 per_cpu(callfunc_irq, cpu) = rc;
23583
23584 + sprintf(call1func_name[cpu], "call1func%u", cpu);
23585 + rc = bind_ipi_to_irqhandler(CALL_FUNC_SINGLE_VECTOR,
23586 + cpu,
23587 + smp_call_function_single_interrupt,
23588 + IRQF_DISABLED|IRQF_NOBALANCING,
23589 + call1func_name[cpu],
23590 + NULL);
23591 + if (rc < 0)
23592 + goto fail;
23593 + per_cpu(call1func_irq, cpu) = rc;
23594 +
23595 rc = xen_spinlock_init(cpu);
23596 if (rc < 0)
23597 goto fail;
23598 @@ -152,6 +169,8 @@ static int __cpuinit xen_smp_intr_init(u
23599 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23600 if (per_cpu(callfunc_irq, cpu) >= 0)
23601 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23602 + if (per_cpu(call1func_irq, cpu) >= 0)
23603 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23604 xen_spinlock_cleanup(cpu);
23605 return rc;
23606 }
23607 @@ -164,6 +183,7 @@ static void __cpuexit xen_smp_intr_exit(
23608
23609 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23610 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23611 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23612 xen_spinlock_cleanup(cpu);
23613 }
23614 #endif
23615 @@ -171,11 +191,7 @@ static void __cpuexit xen_smp_intr_exit(
23616 void __cpuinit cpu_bringup(void)
23617 {
23618 cpu_init();
23619 -#ifdef __i386__
23620 identify_secondary_cpu(&current_cpu_data);
23621 -#else
23622 - identify_cpu(&current_cpu_data);
23623 -#endif
23624 touch_softlockup_watchdog();
23625 preempt_disable();
23626 local_irq_enable();
23627 @@ -255,9 +271,6 @@ void __init smp_prepare_cpus(unsigned in
23628 struct task_struct *idle;
23629 int apicid;
23630 struct vcpu_get_physid cpu_id;
23631 -#ifdef __x86_64__
23632 - struct desc_ptr *gdt_descr;
23633 -#endif
23634 void *gdt_addr;
23635
23636 apicid = 0;
23637 @@ -270,7 +283,7 @@ void __init smp_prepare_cpus(unsigned in
23638
23639 current_thread_info()->cpu = 0;
23640
23641 - for (cpu = 0; cpu < NR_CPUS; cpu++) {
23642 + for_each_possible_cpu (cpu) {
23643 cpus_clear(per_cpu(cpu_sibling_map, cpu));
23644 cpus_clear(per_cpu(cpu_core_map, cpu));
23645 }
23646 @@ -297,21 +310,10 @@ void __init smp_prepare_cpus(unsigned in
23647 if (IS_ERR(idle))
23648 panic("failed fork for CPU %d", cpu);
23649
23650 -#ifdef __x86_64__
23651 - gdt_descr = &cpu_gdt_descr[cpu];
23652 - gdt_descr->address = get_zeroed_page(GFP_KERNEL);
23653 - if (unlikely(!gdt_descr->address)) {
23654 - printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
23655 - cpu);
23656 - continue;
23657 - }
23658 - gdt_descr->size = GDT_SIZE;
23659 - memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
23660 - gdt_addr = (void *)gdt_descr->address;
23661 -#else
23662 +#ifdef __i386__
23663 init_gdt(cpu);
23664 - gdt_addr = get_cpu_gdt_table(cpu);
23665 #endif
23666 + gdt_addr = get_cpu_gdt_table(cpu);
23667 make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
23668
23669 apicid = cpu;
23670 Index: head-2008-12-01/drivers/xen/core/spinlock.c
23671 ===================================================================
23672 --- head-2008-12-01.orig/drivers/xen/core/spinlock.c 2008-12-01 11:37:10.000000000 +0100
23673 +++ head-2008-12-01/drivers/xen/core/spinlock.c 2008-12-01 11:51:53.000000000 +0100
23674 @@ -73,9 +73,9 @@ int xen_spin_wait(raw_spinlock_t *lock,
23675 /* announce we're spinning */
23676 spinning.ticket = token;
23677 spinning.lock = lock;
23678 - spinning.prev = __get_cpu_var(spinning);
23679 + spinning.prev = x86_read_percpu(spinning);
23680 smp_wmb();
23681 - __get_cpu_var(spinning) = &spinning;
23682 + x86_write_percpu(spinning, &spinning);
23683
23684 /* clear pending */
23685 xen_clear_irq_pending(irq);
23686 @@ -102,7 +102,7 @@ int xen_spin_wait(raw_spinlock_t *lock,
23687 kstat_this_cpu.irqs[irq] += !rc;
23688
23689 /* announce we're done */
23690 - __get_cpu_var(spinning) = spinning.prev;
23691 + x86_write_percpu(spinning, spinning.prev);
23692 rm_lock = &__get_cpu_var(spinning_rm_lock);
23693 raw_local_irq_save(flags);
23694 __raw_write_lock(rm_lock);
23695 Index: head-2008-12-01/drivers/xen/fbfront/xenfb.c
23696 ===================================================================
23697 --- head-2008-12-01.orig/drivers/xen/fbfront/xenfb.c 2008-12-01 11:44:55.000000000 +0100
23698 +++ head-2008-12-01/drivers/xen/fbfront/xenfb.c 2008-12-01 11:49:07.000000000 +0100
23699 @@ -18,6 +18,7 @@
23700 * frame buffer.
23701 */
23702
23703 +#include <linux/console.h>
23704 #include <linux/kernel.h>
23705 #include <linux/errno.h>
23706 #include <linux/fb.h>
23707 @@ -544,6 +545,28 @@ static unsigned long vmalloc_to_mfn(void
23708 return pfn_to_mfn(vmalloc_to_pfn(address));
23709 }
23710
23711 +static __devinit void
23712 +xenfb_make_preferred_console(void)
23713 +{
23714 + struct console *c;
23715 +
23716 + if (console_set_on_cmdline)
23717 + return;
23718 +
23719 + acquire_console_sem();
23720 + for (c = console_drivers; c; c = c->next) {
23721 + if (!strcmp(c->name, "tty") && c->index == 0)
23722 + break;
23723 + }
23724 + release_console_sem();
23725 + if (c) {
23726 + unregister_console(c);
23727 + c->flags |= CON_CONSDEV;
23728 + c->flags &= ~CON_PRINTBUFFER; /* don't print again */
23729 + register_console(c);
23730 + }
23731 +}
23732 +
23733 static int __devinit xenfb_probe(struct xenbus_device *dev,
23734 const struct xenbus_device_id *id)
23735 {
23736 @@ -673,6 +696,7 @@ static int __devinit xenfb_probe(struct
23737 goto error;
23738 }
23739
23740 + xenfb_make_preferred_console();
23741 return 0;
23742
23743 error_nomem:
23744 @@ -881,4 +905,5 @@ static void __exit xenfb_cleanup(void)
23745 module_init(xenfb_init);
23746 module_exit(xenfb_cleanup);
23747
23748 +MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
23749 MODULE_LICENSE("GPL");
23750 Index: head-2008-12-01/drivers/xen/fbfront/xenkbd.c
23751 ===================================================================
23752 --- head-2008-12-01.orig/drivers/xen/fbfront/xenkbd.c 2008-12-01 11:36:07.000000000 +0100
23753 +++ head-2008-12-01/drivers/xen/fbfront/xenkbd.c 2008-12-01 11:49:07.000000000 +0100
23754 @@ -350,4 +350,5 @@ static void __exit xenkbd_cleanup(void)
23755 module_init(xenkbd_init);
23756 module_exit(xenkbd_cleanup);
23757
23758 +MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
23759 MODULE_LICENSE("GPL");
23760 Index: head-2008-12-01/drivers/xen/gntdev/gntdev.c
23761 ===================================================================
23762 --- head-2008-12-01.orig/drivers/xen/gntdev/gntdev.c 2008-12-01 11:44:55.000000000 +0100
23763 +++ head-2008-12-01/drivers/xen/gntdev/gntdev.c 2008-12-01 11:49:07.000000000 +0100
23764 @@ -418,7 +418,7 @@ static int __init gntdev_init(void)
23765 }
23766
23767 device = device_create(class, NULL, MKDEV(gntdev_major, 0),
23768 - GNTDEV_NAME);
23769 + NULL, GNTDEV_NAME);
23770 if (IS_ERR(device)) {
23771 printk(KERN_ERR "Error creating gntdev device in xen_class\n");
23772 printk(KERN_ERR "gntdev created with major number = %d\n",
23773 Index: head-2008-12-01/drivers/xen/netfront/accel.c
23774 ===================================================================
23775 --- head-2008-12-01.orig/drivers/xen/netfront/accel.c 2008-12-01 11:36:55.000000000 +0100
23776 +++ head-2008-12-01/drivers/xen/netfront/accel.c 2008-12-01 11:49:07.000000000 +0100
23777 @@ -28,6 +28,7 @@
23778 * IN THE SOFTWARE.
23779 */
23780
23781 +#include <linux/version.h>
23782 #include <linux/netdevice.h>
23783 #include <linux/skbuff.h>
23784 #include <linux/list.h>
23785 Index: head-2008-12-01/drivers/xen/netfront/netfront.c
23786 ===================================================================
23787 --- head-2008-12-01.orig/drivers/xen/netfront/netfront.c 2008-12-01 11:44:55.000000000 +0100
23788 +++ head-2008-12-01/drivers/xen/netfront/netfront.c 2008-12-01 11:49:07.000000000 +0100
23789 @@ -640,7 +640,7 @@ static int network_open(struct net_devic
23790 }
23791 spin_unlock_bh(&np->rx_lock);
23792
23793 - network_maybe_wake_tx(dev);
23794 + netif_start_queue(dev);
23795
23796 return 0;
23797 }
23798 Index: head-2008-12-01/drivers/xen/sfc_netback/accel.h
23799 ===================================================================
23800 --- head-2008-12-01.orig/drivers/xen/sfc_netback/accel.h 2008-12-03 15:48:43.000000000 +0100
23801 +++ head-2008-12-01/drivers/xen/sfc_netback/accel.h 2008-12-01 11:49:07.000000000 +0100
23802 @@ -25,6 +25,7 @@
23803 #ifndef NETBACK_ACCEL_H
23804 #define NETBACK_ACCEL_H
23805
23806 +#include <linux/version.h>
23807 #include <linux/slab.h>
23808 #include <linux/ip.h>
23809 #include <linux/tcp.h>
23810 Index: head-2008-12-01/drivers/xen/sfc_netfront/accel.h
23811 ===================================================================
23812 --- head-2008-12-01.orig/drivers/xen/sfc_netfront/accel.h 2008-12-01 11:29:05.000000000 +0100
23813 +++ head-2008-12-01/drivers/xen/sfc_netfront/accel.h 2008-12-01 11:49:07.000000000 +0100
23814 @@ -35,6 +35,7 @@
23815 #include <xen/evtchn.h>
23816
23817 #include <linux/kernel.h>
23818 +#include <linux/version.h>
23819 #include <linux/list.h>
23820
23821 enum netfront_accel_post_status {
23822 Index: head-2008-12-01/drivers/xen/xenbus/xenbus_comms.c
23823 ===================================================================
23824 --- head-2008-12-01.orig/drivers/xen/xenbus/xenbus_comms.c 2008-12-01 11:36:47.000000000 +0100
23825 +++ head-2008-12-01/drivers/xen/xenbus/xenbus_comms.c 2008-12-01 11:49:07.000000000 +0100
23826 @@ -228,14 +228,11 @@ int xb_init_comms(void)
23827 intf->rsp_cons = intf->rsp_prod;
23828 }
23829
23830 +#if defined(CONFIG_XEN) || defined(MODULE)
23831 if (xenbus_irq)
23832 unbind_from_irqhandler(xenbus_irq, &xb_waitq);
23833
23834 -#if defined(CONFIG_XEN) || defined(MODULE)
23835 err = bind_caller_port_to_irqhandler(
23836 -#else
23837 - err = bind_evtchn_to_irqhandler(
23838 -#endif
23839 xen_store_evtchn, wake_waiting,
23840 0, "xenbus", &xb_waitq);
23841 if (err <= 0) {
23842 @@ -244,6 +241,20 @@ int xb_init_comms(void)
23843 }
23844
23845 xenbus_irq = err;
23846 +#else
23847 + if (xenbus_irq) {
23848 + /* Already have an irq; assume we're resuming */
23849 + rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
23850 + } else {
23851 + err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
23852 + 0, "xenbus", &xb_waitq);
23853 + if (err <= 0) {
23854 + printk(KERN_ERR "XENBUS request irq failed %i\n", err);
23855 + return err;
23856 + }
23857 + xenbus_irq = err;
23858 + }
23859 +#endif
23860
23861 return 0;
23862 }
23863 Index: head-2008-12-01/drivers/xen/xenbus/xenbus_probe.c
23864 ===================================================================
23865 --- head-2008-12-01.orig/drivers/xen/xenbus/xenbus_probe.c 2008-12-01 11:44:55.000000000 +0100
23866 +++ head-2008-12-01/drivers/xen/xenbus/xenbus_probe.c 2008-12-01 11:49:07.000000000 +0100
23867 @@ -36,6 +36,7 @@
23868 __FUNCTION__, __LINE__, ##args)
23869
23870 #include <linux/kernel.h>
23871 +#include <linux/version.h>
23872 #include <linux/err.h>
23873 #include <linux/string.h>
23874 #include <linux/ctype.h>
23875 Index: head-2008-12-01/fs/aio.c
23876 ===================================================================
23877 --- head-2008-12-01.orig/fs/aio.c 2008-12-01 11:44:55.000000000 +0100
23878 +++ head-2008-12-01/fs/aio.c 2008-12-01 11:49:07.000000000 +0100
23879 @@ -1319,7 +1319,7 @@ static int make_aio_fd(struct kioctx *io
23880 int fd;
23881 struct file *file;
23882
23883 - fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
23884 + fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
23885 if (fd < 0)
23886 return fd;
23887
23888 Index: head-2008-12-01/include/asm-generic/pgtable.h
23889 ===================================================================
23890 --- head-2008-12-01.orig/include/asm-generic/pgtable.h 2008-12-01 11:29:05.000000000 +0100
23891 +++ head-2008-12-01/include/asm-generic/pgtable.h 2008-12-01 11:49:07.000000000 +0100
23892 @@ -99,10 +99,6 @@ static inline void ptep_set_wrprotect(st
23893 }
23894 #endif
23895
23896 -#ifndef arch_change_pte_range
23897 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
23898 -#endif
23899 -
23900 #ifndef __HAVE_ARCH_PTE_SAME
23901 #define pte_same(A,B) (pte_val(A) == pte_val(B))
23902 #endif
23903 Index: head-2008-12-01/include/asm-x86/dma-mapping.h
23904 ===================================================================
23905 --- head-2008-12-01.orig/include/asm-x86/dma-mapping.h 2008-12-01 11:44:55.000000000 +0100
23906 +++ head-2008-12-01/include/asm-x86/dma-mapping.h 2008-12-01 11:49:07.000000000 +0100
23907 @@ -74,7 +74,7 @@ static inline struct dma_mapping_ops *ge
23908 /* Make sure we keep the same behaviour */
23909 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
23910 {
23911 -#ifdef CONFIG_X86_32
23912 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
23913 return 0;
23914 #else
23915 struct dma_mapping_ops *ops = get_dma_ops(dev);
23916 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/desc.h
23917 ===================================================================
23918 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/desc.h 2008-12-01 11:44:55.000000000 +0100
23919 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/desc.h 2008-12-01 11:49:07.000000000 +0100
23920 @@ -31,11 +31,17 @@ extern struct desc_ptr idt_descr;
23921 extern gate_desc idt_table[];
23922 #endif
23923
23924 +struct gdt_page {
23925 + struct desc_struct gdt[GDT_ENTRIES];
23926 +} __attribute__((aligned(PAGE_SIZE)));
23927 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
23928 +
23929 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
23930 +{
23931 + return per_cpu(gdt_page, cpu).gdt;
23932 +}
23933 +
23934 #ifdef CONFIG_X86_64
23935 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
23936 -extern struct desc_ptr cpu_gdt_descr[];
23937 -/* the cpu gdt accessor */
23938 -#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
23939
23940 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
23941 unsigned dpl, unsigned ist, unsigned seg)
23942 @@ -53,16 +59,6 @@ static inline void pack_gate(gate_desc *
23943 }
23944
23945 #else
23946 -struct gdt_page {
23947 - struct desc_struct gdt[GDT_ENTRIES];
23948 -} __attribute__((aligned(PAGE_SIZE)));
23949 -DECLARE_PER_CPU(struct gdt_page, gdt_page);
23950 -
23951 -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
23952 -{
23953 - return per_cpu(gdt_page, cpu).gdt;
23954 -}
23955 -
23956 static inline void pack_gate(gate_desc *gate, unsigned char type,
23957 unsigned long base, unsigned dpl, unsigned flags,
23958 unsigned short seg)
23959 @@ -333,6 +329,28 @@ static inline void set_intr_gate(unsigne
23960 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
23961 }
23962
23963 +#define SYS_VECTOR_FREE 0
23964 +#define SYS_VECTOR_ALLOCED 1
23965 +
23966 +extern int first_system_vector;
23967 +extern char system_vectors[];
23968 +
23969 +static inline void alloc_system_vector(int vector)
23970 +{
23971 + if (system_vectors[vector] == SYS_VECTOR_FREE) {
23972 + system_vectors[vector] = SYS_VECTOR_ALLOCED;
23973 + if (first_system_vector > vector)
23974 + first_system_vector = vector;
23975 + } else
23976 + BUG();
23977 +}
23978 +
23979 +static inline void alloc_intr_gate(unsigned int n, void *addr)
23980 +{
23981 + alloc_system_vector(n);
23982 + set_intr_gate(n, addr);
23983 +}
23984 +
23985 /*
23986 * This routine sets up an interrupt gate at directory privilege level 3.
23987 */
23988 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap.h
23989 ===================================================================
23990 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/fixmap.h 2008-12-01 11:44:55.000000000 +0100
23991 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap.h 2008-12-01 11:49:07.000000000 +0100
23992 @@ -7,7 +7,58 @@
23993 # include "fixmap_64.h"
23994 #endif
23995
23996 +extern int fixmaps_set;
23997 +
23998 +void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
23999 +
24000 +static inline void __set_fixmap(enum fixed_addresses idx,
24001 + maddr_t phys, pgprot_t flags)
24002 +{
24003 + xen_set_fixmap(idx, phys, flags);
24004 +}
24005 +
24006 +#define set_fixmap(idx, phys) \
24007 + __set_fixmap(idx, phys, PAGE_KERNEL)
24008 +
24009 +/*
24010 + * Some hardware wants to get fixmapped without caching.
24011 + */
24012 +#define set_fixmap_nocache(idx, phys) \
24013 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24014 +
24015 #define clear_fixmap(idx) \
24016 __set_fixmap(idx, 0, __pgprot(0))
24017
24018 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24019 +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24020 +
24021 +extern void __this_fixmap_does_not_exist(void);
24022 +
24023 +/*
24024 + * 'index to address' translation. If anyone tries to use the idx
24025 + * directly without translation, we catch the bug with a NULL-dereference
24026 + * kernel oops. Illegal ranges of incoming indices are caught too.
24027 + */
24028 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24029 +{
24030 + /*
24031 + * this branch gets completely eliminated after inlining,
24032 + * except when someone tries to use fixaddr indices in an
24033 + * illegal way. (such as mixing up address types or using
24034 + * out-of-range indices).
24035 + *
24036 + * If it doesn't get removed, the linker will complain
24037 + * loudly with a reasonably clear error message..
24038 + */
24039 + if (idx >= __end_of_fixed_addresses)
24040 + __this_fixmap_does_not_exist();
24041 +
24042 + return __fix_to_virt(idx);
24043 +}
24044 +
24045 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
24046 +{
24047 + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24048 + return __virt_to_fix(vaddr);
24049 +}
24050 #endif
24051 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_32.h
24052 ===================================================================
24053 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2008-12-01 11:44:55.000000000 +0100
24054 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_32.h 2008-12-01 11:49:07.000000000 +0100
24055 @@ -58,10 +58,17 @@ enum fixed_addresses {
24056 #ifdef CONFIG_X86_LOCAL_APIC
24057 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24058 #endif
24059 -#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
24060 +#ifndef CONFIG_XEN
24061 +#ifdef CONFIG_X86_IO_APIC
24062 FIX_IO_APIC_BASE_0,
24063 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
24064 #endif
24065 +#else
24066 + FIX_SHARED_INFO,
24067 +#define NR_FIX_ISAMAPS 256
24068 + FIX_ISAMAP_END,
24069 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24070 +#endif
24071 #ifdef CONFIG_X86_VISWS_APIC
24072 FIX_CO_CPU, /* Cobalt timer */
24073 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
24074 @@ -78,51 +85,38 @@ enum fixed_addresses {
24075 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
24076 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
24077 #endif
24078 -#ifdef CONFIG_ACPI
24079 - FIX_ACPI_BEGIN,
24080 - FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24081 -#endif
24082 #ifdef CONFIG_PCI_MMCONFIG
24083 FIX_PCIE_MCFG,
24084 #endif
24085 #ifdef CONFIG_PARAVIRT
24086 FIX_PARAVIRT_BOOTMAP,
24087 #endif
24088 - FIX_SHARED_INFO,
24089 -#define NR_FIX_ISAMAPS 256
24090 - FIX_ISAMAP_END,
24091 - FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24092 __end_of_permanent_fixed_addresses,
24093 /*
24094 * 256 temporary boot-time mappings, used by early_ioremap(),
24095 * before ioremap() is functional.
24096 *
24097 - * We round it up to the next 512 pages boundary so that we
24098 + * We round it up to the next 256 pages boundary so that we
24099 * can have a single pgd entry and a single pte table:
24100 */
24101 #define NR_FIX_BTMAPS 64
24102 #define FIX_BTMAPS_NESTING 4
24103 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24104 - (__end_of_permanent_fixed_addresses & 511),
24105 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
24106 + (__end_of_permanent_fixed_addresses & 255),
24107 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24108 FIX_WP_TEST,
24109 +#ifdef CONFIG_ACPI
24110 + FIX_ACPI_BEGIN,
24111 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24112 +#endif
24113 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24114 FIX_OHCI1394_BASE,
24115 #endif
24116 __end_of_fixed_addresses
24117 };
24118
24119 -extern void __set_fixmap(enum fixed_addresses idx,
24120 - maddr_t phys, pgprot_t flags);
24121 extern void reserve_top_address(unsigned long reserve);
24122
24123 -#define set_fixmap(idx, phys) \
24124 - __set_fixmap(idx, phys, PAGE_KERNEL)
24125 -/*
24126 - * Some hardware wants to get fixmapped without caching.
24127 - */
24128 -#define set_fixmap_nocache(idx, phys) \
24129 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24130
24131 #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
24132
24133 @@ -131,38 +125,5 @@ extern void reserve_top_address(unsigned
24134 #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
24135 #define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
24136
24137 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24138 -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24139 -
24140 -extern void __this_fixmap_does_not_exist(void);
24141 -
24142 -/*
24143 - * 'index to address' translation. If anyone tries to use the idx
24144 - * directly without tranlation, we catch the bug with a NULL-deference
24145 - * kernel oops. Illegal ranges of incoming indices are caught too.
24146 - */
24147 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24148 -{
24149 - /*
24150 - * this branch gets completely eliminated after inlining,
24151 - * except when someone tries to use fixaddr indices in an
24152 - * illegal way. (such as mixing up address types or using
24153 - * out-of-range indices).
24154 - *
24155 - * If it doesn't get removed, the linker will complain
24156 - * loudly with a reasonably clear error message..
24157 - */
24158 - if (idx >= __end_of_fixed_addresses)
24159 - __this_fixmap_does_not_exist();
24160 -
24161 - return __fix_to_virt(idx);
24162 -}
24163 -
24164 -static inline unsigned long virt_to_fix(const unsigned long vaddr)
24165 -{
24166 - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24167 - return __virt_to_fix(vaddr);
24168 -}
24169 -
24170 #endif /* !__ASSEMBLY__ */
24171 #endif
24172 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_64.h
24173 ===================================================================
24174 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2008-12-01 11:44:55.000000000 +0100
24175 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_64.h 2008-12-01 11:49:07.000000000 +0100
24176 @@ -12,6 +12,7 @@
24177 #define _ASM_FIXMAP_64_H
24178
24179 #include <linux/kernel.h>
24180 +#include <asm/acpi.h>
24181 #include <asm/apicdef.h>
24182 #include <asm/page.h>
24183 #include <asm/vsyscall.h>
24184 @@ -40,7 +41,6 @@ enum fixed_addresses {
24185 VSYSCALL_HPET,
24186 FIX_DBGP_BASE,
24187 FIX_EARLYCON_MEM_BASE,
24188 - FIX_HPET_BASE,
24189 #ifdef CONFIG_X86_LOCAL_APIC
24190 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24191 #endif
24192 @@ -53,14 +53,21 @@ enum fixed_addresses {
24193 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
24194 + MAX_EFI_IO_PAGES - 1,
24195 #endif
24196 +#ifdef CONFIG_PARAVIRT
24197 + FIX_PARAVIRT_BOOTMAP,
24198 +#else
24199 + FIX_SHARED_INFO,
24200 +#endif
24201 #ifdef CONFIG_ACPI
24202 FIX_ACPI_BEGIN,
24203 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24204 #endif
24205 - FIX_SHARED_INFO,
24206 #define NR_FIX_ISAMAPS 256
24207 FIX_ISAMAP_END,
24208 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24209 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24210 + FIX_OHCI1394_BASE,
24211 +#endif
24212 __end_of_permanent_fixed_addresses,
24213 /*
24214 * 256 temporary boot-time mappings, used by early_ioremap(),
24215 @@ -71,27 +78,12 @@ enum fixed_addresses {
24216 */
24217 #define NR_FIX_BTMAPS 64
24218 #define FIX_BTMAPS_NESTING 4
24219 - FIX_BTMAP_END =
24220 - __end_of_permanent_fixed_addresses + 512 -
24221 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24222 (__end_of_permanent_fixed_addresses & 511),
24223 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24224 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24225 - FIX_OHCI1394_BASE,
24226 -#endif
24227 __end_of_fixed_addresses
24228 };
24229
24230 -extern void __set_fixmap(enum fixed_addresses idx,
24231 - unsigned long phys, pgprot_t flags);
24232 -
24233 -#define set_fixmap(idx, phys) \
24234 - __set_fixmap(idx, phys, PAGE_KERNEL)
24235 -/*
24236 - * Some hardware wants to get fixmapped without caching.
24237 - */
24238 -#define set_fixmap_nocache(idx, phys) \
24239 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24240 -
24241 #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
24242 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
24243 #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
24244 @@ -100,30 +92,4 @@ extern void __set_fixmap(enum fixed_addr
24245 #define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
24246 #define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
24247
24248 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24249 -
24250 -extern void __this_fixmap_does_not_exist(void);
24251 -
24252 -/*
24253 - * 'index to address' translation. If anyone tries to use the idx
24254 - * directly without translation, we catch the bug with a NULL-deference
24255 - * kernel oops. Illegal ranges of incoming indices are caught too.
24256 - */
24257 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24258 -{
24259 - /*
24260 - * this branch gets completely eliminated after inlining,
24261 - * except when someone tries to use fixaddr indices in an
24262 - * illegal way. (such as mixing up address types or using
24263 - * out-of-range indices).
24264 - *
24265 - * If it doesn't get removed, the linker will complain
24266 - * loudly with a reasonably clear error message..
24267 - */
24268 - if (idx >= __end_of_fixed_addresses)
24269 - __this_fixmap_does_not_exist();
24270 -
24271 - return __fix_to_virt(idx);
24272 -}
24273 -
24274 #endif
24275 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/highmem.h
24276 ===================================================================
24277 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/highmem.h 2008-12-01 11:44:55.000000000 +0100
24278 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/highmem.h 2008-12-01 11:49:07.000000000 +0100
24279 @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *p
24280
24281 #define flush_cache_kmaps() do { } while (0)
24282
24283 +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
24284 + unsigned long end_pfn);
24285 +
24286 void clear_highpage(struct page *);
24287 static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
24288 {
24289 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/hypercall.h
24290 ===================================================================
24291 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/hypercall.h 2008-12-01 11:36:55.000000000 +0100
24292 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/hypercall.h 2008-12-01 11:49:07.000000000 +0100
24293 @@ -323,9 +323,19 @@ static inline int __must_check
24294 HYPERVISOR_grant_table_op(
24295 unsigned int cmd, void *uop, unsigned int count)
24296 {
24297 + bool fixup = false;
24298 + int rc;
24299 +
24300 if (arch_use_lazy_mmu_mode())
24301 xen_multicall_flush(false);
24302 - return _hypercall3(int, grant_table_op, cmd, uop, count);
24303 +#ifdef GNTTABOP_map_grant_ref
24304 + if (cmd == GNTTABOP_map_grant_ref)
24305 +#endif
24306 + fixup = gnttab_pre_map_adjust(cmd, uop, count);
24307 + rc = _hypercall3(int, grant_table_op, cmd, uop, count);
24308 + if (rc == 0 && fixup)
24309 + rc = gnttab_post_map_adjust(uop, count);
24310 + return rc;
24311 }
24312
24313 static inline int __must_check
24314 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/hypervisor.h
24315 ===================================================================
24316 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2008-12-01 11:37:10.000000000 +0100
24317 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/hypervisor.h 2008-12-01 11:49:07.000000000 +0100
24318 @@ -35,7 +35,6 @@
24319
24320 #include <linux/types.h>
24321 #include <linux/kernel.h>
24322 -#include <linux/version.h>
24323 #include <linux/errno.h>
24324 #include <xen/interface/xen.h>
24325 #include <xen/interface/platform.h>
24326 @@ -171,6 +170,20 @@ static inline void arch_flush_lazy_mmu_m
24327 }
24328 #endif
24329
24330 +struct gnttab_map_grant_ref;
24331 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
24332 + unsigned int count);
24333 +#if CONFIG_XEN_COMPAT < 0x030400
24334 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
24335 +#else
24336 +static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
24337 + unsigned int count)
24338 +{
24339 + BUG();
24340 + return -ENOSYS;
24341 +}
24342 +#endif
24343 +
24344 #else /* CONFIG_XEN */
24345
24346 static inline void xen_multicall_flush(bool ignore) {}
24347 @@ -179,6 +192,9 @@ static inline void xen_multicall_flush(b
24348 #define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; })
24349 #define xen_multi_mmuext_op(...) ({ BUG(); -ENOSYS; })
24350
24351 +#define gnttab_pre_map_adjust(...) false
24352 +#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
24353 +
24354 #endif /* CONFIG_XEN */
24355
24356 #if defined(CONFIG_X86_64)
24357 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/io.h
24358 ===================================================================
24359 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/io.h 2008-12-01 11:44:55.000000000 +0100
24360 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/io.h 2008-12-01 11:49:07.000000000 +0100
24361 @@ -3,6 +3,76 @@
24362
24363 #define ARCH_HAS_IOREMAP_WC
24364
24365 +#include <linux/compiler.h>
24366 +
24367 +/*
24368 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24369 + * mappings, before the real ioremap() is functional.
24370 + * A boot-time mapping is currently limited to at most 16 pages.
24371 + */
24372 +#ifndef __ASSEMBLY__
24373 +extern void early_ioremap_init(void);
24374 +extern void early_ioremap_clear(void);
24375 +extern void early_ioremap_reset(void);
24376 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24377 +extern void early_iounmap(void *addr, unsigned long size);
24378 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24379 +#endif
24380 +
24381 +#define build_mmio_read(name, size, type, reg, barrier) \
24382 +static inline type name(const volatile void __iomem *addr) \
24383 +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
24384 +:"m" (*(volatile type __force *)addr) barrier); return ret; }
24385 +
24386 +#define build_mmio_write(name, size, type, reg, barrier) \
24387 +static inline void name(type val, volatile void __iomem *addr) \
24388 +{ asm volatile("mov" size " %0,%1": :reg (val), \
24389 +"m" (*(volatile type __force *)addr) barrier); }
24390 +
24391 +build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
24392 +build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
24393 +build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
24394 +
24395 +build_mmio_read(__readb, "b", unsigned char, "=q", )
24396 +build_mmio_read(__readw, "w", unsigned short, "=r", )
24397 +build_mmio_read(__readl, "l", unsigned int, "=r", )
24398 +
24399 +build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
24400 +build_mmio_write(writew, "w", unsigned short, "r", :"memory")
24401 +build_mmio_write(writel, "l", unsigned int, "r", :"memory")
24402 +
24403 +build_mmio_write(__writeb, "b", unsigned char, "q", )
24404 +build_mmio_write(__writew, "w", unsigned short, "r", )
24405 +build_mmio_write(__writel, "l", unsigned int, "r", )
24406 +
24407 +#define readb_relaxed(a) __readb(a)
24408 +#define readw_relaxed(a) __readw(a)
24409 +#define readl_relaxed(a) __readl(a)
24410 +#define __raw_readb __readb
24411 +#define __raw_readw __readw
24412 +#define __raw_readl __readl
24413 +
24414 +#define __raw_writeb __writeb
24415 +#define __raw_writew __writew
24416 +#define __raw_writel __writel
24417 +
24418 +#define mmiowb() barrier()
24419 +
24420 +#ifdef CONFIG_X86_64
24421 +build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
24422 +build_mmio_read(__readq, "q", unsigned long, "=r", )
24423 +build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
24424 +build_mmio_write(__writeq, "q", unsigned long, "r", )
24425 +
24426 +#define readq_relaxed(a) __readq(a)
24427 +#define __raw_readq __readq
24428 +#define __raw_writeq writeq
24429 +
24430 +/* Let people know we have them */
24431 +#define readq readq
24432 +#define writeq writeq
24433 +#endif
24434 +
24435 #ifdef CONFIG_X86_32
24436 # include "io_32.h"
24437 #else
24438 @@ -19,4 +89,17 @@ extern int ioremap_check_change_attr(uns
24439 unsigned long prot_val);
24440 extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
24441
24442 +/*
24443 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24444 + * mappings, before the real ioremap() is functional.
24445 + * A boot-time mapping is currently limited to at most 16 pages.
24446 + */
24447 +extern void early_ioremap_init(void);
24448 +extern void early_ioremap_clear(void);
24449 +extern void early_ioremap_reset(void);
24450 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24451 +extern void early_iounmap(void *addr, unsigned long size);
24452 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24453 +
24454 +
24455 #endif /* _ASM_X86_IO_H */
24456 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/io_32.h
24457 ===================================================================
24458 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/io_32.h 2008-12-01 11:44:55.000000000 +0100
24459 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/io_32.h 2008-12-01 11:49:07.000000000 +0100
24460 @@ -123,6 +123,8 @@ static inline void *phys_to_virt(unsigne
24461 */
24462 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
24463 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
24464 +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
24465 + unsigned long prot_val);
24466
24467 /*
24468 * The default ioremap() behavior is non-cached:
24469 @@ -135,18 +137,6 @@ static inline void __iomem *ioremap(reso
24470 extern void iounmap(volatile void __iomem *addr);
24471
24472 /*
24473 - * early_ioremap() and early_iounmap() are for temporary early boot-time
24474 - * mappings, before the real ioremap() is functional.
24475 - * A boot-time mapping is currently limited to at most 16 pages.
24476 - */
24477 -extern void early_ioremap_init(void);
24478 -extern void early_ioremap_clear(void);
24479 -extern void early_ioremap_reset(void);
24480 -extern void *early_ioremap(unsigned long offset, unsigned long size);
24481 -extern void early_iounmap(void *addr, unsigned long size);
24482 -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24483 -
24484 -/*
24485 * ISA I/O bus memory addresses are 1:1 with the physical address.
24486 */
24487 #define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
24488 @@ -162,55 +152,6 @@ extern void __iomem *fix_ioremap(unsigne
24489 #define virt_to_bus(_x) phys_to_machine(__pa(_x))
24490 #define bus_to_virt(_x) __va(machine_to_phys(_x))
24491
24492 -/*
24493 - * readX/writeX() are used to access memory mapped devices. On some
24494 - * architectures the memory mapped IO stuff needs to be accessed
24495 - * differently. On the x86 architecture, we just read/write the
24496 - * memory location directly.
24497 - */
24498 -
24499 -static inline unsigned char readb(const volatile void __iomem *addr)
24500 -{
24501 - return *(volatile unsigned char __force *)addr;
24502 -}
24503 -
24504 -static inline unsigned short readw(const volatile void __iomem *addr)
24505 -{
24506 - return *(volatile unsigned short __force *)addr;
24507 -}
24508 -
24509 -static inline unsigned int readl(const volatile void __iomem *addr)
24510 -{
24511 - return *(volatile unsigned int __force *) addr;
24512 -}
24513 -
24514 -#define readb_relaxed(addr) readb(addr)
24515 -#define readw_relaxed(addr) readw(addr)
24516 -#define readl_relaxed(addr) readl(addr)
24517 -#define __raw_readb readb
24518 -#define __raw_readw readw
24519 -#define __raw_readl readl
24520 -
24521 -static inline void writeb(unsigned char b, volatile void __iomem *addr)
24522 -{
24523 - *(volatile unsigned char __force *)addr = b;
24524 -}
24525 -
24526 -static inline void writew(unsigned short b, volatile void __iomem *addr)
24527 -{
24528 - *(volatile unsigned short __force *)addr = b;
24529 -}
24530 -
24531 -static inline void writel(unsigned int b, volatile void __iomem *addr)
24532 -{
24533 - *(volatile unsigned int __force *)addr = b;
24534 -}
24535 -#define __raw_writeb writeb
24536 -#define __raw_writew writew
24537 -#define __raw_writel writel
24538 -
24539 -#define mmiowb()
24540 -
24541 static inline void
24542 memset_io(volatile void __iomem *addr, unsigned char val, int count)
24543 {
24544 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/io_64.h
24545 ===================================================================
24546 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/io_64.h 2008-12-01 11:44:55.000000000 +0100
24547 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/io_64.h 2008-12-01 11:49:07.000000000 +0100
24548 @@ -190,6 +190,8 @@ extern void early_iounmap(void *addr, un
24549 */
24550 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
24551 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
24552 +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
24553 + unsigned long prot_val);
24554
24555 /*
24556 * The default ioremap() behavior is non-cached:
24557 @@ -220,77 +222,6 @@ extern void __iomem *fix_ioremap(unsigne
24558 #define virt_to_bus(_x) phys_to_machine(__pa(_x))
24559 #define bus_to_virt(_x) __va(machine_to_phys(_x))
24560
24561 -/*
24562 - * readX/writeX() are used to access memory mapped devices. On some
24563 - * architectures the memory mapped IO stuff needs to be accessed
24564 - * differently. On the x86 architecture, we just read/write the
24565 - * memory location directly.
24566 - */
24567 -
24568 -static inline __u8 __readb(const volatile void __iomem *addr)
24569 -{
24570 - return *(__force volatile __u8 *)addr;
24571 -}
24572 -
24573 -static inline __u16 __readw(const volatile void __iomem *addr)
24574 -{
24575 - return *(__force volatile __u16 *)addr;
24576 -}
24577 -
24578 -static __always_inline __u32 __readl(const volatile void __iomem *addr)
24579 -{
24580 - return *(__force volatile __u32 *)addr;
24581 -}
24582 -
24583 -static inline __u64 __readq(const volatile void __iomem *addr)
24584 -{
24585 - return *(__force volatile __u64 *)addr;
24586 -}
24587 -
24588 -#define readb(x) __readb(x)
24589 -#define readw(x) __readw(x)
24590 -#define readl(x) __readl(x)
24591 -#define readq(x) __readq(x)
24592 -#define readb_relaxed(a) readb(a)
24593 -#define readw_relaxed(a) readw(a)
24594 -#define readl_relaxed(a) readl(a)
24595 -#define readq_relaxed(a) readq(a)
24596 -#define __raw_readb readb
24597 -#define __raw_readw readw
24598 -#define __raw_readl readl
24599 -#define __raw_readq readq
24600 -
24601 -#define mmiowb()
24602 -
24603 -static inline void __writel(__u32 b, volatile void __iomem *addr)
24604 -{
24605 - *(__force volatile __u32 *)addr = b;
24606 -}
24607 -
24608 -static inline void __writeq(__u64 b, volatile void __iomem *addr)
24609 -{
24610 - *(__force volatile __u64 *)addr = b;
24611 -}
24612 -
24613 -static inline void __writeb(__u8 b, volatile void __iomem *addr)
24614 -{
24615 - *(__force volatile __u8 *)addr = b;
24616 -}
24617 -
24618 -static inline void __writew(__u16 b, volatile void __iomem *addr)
24619 -{
24620 - *(__force volatile __u16 *)addr = b;
24621 -}
24622 -
24623 -#define writeq(val, addr) __writeq((val), (addr))
24624 -#define writel(val, addr) __writel((val), (addr))
24625 -#define writew(val, addr) __writew((val), (addr))
24626 -#define writeb(val, addr) __writeb((val), (addr))
24627 -#define __raw_writeb writeb
24628 -#define __raw_writew writew
24629 -#define __raw_writel writel
24630 -#define __raw_writeq writeq
24631 -
24632 void __memcpy_fromio(void *, unsigned long, unsigned);
24633 void __memcpy_toio(unsigned long, const void *, unsigned);
24634
24635 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/irq_vectors.h
24636 ===================================================================
24637 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
24638 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/irq_vectors.h 2008-12-01 11:49:07.000000000 +0100
24639 @@ -0,0 +1,52 @@
24640 +#ifndef _ASM_IRQ_VECTORS_H
24641 +#define _ASM_IRQ_VECTORS_H
24642 +
24643 +#ifdef CONFIG_X86_32
24644 +# define SYSCALL_VECTOR 0x80
24645 +#else
24646 +# define IA32_SYSCALL_VECTOR 0x80
24647 +#endif
24648 +
24649 +#define RESCHEDULE_VECTOR 0
24650 +#define CALL_FUNCTION_VECTOR 1
24651 +#define CALL_FUNC_SINGLE_VECTOR 2
24652 +#define SPIN_UNLOCK_VECTOR 3
24653 +#define NR_IPIS 4
24654 +
24655 +/*
24656 + * The maximum number of vectors supported by i386 processors
24657 + * is limited to 256. For processors other than i386, NR_VECTORS
24658 + * should be changed accordingly.
24659 + */
24660 +#define NR_VECTORS 256
24661 +
24662 +#define FIRST_VM86_IRQ 3
24663 +#define LAST_VM86_IRQ 15
24664 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
24665 +
24666 +/*
24667 + * The flat IRQ space is divided into two regions:
24668 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
24669 + * if we have physical device-access privilege. This region is at the
24670 + * start of the IRQ space so that existing device drivers do not need
24671 + * to be modified to translate physical IRQ numbers into our IRQ space.
24672 + * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
24673 + * are bound using the provided bind/unbind functions.
24674 + */
24675 +
24676 +#define PIRQ_BASE 0
24677 +#if !defined(MAX_IO_APICS)
24678 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
24679 +#elif NR_CPUS < MAX_IO_APICS
24680 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
24681 +#else
24682 +# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
24683 +#endif
24684 +
24685 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
24686 +#define NR_DYNIRQS 256
24687 +
24688 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
24689 +#define NR_IRQ_VECTORS NR_IRQS
24690 +
24691 +#endif /* _ASM_IRQ_VECTORS_H */
24692 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/irqflags.h
24693 ===================================================================
24694 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/irqflags.h 2008-12-01 11:44:55.000000000 +0100
24695 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/irqflags.h 2008-12-01 11:49:07.000000000 +0100
24696 @@ -118,7 +118,7 @@ static inline void halt(void)
24697
24698 #ifndef CONFIG_X86_64
24699 #define INTERRUPT_RETURN iret
24700 -#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
24701 +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
24702 sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
24703 __TEST_PENDING ; \
24704 jnz 14f /* process more events if necessary... */ ; \
24705 @@ -175,18 +175,6 @@ static inline void trace_hardirqs_fixup_
24706 #else
24707
24708 #ifdef CONFIG_X86_64
24709 -/*
24710 - * Currently paravirt can't handle swapgs nicely when we
24711 - * don't have a stack we can rely on (such as a user space
24712 - * stack). So we either find a way around these or just fault
24713 - * and emulate if a guest tries to call swapgs directly.
24714 - *
24715 - * Either way, this is a good way to document that we don't
24716 - * have a reliable stack. x86_64 only.
24717 - */
24718 -#define SWAPGS_UNSAFE_STACK swapgs
24719 -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
24720 -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
24721 #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
24722 #define ARCH_LOCKDEP_SYS_EXIT_IRQ \
24723 TRACE_IRQS_ON; \
24724 @@ -198,24 +186,6 @@ static inline void trace_hardirqs_fixup_
24725 TRACE_IRQS_OFF;
24726
24727 #else
24728 -#define ARCH_TRACE_IRQS_ON \
24729 - pushl %eax; \
24730 - pushl %ecx; \
24731 - pushl %edx; \
24732 - call trace_hardirqs_on; \
24733 - popl %edx; \
24734 - popl %ecx; \
24735 - popl %eax;
24736 -
24737 -#define ARCH_TRACE_IRQS_OFF \
24738 - pushl %eax; \
24739 - pushl %ecx; \
24740 - pushl %edx; \
24741 - call trace_hardirqs_off; \
24742 - popl %edx; \
24743 - popl %ecx; \
24744 - popl %eax;
24745 -
24746 #define ARCH_LOCKDEP_SYS_EXIT \
24747 pushl %eax; \
24748 pushl %ecx; \
24749 @@ -229,8 +199,8 @@ static inline void trace_hardirqs_fixup_
24750 #endif
24751
24752 #ifdef CONFIG_TRACE_IRQFLAGS
24753 -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
24754 -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
24755 +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
24756 +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
24757 #else
24758 # define TRACE_IRQS_ON
24759 # define TRACE_IRQS_OFF
24760 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context.h
24761 ===================================================================
24762 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/mmu_context.h 2008-12-01 11:36:55.000000000 +0100
24763 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context.h 2008-12-01 11:49:07.000000000 +0100
24764 @@ -1,5 +1,42 @@
24765 +#ifndef __ASM_X86_MMU_CONTEXT_H
24766 +#define __ASM_X86_MMU_CONTEXT_H
24767 +
24768 +#include <asm/desc.h>
24769 +#include <asm/atomic.h>
24770 +#include <asm/pgalloc.h>
24771 +#include <asm/tlbflush.h>
24772 +
24773 +void arch_exit_mmap(struct mm_struct *mm);
24774 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24775 +
24776 +void mm_pin(struct mm_struct *mm);
24777 +void mm_unpin(struct mm_struct *mm);
24778 +void mm_pin_all(void);
24779 +
24780 +static inline void xen_activate_mm(struct mm_struct *prev,
24781 + struct mm_struct *next)
24782 +{
24783 + if (!PagePinned(virt_to_page(next->pgd)))
24784 + mm_pin(next);
24785 +}
24786 +
24787 +/*
24788 + * Used for LDT copy/destruction.
24789 + */
24790 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24791 +void destroy_context(struct mm_struct *mm);
24792 +
24793 #ifdef CONFIG_X86_32
24794 # include "mmu_context_32.h"
24795 #else
24796 # include "mmu_context_64.h"
24797 #endif
24798 +
24799 +#define activate_mm(prev, next) \
24800 +do { \
24801 + xen_activate_mm(prev, next); \
24802 + switch_mm((prev), (next), NULL); \
24803 +} while (0);
24804 +
24805 +
24806 +#endif /* __ASM_X86_MMU_CONTEXT_H */
24807 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_32.h
24808 ===================================================================
24809 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2008-12-01 11:44:55.000000000 +0100
24810 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_32.h 2008-12-01 11:49:07.000000000 +0100
24811 @@ -1,32 +1,6 @@
24812 #ifndef __I386_SCHED_H
24813 #define __I386_SCHED_H
24814
24815 -#include <asm/desc.h>
24816 -#include <asm/atomic.h>
24817 -#include <asm/pgalloc.h>
24818 -#include <asm/tlbflush.h>
24819 -
24820 -void arch_exit_mmap(struct mm_struct *mm);
24821 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24822 -
24823 -void mm_pin(struct mm_struct *mm);
24824 -void mm_unpin(struct mm_struct *mm);
24825 -void mm_pin_all(void);
24826 -
24827 -static inline void xen_activate_mm(struct mm_struct *prev,
24828 - struct mm_struct *next)
24829 -{
24830 - if (!PagePinned(virt_to_page(next->pgd)))
24831 - mm_pin(next);
24832 -}
24833 -
24834 -/*
24835 - * Used for LDT copy/destruction.
24836 - */
24837 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24838 -void destroy_context(struct mm_struct *mm);
24839 -
24840 -
24841 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24842 {
24843 #if 0 /* XEN: no lazy tlb */
24844 @@ -107,10 +81,4 @@ static inline void switch_mm(struct mm_s
24845 #define deactivate_mm(tsk, mm) \
24846 asm("movl %0,%%gs": :"r" (0));
24847
24848 -#define activate_mm(prev, next) \
24849 -do { \
24850 - xen_activate_mm(prev, next); \
24851 - switch_mm((prev), (next), NULL); \
24852 -} while (0)
24853 -
24854 #endif
24855 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_64.h
24856 ===================================================================
24857 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2008-12-01 11:44:55.000000000 +0100
24858 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_64.h 2008-12-01 11:49:07.000000000 +0100
24859 @@ -1,23 +1,6 @@
24860 #ifndef __X86_64_MMU_CONTEXT_H
24861 #define __X86_64_MMU_CONTEXT_H
24862
24863 -#include <asm/desc.h>
24864 -#include <asm/atomic.h>
24865 -#include <asm/pgalloc.h>
24866 -#include <asm/page.h>
24867 -#include <asm/pda.h>
24868 -#include <asm/pgtable.h>
24869 -#include <asm/tlbflush.h>
24870 -
24871 -void arch_exit_mmap(struct mm_struct *mm);
24872 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24873 -
24874 -/*
24875 - * possibly do the LDT unload here?
24876 - */
24877 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24878 -void destroy_context(struct mm_struct *mm);
24879 -
24880 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24881 {
24882 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
24883 @@ -58,10 +41,6 @@ static inline void __prepare_arch_switch
24884 }
24885 }
24886
24887 -extern void mm_pin(struct mm_struct *mm);
24888 -extern void mm_unpin(struct mm_struct *mm);
24889 -void mm_pin_all(void);
24890 -
24891 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
24892 struct task_struct *tsk)
24893 {
24894 @@ -124,11 +103,4 @@ do { \
24895 asm volatile("movl %0,%%fs"::"r"(0)); \
24896 } while (0)
24897
24898 -static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
24899 -{
24900 - if (!PagePinned(virt_to_page(next->pgd)))
24901 - mm_pin(next);
24902 - switch_mm(prev, next, NULL);
24903 -}
24904 -
24905 #endif
24906 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/page.h
24907 ===================================================================
24908 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/page.h 2008-12-01 11:44:55.000000000 +0100
24909 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/page.h 2008-12-01 11:49:07.000000000 +0100
24910 @@ -16,9 +16,9 @@
24911 * below. The preprocessor will warn if the two definitions aren't identical.
24912 */
24913 #define _PAGE_BIT_PRESENT 0
24914 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
24915 -#define _PAGE_BIT_IO 9
24916 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
24917 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
24918 +#define _PAGE_BIT_IO 11
24919 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
24920
24921 #define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
24922 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
24923 @@ -28,8 +28,11 @@
24924 (ie, 32-bit PAE). */
24925 #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
24926
24927 -/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
24928 -#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
24929 +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
24930 +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
24931 +
24932 +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
24933 +#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
24934
24935 #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
24936 #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
24937 @@ -39,8 +42,7 @@
24938 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
24939 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
24940
24941 -/* to align the pointer to the (next) page boundary */
24942 -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
24943 +#define HUGE_MAX_HSTATE 2
24944
24945 #ifndef __ASSEMBLY__
24946 #include <linux/types.h>
24947 @@ -61,9 +63,17 @@
24948
24949 #ifndef __ASSEMBLY__
24950
24951 +typedef struct { pgdval_t pgd; } pgd_t;
24952 +typedef struct { pgprotval_t pgprot; } pgprot_t;
24953 +
24954 extern int page_is_ram(unsigned long pagenr);
24955 extern int devmem_is_allowed(unsigned long pagenr);
24956 +extern void map_devmem(unsigned long pfn, unsigned long size,
24957 + pgprot_t vma_prot);
24958 +extern void unmap_devmem(unsigned long pfn, unsigned long size,
24959 + pgprot_t vma_prot);
24960
24961 +extern unsigned long max_low_pfn_mapped;
24962 extern unsigned long max_pfn_mapped;
24963
24964 struct page;
24965 @@ -84,15 +94,11 @@ static inline void copy_user_page(void *
24966 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
24967 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
24968
24969 -typedef struct { pgprotval_t pgprot; } pgprot_t;
24970 -
24971 #define pgprot_val(x) ((x).pgprot)
24972 #define __pgprot(x) ((pgprot_t) { (x) } )
24973
24974 #include <asm/maddr.h>
24975
24976 -typedef struct { pgdval_t pgd; } pgd_t;
24977 -
24978 #define __pgd_ma(x) ((pgd_t) { (x) } )
24979 static inline pgd_t xen_make_pgd(pgdval_t val)
24980 {
24981 @@ -196,6 +202,11 @@ static inline pteval_t xen_pte_val(pte_t
24982 return ret;
24983 }
24984
24985 +static inline pteval_t xen_pte_flags(pte_t pte)
24986 +{
24987 + return __pte_val(pte) & PTE_FLAGS_MASK;
24988 +}
24989 +
24990 #define pgd_val(x) xen_pgd_val(x)
24991 #define __pgd(x) xen_make_pgd(x)
24992
24993 @@ -210,6 +221,7 @@ static inline pteval_t xen_pte_val(pte_t
24994 #endif
24995
24996 #define pte_val(x) xen_pte_val(x)
24997 +#define pte_flags(x) xen_pte_flags(x)
24998 #define __pte(x) xen_make_pte(x)
24999
25000 #define __pa(x) __phys_addr((unsigned long)(x))
25001 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/page_64.h
25002 ===================================================================
25003 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/page_64.h 2008-12-01 11:44:55.000000000 +0100
25004 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/page_64.h 2008-12-01 11:49:07.000000000 +0100
25005 @@ -26,6 +26,12 @@
25006 #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
25007 #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
25008
25009 +/*
25010 + * Set __PAGE_OFFSET to the most negative possible address +
25011 + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
25012 + * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
25013 + * what Xen requires.
25014 + */
25015 #define __PAGE_OFFSET _AC(0xffff880000000000, UL)
25016
25017 #define __PHYSICAL_START CONFIG_PHYSICAL_START
25018 @@ -63,7 +69,8 @@
25019 void clear_page(void *page);
25020 void copy_page(void *to, void *from);
25021
25022 -extern unsigned long end_pfn;
25023 +/* duplicated to the one in bootmem.h */
25024 +extern unsigned long max_pfn;
25025
25026 static inline unsigned long __phys_addr(unsigned long x)
25027 {
25028 @@ -91,6 +98,11 @@ typedef union { pteval_t pte; unsigned i
25029 extern unsigned long init_memory_mapping(unsigned long start,
25030 unsigned long end);
25031
25032 +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
25033 +
25034 +extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
25035 +extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
25036 +
25037 #endif /* !__ASSEMBLY__ */
25038
25039 #ifdef CONFIG_FLATMEM
25040 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pci.h
25041 ===================================================================
25042 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pci.h 2008-12-01 11:44:55.000000000 +0100
25043 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pci.h 2008-12-01 11:49:07.000000000 +0100
25044 @@ -21,6 +21,8 @@ struct pci_sysdata {
25045 #endif
25046 };
25047
25048 +extern int pci_routeirq;
25049 +
25050 /* scan a bus after allocating a pci_sysdata for it */
25051 extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
25052 int node);
25053 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pci_32.h
25054 ===================================================================
25055 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pci_32.h 2008-12-01 11:36:55.000000000 +0100
25056 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pci_32.h 2008-12-01 11:49:07.000000000 +0100
25057 @@ -38,12 +38,14 @@ struct pci_dev;
25058 #define PCI_DMA_BUS_IS_PHYS (1)
25059
25060 /* pci_unmap_{page,single} is a nop so... */
25061 -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
25062 -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
25063 -#define pci_unmap_addr(PTR, ADDR_NAME) (0)
25064 -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
25065 -#define pci_unmap_len(PTR, LEN_NAME) (0)
25066 -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
25067 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
25068 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
25069 +#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
25070 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
25071 + do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
25072 +#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
25073 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
25074 + do { break; } while (pci_unmap_len(PTR, LEN_NAME))
25075
25076 #endif
25077
25078 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgalloc.h
25079 ===================================================================
25080 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2008-12-01 11:44:55.000000000 +0100
25081 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgalloc.h 2008-12-01 11:49:07.000000000 +0100
25082 @@ -7,6 +7,9 @@
25083
25084 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
25085
25086 +static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
25087 +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
25088 +
25089 static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
25090 static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
25091 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
25092 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable.h
25093 ===================================================================
25094 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable.h 2008-12-01 11:44:55.000000000 +0100
25095 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable.h 2008-12-01 11:49:07.000000000 +0100
25096 @@ -13,11 +13,12 @@
25097 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
25098 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
25099 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
25100 -#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
25101 +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
25102 +#define _PAGE_BIT_UNUSED2 10
25103 +#define _PAGE_BIT_IO 11 /* Mapped page is I/O or foreign and
25104 * has no associated page struct. */
25105 -#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
25106 -#define _PAGE_BIT_UNUSED3 11
25107 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
25108 +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
25109 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
25110
25111 /* If _PAGE_BIT_PRESENT is clear, we use these: */
25112 @@ -28,34 +29,31 @@
25113 /* if the user mapped it with PROT_NONE; pte_present gives true */
25114 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
25115
25116 -/*
25117 - * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
25118 - * sign-extended value on 32-bit with all 1's in the upper word,
25119 - * which preserves the upper pte values on 64-bit ptes:
25120 - */
25121 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25122 -#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
25123 -#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
25124 -#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
25125 -#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
25126 -#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
25127 -#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
25128 -#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
25129 -#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
25130 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25131 -#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
25132 -#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
25133 -#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
25134 -#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
25135 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25136 +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
25137 +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
25138 +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
25139 +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
25140 +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
25141 +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
25142 +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
25143 +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
25144 +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
25145 +#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2)
25146 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25147 +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
25148 +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
25149 +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
25150 +#define __HAVE_ARCH_PTE_SPECIAL
25151
25152 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
25153 -#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
25154 +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
25155 #else
25156 -#define _PAGE_NX 0
25157 +#define _PAGE_NX (_AT(pteval_t, 0))
25158 #endif
25159
25160 -#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
25161 -#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
25162 +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
25163 +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
25164
25165 #ifndef __ASSEMBLY__
25166 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
25167 @@ -71,8 +69,8 @@ extern unsigned int __kernel_page_user;
25168 _PAGE_DIRTY | __kernel_page_user)
25169
25170 /* Set of bits not changed in pte_modify */
25171 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25172 - _PAGE_ACCESSED | _PAGE_DIRTY)
25173 +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25174 + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
25175
25176 /*
25177 * PAT settings are part of the hypervisor interface, which sets the
25178 @@ -102,19 +100,9 @@ extern unsigned int __kernel_page_user;
25179 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
25180 _PAGE_ACCESSED)
25181
25182 -#ifdef CONFIG_X86_32
25183 -#define _PAGE_KERNEL_EXEC \
25184 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
25185 -#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
25186 -
25187 -#ifndef __ASSEMBLY__
25188 -extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
25189 -#endif /* __ASSEMBLY__ */
25190 -#else
25191 #define __PAGE_KERNEL_EXEC \
25192 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
25193 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
25194 -#endif
25195
25196 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
25197 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
25198 @@ -125,25 +113,22 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
25199 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
25200 #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
25201 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
25202 +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
25203 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
25204
25205 -/*
25206 - * We don't support GLOBAL page in xenolinux64
25207 - */
25208 -#define MAKE_GLOBAL(x) __pgprot((x))
25209 -
25210 -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
25211 -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
25212 -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
25213 -#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
25214 -#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
25215 -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
25216 -#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
25217 -#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
25218 -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
25219 -#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
25220 -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
25221 -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25222 +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
25223 +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
25224 +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
25225 +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
25226 +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
25227 +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
25228 +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
25229 +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
25230 +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
25231 +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
25232 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
25233 +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
25234 +#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25235
25236 /* xwr */
25237 #define __P000 PAGE_NONE
25238 @@ -182,27 +167,27 @@ extern struct list_head pgd_list;
25239 */
25240 static inline int pte_dirty(pte_t pte)
25241 {
25242 - return __pte_val(pte) & _PAGE_DIRTY;
25243 + return pte_flags(pte) & _PAGE_DIRTY;
25244 }
25245
25246 static inline int pte_young(pte_t pte)
25247 {
25248 - return __pte_val(pte) & _PAGE_ACCESSED;
25249 + return pte_flags(pte) & _PAGE_ACCESSED;
25250 }
25251
25252 static inline int pte_write(pte_t pte)
25253 {
25254 - return __pte_val(pte) & _PAGE_RW;
25255 + return pte_flags(pte) & _PAGE_RW;
25256 }
25257
25258 static inline int pte_file(pte_t pte)
25259 {
25260 - return __pte_val(pte) & _PAGE_FILE;
25261 + return pte_flags(pte) & _PAGE_FILE;
25262 }
25263
25264 static inline int pte_huge(pte_t pte)
25265 {
25266 - return __pte_val(pte) & _PAGE_PSE;
25267 + return pte_flags(pte) & _PAGE_PSE;
25268 }
25269
25270 static inline int pte_global(pte_t pte)
25271 @@ -212,12 +197,12 @@ static inline int pte_global(pte_t pte)
25272
25273 static inline int pte_exec(pte_t pte)
25274 {
25275 - return !(__pte_val(pte) & _PAGE_NX);
25276 + return !(pte_flags(pte) & _PAGE_NX);
25277 }
25278
25279 static inline int pte_special(pte_t pte)
25280 {
25281 - return 0;
25282 + return pte_flags(pte) & _PAGE_SPECIAL;
25283 }
25284
25285 static inline int pmd_large(pmd_t pte)
25286 @@ -228,22 +213,22 @@ static inline int pmd_large(pmd_t pte)
25287
25288 static inline pte_t pte_mkclean(pte_t pte)
25289 {
25290 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
25291 + return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
25292 }
25293
25294 static inline pte_t pte_mkold(pte_t pte)
25295 {
25296 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
25297 + return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
25298 }
25299
25300 static inline pte_t pte_wrprotect(pte_t pte)
25301 {
25302 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
25303 + return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
25304 }
25305
25306 static inline pte_t pte_mkexec(pte_t pte)
25307 {
25308 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
25309 + return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
25310 }
25311
25312 static inline pte_t pte_mkdirty(pte_t pte)
25313 @@ -268,7 +253,7 @@ static inline pte_t pte_mkhuge(pte_t pte
25314
25315 static inline pte_t pte_clrhuge(pte_t pte)
25316 {
25317 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
25318 + return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
25319 }
25320
25321 static inline pte_t pte_mkglobal(pte_t pte)
25322 @@ -283,35 +268,46 @@ static inline pte_t pte_clrglobal(pte_t
25323
25324 static inline pte_t pte_mkspecial(pte_t pte)
25325 {
25326 - return pte;
25327 + return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
25328 }
25329
25330 extern pteval_t __supported_pte_mask;
25331
25332 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
25333 {
25334 - return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
25335 - pgprot_val(pgprot)) & __supported_pte_mask);
25336 + pgprotval_t prot = pgprot_val(pgprot);
25337 +
25338 + if (prot & _PAGE_PRESENT)
25339 + prot &= __supported_pte_mask;
25340 + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25341 }
25342
25343 static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
25344 {
25345 - return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
25346 - pgprot_val(pgprot)) & __supported_pte_mask);
25347 + pgprotval_t prot = pgprot_val(pgprot);
25348 +
25349 + if (prot & _PAGE_PRESENT)
25350 + prot &= __supported_pte_mask;
25351 + return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25352 }
25353
25354 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
25355 {
25356 - return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
25357 - pgprot_val(pgprot)) & __supported_pte_mask);
25358 + pgprotval_t prot = pgprot_val(pgprot);
25359 +
25360 + if (prot & _PAGE_PRESENT)
25361 + prot &= __supported_pte_mask;
25362 + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25363 }
25364
25365 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
25366 {
25367 - pteval_t val = pte_val(pte);
25368 + pgprotval_t prot = pgprot_val(newprot);
25369 + pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
25370
25371 - val &= _PAGE_CHG_MASK;
25372 - val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
25373 + if (prot & _PAGE_PRESENT)
25374 + prot &= __supported_pte_mask;
25375 + val |= prot & ~_PAGE_CHG_MASK;
25376
25377 return __pte(val);
25378 }
25379 @@ -325,9 +321,11 @@ static inline pgprot_t pgprot_modify(pgp
25380 return __pgprot(preservebits | addbits);
25381 }
25382
25383 -#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
25384 +#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
25385
25386 -#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
25387 +#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \
25388 + ? pgprot_val(p) & __supported_pte_mask \
25389 + : pgprot_val(p))
25390
25391 #ifndef __ASSEMBLY__
25392 #define __HAVE_PHYS_MEM_ACCESS_PROT
25393 @@ -338,6 +336,17 @@ int phys_mem_access_prot_allowed(struct
25394 unsigned long size, pgprot_t *vma_prot);
25395 #endif
25396
25397 +/* Install a pte for a particular vaddr in kernel space. */
25398 +void set_pte_vaddr(unsigned long vaddr, pte_t pte);
25399 +
25400 +#ifndef CONFIG_XEN
25401 +extern void native_pagetable_setup_start(pgd_t *base);
25402 +extern void native_pagetable_setup_done(pgd_t *base);
25403 +#else
25404 +static inline void xen_pagetable_setup_start(pgd_t *base) {}
25405 +static inline void xen_pagetable_setup_done(pgd_t *base) {}
25406 +#endif
25407 +
25408 #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
25409 #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
25410
25411 @@ -373,6 +382,26 @@ int phys_mem_access_prot_allowed(struct
25412 # include "pgtable_64.h"
25413 #endif
25414
25415 +/*
25416 + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
25417 + *
25418 + * this macro returns the index of the entry in the pgd page which would
25419 + * control the given virtual address
25420 + */
25421 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25422 +
25423 +/*
25424 + * pgd_offset() returns a (pgd_t *)
25425 + * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25426 + */
25427 +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25428 +/*
25429 + * a shortcut which implies the use of the kernel's pgd, instead
25430 + * of a process's
25431 + */
25432 +#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25433 +
25434 +
25435 #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
25436 #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
25437
25438 @@ -383,8 +412,15 @@ enum {
25439 PG_LEVEL_4K,
25440 PG_LEVEL_2M,
25441 PG_LEVEL_1G,
25442 + PG_LEVEL_NUM
25443 };
25444
25445 +#ifdef CONFIG_PROC_FS
25446 +extern void update_page_count(int level, unsigned long pages);
25447 +#else
25448 +static inline void update_page_count(int level, unsigned long pages) { }
25449 +#endif
25450 +
25451 /*
25452 * Helper function that returns the kernel pagetable entry controlling
25453 * the virtual address 'address'. NULL means no pagetable entry present.
25454 @@ -441,6 +477,8 @@ static inline void xen_pte_clear(struct
25455 * race with other CPU's that might be updating the dirty
25456 * bit at the same time.
25457 */
25458 +struct vm_area_struct;
25459 +
25460 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
25461 extern int ptep_set_access_flags(struct vm_area_struct *vma,
25462 unsigned long address, pte_t *ptep,
25463 @@ -523,9 +561,6 @@ static inline void clone_pgd_range(pgd_t
25464 memcpy(dst, src, count * sizeof(pgd_t));
25465 }
25466
25467 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
25468 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
25469 -
25470 #define arbitrary_virt_to_machine(va) \
25471 ({ \
25472 unsigned int __lvl; \
25473 @@ -535,6 +570,44 @@ static inline void clone_pgd_range(pgd_t
25474 | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
25475 })
25476
25477 +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
25478 +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
25479 + pte_t *ptep)
25480 +{
25481 +#if CONFIG_XEN_COMPAT < 0x030300
25482 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
25483 + return ptep_get_and_clear(mm, addr, ptep);
25484 +#endif
25485 + return *ptep;
25486 +}
25487 +
25488 +#ifdef CONFIG_HIGHPTE
25489 +extern void *high_memory;
25490 +#endif
25491 +
25492 +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
25493 + pte_t *ptep, pte_t pte)
25494 +{
25495 + mmu_update_t u;
25496 +
25497 +#if CONFIG_XEN_COMPAT < 0x030300
25498 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
25499 + set_pte_at(mm, addr, ptep, pte);
25500 + return;
25501 + }
25502 +#endif
25503 +#ifdef CONFIG_HIGHPTE
25504 + if ((void *)ptep > high_memory)
25505 + u.ptr = arbitrary_virt_to_machine(ptep)
25506 + | MMU_PT_UPDATE_PRESERVE_AD;
25507 + else
25508 +#endif
25509 + u.ptr = virt_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
25510 + u.val = __pte_val(pte);
25511 + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
25512 + BUG();
25513 +}
25514 +
25515 #include <asm-generic/pgtable.h>
25516
25517 #include <xen/features.h>
25518 @@ -563,10 +636,6 @@ int touch_pte_range(struct mm_struct *mm
25519 unsigned long address,
25520 unsigned long size);
25521
25522 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
25523 - unsigned long addr, unsigned long end, pgprot_t newprot,
25524 - int dirty_accountable);
25525 -
25526 #endif /* __ASSEMBLY__ */
25527
25528 #endif /* _ASM_X86_PGTABLE_H */
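
For readers following the pfn_pte()/pfn_pmd()/pte_modify()/canon_pgprot() changes above, here is a minimal user-space sketch of the new rule: __supported_pte_mask is applied only when _PAGE_PRESENT is set, so non-present encodings (for example swap entries) keep all of their bits. The constants below are simplified placeholders chosen for illustration, not the kernel's real definitions.

/*
 * Standalone illustration (not kernel code) of the masking rule above.
 * Here we pretend the CPU lacks NX support, so the supported mask
 * clears bit 63 -- but only for present mappings.
 */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t pteval_t;
typedef uint64_t pgprotval_t;

#define PAGE_SHIFT      12
#define _PAGE_PRESENT   0x001ULL
#define _PAGE_NX        (1ULL << 63)

static const pteval_t __supported_pte_mask = ~_PAGE_NX;

static pteval_t pfn_pte(unsigned long pfn, pgprotval_t prot)
{
        /* mirror the patched helper: mask only present mappings */
        if (prot & _PAGE_PRESENT)
                prot &= __supported_pte_mask;
        return ((pteval_t)pfn << PAGE_SHIFT) | prot;
}

int main(void)
{
        printf("present : %#llx\n",     /* NX bit stripped */
               (unsigned long long)pfn_pte(0x1234, _PAGE_PRESENT | _PAGE_NX));
        printf("!present: %#llx\n",     /* NX bit preserved */
               (unsigned long long)pfn_pte(0x1234, _PAGE_NX));
        return 0;
}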
25529 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable-3level.h
25530 ===================================================================
25531 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-12-01 11:44:55.000000000 +0100
25532 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-12-01 11:49:07.000000000 +0100
25533 @@ -14,11 +14,11 @@
25534 #define pmd_ERROR(e) \
25535 printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
25536 __FILE__, __LINE__, &(e), __pmd_val(e), \
25537 - (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25538 + (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25539 #define pgd_ERROR(e) \
25540 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
25541 __FILE__, __LINE__, &(e), __pgd_val(e), \
25542 - (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25543 + (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25544
25545 static inline int pud_none(pud_t pud)
25546 {
25547 @@ -27,7 +27,7 @@ static inline int pud_none(pud_t pud)
25548 }
25549 static inline int pud_bad(pud_t pud)
25550 {
25551 - return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25552 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25553 }
25554
25555 static inline int pud_present(pud_t pud)
25556 @@ -102,9 +102,9 @@ static inline void pud_clear(pud_t *pudp
25557 xen_tlb_flush();
25558 }
25559
25560 -#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
25561 +#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
25562
25563 -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
25564 +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
25565
25566
25567 /* Find an entry in the second-level page table.. */
25568 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_32.h
25569 ===================================================================
25570 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-12-01 11:44:55.000000000 +0100
25571 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-12-01 11:49:07.000000000 +0100
25572 @@ -89,10 +89,10 @@ extern unsigned long pg0[];
25573 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
25574 can temporarily clear it. */
25575 #define pmd_present(x) (__pmd_val(x))
25576 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25577 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25578 #else
25579 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
25580 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25581 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25582 #endif
25583
25584
25585 @@ -119,26 +119,6 @@ extern unsigned long pg0[];
25586 */
25587 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
25588
25589 -/*
25590 - * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25591 - *
25592 - * this macro returns the index of the entry in the pgd page which would
25593 - * control the given virtual address
25594 - */
25595 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25596 -#define pgd_index_k(addr) pgd_index((addr))
25597 -
25598 -/*
25599 - * pgd_offset() returns a (pgd_t *)
25600 - * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25601 - */
25602 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25603 -
25604 -/*
25605 - * a shortcut which implies the use of the kernel's pgd, instead
25606 - * of a process's
25607 - */
25608 -#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25609
25610 static inline int pud_large(pud_t pud) { return 0; }
25611
25612 @@ -165,7 +145,7 @@ static inline int pud_large(pud_t pud) {
25613 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25614
25615 #define pmd_page_vaddr(pmd) \
25616 - ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
25617 + ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
25618
25619 #if defined(CONFIG_HIGHPTE)
25620 #define pte_offset_map(dir, address) \
25621 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_64.h
25622 ===================================================================
25623 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-12-01 11:44:55.000000000 +0100
25624 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-12-01 11:49:07.000000000 +0100
25625 @@ -23,6 +23,8 @@ extern void xen_init_pt(void);
25626 extern pud_t level3_kernel_pgt[512];
25627 extern pud_t level3_ident_pgt[512];
25628 extern pmd_t level2_kernel_pgt[512];
25629 +extern pmd_t level2_fixmap_pgt[512];
25630 +extern pmd_t level2_ident_pgt[512];
25631 extern pgd_t init_level4_pgt[];
25632
25633 #define swapper_pg_dir init_level4_pgt
25634 @@ -79,6 +81,9 @@ extern void paging_init(void);
25635
25636 struct mm_struct;
25637
25638 +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
25639 +
25640 +
25641 #define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
25642
25643 static inline void xen_set_pte(pte_t *ptep, pte_t pte)
25644 @@ -150,24 +155,24 @@ static inline void xen_pgd_clear(pgd_t *
25645 #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
25646 #define VMEMMAP_START _AC(0xffffe20000000000, UL)
25647 #define MODULES_VADDR _AC(0xffffffffa0000000, UL)
25648 -#define MODULES_END _AC(0xfffffffffff00000, UL)
25649 +#define MODULES_END _AC(0xffffffffff000000, UL)
25650 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
25651
25652 #ifndef __ASSEMBLY__
25653
25654 static inline int pgd_bad(pgd_t pgd)
25655 {
25656 - return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25657 + return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25658 }
25659
25660 static inline int pud_bad(pud_t pud)
25661 {
25662 - return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25663 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25664 }
25665
25666 static inline int pmd_bad(pmd_t pmd)
25667 {
25668 - return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25669 + return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25670 }
25671
25672 #define pte_none(x) (!(x).pte)
25673 @@ -175,7 +180,7 @@ static inline int pmd_bad(pmd_t pmd)
25674
25675 #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
25676
25677 -#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
25678 +#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
25679 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
25680 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
25681 #define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
25682 @@ -200,11 +205,8 @@ static inline int pmd_bad(pmd_t pmd)
25683 * Level 4 access.
25684 */
25685 #define pgd_page_vaddr(pgd) \
25686 - ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
25687 + ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
25688 #define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
25689 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25690 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25691 -#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
25692 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
25693 static inline int pgd_large(pgd_t pgd) { return 0; }
25694 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
25695 @@ -226,7 +228,7 @@ static inline int pud_large(pud_t pte)
25696 }
25697
25698 /* PMD - Level 2 access */
25699 -#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
25700 +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
25701 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25702
25703 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
25704 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/processor.h
25705 ===================================================================
25706 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/processor.h 2008-12-01 11:44:55.000000000 +0100
25707 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/processor.h 2008-12-01 11:49:07.000000000 +0100
25708 @@ -134,7 +134,7 @@ extern __u32 cleared_cpu_caps[NCAPINTS
25709 #ifdef CONFIG_SMP
25710 DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25711 #define cpu_data(cpu) per_cpu(cpu_info, cpu)
25712 -#define current_cpu_data cpu_data(smp_processor_id())
25713 +#define current_cpu_data __get_cpu_var(cpu_info)
25714 #else
25715 #define cpu_data(cpu) boot_cpu_data
25716 #define current_cpu_data boot_cpu_data
25717 @@ -153,7 +153,7 @@ static inline int hlt_works(int cpu)
25718
25719 extern void cpu_detect(struct cpuinfo_x86 *c);
25720
25721 -extern void identify_cpu(struct cpuinfo_x86 *);
25722 +extern void early_cpu_init(void);
25723 extern void identify_boot_cpu(void);
25724 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25725 extern void print_cpu_info(struct cpuinfo_x86 *);
25726 @@ -267,15 +267,11 @@ struct tss_struct {
25727 struct thread_struct *io_bitmap_owner;
25728
25729 /*
25730 - * Pad the TSS to be cacheline-aligned (size is 0x100):
25731 - */
25732 - unsigned long __cacheline_filler[35];
25733 - /*
25734 * .. and then another 0x100 bytes for the emergency kernel stack:
25735 */
25736 unsigned long stack[64];
25737
25738 -} __attribute__((packed));
25739 +} ____cacheline_aligned;
25740
25741 DECLARE_PER_CPU(struct tss_struct, init_tss);
25742
25743 @@ -668,11 +664,36 @@ static inline void __sti_mwait(unsigned
25744
25745 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25746
25747 -extern int force_mwait;
25748 -
25749 extern void select_idle_routine(const struct cpuinfo_x86 *c);
25750
25751 extern unsigned long boot_option_idle_override;
25752 +extern unsigned long idle_halt;
25753 +extern unsigned long idle_nomwait;
25754 +
25755 +#ifndef CONFIG_XEN
25756 +/*
25757 + * on systems with caches, caches must be flushed as the absolute
25758 + * last instruction before going into a suspended halt. Otherwise,
25759 + * dirty data can linger in the cache and become stale on resume,
25760 + * leading to strange errors.
25761 + *
25762 + * perform a variety of operations to guarantee that the compiler
25763 + * will not reorder instructions. wbinvd itself is serializing
25764 + * so the processor will not reorder.
25765 + *
25766 + * Systems without cache can just go into halt.
25767 + */
25768 +static inline void wbinvd_halt(void)
25769 +{
25770 + mb();
25771 + /* check for clflush to determine if wbinvd is legal */
25772 + if (cpu_has_clflush)
25773 + asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
25774 + else
25775 + while (1)
25776 + halt();
25777 +}
25778 +#endif
25779
25780 extern void enable_sep_cpu(void);
25781 extern int sysenter_setup(void);
25782 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/segment.h
25783 ===================================================================
25784 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/segment.h 2008-12-01 11:44:55.000000000 +0100
25785 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/segment.h 2008-12-01 11:49:07.000000000 +0100
25786 @@ -1,6 +1,15 @@
25787 #ifndef _ASM_X86_SEGMENT_H_
25788 #define _ASM_X86_SEGMENT_H_
25789
25790 +/* Constructor for a conventional segment GDT (or LDT) entry */
25791 +/* This is a macro so it can be used in initializers */
25792 +#define GDT_ENTRY(flags, base, limit) \
25793 + ((((base) & 0xff000000ULL) << (56-24)) | \
25794 + (((flags) & 0x0000f0ffULL) << 40) | \
25795 + (((limit) & 0x000f0000ULL) << (48-16)) | \
25796 + (((base) & 0x00ffffffULL) << 16) | \
25797 + (((limit) & 0x0000ffffULL)))
25798 +
25799 /* Simple and small GDT entries for booting only */
25800
25801 #define GDT_ENTRY_BOOT_CS 2
25802 @@ -61,18 +70,14 @@
25803 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
25804
25805 #define GDT_ENTRY_DEFAULT_USER_CS 14
25806 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
25807
25808 #define GDT_ENTRY_DEFAULT_USER_DS 15
25809 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
25810
25811 #define GDT_ENTRY_KERNEL_BASE 12
25812
25813 #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
25814 -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25815
25816 #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
25817 -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25818
25819 #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
25820 #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
25821 @@ -143,10 +148,11 @@
25822 #else
25823 #include <asm/cache.h>
25824
25825 -#define __KERNEL_CS 0x10
25826 -#define __KERNEL_DS 0x18
25827 +#define GDT_ENTRY_KERNEL32_CS 1
25828 +#define GDT_ENTRY_KERNEL_CS 2
25829 +#define GDT_ENTRY_KERNEL_DS 3
25830
25831 -#define __KERNEL32_CS 0x08
25832 +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
25833
25834 /*
25835 * we cannot use the same code segment descriptor for user and kernel
25836 @@ -154,10 +160,10 @@
25837 * The segment offset needs to contain a RPL. Grr. -AK
25838 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
25839 */
25840 -
25841 -#define __USER32_CS 0x23 /* 4*8+3 */
25842 -#define __USER_DS 0x2b /* 5*8+3 */
25843 -#define __USER_CS 0x33 /* 6*8+3 */
25844 +#define GDT_ENTRY_DEFAULT_USER32_CS 4
25845 +#define GDT_ENTRY_DEFAULT_USER_DS 5
25846 +#define GDT_ENTRY_DEFAULT_USER_CS 6
25847 +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
25848 #define __USER32_DS __USER_DS
25849
25850 #define GDT_ENTRY_TSS 8 /* needs two entries */
25851 @@ -179,6 +185,11 @@
25852
25853 #endif
25854
25855 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25856 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25857 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
25858 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
25859 +
25860 /* User mode is privilege level 3 */
25861 #define USER_RPL 0x3
25862 /* LDT segment has TI set, GDT has it cleared */
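
As a side note on the GDT_ENTRY() constructor and the selector rearrangement above, the following stand-alone sketch shows how the macro packs base/limit/flags into a 64-bit descriptor and how __KERNEL_CS/__USER_CS are derived from their table indexes (the resulting 0x10 and 0x33 match the literals the patch removes). The 0xc09b flags value is only an example descriptor type, not taken from this patch.

#include <stdio.h>
#include <stdint.h>

/* copied from the hunk above */
#define GDT_ENTRY(flags, base, limit)                   \
        ((((base)  & 0xff000000ULL) << (56-24)) |       \
         (((flags) & 0x0000f0ffULL) << 40) |            \
         (((limit) & 0x000f0000ULL) << (48-16)) |       \
         (((base)  & 0x00ffffffULL) << 16) |            \
         (((limit) & 0x0000ffffULL)))

#define GDT_ENTRY_KERNEL_CS             2
#define GDT_ENTRY_DEFAULT_USER_CS       6

/* selector = GDT index * 8, plus RPL 3 for user segments */
#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
#define __USER_CS   (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)

int main(void)
{
        /* example: flat code segment, base 0, limit 0xfffff (pages) */
        uint64_t desc = GDT_ENTRY(0xc09b, 0, 0xfffff);

        printf("descriptor  = %#018llx\n", (unsigned long long)desc);
        printf("__KERNEL_CS = %#x\n", __KERNEL_CS);    /* 0x10 */
        printf("__USER_CS   = %#x\n", __USER_CS);      /* 0x33 */
        return 0;
}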
25863 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/smp.h
25864 ===================================================================
25865 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/smp.h 2008-12-01 11:44:55.000000000 +0100
25866 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/smp.h 2008-12-01 11:49:07.000000000 +0100
25867 @@ -25,23 +25,16 @@ extern cpumask_t cpu_initialized;
25868 extern void (*mtrr_hook)(void);
25869 extern void zap_low_mappings(void);
25870
25871 +extern int __cpuinit get_local_pda(int cpu);
25872 +
25873 extern int smp_num_siblings;
25874 extern unsigned int num_processors;
25875 extern cpumask_t cpu_initialized;
25876
25877 -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
25878 -extern u16 x86_cpu_to_apicid_init[];
25879 -extern u16 x86_bios_cpu_apicid_init[];
25880 -extern void *x86_cpu_to_apicid_early_ptr;
25881 -extern void *x86_bios_cpu_apicid_early_ptr;
25882 -#else
25883 -#define x86_cpu_to_apicid_early_ptr NULL
25884 -#define x86_bios_cpu_apicid_early_ptr NULL
25885 -#endif
25886 -
25887 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
25888 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
25889 DECLARE_PER_CPU(u16, cpu_llc_id);
25890 +
25891 DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
25892 DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
25893
25894 @@ -63,9 +56,9 @@ struct smp_ops {
25895
25896 void (*smp_send_stop)(void);
25897 void (*smp_send_reschedule)(int cpu);
25898 - int (*smp_call_function_mask)(cpumask_t mask,
25899 - void (*func)(void *info), void *info,
25900 - int wait);
25901 +
25902 + void (*send_call_func_ipi)(cpumask_t mask);
25903 + void (*send_call_func_single_ipi)(int cpu);
25904 };
25905
25906 /* Globals due to paravirt */
25907 @@ -106,11 +99,14 @@ static inline void smp_send_reschedule(i
25908 smp_ops.smp_send_reschedule(cpu);
25909 }
25910
25911 -static inline int smp_call_function_mask(cpumask_t mask,
25912 - void (*func) (void *info), void *info,
25913 - int wait)
25914 +static inline void arch_send_call_function_single_ipi(int cpu)
25915 {
25916 - return smp_ops.smp_call_function_mask(mask, func, info, wait);
25917 + smp_ops.send_call_func_single_ipi(cpu);
25918 +}
25919 +
25920 +static inline void arch_send_call_function_ipi(cpumask_t mask)
25921 +{
25922 + smp_ops.send_call_func_ipi(mask);
25923 }
25924
25925 void native_smp_prepare_boot_cpu(void);
25926 @@ -122,23 +118,19 @@ int native_cpu_up(unsigned int cpunum);
25927
25928 void xen_smp_send_stop(void);
25929 void xen_smp_send_reschedule(int cpu);
25930 -int xen_smp_call_function_mask(cpumask_t mask,
25931 - void (*func) (void *info), void *info,
25932 - int wait);
25933 +void xen_send_call_func_ipi(cpumask_t mask);
25934 +void xen_send_call_func_single_ipi(int cpu);
25935
25936 #define smp_send_stop xen_smp_send_stop
25937 #define smp_send_reschedule xen_smp_send_reschedule
25938 -#define smp_call_function_mask xen_smp_call_function_mask
25939 -
25940 -extern void prefill_possible_map(void);
25941 +#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
25942 +#define arch_send_call_function_ipi xen_send_call_func_ipi
25943
25944 #endif /* CONFIG_XEN */
25945
25946 extern int __cpu_disable(void);
25947 extern void __cpu_die(unsigned int cpu);
25948
25949 -extern void prefill_possible_map(void);
25950 -
25951 void smp_store_cpu_info(int id);
25952 #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
25953
25954 @@ -149,6 +141,14 @@ static inline int num_booting_cpus(void)
25955 }
25956 #endif /* CONFIG_SMP */
25957
25958 +#if defined(CONFIG_SMP) && (defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_XEN))
25959 +extern void prefill_possible_map(void);
25960 +#else
25961 +static inline void prefill_possible_map(void)
25962 +{
25963 +}
25964 +#endif
25965 +
25966 extern unsigned disabled_cpus __cpuinitdata;
25967
25968 #ifdef CONFIG_X86_32_SMP
25969 @@ -216,12 +216,8 @@ static inline int hard_smp_processor_id(
25970 #endif /* CONFIG_X86_LOCAL_APIC */
25971
25972 #ifdef CONFIG_HOTPLUG_CPU
25973 -extern void cpu_exit_clear(void);
25974 extern void cpu_uninit(void);
25975 #endif
25976
25977 -extern void smp_alloc_memory(void);
25978 -extern void lock_ipi_call_lock(void);
25979 -extern void unlock_ipi_call_lock(void);
25980 #endif /* __ASSEMBLY__ */
25981 #endif
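
To make the smp_ops reshaping above easier to follow, here is a small stand-alone sketch of the same pattern: the single smp_call_function_mask hook is replaced by two IPI callbacks, and the arch_send_call_function*() wrappers simply dispatch through the ops structure. The names ending in _demo and the printf bodies are illustrative stand-ins, not kernel code.

#include <stdio.h>

typedef unsigned long cpumask_demo_t;   /* stand-in for cpumask_t */

struct smp_ops_demo {
        void (*send_call_func_ipi)(cpumask_demo_t mask);
        void (*send_call_func_single_ipi)(int cpu);
};

static void demo_send_call_func_ipi(cpumask_demo_t mask)
{
        printf("IPI to cpu mask %#lx\n", mask);
}

static void demo_send_call_func_single_ipi(int cpu)
{
        printf("IPI to cpu %d\n", cpu);
}

static struct smp_ops_demo smp_ops = {
        .send_call_func_ipi        = demo_send_call_func_ipi,
        .send_call_func_single_ipi = demo_send_call_func_single_ipi,
};

/* mirrors arch_send_call_function_ipi()/..._single_ipi() above */
static void arch_send_call_function_ipi(cpumask_demo_t mask)
{
        smp_ops.send_call_func_ipi(mask);
}

static void arch_send_call_function_single_ipi(int cpu)
{
        smp_ops.send_call_func_single_ipi(cpu);
}

int main(void)
{
        arch_send_call_function_ipi(0x6);       /* cpus 1 and 2 */
        arch_send_call_function_single_ipi(3);
        return 0;
}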
25982 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/spinlock.h
25983 ===================================================================
25984 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/spinlock.h 2008-12-01 11:44:55.000000000 +0100
25985 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/spinlock.h 2008-12-01 11:49:07.000000000 +0100
25986 @@ -65,14 +65,14 @@ extern void xen_spin_kick(raw_spinlock_t
25987 */
25988 #if (NR_CPUS < 256)
25989 #define TICKET_SHIFT 8
25990 -#define __raw_spin_lock_preamble \
25991 +#define __ticket_spin_lock_preamble \
25992 asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
25993 "cmpb %h0, %b0\n\t" \
25994 "sete %1" \
25995 : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
25996 : "0" (0x0100) \
25997 : "memory", "cc")
25998 -#define __raw_spin_lock_body \
25999 +#define __ticket_spin_lock_body \
26000 asm("1:\t" \
26001 "cmpb %h0, %b0\n\t" \
26002 "je 2f\n\t" \
26003 @@ -88,7 +88,7 @@ extern void xen_spin_kick(raw_spinlock_t
26004 : "memory", "cc")
26005
26006
26007 -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26008 +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26009 {
26010 int tmp, new;
26011
26012 @@ -107,7 +107,7 @@ static __always_inline int __raw_spin_tr
26013 return tmp;
26014 }
26015
26016 -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26017 +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26018 {
26019 unsigned int token;
26020 unsigned char kick;
26021 @@ -124,7 +124,7 @@ static __always_inline void __raw_spin_u
26022 }
26023 #else
26024 #define TICKET_SHIFT 16
26025 -#define __raw_spin_lock_preamble \
26026 +#define __ticket_spin_lock_preamble \
26027 do { \
26028 unsigned int tmp; \
26029 asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
26030 @@ -136,7 +136,7 @@ static __always_inline void __raw_spin_u
26031 : "0" (0x00010000) \
26032 : "memory", "cc"); \
26033 } while (0)
26034 -#define __raw_spin_lock_body \
26035 +#define __ticket_spin_lock_body \
26036 do { \
26037 unsigned int tmp; \
26038 asm("shldl $16, %0, %2\n" \
26039 @@ -155,7 +155,7 @@ static __always_inline void __raw_spin_u
26040 : "memory", "cc"); \
26041 } while (0)
26042
26043 -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26044 +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26045 {
26046 int tmp;
26047 int new;
26048 @@ -177,7 +177,7 @@ static __always_inline int __raw_spin_tr
26049 return tmp;
26050 }
26051
26052 -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26053 +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26054 {
26055 unsigned int token, tmp;
26056 bool kick;
26057 @@ -195,49 +195,145 @@ static __always_inline void __raw_spin_u
26058 }
26059 #endif
26060
26061 -static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26062 +static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
26063 {
26064 int tmp = ACCESS_ONCE(lock->slock);
26065
26066 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
26067 }
26068
26069 -static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26070 +static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
26071 {
26072 int tmp = ACCESS_ONCE(lock->slock);
26073
26074 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
26075 }
26076
26077 -static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26078 +static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
26079 {
26080 unsigned int token, count;
26081 bool free;
26082
26083 - __raw_spin_lock_preamble;
26084 + __ticket_spin_lock_preamble;
26085 if (unlikely(!free))
26086 token = xen_spin_adjust(lock, token);
26087 do {
26088 count = 1 << 10;
26089 - __raw_spin_lock_body;
26090 + __ticket_spin_lock_body;
26091 } while (unlikely(!count) && !xen_spin_wait(lock, token));
26092 }
26093
26094 -static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26095 - unsigned long flags)
26096 +static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
26097 + unsigned long flags)
26098 {
26099 unsigned int token, count;
26100 bool free;
26101
26102 - __raw_spin_lock_preamble;
26103 + __ticket_spin_lock_preamble;
26104 if (unlikely(!free))
26105 token = xen_spin_adjust(lock, token);
26106 do {
26107 count = 1 << 10;
26108 - __raw_spin_lock_body;
26109 + __ticket_spin_lock_body;
26110 } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
26111 }
26112
26113 +#ifdef CONFIG_PARAVIRT
26114 +/*
26115 + * Define virtualization-friendly old-style lock byte lock, for use in
26116 + * pv_lock_ops if desired.
26117 + *
26118 + * This differs from the pre-2.6.24 spinlock by always using xchgb
26119 + * rather than decb to take the lock; this allows it to use a
26120 + * zero-initialized lock structure. It also maintains a 1-byte
26121 + * contention counter, so that we can implement
26122 + * __byte_spin_is_contended.
26123 + */
26124 +struct __byte_spinlock {
26125 + s8 lock;
26126 + s8 spinners;
26127 +};
26128 +
26129 +static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
26130 +{
26131 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26132 + return bl->lock != 0;
26133 +}
26134 +
26135 +static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
26136 +{
26137 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26138 + return bl->spinners != 0;
26139 +}
26140 +
26141 +static inline void __byte_spin_lock(raw_spinlock_t *lock)
26142 +{
26143 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26144 + s8 val = 1;
26145 +
26146 + asm("1: xchgb %1, %0\n"
26147 + " test %1,%1\n"
26148 + " jz 3f\n"
26149 + " " LOCK_PREFIX "incb %2\n"
26150 + "2: rep;nop\n"
26151 + " cmpb $1, %0\n"
26152 + " je 2b\n"
26153 + " " LOCK_PREFIX "decb %2\n"
26154 + " jmp 1b\n"
26155 + "3:"
26156 + : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
26157 +}
26158 +
26159 +static inline int __byte_spin_trylock(raw_spinlock_t *lock)
26160 +{
26161 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26162 + u8 old = 1;
26163 +
26164 + asm("xchgb %1,%0"
26165 + : "+m" (bl->lock), "+q" (old) : : "memory");
26166 +
26167 + return old == 0;
26168 +}
26169 +
26170 +static inline void __byte_spin_unlock(raw_spinlock_t *lock)
26171 +{
26172 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26173 + smp_wmb();
26174 + bl->lock = 0;
26175 +}
26176 +#else /* !CONFIG_PARAVIRT */
26177 +static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26178 +{
26179 + return __ticket_spin_is_locked(lock);
26180 +}
26181 +
26182 +static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26183 +{
26184 + return __ticket_spin_is_contended(lock);
26185 +}
26186 +
26187 +static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26188 +{
26189 + __ticket_spin_lock(lock);
26190 +}
26191 +
26192 +static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26193 + unsigned long flags)
26194 +{
26195 + __ticket_spin_lock_flags(lock, flags);
26196 +}
26197 +
26198 +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26199 +{
26200 + return __ticket_spin_trylock(lock);
26201 +}
26202 +
26203 +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26204 +{
26205 + __ticket_spin_unlock(lock);
26206 +}
26207 +#endif /* CONFIG_PARAVIRT */
26208 +
26209 static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
26210 {
26211 while (__raw_spin_is_locked(lock))
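
The __raw_spin_* to __ticket_spin_* renaming above keeps the same ticket-lock layout, so the two cheap checks can be tried out in isolation: for NR_CPUS < 256 the low byte of ->slock is the ticket currently being served and the high byte is the next ticket to hand out. The sketch below reproduces only the is_locked/is_contended arithmetic; the xadd preamble and the Xen poll path are deliberately omitted.

#include <stdio.h>

#define TICKET_SHIFT 8

static int ticket_is_locked(int slock)
{
        /* locked whenever the "now serving" and "next ticket" bytes differ */
        return !!(((slock >> TICKET_SHIFT) ^ slock) & ((1 << TICKET_SHIFT) - 1));
}

static int ticket_is_contended(int slock)
{
        /* contended when more than one ticket separates them */
        return (((slock >> TICKET_SHIFT) - slock) & ((1 << TICKET_SHIFT) - 1)) > 1;
}

int main(void)
{
        printf("free:      locked=%d contended=%d\n",
               ticket_is_locked(0x0505), ticket_is_contended(0x0505));
        printf("held:      locked=%d contended=%d\n",
               ticket_is_locked(0x0605), ticket_is_contended(0x0605));
        printf("contended: locked=%d contended=%d\n",
               ticket_is_locked(0x0805), ticket_is_contended(0x0805));
        return 0;
}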
26212 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/system.h
26213 ===================================================================
26214 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/system.h 2008-12-01 11:44:55.000000000 +0100
26215 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/system.h 2008-12-01 11:49:07.000000000 +0100
26216 @@ -137,7 +137,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
26217 #define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
26218 #define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
26219
26220 -extern void load_gs_index(unsigned);
26221 +extern void xen_load_gs_index(unsigned);
26222
26223 /*
26224 * Load a segment. Fall back on loading the zero
26225 @@ -154,14 +154,14 @@ extern void load_gs_index(unsigned);
26226 "jmp 2b\n" \
26227 ".previous\n" \
26228 _ASM_EXTABLE(1b,3b) \
26229 - : :"r" (value), "r" (0))
26230 + : :"r" (value), "r" (0) : "memory")
26231
26232
26233 /*
26234 * Save a segment register away
26235 */
26236 #define savesegment(seg, value) \
26237 - asm volatile("mov %%" #seg ",%0":"=rm" (value))
26238 + asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
26239
26240 static inline unsigned long get_limit(unsigned long segment)
26241 {
26242 @@ -269,6 +269,7 @@ static inline void xen_wbinvd(void)
26243 #ifdef CONFIG_X86_64
26244 #define read_cr8() (xen_read_cr8())
26245 #define write_cr8(x) (xen_write_cr8(x))
26246 +#define load_gs_index xen_load_gs_index
26247 #endif
26248
26249 /* Clear the 'TS' bit */
26250 @@ -287,13 +288,12 @@ static inline void clflush(volatile void
26251 void disable_hlt(void);
26252 void enable_hlt(void);
26253
26254 -extern int es7000_plat;
26255 void cpu_idle_wait(void);
26256
26257 extern unsigned long arch_align_stack(unsigned long sp);
26258 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
26259
26260 -void default_idle(void);
26261 +void xen_idle(void);
26262
26263 /*
26264 * Force strict CPU ordering.
26265 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/xor_64.h
26266 ===================================================================
26267 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/xor_64.h 2008-12-01 11:44:55.000000000 +0100
26268 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/xor_64.h 2008-12-01 11:49:07.000000000 +0100
26269 @@ -1,3 +1,6 @@
26270 +#ifndef ASM_X86__XOR_64_H
26271 +#define ASM_X86__XOR_64_H
26272 +
26273 /*
26274 * x86-64 changes / gcc fixes from Andi Kleen.
26275 * Copyright 2002 Andi Kleen, SuSE Labs.
26276 @@ -330,3 +333,5 @@ do { \
26277 We may also be able to load into the L1 only depending on how the cpu
26278 deals with a load to a line that is being prefetched. */
26279 #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
26280 +
26281 +#endif /* ASM_X86__XOR_64_H */
26282 Index: head-2008-12-01/include/asm-x86/mach-xen/irq_vectors.h
26283 ===================================================================
26284 --- head-2008-12-01.orig/include/asm-x86/mach-xen/irq_vectors.h 2008-12-01 11:37:10.000000000 +0100
26285 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26286 @@ -1,126 +0,0 @@
26287 -/*
26288 - * This file should contain #defines for all of the interrupt vector
26289 - * numbers used by this architecture.
26290 - *
26291 - * In addition, there are some standard defines:
26292 - *
26293 - * FIRST_EXTERNAL_VECTOR:
26294 - * The first free place for external interrupts
26295 - *
26296 - * SYSCALL_VECTOR:
26297 - * The IRQ vector a syscall makes the user to kernel transition
26298 - * under.
26299 - *
26300 - * TIMER_IRQ:
26301 - * The IRQ number the timer interrupt comes in at.
26302 - *
26303 - * NR_IRQS:
26304 - * The total number of interrupt vectors (including all the
26305 - * architecture specific interrupts) needed.
26306 - *
26307 - */
26308 -#ifndef _ASM_IRQ_VECTORS_H
26309 -#define _ASM_IRQ_VECTORS_H
26310 -
26311 -/*
26312 - * IDT vectors usable for external interrupt sources start
26313 - * at 0x20:
26314 - */
26315 -#define FIRST_EXTERNAL_VECTOR 0x20
26316 -
26317 -#define SYSCALL_VECTOR 0x80
26318 -
26319 -/*
26320 - * Vectors 0x20-0x2f are used for ISA interrupts.
26321 - */
26322 -
26323 -#if 0
26324 -/*
26325 - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
26326 - *
26327 - * some of the following vectors are 'rare', they are merged
26328 - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
26329 - * TLB, reschedule and local APIC vectors are performance-critical.
26330 - *
26331 - * Vectors 0xf0-0xfa are free (reserved for future Linux use).
26332 - */
26333 -#define SPURIOUS_APIC_VECTOR 0xff
26334 -#define ERROR_APIC_VECTOR 0xfe
26335 -#define INVALIDATE_TLB_VECTOR 0xfd
26336 -#define RESCHEDULE_VECTOR 0xfc
26337 -#define CALL_FUNCTION_VECTOR 0xfb
26338 -
26339 -#define THERMAL_APIC_VECTOR 0xf0
26340 -/*
26341 - * Local APIC timer IRQ vector is on a different priority level,
26342 - * to work around the 'lost local interrupt if more than 2 IRQ
26343 - * sources per level' errata.
26344 - */
26345 -#define LOCAL_TIMER_VECTOR 0xef
26346 -#endif
26347 -
26348 -#define SPURIOUS_APIC_VECTOR 0xff
26349 -#define ERROR_APIC_VECTOR 0xfe
26350 -
26351 -/*
26352 - * First APIC vector available to drivers: (vectors 0x30-0xee)
26353 - * we start at 0x31 to spread out vectors evenly between priority
26354 - * levels. (0x80 is the syscall vector)
26355 - */
26356 -#define FIRST_DEVICE_VECTOR 0x31
26357 -#define FIRST_SYSTEM_VECTOR 0xef
26358 -
26359 -/*
26360 - * 16 8259A IRQ's, 208 potential APIC interrupt sources.
26361 - * Right now the APIC is mostly only used for SMP.
26362 - * 256 vectors is an architectural limit. (we can have
26363 - * more than 256 devices theoretically, but they will
26364 - * have to use shared interrupts)
26365 - * Since vectors 0x00-0x1f are used/reserved for the CPU,
26366 - * the usable vector space is 0x20-0xff (224 vectors)
26367 - */
26368 -
26369 -#define RESCHEDULE_VECTOR 0
26370 -#define CALL_FUNCTION_VECTOR 1
26371 -#define SPIN_UNLOCK_VECTOR 2
26372 -#define NR_IPIS 3
26373 -
26374 -/*
26375 - * The maximum number of vectors supported by i386 processors
26376 - * is limited to 256. For processors other than i386, NR_VECTORS
26377 - * should be changed accordingly.
26378 - */
26379 -#define NR_VECTORS 256
26380 -
26381 -#define FPU_IRQ 13
26382 -
26383 -#define FIRST_VM86_IRQ 3
26384 -#define LAST_VM86_IRQ 15
26385 -#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
26386 -
26387 -/*
26388 - * The flat IRQ space is divided into two regions:
26389 - * 1. A one-to-one mapping of real physical IRQs. This space is only used
26390 - * if we have physical device-access privilege. This region is at the
26391 - * start of the IRQ space so that existing device drivers do not need
26392 - * to be modified to translate physical IRQ numbers into our IRQ space.
26393 - * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
26394 - * are bound using the provided bind/unbind functions.
26395 - */
26396 -
26397 -#define PIRQ_BASE 0
26398 -#if !defined(MAX_IO_APICS)
26399 -# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26400 -#elif NR_CPUS < MAX_IO_APICS
26401 -# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26402 -#else
26403 -# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
26404 -#endif
26405 -
26406 -#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
26407 -#define NR_DYNIRQS 256
26408 -
26409 -#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
26410 -#define NR_IRQ_VECTORS NR_IRQS
26411 -
26412 -#endif /* _ASM_IRQ_VECTORS_H */
26413 Index: head-2008-12-01/include/asm-x86/mach-xen/setup_arch_post.h
26414 ===================================================================
26415 --- head-2008-12-01.orig/include/asm-x86/mach-xen/setup_arch_post.h 2008-12-03 15:48:43.000000000 +0100
26416 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26417 @@ -1,63 +0,0 @@
26418 -/**
26419 - * machine_specific_* - Hooks for machine specific setup.
26420 - *
26421 - * Description:
26422 - * This is included late in kernel/setup.c so that it can make
26423 - * use of all of the static functions.
26424 - **/
26425 -
26426 -#include <xen/interface/callback.h>
26427 -
26428 -extern void hypervisor_callback(void);
26429 -extern void failsafe_callback(void);
26430 -extern void nmi(void);
26431 -
26432 -static void __init machine_specific_arch_setup(void)
26433 -{
26434 - int ret;
26435 - static struct callback_register __initdata event = {
26436 - .type = CALLBACKTYPE_event,
26437 - .address = (unsigned long) hypervisor_callback,
26438 - };
26439 - static struct callback_register __initdata failsafe = {
26440 - .type = CALLBACKTYPE_failsafe,
26441 - .address = (unsigned long)failsafe_callback,
26442 - };
26443 - static struct callback_register __initdata syscall = {
26444 - .type = CALLBACKTYPE_syscall,
26445 - .address = (unsigned long)system_call,
26446 - };
26447 -#ifdef CONFIG_X86_LOCAL_APIC
26448 - static struct callback_register __initdata nmi_cb = {
26449 - .type = CALLBACKTYPE_nmi,
26450 - .address = (unsigned long)nmi,
26451 - };
26452 -#endif
26453 -
26454 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
26455 - if (ret == 0)
26456 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
26457 - if (ret == 0)
26458 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
26459 -#if CONFIG_XEN_COMPAT <= 0x030002
26460 - if (ret == -ENOSYS)
26461 - ret = HYPERVISOR_set_callbacks(
26462 - event.address,
26463 - failsafe.address,
26464 - syscall.address);
26465 -#endif
26466 - BUG_ON(ret);
26467 -
26468 -#ifdef CONFIG_X86_LOCAL_APIC
26469 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
26470 -#if CONFIG_XEN_COMPAT <= 0x030002
26471 - if (ret == -ENOSYS) {
26472 - static struct xennmi_callback __initdata cb = {
26473 - .handler_address = (unsigned long)nmi
26474 - };
26475 -
26476 - HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
26477 - }
26478 -#endif
26479 -#endif
26480 -}
26481 Index: head-2008-12-01/include/asm-x86/mach-xen/setup_arch_pre.h
26482 ===================================================================
26483 --- head-2008-12-01.orig/include/asm-x86/mach-xen/setup_arch_pre.h 2008-12-03 15:48:43.000000000 +0100
26484 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26485 @@ -1,5 +0,0 @@
26486 -/* Hook to call BIOS initialisation function */
26487 -
26488 -#define ARCH_SETUP machine_specific_arch_setup();
26489 -
26490 -static void __init machine_specific_arch_setup(void);
26491 Index: head-2008-12-01/include/asm-x86/traps.h
26492 ===================================================================
26493 --- head-2008-12-01.orig/include/asm-x86/traps.h 2008-12-03 15:48:43.000000000 +0100
26494 +++ head-2008-12-01/include/asm-x86/traps.h 2008-12-01 11:49:07.000000000 +0100
26495 @@ -23,6 +23,9 @@ asmlinkage void spurious_interrupt_bug(v
26496 #ifdef CONFIG_X86_MCE
26497 asmlinkage void machine_check(void);
26498 #endif /* CONFIG_X86_MCE */
26499 +#ifdef CONFIG_X86_XEN
26500 +asmlinkage void fixup_4gb_segment(void);
26501 +#endif
26502
26503 void do_divide_error(struct pt_regs *, long);
26504 void do_overflow(struct pt_regs *, long);
26505 @@ -48,6 +51,9 @@ void math_error(void __user *);
26506 void do_coprocessor_error(struct pt_regs *, long);
26507 void do_simd_coprocessor_error(struct pt_regs *, long);
26508 void do_spurious_interrupt_bug(struct pt_regs *, long);
26509 +#ifdef CONFIG_XEN
26510 +void do_fixup_4gb_segment(struct pt_regs *, long);
26511 +#endif
26512 unsigned long patch_espfix_desc(unsigned long, unsigned long);
26513 asmlinkage void math_emulate(long);
26514
26515 Index: head-2008-12-01/include/asm-x86/xen/hypercall.h
26516 ===================================================================
26517 --- head-2008-12-01.orig/include/asm-x86/xen/hypercall.h 2008-12-03 15:48:43.000000000 +0100
26518 +++ head-2008-12-01/include/asm-x86/xen/hypercall.h 2008-12-01 11:49:07.000000000 +0100
26519 @@ -264,7 +264,7 @@ HYPERVISOR_fpu_taskswitch(int set)
26520 static inline int
26521 HYPERVISOR_sched_op(int cmd, void *arg)
26522 {
26523 - return _hypercall2(int, sched_op_new, cmd, arg);
26524 + return _hypercall2(int, sched_op, cmd, arg);
26525 }
26526
26527 static inline long
26528 Index: head-2008-12-01/include/asm-x86/xen/interface_64.h
26529 ===================================================================
26530 --- head-2008-12-01.orig/include/asm-x86/xen/interface_64.h 2008-12-03 15:48:43.000000000 +0100
26531 +++ head-2008-12-01/include/asm-x86/xen/interface_64.h 2008-12-01 11:49:07.000000000 +0100
26532 @@ -136,7 +136,7 @@ struct cpu_user_regs {
26533 uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
26534 uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
26535 };
26536 -DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
26537 +DEFINE_XEN_GUEST_HANDLE_STRUCT(cpu_user_regs);
26538
26539 #undef __DECL_REG
26540
26541 Index: head-2008-12-01/include/linux/page-flags.h
26542 ===================================================================
26543 --- head-2008-12-01.orig/include/linux/page-flags.h 2008-12-01 11:48:52.000000000 +0100
26544 +++ head-2008-12-01/include/linux/page-flags.h 2008-12-01 11:49:07.000000000 +0100
26545 @@ -109,9 +109,11 @@ enum pageflags {
26546 /* Filesystems */
26547 PG_checked = PG_owner_priv_1,
26548
26549 +#ifdef CONFIG_PARAVIRT_XEN
26550 /* XEN */
26551 PG_pinned = PG_owner_priv_1,
26552 PG_savepinned = PG_dirty,
26553 +#endif
26554
26555 /* SLOB */
26556 PG_slob_page = PG_active,
26557 @@ -185,8 +187,12 @@ PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU,
26558 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
26559 __PAGEFLAG(Slab, slab)
26560 PAGEFLAG(Checked, checked) /* Used by some filesystems */
26561 +#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
26562 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
26563 +#endif
26564 +#ifdef CONFIG_PARAVIRT_XEN
26565 PAGEFLAG(SavePinned, savepinned); /* Xen */
26566 +#endif
26567 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
26568 PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
26569 __SETPAGEFLAG(Private, private)
26570 Index: head-2008-12-01/include/xen/interface/memory.h
26571 ===================================================================
26572 --- head-2008-12-01.orig/include/xen/interface/memory.h 2008-12-01 11:44:55.000000000 +0100
26573 +++ head-2008-12-01/include/xen/interface/memory.h 2008-12-01 11:49:07.000000000 +0100
26574 @@ -82,6 +82,7 @@ struct xen_memory_reservation {
26575 domid_t domid;
26576
26577 };
26578 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_memory_reservation);
26579 typedef struct xen_memory_reservation xen_memory_reservation_t;
26580 DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
26581
26582 @@ -167,11 +168,7 @@ struct xen_machphys_mfn_list {
26583 * any large discontiguities in the machine address space, 2MB gaps in
26584 * the machphys table will be represented by an MFN base of zero.
26585 */
26586 -#ifndef CONFIG_PARAVIRT_XEN
26587 XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
26588 -#else
26589 - ulong extent_start;
26590 -#endif
26591
26592 /*
26593 * Number of extents written to the above array. This will be smaller
26594 @@ -179,6 +176,7 @@ struct xen_machphys_mfn_list {
26595 */
26596 unsigned int nr_extents;
26597 };
26598 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
26599 typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
26600 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
26601
26602 @@ -218,6 +216,7 @@ struct xen_add_to_physmap {
26603 /* GPFN where the source mapping page should appear. */
26604 xen_pfn_t gpfn;
26605 };
26606 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
26607 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
26608 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
26609
26610 @@ -250,21 +249,13 @@ struct xen_translate_gpfn_list {
26611 xen_ulong_t nr_gpfns;
26612
26613 /* List of GPFNs to translate. */
26614 -#ifndef CONFIG_PARAVIRT_XEN
26615 XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
26616 -#else
26617 - ulong gpfn_list;
26618 -#endif
26619
26620 /*
26621 * Output list to contain MFN translations. May be the same as the input
26622 * list (in which case each input GPFN is overwritten with the output MFN).
26623 */
26624 -#ifndef CONFIG_PARAVIRT_XEN
26625 XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
26626 -#else
26627 - ulong mfn_list;
26628 -#endif
26629 };
26630 DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
26631 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
26632 Index: head-2008-12-01/kernel/hrtimer.c
26633 ===================================================================
26634 --- head-2008-12-01.orig/kernel/hrtimer.c 2008-12-03 15:48:43.000000000 +0100
26635 +++ head-2008-12-01/kernel/hrtimer.c 2008-12-01 11:49:07.000000000 +0100
26636 @@ -1084,7 +1084,7 @@ ktime_t hrtimer_get_remaining(const stru
26637 }
26638 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
26639
26640 -#ifdef CONFIG_NO_HZ
26641 +#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26642 /**
26643 * hrtimer_get_next_event - get the time until next expiry event
26644 *
26645 Index: head-2008-12-01/kernel/timer.c
26646 ===================================================================
26647 --- head-2008-12-01.orig/kernel/timer.c 2008-12-03 15:48:43.000000000 +0100
26648 +++ head-2008-12-01/kernel/timer.c 2008-12-01 11:49:07.000000000 +0100
26649 @@ -815,7 +815,7 @@ static inline void __run_timers(struct t
26650 spin_unlock_irq(&base->lock);
26651 }
26652
26653 -#ifdef CONFIG_NO_HZ
26654 +#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26655 /*
26656 * Find out when the next timer event is due to happen. This
26657 * is used on S/390 to stop all activity when a cpus is idle.
26658 Index: head-2008-12-01/lib/swiotlb-xen.c
26659 ===================================================================
26660 --- head-2008-12-01.orig/lib/swiotlb-xen.c 2008-12-01 11:44:55.000000000 +0100
26661 +++ head-2008-12-01/lib/swiotlb-xen.c 2008-12-01 11:49:07.000000000 +0100
26662 @@ -750,7 +750,7 @@ swiotlb_sync_sg_for_device(struct device
26663 }
26664
26665 int
26666 -swiotlb_dma_mapping_error(dma_addr_t dma_addr)
26667 +swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
26668 {
26669 return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
26670 }
26671 Index: head-2008-12-01/mm/mprotect.c
26672 ===================================================================
26673 --- head-2008-12-01.orig/mm/mprotect.c 2008-12-01 11:29:05.000000000 +0100
26674 +++ head-2008-12-01/mm/mprotect.c 2008-12-01 11:49:07.000000000 +0100
26675 @@ -92,8 +92,6 @@ static inline void change_pmd_range(stru
26676 next = pmd_addr_end(addr, end);
26677 if (pmd_none_or_clear_bad(pmd))
26678 continue;
26679 - if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
26680 - continue;
26681 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
26682 } while (pmd++, addr = next, addr != end);
26683 }