1 From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
2 Subject: [PATCH] Linux: Update to 2.6.27
5 This patch contains the differences between Linux 2.6.26 and 2.6.27.
7 Acked-by: Jeff Mahoney <jeffm@suse.com>
8 Automatically created from "patches.kernel.org/patch-2.6.27" by xen-port-patches.py
10 Index: head-2008-12-01/arch/x86/Kconfig
11 ===================================================================
12 --- head-2008-12-01.orig/arch/x86/Kconfig 2008-12-01 11:44:55.000000000 +0100
13 +++ head-2008-12-01/arch/x86/Kconfig 2008-12-01 11:49:07.000000000 +0100
14 @@ -590,7 +590,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
16 bool "AMD IOMMU support"
18 - depends on X86_64 && PCI && ACPI
19 + depends on X86_64 && PCI && ACPI && !X86_64_XEN
21 With this option you can enable support for AMD IOMMU hardware in
22 your system. An IOMMU is a hardware component which provides
23 @@ -625,8 +625,10 @@ config MAXSMP
26 int "Maximum number of CPUs (2-512)" if !MAXSMP
30 + default "32" if MAXSMP && XEN
31 default "4096" if MAXSMP
32 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
33 default "16" if X86_64_XEN
34 @@ -1223,7 +1225,7 @@ config MTRR
37 prompt "MTRR cleanup support"
39 + depends on MTRR && !XEN
41 Convert MTRR layout from continuous to discrete, so X drivers can
42 add writeback entries.
43 Index: head-2008-12-01/arch/x86/Kconfig.debug
44 ===================================================================
45 --- head-2008-12-01.orig/arch/x86/Kconfig.debug 2008-12-01 11:37:10.000000000 +0100
46 +++ head-2008-12-01/arch/x86/Kconfig.debug 2008-12-01 11:49:07.000000000 +0100
47 @@ -25,6 +25,7 @@ config STRICT_DEVMEM
48 config X86_VERBOSE_BOOTUP
49 bool "Enable verbose x86 bootup info messages"
53 Enables the informational output from the decompression stage
54 (e.g. bzImage) of the boot. If you disable this you will still
55 @@ -166,7 +167,7 @@ config MMIOTRACE_HOOKS
58 bool "Memory mapped IO tracing"
59 - depends on DEBUG_KERNEL && PCI
60 + depends on DEBUG_KERNEL && PCI && !XEN
62 select MMIOTRACE_HOOKS
64 Index: head-2008-12-01/arch/x86/Makefile
65 ===================================================================
66 --- head-2008-12-01.orig/arch/x86/Makefile 2008-12-01 11:36:55.000000000 +0100
67 +++ head-2008-12-01/arch/x86/Makefile 2008-12-01 11:49:07.000000000 +0100
68 @@ -116,8 +116,8 @@ mflags-$(CONFIG_X86_VOYAGER) := -Iinclud
69 mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
72 -mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen
73 -mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/
74 +mflags-$(CONFIG_XEN) := -Iinclude/asm-x86/mach-xen
75 +mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
77 # generic subarchitecture
78 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
79 @@ -128,8 +128,10 @@ mcore-$(CONFIG_X86_GENERICARCH) := arch/
80 mflags-y += -Iinclude/asm-x86/mach-default
82 # 64 bit does not support subarch support - clear sub arch variables
83 +ifneq ($(CONFIG_XEN),y)
84 fcore-$(CONFIG_X86_64) :=
85 mcore-$(CONFIG_X86_64) :=
88 KBUILD_CFLAGS += $(mflags-y)
89 KBUILD_AFLAGS += $(mflags-y)
90 Index: head-2008-12-01/arch/x86/ia32/ia32entry-xen.S
91 ===================================================================
92 --- head-2008-12-01.orig/arch/x86/ia32/ia32entry-xen.S 2008-12-01 11:44:55.000000000 +0100
93 +++ head-2008-12-01/arch/x86/ia32/ia32entry-xen.S 2008-12-01 11:49:07.000000000 +0100
95 #include <asm/irqflags.h>
96 #include <linux/linkage.h>
98 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
99 +#include <linux/elf-em.h>
100 +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
101 +#define __AUDIT_ARCH_LE 0x40000000
103 +#ifndef CONFIG_AUDITSYSCALL
104 +#define sysexit_audit int_ret_from_sys_call
105 +#define sysretl_audit int_ret_from_sys_call
108 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
110 .macro IA32_ARG_FIXUP noebp=0
116 + * Reload arg registers from stack in case ptrace changed them.
117 + * We don't reload %eax because syscall_trace_enter() returned
118 + * the value it wants us to use in the table lookup.
120 .macro LOAD_ARGS32 offset
121 movl \offset(%rsp),%r11d
122 movl \offset+8(%rsp),%r10d
124 movl \offset+48(%rsp),%edx
125 movl \offset+56(%rsp),%esi
126 movl \offset+64(%rsp),%edi
127 - movl \offset+72(%rsp),%eax
130 .macro CFI_STARTPROC32 simple
135 +#ifdef CONFIG_PARAVIRT
136 +ENTRY(native_usergs_sysret32)
139 +ENDPROC(native_usergs_sysret32)
141 +ENTRY(native_irq_enable_sysexit)
145 +ENDPROC(native_irq_enable_sysexit)
149 * 32bit SYSENTER instruction entry.
151 @@ -98,7 +125,7 @@ ENTRY(ia32_sysenter_target)
153 movl %ebp,%ebp /* zero extension */
155 - movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
156 + movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
157 movl $__USER32_DS,40(%rsp)
159 movl $__USER32_CS,16(%rsp)
160 @@ -113,19 +140,79 @@ ENTRY(ia32_sysenter_target)
163 GET_THREAD_INFO(%r10)
164 - orl $TS_COMPAT,threadinfo_status(%r10)
165 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
166 + orl $TS_COMPAT,TI_status(%r10)
167 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
168 jnz sysenter_tracesys
170 cmpl $(IA32_NR_syscalls-1),%eax
175 call *ia32_sys_call_table(,%rax,8)
176 movq %rax,RAX-ARGOFFSET(%rsp)
177 + GET_THREAD_INFO(%r10)
178 + DISABLE_INTERRUPTS(CLBR_NONE)
180 + testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
182 + jmp int_ret_from_sys_call
184 +#ifdef CONFIG_AUDITSYSCALL
185 + .macro auditsys_entry_common
186 + movl %esi,%r9d /* 6th arg: 4th syscall arg */
187 + movl %edx,%r8d /* 5th arg: 3rd syscall arg */
188 + /* (already in %ecx) 4th arg: 2nd syscall arg */
189 + movl %ebx,%edx /* 3rd arg: 1st syscall arg */
190 + movl %eax,%esi /* 2nd arg: syscall number */
191 + movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
192 + call audit_syscall_entry
193 + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
194 + cmpl $(IA32_NR_syscalls-1),%eax
196 + movl %ebx,%edi /* reload 1st syscall arg */
197 + movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
198 + movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
199 + movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
200 + movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
203 + .macro auditsys_exit exit,ebpsave=RBP
204 + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
205 + jnz int_ret_from_sys_call
207 + ENABLE_INTERRUPTS(CLBR_NONE)
208 + movl %eax,%esi /* second arg, syscall return value */
209 + cmpl $0,%eax /* is it < 0? */
210 + setl %al /* 1 if so, 0 if not */
211 + movzbl %al,%edi /* zero-extend that into %edi */
212 + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
213 + call audit_syscall_exit
214 + GET_THREAD_INFO(%r10)
215 + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
216 + movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
217 + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
218 + DISABLE_INTERRUPTS(CLBR_NONE)
220 + testl %edi,TI_flags(%r10)
222 jmp int_ret_from_sys_call
226 + auditsys_entry_common
227 + movl %ebp,%r9d /* reload 6th syscall arg */
228 + jmp sysenter_dispatch
231 + auditsys_exit sysexit_from_sys_call
236 +#ifdef CONFIG_AUDITSYSCALL
237 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
238 + jz sysenter_auditsys
243 @@ -186,18 +273,38 @@ ENTRY(ia32_cstar_target)
246 GET_THREAD_INFO(%r10)
247 - orl $TS_COMPAT,threadinfo_status(%r10)
248 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
249 + orl $TS_COMPAT,TI_status(%r10)
250 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
253 cmpl $IA32_NR_syscalls-1,%eax
257 call *ia32_sys_call_table(,%rax,8)
258 movq %rax,RAX-ARGOFFSET(%rsp)
259 + GET_THREAD_INFO(%r10)
260 + DISABLE_INTERRUPTS(CLBR_NONE)
261 + testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
263 jmp int_ret_from_sys_call
266 +#ifdef CONFIG_AUDITSYSCALL
268 + movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
269 + auditsys_entry_common
270 + movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
274 + auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
278 +#ifdef CONFIG_AUDITSYSCALL
279 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
285 @@ -263,8 +370,8 @@ ENTRY(ia32_syscall)
286 this could be a problem. */
288 GET_THREAD_INFO(%r10)
289 - orl $TS_COMPAT,threadinfo_status(%r10)
290 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
291 + orl $TS_COMPAT,TI_status(%r10)
292 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
295 cmpl $(IA32_NR_syscalls-1),%eax
296 @@ -309,13 +416,11 @@ quiet_ni_syscall:
297 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
298 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
299 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
300 - PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
301 PTREGSCALL stub32_execve, sys32_execve, %rcx
302 PTREGSCALL stub32_fork, sys_fork, %rdi
303 PTREGSCALL stub32_clone, sys32_clone, %rdx
304 PTREGSCALL stub32_vfork, sys_vfork, %rdi
305 PTREGSCALL stub32_iopl, sys_iopl, %rsi
306 - PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
308 ENTRY(ia32_ptregs_common)
310 @@ -415,7 +520,7 @@ ia32_sys_call_table:
312 .quad sys_setreuid16 /* 70 */
314 - .quad stub32_sigsuspend
315 + .quad sys32_sigsuspend
316 .quad compat_sys_sigpending
317 .quad sys_sethostname
318 .quad compat_sys_setrlimit /* 75 */
319 @@ -522,7 +627,7 @@ ia32_sys_call_table:
320 .quad sys32_rt_sigpending
321 .quad compat_sys_rt_sigtimedwait
322 .quad sys32_rt_sigqueueinfo
323 - .quad stub32_rt_sigsuspend
324 + .quad sys_rt_sigsuspend
325 .quad sys32_pread /* 180 */
328 @@ -670,4 +775,10 @@ ia32_sys_call_table:
329 .quad sys32_fallocate
330 .quad compat_sys_timerfd_settime /* 325 */
331 .quad compat_sys_timerfd_gettime
332 + .quad compat_sys_signalfd4
334 + .quad sys_epoll_create1
335 + .quad sys_dup3 /* 330 */
337 + .quad sys_inotify_init1
339 Index: head-2008-12-01/arch/x86/kernel/Makefile
340 ===================================================================
341 --- head-2008-12-01.orig/arch/x86/kernel/Makefile 2008-12-01 11:44:55.000000000 +0100
342 +++ head-2008-12-01/arch/x86/kernel/Makefile 2008-12-01 11:49:07.000000000 +0100
343 @@ -120,9 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
345 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
347 - obj-$(CONFIG_XEN) += nmi_64.o
348 + obj-$(CONFIG_XEN) += nmi.o
349 time_64-$(CONFIG_XEN) += time_32.o
352 -disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
353 - pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
354 +disabled-obj-$(CONFIG_XEN) := bios_uv.o early-quirks.o hpet.o i8253.o \
355 + i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \
356 + tlb_$(BITS).o tsc.o tsc_sync.o vsmp_64.o
357 Index: head-2008-12-01/arch/x86/kernel/acpi/boot.c
358 ===================================================================
359 --- head-2008-12-01.orig/arch/x86/kernel/acpi/boot.c 2008-12-01 11:44:55.000000000 +0100
360 +++ head-2008-12-01/arch/x86/kernel/acpi/boot.c 2008-12-01 11:49:07.000000000 +0100
361 @@ -951,7 +951,9 @@ void __init mp_register_ioapic(int id, u
362 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
363 mp_ioapics[idx].mp_apicaddr = address;
366 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
368 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
370 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
371 @@ -1108,7 +1110,7 @@ int mp_register_gsi(u32 gsi, int trigger
375 -#ifdef CONFIG_X86_32
376 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
377 #define MAX_GSI_NUM 4096
378 #define IRQ_COMPRESSION_START 64
380 @@ -1156,7 +1158,7 @@ int mp_register_gsi(u32 gsi, int trigger
381 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
382 pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
383 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
384 -#ifdef CONFIG_X86_32
385 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
386 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
389 @@ -1164,7 +1166,7 @@ int mp_register_gsi(u32 gsi, int trigger
392 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
393 -#ifdef CONFIG_X86_32
394 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
396 * For GSI >= 64, use IRQ compression
398 Index: head-2008-12-01/arch/x86/kernel/acpi/sleep-xen.c
399 ===================================================================
400 --- head-2008-12-01.orig/arch/x86/kernel/acpi/sleep-xen.c 2008-12-01 11:44:55.000000000 +0100
401 +++ head-2008-12-01/arch/x86/kernel/acpi/sleep-xen.c 2008-12-01 11:49:07.000000000 +0100
403 #include <linux/bootmem.h>
404 #include <linux/dmi.h>
405 #include <linux/cpumask.h>
406 +#include <asm/segment.h>
408 #include "realmode/wakeup.h"
410 @@ -20,7 +21,7 @@ unsigned long acpi_realmode_flags;
411 /* address in low memory of the wakeup routine. */
412 static unsigned long acpi_realmode;
415 +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
416 static char temp_stack[10240];
419 @@ -54,18 +55,27 @@ int acpi_save_state_mem(void)
420 header->video_mode = saved_video_mode;
422 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
425 + * Set up the wakeup GDT. We set these up as Big Real Mode,
426 + * that is, with limits set to 4 GB. At least the Lenovo
427 + * Thinkpad X61 is known to need this for the video BIOS
428 + * initialization quirk to work; this is likely to also
429 + * be the case for other laptops or integrated video devices.
432 /* GDT[0]: GDT self-pointer */
433 header->wakeup_gdt[0] =
434 (u64)(sizeof(header->wakeup_gdt) - 1) +
435 ((u64)(acpi_wakeup_address +
436 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
438 - /* GDT[1]: real-mode-like code segment */
439 - header->wakeup_gdt[1] = (0x009bULL << 40) +
440 - ((u64)acpi_wakeup_address << 16) + 0xffff;
441 - /* GDT[2]: real-mode-like data segment */
442 - header->wakeup_gdt[2] = (0x0093ULL << 40) +
443 - ((u64)acpi_wakeup_address << 16) + 0xffff;
444 + /* GDT[1]: big real mode-like code segment */
445 + header->wakeup_gdt[1] =
446 + GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
447 + /* GDT[2]: big real mode-like data segment */
448 + header->wakeup_gdt[2] =
449 + GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
452 store_gdt((struct desc_ptr *)&header->pmode_gdt);
453 @@ -79,7 +89,7 @@ int acpi_save_state_mem(void)
454 #endif /* !CONFIG_64BIT */
456 header->pmode_cr0 = read_cr0();
457 - header->pmode_cr4 = read_cr4();
458 + header->pmode_cr4 = read_cr4_safe();
459 header->realmode_flags = acpi_realmode_flags;
460 header->real_magic = 0x12345678;
462 @@ -89,7 +99,9 @@ int acpi_save_state_mem(void)
463 saved_magic = 0x12345678;
464 #else /* CONFIG_64BIT */
465 header->trampoline_segment = setup_trampoline() >> 4;
466 - init_rsp = (unsigned long)temp_stack + 4096;
468 + stack_start.sp = temp_stack + 4096;
470 initial_code = (unsigned long)wakeup_long64;
471 saved_magic = 0x123456789abcdef0;
472 #endif /* CONFIG_64BIT */
473 @@ -145,6 +157,12 @@ static int __init acpi_sleep_setup(char
474 acpi_realmode_flags |= 2;
475 if (strncmp(str, "s3_beep", 7) == 0)
476 acpi_realmode_flags |= 4;
477 +#ifdef CONFIG_HIBERNATION
478 + if (strncmp(str, "s4_nohwsig", 10) == 0)
479 + acpi_no_s4_hw_signature();
481 + if (strncmp(str, "old_ordering", 12) == 0)
482 + acpi_old_suspend_ordering();
483 str = strchr(str, ',');
485 str += strspn(str, ", \t");
486 Index: head-2008-12-01/arch/x86/kernel/apic_32-xen.c
487 ===================================================================
488 --- head-2008-12-01.orig/arch/x86/kernel/apic_32-xen.c 2008-12-01 11:37:10.000000000 +0100
489 +++ head-2008-12-01/arch/x86/kernel/apic_32-xen.c 2008-12-01 11:49:07.000000000 +0100
490 @@ -59,7 +59,10 @@ static cpumask_t timer_bcast_ipi;
492 * Debug level, exported for io_apic.c
495 +unsigned int apic_verbosity;
497 +/* Have we found an MP table */
498 +int smp_found_config;
501 static int modern_apic(void)
502 Index: head-2008-12-01/arch/x86/kernel/apic_64-xen.c
503 ===================================================================
504 --- head-2008-12-01.orig/arch/x86/kernel/apic_64-xen.c 2008-12-01 11:37:10.000000000 +0100
505 +++ head-2008-12-01/arch/x86/kernel/apic_64-xen.c 2008-12-01 11:49:07.000000000 +0100
506 @@ -39,7 +39,10 @@ int disable_apic;
508 * Debug level, exported for io_apic.c
511 +unsigned int apic_verbosity;
513 +/* Have we found an MP table */
514 +int smp_found_config;
517 * The guts of the apic timer interrupt
518 Index: head-2008-12-01/arch/x86/kernel/asm-offsets_64.c
519 ===================================================================
520 --- head-2008-12-01.orig/arch/x86/kernel/asm-offsets_64.c 2008-12-03 15:48:43.000000000 +0100
521 +++ head-2008-12-01/arch/x86/kernel/asm-offsets_64.c 2008-12-01 11:49:07.000000000 +0100
522 @@ -138,7 +138,7 @@ int main(void)
525 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
527 +#ifdef CONFIG_PARAVIRT_XEN
529 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
530 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
531 Index: head-2008-12-01/arch/x86/kernel/cpu/amd_64.c
532 ===================================================================
533 --- head-2008-12-01.orig/arch/x86/kernel/cpu/amd_64.c 2008-12-03 15:48:43.000000000 +0100
534 +++ head-2008-12-01/arch/x86/kernel/cpu/amd_64.c 2008-12-01 11:49:07.000000000 +0100
535 @@ -193,6 +193,7 @@ static void __cpuinit init_amd(struct cp
536 fam10h_check_enable_mmcfg();
540 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
541 unsigned long long tseg;
543 @@ -211,6 +212,7 @@ static void __cpuinit init_amd(struct cp
544 set_memory_4k((unsigned long)__va(tseg), 1);
550 static struct cpu_dev amd_cpu_dev __cpuinitdata = {
551 Index: head-2008-12-01/arch/x86/kernel/cpu/bugs_64.c
552 ===================================================================
553 --- head-2008-12-01.orig/arch/x86/kernel/cpu/bugs_64.c 2008-12-03 15:48:43.000000000 +0100
554 +++ head-2008-12-01/arch/x86/kernel/cpu/bugs_64.c 2008-12-01 11:49:07.000000000 +0100
555 @@ -20,6 +20,7 @@ void __init check_bugs(void)
557 alternative_instructions();
561 * Make sure the first 2MB area is not mapped by huge pages
562 * There are typically fixed size MTRRs in there and overlapping
563 @@ -30,4 +31,5 @@ void __init check_bugs(void)
566 set_memory_4k((unsigned long)__va(0), 1);
569 Index: head-2008-12-01/arch/x86/kernel/cpu/common-xen.c
570 ===================================================================
571 --- head-2008-12-01.orig/arch/x86/kernel/cpu/common-xen.c 2008-12-01 11:44:55.000000000 +0100
572 +++ head-2008-12-01/arch/x86/kernel/cpu/common-xen.c 2008-12-01 11:49:07.000000000 +0100
574 #include <asm/mtrr.h>
577 +#include <asm/asm.h>
578 #ifdef CONFIG_X86_LOCAL_APIC
579 #include <asm/mpspec.h>
580 #include <asm/apic.h>
581 @@ -341,11 +342,24 @@ static void __init early_cpu_detect(void
583 get_cpu_vendor(c, 1);
587 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
588 cpu_devs[c->x86_vendor]->c_early_init)
589 cpu_devs[c->x86_vendor]->c_early_init(c);
594 + * The NOPL instruction is supposed to exist on all CPUs with
595 + * family >= 6; unfortunately, that's not true in practice because
596 + * of early VIA chips and (more importantly) broken virtualizers that
597 + * are not easy to detect. In the latter case it doesn't even *fail*
598 + * reliably, so probing for it doesn't even work. Disable it completely
599 + * unless we can find a reliable way to detect all the broken cases.
601 +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
603 + clear_cpu_cap(c, X86_FEATURE_NOPL);
606 static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
607 @@ -402,8 +416,8 @@ static void __cpuinit generic_identify(s
610 init_scattered_cpuid_features(c);
616 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
617 @@ -434,7 +448,7 @@ __setup("serialnumber", x86_serial_nr_se
619 * This does the hard work of actually picking apart the CPU stuff...
621 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
622 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
626 @@ -448,6 +462,8 @@ void __cpuinit identify_cpu(struct cpuin
627 c->x86_max_cores = 1;
628 c->x86_clflush_size = 32;
629 memset(&c->x86_capability, 0, sizeof c->x86_capability);
630 + if (boot_cpu_has(X86_FEATURE_SYSCALL32))
631 + set_cpu_cap(c, X86_FEATURE_SYSCALL32);
633 if (!have_cpuid_p()) {
635 Index: head-2008-12-01/arch/x86/kernel/cpu/common_64-xen.c
636 ===================================================================
637 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
638 +++ head-2008-12-01/arch/x86/kernel/cpu/common_64-xen.c 2008-12-01 11:49:07.000000000 +0100
640 +#include <linux/init.h>
641 +#include <linux/kernel.h>
642 +#include <linux/sched.h>
643 +#include <linux/string.h>
644 +#include <linux/bootmem.h>
645 +#include <linux/bitops.h>
646 +#include <linux/module.h>
647 +#include <linux/kgdb.h>
648 +#include <linux/topology.h>
649 +#include <linux/delay.h>
650 +#include <linux/smp.h>
651 +#include <linux/percpu.h>
652 +#include <asm/i387.h>
653 +#include <asm/msr.h>
655 +#include <asm/linkage.h>
656 +#include <asm/mmu_context.h>
657 +#include <asm/mtrr.h>
658 +#include <asm/mce.h>
659 +#include <asm/pat.h>
660 +#include <asm/asm.h>
661 +#include <asm/numa.h>
662 +#ifdef CONFIG_X86_LOCAL_APIC
663 +#include <asm/mpspec.h>
664 +#include <asm/apic.h>
665 +#include <mach_apic.h>
666 +#elif defined(CONFIG_XEN)
667 +#include <mach_apic.h>
669 +#include <asm/pda.h>
670 +#include <asm/pgtable.h>
671 +#include <asm/processor.h>
672 +#include <asm/desc.h>
673 +#include <asm/atomic.h>
674 +#include <asm/proto.h>
675 +#include <asm/sections.h>
676 +#include <asm/setup.h>
677 +#include <asm/genapic.h>
681 +/* We need valid kernel segments for data and code in long mode too
682 + * IRET will check the segment types kkeil 2000/10/28
683 + * Also sysret mandates a special GDT layout
685 +/* The TLS descriptors are currently at a different place compared to i386.
686 + Hopefully nobody expects them at a fixed place (Wine?) */
687 +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
688 + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
689 + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
690 + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
691 + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
692 + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
693 + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
695 +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
697 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
699 +/* Current gdt points %fs at the "master" per-cpu area: after this,
700 + * it's on the real one. */
701 +void switch_to_new_gdt(void)
704 + struct desc_ptr gdt_descr;
706 + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
707 + gdt_descr.size = GDT_SIZE - 1;
708 + load_gdt(&gdt_descr);
710 + void *va, *gdt_addr = get_cpu_gdt_table(smp_processor_id());
711 + unsigned long frames[16];
712 + unsigned int f = 0;
714 + for (va = gdt_addr; va < gdt_addr + GDT_SIZE; va += PAGE_SIZE) {
715 + frames[f++] = virt_to_mfn(va);
716 + make_page_readonly(va, XENFEAT_writable_descriptor_tables);
718 + if (HYPERVISOR_set_gdt(frames, GDT_SIZE / sizeof(struct desc_struct)))
723 +struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
725 +static void __cpuinit default_init(struct cpuinfo_x86 *c)
727 + display_cacheinfo(c);
730 +static struct cpu_dev __cpuinitdata default_cpu = {
731 + .c_init = default_init,
732 + .c_vendor = "Unknown",
734 +static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
736 +int __cpuinit get_model_name(struct cpuinfo_x86 *c)
740 + if (c->extended_cpuid_level < 0x80000004)
743 + v = (unsigned int *) c->x86_model_id;
744 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
745 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
746 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
747 + c->x86_model_id[48] = 0;
752 +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
754 + unsigned int n, dummy, ebx, ecx, edx;
756 + n = c->extended_cpuid_level;
758 + if (n >= 0x80000005) {
759 + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
760 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
761 + "D cache %dK (%d bytes/line)\n",
762 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
763 + c->x86_cache_size = (ecx>>24) + (edx>>24);
764 + /* On K8 L1 TLB is inclusive, so don't count it */
765 + c->x86_tlbsize = 0;
768 + if (n >= 0x80000006) {
769 + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
770 + ecx = cpuid_ecx(0x80000006);
771 + c->x86_cache_size = ecx >> 16;
772 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
774 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
775 + c->x86_cache_size, ecx & 0xFF);
779 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
782 + u32 eax, ebx, ecx, edx;
783 + int index_msb, core_bits;
785 + cpuid(1, &eax, &ebx, &ecx, &edx);
788 + if (!cpu_has(c, X86_FEATURE_HT))
790 + if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
793 + smp_num_siblings = (ebx & 0xff0000) >> 16;
795 + if (smp_num_siblings == 1) {
796 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
797 + } else if (smp_num_siblings > 1) {
799 + if (smp_num_siblings > NR_CPUS) {
800 + printk(KERN_WARNING "CPU: Unsupported number of "
801 + "siblings %d", smp_num_siblings);
802 + smp_num_siblings = 1;
806 + index_msb = get_count_order(smp_num_siblings);
807 + c->phys_proc_id = phys_pkg_id(index_msb);
809 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
811 + index_msb = get_count_order(smp_num_siblings);
813 + core_bits = get_count_order(c->x86_max_cores);
815 + c->cpu_core_id = phys_pkg_id(index_msb) &
816 + ((1 << core_bits) - 1);
819 + if ((c->x86_max_cores * smp_num_siblings) > 1) {
820 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
822 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
829 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
831 + char *v = c->x86_vendor_id;
833 + static int printed;
835 + for (i = 0; i < X86_VENDOR_NUM; i++) {
837 + if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
838 + (cpu_devs[i]->c_ident[1] &&
839 + !strcmp(v, cpu_devs[i]->c_ident[1]))) {
841 + this_cpu = cpu_devs[i];
848 + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
849 + printk(KERN_ERR "CPU: Your system may be unstable.\n");
851 + c->x86_vendor = X86_VENDOR_UNKNOWN;
854 +static void __init early_cpu_support_print(void)
857 + struct cpu_dev *cpu_devx;
859 + printk("KERNEL supported cpus:\n");
860 + for (i = 0; i < X86_VENDOR_NUM; i++) {
861 + cpu_devx = cpu_devs[i];
864 + for (j = 0; j < 2; j++) {
865 + if (!cpu_devx->c_ident[j])
867 + printk(" %s %s\n", cpu_devx->c_vendor,
868 + cpu_devx->c_ident[j]);
874 + * The NOPL instruction is supposed to exist on all CPUs with
875 + * family >= 6, unfortunately, that's not true in practice because
876 + * of early VIA chips and (more importantly) broken virtualizers that
877 + * are not easy to detect. Hence, probe for it based on first
880 + * Note: no 64-bit chip is known to lack these, but put the code here
881 + * for consistency with 32 bits, and to make it utterly trivial to
882 + * diagnose the problem should it ever surface.
884 +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
886 + const u32 nopl_signature = 0x888c53b1; /* Random number */
887 + u32 has_nopl = nopl_signature;
889 + clear_cpu_cap(c, X86_FEATURE_NOPL);
892 + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
894 + " .section .fixup,\"ax\"\n"
898 + _ASM_EXTABLE(1b,3b)
899 + : "+a" (has_nopl));
901 + if (has_nopl == nopl_signature)
902 + set_cpu_cap(c, X86_FEATURE_NOPL);
906 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
908 +void __init early_cpu_init(void)
910 + struct cpu_vendor_dev *cvdev;
912 + for (cvdev = __x86cpuvendor_start ;
913 + cvdev < __x86cpuvendor_end ;
915 + cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
916 + early_cpu_support_print();
917 + early_identify_cpu(&boot_cpu_data);
920 +/* Do some early cpuid on the boot CPU to get some parameter that are
921 + needed before check_bugs. Everything advanced is in identify_cpu
923 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
927 + c->loops_per_jiffy = loops_per_jiffy;
928 + c->x86_cache_size = -1;
929 + c->x86_vendor = X86_VENDOR_UNKNOWN;
930 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
931 + c->x86_vendor_id[0] = '\0'; /* Unset */
932 + c->x86_model_id[0] = '\0'; /* Unset */
933 + c->x86_clflush_size = 64;
934 + c->x86_cache_alignment = c->x86_clflush_size;
935 + c->x86_max_cores = 1;
936 + c->x86_coreid_bits = 0;
937 + c->extended_cpuid_level = 0;
938 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
940 + /* Get vendor name */
941 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
942 + (unsigned int *)&c->x86_vendor_id[0],
943 + (unsigned int *)&c->x86_vendor_id[8],
944 + (unsigned int *)&c->x86_vendor_id[4]);
948 + /* Initialize the standard set of capabilities */
949 + /* Note that the vendor-specific code below might override */
951 + /* Intel-defined flags: level 0x00000001 */
952 + if (c->cpuid_level >= 0x00000001) {
954 + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
955 + &c->x86_capability[0]);
956 + c->x86 = (tfms >> 8) & 0xf;
957 + c->x86_model = (tfms >> 4) & 0xf;
958 + c->x86_mask = tfms & 0xf;
960 + c->x86 += (tfms >> 20) & 0xff;
962 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
963 + if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
964 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
966 + /* Have CPUID level 0 only - unheard of */
970 + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
972 + c->phys_proc_id = c->initial_apicid;
974 + /* AMD-defined flags: level 0x80000001 */
975 + xlvl = cpuid_eax(0x80000000);
976 + c->extended_cpuid_level = xlvl;
977 + if ((xlvl & 0xffff0000) == 0x80000000) {
978 + if (xlvl >= 0x80000001) {
979 + c->x86_capability[1] = cpuid_edx(0x80000001);
980 + c->x86_capability[6] = cpuid_ecx(0x80000001);
982 + if (xlvl >= 0x80000004)
983 + get_model_name(c); /* Default name */
986 + /* Transmeta-defined flags: level 0x80860001 */
987 + xlvl = cpuid_eax(0x80860000);
988 + if ((xlvl & 0xffff0000) == 0x80860000) {
989 + /* Don't set x86_cpuid_level here for now to not confuse. */
990 + if (xlvl >= 0x80860001)
991 + c->x86_capability[2] = cpuid_edx(0x80860001);
994 + if (c->extended_cpuid_level >= 0x80000007)
995 + c->x86_power = cpuid_edx(0x80000007);
997 + if (c->extended_cpuid_level >= 0x80000008) {
998 + u32 eax = cpuid_eax(0x80000008);
1000 + c->x86_virt_bits = (eax >> 8) & 0xff;
1001 + c->x86_phys_bits = eax & 0xff;
1006 + if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
1007 + cpu_devs[c->x86_vendor]->c_early_init)
1008 + cpu_devs[c->x86_vendor]->c_early_init(c);
1010 + validate_pat_support(c);
1014 + * This does the hard work of actually picking apart the CPU stuff...
1016 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
1020 + early_identify_cpu(c);
1022 + init_scattered_cpuid_features(c);
1024 + c->apicid = phys_pkg_id(0);
1027 + * Vendor-specific initialization. In this section we
1028 + * canonicalize the feature flags, meaning if there are
1029 + * features a certain CPU supports which CPUID doesn't
1030 + * tell us, CPUID claiming incorrect flags, or other bugs,
1031 + * we handle them here.
1033 + * At the end of this section, c->x86_capability better
1034 + * indicate the features this CPU genuinely supports!
1036 + if (this_cpu->c_init)
1037 + this_cpu->c_init(c);
1042 + * On SMP, boot_cpu_data holds the common feature set between
1043 + * all CPUs; so make sure that we indicate which features are
1044 + * common between the CPUs. The first time this routine gets
1045 + * executed, c == &boot_cpu_data.
1047 + if (c != &boot_cpu_data) {
1048 + /* AND the already accumulated flags with these */
1049 + for (i = 0; i < NCAPINTS; i++)
1050 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1053 + /* Clear all flags overriden by options */
1054 + for (i = 0; i < NCAPINTS; i++)
1055 + c->x86_capability[i] &= ~cleared_cpu_caps[i];
1057 +#ifdef CONFIG_X86_MCE
1060 + select_idle_routine(c);
1063 + numa_add_cpu(smp_processor_id());
1068 +void __cpuinit identify_boot_cpu(void)
1070 + identify_cpu(&boot_cpu_data);
1073 +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1075 + BUG_ON(c == &boot_cpu_data);
1080 +static __init int setup_noclflush(char *arg)
1082 + setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1085 +__setup("noclflush", setup_noclflush);
1087 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1089 + if (c->x86_model_id[0])
1090 + printk(KERN_CONT "%s", c->x86_model_id);
1092 + if (c->x86_mask || c->cpuid_level >= 0)
1093 + printk(KERN_CONT " stepping %02x\n", c->x86_mask);
1095 + printk(KERN_CONT "\n");
1098 +static __init int setup_disablecpuid(char *arg)
1101 + if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1102 + setup_clear_cpu_cap(bit);
1107 +__setup("clearcpuid=", setup_disablecpuid);
1109 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1111 +struct x8664_pda **_cpu_pda __read_mostly;
1112 +EXPORT_SYMBOL(_cpu_pda);
1114 +#ifndef CONFIG_X86_NO_IDT
1115 +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
1118 +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
1120 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
1121 +EXPORT_SYMBOL_GPL(__supported_pte_mask);
1123 +static int do_not_nx __cpuinitdata;
1126 +Control non executable mappings for 64bit processes.
1131 +static int __init nonx_setup(char *str)
1135 + if (!strncmp(str, "on", 2)) {
1136 + __supported_pte_mask |= _PAGE_NX;
1138 + } else if (!strncmp(str, "off", 3)) {
1140 + __supported_pte_mask &= ~_PAGE_NX;
1144 +early_param("noexec", nonx_setup);
1146 +int force_personality32;
1149 +Control non executable heap for 32bit processes.
1150 +To control the stack too use noexec=off
1152 +on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
1153 +off PROT_READ implies PROT_EXEC
1155 +static int __init nonx32_setup(char *str)
1157 + if (!strcmp(str, "on"))
1158 + force_personality32 &= ~READ_IMPLIES_EXEC;
1159 + else if (!strcmp(str, "off"))
1160 + force_personality32 |= READ_IMPLIES_EXEC;
1163 +__setup("noexec32=", nonx32_setup);
1165 +static void __init_refok switch_pt(int cpu)
1170 + xen_pt_switch(__pa_symbol(init_level4_pgt));
1171 + xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
1175 +void pda_init(int cpu)
1177 + struct x8664_pda *pda = cpu_pda(cpu);
1179 + /* Setup up data that may be needed in __get_free_pages early */
1180 + loadsegment(fs, 0);
1181 + loadsegment(gs, 0);
1183 + /* Memory clobbers used to order PDA accessed */
1185 + wrmsrl(MSR_GS_BASE, pda);
1188 + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
1189 + (unsigned long)pda))
1193 + pda->cpunumber = cpu;
1194 + pda->irqcount = -1;
1195 + pda->kernelstack = (unsigned long)stack_thread_info() -
1196 + PDA_STACKOFFSET + THREAD_SIZE;
1197 + pda->active_mm = &init_mm;
1198 + pda->mmu_state = 0;
1201 + /* others are initialized in smpboot.c */
1202 + pda->pcurrent = &init_task;
1203 + pda->irqstackptr = boot_cpu_stack;
1204 + pda->irqstackptr += IRQSTACKSIZE - 64;
1206 + if (!pda->irqstackptr) {
1207 + pda->irqstackptr = (char *)
1208 + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
1209 + if (!pda->irqstackptr)
1210 + panic("cannot allocate irqstack for cpu %d",
1212 + pda->irqstackptr += IRQSTACKSIZE - 64;
1215 + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
1216 + pda->nodenumber = cpu_to_node(cpu);
1222 +#ifndef CONFIG_X86_NO_TSS
1223 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
1224 + DEBUG_STKSZ] __page_aligned_bss;
1227 +extern asmlinkage void ignore_sysret(void);
1229 +void __cpuinit syscall_init(void)
1233 + * LSTAR and STAR live in a bit strange symbiosis.
1234 + * They both write to the same internal register. STAR allows to
1235 + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
1237 + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
1238 + wrmsrl(MSR_LSTAR, system_call);
1239 + wrmsrl(MSR_CSTAR, ignore_sysret);
1241 + /* Flags to clear on syscall */
1242 + wrmsrl(MSR_SYSCALL_MASK,
1243 + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1245 +#ifdef CONFIG_IA32_EMULATION
1246 + syscall32_cpu_init();
1248 + static /*const*/ struct callback_register __cpuinitdata cstar = {
1249 + .type = CALLBACKTYPE_syscall32,
1250 + .address = (unsigned long)ignore_sysret
1253 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
1254 + printk(KERN_WARNING "Unable to register CSTAR callback\n");
1258 +void __cpuinit check_efer(void)
1260 + unsigned long efer;
1262 + rdmsrl(MSR_EFER, efer);
1263 + if (!(efer & EFER_NX) || do_not_nx)
1264 + __supported_pte_mask &= ~_PAGE_NX;
1267 +unsigned long kernel_eflags;
1269 +#ifndef CONFIG_X86_NO_TSS
1271 + * Copies of the original ist values from the tss are only accessed during
1272 + * debugging, no special alignment required.
1274 +DEFINE_PER_CPU(struct orig_ist, orig_ist);
1278 + * cpu_init() initializes state that is per-CPU. Some data is already
1279 + * initialized (naturally) in the bootstrap process, such as the GDT
1280 + * and IDT. We reload them nevertheless, this function acts as a
1281 + * 'CPU state barrier', nothing should get across.
1282 + * A lot of state is already set up in PDA init.
1284 +void __cpuinit cpu_init(void)
1286 + int cpu = stack_smp_processor_id();
1287 +#ifndef CONFIG_X86_NO_TSS
1288 + struct tss_struct *t = &per_cpu(init_tss, cpu);
1289 + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
1291 + char *estacks = NULL;
1294 + struct task_struct *me;
1296 + /* CPU 0 is initialised in head64.c */
1299 +#ifndef CONFIG_X86_NO_TSS
1301 + estacks = boot_exception_stacks;
1306 + if (cpu_test_and_set(cpu, cpu_initialized))
1307 + panic("CPU#%d already initialized!\n", cpu);
1309 + printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1311 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1314 + * Initialize the per-CPU GDT with the boot GDT,
1315 + * and set up the GDT descriptor:
1318 + switch_to_new_gdt();
1319 +#ifndef CONFIG_X86_NO_IDT
1320 + load_idt((const struct desc_ptr *)&idt_descr);
1323 + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
1326 + wrmsrl(MSR_FS_BASE, 0);
1327 + wrmsrl(MSR_KERNEL_GS_BASE, 0);
1332 +#ifndef CONFIG_X86_NO_TSS
1334 + * set up and load the per-CPU TSS
1336 + if (!orig_ist->ist[0]) {
1337 + static const unsigned int order[N_EXCEPTION_STACKS] = {
1338 + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1339 + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1341 + for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1343 + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1345 + panic("Cannot allocate exception "
1346 + "stack %ld %d\n", v, cpu);
1348 + estacks += PAGE_SIZE << order[v];
1349 + orig_ist->ist[v] = t->x86_tss.ist[v] =
1350 + (unsigned long)estacks;
1354 + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1356 + * <= is required because the CPU will access up to
1357 + * 8 bits beyond the end of the IO permission bitmap.
1359 + for (i = 0; i <= IO_BITMAP_LONGS; i++)
1360 + t->io_bitmap[i] = ~0UL;
1363 + atomic_inc(&init_mm.mm_count);
1364 + me->active_mm = &init_mm;
1367 + enter_lazy_tlb(&init_mm, me);
1369 + load_sp0(t, &current->thread);
1370 +#ifndef CONFIG_X86_NO_TSS
1371 + set_tss_desc(cpu, t);
1374 + load_LDT(&init_mm.context);
1378 + * If the kgdb is connected no debug regs should be altered. This
1379 + * is only applicable when KGDB and a KGDB I/O module are built
1380 + * into the kernel and you are using early debugging with
1381 + * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1383 + if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1384 + arch_kgdb_ops.correct_hw_break();
1388 + * Clear all 6 debug registers:
1391 + set_debugreg(0UL, 0);
1392 + set_debugreg(0UL, 1);
1393 + set_debugreg(0UL, 2);
1394 + set_debugreg(0UL, 3);
1395 + set_debugreg(0UL, 6);
1396 + set_debugreg(0UL, 7);
1398 + /* If the kgdb is connected no debug regs should be altered. */
1404 + asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
1405 + if (raw_irqs_disabled())
1406 + kernel_eflags &= ~X86_EFLAGS_IF;
1408 + if (is_uv_system())
1411 Index: head-2008-12-01/arch/x86/kernel/e820-xen.c
1412 ===================================================================
1413 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
1414 +++ head-2008-12-01/arch/x86/kernel/e820-xen.c 2008-12-01 11:49:07.000000000 +0100
1417 + * Handle the memory map.
1418 + * The functions here do the job until bootmem takes over.
1420 + * Getting sanitize_e820_map() in sync with i386 version by applying change:
1421 + * - Provisions for empty E820 memory regions (reported by certain BIOSes).
1422 + * Alex Achenbach <xela@slit.de>, December 2002.
1423 + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
1426 +#include <linux/kernel.h>
1427 +#include <linux/types.h>
1428 +#include <linux/init.h>
1429 +#include <linux/bootmem.h>
1430 +#include <linux/ioport.h>
1431 +#include <linux/string.h>
1432 +#include <linux/kexec.h>
1433 +#include <linux/module.h>
1434 +#include <linux/mm.h>
1435 +#include <linux/pfn.h>
1436 +#include <linux/suspend.h>
1437 +#include <linux/firmware-map.h>
1439 +#include <asm/pgtable.h>
1440 +#include <asm/page.h>
1441 +#include <asm/e820.h>
1442 +#include <asm/proto.h>
1443 +#include <asm/setup.h>
1444 +#include <xen/interface/memory.h>
1447 + * The e820 map is the map that gets modified e.g. with command line parameters
1448 + * and that is also registered with modifications in the kernel resource tree
1449 + * with the iomem_resource as parent.
1451 + * The e820_saved is directly saved after the BIOS-provided memory map is
1452 + * copied. It doesn't get modified afterwards. It's registered for the
1453 + * /sys/firmware/memmap interface.
1455 + * That memory map is not modified and is used as base for kexec. The kexec'd
1456 + * kernel should get the same memory map as the firmware provides. Then the
1457 + * user can e.g. boot the original kernel with mem=1G while still booting the
1458 + * next kernel with full memory.
1460 +struct e820map e820;
1461 +struct e820map e820_saved;
1463 +static struct e820map machine_e820;
1466 +/* For PCI or other memory-mapped resources */
1467 +unsigned long pci_mem_start = 0xaeedbabe;
1469 +EXPORT_SYMBOL(pci_mem_start);
1473 + * This function checks if any part of the range <start,end> is mapped
1477 +e820_any_mapped(u64 start, u64 end, unsigned type)
1482 + for (i = 0; i < e820.nr_map; i++) {
1483 + struct e820entry *ei = &e820.map[i];
1485 + if (!is_initial_xendomain())
1487 + for (i = 0; i < machine_e820.nr_map; ++i) {
1488 + const struct e820entry *ei = &machine_e820.map[i];
1491 + if (type && ei->type != type)
1493 + if (ei->addr >= end || ei->addr + ei->size <= start)
1499 +EXPORT_SYMBOL_GPL(e820_any_mapped);
1502 + * This function checks if the entire range <start,end> is mapped with type.
1504 + * Note: this function only works correct if the e820 table is sorted and
1505 + * not-overlapping, which is the case
1507 +int __init e820_all_mapped(u64 start, u64 end, unsigned type)
1512 + for (i = 0; i < e820.nr_map; i++) {
1513 + struct e820entry *ei = &e820.map[i];
1515 + if (!is_initial_xendomain())
1517 + for (i = 0; i < machine_e820.nr_map; ++i) {
1518 + const struct e820entry *ei = &machine_e820.map[i];
1521 + if (type && ei->type != type)
1523 + /* is the region (part) in overlap with the current region ?*/
1524 + if (ei->addr >= end || ei->addr + ei->size <= start)
1527 + /* if the region is at the beginning of <start,end> we move
1528 + * start to the end of the region since it's ok until there
1530 + if (ei->addr <= start)
1531 + start = ei->addr + ei->size;
1533 + * if start is now at or beyond end, we're done, full
1543 + * Add a memory region to the kernel e820 map.
1545 +void __init e820_add_region(u64 start, u64 size, int type)
1547 + int x = e820.nr_map;
1549 + if (x == ARRAY_SIZE(e820.map)) {
1550 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1554 + e820.map[x].addr = start;
1555 + e820.map[x].size = size;
1556 + e820.map[x].type = type;
1560 +void __init e820_print_map(char *who)
1564 + for (i = 0; i < e820.nr_map; i++) {
1565 + printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1566 + (unsigned long long) e820.map[i].addr,
1567 + (unsigned long long)
1568 + (e820.map[i].addr + e820.map[i].size));
1569 + switch (e820.map[i].type) {
1571 + case E820_RESERVED_KERN:
1572 + printk(KERN_CONT "(usable)\n");
1574 + case E820_RESERVED:
1575 + printk(KERN_CONT "(reserved)\n");
1578 + printk(KERN_CONT "(ACPI data)\n");
1581 + printk(KERN_CONT "(ACPI NVS)\n");
1584 + printk(KERN_CONT "type %u\n", e820.map[i].type);
1591 + * Sanitize the BIOS e820 map.
1593 + * Some e820 responses include overlapping entries. The following
1594 + * replaces the original e820 map with a new one, removing overlaps,
1595 + * and resolving conflicting memory types in favor of highest
1598 + * The input parameter biosmap points to an array of 'struct
1599 + * e820entry' which on entry has elements in the range [0, *pnr_map)
1600 + * valid, and which has space for up to max_nr_map entries.
1601 + * On return, the resulting sanitized e820 map entries will be in
1602 + * overwritten in the same location, starting at biosmap.
1604 + * The integer pointed to by pnr_map must be valid on entry (the
1605 + * current number of valid entries located at biosmap) and will
1606 + * be updated on return, with the new number of valid entries
1607 + * (something no more than max_nr_map.)
1609 + * The return value from sanitize_e820_map() is zero if it
1610 + * successfully 'sanitized' the map entries passed in, and is -1
1611 + * if it did nothing, which can happen if either of (1) it was
1612 + * only passed one map entry, or (2) any of the input map entries
1613 + * were invalid (start + size < start, meaning that the size was
1614 + * so big the described memory range wrapped around through zero.)
1616 + * Visually we're performing the following
1617 + * (1,2,3,4 = memory types)...
1619 + * Sample memory map (w/overlaps):
1620 + * ____22__________________
1621 + * ______________________4_
1622 + * ____1111________________
1623 + * _44_____________________
1624 + * 11111111________________
1625 + * ____________________33__
1626 + * ___________44___________
1627 + * __________33333_________
1628 + * ______________22________
1629 + * ___________________2222_
1630 + * _________111111111______
1631 + * _____________________11_
1632 + * _________________4______
1634 + * Sanitized equivalent (no overlap):
1635 + * 1_______________________
1636 + * _44_____________________
1637 + * ___1____________________
1638 + * ____22__________________
1639 + * ______11________________
1640 + * _________1______________
1641 + * __________3_____________
1642 + * ___________44___________
1643 + * _____________33_________
1644 + * _______________2________
1645 + * ________________1_______
1646 + * _________________4______
1647 + * ___________________2____
1648 + * ____________________33__
1649 + * ______________________4_
1652 +int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
1655 + struct change_member {
1656 + struct e820entry *pbios; /* pointer to original bios entry */
1657 + unsigned long long addr; /* address for this change point */
1659 + static struct change_member change_point_list[2*E820_X_MAX] __initdata;
1660 + static struct change_member *change_point[2*E820_X_MAX] __initdata;
1661 + static struct e820entry *overlap_list[E820_X_MAX] __initdata;
1662 + static struct e820entry new_bios[E820_X_MAX] __initdata;
1663 + struct change_member *change_tmp;
1664 + unsigned long current_type, last_type;
1665 + unsigned long long last_addr;
1666 + int chgidx, still_changing;
1667 + int overlap_entries;
1668 + int new_bios_entry;
1669 + int old_nr, new_nr, chg_nr;
1672 + /* if there's only one memory region, don't bother */
1674 + if (*pnr_map == 1)
1680 + old_nr = *pnr_map;
1681 + BUG_ON(old_nr > max_nr_map);
1683 + /* bail out if we find any unreasonable addresses in bios map */
1684 + for (i = 0; i < old_nr; i++)
1685 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
1688 + /* create pointers for initial change-point information (for sorting) */
1689 + for (i = 0; i < 2 * old_nr; i++)
1690 + change_point[i] = &change_point_list[i];
1692 + /* record all known change-points (starting and ending addresses),
1693 + omitting those that are for empty memory regions */
1695 + for (i = 0; i < old_nr; i++) {
1696 + if (biosmap[i].size != 0) {
1697 + change_point[chgidx]->addr = biosmap[i].addr;
1698 + change_point[chgidx++]->pbios = &biosmap[i];
1699 + change_point[chgidx]->addr = biosmap[i].addr +
1701 + change_point[chgidx++]->pbios = &biosmap[i];
1706 + /* sort change-point list by memory addresses (low -> high) */
1707 + still_changing = 1;
1708 + while (still_changing) {
1709 + still_changing = 0;
1710 + for (i = 1; i < chg_nr; i++) {
1711 + unsigned long long curaddr, lastaddr;
1712 + unsigned long long curpbaddr, lastpbaddr;
1714 + curaddr = change_point[i]->addr;
1715 + lastaddr = change_point[i - 1]->addr;
1716 + curpbaddr = change_point[i]->pbios->addr;
1717 + lastpbaddr = change_point[i - 1]->pbios->addr;
1720 + * swap entries, when:
1722 + * curaddr > lastaddr or
1723 + * curaddr == lastaddr and curaddr == curpbaddr and
1724 + * lastaddr != lastpbaddr
1726 + if (curaddr < lastaddr ||
1727 + (curaddr == lastaddr && curaddr == curpbaddr &&
1728 + lastaddr != lastpbaddr)) {
1729 + change_tmp = change_point[i];
1730 + change_point[i] = change_point[i-1];
1731 + change_point[i-1] = change_tmp;
1732 + still_changing = 1;
1737 + /* create a new bios memory map, removing overlaps */
1738 + overlap_entries = 0; /* number of entries in the overlap table */
1739 + new_bios_entry = 0; /* index for creating new bios map entries */
1740 + last_type = 0; /* start with undefined memory type */
1741 + last_addr = 0; /* start with 0 as last starting address */
1743 + /* loop through change-points, determining affect on the new bios map */
1744 + for (chgidx = 0; chgidx < chg_nr; chgidx++) {
1745 + /* keep track of all overlapping bios entries */
1746 + if (change_point[chgidx]->addr ==
1747 + change_point[chgidx]->pbios->addr) {
1749 + * add map entry to overlap list (> 1 entry
1750 + * implies an overlap)
1752 + overlap_list[overlap_entries++] =
1753 + change_point[chgidx]->pbios;
1756 + * remove entry from list (order independent,
1757 + * so swap with last)
1759 + for (i = 0; i < overlap_entries; i++) {
1760 + if (overlap_list[i] ==
1761 + change_point[chgidx]->pbios)
1763 + overlap_list[overlap_entries-1];
1765 + overlap_entries--;
1768 + * if there are overlapping entries, decide which
1769 + * "type" to use (larger value takes precedence --
1770 + * 1=usable, 2,3,4,4+=unusable)
1773 + for (i = 0; i < overlap_entries; i++)
1774 + if (overlap_list[i]->type > current_type)
1775 + current_type = overlap_list[i]->type;
1777 + * continue building up new bios map based on this
1780 + if (current_type != last_type) {
1781 + if (last_type != 0) {
1782 + new_bios[new_bios_entry].size =
1783 + change_point[chgidx]->addr - last_addr;
1785 + * move forward only if the new size
1788 + if (new_bios[new_bios_entry].size != 0)
1790 + * no more space left for new
1793 + if (++new_bios_entry >= max_nr_map)
1796 + if (current_type != 0) {
1797 + new_bios[new_bios_entry].addr =
1798 + change_point[chgidx]->addr;
1799 + new_bios[new_bios_entry].type = current_type;
1800 + last_addr = change_point[chgidx]->addr;
1802 + last_type = current_type;
1805 + /* retain count for new bios entries */
1806 + new_nr = new_bios_entry;
1808 + /* copy new bios mapping into original location */
1809 + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
1810 + *pnr_map = new_nr;
1815 +static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
1818 + u64 start = biosmap->addr;
1819 + u64 size = biosmap->size;
1820 + u64 end = start + size;
1821 + u32 type = biosmap->type;
1823 + /* Overflow in 64 bits? Ignore the memory map. */
1827 + e820_add_region(start, size, type);
1836 + * Copy the BIOS e820 map into a safe place.
1838 + * Sanity-check it while we're at it..
1840 + * If we're lucky and live on a modern system, the setup code
1841 + * will have given us a memory map that we can use to properly
1842 + * set up memory. If we aren't, we'll fake a memory map.
1844 +static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
1847 + /* Only one memory region (or negative)? Ignore it */
1851 + BUG_ON(nr_map < 1);
1854 + return __append_e820_map(biosmap, nr_map);
1857 +static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
1858 + u64 size, unsigned old_type,
1859 + unsigned new_type)
1862 + u64 real_updated_size = 0;
1864 + BUG_ON(old_type == new_type);
1866 + if (size > (ULLONG_MAX - start))
1867 + size = ULLONG_MAX - start;
1869 + for (i = 0; i < e820.nr_map; i++) {
1870 + struct e820entry *ei = &e820x->map[i];
1871 + u64 final_start, final_end;
1872 + if (ei->type != old_type)
1874 + /* totally covered? */
1875 + if (ei->addr >= start &&
1876 + (ei->addr + ei->size) <= (start + size)) {
1877 + ei->type = new_type;
1878 + real_updated_size += ei->size;
1881 + /* partially covered */
1882 + final_start = max(start, ei->addr);
1883 + final_end = min(start + size, ei->addr + ei->size);
1884 + if (final_start >= final_end)
1886 + e820_add_region(final_start, final_end - final_start,
1888 + real_updated_size += final_end - final_start;
1890 + ei->size -= final_end - final_start;
1891 + if (ei->addr < final_start)
1893 + ei->addr = final_end;
1895 + return real_updated_size;
1898 +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
1899 + unsigned new_type)
1901 + return e820_update_range_map(&e820, start, size, old_type, new_type);
1904 +static u64 __init e820_update_range_saved(u64 start, u64 size,
1905 + unsigned old_type, unsigned new_type)
1907 + return e820_update_range_map(&e820_saved, start, size, old_type,
1911 +/* make e820 not cover the range */
1912 +u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
1916 + u64 real_removed_size = 0;
1918 + if (size > (ULLONG_MAX - start))
1919 + size = ULLONG_MAX - start;
1921 + for (i = 0; i < e820.nr_map; i++) {
1922 + struct e820entry *ei = &e820.map[i];
1923 + u64 final_start, final_end;
1925 + if (checktype && ei->type != old_type)
1927 + /* totally covered? */
1928 + if (ei->addr >= start &&
1929 + (ei->addr + ei->size) <= (start + size)) {
1930 + real_removed_size += ei->size;
1931 + memset(ei, 0, sizeof(struct e820entry));
1934 + /* partially covered */
1935 + final_start = max(start, ei->addr);
1936 + final_end = min(start + size, ei->addr + ei->size);
1937 + if (final_start >= final_end)
1939 + real_removed_size += final_end - final_start;
1941 + ei->size -= final_end - final_start;
1942 + if (ei->addr < final_start)
1944 + ei->addr = final_end;
1946 + return real_removed_size;
1949 +void __init update_e820(void)
1953 + nr_map = e820.nr_map;
1954 + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
1956 + e820.nr_map = nr_map;
1957 + printk(KERN_INFO "modified physical RAM map:\n");
1958 + e820_print_map("modified");
1960 +static void __init update_e820_saved(void)
1964 + nr_map = e820_saved.nr_map;
1965 + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
1967 + e820_saved.nr_map = nr_map;
1971 +#define e820 machine_e820
1974 +#define MAX_GAP_END 0x100000000ull
1976 + * Search for a gap in the e820 memory space from start_addr to end_addr.
1978 +__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
1979 + unsigned long start_addr, unsigned long long end_addr)
1981 + unsigned long long last;
1982 + int i = e820.nr_map;
1985 + last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
1986 +#ifdef CONFIG_X86_64
1987 + if (start_addr >= MAX_GAP_END)
1988 + last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
1991 + while (--i >= 0) {
1992 + unsigned long long start = e820.map[i].addr;
1993 + unsigned long long end = start + e820.map[i].size;
1995 + if (end < start_addr)
1999 + * Since "last" is at most 4GB, we know we'll
2000 + * fit in 32 bits if this condition is true
2003 + unsigned long gap = last - end;
2005 + if (gap >= *gapsize) {
2018 + * Search for the biggest gap in the low 32 bits of the e820
2019 + * memory space. We pass this space to PCI to assign MMIO resources
2020 + * for hotplug or unconfigured devices in.
2021 + * Hopefully the BIOS let enough space left.
2023 +__init void e820_setup_gap(void)
2025 + unsigned long gapstart, gapsize, round;
2028 + gapstart = 0x10000000;
2029 + gapsize = 0x400000;
2030 + found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
2032 +#ifdef CONFIG_X86_64
2034 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2036 + KERN_ERR "PCI: Unassigned devices with 32bit resource "
2037 + "registers may break!\n");
2038 + found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
2044 + * See how much we want to round up: start off with
2045 + * rounding to the next 1MB area.
2048 + while ((gapsize >> 4) > round)
2050 + /* Fun with two's complement */
2051 + pci_mem_start = (gapstart + round) & -round;
2054 + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2055 + pci_mem_start, gapstart, gapsize);
2062 + * Because of the size limitation of struct boot_params, only first
2063 + * 128 E820 memory entries are passed to kernel via
2064 + * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
2065 + * linked list of struct setup_data, which is parsed here.
2067 +void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
2071 + struct e820entry *extmap;
2073 + entries = sdata->len / sizeof(struct e820entry);
2074 + map_len = sdata->len + sizeof(struct setup_data);
2075 + if (map_len > PAGE_SIZE)
2076 + sdata = early_ioremap(pa_data, map_len);
2077 + extmap = (struct e820entry *)(sdata->data);
2078 + __append_e820_map(extmap, entries);
2079 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
2080 + if (map_len > PAGE_SIZE)
2081 + early_iounmap(sdata, map_len);
2082 + printk(KERN_INFO "extended physical RAM map:\n");
2083 + e820_print_map("extended");
2086 +#if defined(CONFIG_X86_64) || \
2087 + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
2089 + * Find the ranges of physical addresses that do not correspond to
2090 + * e820 RAM areas and mark the corresponding pages as nosave for
2091 + * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
2093 + * This function requires the e820 map to be sorted and without any
2094 + * overlapping entries and assumes the first e820 area to be RAM.
2096 +void __init e820_mark_nosave_regions(unsigned long limit_pfn)
2099 + unsigned long pfn;
2101 + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
2102 + for (i = 1; i < e820.nr_map; i++) {
2103 + struct e820entry *ei = &e820.map[i];
2105 + if (pfn < PFN_UP(ei->addr))
2106 + register_nosave_region(pfn, PFN_UP(ei->addr));
2108 + pfn = PFN_DOWN(ei->addr + ei->size);
2109 + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
2110 + register_nosave_region(PFN_UP(ei->addr), pfn);
2112 + if (pfn >= limit_pfn)
2120 + * Early reserved memory areas.
2122 +#define MAX_EARLY_RES 20
2129 +static struct early_res early_res[MAX_EARLY_RES] __initdata = {
2131 + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
2132 +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
2133 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
2135 +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
2137 + * But first pinch a few for the stack/trampoline stuff
2138 + * FIXME: Don't need the extra page at 4K, but need to fix
2139 + * trampoline before removing it. (see the GDT stuff)
2141 + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
2143 + * Has to be in very low memory so we can execute
2144 + * real-mode AP code.
2146 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
2152 +static int __init find_overlapped_early(u64 start, u64 end)
2155 + struct early_res *r;
2157 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2158 + r = &early_res[i];
2159 + if (end > r->start && start < r->end)
2167 + * Drop the i-th range from the early reservation map,
2168 + * by copying any higher ranges down one over it, and
2169 + * clearing what had been the last slot.
2171 +static void __init drop_range(int i)
2175 + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
2178 + memmove(&early_res[i], &early_res[i + 1],
2179 + (j - 1 - i) * sizeof(struct early_res));
2181 + early_res[j - 1].end = 0;
2185 + * Split any existing ranges that:
2186 + * 1) are marked 'overlap_ok', and
2187 + * 2) overlap with the stated range [start, end)
2188 + * into whatever portion (if any) of the existing range is entirely
2189 + * below or entirely above the stated range. Drop the portion
2190 + * of the existing range that overlaps with the stated range,
2191 + * which will allow the caller of this routine to then add that
2192 + * stated range without conflicting with any existing range.
2194 +static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
2197 + struct early_res *r;
2198 + u64 lower_start, lower_end;
2199 + u64 upper_start, upper_end;
2202 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2203 + r = &early_res[i];
2205 + /* Continue past non-overlapping ranges */
2206 + if (end <= r->start || start >= r->end)
2210 + * Leave non-ok overlaps as is; let caller
2211 + * panic "Overlapping early reservations"
2212 + * when it hits this overlap.
2214 + if (!r->overlap_ok)
2218 + * We have an ok overlap. We will drop it from the early
2219 + * reservation map, and add back in any non-overlapping
2220 + * portions (lower or upper) as separate, overlap_ok,
2221 + * non-overlapping ranges.
2224 + /* 1. Note any non-overlapping (lower or upper) ranges. */
2225 + strncpy(name, r->name, sizeof(name) - 1);
2227 + lower_start = lower_end = 0;
2228 + upper_start = upper_end = 0;
2229 + if (r->start < start) {
2230 + lower_start = r->start;
2231 + lower_end = start;
2233 + if (r->end > end) {
2234 + upper_start = end;
2235 + upper_end = r->end;
2238 + /* 2. Drop the original ok overlapping range */
2241 + i--; /* resume for-loop on copied down entry */
2243 + /* 3. Add back in any non-overlapping ranges. */
2245 + reserve_early_overlap_ok(lower_start, lower_end, name);
2247 + reserve_early_overlap_ok(upper_start, upper_end, name);
2251 +static void __init __reserve_early(u64 start, u64 end, char *name,
2255 + struct early_res *r;
2257 + i = find_overlapped_early(start, end);
2258 + if (i >= MAX_EARLY_RES)
2259 + panic("Too many early reservations");
2260 + r = &early_res[i];
2262 + panic("Overlapping early reservations "
2263 + "%llx-%llx %s to %llx-%llx %s\n",
2264 + start, end - 1, name?name:"", r->start,
2265 + r->end - 1, r->name);
2268 + r->overlap_ok = overlap_ok;
2270 + strncpy(r->name, name, sizeof(r->name) - 1);
2274 + * A few early reservtations come here.
2276 + * The 'overlap_ok' in the name of this routine does -not- mean it
2277 + * is ok for these reservations to overlap an earlier reservation.
2278 + * Rather it means that it is ok for subsequent reservations to
2279 + * overlap this one.
2281 + * Use this entry point to reserve early ranges when you are doing
2282 + * so out of "Paranoia", reserving perhaps more memory than you need,
2283 + * just in case, and don't mind a subsequent overlapping reservation
2284 + * that is known to be needed.
2286 + * The drop_overlaps_that_are_ok() call here isn't really needed.
2287 + * It would be needed if we had two colliding 'overlap_ok'
2288 + * reservations, so that the second such would not panic on the
2289 + * overlap with the first. We don't have any such as of this
2290 + * writing, but might as well tolerate such if it happens in
2293 +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
2295 + drop_overlaps_that_are_ok(start, end);
2296 + __reserve_early(start, end, name, 1);
2300 + * Most early reservations come here.
2302 + * We first have drop_overlaps_that_are_ok() drop any pre-existing
2303 + * 'overlap_ok' ranges, so that we can then reserve this memory
2304 + * range without risk of panic'ing on an overlapping overlap_ok
2305 + * early reservation.
2307 +void __init reserve_early(u64 start, u64 end, char *name)
2309 + drop_overlaps_that_are_ok(start, end);
2310 + __reserve_early(start, end, name, 0);
2313 +void __init free_early(u64 start, u64 end)
2315 + struct early_res *r;
2318 + i = find_overlapped_early(start, end);
2319 + r = &early_res[i];
2320 + if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
2321 + panic("free_early on not reserved area: %llx-%llx!",
2327 +void __init early_res_to_bootmem(u64 start, u64 end)
2330 + u64 final_start, final_end;
2333 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
2336 + printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
2337 + count, start, end);
2338 + for (i = 0; i < count; i++) {
2339 + struct early_res *r = &early_res[i];
2340 + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
2341 + r->start, r->end, r->name);
2342 + final_start = max(start, r->start);
2343 + final_end = min(end, r->end);
2344 + if (final_start >= final_end) {
2345 + printk(KERN_CONT "\n");
2348 + printk(KERN_CONT " ==> [%010llx - %010llx]\n",
2349 + final_start, final_end);
2350 + reserve_bootmem_generic(final_start, final_end - final_start,
2355 +/* Check for already reserved areas */
2356 +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
2359 + u64 addr = *addrp;
2361 + struct early_res *r;
2363 + i = find_overlapped_early(addr, addr + size);
2364 + r = &early_res[i];
2365 + if (i < MAX_EARLY_RES && r->end) {
2366 + *addrp = addr = round_up(r->end, align);
2373 +/* Check for already reserved areas */
2374 +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
2377 + u64 addr = *addrp, last;
2378 + u64 size = *sizep;
2381 + last = addr + size;
2382 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2383 + struct early_res *r = &early_res[i];
2384 + if (last > r->start && addr < r->start) {
2385 + size = r->start - addr;
2389 + if (last > r->end && addr < r->end) {
2390 + addr = round_up(r->end, align);
2391 + size = last - addr;
2395 + if (last <= r->end && addr >= r->start) {
2408 + * Find a free area with specified alignment in a specific range.
2410 +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
2414 + for (i = 0; i < e820.nr_map; i++) {
2415 + struct e820entry *ei = &e820.map[i];
2419 + if (ei->type != E820_RAM)
2421 + addr = round_up(ei->addr, align);
2422 + ei_last = ei->addr + ei->size;
2424 + addr = round_up(start, align);
2425 + if (addr >= ei_last)
2427 + while (bad_addr(&addr, size, align) && addr+size <= ei_last)
2429 + last = addr + size;
2430 + if (last > ei_last)
2440 + * Find next free range after *start
2442 +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
2446 + for (i = 0; i < e820.nr_map; i++) {
2447 + struct e820entry *ei = &e820.map[i];
2451 + if (ei->type != E820_RAM)
2453 + addr = round_up(ei->addr, align);
2454 + ei_last = ei->addr + ei->size;
2456 + addr = round_up(start, align);
2457 + if (addr >= ei_last)
2459 + *sizep = ei_last - addr;
2460 + while (bad_addr_size(&addr, sizep, align) &&
2461 + addr + *sizep <= ei_last)
2463 + last = addr + *sizep;
2464 + if (last > ei_last)
2473 + * pre-allocated 4k and reserved it in e820
2475 +u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
2482 + while (size < sizet)
2483 + start = find_e820_area_size(start, &size, align);
2488 + addr = round_down(start + size - sizet, align);
2489 + e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
2490 + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
2491 + printk(KERN_INFO "update e820 for early_reserve_e820\n");
2493 + update_e820_saved();
2498 +#ifdef CONFIG_X86_32
2499 +# ifdef CONFIG_X86_PAE
2500 +# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT))
2502 +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
2504 +#else /* CONFIG_X86_32 */
2505 +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
2509 + * Find the highest page frame number we have available
2511 +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
2514 + unsigned long last_pfn = 0;
2515 + unsigned long max_arch_pfn = MAX_ARCH_PFN;
2517 + for (i = 0; i < e820.nr_map; i++) {
2518 + struct e820entry *ei = &e820.map[i];
2519 + unsigned long start_pfn;
2520 + unsigned long end_pfn;
2522 + if (ei->type != type)
2525 + start_pfn = ei->addr >> PAGE_SHIFT;
2526 + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
2528 + if (start_pfn >= limit_pfn)
2530 + if (end_pfn > limit_pfn) {
2531 + last_pfn = limit_pfn;
2534 + if (end_pfn > last_pfn)
2535 + last_pfn = end_pfn;
2538 + if (last_pfn > max_arch_pfn)
2539 + last_pfn = max_arch_pfn;
2541 + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
2542 + last_pfn, max_arch_pfn);
2545 +unsigned long __init e820_end_of_ram_pfn(void)
2547 + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
2550 +unsigned long __init e820_end_of_low_ram_pfn(void)
2552 + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
2555 + * Finds an active region in the address range from start_pfn to last_pfn and
2556 + * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
2558 +int __init e820_find_active_region(const struct e820entry *ei,
2559 + unsigned long start_pfn,
2560 + unsigned long last_pfn,
2561 + unsigned long *ei_startpfn,
2562 + unsigned long *ei_endpfn)
2564 + u64 align = PAGE_SIZE;
2566 + *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
2567 + *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
2569 + /* Skip map entries smaller than a page */
2570 + if (*ei_startpfn >= *ei_endpfn)
2573 + /* Skip if map is outside the node */
2574 + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
2575 + *ei_startpfn >= last_pfn)
2578 + /* Check for overlaps */
2579 + if (*ei_startpfn < start_pfn)
2580 + *ei_startpfn = start_pfn;
2581 + if (*ei_endpfn > last_pfn)
2582 + *ei_endpfn = last_pfn;
2587 +/* Walk the e820 map and register active regions within a node */
2588 +void __init e820_register_active_regions(int nid, unsigned long start_pfn,
2589 + unsigned long last_pfn)
2591 + unsigned long ei_startpfn;
2592 + unsigned long ei_endpfn;
2595 + for (i = 0; i < e820.nr_map; i++)
2596 + if (e820_find_active_region(&e820.map[i],
2597 + start_pfn, last_pfn,
2598 + &ei_startpfn, &ei_endpfn))
2599 + add_active_range(nid, ei_startpfn, ei_endpfn);
2603 + * Find the hole size (in bytes) in the memory range.
2604 + * @start: starting address of the memory range to scan
2605 + * @end: ending address of the memory range to scan
2607 +u64 __init e820_hole_size(u64 start, u64 end)
2609 + unsigned long start_pfn = start >> PAGE_SHIFT;
2610 + unsigned long last_pfn = end >> PAGE_SHIFT;
2611 + unsigned long ei_startpfn, ei_endpfn, ram = 0;
2614 + for (i = 0; i < e820.nr_map; i++) {
2615 + if (e820_find_active_region(&e820.map[i],
2616 + start_pfn, last_pfn,
2617 + &ei_startpfn, &ei_endpfn))
2618 + ram += ei_endpfn - ei_startpfn;
2620 + return end - start - ((u64)ram << PAGE_SHIFT);
2623 +static void early_panic(char *msg)
2625 + early_printk(msg);
2629 +static int userdef __initdata;
2631 +/* "mem=nopentium" disables the 4MB page tables. */
2632 +static int __init parse_memopt(char *p)
2634 + u64 mem_size, current_end;
2640 +#ifdef CONFIG_X86_32
2641 + if (!strcmp(p, "nopentium")) {
2642 + setup_clear_cpu_cap(X86_FEATURE_PSE);
2648 + mem_size = memparse(p, &p);
2649 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2651 + i = e820.nr_map - 1;
2652 + current_end = e820.map[i].addr + e820.map[i].size;
2653 + if (current_end < mem_size) {
2655 + * The e820 map ends before our requested size so
2656 + * extend the final entry to the requested address.
2658 + if (e820.map[i].type == E820_RAM)
2659 + e820.map[i].size = mem_size - e820.map[i].addr;
2661 + e820_add_region(current_end, mem_size - current_end, E820_RAM);
2666 +early_param("mem", parse_memopt);
2669 +static int __init parse_memmap_opt(char *p)
2672 + u64 start_at, mem_size;
2677 + if (!strncmp(p, "exactmap", 8)) {
2678 +#ifdef CONFIG_CRASH_DUMP
2680 + * If we are doing a crash dump, we still need to know
2681 + * the real mem size before original memory map is
2684 + saved_max_pfn = e820_end_of_ram_pfn();
2692 + mem_size = memparse(p, &p);
2698 + start_at = memparse(p+1, &p);
2699 + e820_add_region(start_at, mem_size, E820_RAM);
2700 + } else if (*p == '#') {
2701 + start_at = memparse(p+1, &p);
2702 + e820_add_region(start_at, mem_size, E820_ACPI);
2703 + } else if (*p == '$') {
2704 + start_at = memparse(p+1, &p);
2705 + e820_add_region(start_at, mem_size, E820_RESERVED);
2707 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2709 + return *p == '\0' ? 0 : -EINVAL;
2711 +early_param("memmap", parse_memmap_opt);
2713 +void __init finish_e820_parsing(void)
2716 + int nr = e820.nr_map;
2718 + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
2719 + early_panic("Invalid user supplied memory map");
2722 + printk(KERN_INFO "user-defined physical RAM map:\n");
2723 + e820_print_map("user");
2728 +static inline const char *e820_type_to_string(int e820_type)
2730 + switch (e820_type) {
2731 + case E820_RESERVED_KERN:
2732 + case E820_RAM: return "System RAM";
2733 + case E820_ACPI: return "ACPI Tables";
2734 + case E820_NVS: return "ACPI Non-volatile Storage";
2735 + default: return "reserved";
2740 +#define e820 machine_e820
2744 + * Mark e820 reserved areas as busy for the resource manager.
2746 +void __init e820_reserve_resources(void)
2749 + struct resource *res;
2752 + res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
2753 + for (i = 0; i < e820.nr_map; i++) {
2754 + end = e820.map[i].addr + e820.map[i].size - 1;
2755 +#ifndef CONFIG_RESOURCES_64BIT
2756 + if (end > 0x100000000ULL) {
2761 + res->name = e820_type_to_string(e820.map[i].type);
2762 + res->start = e820.map[i].addr;
2765 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
2766 + insert_resource(&iomem_resource, res);
2770 + for (i = 0; i < e820_saved.nr_map; i++) {
2771 + struct e820entry *entry = &e820_saved.map[i];
2772 + firmware_map_add_early(entry->addr,
2773 + entry->addr + entry->size - 1,
2774 + e820_type_to_string(entry->type));
2781 +char *__init default_machine_specific_memory_setup(void)
2783 + char *who = "BIOS-e820";
2786 + * Try to copy the BIOS-supplied E820-map.
2788 + * Otherwise fake a memory map; one section from 0k->640k,
2789 + * the next section from 1mb->appropriate_mem_k
2791 + new_nr = boot_params.e820_entries;
2792 + sanitize_e820_map(boot_params.e820_map,
2793 + ARRAY_SIZE(boot_params.e820_map),
2795 + boot_params.e820_entries = new_nr;
2796 + if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
2800 + /* compare results from other methods and take the greater */
2801 + if (boot_params.alt_mem_k
2802 + < boot_params.screen_info.ext_mem_k) {
2803 + mem_size = boot_params.screen_info.ext_mem_k;
2806 + mem_size = boot_params.alt_mem_k;
2807 + who = "BIOS-e801";
2811 + e820_add_region(0, LOWMEMSIZE(), E820_RAM);
2812 + e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
2815 + /* In case someone cares... */
2819 +char *__init __attribute__((weak)) machine_specific_memory_setup(void)
2821 + if (x86_quirks->arch_memory_setup) {
2822 + char *who = x86_quirks->arch_memory_setup();
2827 + return default_machine_specific_memory_setup();
2831 +char * __init memory_setup(void)
2834 + struct xen_memory_map memmap;
2836 + * This is rather large for a stack variable but this early in
2837 + * the boot process we know we have plenty slack space.
2839 + struct e820entry map[E820MAX];
2841 + memmap.nr_entries = E820MAX;
2842 + set_xen_guest_handle(memmap.buffer, map);
2844 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
2845 + if (rc == -ENOSYS) {
2846 + memmap.nr_entries = 1;
2847 + map[0].addr = 0ULL;
2848 + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
2849 + /* 8MB slack (to balance backend allocations). */
2850 + map[0].size += 8ULL << 20;
2851 + map[0].type = E820_RAM;
2856 + nr_map = memmap.nr_entries;
2857 + sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
2859 + if (append_e820_map(map, nr_map) < 0)
2863 + if (is_initial_xendomain()) {
2864 + memmap.nr_entries = E820MAX;
2865 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
2867 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
2869 + machine_e820.nr_map = memmap.nr_entries;
2871 + machine_e820 = e820;
2877 +void __init setup_memory_map(void)
2881 + who = memory_setup();
2882 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
2883 + printk(KERN_INFO "Xen-provided physical RAM map:\n");
2884 + e820_print_map(who);
2886 Index: head-2008-12-01/arch/x86/kernel/e820_32-xen.c
2887 ===================================================================
2888 --- head-2008-12-01.orig/arch/x86/kernel/e820_32-xen.c 2008-12-01 11:44:55.000000000 +0100
2889 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
2891 -#include <linux/kernel.h>
2892 -#include <linux/types.h>
2893 -#include <linux/init.h>
2894 -#include <linux/bootmem.h>
2895 -#include <linux/ioport.h>
2896 -#include <linux/string.h>
2897 -#include <linux/kexec.h>
2898 -#include <linux/module.h>
2899 -#include <linux/mm.h>
2900 -#include <linux/pfn.h>
2901 -#include <linux/uaccess.h>
2902 -#include <linux/suspend.h>
2904 -#include <asm/pgtable.h>
2905 -#include <asm/page.h>
2906 -#include <asm/e820.h>
2907 -#include <asm/setup.h>
2908 -#include <xen/interface/memory.h>
2910 -struct e820map e820;
2911 -struct change_member {
2912 - struct e820entry *pbios; /* pointer to original bios entry */
2913 - unsigned long long addr; /* address for this change point */
2915 -static struct change_member change_point_list[2*E820MAX] __initdata;
2916 -static struct change_member *change_point[2*E820MAX] __initdata;
2917 -static struct e820entry *overlap_list[E820MAX] __initdata;
2918 -static struct e820entry new_bios[E820MAX] __initdata;
2919 -/* For PCI or other memory-mapped resources */
2920 -unsigned long pci_mem_start = 0x10000000;
2922 -EXPORT_SYMBOL(pci_mem_start);
2924 -extern int user_defined_memmap;
2926 -static struct resource system_rom_resource = {
2927 - .name = "System ROM",
2930 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2933 -static struct resource extension_rom_resource = {
2934 - .name = "Extension ROM",
2937 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2940 -static struct resource adapter_rom_resources[] = { {
2941 - .name = "Adapter ROM",
2944 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2946 - .name = "Adapter ROM",
2949 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2951 - .name = "Adapter ROM",
2954 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2956 - .name = "Adapter ROM",
2959 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2961 - .name = "Adapter ROM",
2964 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2966 - .name = "Adapter ROM",
2969 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2972 -static struct resource video_rom_resource = {
2973 - .name = "Video ROM",
2976 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2979 -#define ROMSIGNATURE 0xaa55
2981 -static int __init romsignature(const unsigned char *rom)
2983 - const unsigned short * const ptr = (const unsigned short *)rom;
2984 - unsigned short sig;
2986 - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
2989 -static int __init romchecksum(const unsigned char *rom, unsigned long length)
2991 - unsigned char sum, c;
2993 - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
2995 - return !length && !sum;
2998 -static void __init probe_roms(void)
3000 - const unsigned char *rom;
3001 - unsigned long start, length, upper;
3006 - /* Nothing to do if not running in dom0. */
3007 - if (!is_initial_xendomain())
3012 - upper = adapter_rom_resources[0].start;
3013 - for (start = video_rom_resource.start; start < upper; start += 2048) {
3014 - rom = isa_bus_to_virt(start);
3015 - if (!romsignature(rom))
3018 - video_rom_resource.start = start;
3020 - if (probe_kernel_address(rom + 2, c) != 0)
3023 - /* 0 < length <= 0x7f * 512, historically */
3026 - /* if checksum okay, trust length byte */
3027 - if (length && romchecksum(rom, length))
3028 - video_rom_resource.end = start + length - 1;
3030 - request_resource(&iomem_resource, &video_rom_resource);
3034 - start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
3035 - if (start < upper)
3039 - request_resource(&iomem_resource, &system_rom_resource);
3040 - upper = system_rom_resource.start;
3042 - /* check for extension rom (ignore length byte!) */
3043 - rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
3044 - if (romsignature(rom)) {
3045 - length = extension_rom_resource.end - extension_rom_resource.start + 1;
3046 - if (romchecksum(rom, length)) {
3047 - request_resource(&iomem_resource, &extension_rom_resource);
3048 - upper = extension_rom_resource.start;
3052 - /* check for adapter roms on 2k boundaries */
3053 - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
3054 - rom = isa_bus_to_virt(start);
3055 - if (!romsignature(rom))
3058 - if (probe_kernel_address(rom + 2, c) != 0)
3061 - /* 0 < length <= 0x7f * 512, historically */
3064 - /* but accept any length that fits if checksum okay */
3065 - if (!length || start + length > upper || !romchecksum(rom, length))
3068 - adapter_rom_resources[i].start = start;
3069 - adapter_rom_resources[i].end = start + length - 1;
3070 - request_resource(&iomem_resource, &adapter_rom_resources[i]);
3072 - start = adapter_rom_resources[i++].end & ~2047UL;
3077 -static struct e820map machine_e820;
3078 -#define e820 machine_e820
3082 - * Request address space for all standard RAM and ROM resources
3083 - * and also for regions reported as reserved by the e820.
3085 -void __init init_iomem_resources(struct resource *code_resource,
3086 - struct resource *data_resource,
3087 - struct resource *bss_resource)
3092 - for (i = 0; i < e820.nr_map; i++) {
3093 - struct resource *res;
3094 -#ifndef CONFIG_RESOURCES_64BIT
3095 - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
3098 - res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
3099 - switch (e820.map[i].type) {
3100 - case E820_RAM: res->name = "System RAM"; break;
3101 - case E820_ACPI: res->name = "ACPI Tables"; break;
3102 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
3103 - default: res->name = "reserved";
3105 - res->start = e820.map[i].addr;
3106 - res->end = res->start + e820.map[i].size - 1;
3107 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3108 - if (request_resource(&iomem_resource, res)) {
3112 - if (e820.map[i].type == E820_RAM) {
3114 - * We don't know which RAM region contains kernel data,
3115 - * so we try it repeatedly and let the resource manager
3119 - request_resource(res, code_resource);
3120 - request_resource(res, data_resource);
3121 - request_resource(res, bss_resource);
3123 -#ifdef CONFIG_KEXEC
3124 - if (crashk_res.start != crashk_res.end)
3125 - request_resource(res, &crashk_res);
3127 - xen_machine_kexec_register_resources(res);
3136 -#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
3138 - * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
3139 - * correspond to e820 RAM areas and mark the corresponding pages as nosave for
3142 - * This function requires the e820 map to be sorted and without any
3143 - * overlapping entries and assumes the first e820 area to be RAM.
3145 -void __init e820_mark_nosave_regions(void)
3148 - unsigned long pfn;
3150 - pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
3151 - for (i = 1; i < e820.nr_map; i++) {
3152 - struct e820entry *ei = &e820.map[i];
3154 - if (pfn < PFN_UP(ei->addr))
3155 - register_nosave_region(pfn, PFN_UP(ei->addr));
3157 - pfn = PFN_DOWN(ei->addr + ei->size);
3158 - if (ei->type != E820_RAM)
3159 - register_nosave_region(PFN_UP(ei->addr), pfn);
3161 - if (pfn >= max_low_pfn)
3167 -void __init add_memory_region(unsigned long long start,
3168 - unsigned long long size, int type)
3174 - if (x == E820MAX) {
3175 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
3179 - e820.map[x].addr = start;
3180 - e820.map[x].size = size;
3181 - e820.map[x].type = type;
3183 -} /* add_memory_region */
3186 - * Sanitize the BIOS e820 map.
3188 - * Some e820 responses include overlapping entries. The following
3189 - * replaces the original e820 map with a new one, removing overlaps.
3192 -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
3194 - struct change_member *change_tmp;
3195 - unsigned long current_type, last_type;
3196 - unsigned long long last_addr;
3197 - int chgidx, still_changing;
3198 - int overlap_entries;
3199 - int new_bios_entry;
3200 - int old_nr, new_nr, chg_nr;
3204 - Visually we're performing the following (1,2,3,4 = memory types)...
3206 - Sample memory map (w/overlaps):
3207 - ____22__________________
3208 - ______________________4_
3209 - ____1111________________
3210 - _44_____________________
3211 - 11111111________________
3212 - ____________________33__
3213 - ___________44___________
3214 - __________33333_________
3215 - ______________22________
3216 - ___________________2222_
3217 - _________111111111______
3218 - _____________________11_
3219 - _________________4______
3221 - Sanitized equivalent (no overlap):
3222 - 1_______________________
3223 - _44_____________________
3224 - ___1____________________
3225 - ____22__________________
3226 - ______11________________
3227 - _________1______________
3228 - __________3_____________
3229 - ___________44___________
3230 - _____________33_________
3231 - _______________2________
3232 - ________________1_______
3233 - _________________4______
3234 - ___________________2____
3235 - ____________________33__
3236 - ______________________4_
3238 - /* if there's only one memory region, don't bother */
3239 - if (*pnr_map < 2) {
3243 - old_nr = *pnr_map;
3245 - /* bail out if we find any unreasonable addresses in bios map */
3246 - for (i=0; i<old_nr; i++)
3247 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
3251 - /* create pointers for initial change-point information (for sorting) */
3252 - for (i=0; i < 2*old_nr; i++)
3253 - change_point[i] = &change_point_list[i];
3255 - /* record all known change-points (starting and ending addresses),
3256 - omitting those that are for empty memory regions */
3258 - for (i=0; i < old_nr; i++) {
3259 - if (biosmap[i].size != 0) {
3260 - change_point[chgidx]->addr = biosmap[i].addr;
3261 - change_point[chgidx++]->pbios = &biosmap[i];
3262 - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
3263 - change_point[chgidx++]->pbios = &biosmap[i];
3266 - chg_nr = chgidx; /* true number of change-points */
3268 - /* sort change-point list by memory addresses (low -> high) */
3269 - still_changing = 1;
3270 - while (still_changing) {
3271 - still_changing = 0;
3272 - for (i=1; i < chg_nr; i++) {
3273 - /* if <current_addr> > <last_addr>, swap */
3274 - /* or, if current=<start_addr> & last=<end_addr>, swap */
3275 - if ((change_point[i]->addr < change_point[i-1]->addr) ||
3276 - ((change_point[i]->addr == change_point[i-1]->addr) &&
3277 - (change_point[i]->addr == change_point[i]->pbios->addr) &&
3278 - (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
3281 - change_tmp = change_point[i];
3282 - change_point[i] = change_point[i-1];
3283 - change_point[i-1] = change_tmp;
3289 - /* create a new bios memory map, removing overlaps */
3290 - overlap_entries=0; /* number of entries in the overlap table */
3291 - new_bios_entry=0; /* index for creating new bios map entries */
3292 - last_type = 0; /* start with undefined memory type */
3293 - last_addr = 0; /* start with 0 as last starting address */
3294 - /* loop through change-points, determining affect on the new bios map */
3295 - for (chgidx=0; chgidx < chg_nr; chgidx++)
3297 - /* keep track of all overlapping bios entries */
3298 - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
3300 - /* add map entry to overlap list (> 1 entry implies an overlap) */
3301 - overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
3305 - /* remove entry from list (order independent, so swap with last) */
3306 - for (i=0; i<overlap_entries; i++)
3308 - if (overlap_list[i] == change_point[chgidx]->pbios)
3309 - overlap_list[i] = overlap_list[overlap_entries-1];
3311 - overlap_entries--;
3313 - /* if there are overlapping entries, decide which "type" to use */
3314 - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
3316 - for (i=0; i<overlap_entries; i++)
3317 - if (overlap_list[i]->type > current_type)
3318 - current_type = overlap_list[i]->type;
3319 - /* continue building up new bios map based on this information */
3320 - if (current_type != last_type) {
3321 - if (last_type != 0) {
3322 - new_bios[new_bios_entry].size =
3323 - change_point[chgidx]->addr - last_addr;
3324 - /* move forward only if the new size was non-zero */
3325 - if (new_bios[new_bios_entry].size != 0)
3326 - if (++new_bios_entry >= E820MAX)
3327 - break; /* no more space left for new bios entries */
3329 - if (current_type != 0) {
3330 - new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
3331 - new_bios[new_bios_entry].type = current_type;
3332 - last_addr=change_point[chgidx]->addr;
3334 - last_type = current_type;
3337 - new_nr = new_bios_entry; /* retain count for new bios entries */
3339 - /* copy new bios mapping into original location */
3340 - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
3341 - *pnr_map = new_nr;
3347 - * Copy the BIOS e820 map into a safe place.
3349 - * Sanity-check it while we're at it..
3351 - * If we're lucky and live on a modern system, the setup code
3352 - * will have given us a memory map that we can use to properly
3353 - * set up memory. If we aren't, we'll fake a memory map.
3355 - * We check to see that the memory map contains at least 2 elements
3356 - * before we'll use it, because the detection code in setup.S may
3357 - * not be perfect and most every PC known to man has two memory
3358 - * regions: one from 0 to 640k, and one from 1mb up. (The IBM
3359 - * thinkpad 560x, for example, does not cooperate with the memory
3360 - * detection code.)
3362 -int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
3365 - /* Only one memory region (or negative)? Ignore it */
3369 - BUG_ON(nr_map < 1);
3373 - u64 start = biosmap->addr;
3374 - u64 size = biosmap->size;
3375 - u64 end = start + size;
3376 - u32 type = biosmap->type;
3378 - /* Overflow in 64 bits? Ignore the memory map. */
3382 - add_memory_region(start, size, type);
3383 - } while (biosmap++, --nr_map);
3386 - if (is_initial_xendomain()) {
3387 - struct xen_memory_map memmap;
3389 - memmap.nr_entries = E820MAX;
3390 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3392 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3394 - machine_e820.nr_map = memmap.nr_entries;
3396 - machine_e820 = e820;
3403 - * Find the highest page frame number we have available
3405 -void __init propagate_e820_map(void)
3411 - for (i = 0; i < e820.nr_map; i++) {
3412 - unsigned long start, end;
3414 - if (e820.map[i].type != E820_RAM)
3416 - start = PFN_UP(e820.map[i].addr);
3417 - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3420 - if (end > max_pfn)
3422 - memory_present(0, start, end);
3427 - * Register fully available low RAM pages with the bootmem allocator.
3429 -void __init register_bootmem_low_pages(unsigned long max_low_pfn)
3433 - for (i = 0; i < e820.nr_map; i++) {
3434 - unsigned long curr_pfn, last_pfn, size;
3436 - * Reserve usable low memory
3438 - if (e820.map[i].type != E820_RAM)
3441 - * We are rounding up the start address of usable memory:
3443 - curr_pfn = PFN_UP(e820.map[i].addr);
3444 - if (curr_pfn >= max_low_pfn)
3447 - * ... and at the end of the usable range downwards:
3449 - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3453 - * Truncate to the number of actual pages currently
3456 - if (last_pfn > xen_start_info->nr_pages)
3457 - last_pfn = xen_start_info->nr_pages;
3460 - if (last_pfn > max_low_pfn)
3461 - last_pfn = max_low_pfn;
3464 - * .. finally, did all the rounding and playing
3465 - * around just make the area go away?
3467 - if (last_pfn <= curr_pfn)
3470 - size = last_pfn - curr_pfn;
3471 - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
3475 -void __init e820_register_memory(void)
3477 - unsigned long gapstart, gapsize, round;
3478 - unsigned long long last;
3482 - if (is_initial_xendomain()) {
3483 - struct xen_memory_map memmap;
3485 - memmap.nr_entries = E820MAX;
3486 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3488 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3490 - machine_e820.nr_map = memmap.nr_entries;
3493 - machine_e820 = e820;
3494 -#define e820 machine_e820
3498 - * Search for the biggest gap in the low 32 bits of the e820
3501 - last = 0x100000000ull;
3502 - gapstart = 0x10000000;
3503 - gapsize = 0x400000;
3505 - while (--i >= 0) {
3506 - unsigned long long start = e820.map[i].addr;
3507 - unsigned long long end = start + e820.map[i].size;
3510 - * Since "last" is at most 4GB, we know we'll
3511 - * fit in 32 bits if this condition is true
3514 - unsigned long gap = last - end;
3516 - if (gap > gapsize) {
3527 - * See how much we want to round up: start off with
3528 - * rounding to the next 1MB area.
3531 - while ((gapsize >> 4) > round)
3533 - /* Fun with two's complement */
3534 - pci_mem_start = (gapstart + round) & -round;
3536 - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
3537 - pci_mem_start, gapstart, gapsize);
3540 -void __init print_memory_map(char *who)
3544 - for (i = 0; i < e820.nr_map; i++) {
3545 - printk(" %s: %016Lx - %016Lx ", who,
3547 - e820.map[i].addr + e820.map[i].size);
3548 - switch (e820.map[i].type) {
3549 - case E820_RAM: printk("(usable)\n");
3551 - case E820_RESERVED:
3552 - printk("(reserved)\n");
3555 - printk("(ACPI data)\n");
3558 - printk("(ACPI NVS)\n");
3560 - default: printk("type %u\n", e820.map[i].type);
3566 -void __init limit_regions(unsigned long long size)
3568 - unsigned long long current_addr = 0;
3571 - print_memory_map("limit_regions start");
3572 - for (i = 0; i < e820.nr_map; i++) {
3573 - current_addr = e820.map[i].addr + e820.map[i].size;
3574 - if (current_addr < size)
3577 - if (e820.map[i].type != E820_RAM)
3580 - if (e820.map[i].addr >= size) {
3582 - * This region starts past the end of the
3583 - * requested size, skip it completely.
3587 - e820.nr_map = i + 1;
3588 - e820.map[i].size -= current_addr - size;
3590 - print_memory_map("limit_regions endfor");
3594 - if (current_addr < size) {
3596 - * The e820 map finished before our requested size so
3597 - * extend the final entry to the requested address.
3600 - if (e820.map[i].type == E820_RAM)
3601 - e820.map[i].size -= current_addr - size;
3603 - add_memory_region(current_addr, size - current_addr, E820_RAM);
3606 - print_memory_map("limit_regions endfunc");
3610 - * This function checks if any part of the range <start,end> is mapped
3614 -e820_any_mapped(u64 start, u64 end, unsigned type)
3619 - for (i = 0; i < e820.nr_map; i++) {
3620 - const struct e820entry *ei = &e820.map[i];
3622 - if (!is_initial_xendomain())
3624 - for (i = 0; i < machine_e820.nr_map; ++i) {
3625 - const struct e820entry *ei = &machine_e820.map[i];
3628 - if (type && ei->type != type)
3630 - if (ei->addr >= end || ei->addr + ei->size <= start)
3636 -EXPORT_SYMBOL_GPL(e820_any_mapped);
3639 - * This function checks if the entire range <start,end> is mapped with type.
3641 - * Note: this function only works correct if the e820 table is sorted and
3642 - * not-overlapping, which is the case
3645 -e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
3652 - for (i = 0; i < e820.nr_map; i++) {
3653 - struct e820entry *ei = &e820.map[i];
3655 - if (!is_initial_xendomain())
3657 - for (i = 0; i < machine_e820.nr_map; ++i) {
3658 - const struct e820entry *ei = &machine_e820.map[i];
3661 - if (type && ei->type != type)
3663 - /* is the region (part) in overlap with the current region ?*/
3664 - if (ei->addr >= end || ei->addr + ei->size <= start)
3666 - /* if the region is at the beginning of <start,end> we move
3667 - * start to the end of the region since it's ok until there
3669 - if (ei->addr <= start)
3670 - start = ei->addr + ei->size;
3671 - /* if start is now at or beyond end, we're done, full
3674 - return 1; /* we're done */
3679 -static int __init parse_memmap(char *arg)
3684 - if (strcmp(arg, "exactmap") == 0) {
3685 -#ifdef CONFIG_CRASH_DUMP
3686 - /* If we are doing a crash dump, we
3687 - * still need to know the real mem
3688 - * size before original memory map is
3691 - propagate_e820_map();
3692 - saved_max_pfn = max_pfn;
3695 - user_defined_memmap = 1;
3697 - /* If the user specifies memory size, we
3698 - * limit the BIOS-provided memory map to
3699 - * that size. exactmap can be used to specify
3700 - * the exact map. mem=number can be used to
3701 - * trim the existing memory map.
3703 - unsigned long long start_at, mem_size;
3705 - mem_size = memparse(arg, &arg);
3706 - if (*arg == '@') {
3707 - start_at = memparse(arg+1, &arg);
3708 - add_memory_region(start_at, mem_size, E820_RAM);
3709 - } else if (*arg == '#') {
3710 - start_at = memparse(arg+1, &arg);
3711 - add_memory_region(start_at, mem_size, E820_ACPI);
3712 - } else if (*arg == '$') {
3713 - start_at = memparse(arg+1, &arg);
3714 - add_memory_region(start_at, mem_size, E820_RESERVED);
3716 - limit_regions(mem_size);
3717 - user_defined_memmap = 1;
3722 -early_param("memmap", parse_memmap);
3725 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
3726 - unsigned new_type)
3730 - BUG_ON(old_type == new_type);
3732 - for (i = 0; i < e820.nr_map; i++) {
3733 - struct e820entry *ei = &e820.map[i];
3734 - u64 final_start, final_end;
3735 - if (ei->type != old_type)
3737 - /* totally covered? */
3738 - if (ei->addr >= start && ei->size <= size) {
3739 - ei->type = new_type;
3742 - /* partially covered */
3743 - final_start = max(start, ei->addr);
3744 - final_end = min(start + size, ei->addr + ei->size);
3745 - if (final_start >= final_end)
3747 - add_memory_region(final_start, final_end - final_start,
3752 -void __init update_e820(void)
3756 - nr_map = e820.nr_map;
3757 - if (sanitize_e820_map(e820.map, &nr_map))
3759 - e820.nr_map = nr_map;
3760 - printk(KERN_INFO "modified physical RAM map:\n");
3761 - print_memory_map("modified");
3764 Index: head-2008-12-01/arch/x86/kernel/e820_64-xen.c
3765 ===================================================================
3766 --- head-2008-12-01.orig/arch/x86/kernel/e820_64-xen.c 2008-12-01 11:44:55.000000000 +0100
3767 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3770 - * Handle the memory map.
3771 - * The functions here do the job until bootmem takes over.
3773 - * Getting sanitize_e820_map() in sync with i386 version by applying change:
3774 - * - Provisions for empty E820 memory regions (reported by certain BIOSes).
3775 - * Alex Achenbach <xela@slit.de>, December 2002.
3776 - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
3779 -#include <linux/kernel.h>
3780 -#include <linux/types.h>
3781 -#include <linux/init.h>
3782 -#include <linux/bootmem.h>
3783 -#include <linux/ioport.h>
3784 -#include <linux/string.h>
3785 -#include <linux/kexec.h>
3786 -#include <linux/module.h>
3787 -#include <linux/mm.h>
3788 -#include <linux/suspend.h>
3789 -#include <linux/pfn.h>
3791 -#include <asm/pgtable.h>
3792 -#include <asm/page.h>
3793 -#include <asm/e820.h>
3794 -#include <asm/proto.h>
3795 -#include <asm/setup.h>
3796 -#include <asm/sections.h>
3797 -#include <asm/kdebug.h>
3798 -#include <xen/interface/memory.h>
3800 -struct e820map e820 __initdata;
3802 -struct e820map machine_e820;
3806 - * PFN of last memory page.
3808 -unsigned long end_pfn;
3811 - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
3812 - * The direct mapping extends to max_pfn_mapped, so that we can directly access
3813 - * apertures, ACPI and other tables without having to play with fixmaps.
3815 -unsigned long max_pfn_mapped;
3818 - * Last pfn which the user wants to use.
3820 -static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
3823 - * Early reserved memory areas.
3825 -#define MAX_EARLY_RES 20
3828 - unsigned long start, end;
3831 -static struct early_res early_res[MAX_EARLY_RES] __initdata = {
3833 - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
3834 -#ifdef CONFIG_X86_TRAMPOLINE
3835 - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
3841 -void __init reserve_early(unsigned long start, unsigned long end, char *name)
3844 - struct early_res *r;
3845 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3846 - r = &early_res[i];
3847 - if (end > r->start && start < r->end)
3848 - panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
3849 - start, end - 1, name?name:"", r->start, r->end - 1, r->name);
3851 - if (i >= MAX_EARLY_RES)
3852 - panic("Too many early reservations");
3853 - r = &early_res[i];
3857 - strncpy(r->name, name, sizeof(r->name) - 1);
3860 -void __init free_early(unsigned long start, unsigned long end)
3862 - struct early_res *r;
3865 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3866 - r = &early_res[i];
3867 - if (start == r->start && end == r->end)
3870 - if (i >= MAX_EARLY_RES || !early_res[i].end)
3871 - panic("free_early on not reserved area: %lx-%lx!", start, end);
3873 - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
3876 - memmove(&early_res[i], &early_res[i + 1],
3877 - (j - 1 - i) * sizeof(struct early_res));
3879 - early_res[j - 1].end = 0;
3882 -void __init early_res_to_bootmem(unsigned long start, unsigned long end)
3885 - unsigned long final_start, final_end;
3886 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3887 - struct early_res *r = &early_res[i];
3888 - final_start = max(start, r->start);
3889 - final_end = min(end, r->end);
3890 - if (final_start >= final_end)
3892 - printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
3893 - final_start, final_end - 1, r->name);
3894 - reserve_bootmem_generic(final_start, final_end - final_start);
3898 -/* Check for already reserved areas */
3899 -static inline int __init
3900 -bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
3903 - unsigned long addr = *addrp, last;
3906 - last = addr + size;
3907 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3908 - struct early_res *r = &early_res[i];
3909 - if (last >= r->start && addr < r->end) {
3910 - *addrp = addr = round_up(r->end, align);
3918 -/* Check for already reserved areas */
3919 -static inline int __init
3920 -bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
3923 - unsigned long addr = *addrp, last;
3924 - unsigned long size = *sizep;
3927 - last = addr + size;
3928 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3929 - struct early_res *r = &early_res[i];
3930 - if (last > r->start && addr < r->start) {
3931 - size = r->start - addr;
3935 - if (last > r->end && addr < r->end) {
3936 - addr = round_up(r->end, align);
3937 - size = last - addr;
3941 - if (last <= r->end && addr >= r->start) {
3953 - * This function checks if any part of the range <start,end> is mapped
3957 -e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
3962 - for (i = 0; i < e820.nr_map; i++) {
3963 - struct e820entry *ei = &e820.map[i];
3965 - if (!is_initial_xendomain())
3967 - for (i = 0; i < machine_e820.nr_map; i++) {
3968 - const struct e820entry *ei = &machine_e820.map[i];
3971 - if (type && ei->type != type)
3973 - if (ei->addr >= end || ei->addr + ei->size <= start)
3979 -EXPORT_SYMBOL_GPL(e820_any_mapped);
3982 - * This function checks if the entire range <start,end> is mapped with type.
3984 - * Note: this function only works correct if the e820 table is sorted and
3985 - * not-overlapping, which is the case
3987 -int __init e820_all_mapped(unsigned long start, unsigned long end,
3993 - for (i = 0; i < e820.nr_map; i++) {
3994 - struct e820entry *ei = &e820.map[i];
3996 - if (!is_initial_xendomain())
3998 - for (i = 0; i < machine_e820.nr_map; i++) {
3999 - const struct e820entry *ei = &machine_e820.map[i];
4002 - if (type && ei->type != type)
4004 - /* is the region (part) in overlap with the current region ?*/
4005 - if (ei->addr >= end || ei->addr + ei->size <= start)
4008 - /* if the region is at the beginning of <start,end> we move
4009 - * start to the end of the region since it's ok until there
4011 - if (ei->addr <= start)
4012 - start = ei->addr + ei->size;
4014 - * if start is now at or beyond end, we're done, full
4024 - * Find a free area with specified alignment in a specific range.
4026 -unsigned long __init find_e820_area(unsigned long start, unsigned long end,
4027 - unsigned long size, unsigned long align)
4031 - for (i = 0; i < e820.nr_map; i++) {
4032 - struct e820entry *ei = &e820.map[i];
4033 - unsigned long addr, last;
4034 - unsigned long ei_last;
4036 - if (ei->type != E820_RAM)
4038 - addr = round_up(ei->addr, align);
4039 - ei_last = ei->addr + ei->size;
4041 - addr = round_up(start, align);
4042 - if (addr >= ei_last)
4044 - while (bad_addr(&addr, size, align) && addr+size <= ei_last)
4046 - last = addr + size;
4047 - if (last > ei_last)
4057 - * Find next free range after *start
4059 -unsigned long __init find_e820_area_size(unsigned long start,
4060 - unsigned long *sizep,
4061 - unsigned long align)
4065 - for (i = 0; i < e820.nr_map; i++) {
4066 - struct e820entry *ei = &e820.map[i];
4067 - unsigned long addr, last;
4068 - unsigned long ei_last;
4070 - if (ei->type != E820_RAM)
4072 - addr = round_up(ei->addr, align);
4073 - ei_last = ei->addr + ei->size;
4075 - addr = round_up(start, align);
4076 - if (addr >= ei_last)
4078 - *sizep = ei_last - addr;
4079 - while (bad_addr_size(&addr, sizep, align) &&
4080 - addr + *sizep <= ei_last)
4082 - last = addr + *sizep;
4083 - if (last > ei_last)
4091 - * Find the highest page frame number we have available
4093 -unsigned long __init e820_end_of_ram(void)
4095 - unsigned long end_pfn;
4097 - end_pfn = find_max_pfn_with_active_regions();
4099 - if (end_pfn > max_pfn_mapped)
4100 - max_pfn_mapped = end_pfn;
4101 - if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
4102 - max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
4103 - if (end_pfn > end_user_pfn)
4104 - end_pfn = end_user_pfn;
4105 - if (end_pfn > max_pfn_mapped)
4106 - end_pfn = max_pfn_mapped;
4108 - printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
4113 - * Mark e820 reserved areas as busy for the resource manager.
4115 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
4118 - struct resource *res;
4120 - res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
4121 - for (i = 0; i < nr_map; i++) {
4122 - switch (e820[i].type) {
4123 - case E820_RAM: res->name = "System RAM"; break;
4124 - case E820_ACPI: res->name = "ACPI Tables"; break;
4125 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
4126 - default: res->name = "reserved";
4128 - res->start = e820[i].addr;
4129 - res->end = res->start + e820[i].size - 1;
4130 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4131 - insert_resource(&iomem_resource, res);
4138 - * Find the ranges of physical addresses that do not correspond to
4139 - * e820 RAM areas and mark the corresponding pages as nosave for software
4140 - * suspend and suspend to RAM.
4142 - * This function requires the e820 map to be sorted and without any
4143 - * overlapping entries and assumes the first e820 area to be RAM.
4145 -void __init e820_mark_nosave_regions(void)
4148 - unsigned long paddr;
4150 - paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
4151 - for (i = 1; i < e820.nr_map; i++) {
4152 - struct e820entry *ei = &e820.map[i];
4154 - if (paddr < ei->addr)
4155 - register_nosave_region(PFN_DOWN(paddr),
4156 - PFN_UP(ei->addr));
4158 - paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
4159 - if (ei->type != E820_RAM)
4160 - register_nosave_region(PFN_UP(ei->addr),
4163 - if (paddr >= (end_pfn << PAGE_SHIFT))
4170 - * Finds an active region in the address range from start_pfn to end_pfn and
4171 - * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
4173 -static int __init e820_find_active_region(const struct e820entry *ei,
4174 - unsigned long start_pfn,
4175 - unsigned long end_pfn,
4176 - unsigned long *ei_startpfn,
4177 - unsigned long *ei_endpfn)
4179 - *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
4180 - *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
4182 - /* Skip map entries smaller than a page */
4183 - if (*ei_startpfn >= *ei_endpfn)
4186 - /* Check if max_pfn_mapped should be updated */
4187 - if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
4188 - max_pfn_mapped = *ei_endpfn;
4190 - /* Skip if map is outside the node */
4191 - if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
4192 - *ei_startpfn >= end_pfn)
4195 - /* Check for overlaps */
4196 - if (*ei_startpfn < start_pfn)
4197 - *ei_startpfn = start_pfn;
4198 - if (*ei_endpfn > end_pfn)
4199 - *ei_endpfn = end_pfn;
4201 - /* Obey end_user_pfn to save on memmap */
4202 - if (*ei_startpfn >= end_user_pfn)
4204 - if (*ei_endpfn > end_user_pfn)
4205 - *ei_endpfn = end_user_pfn;
4210 -/* Walk the e820 map and register active regions within a node */
4212 -e820_register_active_regions(int nid, unsigned long start_pfn,
4213 - unsigned long end_pfn)
4215 - unsigned long ei_startpfn;
4216 - unsigned long ei_endpfn;
4219 - for (i = 0; i < e820.nr_map; i++)
4220 - if (e820_find_active_region(&e820.map[i],
4221 - start_pfn, end_pfn,
4222 - &ei_startpfn, &ei_endpfn))
4223 - add_active_range(nid, ei_startpfn, ei_endpfn);
4227 - * Add a memory region to the kernel e820 map.
4229 -void __init add_memory_region(unsigned long start, unsigned long size, int type)
4231 - int x = e820.nr_map;
4233 - if (x == E820MAX) {
4234 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
4238 - e820.map[x].addr = start;
4239 - e820.map[x].size = size;
4240 - e820.map[x].type = type;
4245 - * Find the hole size (in bytes) in the memory range.
4246 - * @start: starting address of the memory range to scan
4247 - * @end: ending address of the memory range to scan
4249 -unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
4251 - unsigned long start_pfn = start >> PAGE_SHIFT;
4252 - unsigned long end_pfn = end >> PAGE_SHIFT;
4253 - unsigned long ei_startpfn, ei_endpfn, ram = 0;
4256 - for (i = 0; i < e820.nr_map; i++) {
4257 - if (e820_find_active_region(&e820.map[i],
4258 - start_pfn, end_pfn,
4259 - &ei_startpfn, &ei_endpfn))
4260 - ram += ei_endpfn - ei_startpfn;
4262 - return end - start - (ram << PAGE_SHIFT);
4265 -static void __init e820_print_map(char *who)
4269 - for (i = 0; i < e820.nr_map; i++) {
4270 - printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
4271 - (unsigned long long) e820.map[i].addr,
4272 - (unsigned long long)
4273 - (e820.map[i].addr + e820.map[i].size));
4274 - switch (e820.map[i].type) {
4276 - printk(KERN_CONT "(usable)\n");
4278 - case E820_RESERVED:
4279 - printk(KERN_CONT "(reserved)\n");
4282 - printk(KERN_CONT "(ACPI data)\n");
4285 - printk(KERN_CONT "(ACPI NVS)\n");
4288 - printk(KERN_CONT "type %u\n", e820.map[i].type);
4295 - * Sanitize the BIOS e820 map.
4297 - * Some e820 responses include overlapping entries. The following
4298 - * replaces the original e820 map with a new one, removing overlaps.
4301 -static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
4303 - struct change_member {
4304 - struct e820entry *pbios; /* pointer to original bios entry */
4305 - unsigned long long addr; /* address for this change point */
4307 - static struct change_member change_point_list[2*E820MAX] __initdata;
4308 - static struct change_member *change_point[2*E820MAX] __initdata;
4309 - static struct e820entry *overlap_list[E820MAX] __initdata;
4310 - static struct e820entry new_bios[E820MAX] __initdata;
4311 - struct change_member *change_tmp;
4312 - unsigned long current_type, last_type;
4313 - unsigned long long last_addr;
4314 - int chgidx, still_changing;
4315 - int overlap_entries;
4316 - int new_bios_entry;
4317 - int old_nr, new_nr, chg_nr;
4321 - Visually we're performing the following
4322 - (1,2,3,4 = memory types)...
4324 - Sample memory map (w/overlaps):
4325 - ____22__________________
4326 - ______________________4_
4327 - ____1111________________
4328 - _44_____________________
4329 - 11111111________________
4330 - ____________________33__
4331 - ___________44___________
4332 - __________33333_________
4333 - ______________22________
4334 - ___________________2222_
4335 - _________111111111______
4336 - _____________________11_
4337 - _________________4______
4339 - Sanitized equivalent (no overlap):
4340 - 1_______________________
4341 - _44_____________________
4342 - ___1____________________
4343 - ____22__________________
4344 - ______11________________
4345 - _________1______________
4346 - __________3_____________
4347 - ___________44___________
4348 - _____________33_________
4349 - _______________2________
4350 - ________________1_______
4351 - _________________4______
4352 - ___________________2____
4353 - ____________________33__
4354 - ______________________4_
4357 - /* if there's only one memory region, don't bother */
4361 - old_nr = *pnr_map;
4363 - /* bail out if we find any unreasonable addresses in bios map */
4364 - for (i = 0; i < old_nr; i++)
4365 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
4368 - /* create pointers for initial change-point information (for sorting) */
4369 - for (i = 0; i < 2 * old_nr; i++)
4370 - change_point[i] = &change_point_list[i];
4372 - /* record all known change-points (starting and ending addresses),
4373 - omitting those that are for empty memory regions */
4375 - for (i = 0; i < old_nr; i++) {
4376 - if (biosmap[i].size != 0) {
4377 - change_point[chgidx]->addr = biosmap[i].addr;
4378 - change_point[chgidx++]->pbios = &biosmap[i];
4379 - change_point[chgidx]->addr = biosmap[i].addr +
4381 - change_point[chgidx++]->pbios = &biosmap[i];
4386 - /* sort change-point list by memory addresses (low -> high) */
4387 - still_changing = 1;
4388 - while (still_changing) {
4389 - still_changing = 0;
4390 - for (i = 1; i < chg_nr; i++) {
4391 - unsigned long long curaddr, lastaddr;
4392 - unsigned long long curpbaddr, lastpbaddr;
4394 - curaddr = change_point[i]->addr;
4395 - lastaddr = change_point[i - 1]->addr;
4396 - curpbaddr = change_point[i]->pbios->addr;
4397 - lastpbaddr = change_point[i - 1]->pbios->addr;
4400 - * swap entries, when:
4402 - * curaddr > lastaddr or
4403 - * curaddr == lastaddr and curaddr == curpbaddr and
4404 - * lastaddr != lastpbaddr
4406 - if (curaddr < lastaddr ||
4407 - (curaddr == lastaddr && curaddr == curpbaddr &&
4408 - lastaddr != lastpbaddr)) {
4409 - change_tmp = change_point[i];
4410 - change_point[i] = change_point[i-1];
4411 - change_point[i-1] = change_tmp;
4412 - still_changing = 1;
4417 - /* create a new bios memory map, removing overlaps */
4418 - overlap_entries = 0; /* number of entries in the overlap table */
4419 - new_bios_entry = 0; /* index for creating new bios map entries */
4420 - last_type = 0; /* start with undefined memory type */
4421 - last_addr = 0; /* start with 0 as last starting address */
4423 - /* loop through change-points, determining affect on the new bios map */
4424 - for (chgidx = 0; chgidx < chg_nr; chgidx++) {
4425 - /* keep track of all overlapping bios entries */
4426 - if (change_point[chgidx]->addr ==
4427 - change_point[chgidx]->pbios->addr) {
4429 - * add map entry to overlap list (> 1 entry
4430 - * implies an overlap)
4432 - overlap_list[overlap_entries++] =
4433 - change_point[chgidx]->pbios;
4436 - * remove entry from list (order independent,
4437 - * so swap with last)
4439 - for (i = 0; i < overlap_entries; i++) {
4440 - if (overlap_list[i] ==
4441 - change_point[chgidx]->pbios)
4443 - overlap_list[overlap_entries-1];
4445 - overlap_entries--;
4448 - * if there are overlapping entries, decide which
4449 - * "type" to use (larger value takes precedence --
4450 - * 1=usable, 2,3,4,4+=unusable)
4453 - for (i = 0; i < overlap_entries; i++)
4454 - if (overlap_list[i]->type > current_type)
4455 - current_type = overlap_list[i]->type;
4457 - * continue building up new bios map based on this
4460 - if (current_type != last_type) {
4461 - if (last_type != 0) {
4462 - new_bios[new_bios_entry].size =
4463 - change_point[chgidx]->addr - last_addr;
4465 - * move forward only if the new size
4468 - if (new_bios[new_bios_entry].size != 0)
4470 - * no more space left for new
4473 - if (++new_bios_entry >= E820MAX)
4476 - if (current_type != 0) {
4477 - new_bios[new_bios_entry].addr =
4478 - change_point[chgidx]->addr;
4479 - new_bios[new_bios_entry].type = current_type;
4480 - last_addr = change_point[chgidx]->addr;
4482 - last_type = current_type;
4485 - /* retain count for new bios entries */
4486 - new_nr = new_bios_entry;
4488 - /* copy new bios mapping into original location */
4489 - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
4490 - *pnr_map = new_nr;
4496 - * Copy the BIOS e820 map into a safe place.
4498 - * Sanity-check it while we're at it..
4500 - * If we're lucky and live on a modern system, the setup code
4501 - * will have given us a memory map that we can use to properly
4502 - * set up memory. If we aren't, we'll fake a memory map.
4504 -static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
4507 - /* Only one memory region (or negative)? Ignore it */
4511 - BUG_ON(nr_map < 1);
4515 - u64 start = biosmap->addr;
4516 - u64 size = biosmap->size;
4517 - u64 end = start + size;
4518 - u32 type = biosmap->type;
4520 - /* Overflow in 64 bits? Ignore the memory map. */
4524 - add_memory_region(start, size, type);
4525 - } while (biosmap++, --nr_map);
4528 - if (is_initial_xendomain()) {
4529 - struct xen_memory_map memmap;
4531 - memmap.nr_entries = E820MAX;
4532 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
4534 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
4536 - machine_e820.nr_map = memmap.nr_entries;
4538 - machine_e820 = e820;
4544 -static void early_panic(char *msg)
4546 - early_printk(msg);
4550 -/* We're not void only for x86 32-bit compat */
4551 -char * __init machine_specific_memory_setup(void)
4554 - char *who = "BIOS-e820";
4556 - * Try to copy the BIOS-supplied E820-map.
4558 - * Otherwise fake a memory map; one section from 0k->640k,
4559 - * the next section from 1mb->appropriate_mem_k
4561 - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
4562 - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
4563 - early_panic("Cannot find a valid memory map");
4564 -#else /* CONFIG_XEN */
4565 - char *who = "Xen";
4567 - struct xen_memory_map memmap;
4569 - * This is rather large for a stack variable but this early in
4570 - * the boot process we know we have plenty slack space.
4572 - struct e820entry map[E820MAX];
4574 - memmap.nr_entries = E820MAX;
4575 - set_xen_guest_handle(memmap.buffer, map);
4577 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
4578 - if ( rc == -ENOSYS ) {
4579 - memmap.nr_entries = 1;
4580 - map[0].addr = 0ULL;
4581 - map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
4582 - /* 8MB slack (to balance backend allocations). */
4583 - map[0].size += 8 << 20;
4584 - map[0].type = E820_RAM;
4589 - sanitize_e820_map(map, (char *)&memmap.nr_entries);
4591 - if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
4592 - early_panic("Cannot find a valid memory map");
4594 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
4595 - e820_print_map(who);
4597 - /* In case someone cares... */
4601 -static int __init parse_memopt(char *p)
4604 - unsigned long current_end;
4605 - unsigned long end;
4609 - end_user_pfn = memparse(p, &p);
4610 - end_user_pfn >>= PAGE_SHIFT;
4612 - end = end_user_pfn<<PAGE_SHIFT;
4613 - i = e820.nr_map-1;
4614 - current_end = e820.map[i].addr + e820.map[i].size;
4616 - if (current_end < end) {
4618 - * The e820 map ends before our requested size so
4619 - * extend the final entry to the requested address.
4621 - if (e820.map[i].type == E820_RAM)
4622 - e820.map[i].size = end - e820.map[i].addr;
4624 - add_memory_region(current_end, end - current_end, E820_RAM);
4629 -early_param("mem", parse_memopt);
4631 -static int userdef __initdata;
4633 -static int __init parse_memmap_opt(char *p)
4636 - unsigned long long start_at, mem_size;
4638 - if (!strcmp(p, "exactmap")) {
4639 -#ifdef CONFIG_CRASH_DUMP
4641 - * If we are doing a crash dump, we still need to know
4642 - * the real mem size before original memory map is
4645 - e820_register_active_regions(0, 0, -1UL);
4646 - saved_max_pfn = e820_end_of_ram();
4647 - remove_all_active_ranges();
4649 - max_pfn_mapped = 0;
4656 - mem_size = memparse(p, &p);
4662 - start_at = memparse(p+1, &p);
4663 - add_memory_region(start_at, mem_size, E820_RAM);
4664 - } else if (*p == '#') {
4665 - start_at = memparse(p+1, &p);
4666 - add_memory_region(start_at, mem_size, E820_ACPI);
4667 - } else if (*p == '$') {
4668 - start_at = memparse(p+1, &p);
4669 - add_memory_region(start_at, mem_size, E820_RESERVED);
4671 - end_user_pfn = (mem_size >> PAGE_SHIFT);
4673 - return *p == '\0' ? 0 : -EINVAL;
4675 -early_param("memmap", parse_memmap_opt);
4677 -void __init finish_e820_parsing(void)
4680 - char nr = e820.nr_map;
4682 - if (sanitize_e820_map(e820.map, &nr) < 0)
4683 - early_panic("Invalid user supplied memory map");
4686 - printk(KERN_INFO "user-defined physical RAM map:\n");
4687 - e820_print_map("user");
4692 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
4693 - unsigned new_type)
4697 - BUG_ON(old_type == new_type);
4699 - for (i = 0; i < e820.nr_map; i++) {
4700 - struct e820entry *ei = &e820.map[i];
4701 - u64 final_start, final_end;
4702 - if (ei->type != old_type)
4704 - /* totally covered? */
4705 - if (ei->addr >= start && ei->size <= size) {
4706 - ei->type = new_type;
4709 - /* partially covered */
4710 - final_start = max(start, ei->addr);
4711 - final_end = min(start + size, ei->addr + ei->size);
4712 - if (final_start >= final_end)
4714 - add_memory_region(final_start, final_end - final_start,
4719 -void __init update_e820(void)
4723 - nr_map = e820.nr_map;
4724 - if (sanitize_e820_map(e820.map, &nr_map))
4726 - e820.nr_map = nr_map;
4727 - printk(KERN_INFO "modified physical RAM map:\n");
4728 - e820_print_map("modified");
4732 -unsigned long pci_mem_start = 0xaeedbabe;
4733 -EXPORT_SYMBOL(pci_mem_start);
4736 - * Search for the biggest gap in the low 32 bits of the e820
4737 - * memory space. We pass this space to PCI to assign MMIO resources
4738 - * for hotplug or unconfigured devices in.
4739 - * Hopefully the BIOS let enough space left.
4741 -__init void e820_setup_gap(struct e820entry *e820, int nr_map)
4743 - unsigned long gapstart, gapsize, round;
4744 - unsigned long last;
4748 - last = 0x100000000ull;
4749 - gapstart = 0x10000000;
4750 - gapsize = 0x400000;
4752 - while (--i >= 0) {
4753 - unsigned long long start = e820[i].addr;
4754 - unsigned long long end = start + e820[i].size;
4757 - * Since "last" is at most 4GB, we know we'll
4758 - * fit in 32 bits if this condition is true
4761 - unsigned long gap = last - end;
4763 - if (gap > gapsize) {
4774 - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
4775 - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
4777 - KERN_ERR "PCI: Unassigned devices with 32bit resource "
4778 - "registers may break!\n");
4782 - * See how much we want to round up: start off with
4783 - * rounding to the next 1MB area.
4786 - while ((gapsize >> 4) > round)
4788 - /* Fun with two's complement */
4789 - pci_mem_start = (gapstart + round) & -round;
4792 - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
4793 - pci_mem_start, gapstart, gapsize);
4796 -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
4800 - if (slot < 0 || slot >= e820.nr_map)
4802 - for (i = slot; i < e820.nr_map; i++) {
4803 - if (e820.map[i].type != E820_RAM)
4807 - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
4809 - *addr = e820.map[i].addr;
4810 - *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
4811 - max_pfn << PAGE_SHIFT) - *addr;
4814 Index: head-2008-12-01/arch/x86/kernel/early_printk-xen.c
4815 ===================================================================
4816 --- head-2008-12-01.orig/arch/x86/kernel/early_printk-xen.c 2008-12-01 11:44:55.000000000 +0100
4817 +++ head-2008-12-01/arch/x86/kernel/early_printk-xen.c 2008-12-01 11:49:07.000000000 +0100
4818 @@ -225,7 +225,7 @@ static struct console simnow_console = {
4819 static struct console *early_console = &early_vga_console;
4820 static int early_console_initialized;
4822 -void early_printk(const char *fmt, ...)
4823 +asmlinkage void early_printk(const char *fmt, ...)
4827 Index: head-2008-12-01/arch/x86/kernel/entry_32-xen.S
4828 ===================================================================
4829 --- head-2008-12-01.orig/arch/x86/kernel/entry_32-xen.S 2008-12-01 11:44:55.000000000 +0100
4830 +++ head-2008-12-01/arch/x86/kernel/entry_32-xen.S 2008-12-01 11:49:07.000000000 +0100
4832 #include <asm/percpu.h>
4833 #include <asm/dwarf2.h>
4834 #include <asm/processor-flags.h>
4835 -#include "irq_vectors.h"
4836 +#include <asm/ftrace.h>
4837 +#include <asm/irq_vectors.h>
4838 #include <xen/interface/xen.h>
4840 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
4841 +#include <linux/elf-em.h>
4842 +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
4843 +#define __AUDIT_ARCH_LE 0x40000000
4845 +#ifndef CONFIG_AUDITSYSCALL
4846 +#define sysenter_audit syscall_trace_entry
4847 +#define sysexit_audit syscall_exit_work
4851 * We use macros for low-level operations which need to be overridden
4852 * for paravirtualization. The following will never clobber any registers:
4853 * INTERRUPT_RETURN (aka. "iret")
4854 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
4855 - * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
4856 + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
4858 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
4859 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
4860 @@ -277,11 +288,6 @@ END(resume_kernel)
4864 - .macro test_tif ti_reg # system call tracing in operation / emulation
4865 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
4866 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
4869 /* SYSENTER_RETURN points to after the "sysenter" instruction in
4870 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
4872 @@ -338,8 +344,9 @@ sysenter_past_esp:
4875 GET_THREAD_INFO(%ebp)
4877 - jnz syscall_trace_entry
4878 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4879 + jnz sysenter_audit
4881 cmpl $(nr_syscalls), %eax
4883 call *sys_call_table(,%eax,4)
4884 @@ -349,14 +356,54 @@ sysenter_past_esp:
4886 movl TI_flags(%ebp), %ecx
4887 testw $_TIF_ALLWORK_MASK, %cx
4888 - jne syscall_exit_work
4891 /* if something modifies registers it must also disable sysexit */
4892 movl PT_EIP(%esp), %edx
4893 movl PT_OLDESP(%esp), %ecx
4896 1: mov PT_FS(%esp), %fs
4897 - ENABLE_INTERRUPTS_SYSCALL_RET
4898 + ENABLE_INTERRUPTS_SYSEXIT
4900 +#ifdef CONFIG_AUDITSYSCALL
4902 + testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
4903 + jnz syscall_trace_entry
4905 + CFI_ADJUST_CFA_OFFSET -4
4906 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
4907 + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
4908 + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
4909 + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
4910 + movl %eax,%edx /* 2nd arg: syscall number */
4911 + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
4912 + call audit_syscall_entry
4914 + CFI_ADJUST_CFA_OFFSET 4
4915 + movl PT_EAX(%esp),%eax /* reload syscall number */
4916 + jmp sysenter_do_call
4919 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4920 + jne syscall_exit_work
4922 + ENABLE_INTERRUPTS(CLBR_ANY)
4923 + movl %eax,%edx /* second arg, syscall return value */
4924 + cmpl $0,%eax /* is it < 0? */
4925 + setl %al /* 1 if so, 0 if not */
4926 + movzbl %al,%eax /* zero-extend that */
4927 + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
4928 + call audit_syscall_exit
4929 + DISABLE_INTERRUPTS(CLBR_ANY)
4931 + movl TI_flags(%ebp), %ecx
4932 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4933 + jne syscall_exit_work
4934 + movl PT_EAX(%esp),%eax /* reload syscall return value */
4939 .pushsection .fixup,"ax"
4940 2: movl $0,PT_FS(%esp)
4941 @@ -400,7 +447,7 @@ ENTRY(system_call)
4942 CFI_ADJUST_CFA_OFFSET 4
4944 GET_THREAD_INFO(%ebp)
4946 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4947 jnz syscall_trace_entry
4948 cmpl $(nr_syscalls), %eax
4950 @@ -413,10 +460,6 @@ syscall_exit:
4951 # setting need_resched or sigpending
4952 # between sampling and the iret
4954 - testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
4956 - orl $_TIF_SINGLESTEP,TI_flags(%ebp)
4958 movl TI_flags(%ebp), %ecx
4959 testw $_TIF_ALLWORK_MASK, %cx # current->work
4960 jne syscall_exit_work
4961 @@ -588,12 +631,8 @@ END(work_pending)
4962 syscall_trace_entry:
4963 movl $-ENOSYS,PT_EAX(%esp)
4966 - call do_syscall_trace
4968 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
4969 - # so must skip actual syscall
4970 - movl PT_ORIG_EAX(%esp), %eax
4971 + call syscall_trace_enter
4972 + /* What it returned is what we'll actually use. */
4973 cmpl $(nr_syscalls), %eax
4976 @@ -602,14 +641,13 @@ END(syscall_trace_entry)
4977 # perform syscall exit tracing
4980 - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
4981 + testb $_TIF_WORK_SYSCALL_EXIT, %cl
4984 - ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
4985 + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
4986 # schedule() instead
4989 - call do_syscall_trace
4990 + call syscall_trace_leave
4991 jmp resume_userspace
4992 END(syscall_exit_work)
4994 @@ -1109,10 +1147,10 @@ ENTRY(native_iret)
4998 -ENTRY(native_irq_enable_syscall_ret)
4999 +ENTRY(native_irq_enable_sysexit)
5002 -END(native_irq_enable_syscall_ret)
5003 +END(native_irq_enable_sysexit)
5007 @@ -1261,6 +1299,77 @@ ENTRY(kernel_thread_helper)
5009 ENDPROC(kernel_thread_helper)
5011 +#ifdef CONFIG_FTRACE
5012 +#ifdef CONFIG_DYNAMIC_FTRACE
5018 + movl 0xc(%esp), %eax
5019 + subl $MCOUNT_INSN_SIZE, %eax
5032 +ENTRY(ftrace_caller)
5036 + movl 0xc(%esp), %eax
5037 + movl 0x4(%ebp), %edx
5038 + subl $MCOUNT_INSN_SIZE, %eax
5053 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5056 + cmpl $ftrace_stub, ftrace_trace_function
5062 + /* taken from glibc */
5067 + movl 0xc(%esp), %eax
5068 + movl 0x4(%ebp), %edx
5069 + subl $MCOUNT_INSN_SIZE, %eax
5071 + call *ftrace_trace_function
5079 +#endif /* CONFIG_DYNAMIC_FTRACE */
5080 +#endif /* CONFIG_FTRACE */
5082 #include <asm/alternative-asm.h>
5084 # pv syscall call handler stub
5085 @@ -1286,7 +1395,7 @@ ENTRY(ia32pv_cstar_target)
5088 GET_THREAD_INFO(%ebp)
5090 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
5091 jnz cstar_trace_entry
5092 cmpl $nr_syscalls,%eax
5094 @@ -1320,29 +1429,21 @@ cstar_trace_entry:
5095 btl %eax,cstar_special
5096 jc .Lcstar_trace_special
5100 orl $_TIF_CSTAR,TI_flags(%ebp)
5101 - call do_syscall_trace
5102 + call syscall_trace_enter
5104 andl $~_TIF_CSTAR,TI_flags(%ebp)
5106 - jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
5107 - # so must skip actual syscall
5108 - movl PT_ORIG_EAX(%esp),%eax
5109 + /* What it returned is what we'll actually use. */
5110 cmpl $nr_syscalls,%eax
5113 .Lcstar_trace_special:
5114 movl PT_ECX(%esp),%ecx
5117 movl %ecx,PT_EBP(%esp) # put user EBP back in place
5118 - call do_syscall_trace
5120 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5121 - # so must skip actual syscall
5122 - movl PT_ORIG_EAX(%esp),%eax
5123 + call syscall_trace_enter
5124 + /* What it returned is what we'll actually use. */
5125 cmpl $nr_syscalls,%eax
5128 Index: head-2008-12-01/arch/x86/kernel/entry_64.S
5129 ===================================================================
5130 --- head-2008-12-01.orig/arch/x86/kernel/entry_64.S 2008-12-03 15:48:43.000000000 +0100
5131 +++ head-2008-12-01/arch/x86/kernel/entry_64.S 2008-12-01 11:49:07.000000000 +0100
5132 @@ -1409,7 +1409,7 @@ ENTRY(arch_unwind_init_running)
5133 ENDPROC(arch_unwind_init_running)
5137 +#ifdef CONFIG_PARAVIRT_XEN
5138 ENTRY(xen_hypervisor_callback)
5139 zeroentry xen_do_hypervisor_callback
5140 END(xen_hypervisor_callback)
5141 @@ -1507,4 +1507,4 @@ ENTRY(xen_failsafe_callback)
5143 END(xen_failsafe_callback)
5145 -#endif /* CONFIG_XEN */
5146 +#endif /* CONFIG_PARAVIRT_XEN */
5147 Index: head-2008-12-01/arch/x86/kernel/entry_64-xen.S
5148 ===================================================================
5149 --- head-2008-12-01.orig/arch/x86/kernel/entry_64-xen.S 2008-12-01 11:44:55.000000000 +0100
5150 +++ head-2008-12-01/arch/x86/kernel/entry_64-xen.S 2008-12-01 11:49:07.000000000 +0100
5151 @@ -53,19 +53,130 @@
5152 #include <asm/hw_irq.h>
5153 #include <asm/page.h>
5154 #include <asm/irqflags.h>
5155 +#include <asm/ftrace.h>
5156 #include <asm/errno.h>
5157 #include <xen/interface/xen.h>
5158 #include <xen/interface/features.h>
5160 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
5161 +#include <linux/elf-em.h>
5162 +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
5163 +#define __AUDIT_ARCH_64BIT 0x80000000
5164 +#define __AUDIT_ARCH_LE 0x40000000
5168 +#ifdef CONFIG_FTRACE
5169 +#ifdef CONFIG_DYNAMIC_FTRACE
5174 + movq %rcx, 8(%rsp)
5175 + movq %rdx, 16(%rsp)
5176 + movq %rsi, 24(%rsp)
5177 + movq %rdi, 32(%rsp)
5178 + movq %r8, 40(%rsp)
5179 + movq %r9, 48(%rsp)
5181 + movq 0x38(%rsp), %rdi
5182 + subq $MCOUNT_INSN_SIZE, %rdi
5188 + movq 48(%rsp), %r9
5189 + movq 40(%rsp), %r8
5190 + movq 32(%rsp), %rdi
5191 + movq 24(%rsp), %rsi
5192 + movq 16(%rsp), %rdx
5193 + movq 8(%rsp), %rcx
5200 +ENTRY(ftrace_caller)
5202 + /* taken from glibc */
5205 + movq %rcx, 8(%rsp)
5206 + movq %rdx, 16(%rsp)
5207 + movq %rsi, 24(%rsp)
5208 + movq %rdi, 32(%rsp)
5209 + movq %r8, 40(%rsp)
5210 + movq %r9, 48(%rsp)
5212 + movq 0x38(%rsp), %rdi
5213 + movq 8(%rbp), %rsi
5214 + subq $MCOUNT_INSN_SIZE, %rdi
5220 + movq 48(%rsp), %r9
5221 + movq 40(%rsp), %r8
5222 + movq 32(%rsp), %rdi
5223 + movq 24(%rsp), %rsi
5224 + movq 16(%rsp), %rdx
5225 + movq 8(%rsp), %rcx
5234 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5236 + cmpq $ftrace_stub, ftrace_trace_function
5243 + /* taken from glibc */
5246 + movq %rcx, 8(%rsp)
5247 + movq %rdx, 16(%rsp)
5248 + movq %rsi, 24(%rsp)
5249 + movq %rdi, 32(%rsp)
5250 + movq %r8, 40(%rsp)
5251 + movq %r9, 48(%rsp)
5253 + movq 0x38(%rsp), %rdi
5254 + movq 8(%rbp), %rsi
5255 + subq $MCOUNT_INSN_SIZE, %rdi
5257 + call *ftrace_trace_function
5259 + movq 48(%rsp), %r9
5260 + movq 40(%rsp), %r8
5261 + movq 32(%rsp), %rdi
5262 + movq 24(%rsp), %rsi
5263 + movq 16(%rsp), %rdx
5264 + movq 8(%rsp), %rcx
5270 +#endif /* CONFIG_DYNAMIC_FTRACE */
5271 +#endif /* CONFIG_FTRACE */
5273 #ifndef CONFIG_PREEMPT
5274 #define retint_kernel retint_restore_args
5277 #ifdef CONFIG_PARAVIRT
5278 -ENTRY(native_irq_enable_syscall_ret)
5279 - movq %gs:pda_oldrsp,%rsp
5280 +ENTRY(native_usergs_sysret64)
5283 #endif /* CONFIG_PARAVIRT */
5284 @@ -102,7 +213,7 @@ NMI_MASK = 0x80000000
5285 .macro FAKE_STACK_FRAME child_rip
5286 /* push in order ss, rsp, eflags, cs, rip */
5288 - pushq %rax /* ss */
5289 + pushq $__KERNEL_DS /* ss */
5290 CFI_ADJUST_CFA_OFFSET 8
5291 /*CFI_REL_OFFSET ss,0*/
5292 pushq %rax /* rsp */
5293 @@ -197,13 +308,13 @@ ENTRY(ret_from_fork)
5294 CFI_ADJUST_CFA_OFFSET -4
5296 GET_THREAD_INFO(%rcx)
5297 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
5298 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5302 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
5303 je int_ret_from_sys_call
5304 - testl $_TIF_IA32,threadinfo_flags(%rcx)
5305 + testl $_TIF_IA32,TI_flags(%rcx)
5306 jnz int_ret_from_sys_call
5307 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
5308 jmp ret_from_sys_call
5309 @@ -265,8 +376,9 @@ ENTRY(system_call)
5311 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
5312 GET_THREAD_INFO(%rcx)
5313 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
5314 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
5316 +system_call_fastpath:
5317 cmpq $__NR_syscall_max,%rax
5320 @@ -284,7 +396,7 @@ sysret_check:
5321 GET_THREAD_INFO(%rcx)
5322 DISABLE_INTERRUPTS(CLBR_NONE)
5324 - movl threadinfo_flags(%rcx),%edx
5325 + movl TI_flags(%rcx),%edx
5329 @@ -315,16 +427,16 @@ sysret_careful:
5332 ENABLE_INTERRUPTS(CLBR_NONE)
5333 - testl $_TIF_DO_NOTIFY_MASK,%edx
5336 - /* Really a signal */
5337 +#ifdef CONFIG_AUDITSYSCALL
5338 + bt $TIF_SYSCALL_AUDIT,%edx
5341 /* edx: work flags (arg3) */
5342 leaq do_notify_resume(%rip),%rax
5343 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
5344 xorl %esi,%esi # oldset -> arg2
5345 call ptregscall_common
5346 -1: movl $_TIF_NEED_RESCHED,%edi
5347 + movl $_TIF_WORK_MASK,%edi
5348 /* Use IRET because user could have changed frame. This
5349 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
5350 DISABLE_INTERRUPTS(CLBR_NONE)
5351 @@ -335,14 +447,56 @@ badsys:
5352 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
5353 jmp ret_from_sys_call
5355 +#ifdef CONFIG_AUDITSYSCALL
5357 + * Fast path for syscall audit without full syscall trace.
5358 + * We just call audit_syscall_entry() directly, and then
5359 + * jump back to the normal fast path.
5362 + movq %r10,%r9 /* 6th arg: 4th syscall arg */
5363 + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
5364 + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
5365 + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
5366 + movq %rax,%rsi /* 2nd arg: syscall number */
5367 + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
5368 + call audit_syscall_entry
5369 + LOAD_ARGS 0 /* reload call-clobbered registers */
5370 + jmp system_call_fastpath
5373 + * Return fast path for syscall audit. Call audit_syscall_exit()
5374 + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
5378 + movq %rax,%rsi /* second arg, syscall return value */
5379 + cmpq $0,%rax /* is it < 0? */
5380 + setl %al /* 1 if so, 0 if not */
5381 + movzbl %al,%edi /* zero-extend that into %edi */
5382 + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
5383 + call audit_syscall_exit
5384 + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
5386 +#endif /* CONFIG_AUDITSYSCALL */
5388 /* Do syscall tracing */
5390 +#ifdef CONFIG_AUDITSYSCALL
5391 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5395 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
5396 FIXUP_TOP_OF_STACK %rdi
5398 call syscall_trace_enter
5399 - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
5401 + * Reload arg registers from stack in case ptrace changed them.
5402 + * We don't reload %rax because syscall_trace_enter() returned
5403 + * the value it wants us to use in the table lookup.
5405 + LOAD_ARGS ARGOFFSET, 1
5407 cmpq $__NR_syscall_max,%rax
5408 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
5409 @@ -356,6 +510,7 @@ tracesys:
5410 * Has correct top of stack, but partial stack frame.
5412 .globl int_ret_from_sys_call
5413 + .globl int_with_check
5414 int_ret_from_sys_call:
5415 DISABLE_INTERRUPTS(CLBR_NONE)
5417 @@ -370,10 +525,10 @@ int_ret_from_sys_call:
5419 LOCKDEP_SYS_EXIT_IRQ
5420 GET_THREAD_INFO(%rcx)
5421 - movl threadinfo_flags(%rcx),%edx
5422 + movl TI_flags(%rcx),%edx
5425 - andl $~TS_COMPAT,threadinfo_status(%rcx)
5426 + andl $~TS_COMPAT,TI_status(%rcx)
5427 jmp retint_restore_args
5429 /* Either reschedule or signal or syscall exit tracking needed. */
5430 @@ -399,7 +554,7 @@ int_very_careful:
5431 ENABLE_INTERRUPTS(CLBR_NONE)
5433 /* Check for syscall exit trace */
5434 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
5435 + testl $_TIF_WORK_SYSCALL_EXIT,%edx
5438 CFI_ADJUST_CFA_OFFSET 8
5439 @@ -407,7 +562,7 @@ int_very_careful:
5440 call syscall_trace_leave
5442 CFI_ADJUST_CFA_OFFSET -8
5443 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
5444 + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
5445 jmp int_restore_rest
5448 @@ -416,7 +571,7 @@ int_signal:
5449 movq %rsp,%rdi # &ptregs -> arg1
5450 xorl %esi,%esi # oldset -> arg2
5451 call do_notify_resume
5452 -1: movl $_TIF_NEED_RESCHED,%edi
5453 +1: movl $_TIF_WORK_MASK,%edi
5456 DISABLE_INTERRUPTS(CLBR_NONE)
5457 @@ -443,7 +598,6 @@ END(\label)
5458 PTREGSCALL stub_clone, sys_clone, %r8
5459 PTREGSCALL stub_fork, sys_fork, %rdi
5460 PTREGSCALL stub_vfork, sys_vfork, %rdi
5461 - PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
5462 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
5463 PTREGSCALL stub_iopl, sys_iopl, %rsi
5465 @@ -517,10 +671,12 @@ END(stub_rt_sigreturn)
5470 +retint_with_reschedule:
5471 CFI_DEFAULT_STACK adj=1
5472 + movl $_TIF_WORK_MASK,%edi
5474 LOCKDEP_SYS_EXIT_IRQ
5475 - movl threadinfo_flags(%rcx),%edx
5476 + movl TI_flags(%rcx),%edx
5480 @@ -565,17 +721,16 @@ retint_signal:
5482 DISABLE_INTERRUPTS(CLBR_NONE)
5484 - movl $_TIF_NEED_RESCHED,%edi
5485 GET_THREAD_INFO(%rcx)
5487 + jmp retint_with_reschedule
5489 #ifdef CONFIG_PREEMPT
5490 /* Returning to kernel space. Check if we need preemption */
5491 /* rcx: threadinfo. interrupts off. */
5492 ENTRY(retint_kernel)
5493 - cmpl $0,threadinfo_preempt_count(%rcx)
5494 + cmpl $0,TI_preempt_count(%rcx)
5495 jnz retint_restore_args
5496 - bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
5497 + bt $TIF_NEED_RESCHED,TI_flags(%rcx)
5498 jnc retint_restore_args
5499 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
5500 jnc retint_restore_args
5501 @@ -630,6 +785,9 @@ END(invalidate_interrupt\num)
5502 ENTRY(call_function_interrupt)
5503 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
5504 END(call_function_interrupt)
5505 +ENTRY(call_function_single_interrupt)
5506 + apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
5507 +END(call_function_single_interrupt)
5508 ENTRY(irq_move_cleanup_interrupt)
5509 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
5510 END(irq_move_cleanup_interrupt)
5511 @@ -639,6 +797,10 @@ ENTRY(apic_timer_interrupt)
5512 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
5513 END(apic_timer_interrupt)
5515 +ENTRY(uv_bau_message_intr1)
5516 + apicinterrupt 220,uv_bau_message_interrupt
5517 +END(uv_bau_message_intr1)
5519 ENTRY(error_interrupt)
5520 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
5521 END(error_interrupt)
5522 @@ -752,7 +914,7 @@ paranoid_restore\trace:
5524 paranoid_userspace\trace:
5525 GET_THREAD_INFO(%rcx)
5526 - movl threadinfo_flags(%rcx),%ebx
5527 + movl TI_flags(%rcx),%ebx
5528 andl $_TIF_WORK_MASK,%ebx
5529 jz paranoid_swapgs\trace
5530 movq %rsp,%rdi /* &pt_regs */
5531 @@ -849,7 +1011,7 @@ error_exit:
5532 testb $3,CS-ARGOFFSET(%rsp)
5534 LOCKDEP_SYS_EXIT_IRQ
5535 - movl threadinfo_flags(%rcx),%edx
5536 + movl TI_flags(%rcx),%edx
5537 movl $_TIF_WORK_MASK,%edi
5540 @@ -871,11 +1033,11 @@ error_kernelspace:
5541 iret run with kernel gs again, so don't set the user space flag.
5542 B stepping K8s sometimes report an truncated RIP for IRET
5543 exceptions returning to compat mode. Check for these here too. */
5544 - leaq irq_return(%rip),%rbp
5545 - cmpq %rbp,RIP(%rsp)
5546 + leaq irq_return(%rip),%rcx
5547 + cmpq %rcx,RIP(%rsp)
5549 - movl %ebp,%ebp /* zero extend */
5550 - cmpq %rbp,RIP(%rsp)
5551 + movl %ecx,%ecx /* zero extend */
5552 + cmpq %rcx,RIP(%rsp)
5554 cmpq $gs_change,RIP(%rsp)
5556 @@ -1121,6 +1283,7 @@ END(device_not_available)
5557 /* runs on exception stack */
5560 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5562 CFI_ADJUST_CFA_OFFSET 8 */
5564 @@ -1148,6 +1311,7 @@ END(do_nmi_callback)
5568 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5570 CFI_ADJUST_CFA_OFFSET 8 */
5572 @@ -1171,14 +1335,11 @@ ENTRY(coprocessor_segment_overrun)
5573 zeroentry do_coprocessor_segment_overrun
5574 END(coprocessor_segment_overrun)
5577 - zeroentry do_reserved
5581 /* runs on exception stack */
5584 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5585 paranoidentry do_double_fault
5588 @@ -1196,6 +1357,7 @@ END(segment_not_present)
5589 /* runs on exception stack */
5590 ENTRY(stack_segment)
5592 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5593 paranoidentry do_stack_segment */
5594 errorentry do_stack_segment
5595 /* jmp paranoid_exit1
5596 @@ -1222,6 +1384,7 @@ END(spurious_interrupt_bug)
5597 /* runs on exception stack */
5598 ENTRY(machine_check)
5600 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5602 CFI_ADJUST_CFA_OFFSET 8
5603 paranoidentry do_machine_check
5604 Index: head-2008-12-01/arch/x86/kernel/genapic_64-xen.c
5605 ===================================================================
5606 --- head-2008-12-01.orig/arch/x86/kernel/genapic_64-xen.c 2008-12-01 11:44:55.000000000 +0100
5607 +++ head-2008-12-01/arch/x86/kernel/genapic_64-xen.c 2008-12-01 11:49:07.000000000 +0100
5608 @@ -58,7 +58,7 @@ void __init setup_apic_routing(void)
5612 - if (num_possible_cpus() <= 8)
5613 + if (max_physical_apicid < 8)
5614 genapic = &apic_flat;
5616 genapic = &apic_physflat;
5617 @@ -121,4 +121,5 @@ int is_uv_system(void)
5619 return uv_system_type != UV_NONE;
5621 +EXPORT_SYMBOL_GPL(is_uv_system);
5623 Index: head-2008-12-01/arch/x86/kernel/genapic_xen_64.c
5624 ===================================================================
5625 --- head-2008-12-01.orig/arch/x86/kernel/genapic_xen_64.c 2008-12-01 11:44:55.000000000 +0100
5626 +++ head-2008-12-01/arch/x86/kernel/genapic_xen_64.c 2008-12-01 11:49:07.000000000 +0100
5627 @@ -43,7 +43,7 @@ void xen_send_IPI_shortcut(unsigned int
5628 __send_IPI_one(smp_processor_id(), vector);
5630 case APIC_DEST_ALLBUT:
5631 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5632 + for_each_possible_cpu(cpu) {
5633 if (cpu == smp_processor_id())
5635 if (cpu_isset(cpu, cpu_online_map)) {
5636 @@ -52,7 +52,7 @@ void xen_send_IPI_shortcut(unsigned int
5639 case APIC_DEST_ALLINC:
5640 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5641 + for_each_possible_cpu(cpu) {
5642 if (cpu_isset(cpu, cpu_online_map)) {
5643 __send_IPI_one(cpu, vector);
5645 @@ -81,8 +81,6 @@ static cpumask_t xen_vector_allocation_d
5647 static void xen_init_apic_ldr(void)
5649 - Dprintk("%s\n", __FUNCTION__);
5653 static void xen_send_IPI_allbutself(int vector)
5654 @@ -92,14 +90,12 @@ static void xen_send_IPI_allbutself(int
5655 * we get an APIC send error if we try to broadcast.
5656 * thus we have to avoid sending IPIs in this case.
5658 - Dprintk("%s\n", __FUNCTION__);
5659 if (num_online_cpus() > 1)
5660 xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
5663 static void xen_send_IPI_all(int vector)
5665 - Dprintk("%s\n", __FUNCTION__);
5666 xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
5669 @@ -109,11 +105,10 @@ static void xen_send_IPI_mask(cpumask_t
5671 unsigned long flags;
5673 - Dprintk("%s\n", __FUNCTION__);
5674 local_irq_save(flags);
5675 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
5677 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5678 + for_each_possible_cpu(cpu) {
5679 if (cpu_isset(cpu, cpumask)) {
5680 __send_IPI_one(cpu, vector);
5682 @@ -125,14 +120,12 @@ static void xen_send_IPI_mask(cpumask_t
5683 static int xen_apic_id_registered(void)
5686 - Dprintk("%s\n", __FUNCTION__);
5687 return physid_isset(smp_processor_id(), phys_cpu_present_map);
5691 static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
5693 - Dprintk("%s\n", __FUNCTION__);
5694 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
5697 @@ -140,7 +133,6 @@ static unsigned int phys_pkg_id(int inde
5701 - Dprintk("%s\n", __FUNCTION__);
5703 return ((ebx >> 24) & 0xFF) >> index_msb;
5705 Index: head-2008-12-01/arch/x86/kernel/head-xen.c
5706 ===================================================================
5707 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5708 +++ head-2008-12-01/arch/x86/kernel/head-xen.c 2008-12-01 11:49:07.000000000 +0100
5710 +#include <linux/kernel.h>
5711 +#include <linux/init.h>
5713 +#include <asm/setup.h>
5714 +#include <asm/bios_ebda.h>
5716 +#define BIOS_LOWMEM_KILOBYTES 0x413
5719 + * The BIOS places the EBDA/XBDA at the top of conventional
5720 + * memory, and usually decreases the reported amount of
5721 + * conventional memory (int 0x12) too. This also contains a
5722 + * workaround for Dell systems that neglect to reserve EBDA.
5723 + * The same workaround also avoids a problem with the AMD768MPX
5724 + * chipset: reserve a page before VGA to prevent PCI prefetch
5725 + * into it (errata #56). Usually the page is reserved anyways,
5726 + * unless you have no PS/2 mouse plugged in.
5728 +void __init reserve_ebda_region(void)
5731 + unsigned int lowmem, ebda_addr;
5733 + /* To determine the position of the EBDA and the */
5734 + /* end of conventional memory, we need to look at */
5735 + /* the BIOS data area. In a paravirtual environment */
5736 + /* that area is absent. We'll just have to assume */
5737 + /* that the paravirt case can handle memory setup */
5738 + /* correctly, without our help. */
5739 + if (paravirt_enabled())
5742 + /* end of low (conventional) memory */
5743 + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5746 + /* start of EBDA area */
5747 + ebda_addr = get_bios_ebda();
5749 + /* Fixup: bios puts an EBDA in the top 64K segment */
5750 + /* of conventional memory, but does not adjust lowmem. */
5751 + if ((lowmem - ebda_addr) <= 0x10000)
5752 + lowmem = ebda_addr;
5754 + /* Fixup: bios does not report an EBDA at all. */
5755 + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5756 + if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5759 + /* Paranoia: should never happen, but... */
5760 + if ((lowmem == 0) || (lowmem >= 0x100000))
5763 + /* reserve all memory between lowmem and the 1MB mark */
5764 + reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
5767 Index: head-2008-12-01/arch/x86/kernel/head32-xen.c
5768 ===================================================================
5769 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5770 +++ head-2008-12-01/arch/x86/kernel/head32-xen.c 2008-12-01 11:49:07.000000000 +0100
5773 + * linux/arch/i386/kernel/head32.c -- prepare to run common code
5775 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5776 + * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
5779 +#include <linux/init.h>
5780 +#include <linux/start_kernel.h>
5782 +#include <asm/setup.h>
5783 +#include <asm/sections.h>
5784 +#include <asm/e820.h>
5785 +#include <asm/bios_ebda.h>
5787 +void __init i386_start_kernel(void)
5789 + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
5792 +#ifdef CONFIG_BLK_DEV_INITRD
5793 + /* Reserve INITRD */
5794 + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
5795 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
5796 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
5797 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
5798 + reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
5801 + reserve_early(init_pg_tables_start, init_pg_tables_end,
5804 + reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
5805 + __pa(xen_start_info->pt_base)
5806 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
5812 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
5813 + max_cmdline = COMMAND_LINE_SIZE;
5814 + memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
5815 + boot_command_line[max_cmdline-1] = '\0';
5819 + reserve_ebda_region();
5822 + * At this point everything still needed from the boot loader
5823 + * or BIOS or kernel text should be early reserved or marked not
5824 + * RAM in e820. All other memory is free game.
5829 Index: head-2008-12-01/arch/x86/kernel/head64-xen.c
5830 ===================================================================
5831 --- head-2008-12-01.orig/arch/x86/kernel/head64-xen.c 2008-12-01 11:44:55.000000000 +0100
5832 +++ head-2008-12-01/arch/x86/kernel/head64-xen.c 2008-12-01 11:49:07.000000000 +0100
5834 #include <asm/e820.h>
5835 #include <asm/bios_ebda.h>
5837 -unsigned long start_pfn;
5839 +static struct x8664_pda _boot_cpu_pda __read_mostly;
5843 + * We install an empty cpu_pda pointer table to indicate to early users
5844 + * (numa_set_node) that the cpu_pda pointer table for cpus other than
5845 + * the boot cpu is not yet setup.
5847 +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
5849 +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
5852 +void __init x86_64_init_pda(void)
5854 + _cpu_pda = __cpu_pda;
5855 + cpu_pda(0) = &_boot_cpu_pda;
5860 static void __init zap_identity_mappings(void)
5861 @@ -77,83 +96,10 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
5862 unsigned int machine_to_phys_order;
5863 EXPORT_SYMBOL(machine_to_phys_order);
5865 -#define BIOS_LOWMEM_KILOBYTES 0x413
5868 - * The BIOS places the EBDA/XBDA at the top of conventional
5869 - * memory, and usually decreases the reported amount of
5870 - * conventional memory (int 0x12) too. This also contains a
5871 - * workaround for Dell systems that neglect to reserve EBDA.
5872 - * The same workaround also avoids a problem with the AMD768MPX
5873 - * chipset: reserve a page before VGA to prevent PCI prefetch
5874 - * into it (errata #56). Usually the page is reserved anyways,
5875 - * unless you have no PS/2 mouse plugged in.
5877 -static void __init reserve_ebda_region(void)
5880 - unsigned int lowmem, ebda_addr;
5882 - /* To determine the position of the EBDA and the */
5883 - /* end of conventional memory, we need to look at */
5884 - /* the BIOS data area. In a paravirtual environment */
5885 - /* that area is absent. We'll just have to assume */
5886 - /* that the paravirt case can handle memory setup */
5887 - /* correctly, without our help. */
5888 - if (paravirt_enabled())
5891 - /* end of low (conventional) memory */
5892 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5895 - /* start of EBDA area */
5896 - ebda_addr = get_bios_ebda();
5898 - /* Fixup: bios puts an EBDA in the top 64K segment */
5899 - /* of conventional memory, but does not adjust lowmem. */
5900 - if ((lowmem - ebda_addr) <= 0x10000)
5901 - lowmem = ebda_addr;
5903 - /* Fixup: bios does not report an EBDA at all. */
5904 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5905 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5908 - /* Paranoia: should never happen, but... */
5909 - if ((lowmem == 0) || (lowmem >= 0x100000))
5912 - /* reserve all memory between lowmem and the 1MB mark */
5913 - reserve_early(lowmem, 0x100000, "BIOS reserved");
5917 -static void __init reserve_setup_data(void)
5920 - struct setup_data *data;
5921 - unsigned long pa_data;
5924 - if (boot_params.hdr.version < 0x0209)
5926 - pa_data = boot_params.hdr.setup_data;
5928 - data = early_ioremap(pa_data, sizeof(*data));
5929 - sprintf(buf, "setup data %x", data->type);
5930 - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
5931 - pa_data = data->next;
5932 - early_iounmap(data, sizeof(*data));
5937 void __init x86_64_start_kernel(char * real_mode_data)
5939 struct xen_machphys_mapping mapping;
5940 unsigned long machine_to_phys_nr_ents;
5944 * Build-time sanity checks on the kernel image and module
5945 @@ -167,6 +113,7 @@ void __init x86_64_start_kernel(char * r
5946 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
5947 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
5948 (__START_KERNEL & PGDIR_MASK)));
5949 + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
5951 xen_setup_features();
5953 @@ -174,8 +121,6 @@ void __init x86_64_start_kernel(char * r
5954 if (!xen_feature(XENFEAT_auto_translated_physmap))
5955 phys_to_machine_mapping =
5956 (unsigned long *)xen_start_info->mfn_list;
5957 - start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
5958 - xen_start_info->nr_pt_frames;
5960 machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
5961 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
5962 @@ -208,19 +153,23 @@ void __init x86_64_start_kernel(char * r
5964 early_printk("Kernel alive\n");
5966 - for (i = 0; i < NR_CPUS; i++)
5967 - cpu_pda(i) = &boot_cpu_pda[i];
5968 + x86_64_init_pda();
5971 + early_printk("Kernel really alive\n");
5973 + x86_64_start_reservations(real_mode_data);
5976 +void __init x86_64_start_reservations(char *real_mode_data)
5978 copy_bootdata(__va(real_mode_data));
5980 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
5982 reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
5983 - start_pfn << PAGE_SHIFT, "Xen provided");
5985 - reserve_ebda_region();
5986 - reserve_setup_data();
5987 + __pa(xen_start_info->pt_base)
5988 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
5992 * At this point everything still needed from the boot loader
5993 Index: head-2008-12-01/arch/x86/kernel/head_64-xen.S
5994 ===================================================================
5995 --- head-2008-12-01.orig/arch/x86/kernel/head_64-xen.S 2008-12-01 11:36:47.000000000 +0100
5996 +++ head-2008-12-01/arch/x86/kernel/head_64-xen.S 2008-12-01 11:49:07.000000000 +0100
5997 @@ -95,53 +95,6 @@ NEXT_PAGE(hypercall_page)
6004 - .globl cpu_gdt_descr
6006 - .word gdt_end-cpu_gdt_table-1
6008 - .quad cpu_gdt_table
6016 -/* We need valid kernel segments for data and code in long mode too
6017 - * IRET will check the segment types kkeil 2000/10/28
6018 - * Also sysret mandates a special GDT layout
6021 - .section .data.page_aligned, "aw"
6024 -/* The TLS descriptors are currently at a different place compared to i386.
6025 - Hopefully nobody expects them at a fixed place (Wine?) */
6027 -ENTRY(cpu_gdt_table)
6028 - .quad 0x0000000000000000 /* NULL descriptor */
6029 - .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
6030 - .quad 0x00af9b000000ffff /* __KERNEL_CS */
6031 - .quad 0x00cf93000000ffff /* __KERNEL_DS */
6032 - .quad 0x00cffb000000ffff /* __USER32_CS */
6033 - .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
6034 - .quad 0x00affb000000ffff /* __USER_CS */
6035 - .quad 0x0 /* unused */
6036 - .quad 0,0 /* TSS */
6037 - .quad 0,0 /* LDT */
6038 - .quad 0,0,0 /* three TLS descriptors */
6039 - .quad 0x0000f40000000000 /* node/CPU stored in limit */
6041 - /* asm/segment.h:GDT_ENTRIES must match this */
6042 - /* This should be a multiple of the cache line size */
6043 - /* GDTs of other CPUs are now dynamically allocated */
6045 - /* zero the remaining page */
6046 - .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
6048 .section .bss.page_aligned, "aw", @nobits
6050 ENTRY(empty_zero_page)
6051 Index: head-2008-12-01/arch/x86/kernel/io_apic_32-xen.c
6052 ===================================================================
6053 --- head-2008-12-01.orig/arch/x86/kernel/io_apic_32-xen.c 2008-12-01 11:44:55.000000000 +0100
6054 +++ head-2008-12-01/arch/x86/kernel/io_apic_32-xen.c 2008-12-01 11:49:07.000000000 +0100
6056 #include <linux/init.h>
6057 #include <linux/delay.h>
6058 #include <linux/sched.h>
6059 +#include <linux/bootmem.h>
6060 #include <linux/mc146818rtc.h>
6061 #include <linux/compiler.h>
6062 #include <linux/acpi.h>
6063 @@ -75,7 +76,7 @@ static struct { int pin, apic; } ioapic_
6064 static DEFINE_SPINLOCK(ioapic_lock);
6065 static DEFINE_SPINLOCK(vector_lock);
6067 -int timer_over_8254 __initdata = 1;
6068 +int timer_through_8259 __initdata;
6071 * Is the SiS APIC rmw bug present ?
6072 @@ -89,15 +90,21 @@ int sis_apic_bug = -1;
6073 int nr_ioapic_registers[MAX_IO_APICS];
6075 /* I/O APIC entries */
6076 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6077 +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
6080 /* MP IRQ source entries */
6081 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6082 +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6084 /* # of MP IRQ source entries */
6087 +#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
6088 +int mp_bus_id_to_type[MAX_MP_BUSSES];
6091 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
6093 static int disable_timer_pin_1 __initdata;
6096 @@ -128,7 +135,7 @@ struct io_apic {
6097 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
6099 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
6100 - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
6101 + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
6105 @@ -142,7 +149,7 @@ static inline unsigned int io_apic_read(
6106 struct physdev_apic apic_op;
6109 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6110 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6112 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
6114 @@ -160,7 +167,7 @@ static inline void io_apic_write(unsigne
6116 struct physdev_apic apic_op;
6118 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6119 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6121 apic_op.value = value;
6122 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
6123 @@ -288,7 +295,7 @@ static void __init replace_pin_at_irq(un
6127 -static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
6128 +static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
6130 struct irq_pin_list *entry = irq_2_pin + irq;
6131 unsigned int pin, reg;
6132 @@ -308,30 +315,32 @@ static void __modify_IO_APIC_irq (unsign
6136 -static void __mask_IO_APIC_irq (unsigned int irq)
6137 +static void __mask_IO_APIC_irq(unsigned int irq)
6139 - __modify_IO_APIC_irq(irq, 0x00010000, 0);
6140 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
6144 -static void __unmask_IO_APIC_irq (unsigned int irq)
6145 +static void __unmask_IO_APIC_irq(unsigned int irq)
6147 - __modify_IO_APIC_irq(irq, 0, 0x00010000);
6148 + __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
6151 /* mask = 1, trigger = 0 */
6152 -static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
6153 +static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
6155 - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
6156 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
6157 + IO_APIC_REDIR_LEVEL_TRIGGER);
6160 /* mask = 0, trigger = 1 */
6161 -static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
6162 +static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
6164 - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
6165 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
6166 + IO_APIC_REDIR_MASKED);
6169 -static void mask_IO_APIC_irq (unsigned int irq)
6170 +static void mask_IO_APIC_irq(unsigned int irq)
6172 unsigned long flags;
6174 @@ -340,7 +349,7 @@ static void mask_IO_APIC_irq (unsigned i
6175 spin_unlock_irqrestore(&ioapic_lock, flags);
6178 -static void unmask_IO_APIC_irq (unsigned int irq)
6179 +static void unmask_IO_APIC_irq(unsigned int irq)
6181 unsigned long flags;
6183 @@ -352,7 +361,7 @@ static void unmask_IO_APIC_irq (unsigned
6184 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
6186 struct IO_APIC_route_entry entry;
6189 /* Check delivery_mode to be sure we're not clearing an SMI pin */
6190 entry = ioapic_read_entry(apic, pin);
6191 if (entry.delivery_mode == dest_SMI)
6192 @@ -364,7 +373,7 @@ static void clear_IO_APIC_pin(unsigned i
6193 ioapic_mask_entry(apic, pin);
6196 -static void clear_IO_APIC (void)
6197 +static void clear_IO_APIC(void)
6201 @@ -381,7 +390,7 @@ static void set_ioapic_affinity_irq(unsi
6202 struct irq_pin_list *entry = irq_2_pin + irq;
6203 unsigned int apicid_value;
6207 cpus_and(tmp, cpumask, cpu_online_map);
6208 if (cpus_empty(tmp))
6210 @@ -410,7 +419,7 @@ static void set_ioapic_affinity_irq(unsi
6211 # include <linux/kernel_stat.h> /* kstat */
6212 # include <linux/slab.h> /* kmalloc() */
6213 # include <linux/timer.h>
6216 #define IRQBALANCE_CHECK_ARCH -999
6217 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
6218 #define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
6219 @@ -422,14 +431,14 @@ static int physical_balance __read_mostl
6220 static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
6222 static struct irq_cpu_info {
6223 - unsigned long * last_irq;
6224 - unsigned long * irq_delta;
6225 + unsigned long *last_irq;
6226 + unsigned long *irq_delta;
6228 } irq_cpu_data[NR_CPUS];
6230 #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
6231 -#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
6232 -#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
6233 +#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
6234 +#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
6236 #define IDLE_ENOUGH(cpu,now) \
6237 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
6238 @@ -468,8 +477,8 @@ inside:
6242 - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
6243 - (search_idle && !IDLE_ENOUGH(cpu,now)));
6244 + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
6245 + (search_idle && !IDLE_ENOUGH(cpu, now)));
6249 @@ -479,15 +488,14 @@ static inline void balance_irq(int cpu,
6250 unsigned long now = jiffies;
6251 cpumask_t allowed_mask;
6252 unsigned int new_cpu;
6255 if (irqbalance_disabled)
6259 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
6260 new_cpu = move(cpu, allowed_mask, now, 1);
6261 - if (cpu != new_cpu) {
6262 + if (cpu != new_cpu)
6263 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
6267 static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
6268 @@ -499,14 +507,14 @@ static inline void rotate_irqs_among_cpu
6269 if (!irq_desc[j].action)
6271 /* Is it a significant load ? */
6272 - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
6273 + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
6274 useful_load_threshold)
6279 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6280 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6281 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6285 @@ -535,22 +543,22 @@ static void do_irq_balance(void)
6286 /* Is this an active IRQ or balancing disabled ? */
6287 if (!irq_desc[j].action || irq_balancing_disabled(j))
6289 - if ( package_index == i )
6290 - IRQ_DELTA(package_index,j) = 0;
6291 + if (package_index == i)
6292 + IRQ_DELTA(package_index, j) = 0;
6293 /* Determine the total count per processor per IRQ */
6294 value_now = (unsigned long) kstat_cpu(i).irqs[j];
6296 /* Determine the activity per processor per IRQ */
6297 - delta = value_now - LAST_CPU_IRQ(i,j);
6298 + delta = value_now - LAST_CPU_IRQ(i, j);
6300 /* Update last_cpu_irq[][] for the next time */
6301 - LAST_CPU_IRQ(i,j) = value_now;
6302 + LAST_CPU_IRQ(i, j) = value_now;
6304 /* Ignore IRQs whose rate is less than the clock */
6305 if (delta < useful_load_threshold)
6307 /* update the load for the processor or package total */
6308 - IRQ_DELTA(package_index,j) += delta;
6309 + IRQ_DELTA(package_index, j) += delta;
6311 /* Keep track of the higher numbered sibling as well */
6312 if (i != package_index)
6313 @@ -576,7 +584,8 @@ static void do_irq_balance(void)
6314 max_cpu_irq = ULONG_MAX;
6317 - /* Look for heaviest loaded processor.
6319 + * Look for heaviest loaded processor.
6320 * We may come back to get the next heaviest loaded processor.
6321 * Skip processors with trivial loads.
6323 @@ -585,7 +594,7 @@ tryanothercpu:
6324 for_each_online_cpu(i) {
6325 if (i != CPU_TO_PACKAGEINDEX(i))
6327 - if (max_cpu_irq <= CPU_IRQ(i))
6328 + if (max_cpu_irq <= CPU_IRQ(i))
6330 if (tmp_cpu_irq < CPU_IRQ(i)) {
6331 tmp_cpu_irq = CPU_IRQ(i);
6332 @@ -594,8 +603,9 @@ tryanothercpu:
6335 if (tmp_loaded == -1) {
6336 - /* In the case of small number of heavy interrupt sources,
6337 - * loading some of the cpus too much. We use Ingo's original
6339 + * In the case of small number of heavy interrupt sources,
6340 + * loading some of the cpus too much. We use Ingo's original
6341 * approach to rotate them around.
6343 if (!first_attempt && imbalance >= useful_load_threshold) {
6344 @@ -604,13 +614,14 @@ tryanothercpu:
6346 goto not_worth_the_effort;
6350 first_attempt = 0; /* heaviest search */
6351 max_cpu_irq = tmp_cpu_irq; /* load */
6352 max_loaded = tmp_loaded; /* processor */
6353 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
6355 - /* if imbalance is less than approx 10% of max load, then
6358 + * if imbalance is less than approx 10% of max load, then
6359 * observe diminishing returns action. - quit
6361 if (imbalance < (max_cpu_irq >> 3))
6362 @@ -626,26 +637,25 @@ tryanotherirq:
6363 /* Is this an active IRQ? */
6364 if (!irq_desc[j].action)
6366 - if (imbalance <= IRQ_DELTA(max_loaded,j))
6367 + if (imbalance <= IRQ_DELTA(max_loaded, j))
6369 /* Try to find the IRQ that is closest to the imbalance
6370 * without going over.
6372 - if (move_this_load < IRQ_DELTA(max_loaded,j)) {
6373 - move_this_load = IRQ_DELTA(max_loaded,j);
6374 + if (move_this_load < IRQ_DELTA(max_loaded, j)) {
6375 + move_this_load = IRQ_DELTA(max_loaded, j);
6379 - if (selected_irq == -1) {
6380 + if (selected_irq == -1)
6384 imbalance = move_this_load;
6387 /* For physical_balance case, we accumulated both load
6388 * values in the one of the siblings cpu_irq[],
6389 * to use the same code for physical and logical processors
6390 - * as much as possible.
6391 + * as much as possible.
6393 * NOTE: the cpu_irq[] array holds the sum of the load for
6394 * sibling A and sibling B in the slot for the lowest numbered
6395 @@ -674,11 +684,11 @@ tryanotherirq:
6396 /* mark for change destination */
6397 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
6399 - /* Since we made a change, come back sooner to
6400 + /* Since we made a change, come back sooner to
6401 * check for more variation.
6403 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6404 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6405 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6409 @@ -689,7 +699,7 @@ not_worth_the_effort:
6412 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
6413 - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6414 + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6418 @@ -728,13 +738,13 @@ static int __init balanced_irq_init(void
6421 cpus_shift_right(tmp, cpu_online_map, 2);
6422 - c = &boot_cpu_data;
6423 + c = &boot_cpu_data;
6424 /* When not overwritten by the command line ask subarchitecture. */
6425 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
6426 irqbalance_disabled = NO_BALANCE_IRQ;
6427 if (irqbalance_disabled)
6431 /* disable irqbalance completely if there is only one processor online */
6432 if (num_online_cpus() < 2) {
6433 irqbalance_disabled = 1;
6434 @@ -748,16 +758,14 @@ static int __init balanced_irq_init(void
6435 physical_balance = 1;
6437 for_each_online_cpu(i) {
6438 - irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6439 - irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6440 + irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6441 + irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6442 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
6443 printk(KERN_ERR "balanced_irq_init: out of memory");
6446 - memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
6447 - memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
6451 printk(KERN_INFO "Starting balanced_irq\n");
6452 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
6454 @@ -799,7 +807,7 @@ void send_IPI_self(int vector)
6456 * Send the IPI. The write to APIC_ICR fires this off.
6458 - apic_write_around(APIC_ICR, cfg);
6459 + apic_write(APIC_ICR, cfg);
6462 #endif /* !CONFIG_SMP */
6463 @@ -853,10 +861,10 @@ static int find_irq_entry(int apic, int
6466 for (i = 0; i < mp_irq_entries; i++)
6467 - if (mp_irqs[i].mpc_irqtype == type &&
6468 - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
6469 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
6470 - mp_irqs[i].mpc_dstirq == pin)
6471 + if (mp_irqs[i].mp_irqtype == type &&
6472 + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
6473 + mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
6474 + mp_irqs[i].mp_dstirq == pin)
6478 @@ -871,13 +879,13 @@ static int __init find_isa_irq_pin(int i
6481 for (i = 0; i < mp_irq_entries; i++) {
6482 - int lbus = mp_irqs[i].mpc_srcbus;
6483 + int lbus = mp_irqs[i].mp_srcbus;
6485 if (test_bit(lbus, mp_bus_not_pci) &&
6486 - (mp_irqs[i].mpc_irqtype == type) &&
6487 - (mp_irqs[i].mpc_srcbusirq == irq))
6488 + (mp_irqs[i].mp_irqtype == type) &&
6489 + (mp_irqs[i].mp_srcbusirq == irq))
6491 - return mp_irqs[i].mpc_dstirq;
6492 + return mp_irqs[i].mp_dstirq;
6496 @@ -887,17 +895,17 @@ static int __init find_isa_irq_apic(int
6499 for (i = 0; i < mp_irq_entries; i++) {
6500 - int lbus = mp_irqs[i].mpc_srcbus;
6501 + int lbus = mp_irqs[i].mp_srcbus;
6503 if (test_bit(lbus, mp_bus_not_pci) &&
6504 - (mp_irqs[i].mpc_irqtype == type) &&
6505 - (mp_irqs[i].mpc_srcbusirq == irq))
6506 + (mp_irqs[i].mp_irqtype == type) &&
6507 + (mp_irqs[i].mp_srcbusirq == irq))
6510 if (i < mp_irq_entries) {
6512 - for(apic = 0; apic < nr_ioapics; apic++) {
6513 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
6514 + for (apic = 0; apic < nr_ioapics; apic++) {
6515 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
6519 @@ -918,28 +926,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6521 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
6522 "slot:%d, pin:%d.\n", bus, slot, pin);
6523 - if (mp_bus_id_to_pci_bus[bus] == -1) {
6524 + if (test_bit(bus, mp_bus_not_pci)) {
6525 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
6528 for (i = 0; i < mp_irq_entries; i++) {
6529 - int lbus = mp_irqs[i].mpc_srcbus;
6530 + int lbus = mp_irqs[i].mp_srcbus;
6532 for (apic = 0; apic < nr_ioapics; apic++)
6533 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
6534 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
6535 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
6536 + mp_irqs[i].mp_dstapic == MP_APIC_ALL)
6539 if (!test_bit(lbus, mp_bus_not_pci) &&
6540 - !mp_irqs[i].mpc_irqtype &&
6541 + !mp_irqs[i].mp_irqtype &&
6543 - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
6544 - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
6545 + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
6546 + int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
6548 if (!(apic || IO_APIC_IRQ(irq)))
6551 - if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
6552 + if (pin == (mp_irqs[i].mp_srcbusirq & 3))
6555 * Use the first all-but-pin matching entry as a
6556 @@ -954,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6557 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
6560 - * This function currently is only a helper for the i386 smp boot process where
6561 + * This function currently is only a helper for the i386 smp boot process where
6562 * we need to reprogram the ioredtbls to cater for the cpus which have come online
6563 * so mask in all cases should simply be TARGET_CPUS
6565 @@ -1008,7 +1016,7 @@ static int EISA_ELCR(unsigned int irq)
6566 * EISA conforming in the MP table, that means its trigger type must
6567 * be read in from the ELCR */
6569 -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
6570 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
6571 #define default_EISA_polarity(idx) default_ISA_polarity(idx)
6573 /* PCI interrupts are always polarity one level triggered,
6574 @@ -1025,118 +1033,115 @@ static int EISA_ELCR(unsigned int irq)
6576 static int MPBIOS_polarity(int idx)
6578 - int bus = mp_irqs[idx].mpc_srcbus;
6579 + int bus = mp_irqs[idx].mp_srcbus;
6583 * Determine IRQ line polarity (high active or low active):
6585 - switch (mp_irqs[idx].mpc_irqflag & 3)
6586 + switch (mp_irqs[idx].mp_irqflag & 3) {
6587 + case 0: /* conforms, ie. bus-type dependent polarity */
6589 - case 0: /* conforms, ie. bus-type dependent polarity */
6591 - polarity = test_bit(bus, mp_bus_not_pci)?
6592 - default_ISA_polarity(idx):
6593 - default_PCI_polarity(idx);
6596 - case 1: /* high active */
6601 - case 2: /* reserved */
6603 - printk(KERN_WARNING "broken BIOS!!\n");
6607 - case 3: /* low active */
6612 - default: /* invalid */
6614 - printk(KERN_WARNING "broken BIOS!!\n");
6618 + polarity = test_bit(bus, mp_bus_not_pci)?
6619 + default_ISA_polarity(idx):
6620 + default_PCI_polarity(idx);
6623 + case 1: /* high active */
6628 + case 2: /* reserved */
6630 + printk(KERN_WARNING "broken BIOS!!\n");
6634 + case 3: /* low active */
6639 + default: /* invalid */
6641 + printk(KERN_WARNING "broken BIOS!!\n");
6649 static int MPBIOS_trigger(int idx)
6651 - int bus = mp_irqs[idx].mpc_srcbus;
6652 + int bus = mp_irqs[idx].mp_srcbus;
6656 * Determine IRQ trigger mode (edge or level sensitive):
6658 - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
6659 + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
6660 + case 0: /* conforms, ie. bus-type dependent */
6662 - case 0: /* conforms, ie. bus-type dependent */
6664 - trigger = test_bit(bus, mp_bus_not_pci)?
6665 - default_ISA_trigger(idx):
6666 - default_PCI_trigger(idx);
6667 + trigger = test_bit(bus, mp_bus_not_pci)?
6668 + default_ISA_trigger(idx):
6669 + default_PCI_trigger(idx);
6670 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
6671 - switch (mp_bus_id_to_type[bus])
6673 - case MP_BUS_ISA: /* ISA pin */
6675 - /* set before the switch */
6678 - case MP_BUS_EISA: /* EISA pin */
6680 - trigger = default_EISA_trigger(idx);
6683 - case MP_BUS_PCI: /* PCI pin */
6685 - /* set before the switch */
6688 - case MP_BUS_MCA: /* MCA pin */
6690 - trigger = default_MCA_trigger(idx);
6695 - printk(KERN_WARNING "broken BIOS!!\n");
6701 + switch (mp_bus_id_to_type[bus]) {
6702 + case MP_BUS_ISA: /* ISA pin */
6704 + /* set before the switch */
6707 - case 1: /* edge */
6708 + case MP_BUS_EISA: /* EISA pin */
6711 + trigger = default_EISA_trigger(idx);
6714 - case 2: /* reserved */
6715 + case MP_BUS_PCI: /* PCI pin */
6717 - printk(KERN_WARNING "broken BIOS!!\n");
6719 + /* set before the switch */
6722 - case 3: /* level */
6723 + case MP_BUS_MCA: /* MCA pin */
6726 + trigger = default_MCA_trigger(idx);
6729 - default: /* invalid */
6732 printk(KERN_WARNING "broken BIOS!!\n");
6741 + case 1: /* edge */
6746 + case 2: /* reserved */
6748 + printk(KERN_WARNING "broken BIOS!!\n");
6752 + case 3: /* level */
6757 + default: /* invalid */
6759 + printk(KERN_WARNING "broken BIOS!!\n");
6767 @@ -1153,16 +1158,16 @@ static inline int irq_trigger(int idx)
6768 static int pin_2_irq(int idx, int apic, int pin)
6771 - int bus = mp_irqs[idx].mpc_srcbus;
6772 + int bus = mp_irqs[idx].mp_srcbus;
6775 * Debugging check, we are in big trouble if this message pops up!
6777 - if (mp_irqs[idx].mpc_dstirq != pin)
6778 + if (mp_irqs[idx].mp_dstirq != pin)
6779 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
6781 if (test_bit(bus, mp_bus_not_pci))
6782 - irq = mp_irqs[idx].mpc_srcbusirq;
6783 + irq = mp_irqs[idx].mp_srcbusirq;
6786 * PCI IRQs are mapped in order
6787 @@ -1204,8 +1209,8 @@ static inline int IO_APIC_irq_trigger(in
6789 for (apic = 0; apic < nr_ioapics; apic++) {
6790 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6791 - idx = find_irq_entry(apic,pin,mp_INT);
6792 - if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
6793 + idx = find_irq_entry(apic, pin, mp_INT);
6794 + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
6795 return irq_trigger(idx);
6798 @@ -1291,25 +1296,25 @@ static void __init setup_IO_APIC_irqs(vo
6800 * add it to the IO-APIC irq-routing table:
6802 - memset(&entry,0,sizeof(entry));
6803 + memset(&entry, 0, sizeof(entry));
6805 entry.delivery_mode = INT_DELIVERY_MODE;
6806 entry.dest_mode = INT_DEST_MODE;
6807 entry.mask = 0; /* enable IRQ */
6808 - entry.dest.logical.logical_dest =
6809 + entry.dest.logical.logical_dest =
6810 cpu_mask_to_apicid(TARGET_CPUS);
6812 - idx = find_irq_entry(apic,pin,mp_INT);
6813 + idx = find_irq_entry(apic, pin, mp_INT);
6816 apic_printk(APIC_VERBOSE, KERN_DEBUG
6817 " IO-APIC (apicid-pin) %d-%d",
6818 - mp_ioapics[apic].mpc_apicid,
6819 + mp_ioapics[apic].mp_apicid,
6823 apic_printk(APIC_VERBOSE, ", %d-%d",
6824 - mp_ioapics[apic].mpc_apicid, pin);
6825 + mp_ioapics[apic].mp_apicid, pin);
6829 @@ -1343,7 +1348,7 @@ static void __init setup_IO_APIC_irqs(vo
6830 vector = assign_irq_vector(irq);
6831 entry.vector = vector;
6832 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
6835 if (!apic && (irq < 16))
6836 disable_8259A_irq(irq);
6838 @@ -1355,27 +1360,23 @@ static void __init setup_IO_APIC_irqs(vo
6839 apic_printk(APIC_VERBOSE, " not connected.\n");
6844 - * Set up the 8259A-master output pin:
6845 + * Set up the timer pin, possibly with the 8259A-master behind.
6848 -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
6849 +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
6852 struct IO_APIC_route_entry entry;
6854 - memset(&entry,0,sizeof(entry));
6856 - disable_8259A_irq(0);
6859 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6860 + memset(&entry, 0, sizeof(entry));
6863 * We use logical delivery to get the timer IRQ
6866 entry.dest_mode = INT_DEST_MODE;
6867 - entry.mask = 0; /* unmask IRQ now */
6868 + entry.mask = 1; /* mask IRQ now */
6869 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6870 entry.delivery_mode = INT_DELIVERY_MODE;
6872 @@ -1384,17 +1385,14 @@ static void __init setup_ExtINT_IRQ0_pin
6875 * The timer IRQ doesn't have to know that behind the
6876 - * scene we have a 8259A-master in AEOI mode ...
6877 + * scene we may have a 8259A-master in AEOI mode ...
6879 - irq_desc[0].chip = &ioapic_chip;
6880 - set_irq_handler(0, handle_edge_irq);
6881 + ioapic_register_intr(0, vector, IOAPIC_EDGE);
6884 * Add it to the IO-APIC irq-routing table:
6886 ioapic_write_entry(apic, pin, entry);
6888 - enable_8259A_irq(0);
6891 void __init print_IO_APIC(void)
6892 @@ -1409,10 +1407,10 @@ void __init print_IO_APIC(void)
6893 if (apic_verbosity == APIC_QUIET)
6896 - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6897 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6898 for (i = 0; i < nr_ioapics; i++)
6899 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
6900 - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
6901 + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
6904 * We are a bit conservative about what we expect. We have to
6905 @@ -1431,7 +1429,7 @@ void __init print_IO_APIC(void)
6906 reg_03.raw = io_apic_read(apic, 3);
6907 spin_unlock_irqrestore(&ioapic_lock, flags);
6909 - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
6910 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
6911 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
6912 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
6913 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
6914 @@ -1512,7 +1510,7 @@ void __init print_IO_APIC(void)
6918 -static void print_APIC_bitfield (int base)
6919 +static void print_APIC_bitfield(int base)
6923 @@ -1533,7 +1531,7 @@ static void print_APIC_bitfield (int bas
6927 -void /*__init*/ print_local_APIC(void * dummy)
6928 +void /*__init*/ print_local_APIC(void *dummy)
6930 unsigned int v, ver, maxlvt;
6932 @@ -1542,6 +1540,7 @@ void /*__init*/ print_local_APIC(void *
6934 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
6935 smp_processor_id(), hard_smp_processor_id());
6936 + v = apic_read(APIC_ID);
6937 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
6938 GET_APIC_ID(read_apic_id()));
6939 v = apic_read(APIC_LVR);
6940 @@ -1616,9 +1615,9 @@ void /*__init*/ print_local_APIC(void *
6944 -void print_all_local_APICs (void)
6945 +void print_all_local_APICs(void)
6947 - on_each_cpu(print_local_APIC, NULL, 1, 1);
6948 + on_each_cpu(print_local_APIC, NULL, 1);
6951 void /*__init*/ print_PIC(void)
6952 @@ -1639,11 +1638,11 @@ void /*__init*/ print_PIC(void)
6953 v = inb(0xa0) << 8 | inb(0x20);
6954 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
6960 v = inb(0xa0) << 8 | inb(0x20);
6966 spin_unlock_irqrestore(&i8259A_lock, flags);
6968 @@ -1652,6 +1651,8 @@ void /*__init*/ print_PIC(void)
6969 v = inb(0x4d1) << 8 | inb(0x4d0);
6970 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
6973 +void __init print_IO_APIC(void) {}
6974 #endif /* !CONFIG_XEN */
6976 static void __init enable_IO_APIC(void)
6977 @@ -1681,7 +1682,7 @@ static void __init enable_IO_APIC(void)
6978 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
6981 - for(apic = 0; apic < nr_ioapics; apic++) {
6982 + for (apic = 0; apic < nr_ioapics; apic++) {
6984 /* See if any of the pins is in ExtINT mode */
6985 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6986 @@ -1774,7 +1775,7 @@ void disable_IO_APIC(void)
6987 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
6990 -#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
6992 static void __init setup_ioapic_ids_from_mpc(void)
6994 union IO_APIC_reg_00 reg_00;
6995 @@ -1784,6 +1785,11 @@ static void __init setup_ioapic_ids_from
6996 unsigned char old_id;
6997 unsigned long flags;
6999 +#ifdef CONFIG_X86_NUMAQ
7005 * Don't check I/O APIC IDs for xAPIC systems. They have
7006 * no meaning without the serial APIC bus.
7007 @@ -1806,15 +1812,15 @@ static void __init setup_ioapic_ids_from
7008 spin_lock_irqsave(&ioapic_lock, flags);
7009 reg_00.raw = io_apic_read(apic, 0);
7010 spin_unlock_irqrestore(&ioapic_lock, flags);
7012 - old_id = mp_ioapics[apic].mpc_apicid;
7014 - if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
7015 + old_id = mp_ioapics[apic].mp_apicid;
7017 + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
7018 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
7019 - apic, mp_ioapics[apic].mpc_apicid);
7020 + apic, mp_ioapics[apic].mp_apicid);
7021 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7023 - mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
7024 + mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
7028 @@ -1823,9 +1829,9 @@ static void __init setup_ioapic_ids_from
7029 * 'stuck on smp_invalidate_needed IPI wait' messages.
7031 if (check_apicid_used(phys_id_present_map,
7032 - mp_ioapics[apic].mpc_apicid)) {
7033 + mp_ioapics[apic].mp_apicid)) {
7034 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
7035 - apic, mp_ioapics[apic].mpc_apicid);
7036 + apic, mp_ioapics[apic].mp_apicid);
7037 for (i = 0; i < get_physical_broadcast(); i++)
7038 if (!physid_isset(i, phys_id_present_map))
7040 @@ -1834,13 +1840,13 @@ static void __init setup_ioapic_ids_from
7041 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7043 physid_set(i, phys_id_present_map);
7044 - mp_ioapics[apic].mpc_apicid = i;
7045 + mp_ioapics[apic].mp_apicid = i;
7048 - tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
7049 + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
7050 apic_printk(APIC_VERBOSE, "Setting %d in the "
7051 "phys_id_present_map\n",
7052 - mp_ioapics[apic].mpc_apicid);
7053 + mp_ioapics[apic].mp_apicid);
7054 physids_or(phys_id_present_map, phys_id_present_map, tmp);
7057 @@ -1849,21 +1855,21 @@ static void __init setup_ioapic_ids_from
7058 * We need to adjust the IRQ routing table
7059 * if the ID changed.
7061 - if (old_id != mp_ioapics[apic].mpc_apicid)
7062 + if (old_id != mp_ioapics[apic].mp_apicid)
7063 for (i = 0; i < mp_irq_entries; i++)
7064 - if (mp_irqs[i].mpc_dstapic == old_id)
7065 - mp_irqs[i].mpc_dstapic
7066 - = mp_ioapics[apic].mpc_apicid;
7067 + if (mp_irqs[i].mp_dstapic == old_id)
7068 + mp_irqs[i].mp_dstapic
7069 + = mp_ioapics[apic].mp_apicid;
7072 * Read the right value from the MPC table and
7073 * write it into the ID register.
7076 apic_printk(APIC_VERBOSE, KERN_INFO
7077 "...changing IO-APIC physical APIC ID to %d ...",
7078 - mp_ioapics[apic].mpc_apicid);
7079 + mp_ioapics[apic].mp_apicid);
7081 - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
7082 + reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
7083 spin_lock_irqsave(&ioapic_lock, flags);
7084 io_apic_write(apic, 0, reg_00.raw);
7085 spin_unlock_irqrestore(&ioapic_lock, flags);
7086 @@ -1874,17 +1880,13 @@ static void __init setup_ioapic_ids_from
7087 spin_lock_irqsave(&ioapic_lock, flags);
7088 reg_00.raw = io_apic_read(apic, 0);
7089 spin_unlock_irqrestore(&ioapic_lock, flags);
7090 - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
7091 + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
7092 printk("could not set ID!\n");
7094 apic_printk(APIC_VERBOSE, " ok.\n");
7098 -static void __init setup_ioapic_ids_from_mpc(void) { }
7102 int no_timer_check __initdata;
7104 static int __init notimercheck(char *s)
7105 @@ -2077,45 +2079,53 @@ static inline void init_IO_APIC_traps(vo
7106 * The local APIC irq-chip implementation:
7109 -static void ack_apic(unsigned int irq)
7110 +static void ack_lapic_irq(unsigned int irq)
7115 -static void mask_lapic_irq (unsigned int irq)
7116 +static void mask_lapic_irq(unsigned int irq)
7120 v = apic_read(APIC_LVT0);
7121 - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
7122 + apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
7125 -static void unmask_lapic_irq (unsigned int irq)
7126 +static void unmask_lapic_irq(unsigned int irq)
7130 v = apic_read(APIC_LVT0);
7131 - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
7132 + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
7135 static struct irq_chip lapic_chip __read_mostly = {
7136 - .name = "local-APIC-edge",
7137 + .name = "local-APIC",
7138 .mask = mask_lapic_irq,
7139 .unmask = unmask_lapic_irq,
7141 + .ack = ack_lapic_irq,
7144 +static void lapic_register_intr(int irq, int vector)
7146 + irq_desc[irq].status &= ~IRQ_LEVEL;
7147 + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
7149 + set_intr_gate(vector, interrupt[irq]);
7152 static void __init setup_nmi(void)
7155 - * Dirty trick to enable the NMI watchdog ...
7156 + * Dirty trick to enable the NMI watchdog ...
7157 * We put the 8259A master into AEOI mode and
7158 * unmask on all local APICs LVT0 as NMI.
7160 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
7161 * is from Maciej W. Rozycki - so we do not have to EOI from
7162 * the NMI handler or the timer interrupt.
7165 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
7167 enable_NMI_through_LVT0();
7168 @@ -2191,11 +2201,16 @@ static inline void __init unlock_ExtINT_
7169 static inline void __init check_timer(void)
7171 int apic1, pin1, apic2, pin2;
7175 unsigned long flags;
7177 local_irq_save(flags);
7179 + ver = apic_read(APIC_LVR);
7180 + ver = GET_APIC_VERSION(ver);
7183 * get/set the timer IRQ vector:
7185 @@ -2204,34 +2219,54 @@ static inline void __init check_timer(vo
7186 set_intr_gate(vector, interrupt[0]);
7189 - * Subtle, code in do_timer_interrupt() expects an AEOI
7190 - * mode for the 8259A whenever interrupts are routed
7191 - * through I/O APICs. Also IRQ0 has to be enabled in
7192 - * the 8259A which implies the virtual wire has to be
7193 - * disabled in the local APIC.
7194 + * As IRQ0 is to be enabled in the 8259A, the virtual
7195 + * wire has to be disabled in the local APIC. Also
7196 + * timer interrupts need to be acknowledged manually in
7197 + * the 8259A for the i82489DX when using the NMI
7198 + * watchdog as that APIC treats NMIs as level-triggered.
7199 + * The AEOI mode will finish them in the 8259A
7202 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7203 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7206 - if (timer_over_8254 > 0)
7207 - enable_8259A_irq(0);
7208 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
7210 pin1 = find_isa_irq_pin(0, mp_INT);
7211 apic1 = find_isa_irq_apic(0, mp_INT);
7212 pin2 = ioapic_i8259.pin;
7213 apic2 = ioapic_i8259.apic;
7215 - printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
7216 - vector, apic1, pin1, apic2, pin2);
7217 + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
7218 + "apic1=%d pin1=%d apic2=%d pin2=%d\n",
7219 + vector, apic1, pin1, apic2, pin2);
7222 + * Some BIOS writers are clueless and report the ExtINTA
7223 + * I/O APIC input from the cascaded 8259A as the timer
7224 + * interrupt input. So just in case, if only one pin
7225 + * was found above, try it both directly and through the
7232 + } else if (pin2 == -1) {
7239 * Ok, does IRQ0 through the IOAPIC work?
7242 + add_pin_to_irq(0, apic1, pin1);
7243 + setup_timer_IRQ0_pin(apic1, pin1, vector);
7245 unmask_IO_APIC_irq(0);
7246 if (timer_irq_works()) {
7247 if (nmi_watchdog == NMI_IO_APIC) {
7248 - disable_8259A_irq(0);
7250 enable_8259A_irq(0);
7252 @@ -2240,71 +2275,77 @@ static inline void __init check_timer(vo
7255 clear_IO_APIC_pin(apic1, pin1);
7256 - printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
7260 - printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
7262 - printk("\n..... (found pin %d) ...", pin2);
7264 + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
7265 + "8254 timer not connected to IO-APIC\n");
7267 + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
7268 + "(IRQ0) through the 8259A ...\n");
7269 + apic_printk(APIC_QUIET, KERN_INFO
7270 + "..... (found apic %d pin %d) ...\n", apic2, pin2);
7272 * legacy devices should be connected to IO APIC #0
7274 - setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
7275 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7276 + setup_timer_IRQ0_pin(apic2, pin2, vector);
7277 + unmask_IO_APIC_irq(0);
7278 + enable_8259A_irq(0);
7279 if (timer_irq_works()) {
7280 - printk("works.\n");
7282 - replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7284 - add_pin_to_irq(0, apic2, pin2);
7285 + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
7286 + timer_through_8259 = 1;
7287 if (nmi_watchdog == NMI_IO_APIC) {
7288 + disable_8259A_irq(0);
7290 + enable_8259A_irq(0);
7295 * Cleanup, just in case ...
7297 + disable_8259A_irq(0);
7298 clear_IO_APIC_pin(apic2, pin2);
7299 + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
7301 - printk(" failed.\n");
7303 if (nmi_watchdog == NMI_IO_APIC) {
7304 - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
7306 + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
7307 + "through the IO-APIC - disabling NMI Watchdog!\n");
7308 + nmi_watchdog = NMI_NONE;
7312 - printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
7313 + apic_printk(APIC_QUIET, KERN_INFO
7314 + "...trying to set up timer as Virtual Wire IRQ...\n");
7316 - disable_8259A_irq(0);
7317 - set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
7319 - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7320 + lapic_register_intr(0, vector);
7321 + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7322 enable_8259A_irq(0);
7324 if (timer_irq_works()) {
7325 - printk(" works.\n");
7326 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7329 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7330 - printk(" failed.\n");
7331 + disable_8259A_irq(0);
7332 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7333 + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
7335 - printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
7336 + apic_printk(APIC_QUIET, KERN_INFO
7337 + "...trying to set up timer as ExtINT IRQ...\n");
7342 - apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
7343 + apic_write(APIC_LVT0, APIC_DM_EXTINT);
7345 unlock_ExtINT_logic();
7347 if (timer_irq_works()) {
7348 - printk(" works.\n");
7349 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7352 - printk(" failed :(.\n");
7353 + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
7354 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
7355 - "report. Then try booting with the 'noapic' option");
7356 + "report. Then try booting with the 'noapic' option.\n");
7358 local_irq_restore(flags);
7360 @@ -2314,11 +2355,21 @@ int timer_uses_ioapic_pin_0 = 0;
7365 - * IRQ's that are handled by the PIC in the MPS IOAPIC case.
7366 - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
7367 - * Linux doesn't really care, as it's not actually used
7368 - * for any interrupt handling anyway.
7369 + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
7370 + * to devices. However there may be an I/O APIC pin available for
7371 + * this interrupt regardless. The pin may be left unconnected, but
7372 + * typically it will be reused as an ExtINT cascade interrupt for
7373 + * the master 8259A. In the MPS case such a pin will normally be
7374 + * reported as an ExtINT interrupt in the MP table. With ACPI
7375 + * there is no provision for ExtINT interrupts, and in the absence
7376 + * of an override it would be treated as an ordinary ISA I/O APIC
7377 + * interrupt, that is edge-triggered and unmasked by default. We
7378 + * used to do this, but it caused problems on some systems because
7379 + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
7380 + * the same ExtINT cascade interrupt to drive the local APIC of the
7381 + * bootstrap processor. Therefore we refrain from routing IRQ2 to
7382 + * the I/O APIC in all cases now. No actual device should request
7383 + * it anyway. --macro
7385 #define PIC_IRQS (1 << PIC_CASCADE_IR)
7387 @@ -2328,25 +2379,22 @@ void __init setup_IO_APIC(void)
7390 /* Reserve all the system vectors. */
7391 - for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
7392 + for (i = first_system_vector; i < NR_VECTORS; i++)
7393 set_bit(i, used_vectors);
7399 - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
7401 - io_apic_irqs = ~PIC_IRQS;
7402 + io_apic_irqs = ~PIC_IRQS;
7404 printk("ENABLING IO-APIC IRQs\n");
7408 * Set up IO-APIC IRQ routing.
7411 setup_ioapic_ids_from_mpc();
7415 setup_IO_APIC_irqs();
7416 @@ -2356,28 +2404,14 @@ void __init setup_IO_APIC(void)
7420 -static int __init setup_disable_8254_timer(char *s)
7422 - timer_over_8254 = -1;
7425 -static int __init setup_enable_8254_timer(char *s)
7427 - timer_over_8254 = 2;
7431 -__setup("disable_8254_timer", setup_disable_8254_timer);
7432 -__setup("enable_8254_timer", setup_enable_8254_timer);
7435 * Called after all the initialization is done. If we didnt find any
7436 * APIC bugs then we can allow the modify fast path
7440 static int __init io_apic_bug_finalize(void)
7442 - if(sis_apic_bug == -1)
7443 + if (sis_apic_bug == -1)
7445 if (is_initial_xendomain()) {
7446 struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
7447 @@ -2396,17 +2430,17 @@ struct sysfs_ioapic_data {
7448 struct sys_device dev;
7449 struct IO_APIC_route_entry entry[0];
7451 -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
7452 +static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
7454 static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
7456 struct IO_APIC_route_entry *entry;
7457 struct sysfs_ioapic_data *data;
7461 data = container_of(dev, struct sysfs_ioapic_data, dev);
7462 entry = data->entry;
7463 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7464 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7465 entry[i] = ioapic_read_entry(dev->id, i);
7468 @@ -2419,18 +2453,18 @@ static int ioapic_resume(struct sys_devi
7469 unsigned long flags;
7470 union IO_APIC_reg_00 reg_00;
7474 data = container_of(dev, struct sysfs_ioapic_data, dev);
7475 entry = data->entry;
7477 spin_lock_irqsave(&ioapic_lock, flags);
7478 reg_00.raw = io_apic_read(dev->id, 0);
7479 - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
7480 - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
7481 + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
7482 + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
7483 io_apic_write(dev->id, 0, reg_00.raw);
7485 spin_unlock_irqrestore(&ioapic_lock, flags);
7486 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7487 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7488 ioapic_write_entry(dev->id, i, entry[i]);
7491 @@ -2444,24 +2478,23 @@ static struct sysdev_class ioapic_sysdev
7493 static int __init ioapic_init_sysfs(void)
7495 - struct sys_device * dev;
7496 + struct sys_device *dev;
7497 int i, size, error = 0;
7499 error = sysdev_class_register(&ioapic_sysdev_class);
7503 - for (i = 0; i < nr_ioapics; i++ ) {
7504 - size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7505 + for (i = 0; i < nr_ioapics; i++) {
7506 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7507 * sizeof(struct IO_APIC_route_entry);
7508 - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
7509 + mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
7510 if (!mp_ioapic_data[i]) {
7511 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
7514 - memset(mp_ioapic_data[i], 0, size);
7515 dev = &mp_ioapic_data[i]->dev;
7518 dev->cls = &ioapic_sysdev_class;
7519 error = sysdev_register(dev);
7521 @@ -2538,7 +2571,7 @@ static int msi_compose_msg(struct pci_de
7524 ((INT_DEST_MODE == 0) ?
7525 - MSI_ADDR_DEST_MODE_PHYSICAL:
7526 +MSI_ADDR_DEST_MODE_PHYSICAL:
7527 MSI_ADDR_DEST_MODE_LOGICAL) |
7528 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7529 MSI_ADDR_REDIRECTION_CPU:
7530 @@ -2549,7 +2582,7 @@ static int msi_compose_msg(struct pci_de
7531 MSI_DATA_TRIGGER_EDGE |
7532 MSI_DATA_LEVEL_ASSERT |
7533 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7534 - MSI_DATA_DELIVERY_FIXED:
7535 +MSI_DATA_DELIVERY_FIXED:
7536 MSI_DATA_DELIVERY_LOWPRI) |
7537 MSI_DATA_VECTOR(vector);
7539 @@ -2720,12 +2753,12 @@ int arch_setup_ht_irq(unsigned int irq,
7540 #endif /* CONFIG_HT_IRQ */
7542 /* --------------------------------------------------------------------------
7543 - ACPI-based IOAPIC Configuration
7544 + ACPI-based IOAPIC Configuration
7545 -------------------------------------------------------------------------- */
7549 -int __init io_apic_get_unique_id (int ioapic, int apic_id)
7550 +int __init io_apic_get_unique_id(int ioapic, int apic_id)
7553 union IO_APIC_reg_00 reg_00;
7554 @@ -2735,10 +2768,10 @@ int __init io_apic_get_unique_id (int io
7558 - * The P4 platform supports up to 256 APIC IDs on two separate APIC
7559 - * buses (one for LAPICs, one for IOAPICs), where predecessors only
7560 + * The P4 platform supports up to 256 APIC IDs on two separate APIC
7561 + * buses (one for LAPICs, one for IOAPICs), where predecessors only
7562 * supports up to 16 on one shared APIC bus.
7565 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
7566 * advantage of new APIC bus architecture.
7568 @@ -2757,7 +2790,7 @@ int __init io_apic_get_unique_id (int io
7572 - * Every APIC in a system must have a unique ID or we get lots of nice
7573 + * Every APIC in a system must have a unique ID or we get lots of nice
7574 * 'stuck on smp_invalidate_needed IPI wait' messages.
7576 if (check_apicid_used(apic_id_map, apic_id)) {
7577 @@ -2774,7 +2807,7 @@ int __init io_apic_get_unique_id (int io
7578 "trying %d\n", ioapic, apic_id, i);
7584 tmp = apicid_to_cpu_present(apic_id);
7585 physids_or(apic_id_map, apic_id_map, tmp);
7586 @@ -2802,7 +2835,7 @@ int __init io_apic_get_unique_id (int io
7590 -int __init io_apic_get_version (int ioapic)
7591 +int __init io_apic_get_version(int ioapic)
7593 union IO_APIC_reg_01 reg_01;
7594 unsigned long flags;
7595 @@ -2815,7 +2848,7 @@ int __init io_apic_get_version (int ioap
7599 -int __init io_apic_get_redir_entries (int ioapic)
7600 +int __init io_apic_get_redir_entries(int ioapic)
7602 union IO_APIC_reg_01 reg_01;
7603 unsigned long flags;
7604 @@ -2828,7 +2861,7 @@ int __init io_apic_get_redir_entries (in
7608 -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
7609 +int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
7611 struct IO_APIC_route_entry entry;
7613 @@ -2844,7 +2877,7 @@ int io_apic_set_pci_routing (int ioapic,
7614 * corresponding device driver registers for this IRQ.
7617 - memset(&entry,0,sizeof(entry));
7618 + memset(&entry, 0, sizeof(entry));
7620 entry.delivery_mode = INT_DELIVERY_MODE;
7621 entry.dest_mode = INT_DEST_MODE;
7622 @@ -2863,7 +2896,7 @@ int io_apic_set_pci_routing (int ioapic,
7624 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
7625 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
7626 - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
7627 + mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
7628 edge_level, active_high_low);
7630 ioapic_register_intr(irq, entry.vector, edge_level);
7631 @@ -2884,8 +2917,8 @@ int acpi_get_override_irq(int bus_irq, i
7634 for (i = 0; i < mp_irq_entries; i++)
7635 - if (mp_irqs[i].mpc_irqtype == mp_INT &&
7636 - mp_irqs[i].mpc_srcbusirq == bus_irq)
7637 + if (mp_irqs[i].mp_irqtype == mp_INT &&
7638 + mp_irqs[i].mp_srcbusirq == bus_irq)
7640 if (i >= mp_irq_entries)
7642 @@ -2918,3 +2951,35 @@ static int __init parse_noapic(char *arg
7645 early_param("noapic", parse_noapic);
7648 +void __init ioapic_init_mappings(void)
7650 + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
7653 + for (i = 0; i < nr_ioapics; i++) {
7654 + if (smp_found_config) {
7655 + ioapic_phys = mp_ioapics[i].mp_apicaddr;
7656 + if (!ioapic_phys) {
7658 + "WARNING: bogus zero IO-APIC "
7659 + "address found in MPTABLE, "
7660 + "disabling IO/APIC support!\n");
7661 + smp_found_config = 0;
7662 + skip_ioapic_setup = 1;
7663 + goto fake_ioapic_page;
7667 + ioapic_phys = (unsigned long)
7668 + alloc_bootmem_pages(PAGE_SIZE);
7669 + ioapic_phys = __pa(ioapic_phys);
7671 + set_fixmap_nocache(idx, ioapic_phys);
7672 + printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
7673 + __fix_to_virt(idx), ioapic_phys);
7678 Index: head-2008-12-01/arch/x86/kernel/io_apic_64-xen.c
7679 ===================================================================
7680 --- head-2008-12-01.orig/arch/x86/kernel/io_apic_64-xen.c 2008-12-01 11:44:55.000000000 +0100
7681 +++ head-2008-12-01/arch/x86/kernel/io_apic_64-xen.c 2008-12-01 11:49:07.000000000 +0100
7683 #include <asm/proto.h>
7684 #include <asm/acpi.h>
7685 #include <asm/dma.h>
7686 +#include <asm/i8259.h>
7687 #include <asm/nmi.h>
7688 #include <asm/msidef.h>
7689 #include <asm/hypertransport.h>
7690 @@ -63,10 +64,16 @@ struct irq_cfg {
7693 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
7694 -struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7695 +static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7697 static int assign_irq_vector(int irq, cpumask_t mask);
7700 +int first_system_vector = 0xfe;
7702 +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
7705 #define __apicdebuginit __init
7707 int sis_apic_bug; /* not actually supported, dummy for compile */
7708 @@ -89,14 +96,14 @@ unsigned long io_apic_irqs;
7710 #define clear_IO_APIC() ((void)0)
7712 -int timer_over_8254 __initdata = 1;
7713 +int timer_through_8259 __initdata;
7715 /* Where if anywhere is the i8259 connect in external int mode */
7716 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
7719 static DEFINE_SPINLOCK(ioapic_lock);
7720 -DEFINE_SPINLOCK(vector_lock);
7721 +static DEFINE_SPINLOCK(vector_lock);
7724 * # of IRQ routing registers
7725 @@ -104,15 +111,17 @@ DEFINE_SPINLOCK(vector_lock);
7726 int nr_ioapic_registers[MAX_IO_APICS];
7728 /* I/O APIC entries */
7729 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
7730 +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
7733 /* MP IRQ source entries */
7734 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7735 +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7737 /* # of MP IRQ source entries */
7740 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
7743 * Rough estimation of how many shared IRQs there are, can
7744 * be changed anytime.
7745 @@ -141,7 +150,7 @@ struct io_apic {
7746 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
7748 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
7749 - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
7750 + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
7754 @@ -155,7 +164,7 @@ static inline unsigned int io_apic_read(
7755 struct physdev_apic apic_op;
7758 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7759 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7761 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
7763 @@ -173,7 +182,7 @@ static inline void io_apic_write(unsigne
7765 struct physdev_apic apic_op;
7767 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7768 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7770 apic_op.value = value;
7771 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
7772 @@ -209,7 +218,7 @@ static bool io_apic_level_ack_pending(un
7774 reg = io_apic_read(entry->apic, 0x10 + pin*2);
7775 /* Is the remote IRR bit set? */
7776 - if ((reg >> 14) & 1) {
7777 + if (reg & IO_APIC_REDIR_REMOTE_IRR) {
7778 spin_unlock_irqrestore(&ioapic_lock, flags);
7781 @@ -308,7 +317,7 @@ static void __target_IO_APIC_irq(unsigne
7783 io_apic_write(apic, 0x11 + pin*2, dest);
7784 reg = io_apic_read(apic, 0x10 + pin*2);
7785 - reg &= ~0x000000ff;
7786 + reg &= ~IO_APIC_REDIR_VECTOR_MASK;
7788 io_apic_modify(apic, reg);
7790 @@ -372,6 +381,26 @@ static void add_pin_to_irq(unsigned int
7795 + * Reroute an IRQ to a different pin.
7797 +static void __init replace_pin_at_irq(unsigned int irq,
7798 + int oldapic, int oldpin,
7799 + int newapic, int newpin)
7801 + struct irq_pin_list *entry = irq_2_pin + irq;
7804 + if (entry->apic == oldapic && entry->pin == oldpin) {
7805 + entry->apic = newapic;
7806 + entry->pin = newpin;
7810 + entry = irq_2_pin + entry->next;
7814 #define __DO_ACTION(R, ACTION, FINAL) \
7817 @@ -399,10 +428,11 @@ static void add_pin_to_irq(unsigned int
7818 static void name##_IO_APIC_irq (unsigned int irq) \
7819 __DO_ACTION(R, ACTION, FINAL)
7821 -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
7823 -DO_ACTION( __unmask, 0, &= 0xfffeffff, )
7826 +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
7829 +DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
7831 static void mask_IO_APIC_irq (unsigned int irq)
7833 @@ -465,22 +495,6 @@ static int __init disable_timer_pin_setu
7835 __setup("disable_timer_pin_1", disable_timer_pin_setup);
7838 -static int __init setup_disable_8254_timer(char *s)
7840 - timer_over_8254 = -1;
7843 -static int __init setup_enable_8254_timer(char *s)
7845 - timer_over_8254 = 2;
7849 -__setup("disable_8254_timer", setup_disable_8254_timer);
7850 -__setup("enable_8254_timer", setup_enable_8254_timer);
7851 -#endif /* !CONFIG_XEN */
7855 * Find the IRQ entry number of a certain pin.
7856 @@ -490,10 +504,10 @@ static int find_irq_entry(int apic, int
7859 for (i = 0; i < mp_irq_entries; i++)
7860 - if (mp_irqs[i].mpc_irqtype == type &&
7861 - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
7862 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
7863 - mp_irqs[i].mpc_dstirq == pin)
7864 + if (mp_irqs[i].mp_irqtype == type &&
7865 + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
7866 + mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
7867 + mp_irqs[i].mp_dstirq == pin)
7871 @@ -508,13 +522,13 @@ static int __init find_isa_irq_pin(int i
7874 for (i = 0; i < mp_irq_entries; i++) {
7875 - int lbus = mp_irqs[i].mpc_srcbus;
7876 + int lbus = mp_irqs[i].mp_srcbus;
7878 if (test_bit(lbus, mp_bus_not_pci) &&
7879 - (mp_irqs[i].mpc_irqtype == type) &&
7880 - (mp_irqs[i].mpc_srcbusirq == irq))
7881 + (mp_irqs[i].mp_irqtype == type) &&
7882 + (mp_irqs[i].mp_srcbusirq == irq))
7884 - return mp_irqs[i].mpc_dstirq;
7885 + return mp_irqs[i].mp_dstirq;
7889 @@ -524,17 +538,17 @@ static int __init find_isa_irq_apic(int
7892 for (i = 0; i < mp_irq_entries; i++) {
7893 - int lbus = mp_irqs[i].mpc_srcbus;
7894 + int lbus = mp_irqs[i].mp_srcbus;
7896 if (test_bit(lbus, mp_bus_not_pci) &&
7897 - (mp_irqs[i].mpc_irqtype == type) &&
7898 - (mp_irqs[i].mpc_srcbusirq == irq))
7899 + (mp_irqs[i].mp_irqtype == type) &&
7900 + (mp_irqs[i].mp_srcbusirq == irq))
7903 if (i < mp_irq_entries) {
7905 for(apic = 0; apic < nr_ioapics; apic++) {
7906 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
7907 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
7911 @@ -555,28 +569,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7913 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
7915 - if (mp_bus_id_to_pci_bus[bus] == -1) {
7916 + if (test_bit(bus, mp_bus_not_pci)) {
7917 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
7920 for (i = 0; i < mp_irq_entries; i++) {
7921 - int lbus = mp_irqs[i].mpc_srcbus;
7922 + int lbus = mp_irqs[i].mp_srcbus;
7924 for (apic = 0; apic < nr_ioapics; apic++)
7925 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
7926 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
7927 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
7928 + mp_irqs[i].mp_dstapic == MP_APIC_ALL)
7931 if (!test_bit(lbus, mp_bus_not_pci) &&
7932 - !mp_irqs[i].mpc_irqtype &&
7933 + !mp_irqs[i].mp_irqtype &&
7935 - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
7936 - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
7937 + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
7938 + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
7940 if (!(apic || IO_APIC_IRQ(irq)))
7943 - if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
7944 + if (pin == (mp_irqs[i].mp_srcbusirq & 3))
7947 * Use the first all-but-pin matching entry as a
7948 @@ -604,13 +618,13 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7950 static int MPBIOS_polarity(int idx)
7952 - int bus = mp_irqs[idx].mpc_srcbus;
7953 + int bus = mp_irqs[idx].mp_srcbus;
7957 * Determine IRQ line polarity (high active or low active):
7959 - switch (mp_irqs[idx].mpc_irqflag & 3)
7960 + switch (mp_irqs[idx].mp_irqflag & 3)
7962 case 0: /* conforms, ie. bus-type dependent polarity */
7963 if (test_bit(bus, mp_bus_not_pci))
7964 @@ -646,13 +660,13 @@ static int MPBIOS_polarity(int idx)
7966 static int MPBIOS_trigger(int idx)
7968 - int bus = mp_irqs[idx].mpc_srcbus;
7969 + int bus = mp_irqs[idx].mp_srcbus;
7973 * Determine IRQ trigger mode (edge or level sensitive):
7975 - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
7976 + switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
7978 case 0: /* conforms, ie. bus-type dependent */
7979 if (test_bit(bus, mp_bus_not_pci))
7980 @@ -699,16 +713,16 @@ static inline int irq_trigger(int idx)
7981 static int pin_2_irq(int idx, int apic, int pin)
7984 - int bus = mp_irqs[idx].mpc_srcbus;
7985 + int bus = mp_irqs[idx].mp_srcbus;
7988 * Debugging check, we are in big trouble if this message pops up!
7990 - if (mp_irqs[idx].mpc_dstirq != pin)
7991 + if (mp_irqs[idx].mp_dstirq != pin)
7992 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
7994 if (test_bit(bus, mp_bus_not_pci)) {
7995 - irq = mp_irqs[idx].mpc_srcbusirq;
7996 + irq = mp_irqs[idx].mp_srcbusirq;
7999 * PCI IRQs are mapped in order
8000 @@ -722,6 +736,19 @@ static int pin_2_irq(int idx, int apic,
8004 +void lock_vector_lock(void)
8006 + /* Used to the online set of cpus does not change
8007 + * during assign_irq_vector.
8009 + spin_lock(&vector_lock);
8012 +void unlock_vector_lock(void)
8014 + spin_unlock(&vector_lock);
8017 static int __assign_irq_vector(int irq, cpumask_t mask)
8019 struct physdev_irq irq_op;
8020 @@ -773,7 +800,7 @@ static void __clear_irq_vector(int irq)
8022 vector = cfg->vector;
8023 cpus_and(mask, cfg->domain, cpu_online_map);
8024 - for_each_cpu_mask(cpu, mask)
8025 + for_each_cpu_mask_nr(cpu, mask)
8026 per_cpu(vector_irq, cpu)[vector] = -1;
8029 @@ -842,7 +869,7 @@ static void setup_IO_APIC_irq(int apic,
8030 apic_printk(APIC_VERBOSE,KERN_DEBUG
8031 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
8032 "IRQ %d Mode:%i Active:%i)\n",
8033 - apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
8034 + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
8035 irq, trigger, polarity);
8038 @@ -883,10 +910,10 @@ static void __init setup_IO_APIC_irqs(vo
8039 idx = find_irq_entry(apic,pin,mp_INT);
8042 - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8043 + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
8046 - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8047 + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
8050 if (!first_notcon) {
8051 @@ -908,26 +935,21 @@ static void __init setup_IO_APIC_irqs(vo
8055 - * Set up the 8259A-master output pin as broadcast to all
8057 + * Set up the timer pin, possibly with the 8259A-master behind.
8059 -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
8060 +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
8063 struct IO_APIC_route_entry entry;
8065 memset(&entry, 0, sizeof(entry));
8067 - disable_8259A_irq(0);
8070 - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8073 * We use logical delivery to get the timer IRQ
8076 entry.dest_mode = INT_DEST_MODE;
8077 - entry.mask = 0; /* unmask IRQ now */
8078 + entry.mask = 1; /* mask IRQ now */
8079 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
8080 entry.delivery_mode = INT_DELIVERY_MODE;
8082 @@ -936,7 +958,7 @@ static void __init setup_ExtINT_IRQ0_pin
8085 * The timer IRQ doesn't have to know that behind the
8086 - * scene we have a 8259A-master in AEOI mode ...
8087 + * scene we may have a 8259A-master in AEOI mode ...
8089 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
8091 @@ -944,8 +966,6 @@ static void __init setup_ExtINT_IRQ0_pin
8092 * Add it to the IO-APIC irq-routing table:
8094 ioapic_write_entry(apic, pin, entry);
8096 - enable_8259A_irq(0);
8099 void __apicdebuginit print_IO_APIC(void)
8100 @@ -962,7 +982,7 @@ void __apicdebuginit print_IO_APIC(void)
8101 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
8102 for (i = 0; i < nr_ioapics; i++)
8103 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
8104 - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
8105 + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
8108 * We are a bit conservative about what we expect. We have to
8109 @@ -980,7 +1000,7 @@ void __apicdebuginit print_IO_APIC(void)
8110 spin_unlock_irqrestore(&ioapic_lock, flags);
8113 - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
8114 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
8115 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
8116 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
8118 @@ -1072,6 +1092,7 @@ void __apicdebuginit print_local_APIC(vo
8120 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
8121 smp_processor_id(), hard_smp_processor_id());
8122 + v = apic_read(APIC_ID);
8123 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
8124 v = apic_read(APIC_LVR);
8125 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
8126 @@ -1141,7 +1162,7 @@ void __apicdebuginit print_local_APIC(vo
8128 void print_all_local_APICs (void)
8130 - on_each_cpu(print_local_APIC, NULL, 1, 1);
8131 + on_each_cpu(print_local_APIC, NULL, 1);
8134 void __apicdebuginit print_PIC(void)
8135 @@ -1175,6 +1196,8 @@ void __apicdebuginit print_PIC(void)
8136 v = inb(0x4d1) << 8 | inb(0x4d0);
8137 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
8140 +void __apicdebuginit print_IO_APIC(void) {}
8141 #endif /* !CONFIG_XEN */
8143 void __init enable_IO_APIC(void)
8144 @@ -1359,12 +1382,10 @@ static unsigned int startup_ioapic_irq(u
8145 static int ioapic_retrigger_irq(unsigned int irq)
8147 struct irq_cfg *cfg = &irq_cfg[irq];
8149 unsigned long flags;
8151 spin_lock_irqsave(&vector_lock, flags);
8152 - mask = cpumask_of_cpu(first_cpu(cfg->domain));
8153 - send_IPI_mask(mask, cfg->vector);
8154 + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
8155 spin_unlock_irqrestore(&vector_lock, flags);
8158 @@ -1545,7 +1566,7 @@ static inline void init_IO_APIC_traps(vo
8162 -static void enable_lapic_irq (unsigned int irq)
8163 +static void unmask_lapic_irq(unsigned int irq)
8167 @@ -1553,7 +1574,7 @@ static void enable_lapic_irq (unsigned i
8168 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
8171 -static void disable_lapic_irq (unsigned int irq)
8172 +static void mask_lapic_irq(unsigned int irq)
8176 @@ -1566,19 +1587,20 @@ static void ack_lapic_irq (unsigned int
8180 -static void end_lapic_irq (unsigned int i) { /* nothing */ }
8182 -static struct hw_interrupt_type lapic_irq_type __read_mostly = {
8183 - .name = "local-APIC",
8184 - .typename = "local-APIC-edge",
8185 - .startup = NULL, /* startup_irq() not used for IRQ0 */
8186 - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
8187 - .enable = enable_lapic_irq,
8188 - .disable = disable_lapic_irq,
8189 - .ack = ack_lapic_irq,
8190 - .end = end_lapic_irq,
8191 +static struct irq_chip lapic_chip __read_mostly = {
8192 + .name = "local-APIC",
8193 + .mask = mask_lapic_irq,
8194 + .unmask = unmask_lapic_irq,
8195 + .ack = ack_lapic_irq,
8198 +static void lapic_register_intr(int irq)
8200 + irq_desc[irq].status &= ~IRQ_LEVEL;
8201 + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
8205 static void __init setup_nmi(void)
8208 @@ -1664,6 +1686,7 @@ static inline void __init check_timer(vo
8209 struct irq_cfg *cfg = irq_cfg + 0;
8210 int apic1, pin1, apic2, pin2;
8211 unsigned long flags;
8214 local_irq_save(flags);
8216 @@ -1674,34 +1697,48 @@ static inline void __init check_timer(vo
8217 assign_irq_vector(0, TARGET_CPUS);
8220 - * Subtle, code in do_timer_interrupt() expects an AEOI
8221 - * mode for the 8259A whenever interrupts are routed
8222 - * through I/O APICs. Also IRQ0 has to be enabled in
8223 - * the 8259A which implies the virtual wire has to be
8224 - * disabled in the local APIC.
8225 + * As IRQ0 is to be enabled in the 8259A, the virtual
8226 + * wire has to be disabled in the local APIC.
8228 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8230 - if (timer_over_8254 > 0)
8231 - enable_8259A_irq(0);
8233 pin1 = find_isa_irq_pin(0, mp_INT);
8234 apic1 = find_isa_irq_apic(0, mp_INT);
8235 pin2 = ioapic_i8259.pin;
8236 apic2 = ioapic_i8259.apic;
8238 - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
8239 - cfg->vector, apic1, pin1, apic2, pin2);
8240 + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
8241 + "apic1=%d pin1=%d apic2=%d pin2=%d\n",
8242 + cfg->vector, apic1, pin1, apic2, pin2);
8245 + * Some BIOS writers are clueless and report the ExtINTA
8246 + * I/O APIC input from the cascaded 8259A as the timer
8247 + * interrupt input. So just in case, if only one pin
8248 + * was found above, try it both directly and through the
8255 + } else if (pin2 == -1) {
8262 * Ok, does IRQ0 through the IOAPIC work?
8265 + add_pin_to_irq(0, apic1, pin1);
8266 + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
8268 unmask_IO_APIC_irq(0);
8269 if (!no_timer_check && timer_irq_works()) {
8270 - nmi_watchdog_default();
8271 if (nmi_watchdog == NMI_IO_APIC) {
8272 - disable_8259A_irq(0);
8274 enable_8259A_irq(0);
8276 @@ -1710,54 +1747,62 @@ static inline void __init check_timer(vo
8279 clear_IO_APIC_pin(apic1, pin1);
8280 - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
8281 - "connected to IO-APIC\n");
8284 - apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
8285 - "through the 8259A ... ");
8287 - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
8290 + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
8291 + "8254 timer not connected to IO-APIC\n");
8293 + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
8294 + "(IRQ0) through the 8259A ...\n");
8295 + apic_printk(APIC_QUIET, KERN_INFO
8296 + "..... (found apic %d pin %d) ...\n", apic2, pin2);
8298 * legacy devices should be connected to IO APIC #0
8300 - setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
8301 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
8302 + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
8303 + unmask_IO_APIC_irq(0);
8304 + enable_8259A_irq(0);
8305 if (timer_irq_works()) {
8306 - apic_printk(APIC_VERBOSE," works.\n");
8307 - nmi_watchdog_default();
8308 + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
8309 + timer_through_8259 = 1;
8310 if (nmi_watchdog == NMI_IO_APIC) {
8311 + disable_8259A_irq(0);
8313 + enable_8259A_irq(0);
8318 * Cleanup, just in case ...
8320 + disable_8259A_irq(0);
8321 clear_IO_APIC_pin(apic2, pin2);
8322 + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
8324 - apic_printk(APIC_VERBOSE," failed.\n");
8326 if (nmi_watchdog == NMI_IO_APIC) {
8327 - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
8329 + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
8330 + "through the IO-APIC - disabling NMI Watchdog!\n");
8331 + nmi_watchdog = NMI_NONE;
8334 - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
8335 + apic_printk(APIC_QUIET, KERN_INFO
8336 + "...trying to set up timer as Virtual Wire IRQ...\n");
8338 - disable_8259A_irq(0);
8339 - irq_desc[0].chip = &lapic_irq_type;
8340 + lapic_register_intr(0);
8341 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
8342 enable_8259A_irq(0);
8344 if (timer_irq_works()) {
8345 - apic_printk(APIC_VERBOSE," works.\n");
8346 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8349 + disable_8259A_irq(0);
8350 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
8351 - apic_printk(APIC_VERBOSE," failed.\n");
8352 + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
8354 - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
8355 + apic_printk(APIC_QUIET, KERN_INFO
8356 + "...trying to set up timer as ExtINT IRQ...\n");
8360 @@ -1766,11 +1811,12 @@ static inline void __init check_timer(vo
8361 unlock_ExtINT_logic();
8363 if (timer_irq_works()) {
8364 - apic_printk(APIC_VERBOSE," works.\n");
8365 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8368 - apic_printk(APIC_VERBOSE," failed :(.\n");
8369 - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
8370 + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
8371 + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
8372 + "report. Then try booting with the 'noapic' option.\n");
8374 local_irq_restore(flags);
8376 @@ -1788,10 +1834,21 @@ __setup("no_timer_check", notimercheck);
8380 - * IRQs that are handled by the PIC in the MPS IOAPIC case.
8381 - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
8382 - * Linux doesn't really care, as it's not actually used
8383 - * for any interrupt handling anyway.
8384 + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
8385 + * to devices. However there may be an I/O APIC pin available for
8386 + * this interrupt regardless. The pin may be left unconnected, but
8387 + * typically it will be reused as an ExtINT cascade interrupt for
8388 + * the master 8259A. In the MPS case such a pin will normally be
8389 + * reported as an ExtINT interrupt in the MP table. With ACPI
8390 + * there is no provision for ExtINT interrupts, and in the absence
8391 + * of an override it would be treated as an ordinary ISA I/O APIC
8392 + * interrupt, that is edge-triggered and unmasked by default. We
8393 + * used to do this, but it caused problems on some systems because
8394 + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
8395 + * the same ExtINT cascade interrupt to drive the local APIC of the
8396 + * bootstrap processor. Therefore we refrain from routing IRQ2 to
8397 + * the I/O APIC in all cases now. No actual device should request
8398 + * it anyway. --macro
8400 #define PIC_IRQS (1<<2)
8402 @@ -1799,10 +1856,7 @@ void __init setup_IO_APIC(void)
8407 - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
8409 - io_apic_irqs = ~PIC_IRQS;
8410 + io_apic_irqs = ~PIC_IRQS;
8412 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
8414 @@ -1851,8 +1905,8 @@ static int ioapic_resume(struct sys_devi
8416 spin_lock_irqsave(&ioapic_lock, flags);
8417 reg_00.raw = io_apic_read(dev->id, 0);
8418 - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
8419 - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
8420 + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
8421 + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
8422 io_apic_write(dev->id, 0, reg_00.raw);
8424 spin_unlock_irqrestore(&ioapic_lock, flags);
8425 @@ -2254,8 +2308,8 @@ int acpi_get_override_irq(int bus_irq, i
8428 for (i = 0; i < mp_irq_entries; i++)
8429 - if (mp_irqs[i].mpc_irqtype == mp_INT &&
8430 - mp_irqs[i].mpc_srcbusirq == bus_irq)
8431 + if (mp_irqs[i].mp_irqtype == mp_INT &&
8432 + mp_irqs[i].mp_srcbusirq == bus_irq)
8434 if (i >= mp_irq_entries)
8436 @@ -2349,7 +2403,7 @@ void __init ioapic_init_mappings(void)
8437 ioapic_res = ioapic_setup_resources();
8438 for (i = 0; i < nr_ioapics; i++) {
8439 if (smp_found_config) {
8440 - ioapic_phys = mp_ioapics[i].mpc_apicaddr;
8441 + ioapic_phys = mp_ioapics[i].mp_apicaddr;
8443 ioapic_phys = (unsigned long)
8444 alloc_bootmem_pages(PAGE_SIZE);
8445 Index: head-2008-12-01/arch/x86/kernel/ipi-xen.c
8446 ===================================================================
8447 --- head-2008-12-01.orig/arch/x86/kernel/ipi-xen.c 2008-12-01 11:44:55.000000000 +0100
8448 +++ head-2008-12-01/arch/x86/kernel/ipi-xen.c 2008-12-01 11:49:07.000000000 +0100
8450 #include <linux/kernel_stat.h>
8451 #include <linux/mc146818rtc.h>
8452 #include <linux/cache.h>
8453 -#include <linux/interrupt.h>
8454 #include <linux/cpu.h>
8455 #include <linux/module.h>
8457 @@ -85,7 +84,7 @@ void __send_IPI_shortcut(unsigned int sh
8459 * Send the IPI. The write to APIC_ICR fires this off.
8461 - apic_write_around(APIC_ICR, cfg);
8462 + apic_write(APIC_ICR, cfg);
8466 @@ -132,7 +131,7 @@ static inline void __send_IPI_dest_field
8467 * prepare target chip field
8469 cfg = __prepare_ICR2(mask);
8470 - apic_write_around(APIC_ICR2, cfg);
8471 + apic_write(APIC_ICR2, cfg);
8475 @@ -142,7 +141,7 @@ static inline void __send_IPI_dest_field
8477 * Send the IPI. The write to APIC_ICR fires this off.
8479 - apic_write_around(APIC_ICR, cfg);
8480 + apic_write(APIC_ICR, cfg);
8484 Index: head-2008-12-01/arch/x86/kernel/irq_32-xen.c
8485 ===================================================================
8486 --- head-2008-12-01.orig/arch/x86/kernel/irq_32-xen.c 2008-12-01 11:44:55.000000000 +0100
8487 +++ head-2008-12-01/arch/x86/kernel/irq_32-xen.c 2008-12-01 11:49:07.000000000 +0100
8488 @@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
8492 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
8493 +/* Debugging check for stack overflow: is there less than 1KB free? */
8494 +static int check_stack_overflow(void)
8498 + __asm__ __volatile__("andl %%esp,%0" :
8499 + "=r" (sp) : "0" (THREAD_SIZE - 1));
8501 + return sp < (sizeof(struct thread_info) + STACK_WARN);
8504 +static void print_stack_overflow(void)
8506 + printk(KERN_WARNING "low stack detected by irq handler\n");
8511 +static inline int check_stack_overflow(void) { return 0; }
8512 +static inline void print_stack_overflow(void) { }
8515 #ifdef CONFIG_4KSTACKS
8517 * per-CPU IRQ handling contexts (thread information and stack)
8518 @@ -59,48 +82,26 @@ union irq_ctx {
8520 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
8521 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
8525 - * do_IRQ handles all normal device IRQ's (the special
8526 - * SMP cross-CPU interrupts have their own specific
8529 -unsigned int do_IRQ(struct pt_regs *regs)
8531 - struct pt_regs *old_regs;
8532 - /* high bit used in ret_from_ code */
8533 - int irq = ~regs->orig_ax;
8534 - struct irq_desc *desc = irq_desc + irq;
8535 -#ifdef CONFIG_4KSTACKS
8536 - union irq_ctx *curctx, *irqctx;
8540 - if (unlikely((unsigned)irq >= NR_IRQS)) {
8541 - printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8545 +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8546 +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8548 - old_regs = set_irq_regs(regs);
8550 -#ifdef CONFIG_DEBUG_STACKOVERFLOW
8551 - /* Debugging check for stack overflow: is there less than 1KB free? */
8555 - __asm__ __volatile__("andl %%esp,%0" :
8556 - "=r" (sp) : "0" (THREAD_SIZE - 1));
8557 - if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
8558 - printk("do_IRQ: stack overflow: %ld\n",
8559 - sp - sizeof(struct thread_info));
8564 +static void call_on_stack(void *func, void *stack)
8566 + asm volatile("xchgl %%ebx,%%esp \n"
8568 + "movl %%ebx,%%esp \n"
8572 + : "memory", "cc", "edx", "ecx", "eax");
8575 -#ifdef CONFIG_4KSTACKS
8577 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
8579 + union irq_ctx *curctx, *irqctx;
8580 + u32 *isp, arg1, arg2;
8582 curctx = (union irq_ctx *) current_thread_info();
8583 irqctx = hardirq_ctx[smp_processor_id()];
8584 @@ -111,52 +112,39 @@ unsigned int do_IRQ(struct pt_regs *regs
8585 * handler) we can't do that and just have to keep using the
8586 * current stack (which is the irq stack already after all)
8588 - if (curctx != irqctx) {
8589 - int arg1, arg2, bx;
8591 - /* build the stack frame on the IRQ stack */
8592 - isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8593 - irqctx->tinfo.task = curctx->tinfo.task;
8594 - irqctx->tinfo.previous_esp = current_stack_pointer;
8595 + if (unlikely(curctx == irqctx))
8599 - * Copy the softirq bits in preempt_count so that the
8600 - * softirq checks work in the hardirq context.
8602 - irqctx->tinfo.preempt_count =
8603 - (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8604 - (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8607 - " xchgl %%ebx,%%esp \n"
8609 - " movl %%ebx,%%esp \n"
8610 - : "=a" (arg1), "=d" (arg2), "=b" (bx)
8611 - : "0" (irq), "1" (desc), "2" (isp),
8612 - "D" (desc->handle_irq)
8613 - : "memory", "cc", "ecx"
8617 - desc->handle_irq(irq, desc);
8618 + /* build the stack frame on the IRQ stack */
8619 + isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
8620 + irqctx->tinfo.task = curctx->tinfo.task;
8621 + irqctx->tinfo.previous_esp = current_stack_pointer;
8624 - set_irq_regs(old_regs);
8626 + * Copy the softirq bits in preempt_count so that the
8627 + * softirq checks work in the hardirq context.
8629 + irqctx->tinfo.preempt_count =
8630 + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8631 + (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8633 + if (unlikely(overflow))
8634 + call_on_stack(print_stack_overflow, isp);
8636 + asm volatile("xchgl %%ebx,%%esp \n"
8638 + "movl %%ebx,%%esp \n"
8639 + : "=a" (arg1), "=d" (arg2), "=b" (isp)
8640 + : "0" (irq), "1" (desc), "2" (isp),
8641 + "D" (desc->handle_irq)
8642 + : "memory", "cc", "ecx");
8646 -#ifdef CONFIG_4KSTACKS
8648 -static char softirq_stack[NR_CPUS * THREAD_SIZE]
8649 - __attribute__((__section__(".bss.page_aligned")));
8651 -static char hardirq_stack[NR_CPUS * THREAD_SIZE]
8652 - __attribute__((__section__(".bss.page_aligned")));
8655 * allocate per-cpu stacks for hardirq and for softirq processing
8657 -void irq_ctx_init(int cpu)
8658 +void __cpuinit irq_ctx_init(int cpu)
8660 union irq_ctx *irqctx;
8662 @@ -164,25 +152,25 @@ void irq_ctx_init(int cpu)
8665 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
8666 - irqctx->tinfo.task = NULL;
8667 - irqctx->tinfo.exec_domain = NULL;
8668 - irqctx->tinfo.cpu = cpu;
8669 - irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8670 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8671 + irqctx->tinfo.task = NULL;
8672 + irqctx->tinfo.exec_domain = NULL;
8673 + irqctx->tinfo.cpu = cpu;
8674 + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8675 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8677 hardirq_ctx[cpu] = irqctx;
8679 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
8680 - irqctx->tinfo.task = NULL;
8681 - irqctx->tinfo.exec_domain = NULL;
8682 - irqctx->tinfo.cpu = cpu;
8683 - irqctx->tinfo.preempt_count = 0;
8684 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8685 + irqctx->tinfo.task = NULL;
8686 + irqctx->tinfo.exec_domain = NULL;
8687 + irqctx->tinfo.cpu = cpu;
8688 + irqctx->tinfo.preempt_count = 0;
8689 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8691 softirq_ctx[cpu] = irqctx;
8693 - printk("CPU %u irqstacks, hard=%p soft=%p\n",
8694 - cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8695 + printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
8696 + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8699 void irq_ctx_exit(int cpu)
8700 @@ -211,25 +199,56 @@ asmlinkage void do_softirq(void)
8701 /* build the stack frame on the softirq stack */
8702 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8705 - " xchgl %%ebx,%%esp \n"
8706 - " call __do_softirq \n"
8707 - " movl %%ebx,%%esp \n"
8710 - : "memory", "cc", "edx", "ecx", "eax"
8712 + call_on_stack(__do_softirq, isp);
8714 * Shouldnt happen, we returned above if in_interrupt():
8717 WARN_ON_ONCE(softirq_count());
8720 local_irq_restore(flags);
8725 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
8729 + * do_IRQ handles all normal device IRQ's (the special
8730 + * SMP cross-CPU interrupts have their own specific
8733 +unsigned int do_IRQ(struct pt_regs *regs)
8735 + struct pt_regs *old_regs;
8736 + /* high bit used in ret_from_ code */
8737 + int overflow, irq = ~regs->orig_ax;
8738 + struct irq_desc *desc = irq_desc + irq;
8740 + if (unlikely((unsigned)irq >= NR_IRQS)) {
8741 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8746 + old_regs = set_irq_regs(regs);
8749 + overflow = check_stack_overflow();
8751 + if (!execute_on_irq_stack(overflow, desc, irq)) {
8752 + if (unlikely(overflow))
8753 + print_stack_overflow();
8754 + desc->handle_irq(irq, desc);
8758 + set_irq_regs(old_regs);
8763 * Interrupt statistics:
8766 @@ -337,6 +356,42 @@ skip:
8771 + * /proc/stat helpers
8773 +u64 arch_irq_stat_cpu(unsigned int cpu)
8775 + u64 sum = nmi_count(cpu);
8777 +#ifdef CONFIG_X86_LOCAL_APIC
8778 + sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
8781 + sum += per_cpu(irq_stat, cpu).irq_resched_count;
8782 + sum += per_cpu(irq_stat, cpu).irq_call_count;
8784 + sum += per_cpu(irq_stat, cpu).irq_tlb_count;
8787 +#ifdef CONFIG_X86_MCE
8788 + sum += per_cpu(irq_stat, cpu).irq_thermal_count;
8790 +#ifdef CONFIG_X86_LOCAL_APIC
8791 + sum += per_cpu(irq_stat, cpu).irq_spurious_count;
8796 +u64 arch_irq_stat(void)
8798 + u64 sum = atomic_read(&irq_err_count);
8800 +#ifdef CONFIG_X86_IO_APIC
8801 + sum += atomic_read(&irq_mis_count);
8806 #ifdef CONFIG_HOTPLUG_CPU
8808 void fixup_irqs(cpumask_t map)
8809 Index: head-2008-12-01/arch/x86/kernel/irq_64-xen.c
8810 ===================================================================
8811 --- head-2008-12-01.orig/arch/x86/kernel/irq_64-xen.c 2008-12-01 11:37:10.000000000 +0100
8812 +++ head-2008-12-01/arch/x86/kernel/irq_64-xen.c 2008-12-01 11:49:07.000000000 +0100
8813 @@ -163,6 +163,34 @@ skip:
8817 + * /proc/stat helpers
8819 +u64 arch_irq_stat_cpu(unsigned int cpu)
8821 + u64 sum = cpu_pda(cpu)->__nmi_count;
8823 + sum += cpu_pda(cpu)->apic_timer_irqs;
8825 + sum += cpu_pda(cpu)->irq_resched_count;
8826 + sum += cpu_pda(cpu)->irq_call_count;
8828 + sum += cpu_pda(cpu)->irq_tlb_count;
8831 +#ifdef CONFIG_X86_MCE
8832 + sum += cpu_pda(cpu)->irq_thermal_count;
8833 + sum += cpu_pda(cpu)->irq_threshold_count;
8835 + sum += cpu_pda(cpu)->irq_spurious_count;
8839 +u64 arch_irq_stat(void)
8841 + return atomic_read(&irq_err_count);
8845 * do_IRQ handles all normal device IRQ's (the special
8846 * SMP cross-CPU interrupts have their own specific
8848 Index: head-2008-12-01/arch/x86/kernel/ldt-xen.c
8849 ===================================================================
8850 --- head-2008-12-01.orig/arch/x86/kernel/ldt-xen.c 2008-12-01 11:37:10.000000000 +0100
8851 +++ head-2008-12-01/arch/x86/kernel/ldt-xen.c 2008-12-01 11:49:07.000000000 +0100
8853 #include <asm/mmu_context.h>
8856 -static void flush_ldt(void *null)
8857 +static void flush_ldt(void *current_mm)
8859 - if (current->active_mm)
8860 + if (current->active_mm == current_mm)
8861 load_LDT(¤t->active_mm->context);
8864 @@ -62,8 +62,6 @@ static int alloc_ldt(mm_context_t *pc, i
8872 make_pages_readonly(newldt,
8873 @@ -71,9 +69,9 @@ static int alloc_ldt(mm_context_t *pc, i
8874 XENFEAT_writable_descriptor_tables);
8877 - mask = cpumask_of_cpu(smp_processor_id());
8878 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
8879 - smp_call_function(flush_ldt, NULL, 1, 1);
8880 + if (!cpus_equal(current->mm->cpu_vm_mask,
8881 + cpumask_of_cpu(smp_processor_id())))
8882 + smp_call_function(flush_ldt, current->mm, 1);
8886 Index: head-2008-12-01/arch/x86/kernel/microcode-xen.c
8887 ===================================================================
8888 --- head-2008-12-01.orig/arch/x86/kernel/microcode-xen.c 2008-12-01 11:44:55.000000000 +0100
8889 +++ head-2008-12-01/arch/x86/kernel/microcode-xen.c 2008-12-01 11:49:07.000000000 +0100
8891 * 2006 Shaohua Li <shaohua.li@intel.com>
8893 * This driver allows to upgrade microcode on Intel processors
8894 - * belonging to IA-32 family - PentiumPro, Pentium II,
8895 + * belonging to IA-32 family - PentiumPro, Pentium II,
8896 * Pentium III, Xeon, Pentium 4, etc.
8898 - * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
8899 - * Order Number 245472 or free download from:
8901 - * http://developer.intel.com/design/pentium4/manuals/245472.htm
8902 + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
8903 + * Software Developer's Manual
8904 + * Order Number 253668 or free download from:
8906 + * http://developer.intel.com/design/pentium4/manuals/253668.htm
8908 * For more information, go to http://www.urbanmyth.org/microcode
8911 #include <linux/kernel.h>
8912 #include <linux/init.h>
8913 #include <linux/sched.h>
8914 +#include <linux/smp_lock.h>
8915 #include <linux/cpumask.h>
8916 #include <linux/module.h>
8917 #include <linux/slab.h>
8918 @@ -86,6 +88,7 @@ static int do_microcode_update (const vo
8920 static int microcode_open (struct inode *unused1, struct file *unused2)
8922 + cycle_kernel_lock();
8923 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
8926 @@ -162,7 +165,7 @@ static int request_microcode(void)
8927 c->x86, c->x86_model, c->x86_mask);
8928 error = request_firmware(&firmware, name, µcode_pdev->dev);
8930 - pr_debug("microcode: ucode data file %s load failed\n", name);
8931 + pr_debug("microcode: data file %s load failed\n", name);
8935 @@ -183,6 +186,9 @@ static int __init microcode_init (void)
8940 + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8942 error = microcode_dev_init();
8945 @@ -195,8 +201,6 @@ static int __init microcode_init (void)
8947 request_microcode();
8950 - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8954 Index: head-2008-12-01/arch/x86/kernel/mpparse-xen.c
8955 ===================================================================
8956 --- head-2008-12-01.orig/arch/x86/kernel/mpparse-xen.c 2008-12-01 11:44:55.000000000 +0100
8957 +++ head-2008-12-01/arch/x86/kernel/mpparse-xen.c 2008-12-01 11:49:07.000000000 +0100
8959 #include <asm/proto.h>
8960 #include <asm/acpi.h>
8961 #include <asm/bios_ebda.h>
8962 +#include <asm/e820.h>
8963 +#include <asm/trampoline.h>
8964 +#include <asm/setup.h>
8966 #include <mach_apic.h>
8967 #ifdef CONFIG_X86_32
8969 #include <mach_mpparse.h>
8972 -/* Have we found an MP table */
8973 -int smp_found_config;
8976 - * Various Linux-internal data structures created from the
8979 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
8980 -int mp_bus_id_to_type[MAX_MP_BUSSES];
8983 -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
8984 -int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
8986 -static int mp_current_pci_id;
8991 - * Intel MP BIOS table parsing routines:
8995 * Checksum an MP configuration block.
8997 @@ -68,20 +49,8 @@ static int __init mpf_checksum(unsigned
9001 -#ifdef CONFIG_X86_NUMAQ
9003 - * Have to match translation table entries to main table entries by counter
9004 - * hence the mpc_record variable .... can't see a less disgusting way of
9008 -static int mpc_record;
9009 -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
9014 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
9015 +static void __init MP_processor_info(struct mpc_config_processor *m)
9018 char *bootup_cpu = "";
9019 @@ -90,11 +59,12 @@ static void __cpuinit MP_processor_info(
9023 -#ifdef CONFIG_X86_NUMAQ
9024 - apicid = mpc_apic_id(m, translation_table[mpc_record]);
9026 - apicid = m->mpc_apicid;
9029 + if (x86_quirks->mpc_apic_id)
9030 + apicid = x86_quirks->mpc_apic_id(m);
9032 + apicid = m->mpc_apicid;
9034 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
9035 bootup_cpu = " (Bootup-CPU)";
9036 boot_cpu_physical_apicid = m->mpc_apicid;
9037 @@ -104,24 +74,23 @@ static void __cpuinit MP_processor_info(
9038 generic_processor_info(apicid, m->mpc_apicver);
9041 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
9042 +static void __init MP_processor_info(struct mpc_config_processor *m)
9046 #endif /* CONFIG_XEN */
9048 +#ifdef CONFIG_X86_IO_APIC
9049 static void __init MP_bus_info(struct mpc_config_bus *m)
9053 memcpy(str, m->mpc_bustype, 6);
9056 -#ifdef CONFIG_X86_NUMAQ
9057 - mpc_oem_bus_info(m, str, translation_table[mpc_record]);
9059 - Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
9061 + if (x86_quirks->mpc_oem_bus_info)
9062 + x86_quirks->mpc_oem_bus_info(m, str);
9064 + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
9066 #if MAX_MP_BUSSES < 256
9067 if (m->mpc_busid >= MAX_MP_BUSSES) {
9068 @@ -138,12 +107,10 @@ static void __init MP_bus_info(struct mp
9069 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
9071 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
9072 -#ifdef CONFIG_X86_NUMAQ
9073 - mpc_oem_pci_bus(m, translation_table[mpc_record]);
9075 + if (x86_quirks->mpc_oem_pci_bus)
9076 + x86_quirks->mpc_oem_pci_bus(m);
9078 clear_bit(m->mpc_busid, mp_bus_not_pci);
9079 - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
9080 - mp_current_pci_id++;
9081 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
9082 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
9083 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
9084 @@ -154,6 +121,7 @@ static void __init MP_bus_info(struct mp
9086 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
9090 #ifdef CONFIG_X86_IO_APIC
9092 @@ -183,117 +151,111 @@ static void __init MP_ioapic_info(struct
9093 if (bad_ioapic(m->mpc_apicaddr))
9096 - mp_ioapics[nr_ioapics] = *m;
9097 + mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
9098 + mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
9099 + mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
9100 + mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
9101 + mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
9105 -static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9106 +static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
9108 - mp_irqs[mp_irq_entries] = *m;
9109 - Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
9110 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9111 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9112 m->mpc_irqtype, m->mpc_irqflag & 3,
9113 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
9114 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
9115 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
9116 - panic("Max # of irq sources exceeded!!\n");
9121 -static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9122 +static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
9124 - Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
9125 - " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9126 - m->mpc_irqtype, m->mpc_irqflag & 3,
9127 - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9128 - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9129 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9130 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9131 + mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
9132 + (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
9133 + mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
9136 -#ifdef CONFIG_X86_NUMAQ
9137 -static void __init MP_translation_info(struct mpc_config_translation *m)
9138 +static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
9139 + struct mp_config_intsrc *mp_irq)
9142 - "Translation: record %d, type %d, quad %d, global %d, local %d\n",
9143 - mpc_record, m->trans_type, m->trans_quad, m->trans_global,
9145 + mp_irq->mp_dstapic = m->mpc_dstapic;
9146 + mp_irq->mp_type = m->mpc_type;
9147 + mp_irq->mp_irqtype = m->mpc_irqtype;
9148 + mp_irq->mp_irqflag = m->mpc_irqflag;
9149 + mp_irq->mp_srcbus = m->mpc_srcbus;
9150 + mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
9151 + mp_irq->mp_dstirq = m->mpc_dstirq;
9154 - if (mpc_record >= MAX_MPC_ENTRY)
9155 - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
9157 - translation_table[mpc_record] = m; /* stash this for later */
9158 - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
9159 - node_set_online(m->trans_quad);
9160 +static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
9161 + struct mpc_config_intsrc *m)
9163 + m->mpc_dstapic = mp_irq->mp_dstapic;
9164 + m->mpc_type = mp_irq->mp_type;
9165 + m->mpc_irqtype = mp_irq->mp_irqtype;
9166 + m->mpc_irqflag = mp_irq->mp_irqflag;
9167 + m->mpc_srcbus = mp_irq->mp_srcbus;
9168 + m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
9169 + m->mpc_dstirq = mp_irq->mp_dstirq;
9173 - * Read/parse the MPC oem tables
9175 +static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
9176 + struct mpc_config_intsrc *m)
9178 + if (mp_irq->mp_dstapic != m->mpc_dstapic)
9180 + if (mp_irq->mp_type != m->mpc_type)
9182 + if (mp_irq->mp_irqtype != m->mpc_irqtype)
9184 + if (mp_irq->mp_irqflag != m->mpc_irqflag)
9186 + if (mp_irq->mp_srcbus != m->mpc_srcbus)
9188 + if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
9190 + if (mp_irq->mp_dstirq != m->mpc_dstirq)
9196 -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
9197 - unsigned short oemsize)
9198 +static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9200 - int count = sizeof(*oemtable); /* the header size */
9201 - unsigned char *oemptr = ((unsigned char *)oemtable) + count;
9205 - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
9207 - if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
9208 - printk(KERN_WARNING
9209 - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
9210 - oemtable->oem_signature[0], oemtable->oem_signature[1],
9211 - oemtable->oem_signature[2], oemtable->oem_signature[3]);
9214 - if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
9215 - printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
9218 - while (count < oemtable->oem_length) {
9219 - switch (*oemptr) {
9220 - case MP_TRANSLATION:
9222 - struct mpc_config_translation *m =
9223 - (struct mpc_config_translation *)oemptr;
9224 - MP_translation_info(m);
9225 - oemptr += sizeof(*m);
9226 - count += sizeof(*m);
9232 - printk(KERN_WARNING
9233 - "Unrecognised OEM table entry type! - %d\n",
9238 + print_MP_intsrc_info(m);
9240 + for (i = 0; i < mp_irq_entries; i++) {
9241 + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
9245 + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
9246 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
9247 + panic("Max # of irq sources exceeded!!\n");
9250 -static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
9254 +static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9256 - if (strncmp(oem, "IBM NUMA", 8))
9257 - printk("Warning! May not be a NUMA-Q system!\n");
9258 - if (mpc->mpc_oemptr)
9259 - smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
9260 - mpc->mpc_oemsize);
9261 + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
9262 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9263 + m->mpc_irqtype, m->mpc_irqflag & 3,
9264 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9265 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9267 -#endif /* CONFIG_X86_NUMAQ */
9270 * Read/parse the MPC
9273 -static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9274 +static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
9279 - int count = sizeof(*mpc);
9280 - unsigned char *mpt = ((unsigned char *)mpc) + count;
9282 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
9283 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
9284 @@ -316,19 +278,41 @@ static int __init smp_read_mpc(struct mp
9286 memcpy(oem, mpc->mpc_oem, 8);
9288 - printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
9289 + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
9291 memcpy(str, mpc->mpc_productid, 12);
9293 - printk("Product ID: %s ", str);
9295 -#ifdef CONFIG_X86_32
9296 - mps_oem_check(mpc, oem, str);
9298 - printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
9299 + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
9301 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
9306 +static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9311 + int count = sizeof(*mpc);
9312 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9314 + if (!smp_check_mpc(mpc, oem, str))
9317 +#ifdef CONFIG_X86_32
9319 + * need to make sure summit and es7000's mps_oem_check is safe to be
9320 + * called early via genericarch 's mps_oem_check
9323 +#ifdef CONFIG_X86_NUMAQ
9324 + numaq_mps_oem_check(mpc, oem, str);
9327 + mps_oem_check(mpc, oem, str);
9329 /* save the local APIC address, it might be non-default */
9331 mp_lapic_addr = mpc->mpc_lapic;
9332 @@ -336,12 +320,17 @@ static int __init smp_read_mpc(struct mp
9336 + if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
9337 + struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
9338 + x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
9342 * Now process the configuration blocks.
9344 -#ifdef CONFIG_X86_NUMAQ
9347 + if (x86_quirks->mpc_record)
9348 + *x86_quirks->mpc_record = 0;
9350 while (count < mpc->mpc_length) {
9353 @@ -359,7 +348,9 @@ static int __init smp_read_mpc(struct mp
9355 struct mpc_config_bus *m =
9356 (struct mpc_config_bus *)mpt;
9357 +#ifdef CONFIG_X86_IO_APIC
9361 count += sizeof(*m);
9363 @@ -405,10 +396,14 @@ static int __init smp_read_mpc(struct mp
9364 count = mpc->mpc_length;
9367 -#ifdef CONFIG_X86_NUMAQ
9370 + if (x86_quirks->mpc_record)
9371 + (*x86_quirks->mpc_record)++;
9374 +#ifdef CONFIG_X86_GENERICARCH
9375 + generic_bigsmp_probe();
9378 setup_apic_routing();
9379 if (!num_processors)
9380 printk(KERN_ERR "MPTABLE: no processors registered!\n");
9381 @@ -434,7 +429,7 @@ static void __init construct_default_ioi
9382 intsrc.mpc_type = MP_INTSRC;
9383 intsrc.mpc_irqflag = 0; /* conforming */
9384 intsrc.mpc_srcbus = 0;
9385 - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
9386 + intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
9388 intsrc.mpc_irqtype = mp_INT;
9390 @@ -495,40 +490,11 @@ static void __init construct_default_ioi
9391 MP_intsrc_info(&intsrc);
9396 -static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9397 +static void __init construct_ioapic_table(int mpc_default_type)
9399 - struct mpc_config_processor processor;
9400 - struct mpc_config_bus bus;
9401 -#ifdef CONFIG_X86_IO_APIC
9402 struct mpc_config_ioapic ioapic;
9404 - struct mpc_config_lintsrc lintsrc;
9405 - int linttypes[2] = { mp_ExtINT, mp_NMI };
9409 - * local APIC has default address
9411 - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9414 - * 2 CPUs, numbered 0 & 1.
9416 - processor.mpc_type = MP_PROCESSOR;
9417 - /* Either an integrated APIC or a discrete 82489DX. */
9418 - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9419 - processor.mpc_cpuflag = CPU_ENABLED;
9420 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9421 - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9422 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9423 - processor.mpc_reserved[0] = 0;
9424 - processor.mpc_reserved[1] = 0;
9425 - for (i = 0; i < 2; i++) {
9426 - processor.mpc_apicid = i;
9427 - MP_processor_info(&processor);
9429 + struct mpc_config_bus bus;
9431 bus.mpc_type = MP_BUS;
9433 @@ -557,7 +523,6 @@ static inline void __init construct_defa
9437 -#ifdef CONFIG_X86_IO_APIC
9438 ioapic.mpc_type = MP_IOAPIC;
9439 ioapic.mpc_apicid = 2;
9440 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9441 @@ -569,7 +534,42 @@ static inline void __init construct_defa
9442 * We set up most of the low 16 IO-APIC pins according to MPS rules.
9444 construct_default_ioirq_mptable(mpc_default_type);
9447 +static inline void __init construct_ioapic_table(int mpc_default_type) { }
9450 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9452 + struct mpc_config_processor processor;
9453 + struct mpc_config_lintsrc lintsrc;
9454 + int linttypes[2] = { mp_ExtINT, mp_NMI };
9458 + * local APIC has default address
9460 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9463 + * 2 CPUs, numbered 0 & 1.
9465 + processor.mpc_type = MP_PROCESSOR;
9466 + /* Either an integrated APIC or a discrete 82489DX. */
9467 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9468 + processor.mpc_cpuflag = CPU_ENABLED;
9469 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9470 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9471 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9472 + processor.mpc_reserved[0] = 0;
9473 + processor.mpc_reserved[1] = 0;
9474 + for (i = 0; i < 2; i++) {
9475 + processor.mpc_apicid = i;
9476 + MP_processor_info(&processor);
9479 + construct_ioapic_table(mpc_default_type);
9481 lintsrc.mpc_type = MP_LINTSRC;
9482 lintsrc.mpc_irqflag = 0; /* conforming */
9483 lintsrc.mpc_srcbusid = 0;
9484 @@ -587,10 +587,14 @@ static struct intel_mp_floating *mpf_fou
9486 * Scan the memory blocks for an SMP configuration block.
9488 -static void __init __get_smp_config(unsigned early)
9489 +static void __init __get_smp_config(unsigned int early)
9491 struct intel_mp_floating *mpf = mpf_found;
9493 + if (x86_quirks->mach_get_smp_config) {
9494 + if (x86_quirks->mach_get_smp_config(early))
9497 if (acpi_lapic && early)
9500 @@ -607,7 +611,7 @@ static void __init __get_smp_config(unsi
9502 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
9503 mpf->mpf_specification);
9504 -#ifdef CONFIG_X86_32
9505 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
9506 if (mpf->mpf_feature2 & (1 << 7)) {
9507 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
9509 @@ -639,7 +643,9 @@ static void __init __get_smp_config(unsi
9510 * override the defaults.
9512 if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
9513 +#ifdef CONFIG_X86_LOCAL_APIC
9514 smp_found_config = 0;
9517 "BIOS bug, MP table errors detected!...\n");
9518 printk(KERN_ERR "... disabling SMP support. "
9519 @@ -696,7 +702,8 @@ static int __init smp_scan_config(unsign
9520 unsigned int *bp = isa_bus_to_virt(base);
9521 struct intel_mp_floating *mpf;
9523 - Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
9524 + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
9526 BUILD_BUG_ON(sizeof(*mpf) != 16);
9528 while (length > 0) {
9529 @@ -706,16 +713,22 @@ static int __init smp_scan_config(unsign
9530 !mpf_checksum((unsigned char *)bp, 16) &&
9531 ((mpf->mpf_specification == 1)
9532 || (mpf->mpf_specification == 4))) {
9534 +#ifdef CONFIG_X86_LOCAL_APIC
9535 smp_found_config = 1;
9538 -#ifdef CONFIG_X86_32
9541 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9542 mpf, virt_to_phys(mpf));
9543 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
9547 + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
9549 if (mpf->mpf_physptr) {
9550 + unsigned long size = PAGE_SIZE;
9551 +#ifdef CONFIG_X86_32
9553 * We cannot access to MPC table to compute
9554 * table size yet, as only few megabytes from
9555 @@ -725,27 +738,18 @@ static int __init smp_scan_config(unsign
9556 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
9557 * in reserve_bootmem.
9559 - unsigned long size = PAGE_SIZE;
9560 unsigned long end = max_low_pfn * PAGE_SIZE;
9561 if (mpf->mpf_physptr + size > end)
9562 size = end - mpf->mpf_physptr;
9563 - reserve_bootmem(mpf->mpf_physptr, size,
9565 + reserve_bootmem_generic(mpf->mpf_physptr, size,
9569 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9570 mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
9572 -#elif !defined(CONFIG_XEN)
9576 - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
9577 - if (mpf->mpf_physptr)
9578 - reserve_bootmem_generic(mpf->mpf_physptr,
9586 @@ -753,10 +757,15 @@ static int __init smp_scan_config(unsign
9590 -static void __init __find_smp_config(unsigned reserve)
9591 +static void __init __find_smp_config(unsigned int reserve)
9594 unsigned int address;
9596 + if (x86_quirks->mach_find_smp_config) {
9597 + if (x86_quirks->mach_find_smp_config(reserve))
9603 @@ -805,300 +814,301 @@ void __init find_smp_config(void)
9604 __find_smp_config(1);
9607 -/* --------------------------------------------------------------------------
9608 - ACPI-based MP Configuration
9609 - -------------------------------------------------------------------------- */
9612 - * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
9617 +#ifdef CONFIG_X86_IO_APIC
9618 +static u8 __initdata irq_used[MAX_IRQ_SOURCES];
9620 -#ifdef CONFIG_X86_IO_APIC
9621 +static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
9625 -#define MP_ISA_BUS 0
9626 + if (m->mpc_irqtype != mp_INT)
9629 -extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
9630 + if (m->mpc_irqflag != 0x0f)
9633 -static int mp_find_ioapic(int gsi)
9638 - /* Find the IOAPIC that manages this GSI. */
9639 - for (i = 0; i < nr_ioapics; i++) {
9640 - if ((gsi >= mp_ioapic_routing[i].gsi_base)
9641 - && (gsi <= mp_ioapic_routing[i].gsi_end))
9643 + for (i = 0; i < mp_irq_entries; i++) {
9644 + if (mp_irqs[i].mp_irqtype != mp_INT)
9647 + if (mp_irqs[i].mp_irqflag != 0x0f)
9650 + if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
9652 + if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
9654 + if (irq_used[i]) {
9655 + /* already claimed */
9662 - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
9667 -static u8 __init uniq_ioapic_id(u8 id)
9669 -#ifdef CONFIG_X86_32
9670 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
9671 - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
9672 - return io_apic_get_unique_id(nr_ioapics, id);
9677 - DECLARE_BITMAP(used, 256);
9678 - bitmap_zero(used, 256);
9679 - for (i = 0; i < nr_ioapics; i++) {
9680 - struct mpc_config_ioapic *ia = &mp_ioapics[i];
9681 - __set_bit(ia->mpc_apicid, used);
9683 - if (!test_bit(id, used))
9685 - return find_first_zero_bit(used, 256);
9686 +#define SPARE_SLOT_NUM 20
9688 +static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
9692 -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
9693 +static int __init replace_intsrc_all(struct mp_config_table *mpc,
9694 + unsigned long mpc_new_phys,
9695 + unsigned long mpc_new_length)
9699 - if (bad_ioapic(address))
9701 +#ifdef CONFIG_X86_IO_APIC
9703 + int nr_m_spare = 0;
9707 + int count = sizeof(*mpc);
9708 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9710 - mp_ioapics[idx].mpc_type = MP_IOAPIC;
9711 - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
9712 - mp_ioapics[idx].mpc_apicaddr = address;
9713 + printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
9714 + while (count < mpc->mpc_length) {
9716 + case MP_PROCESSOR:
9718 + struct mpc_config_processor *m =
9719 + (struct mpc_config_processor *)mpt;
9720 + mpt += sizeof(*m);
9721 + count += sizeof(*m);
9726 + struct mpc_config_bus *m =
9727 + (struct mpc_config_bus *)mpt;
9728 + mpt += sizeof(*m);
9729 + count += sizeof(*m);
9734 + mpt += sizeof(struct mpc_config_ioapic);
9735 + count += sizeof(struct mpc_config_ioapic);
9740 +#ifdef CONFIG_X86_IO_APIC
9741 + struct mpc_config_intsrc *m =
9742 + (struct mpc_config_intsrc *)mpt;
9745 - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
9746 + printk(KERN_INFO "OLD ");
9747 + print_MP_intsrc_info(m);
9748 + i = get_MP_intsrc_index(m);
9750 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9751 + printk(KERN_INFO "NEW ");
9752 + print_mp_irq_info(&mp_irqs[i]);
9754 + /* legacy, do nothing */
9755 + } else if (nr_m_spare < SPARE_SLOT_NUM) {
9757 + * not found (-1), or duplicated (-2)
9758 + * are invalid entries,
9759 + * we need to use the slot later
9761 + m_spare[nr_m_spare] = m;
9765 - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
9766 -#ifdef CONFIG_X86_32
9767 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
9769 - mp_ioapics[idx].mpc_apicver = 0;
9770 + mpt += sizeof(struct mpc_config_intsrc);
9771 + count += sizeof(struct mpc_config_intsrc);
9776 + struct mpc_config_lintsrc *m =
9777 + (struct mpc_config_lintsrc *)mpt;
9778 + mpt += sizeof(*m);
9779 + count += sizeof(*m);
9783 + /* wrong mptable */
9784 + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
9785 + printk(KERN_ERR "type %x\n", *mpt);
9786 + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
9787 + 1, mpc, mpc->mpc_length, 1);
9792 +#ifdef CONFIG_X86_IO_APIC
9793 + for (i = 0; i < mp_irq_entries; i++) {
9797 + if (mp_irqs[i].mp_irqtype != mp_INT)
9800 + if (mp_irqs[i].mp_irqflag != 0x0f)
9803 + if (nr_m_spare > 0) {
9804 + printk(KERN_INFO "*NEW* found ");
9806 + assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
9807 + m_spare[nr_m_spare] = NULL;
9809 + struct mpc_config_intsrc *m =
9810 + (struct mpc_config_intsrc *)mpt;
9811 + count += sizeof(struct mpc_config_intsrc);
9812 + if (!mpc_new_phys) {
9813 + printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
9815 + if (count <= mpc_new_length)
9816 + printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
9818 + printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
9822 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9823 + mpc->mpc_length = count;
9824 + mpt += sizeof(struct mpc_config_intsrc);
9826 + print_mp_irq_info(&mp_irqs[i]);
9830 - * Build basic GSI lookup table to facilitate gsi->io_apic lookups
9831 - * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
9833 - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
9834 - mp_ioapic_routing[idx].gsi_base = gsi_base;
9835 - mp_ioapic_routing[idx].gsi_end = gsi_base +
9836 - io_apic_get_redir_entries(idx);
9838 - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
9839 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
9840 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
9841 - mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
9843 + /* update checksum */
9844 + mpc->mpc_checksum = 0;
9845 + mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
9852 -void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
9854 - struct mpc_config_intsrc intsrc;
9859 - * Convert 'gsi' to 'ioapic.pin'.
9861 - ioapic = mp_find_ioapic(gsi);
9864 - pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9865 +static int __initdata enable_update_mptable;
9868 - * TBD: This check is for faulty timer entries, where the override
9869 - * erroneously sets the trigger to level, resulting in a HUGE
9870 - * increase of timer interrupts!
9872 - if ((bus_irq == 0) && (trigger == 3))
9874 +static int __init update_mptable_setup(char *str)
9876 + enable_update_mptable = 1;
9879 +early_param("update_mptable", update_mptable_setup);
9881 - intsrc.mpc_type = MP_INTSRC;
9882 - intsrc.mpc_irqtype = mp_INT;
9883 - intsrc.mpc_irqflag = (trigger << 2) | polarity;
9884 - intsrc.mpc_srcbus = MP_ISA_BUS;
9885 - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
9886 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
9887 - intsrc.mpc_dstirq = pin; /* INTIN# */
9888 +static unsigned long __initdata mpc_new_phys;
9889 +static unsigned long mpc_new_length __initdata = 4096;
9891 - MP_intsrc_info(&intsrc);
9892 +/* alloc_mptable or alloc_mptable=4k */
9893 +static int __initdata alloc_mptable;
9894 +static int __init parse_alloc_mptable_opt(char *p)
9896 + enable_update_mptable = 1;
9897 + alloc_mptable = 1;
9900 + mpc_new_length = PAGE_SIZE << get_order(memparse(p, &p));
9903 +early_param("alloc_mptable", parse_alloc_mptable_opt);
9905 -void __init mp_config_acpi_legacy_irqs(void)
9906 +void __init early_reserve_e820_mpc_new(void)
9908 - struct mpc_config_intsrc intsrc;
9912 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9914 - * Fabricate the legacy ISA bus (bus #31).
9916 - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
9917 + if (enable_update_mptable && alloc_mptable) {
9918 + u64 startt = PAGE_SIZE;
9919 +#ifdef CONFIG_X86_TRAMPOLINE
9920 + startt = TRAMPOLINE_BASE;
9922 - set_bit(MP_ISA_BUS, mp_bus_not_pci);
9923 - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
9926 - * Older generations of ES7000 have no legacy identity mappings
9928 - if (es7000_plat == 1)
9932 - * Locate the IOAPIC that manages the ISA IRQs (0-15).
9934 - ioapic = mp_find_ioapic(0);
9938 - intsrc.mpc_type = MP_INTSRC;
9939 - intsrc.mpc_irqflag = 0; /* Conforming */
9940 - intsrc.mpc_srcbus = MP_ISA_BUS;
9941 -#ifdef CONFIG_X86_IO_APIC
9942 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
9945 - * Use the default configuration for the IRQs 0-15. Unless
9946 - * overridden by (MADT) interrupt source override entries.
9948 - for (i = 0; i < 16; i++) {
9951 - for (idx = 0; idx < mp_irq_entries; idx++) {
9952 - struct mpc_config_intsrc *irq = mp_irqs + idx;
9954 - /* Do we already have a mapping for this ISA IRQ? */
9955 - if (irq->mpc_srcbus == MP_ISA_BUS
9956 - && irq->mpc_srcbusirq == i)
9959 - /* Do we already have a mapping for this IOAPIC pin */
9960 - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
9961 - (irq->mpc_dstirq == i))
9965 - if (idx != mp_irq_entries) {
9966 - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
9967 - continue; /* IRQ already used */
9970 - intsrc.mpc_irqtype = mp_INT;
9971 - intsrc.mpc_srcbusirq = i; /* Identity mapped */
9972 - intsrc.mpc_dstirq = i;
9974 - MP_intsrc_info(&intsrc);
9975 + mpc_new_phys = early_reserve_e820(startt, mpc_new_length,
9980 -int mp_register_gsi(u32 gsi, int triggering, int polarity)
9981 +static int __init update_mp_table(void)
9985 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
9986 -#define MAX_GSI_NUM 4096
9987 -#define IRQ_COMPRESSION_START 64
9990 + struct intel_mp_floating *mpf;
9991 + struct mp_config_table *mpc;
9992 + struct mp_config_table *mpc_new;
9994 + if (!enable_update_mptable)
10001 - static int pci_irq = IRQ_COMPRESSION_START;
10003 - * Mapping between Global System Interrupts, which
10004 - * represent all possible interrupts, and IRQs
10005 - * assigned to actual devices.
10006 + * Now see if we need to go further.
10008 - static int gsi_to_irq[MAX_GSI_NUM];
10010 + if (mpf->mpf_feature1 != 0)
10013 - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
10016 + if (!mpf->mpf_physptr)
10019 - /* Don't set up the ACPI SCI because it's already set up */
10020 - if (acpi_gbl_FADT.sci_interrupt == gsi)
10022 + mpc = isa_bus_to_virt(mpf->mpf_physptr);
10024 - ioapic = mp_find_ioapic(gsi);
10025 - if (ioapic < 0) {
10026 - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
10029 + if (!smp_check_mpc(mpc, oem, str))
10032 - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
10033 + printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
10034 + printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
10036 -#ifndef CONFIG_X86_32
10037 - if (ioapic_renumber_irq)
10038 - gsi = ioapic_renumber_irq(ioapic, gsi);
10040 + if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
10041 + mpc_new_phys = 0;
10042 + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
10046 + if (!mpc_new_phys) {
10047 + unsigned char old, new;
10048 + /* check if we can change the postion */
10049 + mpc->mpc_checksum = 0;
10050 + old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10051 + mpc->mpc_checksum = 0xff;
10052 + new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10053 + if (old == new) {
10054 + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
10057 + printk(KERN_INFO "use in-positon replacing\n");
10059 + maddr_t mpc_new_bus;
10062 - * Avoid pin reprogramming. PRTs typically include entries
10063 - * with redundant pin->gsi mappings (but unique PCI devices);
10064 - * we only program the IOAPIC on the first.
10066 - if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
10067 - printk(KERN_ERR "Invalid reference to IOAPIC pin "
10068 - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
10072 - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
10073 - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
10074 - mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
10075 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10076 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
10080 + if (xen_create_contiguous_region((unsigned long)phys_to_virt(mpc_new_phys),
10081 + get_order(mpc_new_length), 32))
10083 + mpc_new_bus = phys_to_machine(mpc_new_phys);
10084 + mpf->mpf_physptr = mpc_new_bus;
10085 + mpc_new = phys_to_virt(mpc_new_phys);
10086 + memcpy(mpc_new, mpc, mpc->mpc_length);
10088 + /* check if we can modify that */
10089 + if (mpc_new_bus - mpf->mpf_physptr) {
10090 + struct intel_mp_floating *mpf_new;
10091 + /* steal 16 bytes from [0, 1k) */
10092 + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
10093 + mpf_new = isa_bus_to_virt(0x400 - 16);
10094 + memcpy(mpf_new, mpf, 16);
10096 + mpf->mpf_physptr = mpc_new_bus;
10098 + mpf->mpf_checksum = 0;
10099 + mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
10100 + printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
10103 - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
10104 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10106 - * For GSI >= 64, use IRQ compression
10107 + * only replace the one with mp_INT and
10108 + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
10109 + * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
10110 + * may need pci=routeirq for all coverage
10112 - if ((gsi >= IRQ_COMPRESSION_START)
10113 - && (triggering == ACPI_LEVEL_SENSITIVE)) {
10115 - * For PCI devices assign IRQs in order, avoiding gaps
10116 - * due to unused I/O APIC pins.
10119 - if (gsi < MAX_GSI_NUM) {
10121 - * Retain the VIA chipset work-around (gsi > 15), but
10122 - * avoid a problem where the 8254 timer (IRQ0) is setup
10123 - * via an override (so it's not on pin 0 of the ioapic),
10124 - * and at the same time, the pin 0 interrupt is a PCI
10125 - * type. The gsi > 15 test could cause these two pins
10126 - * to be shared as IRQ0, and they are not shareable.
10127 - * So test for this condition, and if necessary, avoid
10128 - * the pin collision.
10132 - * Don't assign IRQ used by ACPI SCI
10134 - if (gsi == acpi_gbl_FADT.sci_interrupt)
10136 - gsi_to_irq[irq] = gsi;
10138 - printk(KERN_ERR "GSI %u is too high\n", gsi);
10143 - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
10144 - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
10145 - polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
10147 + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
10152 -#endif /* CONFIG_X86_IO_APIC */
10153 -#endif /* CONFIG_ACPI */
10154 +late_initcall(update_mp_table);
10155 Index: head-2008-12-01/arch/x86/kernel/nmi.c
10156 ===================================================================
10157 --- head-2008-12-01.orig/arch/x86/kernel/nmi.c 2008-12-03 15:48:43.000000000 +0100
10158 +++ head-2008-12-01/arch/x86/kernel/nmi.c 2008-12-01 11:49:07.000000000 +0100
10160 #include <linux/kdebug.h>
10161 #include <linux/smp.h>
10163 +#ifndef CONFIG_XEN
10164 #include <asm/i8259.h>
10166 #include <asm/io_apic.h>
10167 #include <asm/smp.h>
10168 #include <asm/nmi.h>
10169 @@ -179,8 +181,10 @@ int __init check_nmi_watchdog(void)
10170 kfree(prev_nmi_count);
10173 +#ifndef CONFIG_XEN
10174 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
10175 disable_8259A_irq(0);
10177 #ifdef CONFIG_X86_32
10180 Index: head-2008-12-01/arch/x86/kernel/pci-dma-xen.c
10181 ===================================================================
10182 --- head-2008-12-01.orig/arch/x86/kernel/pci-dma-xen.c 2008-12-01 11:44:55.000000000 +0100
10183 +++ head-2008-12-01/arch/x86/kernel/pci-dma-xen.c 2008-12-01 11:49:07.000000000 +0100
10186 #include <asm/proto.h>
10187 #include <asm/dma.h>
10188 -#include <asm/gart.h>
10189 +#include <asm/iommu.h>
10190 #include <asm/calgary.h>
10191 +#include <asm/amd_iommu.h>
10193 -int forbid_dac __read_mostly;
10194 -EXPORT_SYMBOL(forbid_dac);
10195 +static int forbid_dac __read_mostly;
10197 -const struct dma_mapping_ops *dma_ops;
10198 +struct dma_mapping_ops *dma_ops;
10199 EXPORT_SYMBOL(dma_ops);
10201 static int iommu_sac_force __read_mostly;
10202 @@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_si
10203 void __init dma32_reserve_bootmem(void)
10205 unsigned long size, align;
10206 - if (end_pfn <= MAX_DMA32_PFN)
10207 + if (max_pfn <= MAX_DMA32_PFN)
10211 + * check aperture_64.c allocate_aperture() for reason about
10212 + * using 512M as goal
10215 size = round_up(dma32_bootmem_size, align);
10216 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
10217 - __pa(MAX_DMA_ADDRESS));
10219 if (dma32_bootmem_ptr)
10220 dma32_bootmem_size = size;
10222 @@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void)
10224 static void __init dma32_free_bootmem(void)
10228 - if (end_pfn <= MAX_DMA32_PFN)
10229 + if (max_pfn <= MAX_DMA32_PFN)
10232 if (!dma32_bootmem_ptr)
10235 - for_each_online_node(node)
10236 - free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
10237 - dma32_bootmem_size);
10238 + free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
10240 dma32_bootmem_ptr = NULL;
10241 dma32_bootmem_size = 0;
10242 @@ -107,7 +108,7 @@ static void __init dma32_free_bootmem(vo
10243 #define dma32_free_bootmem() ((void)0)
10246 -static const struct dma_mapping_ops swiotlb_dma_ops = {
10247 +static struct dma_mapping_ops swiotlb_dma_ops = {
10248 .mapping_error = swiotlb_dma_mapping_error,
10249 .map_single = swiotlb_map_single_phys,
10250 .unmap_single = swiotlb_unmap_single,
10251 @@ -130,25 +131,31 @@ void __init pci_iommu_alloc(void)
10252 * The order of these functions is important for
10253 * fall-back/fail-over reasons
10255 -#ifdef CONFIG_GART_IOMMU
10256 gart_iommu_hole_init();
10259 -#ifdef CONFIG_CALGARY_IOMMU
10263 detect_intel_iommu();
10265 -#ifdef CONFIG_SWIOTLB
10266 + amd_iommu_detect();
10270 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
10271 dma_ops = &swiotlb_dma_ops;
10276 +#ifndef CONFIG_XEN
10277 +unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
10279 + unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
10281 + return size >> PAGE_SHIFT;
10283 +EXPORT_SYMBOL(iommu_num_pages);
10287 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
10289 @@ -201,9 +208,7 @@ static __init int iommu_setup(char *p)
10293 -#ifdef CONFIG_GART_IOMMU
10294 gart_parse_options(p);
10297 #ifdef CONFIG_CALGARY_IOMMU
10298 if (!strncmp(p, "calgary", 7))
10299 @@ -245,136 +250,19 @@ int range_straddles_page_boundary(paddr_
10300 !check_pages_physically_contiguous(pfn, offset, size));
10303 -#ifdef CONFIG_X86_32
10304 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
10305 - dma_addr_t device_addr, size_t size, int flags)
10307 - void __iomem *mem_base = NULL;
10308 - int pages = size >> PAGE_SHIFT;
10309 - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
10311 - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
10315 - if (dev->dma_mem)
10318 - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
10320 - mem_base = ioremap(bus_addr, size);
10324 - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
10325 - if (!dev->dma_mem)
10327 - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
10328 - if (!dev->dma_mem->bitmap)
10331 - dev->dma_mem->virt_base = mem_base;
10332 - dev->dma_mem->device_base = device_addr;
10333 - dev->dma_mem->size = pages;
10334 - dev->dma_mem->flags = flags;
10336 - if (flags & DMA_MEMORY_MAP)
10337 - return DMA_MEMORY_MAP;
10339 - return DMA_MEMORY_IO;
10342 - kfree(dev->dma_mem);
10345 - iounmap(mem_base);
10348 -EXPORT_SYMBOL(dma_declare_coherent_memory);
10350 -void dma_release_declared_memory(struct device *dev)
10352 - struct dma_coherent_mem *mem = dev->dma_mem;
10356 - dev->dma_mem = NULL;
10357 - iounmap(mem->virt_base);
10358 - kfree(mem->bitmap);
10361 -EXPORT_SYMBOL(dma_release_declared_memory);
10363 -void *dma_mark_declared_memory_occupied(struct device *dev,
10364 - dma_addr_t device_addr, size_t size)
10366 - struct dma_coherent_mem *mem = dev->dma_mem;
10368 - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
10370 - pages >>= PAGE_SHIFT;
10373 - return ERR_PTR(-EINVAL);
10375 - pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
10376 - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
10378 - return ERR_PTR(err);
10379 - return mem->virt_base + (pos << PAGE_SHIFT);
10381 -EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
10383 -static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
10384 - dma_addr_t *dma_handle, void **ret)
10386 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10387 - int order = get_order(size);
10390 - int page = bitmap_find_free_region(mem->bitmap, mem->size,
10393 - *dma_handle = mem->device_base + (page << PAGE_SHIFT);
10394 - *ret = mem->virt_base + (page << PAGE_SHIFT);
10395 - memset(*ret, 0, size);
10397 - if (mem->flags & DMA_MEMORY_EXCLUSIVE)
10400 - return (mem != NULL);
10403 -static int dma_release_coherent(struct device *dev, int order, void *vaddr)
10405 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10407 - if (mem && vaddr >= mem->virt_base && vaddr <
10408 - (mem->virt_base + (mem->size << PAGE_SHIFT))) {
10409 - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
10411 - bitmap_release_region(mem->bitmap, page, order);
10417 -#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
10418 -#define dma_release_coherent(dev, order, vaddr) (0)
10419 -#endif /* CONFIG_X86_32 */
10421 int dma_supported(struct device *dev, u64 mask)
10423 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10426 if (mask > 0xffffffff && forbid_dac > 0) {
10427 - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
10429 + dev_info(dev, "PCI: Disallowing DAC for device\n");
10434 - if (dma_ops->dma_supported)
10435 - return dma_ops->dma_supported(dev, mask);
10436 + if (ops->dma_supported)
10437 + return ops->dma_supported(dev, mask);
10439 /* Copied from i386. Doesn't make much sense, because it will
10440 only work for pci_alloc_coherent.
10441 @@ -395,8 +283,7 @@ int dma_supported(struct device *dev, u6
10442 type. Normally this doesn't make any difference, but gives
10443 more gentle handling of IOMMU overflow. */
10444 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
10445 - printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
10446 - dev->bus_id, mask);
10447 + dev_info(dev, "Force SAC with mask %Lx\n", mask);
10451 @@ -422,6 +309,9 @@ void *
10452 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
10455 +#ifndef CONFIG_XEN
10456 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10458 void *memory = NULL;
10460 unsigned long dma_mask = 0;
10461 @@ -431,7 +321,7 @@ dma_alloc_coherent(struct device *dev, s
10462 /* ignore region specifiers */
10463 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
10465 - if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
10466 + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
10470 @@ -491,8 +381,8 @@ dma_alloc_coherent(struct device *dev, s
10471 /* Let low level make its own zone decisions */
10472 gfp &= ~(GFP_DMA32|GFP_DMA);
10474 - if (dma_ops->alloc_coherent)
10475 - return dma_ops->alloc_coherent(dev, size,
10476 + if (ops->alloc_coherent)
10477 + return ops->alloc_coherent(dev, size,
10481 @@ -504,14 +394,14 @@ dma_alloc_coherent(struct device *dev, s
10485 - if (dma_ops->alloc_coherent) {
10486 + if (ops->alloc_coherent) {
10487 free_pages((unsigned long)memory, order);
10488 gfp &= ~(GFP_DMA|GFP_DMA32);
10489 - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
10490 + return ops->alloc_coherent(dev, size, dma_handle, gfp);
10493 - if (dma_ops->map_simple) {
10494 - *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
10495 + if (ops->map_simple) {
10496 + *dma_handle = ops->map_simple(dev, virt_to_bus(memory),
10498 PCI_DMA_BIDIRECTIONAL);
10499 if (*dma_handle != bad_dma_address)
10500 @@ -542,13 +432,17 @@ EXPORT_SYMBOL(dma_alloc_coherent);
10501 void dma_free_coherent(struct device *dev, size_t size,
10502 void *vaddr, dma_addr_t bus)
10504 +#ifndef CONFIG_XEN
10505 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10508 int order = get_order(size);
10509 WARN_ON(irqs_disabled()); /* for portability */
10510 - if (dma_release_coherent(dev, order, vaddr))
10511 + if (dma_release_from_coherent(dev, order, vaddr))
10514 - if (dma_ops->unmap_single)
10515 - dma_ops->unmap_single(dev, bus, size, 0);
10516 + if (ops->unmap_single)
10517 + ops->unmap_single(dev, bus, size, 0);
10519 xen_destroy_contiguous_region((unsigned long)vaddr, order);
10520 free_pages((unsigned long)vaddr, order);
10521 @@ -557,15 +451,13 @@ EXPORT_SYMBOL(dma_free_coherent);
10523 static int __init pci_iommu_init(void)
10525 -#ifdef CONFIG_CALGARY_IOMMU
10526 calgary_iommu_init();
10529 intel_iommu_init();
10531 -#ifdef CONFIG_GART_IOMMU
10532 + amd_iommu_init();
10539 Index: head-2008-12-01/arch/x86/kernel/pci-nommu-xen.c
10540 ===================================================================
10541 --- head-2008-12-01.orig/arch/x86/kernel/pci-nommu-xen.c 2008-12-01 11:44:55.000000000 +0100
10542 +++ head-2008-12-01/arch/x86/kernel/pci-nommu-xen.c 2008-12-01 11:49:07.000000000 +0100
10543 @@ -79,18 +79,12 @@ gnttab_unmap_single(struct device *dev,
10544 gnttab_dma_unmap_page(dma_addr);
10547 -static int nommu_mapping_error(dma_addr_t dma_addr)
10549 - return (dma_addr == bad_dma_address);
10552 -static const struct dma_mapping_ops nommu_dma_ops = {
10553 +static struct dma_mapping_ops nommu_dma_ops = {
10554 .map_single = gnttab_map_single,
10555 .unmap_single = gnttab_unmap_single,
10556 .map_sg = gnttab_map_sg,
10557 .unmap_sg = gnttab_unmap_sg,
10558 .dma_supported = swiotlb_dma_supported,
10559 - .mapping_error = nommu_mapping_error
10562 void __init no_iommu_init(void)
10563 Index: head-2008-12-01/arch/x86/kernel/probe_roms_32.c
10564 ===================================================================
10565 --- head-2008-12-01.orig/arch/x86/kernel/probe_roms_32.c 2008-12-03 15:48:43.000000000 +0100
10566 +++ head-2008-12-01/arch/x86/kernel/probe_roms_32.c 2008-12-01 11:49:07.000000000 +0100
10567 @@ -99,6 +99,11 @@ void __init probe_roms(void)
10572 + if (!is_initial_xendomain())
10577 upper = adapter_rom_resources[0].start;
10578 for (start = video_rom_resource.start; start < upper; start += 2048) {
10579 @@ -131,7 +136,7 @@ void __init probe_roms(void)
10580 upper = system_rom_resource.start;
10582 /* check for extension rom (ignore length byte!) */
10583 - rom = isa_bus_to_virt(extension_rom_resource.start);
10584 + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
10585 if (romsignature(rom)) {
10586 length = extension_rom_resource.end - extension_rom_resource.start + 1;
10587 if (romchecksum(rom, length)) {
10588 Index: head-2008-12-01/arch/x86/kernel/process-xen.c
10589 ===================================================================
10590 --- head-2008-12-01.orig/arch/x86/kernel/process-xen.c 2008-12-01 11:44:55.000000000 +0100
10591 +++ head-2008-12-01/arch/x86/kernel/process-xen.c 2008-12-01 11:49:07.000000000 +0100
10593 #include <linux/sched.h>
10594 #include <linux/module.h>
10595 #include <linux/pm.h>
10596 +#include <linux/clockchips.h>
10597 +#include <asm/system.h>
10599 +unsigned long idle_halt;
10600 +EXPORT_SYMBOL(idle_halt);
10601 +unsigned long idle_nomwait;
10602 +EXPORT_SYMBOL(idle_nomwait);
10604 struct kmem_cache *task_xstate_cachep;
10606 @@ -45,6 +52,70 @@ void arch_task_cache_init(void)
10611 + * Idle related variables and functions
10613 +unsigned long boot_option_idle_override = 0;
10614 +EXPORT_SYMBOL(boot_option_idle_override);
10617 + * Powermanagement idle function, if any..
10619 +void (*pm_idle)(void);
10620 +EXPORT_SYMBOL(pm_idle);
10622 +#ifdef CONFIG_X86_32
10624 + * This halt magic was a workaround for ancient floppy DMA
10625 + * wreckage. It should be safe to remove.
10627 +static int hlt_counter;
10628 +void disable_hlt(void)
10632 +EXPORT_SYMBOL(disable_hlt);
10634 +void enable_hlt(void)
10638 +EXPORT_SYMBOL(enable_hlt);
10640 +static inline int hlt_use_halt(void)
10642 + return (!hlt_counter && boot_cpu_data.hlt_works_ok);
10645 +static inline int hlt_use_halt(void)
10652 + * We use this if we don't have any better
10655 +void xen_idle(void)
10657 + current_thread_info()->status &= ~TS_POLLING;
10659 + * TS_POLLING-cleared state must be visible before we
10660 + * test NEED_RESCHED:
10664 + if (!need_resched())
10665 + safe_halt(); /* enables interrupts racelessly */
10667 + local_irq_enable();
10668 + current_thread_info()->status |= TS_POLLING;
10670 +#ifdef CONFIG_APM_MODULE
10671 +EXPORT_SYMBOL(default_idle);
10674 static void do_nothing(void *unused)
10677 @@ -61,7 +132,7 @@ void cpu_idle_wait(void)
10680 /* kick all the CPUs so that they exit out of pm_idle */
10681 - smp_call_function(do_nothing, NULL, 0, 1);
10682 + smp_call_function(do_nothing, NULL, 1);
10684 EXPORT_SYMBOL_GPL(cpu_idle_wait);
10686 @@ -125,60 +196,175 @@ static void poll_idle(void)
10688 * idle=mwait overrides this decision and forces the usage of mwait.
10690 +static int __cpuinitdata force_mwait;
10692 +#define MWAIT_INFO 0x05
10693 +#define MWAIT_ECX_EXTENDED_INFO 0x01
10694 +#define MWAIT_EDX_C1 0xf0
10696 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
10698 + u32 eax, ebx, ecx, edx;
10703 - if (c->x86_vendor == X86_VENDOR_AMD) {
10708 + if (c->cpuid_level < MWAIT_INFO)
10711 + cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
10712 + /* Check, whether EDX has extended info about MWAIT */
10713 + if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
10717 + * edx enumeratios MONITOR/MWAIT extensions. Check, whether
10718 + * C1 supports MWAIT
10720 + return (edx & MWAIT_EDX_C1);
10724 + * Check for AMD CPUs, which have potentially C1E support
10726 +static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
10728 + if (c->x86_vendor != X86_VENDOR_AMD)
10731 + if (c->x86 < 0x0F)
10734 + /* Family 0x0f models < rev F do not have C1E */
10735 + if (c->x86 == 0x0f && c->x86_model < 0x40)
10741 +static cpumask_t c1e_mask = CPU_MASK_NONE;
10742 +static int c1e_detected;
10744 +void c1e_remove_cpu(int cpu)
10746 + cpu_clear(cpu, c1e_mask);
10750 + * C1E aware idle routine. We check for C1E active in the interrupt
10751 + * pending message MSR. If we detect C1E, then we handle it the same
10752 + * way as C3 power states (local apic timer and TSC stop)
10754 +static void c1e_idle(void)
10756 + if (need_resched())
10759 + if (!c1e_detected) {
10762 + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
10763 + if (lo & K8_INTP_C1E_ACTIVE_MASK) {
10764 + c1e_detected = 1;
10765 + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
10766 + mark_tsc_unstable("TSC halt in AMD C1E");
10767 + printk(KERN_INFO "System has AMD C1E enabled\n");
10768 + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
10773 + if (c1e_detected) {
10774 + int cpu = smp_processor_id();
10776 + if (!cpu_isset(cpu, c1e_mask)) {
10777 + cpu_set(cpu, c1e_mask);
10779 + * Force broadcast so ACPI can not interfere. Needs
10780 + * to run with interrupts enabled as it uses
10781 + * smp_function_call.
10783 + local_irq_enable();
10784 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
10786 + printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
10788 + local_irq_disable();
10790 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
10795 + * The switch back from broadcast mode needs to be
10796 + * called with interrupts disabled.
10798 + local_irq_disable();
10799 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
10800 + local_irq_enable();
10806 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
10809 - static int selected;
10813 #ifdef CONFIG_X86_SMP
10814 if (pm_idle == poll_idle && smp_num_siblings > 1) {
10815 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
10816 " performance may degrade.\n");
10822 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
10824 - * Skip, if setup has overridden idle.
10825 * One CPU supports mwait => All CPUs supports mwait
10828 - printk(KERN_INFO "using mwait in idle threads.\n");
10829 - pm_idle = mwait_idle;
10833 + printk(KERN_INFO "using mwait in idle threads.\n");
10834 + pm_idle = mwait_idle;
10835 + } else if (check_c1e_idle(c)) {
10836 + printk(KERN_INFO "using C1E aware idle routine\n");
10837 + pm_idle = c1e_idle;
10839 + pm_idle = default_idle;
10843 static int __init idle_setup(char *str)
10848 if (!strcmp(str, "poll")) {
10849 printk("using polling idle threads.\n");
10850 pm_idle = poll_idle;
10853 - else if (!strcmp(str, "mwait"))
10854 + } else if (!strcmp(str, "mwait"))
10856 + else if (!strcmp(str, "halt")) {
10858 + * When the boot option of idle=halt is added, halt is
10859 + * forced to be used for CPU idle. In such case CPU C2/C3
10860 + * won't be used again.
10861 + * To continue to load the CPU idle driver, don't touch
10862 + * the boot_option_idle_override.
10864 + pm_idle = default_idle;
10867 + } else if (!strcmp(str, "nomwait")) {
10869 + * If the boot option of "idle=nomwait" is added,
10870 + * it means that mwait will be disabled for CPU C2/C3
10871 + * states. In such case it won't touch the variable
10872 + * of boot_option_idle_override.
10874 + idle_nomwait = 1;
10881 boot_option_idle_override = 1;
10882 Index: head-2008-12-01/arch/x86/kernel/process_32-xen.c
10883 ===================================================================
10884 --- head-2008-12-01.orig/arch/x86/kernel/process_32-xen.c 2008-12-01 11:44:55.000000000 +0100
10885 +++ head-2008-12-01/arch/x86/kernel/process_32-xen.c 2008-12-01 11:49:07.000000000 +0100
10886 @@ -59,15 +59,11 @@
10887 #include <asm/tlbflush.h>
10888 #include <asm/cpu.h>
10889 #include <asm/kdebug.h>
10890 +#include <asm/idle.h>
10892 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
10893 asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
10895 -static int hlt_counter;
10897 -unsigned long boot_option_idle_override = 0;
10898 -EXPORT_SYMBOL(boot_option_idle_override);
10900 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
10901 EXPORT_PER_CPU_SYMBOL(current_task);
10903 @@ -82,46 +78,27 @@ unsigned long thread_saved_pc(struct tas
10904 return ((unsigned long *)tsk->thread.sp)[3];
10908 - * Powermanagement idle function, if any..
10910 -void (*pm_idle)(void);
10911 -EXPORT_SYMBOL(pm_idle);
10912 +#ifdef CONFIG_HOTPLUG_CPU
10913 +#ifndef CONFIG_XEN
10914 +#include <asm/nmi.h>
10916 -void disable_hlt(void)
10917 +static void cpu_exit_clear(void)
10921 + int cpu = raw_smp_processor_id();
10923 -EXPORT_SYMBOL(disable_hlt);
10925 -void enable_hlt(void)
10929 + idle_task_exit();
10931 -EXPORT_SYMBOL(enable_hlt);
10933 + irq_ctx_exit(cpu);
10935 -static void xen_idle(void)
10937 - current_thread_info()->status &= ~TS_POLLING;
10939 - * TS_POLLING-cleared state must be visible before we
10940 - * test NEED_RESCHED:
10943 + cpu_clear(cpu, cpu_callout_map);
10944 + cpu_clear(cpu, cpu_callin_map);
10946 - if (!need_resched())
10947 - safe_halt(); /* enables interrupts racelessly */
10949 - local_irq_enable();
10950 - current_thread_info()->status |= TS_POLLING;
10951 + numa_remove_cpu(cpu);
10952 + c1e_remove_cpu(cpu);
10954 -#ifdef CONFIG_APM_MODULE
10955 -EXPORT_SYMBOL(default_idle);
10958 -#ifdef CONFIG_HOTPLUG_CPU
10959 static inline void play_dead(void)
10962 @@ -152,13 +129,11 @@ void cpu_idle(void)
10964 /* endless idle loop with no priority at all */
10966 - tick_nohz_stop_sched_tick();
10967 + tick_nohz_stop_sched_tick(1);
10968 while (!need_resched()) {
10969 - void (*idle)(void);
10973 - idle = xen_idle; /* no alternatives */
10975 if (rcu_pending(cpu))
10976 rcu_check_callbacks(cpu, 0);
10977 @@ -168,7 +143,10 @@ void cpu_idle(void)
10979 local_irq_disable();
10980 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
10982 + /* Don't trace irqs off for idle */
10983 + stop_critical_timings();
10985 + start_critical_timings();
10987 tick_nohz_restart_sched_tick();
10988 preempt_enable_no_resched();
10989 Index: head-2008-12-01/arch/x86/kernel/process_64-xen.c
10990 ===================================================================
10991 --- head-2008-12-01.orig/arch/x86/kernel/process_64-xen.c 2008-12-01 11:44:55.000000000 +0100
10992 +++ head-2008-12-01/arch/x86/kernel/process_64-xen.c 2008-12-01 11:49:07.000000000 +0100
10993 @@ -65,15 +65,6 @@ asmlinkage extern void ret_from_fork(voi
10995 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
10997 -unsigned long boot_option_idle_override = 0;
10998 -EXPORT_SYMBOL(boot_option_idle_override);
11001 - * Powermanagement idle function, if any..
11003 -void (*pm_idle)(void);
11004 -EXPORT_SYMBOL(pm_idle);
11006 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
11008 void idle_notifier_register(struct notifier_block *n)
11009 @@ -103,25 +94,13 @@ void exit_idle(void)
11013 -static void xen_idle(void)
11015 - current_thread_info()->status &= ~TS_POLLING;
11017 - * TS_POLLING-cleared state must be visible before we
11018 - * test NEED_RESCHED:
11021 - if (!need_resched())
11022 - safe_halt(); /* enables interrupts racelessly */
11024 - local_irq_enable();
11025 - current_thread_info()->status |= TS_POLLING;
11028 #ifdef CONFIG_HOTPLUG_CPU
11029 static inline void play_dead(void)
11032 +#ifndef CONFIG_XEN
11033 + c1e_remove_cpu(raw_smp_processor_id());
11035 local_irq_disable();
11036 cpu_clear(smp_processor_id(), cpu_initialized);
11037 preempt_enable_no_resched();
11038 @@ -146,12 +125,11 @@ void cpu_idle(void)
11039 current_thread_info()->status |= TS_POLLING;
11040 /* endless idle loop with no priority at all */
11042 - tick_nohz_stop_sched_tick();
11043 + tick_nohz_stop_sched_tick(1);
11044 while (!need_resched()) {
11045 - void (*idle)(void);
11048 - idle = xen_idle; /* no alternatives */
11050 if (cpu_is_offline(smp_processor_id()))
11053 @@ -161,7 +139,10 @@ void cpu_idle(void)
11055 local_irq_disable();
11058 + /* Don't trace irqs off for idle */
11059 + stop_critical_timings();
11061 + start_critical_timings();
11062 /* In many cases the interrupt that ended idle
11063 has already called exit_idle. But some idle
11064 loops can be woken up without interrupt. */
11065 @@ -271,7 +252,7 @@ void exit_thread(void)
11069 -void load_gs_index(unsigned gs)
11070 +void xen_load_gs_index(unsigned gs)
11072 WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
11074 @@ -373,10 +354,10 @@ int copy_thread(int nr, unsigned long cl
11075 p->thread.fs = me->thread.fs;
11076 p->thread.gs = me->thread.gs;
11078 - asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
11079 - asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
11080 - asm("mov %%es,%0" : "=m" (p->thread.es));
11081 - asm("mov %%ds,%0" : "=m" (p->thread.ds));
11082 + savesegment(gs, p->thread.gsindex);
11083 + savesegment(fs, p->thread.fsindex);
11084 + savesegment(es, p->thread.es);
11085 + savesegment(ds, p->thread.ds);
11087 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
11088 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
11089 @@ -417,7 +398,9 @@ out:
11091 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
11093 - asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
11094 + loadsegment(fs, 0);
11095 + loadsegment(es, 0);
11096 + loadsegment(ds, 0);
11100 @@ -557,8 +540,8 @@ static inline void __switch_to_xtra(stru
11101 struct task_struct *
11102 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
11104 - struct thread_struct *prev = &prev_p->thread,
11105 - *next = &next_p->thread;
11106 + struct thread_struct *prev = &prev_p->thread;
11107 + struct thread_struct *next = &next_p->thread;
11108 int cpu = smp_processor_id();
11109 #ifndef CONFIG_X86_NO_TSS
11110 struct tss_struct *tss = &per_cpu(init_tss, cpu);
11111 @@ -661,12 +644,25 @@ __switch_to(struct task_struct *prev_p,
11113 if (unlikely(next->es))
11114 loadsegment(es, next->es);
11117 if (unlikely(next->ds))
11118 loadsegment(ds, next->ds);
11121 + * Leave lazy mode, flushing any hypercalls made here.
11122 + * This must be done before restoring TLS segments so
11123 + * the GDT and LDT are properly updated, and must be
11124 + * done before math_state_restore, so the TS bit is up
11127 + arch_leave_lazy_cpu_mode();
11130 * Switch FS and GS.
11132 + * Segment register != 0 always requires a reload. Also
11133 + * reload when it has changed. When prev process used 64bit
11134 + * base always reload to avoid an information leak.
11136 if (unlikely(next->fsindex))
11137 loadsegment(fs, next->fsindex);
11138 @@ -687,7 +683,8 @@ __switch_to(struct task_struct *prev_p,
11139 write_pda(oldrsp, next->usersp);
11140 write_pda(pcurrent, next_p);
11141 write_pda(kernelstack,
11142 - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
11143 + (unsigned long)task_stack_page(next_p) +
11144 + THREAD_SIZE - PDA_STACKOFFSET);
11145 #ifdef CONFIG_CC_STACKPROTECTOR
11146 write_pda(stack_canary, next_p->stack_canary);
11148 @@ -848,7 +845,7 @@ long do_arch_prctl(struct task_struct *t
11149 set_32bit_tls(task, FS_TLS, addr);
11151 load_TLS(&task->thread, cpu);
11152 - asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
11153 + loadsegment(fs, FS_TLS_SEL);
11155 task->thread.fsindex = FS_TLS_SEL;
11156 task->thread.fs = 0;
11157 @@ -858,7 +855,7 @@ long do_arch_prctl(struct task_struct *t
11159 /* set the selector to 0 to not confuse
11161 - asm volatile("movl %0,%%fs" :: "r" (0));
11162 + loadsegment(fs, 0);
11163 ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
11166 @@ -882,7 +879,7 @@ long do_arch_prctl(struct task_struct *t
11167 if (task->thread.gsindex == GS_TLS_SEL)
11168 base = read_32bit_tls(task, GS_TLS);
11170 - asm("movl %%gs,%0" : "=r" (gsindex));
11171 + savesegment(gs, gsindex);
11173 rdmsrl(MSR_KERNEL_GS_BASE, base);
11175 Index: head-2008-12-01/arch/x86/kernel/quirks-xen.c
11176 ===================================================================
11177 --- head-2008-12-01.orig/arch/x86/kernel/quirks-xen.c 2008-12-01 11:37:10.000000000 +0100
11178 +++ head-2008-12-01/arch/x86/kernel/quirks-xen.c 2008-12-01 11:49:07.000000000 +0100
11179 @@ -63,6 +63,7 @@ static enum {
11180 ICH_FORCE_HPET_RESUME,
11181 VT8237_FORCE_HPET_RESUME,
11182 NVIDIA_FORCE_HPET_RESUME,
11183 + ATI_FORCE_HPET_RESUME,
11184 } force_hpet_resume_type;
11186 static void __iomem *rcba_base;
11187 @@ -156,6 +157,8 @@ static void ich_force_enable_hpet(struct
11189 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
11190 ich_force_enable_hpet);
11191 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
11192 + ich_force_enable_hpet);
11193 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
11194 ich_force_enable_hpet);
11195 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
11196 @@ -172,6 +175,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
11198 static struct pci_dev *cached_dev;
11200 +static void hpet_print_force_info(void)
11202 + printk(KERN_INFO "HPET not enabled in BIOS. "
11203 + "You might try hpet=force boot option\n");
11206 static void old_ich_force_hpet_resume(void)
11209 @@ -251,8 +260,12 @@ static void old_ich_force_enable_hpet_us
11211 if (hpet_force_user)
11212 old_ich_force_enable_hpet(dev);
11214 + hpet_print_force_info();
11217 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
11218 + old_ich_force_enable_hpet_user);
11219 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
11220 old_ich_force_enable_hpet_user);
11221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
11222 @@ -288,9 +301,14 @@ static void vt8237_force_enable_hpet(str
11224 u32 uninitialized_var(val);
11226 - if (!hpet_force_user || hpet_address || force_hpet_address)
11227 + if (hpet_address || force_hpet_address)
11230 + if (!hpet_force_user) {
11231 + hpet_print_force_info();
11235 pci_read_config_dword(dev, 0x68, &val);
11237 * Bit 7 is HPET enable bit.
11238 @@ -328,6 +346,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_V
11239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
11240 vt8237_force_enable_hpet);
11242 +static void ati_force_hpet_resume(void)
11244 + pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
11245 + printk(KERN_DEBUG "Force enabled HPET at resume\n");
11248 +static void ati_force_enable_hpet(struct pci_dev *dev)
11250 + u32 uninitialized_var(val);
11252 + if (hpet_address || force_hpet_address)
11255 + if (!hpet_force_user) {
11256 + hpet_print_force_info();
11260 + pci_write_config_dword(dev, 0x14, 0xfed00000);
11261 + pci_read_config_dword(dev, 0x14, &val);
11262 + force_hpet_address = val;
11263 + force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
11264 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
11265 + force_hpet_address);
11266 + cached_dev = dev;
11269 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
11270 + ati_force_enable_hpet);
11273 * Undocumented chipset feature taken from LinuxBIOS.
11275 @@ -341,8 +389,13 @@ static void nvidia_force_enable_hpet(str
11277 u32 uninitialized_var(val);
11279 - if (!hpet_force_user || hpet_address || force_hpet_address)
11280 + if (hpet_address || force_hpet_address)
11283 + if (!hpet_force_user) {
11284 + hpet_print_force_info();
11288 pci_write_config_dword(dev, 0x44, 0xfed00001);
11289 pci_read_config_dword(dev, 0x44, &val);
11290 @@ -395,6 +448,9 @@ void force_hpet_resume(void)
11291 case NVIDIA_FORCE_HPET_RESUME:
11292 nvidia_force_hpet_resume();
11294 + case ATI_FORCE_HPET_RESUME:
11295 + ati_force_hpet_resume();
11300 Index: head-2008-12-01/arch/x86/kernel/setup-xen.c
11301 ===================================================================
11302 --- head-2008-12-01.orig/arch/x86/kernel/setup-xen.c 2008-12-01 11:44:55.000000000 +0100
11303 +++ head-2008-12-01/arch/x86/kernel/setup-xen.c 2008-12-01 11:49:07.000000000 +0100
11304 @@ -1,141 +1,1147 @@
11305 -#include <linux/kernel.h>
11307 + * Copyright (C) 1995 Linus Torvalds
11309 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11311 + * Memory region support
11312 + * David Parsons <orc@pell.chi.il.us>, July-August 1999
11314 + * Added E820 sanitization routine (removes overlapping memory regions);
11315 + * Brian Moyle <bmoyle@mvista.com>, February 2001
11317 + * Moved CPU detection code to cpu/${cpu}.c
11318 + * Patrick Mochel <mochel@osdl.org>, March 2002
11320 + * Provisions for empty E820 memory regions (reported by certain BIOSes).
11321 + * Alex Achenbach <xela@slit.de>, December 2002.
11326 + * This file handles the architecture-dependent parts of initialization
11329 +#include <linux/sched.h>
11330 +#include <linux/mm.h>
11331 +#include <linux/mmzone.h>
11332 +#include <linux/screen_info.h>
11333 +#include <linux/ioport.h>
11334 +#include <linux/acpi.h>
11335 +#include <linux/apm_bios.h>
11336 +#include <linux/initrd.h>
11337 +#include <linux/bootmem.h>
11338 +#include <linux/seq_file.h>
11339 +#include <linux/console.h>
11340 +#include <linux/mca.h>
11341 +#include <linux/root_dev.h>
11342 +#include <linux/highmem.h>
11343 #include <linux/module.h>
11344 +#include <linux/efi.h>
11345 #include <linux/init.h>
11346 -#include <linux/bootmem.h>
11347 +#include <linux/edd.h>
11348 +#include <linux/iscsi_ibft.h>
11349 +#include <linux/nodemask.h>
11350 +#include <linux/kexec.h>
11351 +#include <linux/dmi.h>
11352 +#include <linux/pfn.h>
11353 +#include <linux/pci.h>
11354 +#include <asm/pci-direct.h>
11355 +#include <linux/init_ohci1394_dma.h>
11356 +#include <linux/kvm_para.h>
11358 +#include <linux/errno.h>
11359 +#include <linux/kernel.h>
11360 +#include <linux/stddef.h>
11361 +#include <linux/unistd.h>
11362 +#include <linux/ptrace.h>
11363 +#include <linux/slab.h>
11364 +#include <linux/user.h>
11365 +#include <linux/delay.h>
11367 +#include <linux/kallsyms.h>
11368 +#include <linux/cpufreq.h>
11369 +#include <linux/dma-mapping.h>
11370 +#include <linux/ctype.h>
11371 +#include <linux/uaccess.h>
11373 #include <linux/percpu.h>
11374 -#include <asm/smp.h>
11375 -#include <asm/percpu.h>
11376 +#include <linux/crash_dump.h>
11378 +#include <video/edid.h>
11380 +#include <asm/mtrr.h>
11381 +#include <asm/apic.h>
11382 +#include <asm/e820.h>
11383 +#include <asm/mpspec.h>
11384 +#include <asm/setup.h>
11385 +#include <asm/arch_hooks.h>
11386 +#include <asm/efi.h>
11387 #include <asm/sections.h>
11388 +#include <asm/dmi.h>
11389 +#include <asm/io_apic.h>
11390 +#include <asm/ist.h>
11391 +#include <asm/vmi.h>
11392 +#include <setup_arch.h>
11393 +#include <asm/bios_ebda.h>
11394 +#include <asm/cacheflush.h>
11395 #include <asm/processor.h>
11396 -#include <asm/setup.h>
11397 +#include <asm/bugs.h>
11399 +#include <asm/system.h>
11400 +#include <asm/vsyscall.h>
11401 +#include <asm/smp.h>
11402 +#include <asm/desc.h>
11403 +#include <asm/dma.h>
11404 +#include <asm/iommu.h>
11405 +#include <asm/mmu_context.h>
11406 +#include <asm/proto.h>
11408 +#include <mach_apic.h>
11409 +#include <asm/paravirt.h>
11411 +#include <asm/percpu.h>
11412 #include <asm/topology.h>
11413 -#include <asm/mpspec.h>
11414 #include <asm/apicdef.h>
11415 +#ifdef CONFIG_X86_64
11416 +#include <asm/numa_64.h>
11420 +#include <asm/hypervisor.h>
11421 +#include <xen/interface/kexec.h>
11422 +#include <xen/interface/memory.h>
11423 +#include <xen/interface/nmi.h>
11424 +#include <xen/interface/physdev.h>
11425 +#include <xen/features.h>
11426 +#include <xen/firmware.h>
11427 +#include <xen/xencons.h>
11429 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
11430 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
11432 -#ifdef CONFIG_X86_LOCAL_APIC
11433 -unsigned int num_processors;
11434 -unsigned disabled_cpus __cpuinitdata;
11435 -/* Processor that is doing the boot up */
11436 -unsigned int boot_cpu_physical_apicid = -1U;
11437 -EXPORT_SYMBOL(boot_cpu_physical_apicid);
11438 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
11439 +static struct notifier_block xen_panic_block = {
11440 + xen_panic_event, NULL, 0 /* try to go last */
11443 -DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
11444 -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
11445 +unsigned long *phys_to_machine_mapping;
11446 +EXPORT_SYMBOL(phys_to_machine_mapping);
11448 -/* Bitmask of physically existing CPUs */
11449 -physid_mask_t phys_cpu_present_map;
11450 +unsigned long *pfn_to_mfn_frame_list_list,
11451 +#ifdef CONFIG_X86_64
11452 + *pfn_to_mfn_frame_list[512];
11454 + *pfn_to_mfn_frame_list[128];
11457 +/* Raw start-of-day parameters from the hypervisor. */
11458 +start_info_t *xen_start_info;
11459 +EXPORT_SYMBOL(xen_start_info);
11462 +#ifndef ARCH_SETUP
11463 +#define ARCH_SETUP
11466 +#ifndef CONFIG_XEN
11467 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
11468 +struct boot_params __initdata boot_params;
11470 +struct boot_params boot_params;
11474 -#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
11476 - * Copy data used in early init routines from the initial arrays to the
11477 - * per cpu data areas. These arrays then become expendable and the
11478 - * *_early_ptr's are zeroed indicating that the static arrays are gone.
11479 + * Machine setup..
11481 -static void __init setup_per_cpu_maps(void)
11482 +static struct resource data_resource = {
11483 + .name = "Kernel data",
11486 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11489 +static struct resource code_resource = {
11490 + .name = "Kernel code",
11493 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11496 +static struct resource bss_resource = {
11497 + .name = "Kernel bss",
11500 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11504 +#ifdef CONFIG_X86_32
11505 +#ifndef CONFIG_XEN
11506 +/* This value is set up by the early boot code to point to the value
11507 + immediately after the boot time page tables. It contains a *physical*
11508 + address, and must not be in the .bss segment! */
11509 +unsigned long init_pg_tables_start __initdata = ~0UL;
11510 +unsigned long init_pg_tables_end __initdata = ~0UL;
11513 +static struct resource video_ram_resource = {
11514 + .name = "Video RAM area",
11515 + .start = 0xa0000,
11517 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11520 +/* cpu data as detected by the assembly code in head.S */
11521 +struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11522 +/* common cpu data for all cpus */
11523 +struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11524 +EXPORT_SYMBOL(boot_cpu_data);
11525 +#ifndef CONFIG_XEN
11526 +static void set_mca_bus(int x)
11533 +unsigned int def_to_bigsmp;
11535 +/* for MCA, but anyone else can use it if they want */
11536 +unsigned int machine_id;
11537 +unsigned int machine_submodel_id;
11538 +unsigned int BIOS_revision;
11540 +struct apm_info apm_info;
11541 +EXPORT_SYMBOL(apm_info);
11544 +#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
11545 +struct ist_info ist_info;
11546 +EXPORT_SYMBOL(ist_info);
11547 +#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
11548 +struct ist_info ist_info;
11552 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
11553 +EXPORT_SYMBOL(boot_cpu_data);
11557 +#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
11558 +unsigned long mmu_cr4_features;
11560 +unsigned long mmu_cr4_features = X86_CR4_PAE;
11563 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
11564 +int bootloader_type;
11567 + * Early DMI memory
11569 +int dmi_alloc_index;
11570 +char dmi_alloc_data[DMI_MAX_DATA];
11575 +struct screen_info screen_info;
11576 +EXPORT_SYMBOL(screen_info);
11577 +struct edid_info edid_info;
11578 +EXPORT_SYMBOL_GPL(edid_info);
11580 +extern int root_mountflags;
11582 +unsigned long saved_video_mode;
11584 +#define RAMDISK_IMAGE_START_MASK 0x07FF
11585 +#define RAMDISK_PROMPT_FLAG 0x8000
11586 +#define RAMDISK_LOAD_FLAG 0x4000
11588 +static char __initdata command_line[COMMAND_LINE_SIZE];
11590 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
11592 +#ifdef CONFIG_EDD_MODULE
11593 +EXPORT_SYMBOL(edd);
11598 + * copy_edd() - Copy the BIOS EDD information
11599 + * from boot_params into a safe place.
11602 +static inline void copy_edd(void)
11604 + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
11605 + sizeof(edd.mbr_signature));
11606 + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
11607 + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
11608 + edd.edd_info_nr = boot_params.eddbuf_entries;
11612 +static inline void copy_edd(void)
11617 +#ifdef CONFIG_BLK_DEV_INITRD
11619 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11621 +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
11622 +static void __init relocate_initrd(void)
11625 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11626 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11627 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11628 + u64 ramdisk_here;
11629 + unsigned long slop, clen, mapaddr;
11632 + /* We need to move the initrd down into lowmem */
11633 + ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
11636 + if (ramdisk_here == -1ULL)
11637 + panic("Cannot find place for new RAMDISK of size %lld\n",
11640 + /* Note: this includes all the lowmem currently occupied by
11641 + the initrd, we rely on that fact to keep the data intact. */
11642 + reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
11644 + initrd_start = ramdisk_here + PAGE_OFFSET;
11645 + initrd_end = initrd_start + ramdisk_size;
11646 + printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
11647 + ramdisk_here, ramdisk_here + ramdisk_size);
11649 + q = (char *)initrd_start;
11651 + /* Copy any lowmem portion of the initrd */
11652 + if (ramdisk_image < end_of_lowmem) {
11653 + clen = end_of_lowmem - ramdisk_image;
11654 + p = (char *)__va(ramdisk_image);
11655 + memcpy(q, p, clen);
11657 + ramdisk_image += clen;
11658 + ramdisk_size -= clen;
11661 + /* Copy the highmem portion of the initrd */
11662 + while (ramdisk_size) {
11663 + slop = ramdisk_image & ~PAGE_MASK;
11664 + clen = ramdisk_size;
11665 + if (clen > MAX_MAP_CHUNK-slop)
11666 + clen = MAX_MAP_CHUNK-slop;
11667 + mapaddr = ramdisk_image & PAGE_MASK;
11668 + p = early_ioremap(mapaddr, clen+slop);
11669 + memcpy(q, p+slop, clen);
11670 + early_iounmap(p, clen+slop);
11672 + ramdisk_image += clen;
11673 + ramdisk_size -= clen;
11675 + /* high pages is not converted by early_res_to_bootmem */
11676 + ramdisk_image = boot_params.hdr.ramdisk_image;
11677 + ramdisk_size = boot_params.hdr.ramdisk_size;
11678 + printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
11679 + " %08llx - %08llx\n",
11680 + ramdisk_image, ramdisk_image + ramdisk_size - 1,
11681 + ramdisk_here, ramdisk_here + ramdisk_size - 1);
11685 +static void __init reserve_initrd(void)
11687 +#ifndef CONFIG_XEN
11688 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11689 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11690 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
11691 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11693 - for_each_possible_cpu(cpu) {
11694 - per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
11695 - per_cpu(x86_bios_cpu_apicid, cpu) =
11696 - x86_bios_cpu_apicid_init[cpu];
11697 -#ifdef CONFIG_NUMA
11698 - per_cpu(x86_cpu_to_node_map, cpu) =
11699 - x86_cpu_to_node_map_init[cpu];
11700 + if (!boot_params.hdr.type_of_loader ||
11701 + !ramdisk_image || !ramdisk_size)
11702 + return; /* No initrd provided by bootloader */
11704 + unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
11705 + unsigned long ramdisk_size = xen_start_info->mod_len;
11706 + unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
11707 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11709 + if (!xen_start_info->mod_start || !ramdisk_size)
11710 + return; /* No initrd provided by bootloader */
11713 + initrd_start = 0;
11715 + if (ramdisk_size >= (end_of_lowmem>>1)) {
11716 + free_early(ramdisk_image, ramdisk_end);
11717 + printk(KERN_ERR "initrd too large to handle, "
11718 + "disabling initrd\n");
11722 - /* indicate the early static arrays will soon be gone */
11723 - x86_cpu_to_apicid_early_ptr = NULL;
11724 - x86_bios_cpu_apicid_early_ptr = NULL;
11725 -#ifdef CONFIG_NUMA
11726 - x86_cpu_to_node_map_early_ptr = NULL;
11727 + printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
11731 + if (ramdisk_end <= end_of_lowmem) {
11732 + /* All in lowmem, easy case */
11734 + * don't need to reserve again, already reserved early
11735 + * in i386_start_kernel
11737 + initrd_start = ramdisk_image + PAGE_OFFSET;
11738 + initrd_end = initrd_start + ramdisk_size;
11739 +#ifdef CONFIG_X86_64_XEN
11740 + initrd_below_start_ok = 1;
11745 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11746 + relocate_initrd();
11748 + printk(KERN_ERR "initrd extends beyond end of memory "
11749 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
11750 + ramdisk_end, end_of_lowmem);
11751 + initrd_start = 0;
11753 + free_early(ramdisk_image, ramdisk_end);
11756 +static void __init reserve_initrd(void)
11759 +#endif /* CONFIG_BLK_DEV_INITRD */
11761 -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
11762 -cpumask_t *cpumask_of_cpu_map __read_mostly;
11763 -EXPORT_SYMBOL(cpumask_of_cpu_map);
11764 +static void __init parse_setup_data(void)
11766 +#ifndef CONFIG_XEN
11767 + struct setup_data *data;
11770 + if (boot_params.hdr.version < 0x0209)
11772 + pa_data = boot_params.hdr.setup_data;
11773 + while (pa_data) {
11774 + data = early_ioremap(pa_data, PAGE_SIZE);
11775 + switch (data->type) {
11776 + case SETUP_E820_EXT:
11777 + parse_e820_ext(data, pa_data);
11782 + pa_data = data->next;
11783 + early_iounmap(data, PAGE_SIZE);
11788 -/* requires nr_cpu_ids to be initialized */
11789 -static void __init setup_cpumask_of_cpu(void)
11790 +static void __init e820_reserve_setup_data(void)
11793 +#ifndef CONFIG_XEN
11794 + struct setup_data *data;
11798 - /* alloc_bootmem zeroes memory */
11799 - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
11800 - for (i = 0; i < nr_cpu_ids; i++)
11801 - cpu_set(i, cpumask_of_cpu_map[i]);
11802 + if (boot_params.hdr.version < 0x0209)
11804 + pa_data = boot_params.hdr.setup_data;
11805 + while (pa_data) {
11806 + data = early_ioremap(pa_data, sizeof(*data));
11807 + e820_update_range(pa_data, sizeof(*data)+data->len,
11808 + E820_RAM, E820_RESERVED_KERN);
11810 + pa_data = data->next;
11811 + early_iounmap(data, sizeof(*data));
11816 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
11817 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
11818 + printk(KERN_INFO "extended physical RAM map:\n");
11819 + e820_print_map("reserve setup_data");
11823 -static inline void setup_cpumask_of_cpu(void) { }
11825 +static void __init reserve_early_setup_data(void)
11827 +#ifndef CONFIG_XEN
11828 + struct setup_data *data;
11832 + if (boot_params.hdr.version < 0x0209)
11834 + pa_data = boot_params.hdr.setup_data;
11835 + while (pa_data) {
11836 + data = early_ioremap(pa_data, sizeof(*data));
11837 + sprintf(buf, "setup data %x", data->type);
11838 + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
11839 + pa_data = data->next;
11840 + early_iounmap(data, sizeof(*data));
11845 -#ifdef CONFIG_X86_32
11847 - * Great future not-so-futuristic plan: make i386 and x86_64 do it
11849 + * --------- Crashkernel reservation ------------------------------
11852 +#ifdef CONFIG_KEXEC
11854 +#ifndef CONFIG_XEN
11856 + * Reserve @size bytes of crashkernel memory at any suitable offset.
11858 + * @size: Size of the crashkernel memory to reserve.
11859 + * Returns the base address on success, and -1ULL on failure.
11861 +unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
11863 + const unsigned long long alignment = 16<<20; /* 16M */
11864 + unsigned long long start = 0LL;
11869 + start = find_e820_area(start, ULONG_MAX, size, alignment);
11870 + if (start == -1ULL)
11873 + /* try to reserve it */
11874 + ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
11878 + start += alignment;
11882 +static inline unsigned long long get_total_mem(void)
11884 + unsigned long long total;
11886 + total = max_low_pfn - min_low_pfn;
11887 +#ifdef CONFIG_HIGHMEM
11888 + total += highend_pfn - highstart_pfn;
11891 + return total << PAGE_SHIFT;
11894 +static void __init reserve_crashkernel(void)
11896 + unsigned long long total_mem;
11897 + unsigned long long crash_size, crash_base;
11900 + total_mem = get_total_mem();
11902 + ret = parse_crashkernel(boot_command_line, total_mem,
11903 + &crash_size, &crash_base);
11904 + if (ret != 0 || crash_size <= 0)
11907 + /* 0 means: find the address automatically */
11908 + if (crash_base <= 0) {
11909 + crash_base = find_and_reserve_crashkernel(crash_size);
11910 + if (crash_base == -1ULL) {
11911 + pr_info("crashkernel reservation failed. "
11912 + "No suitable area found.\n");
11916 + ret = reserve_bootmem_generic(crash_base, crash_size,
11917 + BOOTMEM_EXCLUSIVE);
11919 + pr_info("crashkernel reservation failed - "
11920 + "memory is in use\n");
11925 + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
11926 + "for crashkernel (System RAM: %ldMB)\n",
11927 + (unsigned long)(crash_size >> 20),
11928 + (unsigned long)(crash_base >> 20),
11929 + (unsigned long)(total_mem >> 20));
11931 + crashk_res.start = crash_base;
11932 + crashk_res.end = crash_base + crash_size - 1;
11933 + insert_resource(&iomem_resource, &crashk_res);
11936 +#define reserve_crashkernel xen_machine_kexec_setup_resources
11939 +static void __init reserve_crashkernel(void)
11944 +static struct resource standard_io_resources[] = {
11945 + { .name = "dma1", .start = 0x00, .end = 0x1f,
11946 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11947 + { .name = "pic1", .start = 0x20, .end = 0x21,
11948 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11949 + { .name = "timer0", .start = 0x40, .end = 0x43,
11950 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11951 + { .name = "timer1", .start = 0x50, .end = 0x53,
11952 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11953 + { .name = "keyboard", .start = 0x60, .end = 0x60,
11954 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11955 + { .name = "keyboard", .start = 0x64, .end = 0x64,
11956 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11957 + { .name = "dma page reg", .start = 0x80, .end = 0x8f,
11958 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11959 + { .name = "pic2", .start = 0xa0, .end = 0xa1,
11960 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11961 + { .name = "dma2", .start = 0xc0, .end = 0xdf,
11962 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11963 + { .name = "fpu", .start = 0xf0, .end = 0xff,
11964 + .flags = IORESOURCE_BUSY | IORESOURCE_IO }
11967 +static void __init reserve_standard_io_resources(void)
11971 + /* Nothing to do if not running in dom0. */
11972 + if (!is_initial_xendomain())
11975 + /* request I/O space for devices used on all i[345]86 PCs */
11976 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
11977 + request_resource(&ioport_resource, &standard_io_resources[i]);
11981 +#ifdef CONFIG_PROC_VMCORE
11982 +/* elfcorehdr= specifies the location of elf core header
11983 + * stored by the crashed kernel. This option will be passed
11984 + * by kexec loader to the capture kernel.
11986 -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
11987 -EXPORT_SYMBOL(__per_cpu_offset);
11988 +static int __init setup_elfcorehdr(char *arg)
11993 + elfcorehdr_addr = memparse(arg, &end);
11994 + return end > arg ? 0 : -EINVAL;
11996 +early_param("elfcorehdr", setup_elfcorehdr);
11999 +static struct x86_quirks default_x86_quirks __initdata;
12001 +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
12004 - * Great future plan:
12005 - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
12006 - * Always point %gs to its beginning
12007 + * Determine if we were loaded by an EFI loader. If so, then we have also been
12008 + * passed the efi memmap, systab, etc., so we should use these data structures
12009 + * for initialization. Note, the efi init code path is determined by the
12010 + * global efi_enabled. This allows the same kernel image to be used on existing
12011 + * systems (with a traditional BIOS) as well as on EFI systems.
12013 -void __init setup_per_cpu_areas(void)
12015 + * setup_arch - architecture-specific boot-time initializations
12017 + * Note: On x86_64, fixmaps are ready for use even before this is called.
12020 +void __init setup_arch(char **cmdline_p)
12022 - int i, highest_cpu = 0;
12023 - unsigned long size;
12026 + unsigned long p2m_pages;
12027 + struct physdev_set_iopl set_iopl;
12029 -#ifdef CONFIG_HOTPLUG_CPU
12030 - prefill_possible_map();
12031 +#ifdef CONFIG_X86_32
12032 + /* Force a quick death if the kernel panics (not domain 0). */
12033 + extern int panic_timeout;
12034 + if (!panic_timeout && !is_initial_xendomain())
12035 + panic_timeout = 1;
12038 - /* Copy section for each CPU (we discard the original) */
12039 - size = PERCPU_ENOUGH_ROOM;
12040 - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
12043 - for_each_possible_cpu(i) {
12045 -#ifndef CONFIG_NEED_MULTIPLE_NODES
12046 - ptr = alloc_bootmem_pages(size);
12048 - int node = early_cpu_to_node(i);
12049 - if (!node_online(node) || !NODE_DATA(node)) {
12050 - ptr = alloc_bootmem_pages(size);
12052 - "cpu %d has no node or node-local memory\n", i);
12055 - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
12056 + /* Register a call for panic conditions. */
12057 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
12059 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12060 + VMASST_TYPE_writable_pagetables));
12061 +#ifdef CONFIG_X86_32
12062 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12063 + VMASST_TYPE_4gb_segments));
12065 +#endif /* CONFIG_XEN */
12067 +#ifdef CONFIG_X86_32
12068 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
12069 + visws_early_detect();
12070 + pre_setup_arch_hook();
12072 + printk(KERN_INFO "Command line: %s\n", boot_command_line);
12075 + early_cpu_init();
12076 + early_ioremap_init();
12078 +#ifndef CONFIG_XEN
12079 + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
12080 + screen_info = boot_params.screen_info;
12081 + edid_info = boot_params.edid_info;
12082 +#ifdef CONFIG_X86_32
12083 + apm_info.bios = boot_params.apm_bios_info;
12084 + ist_info = boot_params.ist_info;
12085 + if (boot_params.sys_desc_table.length != 0) {
12086 + set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
12087 + machine_id = boot_params.sys_desc_table.table[0];
12088 + machine_submodel_id = boot_params.sys_desc_table.table[1];
12089 + BIOS_revision = boot_params.sys_desc_table.table[2];
12092 + saved_video_mode = boot_params.hdr.vid_mode;
12093 + bootloader_type = boot_params.hdr.type_of_loader;
12095 +#ifdef CONFIG_BLK_DEV_RAM
12096 + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
12097 + rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
12098 + rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
12101 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
12102 +#ifdef CONFIG_X86_32
12109 + efi_reserve_early();
12112 +#else /* CONFIG_XEN */
12113 +#ifdef CONFIG_X86_32
12114 + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
12115 + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
12117 + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
12119 + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
12121 + if (is_initial_xendomain()) {
12122 + const struct dom0_vga_console_info *info =
12123 + (void *)((char *)xen_start_info +
12124 + xen_start_info->console.dom0.info_off);
12126 + dom0_init_screen_info(info,
12127 + xen_start_info->console.dom0.info_size);
12128 + xen_start_info->console.domU.mfn = 0;
12129 + xen_start_info->console.domU.evtchn = 0;
12131 + screen_info.orig_video_isVGA = 0;
12133 +#endif /* CONFIG_XEN */
12137 + setup_memory_map();
12138 + parse_setup_data();
12139 + /* update the e820_saved too */
12140 + e820_reserve_setup_data();
12144 +#ifndef CONFIG_XEN
12145 + if (!boot_params.hdr.root_flags)
12146 + root_mountflags &= ~MS_RDONLY;
12149 - panic("Cannot allocate cpu data for CPU %d\n", i);
12150 + init_mm.start_code = (unsigned long) _text;
12151 + init_mm.end_code = (unsigned long) _etext;
12152 + init_mm.end_data = (unsigned long) _edata;
12153 +#ifdef CONFIG_X86_32
12154 +#ifndef CONFIG_XEN
12155 + init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
12157 + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
12158 + xen_start_info->nr_pt_frames) << PAGE_SHIFT;
12161 + init_mm.brk = (unsigned long) &_end;
12164 + code_resource.start = virt_to_phys(_text);
12165 + code_resource.end = virt_to_phys(_etext)-1;
12166 + data_resource.start = virt_to_phys(_etext);
12167 + data_resource.end = virt_to_phys(_edata)-1;
12168 + bss_resource.start = virt_to_phys(&__bss_start);
12169 + bss_resource.end = virt_to_phys(&__bss_stop)-1;
12171 + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
12172 + *cmdline_p = command_line;
12174 + parse_early_param();
12176 #ifdef CONFIG_X86_64
12177 - cpu_pda(i)->data_offset = ptr - __per_cpu_start;
12181 +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
12183 + * Must be before kernel pagetables are setup
12184 + * or fixmap area is touched.
12189 + /* after early param, so could get panic from serial */
12190 + reserve_early_setup_data();
12192 + if (acpi_mps_check()) {
12193 +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
12194 + disable_apic = 1;
12196 + setup_clear_cpu_cap(X86_FEATURE_APIC);
12200 + if (pci_early_dump_regs)
12201 + early_dump_pci_devices();
12204 + finish_e820_parsing();
12206 +#ifdef CONFIG_X86_32
12210 +#ifndef CONFIG_XEN
12211 + /* after parse_early_param, so could debug it */
12212 + insert_resource(&iomem_resource, &code_resource);
12213 + insert_resource(&iomem_resource, &data_resource);
12214 + insert_resource(&iomem_resource, &bss_resource);
12219 +#ifdef CONFIG_X86_32
12220 + if (ppro_with_ram_bug()) {
12221 + e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
12223 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
12224 + printk(KERN_INFO "fixed physical RAM map:\n");
12225 + e820_print_map("bad_ppro");
12228 - __per_cpu_offset[i] = ptr - __per_cpu_start;
12229 + early_gart_iommu_check();
12231 - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
12232 +#endif /* CONFIG_XEN */
12236 + * partially used pages are not usable - thus
12237 + * we are rounding upwards:
12239 + max_pfn = e820_end_of_ram_pfn();
12241 + /* preallocate 4k for mptable mpc */
12242 + early_reserve_e820_mpc_new();
12243 + /* update e820 for memory not covered by WB MTRRs */
12245 +#ifndef CONFIG_XEN
12246 + if (mtrr_trim_uncached_memory(max_pfn))
12247 + max_pfn = e820_end_of_ram_pfn();
12250 +#ifdef CONFIG_X86_32
12251 + /* max_low_pfn get updated here */
12252 + find_low_pfn_range();
12254 + num_physpages = max_pfn;
12255 + max_mapnr = max_pfn;
12258 + /* How many end-of-memory variables you have, grandma! */
12259 + /* need this before calling reserve_initrd */
12260 + if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
12261 + max_low_pfn = e820_end_of_low_ram_pfn();
12263 + max_low_pfn = max_pfn;
12265 + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
12268 + /* max_pfn_mapped is updated here */
12269 +#ifdef CONFIG_X86_64_XEN
12271 + * Due to the way initial table space gets calculated on Xen, we have
12272 + * to call init_memory_mapping() with the larger end address first.
12274 + if (max_pfn > max_low_pfn)
12275 + max_pfn_mapped = init_memory_mapping(1UL<<32,
12276 + max_pfn<<PAGE_SHIFT);
12277 + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
12278 + if (max_pfn > max_low_pfn)
12279 + /* can we preserve max_low_pfn ?*/
12280 + max_low_pfn = max_pfn;
12282 + max_pfn_mapped = max_low_pfn_mapped;
12284 + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
12285 + max_pfn_mapped = max_low_pfn_mapped;
12287 +#ifdef CONFIG_X86_64
12288 + if (max_pfn > max_low_pfn) {
12289 + max_pfn_mapped = init_memory_mapping(1UL<<32,
12290 + max_pfn<<PAGE_SHIFT);
12291 + /* can we preserve max_low_pfn ?*/
12292 + max_low_pfn = max_pfn;
12297 - nr_cpu_ids = highest_cpu + 1;
12298 - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
12300 + * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
12303 - /* Setup percpu data maps */
12304 - setup_per_cpu_maps();
12305 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
12306 + if (init_ohci1394_dma_early)
12307 + init_ohci1394_dma_on_all_controllers();
12310 - /* Setup cpumask_of_cpu map */
12311 - setup_cpumask_of_cpu();
12313 + reserve_initrd();
12315 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12319 + if (is_initial_xendomain())
12320 + dmi_scan_machine();
12324 +#ifdef CONFIG_ACPI
12325 + if (!is_initial_xendomain()) {
12326 + printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
12332 + * Parse the ACPI tables for possible boot-time SMP configuration.
12334 + acpi_boot_table_init();
12336 +#ifdef CONFIG_ACPI_NUMA
12338 + * Parse SRAT to discover nodes.
12340 + acpi_numa_init();
12343 + initmem_init(0, max_pfn);
12345 +#ifdef CONFIG_ACPI_SLEEP
12347 + * Reserve low memory region for sleep support.
12349 + acpi_reserve_bootmem();
12351 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
12353 + * Find and reserve possible boot-time SMP configuration:
12355 + find_smp_config();
12357 + reserve_crashkernel();
12359 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12361 + * dma32_reserve_bootmem() allocates bootmem which may conflict
12362 + * with the crashkernel command line, so do that after
12363 + * reserve_crashkernel()
12365 + dma32_reserve_bootmem();
12368 + reserve_ibft_region();
12370 +#ifdef CONFIG_KVM_CLOCK
12374 + xen_pagetable_setup_start(swapper_pg_dir);
12376 + xen_pagetable_setup_done(swapper_pg_dir);
12377 + paravirt_post_allocator_init();
12379 +#ifdef CONFIG_X86_64
12384 + p2m_pages = max_pfn;
12385 + if (xen_start_info->nr_pages > max_pfn) {
12387 + * the max_pfn was shrunk (probably by mem= or highmem=
12388 + * kernel parameter); shrink reservation with the HV
12390 + struct xen_memory_reservation reservation = {
12391 + .address_bits = 0,
12392 + .extent_order = 0,
12393 + .domid = DOMID_SELF
12395 + unsigned int difference;
12398 + difference = xen_start_info->nr_pages - max_pfn;
12400 + set_xen_guest_handle(reservation.extent_start,
12401 + ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
12402 + reservation.nr_extents = difference;
12403 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
12405 + BUG_ON(ret != difference);
12407 + else if (max_pfn > xen_start_info->nr_pages)
12408 + p2m_pages = xen_start_info->nr_pages;
12410 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
12411 + unsigned long i, j;
12412 + unsigned int k, fpp;
12414 + /* Make sure we have a large enough P->M table. */
12415 + phys_to_machine_mapping = alloc_bootmem_pages(
12416 + max_pfn * sizeof(unsigned long));
12417 + memset(phys_to_machine_mapping, ~0,
12418 + max_pfn * sizeof(unsigned long));
12419 + memcpy(phys_to_machine_mapping,
12420 + (unsigned long *)xen_start_info->mfn_list,
12421 + p2m_pages * sizeof(unsigned long));
12423 + __pa(xen_start_info->mfn_list),
12424 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
12425 + sizeof(unsigned long))));
12428 + * Initialise the list of the frames that specify the list of
12429 + * frames that make up the p2m table. Used by save/restore.
12431 + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
12433 + fpp = PAGE_SIZE/sizeof(unsigned long);
12434 + for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
12439 + BUG_ON(k>=ARRAY_SIZE(pfn_to_mfn_frame_list));
12440 + pfn_to_mfn_frame_list[k] =
12441 + alloc_bootmem_pages(PAGE_SIZE);
12442 + pfn_to_mfn_frame_list_list[k] =
12443 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
12445 + pfn_to_mfn_frame_list[k][j] =
12446 + virt_to_mfn(&phys_to_machine_mapping[i]);
12448 + HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
12449 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
12450 + virt_to_mfn(pfn_to_mfn_frame_list_list);
12453 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
12454 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
12455 + if (i != 4 && request_dma(i, "xen") != 0)
12457 +#endif /* CONFIG_XEN */
12459 +#ifdef CONFIG_X86_GENERICARCH
12460 + generic_apic_probe();
12463 +#ifndef CONFIG_XEN
12468 + * Read APIC and some other early information from ACPI tables.
12470 + acpi_boot_init();
12472 +#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
12474 + * get boot-time SMP configuration:
12476 + if (smp_found_config)
12477 + get_smp_config();
12480 + prefill_possible_map();
12481 +#ifdef CONFIG_X86_64
12482 + init_cpu_to_node();
12485 +#ifndef CONFIG_XEN
12486 + init_apic_mappings();
12487 + ioapic_init_mappings();
12489 + kvm_guest_init();
12491 + e820_reserve_resources();
12492 + e820_mark_nosave_regions(max_low_pfn);
12494 + if (is_initial_xendomain())
12495 + e820_reserve_resources();
12498 +#ifdef CONFIG_X86_32
12499 + request_resource(&iomem_resource, &video_ram_resource);
12501 + reserve_standard_io_resources();
12503 +#ifndef CONFIG_XEN
12504 + e820_setup_gap();
12507 +#if defined(CONFIG_VGA_CONSOLE)
12508 + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
12509 + conswitchp = &vga_con;
12510 +#elif defined(CONFIG_DUMMY_CONSOLE)
12511 + conswitchp = &dummy_con;
12514 +#else /* CONFIG_XEN */
12515 + if (is_initial_xendomain())
12516 + e820_setup_gap();
12518 + set_iopl.iopl = 1;
12519 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
12522 +#ifdef CONFIG_DUMMY_CONSOLE
12523 + conswitchp = &dummy_con;
12525 +#ifdef CONFIG_VGA_CONSOLE
12526 + if (is_initial_xendomain())
12527 + conswitchp = &vga_con;
12530 +#endif /* CONFIG_XEN */
12535 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
12537 + HYPERVISOR_shutdown(SHUTDOWN_crash);
12538 + /* we're never actually going to get here... */
12539 + return NOTIFY_DONE;
12541 +#endif /* !CONFIG_XEN */
12542 Index: head-2008-12-01/arch/x86/kernel/setup64-xen.c
12543 ===================================================================
12544 --- head-2008-12-01.orig/arch/x86/kernel/setup64-xen.c 2008-12-01 11:44:55.000000000 +0100
12545 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12548 - * X86-64 specific CPU setup.
12549 - * Copyright (C) 1995 Linus Torvalds
12550 - * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
12551 - * See setup.c for older changelog.
12553 - * Jun Nakajima <jun.nakajima@intel.com>
12554 - * Modified for Xen
12557 -#include <linux/init.h>
12558 -#include <linux/kernel.h>
12559 -#include <linux/sched.h>
12560 -#include <linux/string.h>
12561 -#include <linux/bootmem.h>
12562 -#include <linux/bitops.h>
12563 -#include <linux/module.h>
12564 -#include <linux/kgdb.h>
12565 -#include <asm/pda.h>
12566 -#include <asm/pgtable.h>
12567 -#include <asm/processor.h>
12568 -#include <asm/desc.h>
12569 -#include <asm/atomic.h>
12570 -#include <asm/mmu_context.h>
12571 -#include <asm/smp.h>
12572 -#include <asm/i387.h>
12573 -#include <asm/percpu.h>
12574 -#include <asm/proto.h>
12575 -#include <asm/sections.h>
12576 -#include <asm/setup.h>
12577 -#include <asm/genapic.h>
12579 -#include <asm/hypervisor.h>
12582 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
12583 -struct boot_params __initdata boot_params;
12585 -struct boot_params boot_params;
12588 -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
12590 -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
12591 -EXPORT_SYMBOL(_cpu_pda);
12592 -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
12594 -#ifndef CONFIG_X86_NO_IDT
12595 -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
12598 -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
12600 -unsigned long __supported_pte_mask __read_mostly = ~0UL;
12601 -EXPORT_SYMBOL(__supported_pte_mask);
12603 -static int do_not_nx __cpuinitdata = 0;
12606 -Control non executable mappings for 64bit processes.
12608 -on Enable(default)
12611 -static int __init nonx_setup(char *str)
12615 - if (!strncmp(str, "on", 2)) {
12616 - __supported_pte_mask |= _PAGE_NX;
12618 - } else if (!strncmp(str, "off", 3)) {
12620 - __supported_pte_mask &= ~_PAGE_NX;
12624 -early_param("noexec", nonx_setup);
12626 -int force_personality32 = 0;
12628 -/* noexec32=on|off
12629 -Control non executable heap for 32bit processes.
12630 -To control the stack too use noexec=off
12632 -on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
12633 -off PROT_READ implies PROT_EXEC
12635 -static int __init nonx32_setup(char *str)
12637 - if (!strcmp(str, "on"))
12638 - force_personality32 &= ~READ_IMPLIES_EXEC;
12639 - else if (!strcmp(str, "off"))
12640 - force_personality32 |= READ_IMPLIES_EXEC;
12643 -__setup("noexec32=", nonx32_setup);
12646 -static void __init_refok switch_pt(int cpu)
12650 - xen_pt_switch(__pa_symbol(init_level4_pgt));
12651 - xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
12653 -#define switch_pt() switch_pt(cpu)
12655 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12657 - unsigned long frames[16];
12658 - unsigned long va;
12661 - for (va = gdt_descr->address, f = 0;
12662 - va < gdt_descr->address + gdt_descr->size;
12663 - va += PAGE_SIZE, f++) {
12664 - frames[f] = virt_to_mfn(va);
12665 - make_page_readonly(
12666 - (void *)va, XENFEAT_writable_descriptor_tables);
12668 - if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
12669 - sizeof (struct desc_struct)))
12673 -static void switch_pt(void)
12675 - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
12678 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12680 - load_gdt(gdt_descr);
12681 - load_idt(idt_descr);
12685 -void pda_init(int cpu)
12687 - struct x8664_pda *pda = cpu_pda(cpu);
12689 - /* Setup up data that may be needed in __get_free_pages early */
12690 - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
12691 -#ifndef CONFIG_XEN
12692 - /* Memory clobbers used to order PDA accessed */
12694 - wrmsrl(MSR_GS_BASE, pda);
12697 - if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
12698 - (unsigned long)pda))
12701 - pda->cpunumber = cpu;
12702 - pda->irqcount = -1;
12703 - pda->kernelstack =
12704 - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
12705 - pda->active_mm = &init_mm;
12706 - pda->mmu_state = 0;
12709 - /* others are initialized in smpboot.c */
12710 - pda->pcurrent = &init_task;
12711 - pda->irqstackptr = boot_cpu_stack;
12713 - pda->irqstackptr = (char *)
12714 - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
12715 - if (!pda->irqstackptr)
12716 - panic("cannot allocate irqstack for cpu %d", cpu);
12721 - pda->irqstackptr += IRQSTACKSIZE-64;
12724 -#ifndef CONFIG_X86_NO_TSS
12725 -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
12726 -__attribute__((section(".bss.page_aligned")));
12729 -extern asmlinkage void ignore_sysret(void);
12731 -/* May not be marked __init: used by software suspend */
12732 -void syscall_init(void)
12734 -#ifndef CONFIG_XEN
12736 - * LSTAR and STAR live in a bit strange symbiosis.
12737 - * They both write to the same internal register. STAR allows to set CS/DS
12738 - * but only a 32bit target. LSTAR sets the 64bit rip.
12740 - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
12741 - wrmsrl(MSR_LSTAR, system_call);
12742 - wrmsrl(MSR_CSTAR, ignore_sysret);
12744 - /* Flags to clear on syscall */
12745 - wrmsrl(MSR_SYSCALL_MASK,
12746 - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
12748 -#ifdef CONFIG_IA32_EMULATION
12749 - syscall32_cpu_init ();
12752 - static const struct callback_register cstar = {
12753 - .type = CALLBACKTYPE_syscall32,
12754 - .address = (unsigned long)ignore_sysret
12756 - if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
12757 - printk(KERN_WARN "Unable to register CSTAR callback\n");
12762 -void __cpuinit check_efer(void)
12764 - unsigned long efer;
12766 - rdmsrl(MSR_EFER, efer);
12767 - if (!(efer & EFER_NX) || do_not_nx) {
12768 - __supported_pte_mask &= ~_PAGE_NX;
12772 -unsigned long kernel_eflags;
12774 -#ifndef CONFIG_X86_NO_TSS
12776 - * Copies of the original ist values from the tss are only accessed during
12777 - * debugging, no special alignment required.
12779 -DEFINE_PER_CPU(struct orig_ist, orig_ist);
12783 - * cpu_init() initializes state that is per-CPU. Some data is already
12784 - * initialized (naturally) in the bootstrap process, such as the GDT
12785 - * and IDT. We reload them nevertheless, this function acts as a
12786 - * 'CPU state barrier', nothing should get across.
12787 - * A lot of state is already set up in PDA init.
12789 -void __cpuinit cpu_init (void)
12791 - int cpu = stack_smp_processor_id();
12792 -#ifndef CONFIG_X86_NO_TSS
12793 - struct tss_struct *t = &per_cpu(init_tss, cpu);
12794 - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
12796 - char *estacks = NULL;
12799 - struct task_struct *me;
12801 - /* CPU 0 is initialised in head64.c */
12805 -#ifndef CONFIG_X86_NO_TSS
12807 - estacks = boot_exception_stacks;
12812 - if (cpu_test_and_set(cpu, cpu_initialized))
12813 - panic("CPU#%d already initialized!\n", cpu);
12815 - printk("Initializing CPU#%d\n", cpu);
12817 - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
12820 - * Initialize the per-CPU GDT with the boot GDT,
12821 - * and set up the GDT descriptor:
12823 -#ifndef CONFIG_XEN
12825 - memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
12828 - cpu_gdt_descr[cpu].size = GDT_SIZE;
12829 - cpu_gdt_init(&cpu_gdt_descr[cpu]);
12831 - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
12834 - wrmsrl(MSR_FS_BASE, 0);
12835 - wrmsrl(MSR_KERNEL_GS_BASE, 0);
12840 -#ifndef CONFIG_X86_NO_TSS
12842 - * set up and load the per-CPU TSS
12844 - for (v = 0; v < N_EXCEPTION_STACKS; v++) {
12845 - static const unsigned int order[N_EXCEPTION_STACKS] = {
12846 - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
12847 - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
12850 - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
12852 - panic("Cannot allocate exception stack %ld %d\n",
12855 - estacks += PAGE_SIZE << order[v];
12856 - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
12859 - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
12861 - * <= is required because the CPU will access up to
12862 - * 8 bits beyond the end of the IO permission bitmap.
12864 - for (i = 0; i <= IO_BITMAP_LONGS; i++)
12865 - t->io_bitmap[i] = ~0UL;
12868 - atomic_inc(&init_mm.mm_count);
12869 - me->active_mm = &init_mm;
12872 - enter_lazy_tlb(&init_mm, me);
12874 -#ifndef CONFIG_X86_NO_TSS
12875 - set_tss_desc(cpu, t);
12877 -#ifndef CONFIG_XEN
12880 - load_LDT(&init_mm.context);
12882 -#ifdef CONFIG_KGDB
12884 - * If the kgdb is connected no debug regs should be altered. This
12885 - * is only applicable when KGDB and a KGDB I/O module are built
12886 - * into the kernel and you are using early debugging with
12887 - * kgdbwait. KGDB will control the kernel HW breakpoint registers.
12889 - if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
12890 - arch_kgdb_ops.correct_hw_break();
12894 - * Clear all 6 debug registers:
12897 - set_debugreg(0UL, 0);
12898 - set_debugreg(0UL, 1);
12899 - set_debugreg(0UL, 2);
12900 - set_debugreg(0UL, 3);
12901 - set_debugreg(0UL, 6);
12902 - set_debugreg(0UL, 7);
12903 -#ifdef CONFIG_KGDB
12904 - /* If the kgdb is connected no debug regs should be altered. */
12910 - asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
12911 - if (raw_irqs_disabled())
12912 - kernel_eflags &= ~X86_EFLAGS_IF;
12914 - if (is_uv_system())
12917 Index: head-2008-12-01/arch/x86/kernel/setup_32-xen.c
12918 ===================================================================
12919 --- head-2008-12-01.orig/arch/x86/kernel/setup_32-xen.c 2008-12-01 11:44:55.000000000 +0100
12920 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12923 - * Copyright (C) 1995 Linus Torvalds
12925 - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
12927 - * Memory region support
12928 - * David Parsons <orc@pell.chi.il.us>, July-August 1999
12930 - * Added E820 sanitization routine (removes overlapping memory regions);
12931 - * Brian Moyle <bmoyle@mvista.com>, February 2001
12933 - * Moved CPU detection code to cpu/${cpu}.c
12934 - * Patrick Mochel <mochel@osdl.org>, March 2002
12936 - * Provisions for empty E820 memory regions (reported by certain BIOSes).
12937 - * Alex Achenbach <xela@slit.de>, December 2002.
12942 - * This file handles the architecture-dependent parts of initialization
12945 -#include <linux/sched.h>
12946 -#include <linux/mm.h>
12947 -#include <linux/mmzone.h>
12948 -#include <linux/screen_info.h>
12949 -#include <linux/ioport.h>
12950 -#include <linux/acpi.h>
12951 -#include <linux/apm_bios.h>
12952 -#include <linux/initrd.h>
12953 -#include <linux/bootmem.h>
12954 -#include <linux/seq_file.h>
12955 -#include <linux/console.h>
12956 -#include <linux/mca.h>
12957 -#include <linux/root_dev.h>
12958 -#include <linux/highmem.h>
12959 -#include <linux/module.h>
12960 -#include <linux/efi.h>
12961 -#include <linux/init.h>
12962 -#include <linux/edd.h>
12963 -#include <linux/iscsi_ibft.h>
12964 -#include <linux/nodemask.h>
12965 -#include <linux/kernel.h>
12966 -#include <linux/percpu.h>
12967 -#include <linux/notifier.h>
12968 -#include <linux/kexec.h>
12969 -#include <linux/crash_dump.h>
12970 -#include <linux/dmi.h>
12971 -#include <linux/pfn.h>
12972 -#include <linux/pci.h>
12973 -#include <linux/init_ohci1394_dma.h>
12974 -#include <linux/kvm_para.h>
12976 -#include <video/edid.h>
12978 -#include <asm/mtrr.h>
12979 -#include <asm/apic.h>
12980 -#include <asm/e820.h>
12981 -#include <asm/mpspec.h>
12982 -#include <asm/mmzone.h>
12983 -#include <asm/setup.h>
12984 -#include <asm/arch_hooks.h>
12985 -#include <asm/sections.h>
12986 -#include <asm/io_apic.h>
12987 -#include <asm/ist.h>
12988 -#include <asm/io.h>
12989 -#include <asm/hypervisor.h>
12990 -#include <xen/interface/physdev.h>
12991 -#include <xen/interface/memory.h>
12992 -#include <xen/features.h>
12993 -#include <xen/firmware.h>
12994 -#include <xen/xencons.h>
12995 -#include <setup_arch.h>
12996 -#include <asm/bios_ebda.h>
12997 -#include <asm/cacheflush.h>
12998 -#include <asm/processor.h>
13001 -#include <xen/interface/kexec.h>
13004 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
13005 -static struct notifier_block xen_panic_block = {
13006 - xen_panic_event, NULL, 0 /* try to go last */
13010 - * Machine setup..
13012 -static struct resource data_resource = {
13013 - .name = "Kernel data",
13016 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13019 -static struct resource code_resource = {
13020 - .name = "Kernel code",
13023 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13026 -static struct resource bss_resource = {
13027 - .name = "Kernel bss",
13030 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13033 -static struct resource video_ram_resource = {
13034 - .name = "Video RAM area",
13035 - .start = 0xa0000,
13037 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13040 -static struct resource standard_io_resources[] = { {
13044 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13049 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13051 - .name = "timer0",
13054 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13056 - .name = "timer1",
13059 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13061 - .name = "keyboard",
13064 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13066 - .name = "keyboard",
13069 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13071 - .name = "dma page reg",
13074 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13079 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13084 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13089 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13092 -/* cpu data as detected by the assembly code in head.S */
13093 -struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13094 -/* common cpu data for all cpus */
13095 -struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13096 -EXPORT_SYMBOL(boot_cpu_data);
13098 -unsigned int def_to_bigsmp;
13100 -#ifndef CONFIG_X86_PAE
13101 -unsigned long mmu_cr4_features;
13103 -unsigned long mmu_cr4_features = X86_CR4_PAE;
13106 -/* for MCA, but anyone else can use it if they want */
13107 -unsigned int machine_id;
13108 -unsigned int machine_submodel_id;
13109 -unsigned int BIOS_revision;
13111 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
13112 -int bootloader_type;
13114 -/* user-defined highmem size */
13115 -static unsigned int highmem_pages = -1;
13120 -struct screen_info screen_info;
13121 -EXPORT_SYMBOL(screen_info);
13122 -struct apm_info apm_info;
13123 -EXPORT_SYMBOL(apm_info);
13124 -struct edid_info edid_info;
13125 -EXPORT_SYMBOL_GPL(edid_info);
13126 -#ifndef CONFIG_XEN
13127 -#define copy_edid() (edid_info = boot_params.edid_info)
13129 -struct ist_info ist_info;
13130 -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
13131 - defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
13132 -EXPORT_SYMBOL(ist_info);
13135 -extern void early_cpu_init(void);
13136 -extern int root_mountflags;
13138 -unsigned long saved_video_mode;
13140 -#define RAMDISK_IMAGE_START_MASK 0x07FF
13141 -#define RAMDISK_PROMPT_FLAG 0x8000
13142 -#define RAMDISK_LOAD_FLAG 0x4000
13144 -static char __initdata command_line[COMMAND_LINE_SIZE];
13146 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
13147 -struct boot_params __initdata boot_params;
13149 -struct boot_params boot_params;
13153 - * Point at the empty zero page to start with. We map the real shared_info
13154 - * page as soon as fixmap is up and running.
13156 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
13157 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
13159 -unsigned long *phys_to_machine_mapping;
13160 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
13161 -EXPORT_SYMBOL(phys_to_machine_mapping);
13163 -/* Raw start-of-day parameters from the hypervisor. */
13164 -start_info_t *xen_start_info;
13165 -EXPORT_SYMBOL(xen_start_info);
13167 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
13169 -#ifdef CONFIG_EDD_MODULE
13170 -EXPORT_SYMBOL(edd);
13172 -#ifndef CONFIG_XEN
13174 - * copy_edd() - Copy the BIOS EDD information
13175 - * from boot_params into a safe place.
13178 -static inline void copy_edd(void)
13180 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
13181 - sizeof(edd.mbr_signature));
13182 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
13183 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
13184 - edd.edd_info_nr = boot_params.eddbuf_entries;
13188 -static inline void copy_edd(void)
13193 -int __initdata user_defined_memmap;
13196 - * "mem=nopentium" disables the 4MB page tables.
13197 - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
13198 - * to <mem>, overriding the bios size.
13199 - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
13200 - * <start> to <start>+<mem>, overriding the bios size.
13202 - * HPA tells me bootloaders need to parse mem=, so no new
13203 - * option should be mem= [also see Documentation/i386/boot.txt]
13205 -static int __init parse_mem(char *arg)
13210 - if (strcmp(arg, "nopentium") == 0) {
13211 - setup_clear_cpu_cap(X86_FEATURE_PSE);
13213 - /* If the user specifies memory size, we
13214 - * limit the BIOS-provided memory map to
13215 - * that size. exactmap can be used to specify
13216 - * the exact map. mem=number can be used to
13217 - * trim the existing memory map.
13219 - unsigned long long mem_size;
13221 - mem_size = memparse(arg, &arg);
13222 - limit_regions(mem_size);
13223 - user_defined_memmap = 1;
13227 -early_param("mem", parse_mem);
13229 -#ifdef CONFIG_PROC_VMCORE
13230 -/* elfcorehdr= specifies the location of elf core header
13231 - * stored by the crashed kernel.
13233 -static int __init parse_elfcorehdr(char *arg)
13238 - elfcorehdr_addr = memparse(arg, &arg);
13241 -early_param("elfcorehdr", parse_elfcorehdr);
13242 -#endif /* CONFIG_PROC_VMCORE */
13245 - * highmem=size forces highmem to be exactly 'size' bytes.
13246 - * This works even on boxes that have no highmem otherwise.
13247 - * This also works to reduce highmem size on bigger boxes.
13249 -static int __init parse_highmem(char *arg)
13254 - highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
13257 -early_param("highmem", parse_highmem);
13260 - * vmalloc=size forces the vmalloc area to be exactly 'size'
13261 - * bytes. This can be used to increase (or decrease) the
13262 - * vmalloc area - the default is 128m.
13264 -static int __init parse_vmalloc(char *arg)
13269 - __VMALLOC_RESERVE = memparse(arg, &arg);
13272 -early_param("vmalloc", parse_vmalloc);
13274 -#ifndef CONFIG_XEN
13276 - * reservetop=size reserves a hole at the top of the kernel address space which
13277 - * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
13278 - * so relocating the fixmap can be done before paging initialization.
13280 -static int __init parse_reservetop(char *arg)
13282 - unsigned long address;
13287 - address = memparse(arg, &arg);
13288 - reserve_top_address(address);
13291 -early_param("reservetop", parse_reservetop);
13295 - * Determine low and high memory ranges:
13297 -unsigned long __init find_max_low_pfn(void)
13299 - unsigned long max_low_pfn;
13301 - max_low_pfn = max_pfn;
13302 - if (max_low_pfn > MAXMEM_PFN) {
13303 - if (highmem_pages == -1)
13304 - highmem_pages = max_pfn - MAXMEM_PFN;
13305 - if (highmem_pages + MAXMEM_PFN < max_pfn)
13306 - max_pfn = MAXMEM_PFN + highmem_pages;
13307 - if (highmem_pages + MAXMEM_PFN > max_pfn) {
13308 - printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
13309 - highmem_pages = 0;
13311 - max_low_pfn = MAXMEM_PFN;
13312 -#ifndef CONFIG_HIGHMEM
13313 - /* Maximum memory usable is what is directly addressable */
13314 - printk(KERN_WARNING "Warning only %ldMB will be used.\n",
13316 - if (max_pfn > MAX_NONPAE_PFN)
13317 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13319 - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
13320 - max_pfn = MAXMEM_PFN;
13321 -#else /* !CONFIG_HIGHMEM */
13322 -#ifndef CONFIG_HIGHMEM64G
13323 - if (max_pfn > MAX_NONPAE_PFN) {
13324 - max_pfn = MAX_NONPAE_PFN;
13325 - printk(KERN_WARNING "Warning only 4GB will be used.\n");
13326 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13328 -#endif /* !CONFIG_HIGHMEM64G */
13329 -#endif /* !CONFIG_HIGHMEM */
13331 - if (highmem_pages == -1)
13332 - highmem_pages = 0;
13333 -#ifdef CONFIG_HIGHMEM
13334 - if (highmem_pages >= max_pfn) {
13335 - printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
13336 - highmem_pages = 0;
13338 - if (highmem_pages) {
13339 - if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
13340 - printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
13341 - highmem_pages = 0;
13343 - max_low_pfn -= highmem_pages;
13346 - if (highmem_pages)
13347 - printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
13350 - return max_low_pfn;
13353 -#ifndef CONFIG_XEN
13354 -#define BIOS_LOWMEM_KILOBYTES 0x413
13357 - * The BIOS places the EBDA/XBDA at the top of conventional
13358 - * memory, and usually decreases the reported amount of
13359 - * conventional memory (int 0x12) too. This also contains a
13360 - * workaround for Dell systems that neglect to reserve EBDA.
13361 - * The same workaround also avoids a problem with the AMD768MPX
13362 - * chipset: reserve a page before VGA to prevent PCI prefetch
13363 - * into it (errata #56). Usually the page is reserved anyways,
13364 - * unless you have no PS/2 mouse plugged in.
13366 -static void __init reserve_ebda_region(void)
13368 - unsigned int lowmem, ebda_addr;
13370 - /* To determine the position of the EBDA and the */
13371 - /* end of conventional memory, we need to look at */
13372 - /* the BIOS data area. In a paravirtual environment */
13373 - /* that area is absent. We'll just have to assume */
13374 - /* that the paravirt case can handle memory setup */
13375 - /* correctly, without our help. */
13376 - if (paravirt_enabled())
13379 - /* end of low (conventional) memory */
13380 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
13383 - /* start of EBDA area */
13384 - ebda_addr = get_bios_ebda();
13386 - /* Fixup: bios puts an EBDA in the top 64K segment */
13387 - /* of conventional memory, but does not adjust lowmem. */
13388 - if ((lowmem - ebda_addr) <= 0x10000)
13389 - lowmem = ebda_addr;
13391 - /* Fixup: bios does not report an EBDA at all. */
13392 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
13393 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
13394 - lowmem = 0x9f000;
13396 - /* Paranoia: should never happen, but... */
13397 - if ((lowmem == 0) || (lowmem >= 0x100000))
13398 - lowmem = 0x9f000;
13400 - /* reserve all memory between lowmem and the 1MB mark */
13401 - reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
13405 -#ifndef CONFIG_NEED_MULTIPLE_NODES
13406 -static void __init setup_bootmem_allocator(void);
13407 -static unsigned long __init setup_memory(void)
13410 - * partially used pages are not usable - thus
13411 - * we are rounding upwards:
13413 - min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
13414 - xen_start_info->nr_pt_frames;
13416 - max_low_pfn = find_max_low_pfn();
13418 -#ifdef CONFIG_HIGHMEM
13419 - highstart_pfn = highend_pfn = max_pfn;
13420 - if (max_pfn > max_low_pfn) {
13421 - highstart_pfn = max_low_pfn;
13423 - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
13424 - pages_to_mb(highend_pfn - highstart_pfn));
13425 - num_physpages = highend_pfn;
13426 - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
13428 - num_physpages = max_low_pfn;
13429 - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
13431 -#ifdef CONFIG_FLATMEM
13432 - max_mapnr = num_physpages;
13434 - printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
13435 - pages_to_mb(max_low_pfn));
13437 - setup_bootmem_allocator();
13439 - return max_low_pfn;
13442 -static void __init zone_sizes_init(void)
13444 - unsigned long max_zone_pfns[MAX_NR_ZONES];
13445 - memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
13446 - max_zone_pfns[ZONE_DMA] =
13447 - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
13448 - max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
13449 -#ifdef CONFIG_HIGHMEM
13450 - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
13451 - add_active_range(0, 0, highend_pfn);
13453 - add_active_range(0, 0, max_low_pfn);
13456 - free_area_init_nodes(max_zone_pfns);
13459 -extern unsigned long __init setup_memory(void);
13460 -extern void zone_sizes_init(void);
13461 -#endif /* !CONFIG_NEED_MULTIPLE_NODES */
13463 -static inline unsigned long long get_total_mem(void)
13465 - unsigned long long total;
13467 - total = max_low_pfn - min_low_pfn;
13468 -#ifdef CONFIG_HIGHMEM
13469 - total += highend_pfn - highstart_pfn;
13472 - return total << PAGE_SHIFT;
13475 -#ifdef CONFIG_KEXEC
13476 -#ifndef CONFIG_XEN
13477 -static void __init reserve_crashkernel(void)
13479 - unsigned long long total_mem;
13480 - unsigned long long crash_size, crash_base;
13483 - total_mem = get_total_mem();
13485 - ret = parse_crashkernel(boot_command_line, total_mem,
13486 - &crash_size, &crash_base);
13487 - if (ret == 0 && crash_size > 0) {
13488 - if (crash_base > 0) {
13489 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
13490 - "for crashkernel (System RAM: %ldMB)\n",
13491 - (unsigned long)(crash_size >> 20),
13492 - (unsigned long)(crash_base >> 20),
13493 - (unsigned long)(total_mem >> 20));
13495 - if (reserve_bootmem(crash_base, crash_size,
13496 - BOOTMEM_EXCLUSIVE) < 0) {
13497 - printk(KERN_INFO "crashkernel reservation "
13498 - "failed - memory is in use\n");
13502 - crashk_res.start = crash_base;
13503 - crashk_res.end = crash_base + crash_size - 1;
13505 - printk(KERN_INFO "crashkernel reservation failed - "
13506 - "you have to specify a base address\n");
13510 -#define reserve_crashkernel xen_machine_kexec_setup_resources
13513 -static inline void __init reserve_crashkernel(void)
13517 -#ifdef CONFIG_BLK_DEV_INITRD
13519 -static bool do_relocate_initrd = false;
13521 -static void __init reserve_initrd(void)
13523 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
13524 - unsigned long ramdisk_size = xen_start_info->mod_len;
13525 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
13526 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13527 - unsigned long ramdisk_here;
13529 - initrd_start = 0;
13531 - if (!xen_start_info->mod_start || !ramdisk_size)
13532 - return; /* No initrd provided by bootloader */
13534 - if (ramdisk_end < ramdisk_image) {
13535 - printk(KERN_ERR "initrd wraps around end of memory, "
13536 - "disabling initrd\n");
13539 - if (ramdisk_size >= end_of_lowmem/2) {
13540 - printk(KERN_ERR "initrd too large to handle, "
13541 - "disabling initrd\n");
13544 - if (ramdisk_end <= end_of_lowmem) {
13545 - /* All in lowmem, easy case */
13546 - reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
13547 - initrd_start = ramdisk_image + PAGE_OFFSET;
13548 - initrd_end = initrd_start+ramdisk_size;
13552 - /* We need to move the initrd down into lowmem */
13553 - ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
13555 - /* Note: this includes all the lowmem currently occupied by
13556 - the initrd, we rely on that fact to keep the data intact. */
13557 - reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
13558 - initrd_start = ramdisk_here + PAGE_OFFSET;
13559 - initrd_end = initrd_start + ramdisk_size;
13561 - do_relocate_initrd = true;
13564 -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
13566 -static void __init relocate_initrd(void)
13568 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
13569 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
13570 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13571 - unsigned long ramdisk_here;
13572 - unsigned long slop, clen, mapaddr;
13575 - if (!do_relocate_initrd)
13578 - ramdisk_here = initrd_start - PAGE_OFFSET;
13580 - q = (char *)initrd_start;
13582 - /* Copy any lowmem portion of the initrd */
13583 - if (ramdisk_image < end_of_lowmem) {
13584 - clen = end_of_lowmem - ramdisk_image;
13585 - p = (char *)__va(ramdisk_image);
13586 - memcpy(q, p, clen);
13588 - ramdisk_image += clen;
13589 - ramdisk_size -= clen;
13592 - /* Copy the highmem portion of the initrd */
13593 - while (ramdisk_size) {
13594 - slop = ramdisk_image & ~PAGE_MASK;
13595 - clen = ramdisk_size;
13596 - if (clen > MAX_MAP_CHUNK-slop)
13597 - clen = MAX_MAP_CHUNK-slop;
13598 - mapaddr = ramdisk_image & PAGE_MASK;
13599 - p = early_ioremap(mapaddr, clen+slop);
13600 - memcpy(q, p+slop, clen);
13601 - early_iounmap(p, clen+slop);
13603 - ramdisk_image += clen;
13604 - ramdisk_size -= clen;
13608 -#endif /* CONFIG_BLK_DEV_INITRD */
13610 -void __init setup_bootmem_allocator(void)
13612 - unsigned long bootmap_size;
13614 - * Initialize the boot-time allocator (with low memory only):
13616 - bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
13618 - register_bootmem_low_pages(max_low_pfn);
13621 - * Reserve the bootmem bitmap itself as well. We do this in two
13622 - * steps (first step was init_bootmem()) because this catches
13623 - * the (very unlikely) case of us accidentally initializing the
13624 - * bootmem allocator with an invalid RAM area.
13626 - reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
13627 - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
13628 - BOOTMEM_DEFAULT);
13630 -#ifndef CONFIG_XEN
13632 - * reserve physical page 0 - it's a special BIOS page on many boxes,
13633 - * enabling clean reboots, SMP operation, laptop functions.
13635 - reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
13637 - /* reserve EBDA region */
13638 - reserve_ebda_region();
13642 - * But first pinch a few for the stack/trampoline stuff
13643 - * FIXME: Don't need the extra page at 4K, but need to fix
13644 - * trampoline before removing it. (see the GDT stuff)
13646 - reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
13648 -#ifdef CONFIG_ACPI_SLEEP
13650 - * Reserve low memory region for sleep support.
13652 - acpi_reserve_bootmem();
13654 -#endif /* !CONFIG_XEN */
13656 -#ifdef CONFIG_BLK_DEV_INITRD
13657 - reserve_initrd();
13659 - numa_kva_reserve();
13660 - reserve_crashkernel();
13662 - reserve_ibft_region();
13666 - * The node 0 pgdat is initialized before all of these because
13667 - * it's needed for bootmem. node>0 pgdats have their virtual
13668 - * space allocated before the pagetables are in place to access
13669 - * them, so they can't be cleared then.
13671 - * This should all compile down to nothing when NUMA is off.
13673 -static void __init remapped_pgdat_init(void)
13677 - for_each_online_node(nid) {
13679 - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
13684 -static void set_mca_bus(int x)
13689 -static void set_mca_bus(int x) { }
13692 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
13693 -char * __init __attribute__((weak)) memory_setup(void)
13695 - return machine_specific_memory_setup();
13698 -#ifdef CONFIG_NUMA
13700 - * In the golden day, when everything among i386 and x86_64 will be
13701 - * integrated, this will not live here
13703 -void *x86_cpu_to_node_map_early_ptr;
13704 -int x86_cpu_to_node_map_init[NR_CPUS] = {
13705 - [0 ... NR_CPUS-1] = NUMA_NO_NODE
13707 -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
13711 - * Determine if we were loaded by an EFI loader. If so, then we have also been
13712 - * passed the efi memmap, systab, etc., so we should use these data structures
13713 - * for initialization. Note, the efi init code path is determined by the
13714 - * global efi_enabled. This allows the same kernel image to be used on existing
13715 - * systems (with a traditional BIOS) as well as on EFI systems.
13717 -void __init setup_arch(char **cmdline_p)
13719 - int i, j, k, fpp;
13720 - struct physdev_set_iopl set_iopl;
13721 - unsigned long max_low_pfn;
13722 - unsigned long p2m_pages;
13724 - /* Force a quick death if the kernel panics (not domain 0). */
13725 - extern int panic_timeout;
13726 - if (!panic_timeout && !is_initial_xendomain())
13727 - panic_timeout = 1;
13729 - /* Register a call for panic conditions. */
13730 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
13732 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13733 - VMASST_TYPE_4gb_segments));
13734 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13735 - VMASST_TYPE_writable_pagetables));
13737 - memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
13738 - pre_setup_arch_hook();
13739 - early_cpu_init();
13740 - early_ioremap_init();
13742 - prefill_possible_map();
13746 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
13751 - /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
13752 - properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
13754 - ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
13755 - screen_info = boot_params.screen_info;
13757 - apm_info.bios = boot_params.apm_bios_info;
13758 - ist_info = boot_params.ist_info;
13759 - saved_video_mode = boot_params.hdr.vid_mode;
13760 - if( boot_params.sys_desc_table.length != 0 ) {
13761 - set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
13762 - machine_id = boot_params.sys_desc_table.table[0];
13763 - machine_submodel_id = boot_params.sys_desc_table.table[1];
13764 - BIOS_revision = boot_params.sys_desc_table.table[2];
13766 - bootloader_type = boot_params.hdr.type_of_loader;
13768 - if (is_initial_xendomain()) {
13769 - const struct dom0_vga_console_info *info =
13770 - (void *)((char *)xen_start_info +
13771 - xen_start_info->console.dom0.info_off);
13773 - dom0_init_screen_info(info,
13774 - xen_start_info->console.dom0.info_size);
13775 - xen_start_info->console.domU.mfn = 0;
13776 - xen_start_info->console.domU.evtchn = 0;
13778 - screen_info.orig_video_isVGA = 0;
13780 -#ifdef CONFIG_BLK_DEV_RAM
13781 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
13782 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
13783 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
13788 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
13789 - print_memory_map(memory_setup());
13793 - if (!boot_params.hdr.root_flags)
13794 - root_mountflags &= ~MS_RDONLY;
13795 - init_mm.start_code = (unsigned long) _text;
13796 - init_mm.end_code = (unsigned long) _etext;
13797 - init_mm.end_data = (unsigned long) _edata;
13798 - init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
13799 - xen_start_info->nr_pt_frames) << PAGE_SHIFT;
13801 - code_resource.start = virt_to_phys(_text);
13802 - code_resource.end = virt_to_phys(_etext)-1;
13803 - data_resource.start = virt_to_phys(_etext);
13804 - data_resource.end = virt_to_phys(_edata)-1;
13805 - bss_resource.start = virt_to_phys(&__bss_start);
13806 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
13808 - if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
13809 - i = COMMAND_LINE_SIZE;
13810 - memcpy(boot_command_line, xen_start_info->cmd_line, i);
13811 - boot_command_line[i - 1] = '\0';
13812 - parse_early_param();
13814 - if (user_defined_memmap) {
13815 - printk(KERN_INFO "user-defined physical RAM map:\n");
13816 - print_memory_map("user");
13819 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
13820 - *cmdline_p = command_line;
13825 - /* update e820 for memory not covered by WB MTRRs */
13826 - propagate_e820_map();
13828 -#ifndef CONFIG_XEN
13829 - if (mtrr_trim_uncached_memory(max_pfn))
13830 - propagate_e820_map();
13833 - max_low_pfn = setup_memory();
13835 -#ifdef CONFIG_KVM_CLOCK
13841 - * Must be after max_low_pfn is determined, and before kernel
13842 - * pagetables are setup.
13846 - kvm_guest_init();
13849 - * NOTE: before this point _nobody_ is allowed to allocate
13850 - * any memory using the bootmem allocator. Although the
13851 - * allocator is now initialised only the first 8Mb of the kernel
13852 - * virtual address space has been mapped. All allocations before
13853 - * paging_init() has completed must use the alloc_bootmem_low_pages()
13854 - * variant (which allocates DMA'able memory) and care must be taken
13855 - * not to exceed the 8Mb limit.
13859 - smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
13864 - * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
13867 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
13868 - if (init_ohci1394_dma_early)
13869 - init_ohci1394_dma_on_all_controllers();
13872 - remapped_pgdat_init();
13874 - zone_sizes_init();
13876 -#ifdef CONFIG_X86_FIND_SMP_CONFIG
13878 - * Find and reserve possible boot-time SMP configuration:
13880 - find_smp_config();
13883 - p2m_pages = max_pfn;
13884 - if (xen_start_info->nr_pages > max_pfn) {
13886 - * the max_pfn was shrunk (probably by mem= or highmem=
13887 - * kernel parameter); shrink reservation with the HV
13889 - struct xen_memory_reservation reservation = {
13890 - .address_bits = 0,
13891 - .extent_order = 0,
13892 - .domid = DOMID_SELF
13894 - unsigned int difference;
13897 - difference = xen_start_info->nr_pages - max_pfn;
13899 - set_xen_guest_handle(reservation.extent_start,
13900 - ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
13901 - reservation.nr_extents = difference;
13902 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
13904 - BUG_ON (ret != difference);
13906 - else if (max_pfn > xen_start_info->nr_pages)
13907 - p2m_pages = xen_start_info->nr_pages;
13909 - /* Make sure we have a correctly sized P->M table. */
13910 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
13911 - phys_to_machine_mapping = alloc_bootmem_low_pages(
13912 - max_pfn * sizeof(unsigned long));
13913 - memset(phys_to_machine_mapping, ~0,
13914 - max_pfn * sizeof(unsigned long));
13915 - memcpy(phys_to_machine_mapping,
13916 - (unsigned long *)xen_start_info->mfn_list,
13917 - p2m_pages * sizeof(unsigned long));
13919 - __pa(xen_start_info->mfn_list),
13920 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
13921 - sizeof(unsigned long))));
13924 - * Initialise the list of the frames that specify the list of
13925 - * frames that make up the p2m table. Used by save/restore
13927 - pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
13929 - fpp = PAGE_SIZE/sizeof(unsigned long);
13930 - for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
13931 - if ((j % fpp) == 0) {
13934 - pfn_to_mfn_frame_list[k] =
13935 - alloc_bootmem_low_pages(PAGE_SIZE);
13936 - pfn_to_mfn_frame_list_list[k] =
13937 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
13940 - pfn_to_mfn_frame_list[k][j] =
13941 - virt_to_mfn(&phys_to_machine_mapping[i]);
13943 - HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
13944 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
13945 - virt_to_mfn(pfn_to_mfn_frame_list_list);
13948 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
13949 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
13950 - if (i != 4 && request_dma(i, "xen") != 0)
13954 - * NOTE: at this point the bootmem allocator is fully available.
13957 -#ifdef CONFIG_BLK_DEV_INITRD
13958 - relocate_initrd();
13961 - paravirt_post_allocator_init();
13963 - if (is_initial_xendomain())
13964 - dmi_scan_machine();
13968 -#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
13970 - * setup to use the early static init tables during kernel startup
13971 - * X86_SMP will exclude sub-arches that don't deal well with it.
13973 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
13974 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
13975 -#ifdef CONFIG_NUMA
13976 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
13980 -#ifdef CONFIG_X86_GENERICARCH
13981 - generic_apic_probe();
13984 - set_iopl.iopl = 1;
13985 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
13987 -#ifdef CONFIG_ACPI
13988 - if (!is_initial_xendomain()) {
13989 - printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
13990 - acpi_disabled = 1;
13995 - * Parse the ACPI tables for possible boot-time SMP configuration.
13997 - acpi_boot_table_init();
14000 -#ifndef CONFIG_XEN
14004 -#ifdef CONFIG_ACPI
14005 - acpi_boot_init();
14007 -#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
14008 - if (def_to_bigsmp)
14009 - printk(KERN_WARNING "More than 8 CPUs detected and "
14010 - "CONFIG_X86_PC cannot handle it.\nUse "
14011 - "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
14014 -#ifdef CONFIG_X86_LOCAL_APIC
14015 - if (smp_found_config)
14016 - get_smp_config();
14019 - e820_register_memory();
14020 - e820_mark_nosave_regions();
14022 - if (is_initial_xendomain()) {
14024 -#if defined(CONFIG_VGA_CONSOLE)
14025 - if (!efi_enabled ||
14026 - (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14027 - conswitchp = &vga_con;
14028 -#elif defined(CONFIG_DUMMY_CONSOLE)
14029 - conswitchp = &dummy_con;
14033 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14034 - conswitchp = &dummy_con;
14040 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14042 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14043 - /* we're never actually going to get here... */
14044 - return NOTIFY_DONE;
14048 - * Request address space for all standard resources
14050 - * This is called just before pcibios_init(), which is also a
14051 - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
14053 -static int __init request_standard_resources(void)
14057 - /* Nothing to do if not running in dom0. */
14058 - if (!is_initial_xendomain())
14061 - printk(KERN_INFO "Setting up standard PCI resources\n");
14062 - init_iomem_resources(&code_resource, &data_resource, &bss_resource);
14064 - request_resource(&iomem_resource, &video_ram_resource);
14066 - /* request I/O space for devices used on all i[345]86 PCs */
14067 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14068 - request_resource(&ioport_resource, &standard_io_resources[i]);
14072 -subsys_initcall(request_standard_resources);
14073 Index: head-2008-12-01/arch/x86/kernel/setup_64-xen.c
14074 ===================================================================
14075 --- head-2008-12-01.orig/arch/x86/kernel/setup_64-xen.c 2008-12-01 11:44:55.000000000 +0100
14076 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
14079 - * Copyright (C) 1995 Linus Torvalds
14083 - * This file handles the architecture-dependent parts of initialization
14086 -#include <linux/errno.h>
14087 -#include <linux/sched.h>
14088 -#include <linux/kernel.h>
14089 -#include <linux/mm.h>
14090 -#include <linux/stddef.h>
14091 -#include <linux/unistd.h>
14092 -#include <linux/ptrace.h>
14093 -#include <linux/slab.h>
14094 -#include <linux/user.h>
14095 -#include <linux/screen_info.h>
14096 -#include <linux/ioport.h>
14097 -#include <linux/delay.h>
14098 -#include <linux/init.h>
14099 -#include <linux/initrd.h>
14100 -#include <linux/highmem.h>
14101 -#include <linux/bootmem.h>
14102 -#include <linux/module.h>
14103 -#include <asm/processor.h>
14104 -#include <linux/console.h>
14105 -#include <linux/seq_file.h>
14106 -#include <linux/crash_dump.h>
14107 -#include <linux/root_dev.h>
14108 -#include <linux/pci.h>
14109 -#include <asm/pci-direct.h>
14110 -#include <linux/efi.h>
14111 -#include <linux/acpi.h>
14112 -#include <linux/kallsyms.h>
14113 -#include <linux/edd.h>
14114 -#include <linux/iscsi_ibft.h>
14115 -#include <linux/mmzone.h>
14116 -#include <linux/kexec.h>
14117 -#include <linux/cpufreq.h>
14118 -#include <linux/dmi.h>
14119 -#include <linux/dma-mapping.h>
14120 -#include <linux/ctype.h>
14121 -#include <linux/sort.h>
14122 -#include <linux/uaccess.h>
14123 -#include <linux/init_ohci1394_dma.h>
14124 -#include <linux/kvm_para.h>
14126 -#include <asm/mtrr.h>
14127 -#include <asm/uaccess.h>
14128 -#include <asm/system.h>
14129 -#include <asm/vsyscall.h>
14130 -#include <asm/io.h>
14131 -#include <asm/smp.h>
14132 -#include <asm/msr.h>
14133 -#include <asm/desc.h>
14134 -#include <video/edid.h>
14135 -#include <asm/e820.h>
14136 -#include <asm/dma.h>
14137 -#include <asm/gart.h>
14138 -#include <asm/mpspec.h>
14139 -#include <asm/mmu_context.h>
14140 -#include <asm/proto.h>
14141 -#include <asm/setup.h>
14142 -#include <asm/numa.h>
14143 -#include <asm/sections.h>
14144 -#include <asm/dmi.h>
14145 -#include <asm/cacheflush.h>
14146 -#include <asm/mce.h>
14147 -#include <asm/ds.h>
14148 -#include <asm/topology.h>
14149 -#include <asm/pat.h>
14151 -#include <mach_apic.h>
14153 -#include <linux/percpu.h>
14154 -#include <xen/interface/physdev.h>
14155 -#include "setup_arch_pre.h"
14156 -#include <asm/hypervisor.h>
14157 -#include <xen/interface/nmi.h>
14158 -#include <xen/features.h>
14159 -#include <xen/firmware.h>
14160 -#include <xen/xencons.h>
14161 -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
14162 -#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
14163 -#include <asm/mach-xen/setup_arch_post.h>
14164 -#include <xen/interface/memory.h>
14167 -#include <xen/interface/kexec.h>
14170 -extern unsigned long start_pfn;
14171 -extern struct edid_info edid_info;
14173 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
14174 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
14176 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
14177 -static struct notifier_block xen_panic_block = {
14178 - xen_panic_event, NULL, 0 /* try to go last */
14181 -unsigned long *phys_to_machine_mapping;
14182 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
14184 -EXPORT_SYMBOL(phys_to_machine_mapping);
14186 -DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
14187 -DEFINE_PER_CPU(int, nr_multicall_ents);
14189 -/* Raw start-of-day parameters from the hypervisor. */
14190 -start_info_t *xen_start_info;
14191 -EXPORT_SYMBOL(xen_start_info);
14195 - * Machine setup..
14198 -struct cpuinfo_x86 boot_cpu_data __read_mostly;
14199 -EXPORT_SYMBOL(boot_cpu_data);
14201 -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
14203 -unsigned long mmu_cr4_features;
14205 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
14206 -int bootloader_type;
14208 -unsigned long saved_video_mode;
14210 -int force_mwait __cpuinitdata;
14213 - * Early DMI memory
14215 -int dmi_alloc_index;
14216 -char dmi_alloc_data[DMI_MAX_DATA];
14221 -struct screen_info screen_info;
14222 -EXPORT_SYMBOL(screen_info);
14223 -struct sys_desc_table_struct {
14224 - unsigned short length;
14225 - unsigned char table[0];
14228 -struct edid_info edid_info;
14229 -EXPORT_SYMBOL_GPL(edid_info);
14231 -extern int root_mountflags;
14233 -char __initdata command_line[COMMAND_LINE_SIZE];
14235 -static struct resource standard_io_resources[] = {
14236 - { .name = "dma1", .start = 0x00, .end = 0x1f,
14237 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14238 - { .name = "pic1", .start = 0x20, .end = 0x21,
14239 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14240 - { .name = "timer0", .start = 0x40, .end = 0x43,
14241 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14242 - { .name = "timer1", .start = 0x50, .end = 0x53,
14243 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14244 - { .name = "keyboard", .start = 0x60, .end = 0x60,
14245 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14246 - { .name = "keyboard", .start = 0x64, .end = 0x64,
14247 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14248 - { .name = "dma page reg", .start = 0x80, .end = 0x8f,
14249 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14250 - { .name = "pic2", .start = 0xa0, .end = 0xa1,
14251 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14252 - { .name = "dma2", .start = 0xc0, .end = 0xdf,
14253 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14254 - { .name = "fpu", .start = 0xf0, .end = 0xff,
14255 - .flags = IORESOURCE_BUSY | IORESOURCE_IO }
14258 -#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
14260 -static struct resource data_resource = {
14261 - .name = "Kernel data",
14264 - .flags = IORESOURCE_RAM,
14266 -static struct resource code_resource = {
14267 - .name = "Kernel code",
14270 - .flags = IORESOURCE_RAM,
14272 -static struct resource bss_resource = {
14273 - .name = "Kernel bss",
14276 - .flags = IORESOURCE_RAM,
14279 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
14281 -#ifdef CONFIG_PROC_VMCORE
14282 -/* elfcorehdr= specifies the location of elf core header
14283 - * stored by the crashed kernel. This option will be passed
14284 - * by kexec loader to the capture kernel.
14286 -static int __init setup_elfcorehdr(char *arg)
14291 - elfcorehdr_addr = memparse(arg, &end);
14292 - return end > arg ? 0 : -EINVAL;
14294 -early_param("elfcorehdr", setup_elfcorehdr);
14297 -#ifndef CONFIG_NUMA
14298 -static void __init
14299 -contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
14301 - unsigned long bootmap_size, bootmap;
14303 - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
14304 - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
14306 - if (bootmap == -1L)
14307 - panic("Cannot find bootmem map of size %ld\n", bootmap_size);
14308 - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
14309 - e820_register_active_regions(0, start_pfn, end_pfn);
14311 - free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
14312 - early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
14314 - free_bootmem_with_active_regions(0, end_pfn);
14315 - early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
14317 - reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
14321 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
14323 -#ifdef CONFIG_EDD_MODULE
14324 -EXPORT_SYMBOL(edd);
14326 -#ifndef CONFIG_XEN
14328 - * copy_edd() - Copy the BIOS EDD information
14329 - * from boot_params into a safe place.
14332 -static inline void copy_edd(void)
14334 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
14335 - sizeof(edd.mbr_signature));
14336 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
14337 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
14338 - edd.edd_info_nr = boot_params.eddbuf_entries;
14342 -static inline void copy_edd(void)
14347 -#ifdef CONFIG_KEXEC
14348 -#ifndef CONFIG_XEN
14349 -static void __init reserve_crashkernel(void)
14351 - unsigned long long total_mem;
14352 - unsigned long long crash_size, crash_base;
14355 - total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
14357 - ret = parse_crashkernel(boot_command_line, total_mem,
14358 - &crash_size, &crash_base);
14359 - if (ret == 0 && crash_size) {
14360 - if (crash_base <= 0) {
14361 - printk(KERN_INFO "crashkernel reservation failed - "
14362 - "you have to specify a base address\n");
14366 - if (reserve_bootmem(crash_base, crash_size,
14367 - BOOTMEM_EXCLUSIVE) < 0) {
14368 - printk(KERN_INFO "crashkernel reservation failed - "
14369 - "memory is in use\n");
14373 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
14374 - "for crashkernel (System RAM: %ldMB)\n",
14375 - (unsigned long)(crash_size >> 20),
14376 - (unsigned long)(crash_base >> 20),
14377 - (unsigned long)(total_mem >> 20));
14378 - crashk_res.start = crash_base;
14379 - crashk_res.end = crash_base + crash_size - 1;
14380 - insert_resource(&iomem_resource, &crashk_res);
14384 -#define reserve_crashkernel xen_machine_kexec_setup_resources
14387 -static inline void __init reserve_crashkernel(void)
14391 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
14392 -void __attribute__((weak)) __init memory_setup(void)
14394 - machine_specific_memory_setup();
14397 -static void __init parse_setup_data(void)
14399 - struct setup_data *data;
14400 - unsigned long pa_data;
14402 - if (boot_params.hdr.version < 0x0209)
14404 - pa_data = boot_params.hdr.setup_data;
14405 - while (pa_data) {
14406 - data = early_ioremap(pa_data, PAGE_SIZE);
14407 - switch (data->type) {
14411 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
14412 - free_early(pa_data, pa_data+sizeof(*data)+data->len);
14414 - pa_data = data->next;
14415 - early_iounmap(data, PAGE_SIZE);
14419 -#ifdef CONFIG_PCI_MMCONFIG
14420 -extern void __cpuinit fam10h_check_enable_mmcfg(void);
14421 -extern void __init check_enable_amd_mmconf_dmi(void);
14423 -void __cpuinit fam10h_check_enable_mmcfg(void)
14426 -void __init check_enable_amd_mmconf_dmi(void)
14432 - * setup_arch - architecture-specific boot-time initializations
14434 - * Note: On x86_64, fixmaps are ready for use even before this is called.
14436 -void __init setup_arch(char **cmdline_p)
14441 - extern struct e820map machine_e820;
14443 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14445 - /* Register a call for panic conditions. */
14446 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
14448 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
14449 - VMASST_TYPE_writable_pagetables));
14451 - early_ioremap_init();
14453 - ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
14454 - screen_info = boot_params.screen_info;
14456 - if (is_initial_xendomain()) {
14457 - const struct dom0_vga_console_info *info =
14458 - (void *)((char *)xen_start_info +
14459 - xen_start_info->console.dom0.info_off);
14461 - dom0_init_screen_info(info,
14462 - xen_start_info->console.dom0.info_size);
14463 - xen_start_info->console.domU.mfn = 0;
14464 - xen_start_info->console.domU.evtchn = 0;
14466 - screen_info.orig_video_isVGA = 0;
14470 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14472 - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
14473 - screen_info = boot_params.screen_info;
14474 - edid_info = boot_params.edid_info;
14475 -#endif /* !CONFIG_XEN */
14476 - saved_video_mode = boot_params.hdr.vid_mode;
14477 - bootloader_type = boot_params.hdr.type_of_loader;
14479 -#ifdef CONFIG_BLK_DEV_RAM
14480 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
14481 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
14482 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
14485 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
14495 - if (!boot_params.hdr.root_flags)
14496 - root_mountflags &= ~MS_RDONLY;
14497 - init_mm.start_code = (unsigned long) &_text;
14498 - init_mm.end_code = (unsigned long) &_etext;
14499 - init_mm.end_data = (unsigned long) &_edata;
14500 - init_mm.brk = (unsigned long) &_end;
14502 - code_resource.start = virt_to_phys(&_text);
14503 - code_resource.end = virt_to_phys(&_etext)-1;
14504 - data_resource.start = virt_to_phys(&_etext);
14505 - data_resource.end = virt_to_phys(&_edata)-1;
14506 - bss_resource.start = virt_to_phys(&__bss_start);
14507 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
14509 - early_identify_cpu(&boot_cpu_data);
14511 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
14512 - *cmdline_p = command_line;
14514 - parse_setup_data();
14516 - parse_early_param();
14518 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
14519 - if (init_ohci1394_dma_early)
14520 - init_ohci1394_dma_on_all_controllers();
14523 - finish_e820_parsing();
14525 -#ifndef CONFIG_XEN
14526 - /* after parse_early_param, so could debug it */
14527 - insert_resource(&iomem_resource, &code_resource);
14528 - insert_resource(&iomem_resource, &data_resource);
14529 - insert_resource(&iomem_resource, &bss_resource);
14532 - early_gart_iommu_check();
14534 - e820_register_active_regions(0, 0, -1UL);
14536 - * partially used pages are not usable - thus
14537 - * we are rounding upwards:
14539 - end_pfn = e820_end_of_ram();
14540 - /* update e820 for memory not covered by WB MTRRs */
14542 -#ifndef CONFIG_XEN
14543 - if (mtrr_trim_uncached_memory(end_pfn)) {
14544 - e820_register_active_regions(0, 0, -1UL);
14545 - end_pfn = e820_end_of_ram();
14549 - num_physpages = end_pfn;
14550 - max_mapnr = end_pfn;
14554 - max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
14558 -#ifndef CONFIG_XEN
14562 - if (is_initial_xendomain())
14563 - dmi_scan_machine();
14567 -#ifdef CONFIG_KVM_CLOCK
14571 -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
14572 - /* setup to use the early static init tables during kernel startup */
14573 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
14574 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
14575 -#ifdef CONFIG_NUMA
14576 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
14580 - /* How many end-of-memory variables you have, grandma! */
14581 - max_low_pfn = end_pfn;
14582 - max_pfn = end_pfn;
14583 - high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
14585 - /* Remove active ranges so rediscovery with NUMA-awareness happens */
14586 - remove_all_active_ranges();
14588 -#ifdef CONFIG_ACPI_NUMA
14590 - * Parse SRAT to discover nodes.
14592 - acpi_numa_init();
14595 -#ifdef CONFIG_NUMA
14596 - numa_initmem_init(0, end_pfn);
14598 - contig_initmem_init(0, end_pfn);
14601 -#ifndef CONFIG_XEN
14602 - dma32_reserve_bootmem();
14604 -#ifdef CONFIG_ACPI_SLEEP
14606 - * Reserve low memory region for sleep support.
14608 - acpi_reserve_bootmem();
14612 - efi_reserve_bootmem();
14615 -#ifdef CONFIG_BLK_DEV_INITRD
14617 - if (xen_start_info->mod_start) {
14618 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
14619 - unsigned long ramdisk_size = xen_start_info->mod_len;
14621 - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
14622 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
14623 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
14625 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
14626 - unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
14628 - if (ramdisk_end <= end_of_mem) {
14630 - * don't need to reserve again, already reserved early
14631 - * in x86_64_start_kernel, and early_res_to_bootmem
14632 - * convert that to reserved in bootmem
14634 - initrd_start = ramdisk_image + PAGE_OFFSET;
14635 - initrd_end = initrd_start+ramdisk_size;
14637 - initrd_below_start_ok = 1;
14640 - free_bootmem(ramdisk_image, ramdisk_size);
14641 - printk(KERN_ERR "initrd extends beyond end of memory "
14642 - "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
14643 - ramdisk_end, end_of_mem);
14644 - initrd_start = 0;
14648 - reserve_crashkernel();
14650 - reserve_ibft_region();
14654 -#ifdef CONFIG_X86_LOCAL_APIC
14656 - * Find and reserve possible boot-time SMP configuration:
14658 - find_smp_config();
14662 - int i, j, k, fpp;
14663 - unsigned long p2m_pages;
14665 - p2m_pages = end_pfn;
14666 - if (xen_start_info->nr_pages > end_pfn) {
14668 - * the end_pfn was shrunk (probably by mem= or highmem=
14669 - * kernel parameter); shrink reservation with the HV
14671 - struct xen_memory_reservation reservation = {
14672 - .address_bits = 0,
14673 - .extent_order = 0,
14674 - .domid = DOMID_SELF
14676 - unsigned int difference;
14679 - difference = xen_start_info->nr_pages - end_pfn;
14681 - set_xen_guest_handle(reservation.extent_start,
14682 - ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
14683 - reservation.nr_extents = difference;
14684 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
14686 - BUG_ON (ret != difference);
14688 - else if (end_pfn > xen_start_info->nr_pages)
14689 - p2m_pages = xen_start_info->nr_pages;
14691 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
14692 - /* Make sure we have a large enough P->M table. */
14693 - phys_to_machine_mapping = alloc_bootmem_pages(
14694 - end_pfn * sizeof(unsigned long));
14695 - memset(phys_to_machine_mapping, ~0,
14696 - end_pfn * sizeof(unsigned long));
14697 - memcpy(phys_to_machine_mapping,
14698 - (unsigned long *)xen_start_info->mfn_list,
14699 - p2m_pages * sizeof(unsigned long));
14701 - __pa(xen_start_info->mfn_list),
14702 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
14703 - sizeof(unsigned long))));
14706 - * Initialise the list of the frames that specify the
14707 - * list of frames that make up the p2m table. Used by
14710 - pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
14712 - fpp = PAGE_SIZE/sizeof(unsigned long);
14713 - for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
14714 - if ((j % fpp) == 0) {
14717 - pfn_to_mfn_frame_list[k] =
14718 - alloc_bootmem_pages(PAGE_SIZE);
14719 - pfn_to_mfn_frame_list_list[k] =
14720 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
14723 - pfn_to_mfn_frame_list[k][j] =
14724 - virt_to_mfn(&phys_to_machine_mapping[i]);
14726 - HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
14727 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
14728 - virt_to_mfn(pfn_to_mfn_frame_list_list);
14731 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
14732 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
14733 - if (i != 4 && request_dma(i, "xen") != 0)
14737 -#ifdef CONFIG_ACPI
14738 - if (!is_initial_xendomain()) {
14739 - acpi_disabled = 1;
14745 -#ifndef CONFIG_XEN
14749 -#ifdef CONFIG_ACPI
14751 - * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
14752 - * Call this early for SRAT node setup.
14754 - acpi_boot_table_init();
14757 - * Read APIC and some other early information from ACPI tables.
14759 - acpi_boot_init();
14762 - init_cpu_to_node();
14764 -#ifdef CONFIG_X86_LOCAL_APIC
14766 - * get boot-time SMP configuration:
14768 - if (smp_found_config)
14769 - get_smp_config();
14770 -#ifndef CONFIG_XEN
14771 - init_apic_mappings();
14772 - ioapic_init_mappings();
14775 -#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
14776 - prefill_possible_map();
14779 - kvm_guest_init();
14782 - * We trust e820 completely. No explicit ROM probing in memory.
14785 - if (is_initial_xendomain())
14786 - e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
14788 - e820_reserve_resources(e820.map, e820.nr_map);
14789 - e820_mark_nosave_regions();
14792 - /* request I/O space for devices used on all i[345]86 PCs */
14793 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14794 - request_resource(&ioport_resource, &standard_io_resources[i]);
14797 - if (is_initial_xendomain())
14798 - e820_setup_gap(machine_e820.map, machine_e820.nr_map);
14800 - e820_setup_gap(e820.map, e820.nr_map);
14805 - struct physdev_set_iopl set_iopl;
14807 - set_iopl.iopl = 1;
14808 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
14810 - if (is_initial_xendomain()) {
14812 -#if defined(CONFIG_VGA_CONSOLE)
14813 - conswitchp = &vga_con;
14814 -#elif defined(CONFIG_DUMMY_CONSOLE)
14815 - conswitchp = &dummy_con;
14819 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14820 - conswitchp = &dummy_con;
14824 -#else /* CONFIG_XEN */
14827 -#if defined(CONFIG_VGA_CONSOLE)
14828 - if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14829 - conswitchp = &vga_con;
14830 -#elif defined(CONFIG_DUMMY_CONSOLE)
14831 - conswitchp = &dummy_con;
14835 -#endif /* !CONFIG_XEN */
14837 - /* do this before identify_cpu for boot cpu */
14838 - check_enable_amd_mmconf_dmi();
14843 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14845 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14846 - /* we're never actually going to get here... */
14847 - return NOTIFY_DONE;
14849 -#endif /* !CONFIG_XEN */
14852 -static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
14856 - if (c->extended_cpuid_level < 0x80000004)
14859 - v = (unsigned int *) c->x86_model_id;
14860 - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
14861 - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
14862 - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
14863 - c->x86_model_id[48] = 0;
14868 -static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
14870 - unsigned int n, dummy, eax, ebx, ecx, edx;
14872 - n = c->extended_cpuid_level;
14874 - if (n >= 0x80000005) {
14875 - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
14876 - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
14877 - "D cache %dK (%d bytes/line)\n",
14878 - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
14879 - c->x86_cache_size = (ecx>>24) + (edx>>24);
14880 - /* On K8 L1 TLB is inclusive, so don't count it */
14881 - c->x86_tlbsize = 0;
14884 - if (n >= 0x80000006) {
14885 - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
14886 - ecx = cpuid_ecx(0x80000006);
14887 - c->x86_cache_size = ecx >> 16;
14888 - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
14890 - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
14891 - c->x86_cache_size, ecx & 0xFF);
14893 - if (n >= 0x80000008) {
14894 - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
14895 - c->x86_virt_bits = (eax >> 8) & 0xff;
14896 - c->x86_phys_bits = eax & 0xff;
14900 -#ifdef CONFIG_NUMA
14901 -static int __cpuinit nearby_node(int apicid)
14905 - for (i = apicid - 1; i >= 0; i--) {
14906 - node = apicid_to_node[i];
14907 - if (node != NUMA_NO_NODE && node_online(node))
14910 - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
14911 - node = apicid_to_node[i];
14912 - if (node != NUMA_NO_NODE && node_online(node))
14915 - return first_node(node_online_map); /* Shouldn't happen */
14920 - * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
14921 - * Assumes number of cores is a power of two.
14923 -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
14927 -#ifdef CONFIG_NUMA
14928 - int cpu = smp_processor_id();
14930 - unsigned apicid = hard_smp_processor_id();
14932 - bits = c->x86_coreid_bits;
14934 - /* Low order bits define the core id (index of core in socket) */
14935 - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
14936 - /* Convert the initial APIC ID into the socket ID */
14937 - c->phys_proc_id = c->initial_apicid >> bits;
14939 -#ifdef CONFIG_NUMA
14940 - node = c->phys_proc_id;
14941 - if (apicid_to_node[apicid] != NUMA_NO_NODE)
14942 - node = apicid_to_node[apicid];
14943 - if (!node_online(node)) {
14944 - /* Two possibilities here:
14945 - - The CPU is missing memory and no node was created.
14946 - In that case try picking one from a nearby CPU
14947 - - The APIC IDs differ from the HyperTransport node IDs
14948 - which the K8 northbridge parsing fills in.
14949 - Assume they are all increased by a constant offset,
14950 - but in the same order as the HT nodeids.
14951 - If that doesn't result in a usable node fall back to the
14952 - path for the previous case. */
14954 - int ht_nodeid = c->initial_apicid;
14956 - if (ht_nodeid >= 0 &&
14957 - apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
14958 - node = apicid_to_node[ht_nodeid];
14959 - /* Pick a nearby node */
14960 - if (!node_online(node))
14961 - node = nearby_node(apicid);
14963 - numa_set_node(cpu, node);
14965 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
14970 -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
14973 - unsigned bits, ecx;
14975 - /* Multi core CPU? */
14976 - if (c->extended_cpuid_level < 0x80000008)
14979 - ecx = cpuid_ecx(0x80000008);
14981 - c->x86_max_cores = (ecx & 0xff) + 1;
14983 - /* CPU telling us the core id bits shift? */
14984 - bits = (ecx >> 12) & 0xF;
14986 - /* Otherwise recompute */
14988 - while ((1 << bits) < c->x86_max_cores)
14992 - c->x86_coreid_bits = bits;
14997 -#define ENABLE_C1E_MASK 0x18000000
14998 -#define CPUID_PROCESSOR_SIGNATURE 1
14999 -#define CPUID_XFAM 0x0ff00000
15000 -#define CPUID_XFAM_K8 0x00000000
15001 -#define CPUID_XFAM_10H 0x00100000
15002 -#define CPUID_XFAM_11H 0x00200000
15003 -#define CPUID_XMOD 0x000f0000
15004 -#define CPUID_XMOD_REV_F 0x00040000
15006 -#ifndef CONFIG_XEN
15007 -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
15008 -static __cpuinit int amd_apic_timer_broken(void)
15010 - u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
15012 - switch (eax & CPUID_XFAM) {
15013 - case CPUID_XFAM_K8:
15014 - if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
15016 - case CPUID_XFAM_10H:
15017 - case CPUID_XFAM_11H:
15018 - rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
15019 - if (lo & ENABLE_C1E_MASK)
15023 - /* err on the side of caution */
15030 -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
15032 - early_init_amd_mc(c);
15034 - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
15035 - if (c->x86_power & (1<<8))
15036 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15039 -static void __cpuinit init_amd(struct cpuinfo_x86 *c)
15044 - unsigned long value;
15047 - * Disable TLB flush filter by setting HWCR.FFDIS on K8
15048 - * bit 6 of msr C001_0015
15050 - * Errata 63 for SH-B3 steppings
15051 - * Errata 122 for all steppings (F+ have it disabled by default)
15053 - if (c->x86 == 15) {
15054 - rdmsrl(MSR_K8_HWCR, value);
15056 - wrmsrl(MSR_K8_HWCR, value);
15060 - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
15061 - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
15062 - clear_cpu_cap(c, 0*32+31);
15064 - /* On C+ stepping K8 rep microcode works well for copy/memset */
15065 - level = cpuid_eax(1);
15066 - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
15067 - level >= 0x0f58))
15068 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15069 - if (c->x86 == 0x10 || c->x86 == 0x11)
15070 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15072 - /* Enable workaround for FXSAVE leak */
15074 - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
15076 - level = get_model_name(c);
15078 - switch (c->x86) {
15080 - /* Should distinguish Models here, but this is only
15081 - a fallback anyways. */
15082 - strcpy(c->x86_model_id, "Hammer");
15086 - display_cacheinfo(c);
15088 - /* Multi core CPU? */
15089 - if (c->extended_cpuid_level >= 0x80000008)
15090 - amd_detect_cmp(c);
15092 - if (c->extended_cpuid_level >= 0x80000006 &&
15093 - (cpuid_edx(0x80000006) & 0xf000))
15094 - num_cache_leaves = 4;
15096 - num_cache_leaves = 3;
15098 - if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
15099 - set_cpu_cap(c, X86_FEATURE_K8);
15101 - /* MFENCE stops RDTSC speculation */
15102 - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
15104 - if (c->x86 == 0x10)
15105 - fam10h_check_enable_mmcfg();
15107 -#ifndef CONFIG_XEN
15108 - if (amd_apic_timer_broken())
15109 - disable_apic_timer = 1;
15111 - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
15112 - unsigned long long tseg;
15115 - * Split up direct mapping around the TSEG SMM area.
15116 - * Don't do it for gbpages because there seems very little
15117 - * benefit in doing so.
15119 - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
15120 - (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
15121 - set_memory_4k((unsigned long)__va(tseg), 1);
15126 -void __cpuinit detect_ht(struct cpuinfo_x86 *c)
15129 - u32 eax, ebx, ecx, edx;
15130 - int index_msb, core_bits;
15132 - cpuid(1, &eax, &ebx, &ecx, &edx);
15135 - if (!cpu_has(c, X86_FEATURE_HT))
15137 - if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
15140 - smp_num_siblings = (ebx & 0xff0000) >> 16;
15142 - if (smp_num_siblings == 1) {
15143 - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
15144 - } else if (smp_num_siblings > 1) {
15146 - if (smp_num_siblings > NR_CPUS) {
15147 - printk(KERN_WARNING "CPU: Unsupported number of "
15148 - "siblings %d", smp_num_siblings);
15149 - smp_num_siblings = 1;
15153 - index_msb = get_count_order(smp_num_siblings);
15154 - c->phys_proc_id = phys_pkg_id(index_msb);
15156 - smp_num_siblings = smp_num_siblings / c->x86_max_cores;
15158 - index_msb = get_count_order(smp_num_siblings);
15160 - core_bits = get_count_order(c->x86_max_cores);
15162 - c->cpu_core_id = phys_pkg_id(index_msb) &
15163 - ((1 << core_bits) - 1);
15166 - if ((c->x86_max_cores * smp_num_siblings) > 1) {
15167 - printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
15168 - c->phys_proc_id);
15169 - printk(KERN_INFO "CPU: Processor Core ID: %d\n",
15177 - * find out the number of processor cores on the die
15179 -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
15181 - unsigned int eax, t;
15183 - if (c->cpuid_level < 4)
15186 - cpuid_count(4, 0, &eax, &t, &t, &t);
15189 - return ((eax >> 26) + 1);
15194 -static void __cpuinit srat_detect_node(void)
15196 -#ifdef CONFIG_NUMA
15198 - int cpu = smp_processor_id();
15199 - int apicid = hard_smp_processor_id();
15201 - /* Don't do the funky fallback heuristics the AMD version employs
15203 - node = apicid_to_node[apicid];
15204 - if (node == NUMA_NO_NODE || !node_online(node))
15205 - node = first_node(node_online_map);
15206 - numa_set_node(cpu, node);
15208 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
15212 -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
15214 - if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
15215 - (c->x86 == 0x6 && c->x86_model >= 0x0e))
15216 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15219 -static void __cpuinit init_intel(struct cpuinfo_x86 *c)
15221 - /* Cache sizes */
15224 - init_intel_cacheinfo(c);
15225 - if (c->cpuid_level > 9) {
15226 - unsigned eax = cpuid_eax(10);
15227 - /* Check for version and the number of counters */
15228 - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
15229 - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
15232 - if (cpu_has_ds) {
15233 - unsigned int l1, l2;
15234 - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
15235 - if (!(l1 & (1<<11)))
15236 - set_cpu_cap(c, X86_FEATURE_BTS);
15237 - if (!(l1 & (1<<12)))
15238 - set_cpu_cap(c, X86_FEATURE_PEBS);
15243 - ds_init_intel(c);
15245 - n = c->extended_cpuid_level;
15246 - if (n >= 0x80000008) {
15247 - unsigned eax = cpuid_eax(0x80000008);
15248 - c->x86_virt_bits = (eax >> 8) & 0xff;
15249 - c->x86_phys_bits = eax & 0xff;
15250 - /* CPUID workaround for Intel 0F34 CPU */
15251 - if (c->x86_vendor == X86_VENDOR_INTEL &&
15252 - c->x86 == 0xF && c->x86_model == 0x3 &&
15253 - c->x86_mask == 0x4)
15254 - c->x86_phys_bits = 36;
15257 - if (c->x86 == 15)
15258 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15260 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15261 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15262 - c->x86_max_cores = intel_num_cpu_cores(c);
15264 - srat_detect_node();
15267 -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
15269 - if (c->x86 == 0x6 && c->x86_model >= 0xf)
15270 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15273 -static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
15275 - /* Cache sizes */
15278 - n = c->extended_cpuid_level;
15279 - if (n >= 0x80000008) {
15280 - unsigned eax = cpuid_eax(0x80000008);
15281 - c->x86_virt_bits = (eax >> 8) & 0xff;
15282 - c->x86_phys_bits = eax & 0xff;
15285 - if (c->x86 == 0x6 && c->x86_model >= 0xf) {
15286 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15287 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15288 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15290 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15293 -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
15295 - char *v = c->x86_vendor_id;
15297 - if (!strcmp(v, "AuthenticAMD"))
15298 - c->x86_vendor = X86_VENDOR_AMD;
15299 - else if (!strcmp(v, "GenuineIntel"))
15300 - c->x86_vendor = X86_VENDOR_INTEL;
15301 - else if (!strcmp(v, "CentaurHauls"))
15302 - c->x86_vendor = X86_VENDOR_CENTAUR;
15304 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15307 -/* Do some early cpuid on the boot CPU to get some parameter that are
15308 - needed before check_bugs. Everything advanced is in identify_cpu
15310 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
15314 - c->loops_per_jiffy = loops_per_jiffy;
15315 - c->x86_cache_size = -1;
15316 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15317 - c->x86_model = c->x86_mask = 0; /* So far unknown... */
15318 - c->x86_vendor_id[0] = '\0'; /* Unset */
15319 - c->x86_model_id[0] = '\0'; /* Unset */
15320 - c->x86_clflush_size = 64;
15321 - c->x86_cache_alignment = c->x86_clflush_size;
15322 - c->x86_max_cores = 1;
15323 - c->x86_coreid_bits = 0;
15324 - c->extended_cpuid_level = 0;
15325 - memset(&c->x86_capability, 0, sizeof c->x86_capability);
15327 - /* Get vendor name */
15328 - cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
15329 - (unsigned int *)&c->x86_vendor_id[0],
15330 - (unsigned int *)&c->x86_vendor_id[8],
15331 - (unsigned int *)&c->x86_vendor_id[4]);
15333 - get_cpu_vendor(c);
15335 - /* Initialize the standard set of capabilities */
15336 - /* Note that the vendor-specific code below might override */
15338 - /* Intel-defined flags: level 0x00000001 */
15339 - if (c->cpuid_level >= 0x00000001) {
15341 - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
15342 - &c->x86_capability[0]);
15343 - c->x86 = (tfms >> 8) & 0xf;
15344 - c->x86_model = (tfms >> 4) & 0xf;
15345 - c->x86_mask = tfms & 0xf;
15346 - if (c->x86 == 0xf)
15347 - c->x86 += (tfms >> 20) & 0xff;
15348 - if (c->x86 >= 0x6)
15349 - c->x86_model += ((tfms >> 16) & 0xF) << 4;
15350 - if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
15351 - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
15353 - /* Have CPUID level 0 only - unheard of */
15357 - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
15359 - c->phys_proc_id = c->initial_apicid;
15361 - /* AMD-defined flags: level 0x80000001 */
15362 - xlvl = cpuid_eax(0x80000000);
15363 - c->extended_cpuid_level = xlvl;
15364 - if ((xlvl & 0xffff0000) == 0x80000000) {
15365 - if (xlvl >= 0x80000001) {
15366 - c->x86_capability[1] = cpuid_edx(0x80000001);
15367 - c->x86_capability[6] = cpuid_ecx(0x80000001);
15369 - if (xlvl >= 0x80000004)
15370 - get_model_name(c); /* Default name */
15373 - /* Transmeta-defined flags: level 0x80860001 */
15374 - xlvl = cpuid_eax(0x80860000);
15375 - if ((xlvl & 0xffff0000) == 0x80860000) {
15376 - /* Don't set x86_cpuid_level here for now to not confuse. */
15377 - if (xlvl >= 0x80860001)
15378 - c->x86_capability[2] = cpuid_edx(0x80860001);
15381 - c->extended_cpuid_level = cpuid_eax(0x80000000);
15382 - if (c->extended_cpuid_level >= 0x80000007)
15383 - c->x86_power = cpuid_edx(0x80000007);
15385 - switch (c->x86_vendor) {
15386 - case X86_VENDOR_AMD:
15387 - early_init_amd(c);
15389 - case X86_VENDOR_INTEL:
15390 - early_init_intel(c);
15392 - case X86_VENDOR_CENTAUR:
15393 - early_init_centaur(c);
15397 - validate_pat_support(c);
15401 - * This does the hard work of actually picking apart the CPU stuff...
15403 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
15407 - early_identify_cpu(c);
15409 - init_scattered_cpuid_features(c);
15411 - c->apicid = phys_pkg_id(0);
15414 - * Vendor-specific initialization. In this section we
15415 - * canonicalize the feature flags, meaning if there are
15416 - * features a certain CPU supports which CPUID doesn't
15417 - * tell us, CPUID claiming incorrect flags, or other bugs,
15418 - * we handle them here.
15420 - * At the end of this section, c->x86_capability better
15421 - * indicate the features this CPU genuinely supports!
15423 - switch (c->x86_vendor) {
15424 - case X86_VENDOR_AMD:
15428 - case X86_VENDOR_INTEL:
15432 - case X86_VENDOR_CENTAUR:
15436 - case X86_VENDOR_UNKNOWN:
15438 - display_cacheinfo(c);
15445 - * On SMP, boot_cpu_data holds the common feature set between
15446 - * all CPUs; so make sure that we indicate which features are
15447 - * common between the CPUs. The first time this routine gets
15448 - * executed, c == &boot_cpu_data.
15450 - if (c != &boot_cpu_data) {
15451 - /* AND the already accumulated flags with these */
15452 - for (i = 0; i < NCAPINTS; i++)
15453 - boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
15456 - /* Clear all flags overriden by options */
15457 - for (i = 0; i < NCAPINTS; i++)
15458 - c->x86_capability[i] &= ~cleared_cpu_caps[i];
15460 -#ifdef CONFIG_X86_MCE
15463 - select_idle_routine(c);
15465 -#ifdef CONFIG_NUMA
15466 - numa_add_cpu(smp_processor_id());
15471 -void __cpuinit identify_boot_cpu(void)
15473 - identify_cpu(&boot_cpu_data);
15476 -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
15478 - BUG_ON(c == &boot_cpu_data);
15483 -static __init int setup_noclflush(char *arg)
15485 - setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
15488 -__setup("noclflush", setup_noclflush);
15490 -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
15492 - if (c->x86_model_id[0])
15493 - printk(KERN_CONT "%s", c->x86_model_id);
15495 - if (c->x86_mask || c->cpuid_level >= 0)
15496 - printk(KERN_CONT " stepping %02x\n", c->x86_mask);
15498 - printk(KERN_CONT "\n");
15501 -static __init int setup_disablecpuid(char *arg)
15504 - if (get_option(&arg, &bit) && bit < NCAPINTS*32)
15505 - setup_clear_cpu_cap(bit);
15510 -__setup("clearcpuid=", setup_disablecpuid);
15511 Index: head-2008-12-01/arch/x86/kernel/setup_percpu-xen.c
15512 ===================================================================
15513 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15514 +++ head-2008-12-01/arch/x86/kernel/setup_percpu-xen.c 2008-12-01 11:49:07.000000000 +0100
15516 +#include <linux/kernel.h>
15517 +#include <linux/module.h>
15518 +#include <linux/init.h>
15519 +#include <linux/bootmem.h>
15520 +#include <linux/percpu.h>
15521 +#include <linux/kexec.h>
15522 +#include <linux/crash_dump.h>
15523 +#include <asm/smp.h>
15524 +#include <asm/percpu.h>
15525 +#include <asm/sections.h>
15526 +#include <asm/processor.h>
15527 +#include <asm/setup.h>
15528 +#include <asm/topology.h>
15529 +#include <asm/mpspec.h>
15530 +#include <asm/apicdef.h>
15531 +#include <asm/highmem.h>
15533 +#ifdef CONFIG_X86_LOCAL_APIC
15534 +unsigned int num_processors;
15535 +unsigned disabled_cpus __cpuinitdata;
15536 +/* Processor that is doing the boot up */
15537 +unsigned int boot_cpu_physical_apicid = -1U;
15538 +unsigned int max_physical_apicid;
15539 +EXPORT_SYMBOL(boot_cpu_physical_apicid);
15541 +/* Bitmask of physically existing CPUs */
15542 +physid_mask_t phys_cpu_present_map;
15545 +/* map cpu index to physical APIC ID */
15546 +#ifndef CONFIG_XEN
15547 +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
15548 +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
15549 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15550 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
15552 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
15553 +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15556 +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
15557 +#define X86_64_NUMA 1
15559 +/* map cpu index to node index */
15560 +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
15561 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
15563 +/* which logical CPUs are on which nodes */
15564 +cpumask_t *node_to_cpumask_map;
15565 +EXPORT_SYMBOL(node_to_cpumask_map);
15567 +/* setup node_to_cpumask_map */
15568 +static void __init setup_node_to_cpumask_map(void);
15571 +static inline void setup_node_to_cpumask_map(void) { }
15574 +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
15576 + * Copy data used in early init routines from the initial arrays to the
15577 + * per cpu data areas. These arrays then become expendable and the
15578 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
15580 +static void __init setup_per_cpu_maps(void)
15582 +#ifndef CONFIG_XEN
15585 + for_each_possible_cpu(cpu) {
15586 + per_cpu(x86_cpu_to_apicid, cpu) =
15587 + early_per_cpu_map(x86_cpu_to_apicid, cpu);
15588 + per_cpu(x86_bios_cpu_apicid, cpu) =
15589 + early_per_cpu_map(x86_bios_cpu_apicid, cpu);
15590 +#ifdef X86_64_NUMA
15591 + per_cpu(x86_cpu_to_node_map, cpu) =
15592 + early_per_cpu_map(x86_cpu_to_node_map, cpu);
15596 + /* indicate the early static arrays will soon be gone */
15597 + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
15598 + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
15599 +#ifdef X86_64_NUMA
15600 + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
15605 +#ifdef CONFIG_X86_32
15607 + * Great future not-so-futuristic plan: make i386 and x86_64 do it
15610 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
15611 +EXPORT_SYMBOL(__per_cpu_offset);
15612 +static inline void setup_cpu_pda_map(void) { }
15614 +#elif !defined(CONFIG_SMP)
15615 +static inline void setup_cpu_pda_map(void) { }
15617 +#else /* CONFIG_SMP && CONFIG_X86_64 */
15620 + * Allocate cpu_pda pointer table and array via alloc_bootmem.
15622 +static void __init setup_cpu_pda_map(void)
15625 + struct x8664_pda **new_cpu_pda;
15626 + unsigned long size;
15629 + size = roundup(sizeof(struct x8664_pda), cache_line_size());
15631 + /* allocate cpu_pda array and pointer table */
15633 + unsigned long tsize = nr_cpu_ids * sizeof(void *);
15634 + unsigned long asize = size * (nr_cpu_ids - 1);
15636 + tsize = roundup(tsize, cache_line_size());
15637 + new_cpu_pda = alloc_bootmem(tsize + asize);
15638 + pda = (char *)new_cpu_pda + tsize;
15641 + /* initialize pointer table to static pda's */
15642 + for_each_possible_cpu(cpu) {
15644 + /* leave boot cpu pda in place */
15645 + new_cpu_pda[0] = cpu_pda(0);
15648 + new_cpu_pda[cpu] = (struct x8664_pda *)pda;
15649 + new_cpu_pda[cpu]->in_bootmem = 1;
15653 + /* point to new pointer table */
15654 + _cpu_pda = new_cpu_pda;
15659 + * Great future plan:
15660 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
15661 + * Always point %gs to its beginning
15663 +void __init setup_per_cpu_areas(void)
15665 + ssize_t size = PERCPU_ENOUGH_ROOM;
15669 + /* Setup cpu_pda map */
15670 + setup_cpu_pda_map();
15672 + /* Copy section for each CPU (we discard the original) */
15673 + size = PERCPU_ENOUGH_ROOM;
15674 + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
15677 + for_each_possible_cpu(cpu) {
15678 +#ifndef CONFIG_NEED_MULTIPLE_NODES
15679 + ptr = alloc_bootmem_pages(size);
15681 + int node = early_cpu_to_node(cpu);
15682 + if (!node_online(node) || !NODE_DATA(node)) {
15683 + ptr = alloc_bootmem_pages(size);
15685 + "cpu %d has no node %d or node-local memory\n",
15689 + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
15691 + per_cpu_offset(cpu) = ptr - __per_cpu_start;
15692 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
15696 + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
15697 + NR_CPUS, nr_cpu_ids, nr_node_ids);
15699 + /* Setup percpu data maps */
15700 + setup_per_cpu_maps();
15702 + /* Setup node to cpumask map */
15703 + setup_node_to_cpumask_map();
15708 +#ifdef X86_64_NUMA
15711 + * Allocate node_to_cpumask_map based on number of available nodes
15712 + * Requires node_possible_map to be valid.
15714 + * Note: node_to_cpumask() is not valid until after this is done.
15716 +static void __init setup_node_to_cpumask_map(void)
15718 + unsigned int node, num = 0;
15721 + /* setup nr_node_ids if not done yet */
15722 + if (nr_node_ids == MAX_NUMNODES) {
15723 + for_each_node_mask(node, node_possible_map)
15725 + nr_node_ids = num + 1;
15728 + /* allocate the map */
15729 + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
15731 + pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
15732 + map, nr_node_ids);
15734 + /* node_to_cpumask() will now work */
15735 + node_to_cpumask_map = map;
15738 +void __cpuinit numa_set_node(int cpu, int node)
15740 + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
15742 + if (cpu_pda(cpu) && node != NUMA_NO_NODE)
15743 + cpu_pda(cpu)->nodenumber = node;
15745 + if (cpu_to_node_map)
15746 + cpu_to_node_map[cpu] = node;
15748 + else if (per_cpu_offset(cpu))
15749 + per_cpu(x86_cpu_to_node_map, cpu) = node;
15752 + pr_debug("Setting node for non-present cpu %d\n", cpu);
15755 +void __cpuinit numa_clear_node(int cpu)
15757 + numa_set_node(cpu, NUMA_NO_NODE);
15760 +#ifndef CONFIG_DEBUG_PER_CPU_MAPS
15762 +void __cpuinit numa_add_cpu(int cpu)
15764 + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
15767 +void __cpuinit numa_remove_cpu(int cpu)
15769 + cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
15772 +#else /* CONFIG_DEBUG_PER_CPU_MAPS */
15775 + * --------- debug versions of the numa functions ---------
15777 +static void __cpuinit numa_set_cpumask(int cpu, int enable)
15779 + int node = cpu_to_node(cpu);
15783 + if (node_to_cpumask_map == NULL) {
15784 + printk(KERN_ERR "node_to_cpumask_map NULL\n");
15789 + mask = &node_to_cpumask_map[node];
15791 + cpu_set(cpu, *mask);
15793 + cpu_clear(cpu, *mask);
15795 + cpulist_scnprintf(buf, sizeof(buf), *mask);
15796 + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
15797 + enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
15800 +void __cpuinit numa_add_cpu(int cpu)
15802 + numa_set_cpumask(cpu, 1);
15805 +void __cpuinit numa_remove_cpu(int cpu)
15807 + numa_set_cpumask(cpu, 0);
15810 +int cpu_to_node(int cpu)
15812 + if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
15813 + printk(KERN_WARNING
15814 + "cpu_to_node(%d): usage too early!\n", cpu);
15816 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15818 + return per_cpu(x86_cpu_to_node_map, cpu);
15820 +EXPORT_SYMBOL(cpu_to_node);
15823 + * Same function as cpu_to_node() but used if called before the
15824 + * per_cpu areas are setup.
15826 +int early_cpu_to_node(int cpu)
15828 + if (early_per_cpu_ptr(x86_cpu_to_node_map))
15829 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15831 + if (!per_cpu_offset(cpu)) {
15832 + printk(KERN_WARNING
15833 + "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
15835 + return NUMA_NO_NODE;
15837 + return per_cpu(x86_cpu_to_node_map, cpu);
15841 +/* empty cpumask */
15842 +static const cpumask_t cpu_mask_none;
15845 + * Returns a pointer to the bitmask of CPUs on Node 'node'.
15847 +const cpumask_t *_node_to_cpumask_ptr(int node)
15849 + if (node_to_cpumask_map == NULL) {
15850 + printk(KERN_WARNING
15851 + "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
15854 + return (const cpumask_t *)&cpu_online_map;
15856 + if (node >= nr_node_ids) {
15857 + printk(KERN_WARNING
15858 + "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
15859 + node, nr_node_ids);
15861 + return &cpu_mask_none;
15863 + return &node_to_cpumask_map[node];
15865 +EXPORT_SYMBOL(_node_to_cpumask_ptr);
15868 + * Returns a bitmask of CPUs on Node 'node'.
15870 + * Side note: this function creates the returned cpumask on the stack
15871 + * so with a high NR_CPUS count, excessive stack space is used. The
15872 + * node_to_cpumask_ptr function should be used whenever possible.
15874 +cpumask_t node_to_cpumask(int node)
15876 + if (node_to_cpumask_map == NULL) {
15877 + printk(KERN_WARNING
15878 + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
15880 + return cpu_online_map;
15882 + if (node >= nr_node_ids) {
15883 + printk(KERN_WARNING
15884 + "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
15885 + node, nr_node_ids);
15887 + return cpu_mask_none;
15889 + return node_to_cpumask_map[node];
15891 +EXPORT_SYMBOL(node_to_cpumask);
15894 + * --------- end of debug versions of the numa functions ---------
15897 +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
15899 +#endif /* X86_64_NUMA */
15901 Index: head-2008-12-01/arch/x86/kernel/smp-xen.c
15902 ===================================================================
15903 --- head-2008-12-01.orig/arch/x86/kernel/smp-xen.c 2008-12-01 11:44:55.000000000 +0100
15904 +++ head-2008-12-01/arch/x86/kernel/smp-xen.c 2008-12-01 11:49:07.000000000 +0100
15905 @@ -121,132 +121,14 @@ void xen_smp_send_reschedule(int cpu)
15906 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
15910 - * Structure and data for smp_call_function(). This is designed to minimise
15911 - * static memory requirements. It also looks cleaner.
15913 -static DEFINE_SPINLOCK(call_lock);
15915 -struct call_data_struct {
15916 - void (*func) (void *info);
15918 - atomic_t started;
15919 - atomic_t finished;
15923 -void lock_ipi_call_lock(void)
15924 +void xen_send_call_func_single_ipi(int cpu)
15926 - spin_lock_irq(&call_lock);
15927 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR);
15930 -void unlock_ipi_call_lock(void)
15931 +void xen_send_call_func_ipi(cpumask_t mask)
15933 - spin_unlock_irq(&call_lock);
15936 -static struct call_data_struct *call_data;
15938 -static void __smp_call_function(void (*func) (void *info), void *info,
15939 - int nonatomic, int wait)
15941 - struct call_data_struct data;
15942 - int cpus = num_online_cpus() - 1;
15947 - data.func = func;
15948 - data.info = info;
15949 - atomic_set(&data.started, 0);
15950 - data.wait = wait;
15952 - atomic_set(&data.finished, 0);
15954 - call_data = &data;
15957 - /* Send a message to all other CPUs and wait for them to respond */
15958 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
15960 - /* Wait for response */
15961 - while (atomic_read(&data.started) != cpus)
15965 - while (atomic_read(&data.finished) != cpus)
15971 - * smp_call_function_mask(): Run a function on a set of other CPUs.
15972 - * @mask: The set of cpus to run on. Must not include the current cpu.
15973 - * @func: The function to run. This must be fast and non-blocking.
15974 - * @info: An arbitrary pointer to pass to the function.
15975 - * @wait: If true, wait (atomically) until function has completed on other CPUs.
15977 - * Returns 0 on success, else a negative status code.
15979 - * If @wait is true, then returns once @func has returned; otherwise
15980 - * it returns just before the target cpu calls @func.
15982 - * You must not call this function with disabled interrupts or from a
15983 - * hardware interrupt handler or from a bottom half handler.
15986 -xen_smp_call_function_mask(cpumask_t mask,
15987 - void (*func)(void *), void *info,
15990 - struct call_data_struct data;
15991 - cpumask_t allbutself;
15994 - /* Can deadlock when called with interrupts disabled */
15995 - WARN_ON(irqs_disabled());
15997 - /* Holding any lock stops cpus from going down. */
15998 - spin_lock(&call_lock);
16000 - allbutself = cpu_online_map;
16001 - cpu_clear(smp_processor_id(), allbutself);
16003 - cpus_and(mask, mask, allbutself);
16004 - cpus = cpus_weight(mask);
16007 - spin_unlock(&call_lock);
16011 - data.func = func;
16012 - data.info = info;
16013 - atomic_set(&data.started, 0);
16014 - data.wait = wait;
16016 - atomic_set(&data.finished, 0);
16018 - call_data = &data;
16021 - /* Send a message to other CPUs */
16022 - if (cpus_equal(mask, allbutself) &&
16023 - cpus_equal(cpu_online_map, cpu_callout_map))
16024 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
16026 - send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16028 - /* Wait for response */
16029 - while (atomic_read(&data.started) != cpus)
16033 - while (atomic_read(&data.finished) != cpus)
16035 - spin_unlock(&call_lock);
16038 + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16041 static void stop_this_cpu(void *dummy)
16042 @@ -268,15 +150,10 @@ static void stop_this_cpu(void *dummy)
16044 void xen_smp_send_stop(void)
16047 unsigned long flags;
16049 - /* Don't deadlock on the call lock in panic */
16050 - nolock = !spin_trylock(&call_lock);
16051 + smp_call_function(stop_this_cpu, NULL, 0);
16052 local_irq_save(flags);
16053 - __smp_call_function(stop_this_cpu, NULL, 0, 0);
16055 - spin_unlock(&call_lock);
16056 disable_all_local_evtchn();
16057 local_irq_restore(flags);
16059 @@ -298,21 +175,8 @@ irqreturn_t smp_reschedule_interrupt(int
16061 irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
16063 - void (*func) (void *info) = call_data->func;
16064 - void *info = call_data->info;
16065 - int wait = call_data->wait;
16068 - * Notify initiating CPU that I've grabbed the data and am
16069 - * about to execute the function
16072 - atomic_inc(&call_data->started);
16074 - * At this point the info structure may be out of scope unless wait==1
16078 + generic_smp_call_function_interrupt();
16079 #ifdef CONFIG_X86_32
16080 __get_cpu_var(irq_stat).irq_call_count++;
16082 @@ -320,10 +184,19 @@ irqreturn_t smp_call_function_interrupt(
16088 - atomic_inc(&call_data->finished);
16090 + return IRQ_HANDLED;
16093 +irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id)
16096 + generic_smp_call_function_single_interrupt();
16097 +#ifdef CONFIG_X86_32
16098 + __get_cpu_var(irq_stat).irq_call_count++;
16100 + add_pda(irq_call_count, 1);
16104 return IRQ_HANDLED;
16106 Index: head-2008-12-01/arch/x86/kernel/time_32-xen.c
16107 ===================================================================
16108 --- head-2008-12-01.orig/arch/x86/kernel/time_32-xen.c 2008-12-01 11:44:55.000000000 +0100
16109 +++ head-2008-12-01/arch/x86/kernel/time_32-xen.c 2008-12-01 11:58:30.000000000 +0100
16110 @@ -470,7 +470,7 @@ irqreturn_t timer_interrupt(int irq, voi
16112 /* Keep nmi watchdog up to date */
16114 - per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
16115 + x86_add_percpu(irq_stat.irq0_irqs, 1);
16117 add_pda(irq0_irqs, 1);
16119 @@ -748,9 +748,7 @@ void __init time_init(void)
16121 update_wallclock();
16123 -#ifndef CONFIG_X86_64
16127 /* Cannot request_irq() until kmem is initialised. */
16128 late_time_init = setup_cpu0_timer_irq;
16129 @@ -807,7 +805,8 @@ static void stop_hz_timer(void)
16131 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
16132 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
16133 - (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
16134 + (j = get_next_timer_interrupt(jiffies),
16135 + time_before_eq(j, jiffies))) {
16136 cpu_clear(cpu, nohz_cpu_mask);
16139 Index: head-2008-12-01/arch/x86/kernel/traps_32-xen.c
16140 ===================================================================
16141 --- head-2008-12-01.orig/arch/x86/kernel/traps_32-xen.c 2008-12-01 11:44:55.000000000 +0100
16142 +++ head-2008-12-01/arch/x86/kernel/traps_32-xen.c 2008-12-01 11:49:07.000000000 +0100
16145 * Copyright (C) 1991, 1992 Linus Torvalds
16146 + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
16148 * Pentium III FXSR, SSE support
16149 * Gareth Hughes <gareth@valinux.com>, May 2000
16150 @@ -57,11 +58,10 @@
16151 #include <asm/nmi.h>
16152 #include <asm/smp.h>
16153 #include <asm/io.h>
16154 +#include <asm/traps.h>
16156 #include "mach_traps.h"
16158 -int panic_on_unrecovered_nmi;
16161 DECLARE_BITMAP(used_vectors, NR_VECTORS);
16162 EXPORT_SYMBOL_GPL(used_vectors);
16163 @@ -82,43 +82,22 @@ gate_desc idt_table[256]
16164 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
16167 -asmlinkage void divide_error(void);
16168 -asmlinkage void debug(void);
16169 -asmlinkage void nmi(void);
16170 -asmlinkage void int3(void);
16171 -asmlinkage void overflow(void);
16172 -asmlinkage void bounds(void);
16173 -asmlinkage void invalid_op(void);
16174 -asmlinkage void device_not_available(void);
16175 -asmlinkage void coprocessor_segment_overrun(void);
16176 -asmlinkage void invalid_TSS(void);
16177 -asmlinkage void segment_not_present(void);
16178 -asmlinkage void stack_segment(void);
16179 -asmlinkage void general_protection(void);
16180 -asmlinkage void page_fault(void);
16181 -asmlinkage void coprocessor_error(void);
16182 -asmlinkage void simd_coprocessor_error(void);
16183 -asmlinkage void alignment_check(void);
16184 -#ifndef CONFIG_XEN
16185 -asmlinkage void spurious_interrupt_bug(void);
16187 -asmlinkage void fixup_4gb_segment(void);
16189 -asmlinkage void machine_check(void);
16191 +int panic_on_unrecovered_nmi;
16192 int kstack_depth_to_print = 24;
16193 static unsigned int code_bytes = 64;
16194 +static int ignore_nmis;
16195 +static int die_counter;
16197 void printk_address(unsigned long address, int reliable)
16199 #ifdef CONFIG_KALLSYMS
16200 - char namebuf[KSYM_NAME_LEN];
16201 unsigned long offset = 0;
16202 unsigned long symsize;
16203 const char *symname;
16204 - char reliab[4] = "";
16205 - char *delim = ":";
16207 + char *delim = ":";
16208 + char namebuf[KSYM_NAME_LEN];
16209 + char reliab[4] = "";
16211 symname = kallsyms_lookup(address, &symsize, &offset,
16212 &modname, namebuf);
16213 @@ -138,22 +117,23 @@ void printk_address(unsigned long addres
16217 -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
16218 +static inline int valid_stack_ptr(struct thread_info *tinfo,
16219 + void *p, unsigned int size)
16221 - return p > (void *)tinfo &&
16222 - p <= (void *)tinfo + THREAD_SIZE - size;
16224 + return p > t && p <= t + THREAD_SIZE - size;
16227 /* The form of the top of the frame on the stack */
16228 struct stack_frame {
16229 - struct stack_frame *next_frame;
16230 - unsigned long return_address;
16231 + struct stack_frame *next_frame;
16232 + unsigned long return_address;
16235 static inline unsigned long
16236 print_context_stack(struct thread_info *tinfo,
16237 - unsigned long *stack, unsigned long bp,
16238 - const struct stacktrace_ops *ops, void *data)
16239 + unsigned long *stack, unsigned long bp,
16240 + const struct stacktrace_ops *ops, void *data)
16242 struct stack_frame *frame = (struct stack_frame *)bp;
16244 @@ -175,8 +155,6 @@ print_context_stack(struct thread_info *
16248 -#define MSG(msg) ops->warning(data, msg)
16250 void dump_trace(struct task_struct *task, struct pt_regs *regs,
16251 unsigned long *stack, unsigned long bp,
16252 const struct stacktrace_ops *ops, void *data)
16253 @@ -186,7 +164,6 @@ void dump_trace(struct task_struct *task
16256 unsigned long dummy;
16259 if (task != current)
16260 stack = (unsigned long *)task->thread.sp;
16261 @@ -204,7 +181,7 @@ void dump_trace(struct task_struct *task
16267 struct thread_info *context;
16269 context = (struct thread_info *)
16270 @@ -256,15 +233,15 @@ static void print_trace_address(void *da
16273 static const struct stacktrace_ops print_trace_ops = {
16274 - .warning = print_trace_warning,
16275 - .warning_symbol = print_trace_warning_symbol,
16276 - .stack = print_trace_stack,
16277 - .address = print_trace_address,
16278 + .warning = print_trace_warning,
16279 + .warning_symbol = print_trace_warning_symbol,
16280 + .stack = print_trace_stack,
16281 + .address = print_trace_address,
16285 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16286 - unsigned long *stack, unsigned long bp, char *log_lvl)
16287 + unsigned long *stack, unsigned long bp, char *log_lvl)
16289 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16290 printk("%s =======================\n", log_lvl);
16291 @@ -359,15 +336,14 @@ void show_registers(struct pt_regs *regs
16292 printk(KERN_EMERG "Code: ");
16294 ip = (u8 *)regs->ip - code_prologue;
16295 - if (ip < (u8 *)PAGE_OFFSET ||
16296 - probe_kernel_address(ip, c)) {
16297 + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
16298 /* try starting at EIP */
16299 ip = (u8 *)regs->ip;
16300 code_len = code_len - code_prologue + 1;
16302 for (i = 0; i < code_len; i++, ip++) {
16303 if (ip < (u8 *)PAGE_OFFSET ||
16304 - probe_kernel_address(ip, c)) {
16305 + probe_kernel_address(ip, c)) {
16306 printk(" Bad EIP value.");
16309 @@ -392,7 +368,53 @@ int is_valid_bugaddr(unsigned long ip)
16310 return ud2 == 0x0b0f;
16313 -static int die_counter;
16314 +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
16315 +static int die_owner = -1;
16316 +static unsigned int die_nest_count;
16318 +unsigned __kprobes long oops_begin(void)
16320 + unsigned long flags;
16324 + if (die_owner != raw_smp_processor_id()) {
16325 + console_verbose();
16326 + raw_local_irq_save(flags);
16327 + __raw_spin_lock(&die_lock);
16328 + die_owner = smp_processor_id();
16329 + die_nest_count = 0;
16330 + bust_spinlocks(1);
16332 + raw_local_irq_save(flags);
16334 + die_nest_count++;
16338 +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
16340 + bust_spinlocks(0);
16342 + add_taint(TAINT_DIE);
16343 + __raw_spin_unlock(&die_lock);
16344 + raw_local_irq_restore(flags);
16349 + if (kexec_should_crash(current))
16350 + crash_kexec(regs);
16352 + if (in_interrupt())
16353 + panic("Fatal exception in interrupt");
16355 + if (panic_on_oops)
16356 + panic("Fatal exception");
16362 int __kprobes __die(const char *str, struct pt_regs *regs, long err)
16364 @@ -410,26 +432,22 @@ int __kprobes __die(const char *str, str
16365 printk("DEBUG_PAGEALLOC");
16369 if (notify_die(DIE_OOPS, str, regs, err,
16370 - current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
16371 + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
16374 - show_registers(regs);
16375 - /* Executive summary in case the oops scrolled away */
16376 - sp = (unsigned long) (®s->sp);
16377 - savesegment(ss, ss);
16378 - if (user_mode(regs)) {
16380 - ss = regs->ss & 0xffff;
16382 - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16383 - print_symbol("%s", regs->ip);
16384 - printk(" SS:ESP %04x:%08lx\n", ss, sp);
16390 + show_registers(regs);
16391 + /* Executive summary in case the oops scrolled away */
16392 + sp = (unsigned long) (&regs->sp);
16393 + savesegment(ss, ss);
16394 + if (user_mode(regs)) {
16396 + ss = regs->ss & 0xffff;
16398 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16399 + print_symbol("%s", regs->ip);
16400 + printk(" SS:ESP %04x:%08lx\n", ss, sp);
16405 @@ -438,31 +456,9 @@ int __kprobes __die(const char *str, str
16407 void die(const char *str, struct pt_regs *regs, long err)
16410 - raw_spinlock_t lock;
16412 - int lock_owner_depth;
16414 - .lock = __RAW_SPIN_LOCK_UNLOCKED,
16415 - .lock_owner = -1,
16416 - .lock_owner_depth = 0
16418 - unsigned long flags;
16421 + unsigned long flags = oops_begin();
16423 - if (die.lock_owner != raw_smp_processor_id()) {
16424 - console_verbose();
16425 - raw_local_irq_save(flags);
16426 - __raw_spin_lock(&die.lock);
16427 - die.lock_owner = smp_processor_id();
16428 - die.lock_owner_depth = 0;
16429 - bust_spinlocks(1);
16431 - raw_local_irq_save(flags);
16434 - if (++die.lock_owner_depth < 3) {
16435 + if (die_nest_count < 3) {
16436 report_bug(regs->ip, regs);
16438 if (__die(str, regs, err))
16439 @@ -471,26 +467,7 @@ void die(const char *str, struct pt_regs
16440 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
16443 - bust_spinlocks(0);
16444 - die.lock_owner = -1;
16445 - add_taint(TAINT_DIE);
16446 - __raw_spin_unlock(&die.lock);
16447 - raw_local_irq_restore(flags);
16452 - if (kexec_should_crash(current))
16453 - crash_kexec(regs);
16455 - if (in_interrupt())
16456 - panic("Fatal exception in interrupt");
16458 - if (panic_on_oops)
16459 - panic("Fatal exception");
16462 - do_exit(SIGSEGV);
16463 + oops_end(flags, regs, SIGSEGV);
16467 @@ -554,7 +531,7 @@ void do_##name(struct pt_regs *regs, lon
16469 trace_hardirqs_fixup(); \
16470 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16471 - == NOTIFY_STOP) \
16472 + == NOTIFY_STOP) \
16474 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
16476 @@ -570,7 +547,7 @@ void do_##name(struct pt_regs *regs, lon
16477 info.si_code = sicode; \
16478 info.si_addr = (void __user *)siaddr; \
16479 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16480 - == NOTIFY_STOP) \
16481 + == NOTIFY_STOP) \
16483 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
16485 @@ -579,7 +556,7 @@ void do_##name(struct pt_regs *regs, lon
16486 void do_##name(struct pt_regs *regs, long error_code) \
16488 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16489 - == NOTIFY_STOP) \
16490 + == NOTIFY_STOP) \
16492 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
16494 @@ -594,28 +571,29 @@ void do_##name(struct pt_regs *regs, lon
16495 info.si_addr = (void __user *)siaddr; \
16496 trace_hardirqs_fixup(); \
16497 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16498 - == NOTIFY_STOP) \
16499 + == NOTIFY_STOP) \
16501 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
16504 -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16505 +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16506 #ifndef CONFIG_KPROBES
16507 DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
16509 DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
16510 DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
16511 -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16512 -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16513 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16514 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16515 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
16516 -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16517 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16518 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16519 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16520 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
16521 DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
16523 -void __kprobes do_general_protection(struct pt_regs * regs,
16526 +do_general_protection(struct pt_regs *regs, long error_code)
16528 + struct task_struct *tsk;
16529 struct thread_struct *thread;
16531 thread = &current->thread;
16532 @@ -623,23 +601,24 @@ void __kprobes do_general_protection(str
16533 if (regs->flags & X86_VM_MASK)
16537 if (!user_mode(regs))
16540 - current->thread.error_code = error_code;
16541 - current->thread.trap_no = 13;
16542 + tsk->thread.error_code = error_code;
16543 + tsk->thread.trap_no = 13;
16545 - if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
16546 - printk_ratelimit()) {
16547 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
16548 + printk_ratelimit()) {
16550 - "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16551 - current->comm, task_pid_nr(current),
16552 - regs->ip, regs->sp, error_code);
16553 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16554 + tsk->comm, task_pid_nr(tsk),
16555 + regs->ip, regs->sp, error_code);
16556 print_vma_addr(" in ", regs->ip);
16560 - force_sig(SIGSEGV, current);
16561 + force_sig(SIGSEGV, tsk);
16565 @@ -648,14 +627,15 @@ gp_in_vm86:
16569 - if (!fixup_exception(regs)) {
16570 - current->thread.error_code = error_code;
16571 - current->thread.trap_no = 13;
16572 - if (notify_die(DIE_GPF, "general protection fault", regs,
16573 + if (fixup_exception(regs))
16576 + tsk->thread.error_code = error_code;
16577 + tsk->thread.trap_no = 13;
16578 + if (notify_die(DIE_GPF, "general protection fault", regs,
16579 error_code, 13, SIGSEGV) == NOTIFY_STOP)
16581 - die("general protection fault", regs, error_code);
16584 + die("general protection fault", regs, error_code);
16587 static notrace __kprobes void
16588 @@ -722,9 +702,9 @@ unknown_nmi_error(unsigned char reason,
16590 static DEFINE_SPINLOCK(nmi_print_lock);
16592 -void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
16593 +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
16595 - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16596 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16599 spin_lock(&nmi_print_lock);
16600 @@ -733,10 +713,12 @@ void notrace __kprobes die_nmi(struct pt
16601 * to get a message out:
16604 - printk(KERN_EMERG "%s", msg);
16605 + printk(KERN_EMERG "%s", str);
16606 printk(" on CPU%d, ip %08lx, registers:\n",
16607 smp_processor_id(), regs->ip);
16608 show_registers(regs);
16610 + panic("Non maskable interrupt");
16612 spin_unlock(&nmi_print_lock);
16614 @@ -756,14 +738,17 @@ void notrace __kprobes die_nmi(struct pt
16615 static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
16617 unsigned char reason = 0;
16620 - /* Only the BSP gets external NMIs from the system: */
16621 - if (!smp_processor_id())
16622 + cpu = smp_processor_id();
16624 + /* Only the BSP gets external NMIs from the system. */
16626 reason = get_nmi_reason();
16628 if (!(reason & 0xc0)) {
16629 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
16633 #ifdef CONFIG_X86_LOCAL_APIC
16635 @@ -772,7 +757,7 @@ static notrace __kprobes void default_do
16637 if (nmi_watchdog_tick(regs, reason))
16639 - if (!do_nmi_callback(regs, smp_processor_id()))
16640 + if (!do_nmi_callback(regs, cpu))
16641 unknown_nmi_error(reason, regs);
16643 unknown_nmi_error(reason, regs);
16644 @@ -782,6 +767,8 @@ static notrace __kprobes void default_do
16646 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
16649 + /* AK: following checks seem to be broken on modern chipsets. FIXME */
16651 mem_parity_error(reason, regs);
16653 @@ -793,8 +780,6 @@ static notrace __kprobes void default_do
16657 -static int ignore_nmis;
16659 notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
16662 @@ -879,7 +864,7 @@ void __kprobes do_debug(struct pt_regs *
16663 tsk->thread.debugctlmsr = 0;
16665 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
16666 - SIGTRAP) == NOTIFY_STOP)
16667 + SIGTRAP) == NOTIFY_STOP)
16669 /* It's safe to allow irq's after DR6 has been saved */
16670 if (regs->flags & X86_EFLAGS_IF)
16671 @@ -940,9 +925,8 @@ clear_TF_reenable:
16672 void math_error(void __user *ip)
16674 struct task_struct *task;
16675 - unsigned short cwd;
16676 - unsigned short swd;
16678 + unsigned short cwd, swd;
16681 * Save the info for the exception handler and clear the error.
16682 @@ -961,7 +945,7 @@ void math_error(void __user *ip)
16683 * C1 reg you need in case of a stack fault, 0x040 is the stack
16684 * fault bit. We should only be taking one exception at a time,
16685 * so if this combination doesn't produce any single exception,
16686 - * then we have a bad program that isn't syncronizing its FPU usage
16687 + * then we have a bad program that isn't synchronizing its FPU usage
16688 * and it will suffer the consequences since we won't be able to
16689 * fully reproduce the context of the exception
16691 @@ -970,7 +954,7 @@ void math_error(void __user *ip)
16692 switch (swd & ~cwd & 0x3f) {
16693 case 0x000: /* No unmasked exception */
16695 - default: /* Multiple exceptions */
16696 + default: /* Multiple exceptions */
16698 case 0x001: /* Invalid Op */
16700 @@ -1006,8 +990,8 @@ void do_coprocessor_error(struct pt_regs
16701 static void simd_math_error(void __user *ip)
16703 struct task_struct *task;
16704 - unsigned short mxcsr;
16706 + unsigned short mxcsr;
16709 * Save the info for the exception handler and clear the error.
16710 @@ -1084,7 +1068,7 @@ void do_spurious_interrupt_bug(struct pt
16712 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
16714 - struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
16715 + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
16716 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
16717 unsigned long new_kesp = kesp - base;
16718 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
16719 Index: head-2008-12-01/arch/x86/kernel/traps_64-xen.c
16720 ===================================================================
16721 --- head-2008-12-01.orig/arch/x86/kernel/traps_64-xen.c 2008-12-01 11:44:55.000000000 +0100
16722 +++ head-2008-12-01/arch/x86/kernel/traps_64-xen.c 2008-12-01 11:49:07.000000000 +0100
16723 @@ -10,73 +10,56 @@
16724 * 'Traps.c' handles hardware traps and faults after we have saved some
16725 * state in 'entry.S'.
16727 -#include <linux/sched.h>
16728 +#include <linux/moduleparam.h>
16729 +#include <linux/interrupt.h>
16730 +#include <linux/kallsyms.h>
16731 +#include <linux/spinlock.h>
16732 +#include <linux/kprobes.h>
16733 +#include <linux/uaccess.h>
16734 +#include <linux/utsname.h>
16735 +#include <linux/kdebug.h>
16736 #include <linux/kernel.h>
16737 +#include <linux/module.h>
16738 +#include <linux/ptrace.h>
16739 #include <linux/string.h>
16740 +#include <linux/unwind.h>
16741 +#include <linux/delay.h>
16742 #include <linux/errno.h>
16743 -#include <linux/ptrace.h>
16744 +#include <linux/kexec.h>
16745 +#include <linux/sched.h>
16746 #include <linux/timer.h>
16747 -#include <linux/mm.h>
16748 #include <linux/init.h>
16749 -#include <linux/delay.h>
16750 -#include <linux/spinlock.h>
16751 -#include <linux/interrupt.h>
16752 -#include <linux/kallsyms.h>
16753 -#include <linux/module.h>
16754 -#include <linux/moduleparam.h>
16755 -#include <linux/nmi.h>
16756 -#include <linux/kprobes.h>
16757 -#include <linux/kexec.h>
16758 -#include <linux/unwind.h>
16759 -#include <linux/uaccess.h>
16760 #include <linux/bug.h>
16761 -#include <linux/kdebug.h>
16762 -#include <linux/utsname.h>
16764 -#include <mach_traps.h>
16765 +#include <linux/nmi.h>
16766 +#include <linux/mm.h>
16768 #if defined(CONFIG_EDAC)
16769 #include <linux/edac.h>
16772 -#include <asm/system.h>
16773 -#include <asm/io.h>
16774 -#include <asm/atomic.h>
16775 +#include <asm/stacktrace.h>
16776 +#include <asm/processor.h>
16777 #include <asm/debugreg.h>
16778 +#include <asm/atomic.h>
16779 +#include <asm/system.h>
16780 +#include <asm/unwind.h>
16781 #include <asm/desc.h>
16782 #include <asm/i387.h>
16783 -#include <asm/processor.h>
16784 -#include <asm/unwind.h>
16785 +#include <asm/nmi.h>
16786 #include <asm/smp.h>
16787 +#include <asm/io.h>
16788 #include <asm/pgalloc.h>
16789 -#include <asm/pda.h>
16790 #include <asm/proto.h>
16791 -#include <asm/nmi.h>
16792 -#include <asm/stacktrace.h>
16793 +#include <asm/pda.h>
16794 +#include <asm/traps.h>
16796 -asmlinkage void divide_error(void);
16797 -asmlinkage void debug(void);
16798 -asmlinkage void nmi(void);
16799 -asmlinkage void int3(void);
16800 -asmlinkage void overflow(void);
16801 -asmlinkage void bounds(void);
16802 -asmlinkage void invalid_op(void);
16803 -asmlinkage void device_not_available(void);
16804 -asmlinkage void double_fault(void);
16805 -asmlinkage void coprocessor_segment_overrun(void);
16806 -asmlinkage void invalid_TSS(void);
16807 -asmlinkage void segment_not_present(void);
16808 -asmlinkage void stack_segment(void);
16809 -asmlinkage void general_protection(void);
16810 -asmlinkage void page_fault(void);
16811 -asmlinkage void coprocessor_error(void);
16812 -asmlinkage void simd_coprocessor_error(void);
16813 -asmlinkage void reserved(void);
16814 -asmlinkage void alignment_check(void);
16815 -asmlinkage void machine_check(void);
16816 -asmlinkage void spurious_interrupt_bug(void);
16817 +#include <mach_traps.h>
16819 +int panic_on_unrecovered_nmi;
16820 +int kstack_depth_to_print = 12;
16821 static unsigned int code_bytes = 64;
16822 +static int ignore_nmis;
16823 +static int die_counter;
16825 static inline void conditional_sti(struct pt_regs *regs)
16827 @@ -100,34 +83,9 @@ static inline void preempt_conditional_c
16828 dec_preempt_count();
16831 -int kstack_depth_to_print = 12;
16833 void printk_address(unsigned long address, int reliable)
16835 -#ifdef CONFIG_KALLSYMS
16836 - unsigned long offset = 0, symsize;
16837 - const char *symname;
16839 - char *delim = ":";
16840 - char namebuf[KSYM_NAME_LEN];
16841 - char reliab[4] = "";
16843 - symname = kallsyms_lookup(address, &symsize, &offset,
16844 - &modname, namebuf);
16846 - printk(" [<%016lx>]\n", address);
16850 - strcpy(reliab, "? ");
16853 - modname = delim = "";
16854 - printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
16855 - address, reliab, delim, modname, delim, symname, offset, symsize);
16857 - printk(" [<%016lx>]\n", address);
16859 + printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
16862 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
16863 @@ -206,8 +164,6 @@ static unsigned long *in_exception_stack
16867 -#define MSG(txt) ops->warning(data, txt)
16870 * x86-64 can have up to three kernel stacks:
16872 @@ -234,11 +190,11 @@ struct stack_frame {
16873 unsigned long return_address;
16877 -static inline unsigned long print_context_stack(struct thread_info *tinfo,
16878 - unsigned long *stack, unsigned long bp,
16879 - const struct stacktrace_ops *ops, void *data,
16880 - unsigned long *end)
16881 +static inline unsigned long
16882 +print_context_stack(struct thread_info *tinfo,
16883 + unsigned long *stack, unsigned long bp,
16884 + const struct stacktrace_ops *ops, void *data,
16885 + unsigned long *end)
16887 struct stack_frame *frame = (struct stack_frame *)bp;
16889 @@ -260,7 +216,7 @@ static inline unsigned long print_contex
16893 -void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
16894 +void dump_trace(struct task_struct *task, struct pt_regs *regs,
16895 unsigned long *stack, unsigned long bp,
16896 const struct stacktrace_ops *ops, void *data)
16898 @@ -269,36 +225,34 @@ void dump_trace(struct task_struct *tsk,
16900 struct thread_info *tinfo;
16904 - tinfo = task_thread_info(tsk);
16909 unsigned long dummy;
16911 - if (tsk && tsk != current)
16912 - stack = (unsigned long *)tsk->thread.sp;
16913 + if (task && task != current)
16914 + stack = (unsigned long *)task->thread.sp;
16917 #ifdef CONFIG_FRAME_POINTER
16919 - if (tsk == current) {
16920 + if (task == current) {
16921 /* Grab bp right from our regs */
16922 - asm("movq %%rbp, %0" : "=r" (bp):);
16923 + asm("movq %%rbp, %0" : "=r" (bp) :);
16925 /* bp is the last reg pushed by switch_to */
16926 - bp = *(unsigned long *) tsk->thread.sp;
16927 + bp = *(unsigned long *) task->thread.sp;
16935 * Print function call entries in all stacks, starting at the
16936 * current stack address. If the stacks consist of nested
16939 + tinfo = task_thread_info(task);
16942 unsigned long *estack_end;
16943 @@ -383,18 +337,24 @@ static const struct stacktrace_ops print
16944 .address = print_trace_address,
16948 -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
16949 - unsigned long bp)
16951 +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16952 + unsigned long *stack, unsigned long bp, char *log_lvl)
16954 printk("\nCall Trace:\n");
16955 - dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
16956 + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16960 +void show_trace(struct task_struct *task, struct pt_regs *regs,
16961 + unsigned long *stack, unsigned long bp)
16963 + show_trace_log_lvl(task, regs, stack, bp, "");
16967 -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
16968 - unsigned long bp)
16969 +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
16970 + unsigned long *sp, unsigned long bp, char *log_lvl)
16972 unsigned long *stack;
16974 @@ -406,14 +366,14 @@ _show_stack(struct task_struct *tsk, str
16975 // back trace for this cpu.
16979 - sp = (unsigned long *)tsk->thread.sp;
16981 + sp = (unsigned long *)task->thread.sp;
16983 sp = (unsigned long *)&sp;
16987 - for(i=0; i < kstack_depth_to_print; i++) {
16988 + for (i = 0; i < kstack_depth_to_print; i++) {
16989 if (stack >= irqstack && stack <= irqstack_end) {
16990 if (stack == irqstack_end) {
16991 stack = (unsigned long *) (irqstack_end[-1]);
16992 @@ -428,12 +388,12 @@ _show_stack(struct task_struct *tsk, str
16993 printk(" %016lx", *stack++);
16994 touch_nmi_watchdog();
16996 - show_trace(tsk, regs, sp, bp);
16997 + show_trace_log_lvl(task, regs, sp, bp, log_lvl);
17000 -void show_stack(struct task_struct *tsk, unsigned long * sp)
17001 +void show_stack(struct task_struct *task, unsigned long *sp)
17003 - _show_stack(tsk, NULL, sp, 0);
17004 + show_stack_log_lvl(task, NULL, sp, 0, "");
17008 @@ -441,8 +401,8 @@ void show_stack(struct task_struct *tsk,
17010 void dump_stack(void)
17012 - unsigned long dummy;
17013 unsigned long bp = 0;
17014 + unsigned long stack;
17016 #ifdef CONFIG_FRAME_POINTER
17018 @@ -454,7 +414,7 @@ void dump_stack(void)
17019 init_utsname()->release,
17020 (int)strcspn(init_utsname()->version, " "),
17021 init_utsname()->version);
17022 - show_trace(NULL, NULL, &dummy, bp);
17023 + show_trace(NULL, NULL, &stack, bp);
17026 EXPORT_SYMBOL(dump_stack);
17027 @@ -465,12 +425,8 @@ void show_registers(struct pt_regs *regs
17029 const int cpu = smp_processor_id();
17030 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
17032 - unsigned int code_prologue = code_bytes * 43 / 64;
17033 - unsigned int code_len = code_bytes;
17036 - ip = (u8 *) regs->ip - code_prologue;
17037 printk("CPU %d ", cpu);
17039 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
17040 @@ -481,15 +437,22 @@ void show_registers(struct pt_regs *regs
17041 * time of the fault..
17043 if (!user_mode(regs)) {
17044 + unsigned int code_prologue = code_bytes * 43 / 64;
17045 + unsigned int code_len = code_bytes;
17050 - _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
17051 + show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
17055 printk(KERN_EMERG "Code: ");
17057 + ip = (u8 *)regs->ip - code_prologue;
17058 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
17059 /* try starting at RIP */
17060 - ip = (u8 *) regs->ip;
17061 + ip = (u8 *)regs->ip;
17062 code_len = code_len - code_prologue + 1;
17064 for (i = 0; i < code_len; i++, ip++) {
17065 @@ -505,7 +468,7 @@ void show_registers(struct pt_regs *regs
17072 int is_valid_bugaddr(unsigned long ip)
17074 @@ -545,7 +508,7 @@ unsigned __kprobes long oops_begin(void)
17077 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
17083 @@ -563,10 +526,9 @@ void __kprobes oops_end(unsigned long fl
17087 -int __kprobes __die(const char * str, struct pt_regs * regs, long err)
17088 +int __kprobes __die(const char *str, struct pt_regs *regs, long err)
17090 - static int die_counter;
17091 - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
17092 + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
17093 #ifdef CONFIG_PREEMPT
17094 printk("PREEMPT ");
17096 @@ -577,8 +539,10 @@ int __kprobes __die(const char * str, st
17097 printk("DEBUG_PAGEALLOC");
17100 - if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17101 + if (notify_die(DIE_OOPS, str, regs, err,
17102 + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17105 show_registers(regs);
17106 add_taint(TAINT_DIE);
17107 /* Executive summary in case the oops scrolled away */
17108 @@ -590,7 +554,7 @@ int __kprobes __die(const char * str, st
17112 -void die(const char * str, struct pt_regs * regs, long err)
17113 +void die(const char *str, struct pt_regs *regs, long err)
17115 unsigned long flags = oops_begin();
17117 @@ -608,8 +572,7 @@ die_nmi(char *str, struct pt_regs *regs,
17119 unsigned long flags;
17121 - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
17123 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
17126 flags = oops_begin();
17127 @@ -617,7 +580,9 @@ die_nmi(char *str, struct pt_regs *regs,
17128 * We are in trouble anyway, lets at least try
17129 * to get a message out.
17131 - printk(str, smp_processor_id());
17132 + printk(KERN_EMERG "%s", str);
17133 + printk(" on CPU%d, ip %08lx, registers:\n",
17134 + smp_processor_id(), regs->ip);
17135 show_registers(regs);
17136 if (kexec_should_crash(current))
17138 @@ -630,44 +595,44 @@ die_nmi(char *str, struct pt_regs *regs,
17142 -static void __kprobes do_trap(int trapnr, int signr, char *str,
17143 - struct pt_regs * regs, long error_code,
17145 +static void __kprobes
17146 +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
17147 + long error_code, siginfo_t *info)
17149 struct task_struct *tsk = current;
17151 - if (user_mode(regs)) {
17153 - * We want error_code and trap_no set for userspace
17154 - * faults and kernelspace faults which result in
17155 - * die(), but not kernelspace faults which are fixed
17156 - * up. die() gives the process no chance to handle
17157 - * the signal and notice the kernel fault information,
17158 - * so that won't result in polluting the information
17159 - * about previously queued, but not yet delivered,
17160 - * faults. See also do_general_protection below.
17162 - tsk->thread.error_code = error_code;
17163 - tsk->thread.trap_no = trapnr;
17164 + if (!user_mode(regs))
17165 + goto kernel_trap;
17167 - if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17168 - printk_ratelimit()) {
17170 - "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17171 - tsk->comm, tsk->pid, str,
17172 - regs->ip, regs->sp, error_code);
17173 - print_vma_addr(" in ", regs->ip);
17177 + * We want error_code and trap_no set for userspace faults and
17178 + * kernelspace faults which result in die(), but not
17179 + * kernelspace faults which are fixed up. die() gives the
17180 + * process no chance to handle the signal and notice the
17181 + * kernel fault information, so that won't result in polluting
17182 + * the information about previously queued, but not yet
17183 + * delivered, faults. See also do_general_protection below.
17185 + tsk->thread.error_code = error_code;
17186 + tsk->thread.trap_no = trapnr;
17189 - force_sig_info(signr, info, tsk);
17191 - force_sig(signr, tsk);
17193 + if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17194 + printk_ratelimit()) {
17196 + "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17197 + tsk->comm, tsk->pid, str,
17198 + regs->ip, regs->sp, error_code);
17199 + print_vma_addr(" in ", regs->ip);
17204 + force_sig_info(signr, info, tsk);
17206 + force_sig(signr, tsk);
17210 if (!fixup_exception(regs)) {
17211 tsk->thread.error_code = error_code;
17212 tsk->thread.trap_no = trapnr;
17213 @@ -677,41 +642,39 @@ static void __kprobes do_trap(int trapnr
17216 #define DO_ERROR(trapnr, signr, str, name) \
17217 -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17219 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17220 - == NOTIFY_STOP) \
17222 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17224 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17225 + == NOTIFY_STOP) \
17227 conditional_sti(regs); \
17228 - do_trap(trapnr, signr, str, regs, error_code, NULL); \
17229 + do_trap(trapnr, signr, str, regs, error_code, NULL); \
17232 -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17233 -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17235 - siginfo_t info; \
17236 - info.si_signo = signr; \
17237 - info.si_errno = 0; \
17238 - info.si_code = sicode; \
17239 - info.si_addr = (void __user *)siaddr; \
17240 - trace_hardirqs_fixup(); \
17241 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17242 - == NOTIFY_STOP) \
17244 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17245 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17247 + siginfo_t info; \
17248 + info.si_signo = signr; \
17249 + info.si_errno = 0; \
17250 + info.si_code = sicode; \
17251 + info.si_addr = (void __user *)siaddr; \
17252 + trace_hardirqs_fixup(); \
17253 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17254 + == NOTIFY_STOP) \
17256 conditional_sti(regs); \
17257 - do_trap(trapnr, signr, str, regs, error_code, &info); \
17258 + do_trap(trapnr, signr, str, regs, error_code, &info); \
17261 -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17262 -DO_ERROR( 4, SIGSEGV, "overflow", overflow)
17263 -DO_ERROR( 5, SIGSEGV, "bounds", bounds)
17264 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17265 -DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
17266 -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17267 +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17268 +DO_ERROR(4, SIGSEGV, "overflow", overflow)
17269 +DO_ERROR(5, SIGSEGV, "bounds", bounds)
17270 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17271 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17272 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
17273 -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17274 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17275 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
17276 -DO_ERROR(18, SIGSEGV, "reserved", reserved)
17278 /* Runs on IST stack */
17279 asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
17280 @@ -741,31 +704,34 @@ asmlinkage void do_double_fault(struct p
17281 die(str, regs, error_code);
17284 -asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
17286 +asmlinkage void __kprobes
17287 +do_general_protection(struct pt_regs *regs, long error_code)
17289 - struct task_struct *tsk = current;
17290 + struct task_struct *tsk;
17292 conditional_sti(regs);
17294 - if (user_mode(regs)) {
17295 - tsk->thread.error_code = error_code;
17296 - tsk->thread.trap_no = 13;
17298 + if (!user_mode(regs))
17299 + goto gp_in_kernel;
17301 - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17302 - printk_ratelimit()) {
17304 - "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17305 - tsk->comm, tsk->pid,
17306 - regs->ip, regs->sp, error_code);
17307 - print_vma_addr(" in ", regs->ip);
17310 + tsk->thread.error_code = error_code;
17311 + tsk->thread.trap_no = 13;
17313 - force_sig(SIGSEGV, tsk);
17316 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17317 + printk_ratelimit()) {
17319 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17320 + tsk->comm, tsk->pid,
17321 + regs->ip, regs->sp, error_code);
17322 + print_vma_addr(" in ", regs->ip);
17326 + force_sig(SIGSEGV, tsk);
17330 if (fixup_exception(regs))
17333 @@ -778,14 +744,14 @@ asmlinkage void __kprobes do_general_pro
17336 static notrace __kprobes void
17337 -mem_parity_error(unsigned char reason, struct pt_regs * regs)
17338 +mem_parity_error(unsigned char reason, struct pt_regs *regs)
17340 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
17342 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
17344 #if defined(CONFIG_EDAC)
17345 - if(edac_handler_set()) {
17346 + if (edac_handler_set()) {
17347 edac_atomic_assert_error();
17350 @@ -801,7 +767,7 @@ mem_parity_error(unsigned char reason, s
17353 static notrace __kprobes void
17354 -io_check_error(unsigned char reason, struct pt_regs * regs)
17355 +io_check_error(unsigned char reason, struct pt_regs *regs)
17357 printk("NMI: IOCK error (debug interrupt?)\n");
17358 show_registers(regs);
17359 @@ -827,14 +793,14 @@ unknown_nmi_error(unsigned char reason,
17361 /* Runs on IST stack. This code must keep interrupts off all the time.
17362 Nested NMIs are prevented by the CPU. */
17363 -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17364 +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17366 unsigned char reason = 0;
17369 cpu = smp_processor_id();
17371 - /* Only the BSP gets external NMIs from the system. */
17372 + /* Only the BSP gets external NMIs from the system. */
17374 reason = get_nmi_reason();
17376 @@ -847,33 +813,58 @@ asmlinkage notrace __kprobes void defau
17377 * Ok, so this is none of the documented NMI sources,
17378 * so it must be the NMI watchdog.
17380 - if (nmi_watchdog_tick(regs,reason))
17381 + if (nmi_watchdog_tick(regs, reason))
17384 - if (!do_nmi_callback(regs,cpu))
17385 + if (!do_nmi_callback(regs, cpu))
17386 unknown_nmi_error(reason, regs);
17390 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
17394 /* AK: following checks seem to be broken on modern chipsets. FIXME */
17397 mem_parity_error(reason, regs);
17399 io_check_error(reason, regs);
17402 +asmlinkage notrace __kprobes void
17403 +do_nmi(struct pt_regs *regs, long error_code)
17407 + add_pda(__nmi_count, 1);
17409 + if (!ignore_nmis)
17410 + default_do_nmi(regs);
17415 +void stop_nmi(void)
17417 + acpi_nmi_disable();
17421 +void restart_nmi(void)
17424 + acpi_nmi_enable();
17427 /* runs on IST stack. */
17428 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
17430 trace_hardirqs_fixup();
17432 - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
17433 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
17438 preempt_conditional_sti(regs);
17439 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
17440 preempt_conditional_cli(regs);
17441 @@ -904,8 +895,8 @@ asmlinkage __kprobes struct pt_regs *syn
17442 asmlinkage void __kprobes do_debug(struct pt_regs * regs,
17443 unsigned long error_code)
17445 - unsigned long condition;
17446 struct task_struct *tsk = current;
17447 + unsigned long condition;
17450 trace_hardirqs_fixup();
17451 @@ -926,21 +917,19 @@ asmlinkage void __kprobes do_debug(struc
17453 /* Mask out spurious debug traps due to lazy DR7 setting */
17454 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
17455 - if (!tsk->thread.debugreg7) {
17456 + if (!tsk->thread.debugreg7)
17461 tsk->thread.debugreg6 = condition;
17465 * Single-stepping through TF: make sure we ignore any events in
17466 * kernel space (but re-enable TF when returning to user mode).
17468 if (condition & DR_STEP) {
17469 - if (!user_mode(regs))
17470 - goto clear_TF_reenable;
17471 + if (!user_mode(regs))
17472 + goto clear_TF_reenable;
17475 /* Ok, finally something we can handle */
17476 @@ -953,7 +942,7 @@ asmlinkage void __kprobes do_debug(struc
17477 force_sig_info(SIGTRAP, &info, tsk);
17480 - set_debugreg(0UL, 7);
17481 + set_debugreg(0, 7);
17482 preempt_conditional_cli(regs);
17485 @@ -961,6 +950,7 @@ clear_TF_reenable:
17486 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
17487 regs->flags &= ~X86_EFLAGS_TF;
17488 preempt_conditional_cli(regs);
17492 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
17493 @@ -983,7 +973,7 @@ static int kernel_math_error(struct pt_r
17494 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
17496 void __user *ip = (void __user *)(regs->ip);
17497 - struct task_struct * task;
17498 + struct task_struct *task;
17500 unsigned short cwd, swd;
17502 @@ -1016,30 +1006,30 @@ asmlinkage void do_coprocessor_error(str
17503 cwd = get_fpu_cwd(task);
17504 swd = get_fpu_swd(task);
17505 switch (swd & ~cwd & 0x3f) {
17509 - case 0x001: /* Invalid Op */
17511 - * swd & 0x240 == 0x040: Stack Underflow
17512 - * swd & 0x240 == 0x240: Stack Overflow
17513 - * User must clear the SF bit (0x40) if set
17515 - info.si_code = FPE_FLTINV;
17517 - case 0x002: /* Denormalize */
17518 - case 0x010: /* Underflow */
17519 - info.si_code = FPE_FLTUND;
17521 - case 0x004: /* Zero Divide */
17522 - info.si_code = FPE_FLTDIV;
17524 - case 0x008: /* Overflow */
17525 - info.si_code = FPE_FLTOVF;
17527 - case 0x020: /* Precision */
17528 - info.si_code = FPE_FLTRES;
17530 + case 0x000: /* No unmasked exception */
17531 + default: /* Multiple exceptions */
17533 + case 0x001: /* Invalid Op */
17535 + * swd & 0x240 == 0x040: Stack Underflow
17536 + * swd & 0x240 == 0x240: Stack Overflow
17537 + * User must clear the SF bit (0x40) if set
17539 + info.si_code = FPE_FLTINV;
17541 + case 0x002: /* Denormalize */
17542 + case 0x010: /* Underflow */
17543 + info.si_code = FPE_FLTUND;
17545 + case 0x004: /* Zero Divide */
17546 + info.si_code = FPE_FLTDIV;
17548 + case 0x008: /* Overflow */
17549 + info.si_code = FPE_FLTOVF;
17551 + case 0x020: /* Precision */
17552 + info.si_code = FPE_FLTRES;
17555 force_sig_info(SIGFPE, &info, task);
17557 @@ -1052,7 +1042,7 @@ asmlinkage void bad_intr(void)
17558 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
17560 void __user *ip = (void __user *)(regs->ip);
17561 - struct task_struct * task;
17562 + struct task_struct *task;
17564 unsigned short mxcsr;
17566 @@ -1080,25 +1070,25 @@ asmlinkage void do_simd_coprocessor_erro
17568 mxcsr = get_fpu_mxcsr(task);
17569 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
17573 - case 0x001: /* Invalid Op */
17574 - info.si_code = FPE_FLTINV;
17576 - case 0x002: /* Denormalize */
17577 - case 0x010: /* Underflow */
17578 - info.si_code = FPE_FLTUND;
17580 - case 0x004: /* Zero Divide */
17581 - info.si_code = FPE_FLTDIV;
17583 - case 0x008: /* Overflow */
17584 - info.si_code = FPE_FLTOVF;
17586 - case 0x020: /* Precision */
17587 - info.si_code = FPE_FLTRES;
17592 + case 0x001: /* Invalid Op */
17593 + info.si_code = FPE_FLTINV;
17595 + case 0x002: /* Denormalize */
17596 + case 0x010: /* Underflow */
17597 + info.si_code = FPE_FLTUND;
17599 + case 0x004: /* Zero Divide */
17600 + info.si_code = FPE_FLTDIV;
17602 + case 0x008: /* Overflow */
17603 + info.si_code = FPE_FLTOVF;
17605 + case 0x020: /* Precision */
17606 + info.si_code = FPE_FLTRES;
17609 force_sig_info(SIGFPE, &info, task);
17611 @@ -1118,7 +1108,7 @@ asmlinkage void __attribute__((weak)) mc
17615 - * 'math_state_restore()' saves the current math information in the
17616 + * 'math_state_restore()' saves the current math information in the
17617 * old math state array, and gets the new ones from the current task
17619 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
17620 @@ -1145,7 +1135,14 @@ asmlinkage void math_state_restore(void)
17622 /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
17624 - restore_fpu_checking(&me->thread.xstate->fxsave);
17626 + * Paranoid restore. send a SIGSEGV if we fail to restore the state.
17628 + if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
17630 + force_sig(SIGSEGV, me);
17633 task_thread_info(me)->status |= TS_USEDFPU;
17636 @@ -1190,13 +1187,12 @@ void __init trap_init(void)
17637 ret = HYPERVISOR_set_trap_table(trap_table);
17639 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
17642 * initialize the per thread extended state:
17644 - init_thread_xstate();
17645 + init_thread_xstate();
17647 - * Should be a barrier for any external CPU state.
17648 + * Should be a barrier for any external CPU state:
17652 @@ -1212,27 +1208,25 @@ void __cpuinit smp_trap_init(trap_info_t
17657 static int __init oops_setup(char *s)
17662 if (!strcmp(s, "panic"))
17667 early_param("oops", oops_setup);
17669 static int __init kstack_setup(char *s)
17673 - kstack_depth_to_print = simple_strtoul(s,NULL,0);
17674 + kstack_depth_to_print = simple_strtoul(s, NULL, 0);
17677 early_param("kstack", kstack_setup);
17680 static int __init code_bytes_setup(char *s)
17682 code_bytes = simple_strtoul(s, NULL, 0);
17683 Index: head-2008-12-01/arch/x86/kernel/vsyscall_64-xen.c
17684 ===================================================================
17685 --- head-2008-12-01.orig/arch/x86/kernel/vsyscall_64-xen.c 2008-12-01 11:44:55.000000000 +0100
17686 +++ head-2008-12-01/arch/x86/kernel/vsyscall_64-xen.c 2008-12-01 11:49:07.000000000 +0100
17688 #include <asm/topology.h>
17689 #include <asm/vgtod.h>
17691 -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
17692 +#define __vsyscall(nr) \
17693 + __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
17694 #define __syscall_clobber "r11","cx","memory"
17697 @@ -264,10 +265,7 @@ static void __cpuinit vsyscall_set_cpu(i
17699 d |= (node & 0xf) << 12;
17700 d |= (node >> 4) << 48;
17701 - if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
17702 - + GDT_ENTRY_PER_CPU),
17705 + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
17708 static void __cpuinit cpu_vsyscall_init(void *arg)
17709 @@ -281,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_bl
17711 long cpu = (long)arg;
17712 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
17713 - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
17714 + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
17715 return NOTIFY_DONE;
17718 @@ -311,7 +309,7 @@ static int __init vsyscall_init(void)
17719 #ifdef CONFIG_SYSCTL
17720 register_sysctl_table(kernel_root_table2);
17722 - on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
17723 + on_each_cpu(cpu_vsyscall_init, NULL, 1);
17724 hotcpu_notifier(cpu_vsyscall_notifier, 0);
17727 Index: head-2008-12-01/arch/x86/mach-xen/setup.c
17728 ===================================================================
17729 --- head-2008-12-01.orig/arch/x86/mach-xen/setup.c 2008-12-01 11:37:10.000000000 +0100
17730 +++ head-2008-12-01/arch/x86/mach-xen/setup.c 2008-12-01 11:49:07.000000000 +0100
17732 #include <xen/interface/callback.h>
17733 #include <xen/interface/memory.h>
17735 +#ifdef CONFIG_X86_32
17737 #ifdef CONFIG_HOTPLUG_CPU
17738 #define DEFAULT_SEND_IPI (1)
17740 @@ -44,51 +46,6 @@ static int __init print_ipi_mode(void)
17742 late_initcall(print_ipi_mode);
17745 - * machine_specific_memory_setup - Hook for machine specific memory setup.
17748 - * This is included late in kernel/setup.c so that it can make
17749 - * use of all of the static functions.
17752 -char * __init machine_specific_memory_setup(void)
17755 - struct xen_memory_map memmap;
17757 - * This is rather large for a stack variable but this early in
17758 - * the boot process we know we have plenty slack space.
17760 - struct e820entry map[E820MAX];
17762 - memmap.nr_entries = E820MAX;
17763 - set_xen_guest_handle(memmap.buffer, map);
17765 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
17766 - if ( rc == -ENOSYS ) {
17767 - memmap.nr_entries = 1;
17768 - map[0].addr = 0ULL;
17769 - map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
17770 - /* 8MB slack (to balance backend allocations). */
17771 - map[0].size += 8ULL << 20;
17772 - map[0].type = E820_RAM;
17777 - sanitize_e820_map(map, (char *)&memmap.nr_entries);
17779 - BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
17785 -extern void hypervisor_callback(void);
17786 -extern void failsafe_callback(void);
17787 -extern void nmi(void);
17789 unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
17790 EXPORT_SYMBOL(machine_to_phys_mapping);
17791 unsigned int machine_to_phys_order;
17792 @@ -121,33 +78,66 @@ void __init pre_setup_arch_hook(void)
17793 (unsigned long *)xen_start_info->mfn_list;
17796 +#endif /* CONFIG_X86_32 */
17798 +extern void hypervisor_callback(void);
17799 +extern void failsafe_callback(void);
17800 +extern void nmi(void);
17802 +#ifdef CONFIG_X86_64
17803 +#include <asm/proto.h>
17804 +#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
17806 +#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
17809 void __init machine_specific_arch_setup(void)
17812 static struct callback_register __initdata event = {
17813 .type = CALLBACKTYPE_event,
17814 - .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
17815 + .address = CALLBACK_ADDR(hypervisor_callback)
17817 static struct callback_register __initdata failsafe = {
17818 .type = CALLBACKTYPE_failsafe,
17819 - .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
17820 + .address = CALLBACK_ADDR(failsafe_callback)
17822 +#ifdef CONFIG_X86_64
17823 + static struct callback_register __initdata syscall = {
17824 + .type = CALLBACKTYPE_syscall,
17825 + .address = CALLBACK_ADDR(system_call)
17828 +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17829 static struct callback_register __initdata nmi_cb = {
17830 .type = CALLBACKTYPE_nmi,
17831 - .address = { __KERNEL_CS, (unsigned long)nmi },
17832 + .address = CALLBACK_ADDR(nmi)
17836 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
17838 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
17839 +#ifdef CONFIG_X86_64
17841 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
17843 #if CONFIG_XEN_COMPAT <= 0x030002
17844 +#ifdef CONFIG_X86_32
17845 if (ret == -ENOSYS)
17846 ret = HYPERVISOR_set_callbacks(
17847 event.address.cs, event.address.eip,
17848 failsafe.address.cs, failsafe.address.eip);
17850 + ret = HYPERVISOR_set_callbacks(
17852 + failsafe.address,
17853 + syscall.address);
17858 +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17859 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
17860 #if CONFIG_XEN_COMPAT <= 0x030002
17861 if (ret == -ENOSYS) {
17862 @@ -158,15 +148,43 @@ void __init machine_specific_arch_setup(
17863 HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
17868 +#ifdef CONFIG_X86_32
17869 /* Do an early initialization of the fixmap area */
17871 extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
17872 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
17873 pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
17874 pmd_t *pmd = pmd_offset(pud, addr);
17877 make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
17878 set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
17880 +#define __FIXADDR_TOP (-PAGE_SIZE)
17881 +#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
17882 + != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
17883 + FIX_BUG_ON(SHARED_INFO);
17884 + FIX_BUG_ON(ISAMAP_BEGIN);
17885 + FIX_BUG_ON(ISAMAP_END);
17886 +#undef __FIXADDR_TOP
17887 + BUG_ON(pte_index(hypervisor_virt_start));
17889 + /* Switch to the real shared_info page, and clear the
17891 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
17892 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
17893 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
17895 + /* Setup mapping of lower 1st MB */
17896 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
17897 + if (is_initial_xendomain())
17898 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
17900 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
17901 + virt_to_machine(empty_zero_page),
17906 Index: head-2008-12-01/arch/x86/mm/fault-xen.c
17907 ===================================================================
17908 --- head-2008-12-01.orig/arch/x86/mm/fault-xen.c 2008-12-01 11:44:55.000000000 +0100
17909 +++ head-2008-12-01/arch/x86/mm/fault-xen.c 2008-12-01 11:49:07.000000000 +0100
17911 #include <linux/string.h>
17912 #include <linux/types.h>
17913 #include <linux/ptrace.h>
17914 +#include <linux/mmiotrace.h>
17915 #include <linux/mman.h>
17916 #include <linux/mm.h>
17917 #include <linux/smp.h>
17918 @@ -49,17 +50,23 @@
17919 #define PF_RSVD (1<<3)
17920 #define PF_INSTR (1<<4)
17922 +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
17924 +#ifdef CONFIG_MMIOTRACE_HOOKS
17925 + if (unlikely(is_kmmio_active()))
17926 + if (kmmio_handler(regs, addr) == 1)
17932 static inline int notify_page_fault(struct pt_regs *regs)
17934 #ifdef CONFIG_KPROBES
17937 /* kprobe_running() needs smp_processor_id() */
17938 -#ifdef CONFIG_X86_32
17939 if (!user_mode_vm(regs)) {
17941 - if (!user_mode(regs)) {
17944 if (kprobe_running() && kprobe_fault_handler(regs, 14))
17946 @@ -409,11 +416,7 @@ static void show_fault_oops(struct pt_re
17947 printk(KERN_CONT "NULL pointer dereference");
17949 printk(KERN_CONT "paging request");
17950 -#ifdef CONFIG_X86_32
17951 - printk(KERN_CONT " at %08lx\n", address);
17953 - printk(KERN_CONT " at %016lx\n", address);
17955 + printk(KERN_CONT " at %p\n", (void *) address);
17956 printk(KERN_ALERT "IP:");
17957 printk_address(regs->ip, 1);
17958 dump_pagetable(address);
17959 @@ -628,6 +631,8 @@ void __kprobes do_page_fault(struct pt_r
17961 if (notify_page_fault(regs))
17963 + if (unlikely(kmmio_fault(regs, address)))
17967 * We fault-in kernel-space virtual memory on-demand. The
17968 @@ -832,14 +837,10 @@ bad_area_nosemaphore:
17969 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17970 printk_ratelimit()) {
17972 -#ifdef CONFIG_X86_32
17973 - "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
17975 - "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
17977 + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
17978 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
17979 - tsk->comm, task_pid_nr(tsk), address, regs->ip,
17980 - regs->sp, error_code);
17981 + tsk->comm, task_pid_nr(tsk), address,
17982 + (void *) regs->ip, (void *) regs->sp, error_code);
17983 print_vma_addr(" in ", regs->ip);
17986 @@ -947,81 +948,45 @@ LIST_HEAD(pgd_list);
17987 void vmalloc_sync_all(void)
17989 #ifdef CONFIG_X86_32
17991 - * Note that races in the updates of insync and start aren't
17992 - * problematic: insync can only get set bits added, and updates to
17993 - * start are only improving performance (without affecting correctness
17995 - * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
17996 - * This change works just fine with 2-level paging too.
17998 -#define sync_index(a) ((a) >> PMD_SHIFT)
17999 - static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
18000 - static unsigned long start = TASK_SIZE;
18001 - unsigned long address;
18002 + unsigned long address = VMALLOC_START & PGDIR_MASK;
18004 if (SHARED_KERNEL_PMD)
18007 BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
18008 - for (address = start;
18009 - address < hypervisor_virt_start;
18010 - address += PMD_SIZE) {
18011 - if (!test_bit(sync_index(address), insync)) {
18012 - unsigned long flags;
18013 - struct page *page;
18015 - spin_lock_irqsave(&pgd_lock, flags);
18016 - /* XEN: failure path assumes non-empty pgd_list. */
18017 - if (unlikely(list_empty(&pgd_list))) {
18018 - spin_unlock_irqrestore(&pgd_lock, flags);
18021 - list_for_each_entry(page, &pgd_list, lru) {
18022 - if (!vmalloc_sync_one(page_address(page),
18026 - spin_unlock_irqrestore(&pgd_lock, flags);
18028 - set_bit(sync_index(address), insync);
18029 + for (; address < hypervisor_virt_start; address += PMD_SIZE) {
18030 + unsigned long flags;
18031 + struct page *page;
18033 + spin_lock_irqsave(&pgd_lock, flags);
18034 + list_for_each_entry(page, &pgd_list, lru) {
18035 + if (!vmalloc_sync_one(page_address(page),
18039 - if (address == start && test_bit(sync_index(address), insync))
18040 - start = address + PMD_SIZE;
18041 + spin_unlock_irqrestore(&pgd_lock, flags);
18043 #else /* CONFIG_X86_64 */
18045 - * Note that races in the updates of insync and start aren't
18046 - * problematic: insync can only get set bits added, and updates to
18047 - * start are only improving performance (without affecting correctness
18050 - static DECLARE_BITMAP(insync, PTRS_PER_PGD);
18051 - static unsigned long start = VMALLOC_START & PGDIR_MASK;
18052 + unsigned long start = VMALLOC_START & PGDIR_MASK;
18053 unsigned long address;
18055 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
18056 - if (!test_bit(pgd_index(address), insync)) {
18057 - const pgd_t *pgd_ref = pgd_offset_k(address);
18058 - unsigned long flags;
18059 - struct page *page;
18061 - if (pgd_none(*pgd_ref))
18063 - spin_lock_irqsave(&pgd_lock, flags);
18064 - list_for_each_entry(page, &pgd_list, lru) {
18066 - pgd = (pgd_t *)page_address(page) + pgd_index(address);
18067 - if (pgd_none(*pgd))
18068 - set_pgd(pgd, *pgd_ref);
18070 - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18072 - spin_unlock_irqrestore(&pgd_lock, flags);
18073 - set_bit(pgd_index(address), insync);
18074 + const pgd_t *pgd_ref = pgd_offset_k(address);
18075 + unsigned long flags;
18076 + struct page *page;
18078 + if (pgd_none(*pgd_ref))
18080 + spin_lock_irqsave(&pgd_lock, flags);
18081 + list_for_each_entry(page, &pgd_list, lru) {
18083 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
18084 + if (pgd_none(*pgd))
18085 + set_pgd(pgd, *pgd_ref);
18087 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18089 - if (address == start)
18090 - start = address + PGDIR_SIZE;
18091 + spin_unlock_irqrestore(&pgd_lock, flags);
18095 Index: head-2008-12-01/arch/x86/mm/hypervisor.c
18096 ===================================================================
18097 --- head-2008-12-01.orig/arch/x86/mm/hypervisor.c 2008-12-01 11:37:10.000000000 +0100
18098 +++ head-2008-12-01/arch/x86/mm/hypervisor.c 2008-12-01 11:49:07.000000000 +0100
18099 @@ -837,42 +837,9 @@ int write_ldt_entry(struct desc_struct *
18100 return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
18103 -#define MAX_BATCHED_FULL_PTES 32
18105 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
18106 - unsigned long addr, unsigned long end, pgprot_t newprot,
18107 - int dirty_accountable)
18108 +int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
18111 - int rc = 0, i = 0;
18112 - mmu_update_t u[MAX_BATCHED_FULL_PTES];
18116 - if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
18119 - pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
18121 - if (pte_present(*pte)) {
18122 - pte_t ptent = pte_modify(*pte, newprot);
18124 - if (dirty_accountable && pte_dirty(ptent))
18125 - ptent = pte_mkwrite(ptent);
18126 - u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
18127 - | ((unsigned long)pte & ~PAGE_MASK)
18128 - | MMU_PT_UPDATE_PRESERVE_AD;
18129 - u[i].val = __pte_val(ptent);
18130 - if (++i == MAX_BATCHED_FULL_PTES) {
18131 - if ((rc = HYPERVISOR_mmu_update(
18132 - &u[0], i, NULL, DOMID_SELF)) != 0)
18137 - } while (pte++, addr += PAGE_SIZE, addr != end);
18139 - rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
18140 - pte_unmap_unlock(pte - 1, ptl);
18141 - BUG_ON(rc && rc != -ENOSYS);
18143 + maddr_t mach_gp = virt_to_machine(gdt + entry);
18144 + return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
18146 Index: head-2008-12-01/arch/x86/mm/init_32-xen.c
18147 ===================================================================
18148 --- head-2008-12-01.orig/arch/x86/mm/init_32-xen.c 2008-12-01 11:44:55.000000000 +0100
18149 +++ head-2008-12-01/arch/x86/mm/init_32-xen.c 2008-12-01 11:49:07.000000000 +0100
18152 unsigned int __VMALLOC_RESERVE = 128 << 20;
18154 +unsigned long max_low_pfn_mapped;
18155 unsigned long max_pfn_mapped;
18157 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18158 @@ -61,6 +62,27 @@ unsigned long highstart_pfn, highend_pfn
18160 static noinline int do_test_wp_bit(void);
18163 +static unsigned long __initdata table_start;
18164 +static unsigned long __initdata table_end;
18165 +static unsigned long __initdata table_top;
18167 +static int __initdata after_init_bootmem;
18169 +static __init void *alloc_low_page(unsigned long *phys)
18171 + unsigned long pfn = table_end++;
18174 + if (pfn >= table_top)
18175 + panic("alloc_low_page: ran out of memory");
18177 + adr = __va(pfn * PAGE_SIZE);
18178 + memset(adr, 0, PAGE_SIZE);
18179 + *phys = pfn * PAGE_SIZE;
18184 * Creates a middle page table and puts a pointer to it in the
18185 * given global directory entry. This only returns the gd entry
18186 @@ -72,9 +94,12 @@ static pmd_t * __init one_md_table_init(
18189 #ifdef CONFIG_X86_PAE
18190 + unsigned long phys;
18191 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
18192 - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18194 + if (after_init_bootmem)
18195 + pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18197 + pmd_table = (pmd_t *)alloc_low_page(&phys);
18198 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
18199 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
18200 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
18201 @@ -101,12 +126,16 @@ static pte_t * __init one_page_table_ini
18203 pte_t *page_table = NULL;
18205 + if (after_init_bootmem) {
18206 #ifdef CONFIG_DEBUG_PAGEALLOC
18207 - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18208 + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18210 - if (!page_table) {
18214 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18216 + unsigned long phys;
18217 + page_table = (pte_t *)alloc_low_page(&phys);
18220 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
18221 @@ -167,24 +196,24 @@ static inline int is_kernel_text(unsigne
18222 * of max_low_pfn pages, by creating page tables starting from address
18225 -static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
18226 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
18227 + unsigned long start_pfn,
18228 + unsigned long end_pfn,
18231 int pgd_idx, pmd_idx, pte_ofs;
18236 + unsigned pages_2m = 0, pages_4k = 0;
18238 - unsigned long max_ram_pfn = xen_start_info->nr_pages;
18239 - if (max_ram_pfn > max_low_pfn)
18240 - max_ram_pfn = max_low_pfn;
18241 + if (!cpu_has_pse)
18244 - pgd_idx = pgd_index(PAGE_OFFSET);
18246 + pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18247 pgd = pgd_base + pgd_idx;
18249 - pmd_idx = pmd_index(PAGE_OFFSET);
18250 - pte_ofs = pte_index(PAGE_OFFSET);
18252 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
18255 @@ -198,10 +227,16 @@ static void __init kernel_physical_mappi
18257 pmd = one_md_table_init(pgd);
18259 - if (pfn >= max_low_pfn)
18261 + if (pfn >= end_pfn)
18263 +#ifdef CONFIG_X86_PAE
18264 + pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18266 - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
18270 + for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
18271 pmd++, pmd_idx++) {
18272 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
18274 @@ -211,13 +246,8 @@ static void __init kernel_physical_mappi
18276 * Map with big pages if possible, otherwise
18277 * create normal page tables:
18279 - * Don't use a large page for the first 2/4MB of memory
18280 - * because there are often fixed size MTRRs in there
18281 - * and overlapping MTRRs into large pages can cause
18284 - if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
18286 unsigned int addr2;
18287 pgprot_t prot = PAGE_KERNEL_LARGE;
18289 @@ -228,49 +258,35 @@ static void __init kernel_physical_mappi
18290 is_kernel_text(addr2))
18291 prot = PAGE_KERNEL_LARGE_EXEC;
18294 set_pmd(pmd, pfn_pmd(pfn, prot));
18296 pfn += PTRS_PER_PTE;
18297 - max_pfn_mapped = pfn;
18300 pte = one_page_table_init(pmd);
18302 - for (pte += pte_ofs;
18303 - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
18304 + pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18306 + for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
18307 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
18308 pgprot_t prot = PAGE_KERNEL;
18310 /* XEN: Only map initial RAM allocation. */
18311 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
18312 + if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
18314 if (is_kernel_text(addr))
18315 prot = PAGE_KERNEL_EXEC;
18318 set_pte(pte, pfn_pte(pfn, prot));
18320 - max_pfn_mapped = pfn;
18325 + update_page_count(PG_LEVEL_2M, pages_2m);
18326 + update_page_count(PG_LEVEL_4K, pages_4k);
18329 -#ifndef CONFIG_XEN
18331 -static inline int page_kills_ppro(unsigned long pagenr)
18333 - if (pagenr >= 0x70000 && pagenr <= 0x7003F)
18340 -#define page_kills_ppro(p) 0
18345 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
18346 * is valid. The argument is a physical page number.
18347 @@ -331,30 +347,63 @@ static void __init permanent_kmaps_init(
18348 pkmap_page_table = pte;
18351 -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
18352 +static void __init add_one_highpage_init(struct page *page, int pfn)
18354 - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
18355 - ClearPageReserved(page);
18356 - init_page_count(page);
18357 - if (pfn < xen_start_info->nr_pages)
18358 - __free_page(page);
18359 - totalhigh_pages++;
18361 - SetPageReserved(page);
18362 + ClearPageReserved(page);
18363 + init_page_count(page);
18364 + if (pfn < xen_start_info->nr_pages)
18365 + __free_page(page);
18366 + totalhigh_pages++;
18369 +struct add_highpages_data {
18370 + unsigned long start_pfn;
18371 + unsigned long end_pfn;
18374 +static int __init add_highpages_work_fn(unsigned long start_pfn,
18375 + unsigned long end_pfn, void *datax)
18378 + struct page *page;
18379 + unsigned long final_start_pfn, final_end_pfn;
18380 + struct add_highpages_data *data;
18382 + data = (struct add_highpages_data *)datax;
18384 + final_start_pfn = max(start_pfn, data->start_pfn);
18385 + final_end_pfn = min(end_pfn, data->end_pfn);
18386 + if (final_start_pfn >= final_end_pfn)
18389 + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
18391 + if (!pfn_valid(node_pfn))
18393 + page = pfn_to_page(node_pfn);
18394 + add_one_highpage_init(page, node_pfn);
18401 +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
18402 + unsigned long end_pfn)
18404 + struct add_highpages_data data;
18406 + data.start_pfn = start_pfn;
18407 + data.end_pfn = end_pfn;
18409 + work_with_active_regions(nid, add_highpages_work_fn, &data);
18412 #ifndef CONFIG_NUMA
18413 -static void __init set_highmem_pages_init(int bad_ppro)
18414 +static void __init set_highmem_pages_init(void)
18417 + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
18419 - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
18421 - * Holes under sparsemem might not have no mem_map[]:
18423 - if (pfn_valid(pfn))
18424 - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
18426 totalram_pages += totalhigh_pages;
18428 #endif /* !CONFIG_NUMA */
18429 @@ -362,24 +411,11 @@ static void __init set_highmem_pages_ini
18431 # define kmap_init() do { } while (0)
18432 # define permanent_kmaps_init(pgd_base) do { } while (0)
18433 -# define set_highmem_pages_init(bad_ppro) do { } while (0)
18434 +# define set_highmem_pages_init() do { } while (0)
18435 #endif /* CONFIG_HIGHMEM */
18437 -pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
18438 -EXPORT_SYMBOL(__PAGE_KERNEL);
18440 -pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
18442 pgd_t *swapper_pg_dir;
18444 -static void __init xen_pagetable_setup_start(pgd_t *base)
18448 -static void __init xen_pagetable_setup_done(pgd_t *base)
18453 * Build a proper pagetable for the kernel mappings. Up until this
18454 * point, we've been running on some set of pagetables constructed by
18455 @@ -399,27 +435,10 @@ static void __init xen_pagetable_setup_d
18456 * be partially populated, and so it avoids stomping on any existing
18459 -static void __init pagetable_init(void)
18460 +static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
18462 - pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18463 unsigned long vaddr, end;
18465 - xen_pagetable_setup_start(pgd_base);
18467 - /* Enable PSE if available */
18469 - set_in_cr4(X86_CR4_PSE);
18471 - /* Enable PGE if available */
18472 - if (cpu_has_pge) {
18473 - set_in_cr4(X86_CR4_PGE);
18474 - __PAGE_KERNEL |= _PAGE_GLOBAL;
18475 - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
18478 - kernel_physical_mapping_init(pgd_base);
18479 - remap_numa_kva();
18482 * Fixed mappings, only the page table structure has to be
18483 * created - mappings will be set by set_fixmap():
18484 @@ -429,10 +448,13 @@ static void __init pagetable_init(void)
18485 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
18486 page_table_range_init(vaddr, end, pgd_base);
18487 early_ioremap_reset();
18490 - permanent_kmaps_init(pgd_base);
18491 +static void __init pagetable_init(void)
18493 + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18495 - xen_pagetable_setup_done(pgd_base);
18496 + permanent_kmaps_init(pgd_base);
18499 #if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
18500 @@ -475,7 +497,7 @@ void zap_low_mappings(void)
18504 -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
18505 +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
18506 EXPORT_SYMBOL_GPL(__supported_pte_mask);
18508 #ifdef CONFIG_X86_PAE
18509 @@ -528,42 +550,369 @@ static void __init set_nx(void)
18513 +/* user-defined highmem size */
18514 +static unsigned int highmem_pages = -1;
18517 - * paging_init() sets up the page tables - note that the first 8MB are
18518 - * already mapped by head.S.
18520 - * This routines also unmaps the page at virtual kernel address 0, so
18521 - * that we can trap those pesky NULL-reference errors in the kernel.
18522 + * highmem=size forces highmem to be exactly 'size' bytes.
18523 + * This works even on boxes that have no highmem otherwise.
18524 + * This also works to reduce highmem size on bigger boxes.
18526 -void __init paging_init(void)
18527 +static int __init parse_highmem(char *arg)
18532 + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
18535 +early_param("highmem", parse_highmem);
18538 + * Determine low and high memory ranges:
18540 +void __init find_low_pfn_range(void)
18542 + /* it could update max_pfn */
18544 + /* max_low_pfn is 0, we already have early_res support */
18546 + max_low_pfn = max_pfn;
18547 + if (max_low_pfn > MAXMEM_PFN) {
18548 + if (highmem_pages == -1)
18549 + highmem_pages = max_pfn - MAXMEM_PFN;
18550 + if (highmem_pages + MAXMEM_PFN < max_pfn)
18551 + max_pfn = MAXMEM_PFN + highmem_pages;
18552 + if (highmem_pages + MAXMEM_PFN > max_pfn) {
18553 + printk(KERN_WARNING "only %luMB highmem pages "
18554 + "available, ignoring highmem size of %uMB.\n",
18555 + pages_to_mb(max_pfn - MAXMEM_PFN),
18556 + pages_to_mb(highmem_pages));
18557 + highmem_pages = 0;
18559 + max_low_pfn = MAXMEM_PFN;
18560 +#ifndef CONFIG_HIGHMEM
18561 + /* Maximum memory usable is what is directly addressable */
18562 + printk(KERN_WARNING "Warning only %ldMB will be used.\n",
18564 + if (max_pfn > MAX_NONPAE_PFN)
18565 + printk(KERN_WARNING
18566 + "Use a HIGHMEM64G enabled kernel.\n");
18568 + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
18569 + max_pfn = MAXMEM_PFN;
18570 +#else /* !CONFIG_HIGHMEM */
18571 +#ifndef CONFIG_HIGHMEM64G
18572 + if (max_pfn > MAX_NONPAE_PFN) {
18573 + max_pfn = MAX_NONPAE_PFN;
18574 + printk(KERN_WARNING "Warning only 4GB will be used."
18575 + "Use a HIGHMEM64G enabled kernel.\n");
18577 +#endif /* !CONFIG_HIGHMEM64G */
18578 +#endif /* !CONFIG_HIGHMEM */
18580 + if (highmem_pages == -1)
18581 + highmem_pages = 0;
18582 +#ifdef CONFIG_HIGHMEM
18583 + if (highmem_pages >= max_pfn) {
18584 + printk(KERN_ERR "highmem size specified (%uMB) is "
18585 + "bigger than pages available (%luMB)!.\n",
18586 + pages_to_mb(highmem_pages),
18587 + pages_to_mb(max_pfn));
18588 + highmem_pages = 0;
18590 + if (highmem_pages) {
18591 + if (max_low_pfn - highmem_pages <
18592 + 64*1024*1024/PAGE_SIZE){
18593 + printk(KERN_ERR "highmem size %uMB results in "
18594 + "smaller than 64MB lowmem, ignoring it.\n"
18595 + , pages_to_mb(highmem_pages));
18596 + highmem_pages = 0;
18598 + max_low_pfn -= highmem_pages;
18601 + if (highmem_pages)
18602 + printk(KERN_ERR "ignoring highmem size on non-highmem"
18608 +#ifndef CONFIG_NEED_MULTIPLE_NODES
18609 +void __init initmem_init(unsigned long start_pfn,
18610 + unsigned long end_pfn)
18612 +#ifdef CONFIG_HIGHMEM
18613 + highstart_pfn = highend_pfn = max_pfn;
18614 + if (max_pfn > max_low_pfn)
18615 + highstart_pfn = max_low_pfn;
18616 + memory_present(0, 0, highend_pfn);
18617 + e820_register_active_regions(0, 0, highend_pfn);
18618 + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
18619 + pages_to_mb(highend_pfn - highstart_pfn));
18620 + num_physpages = highend_pfn;
18621 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
18623 + memory_present(0, 0, max_low_pfn);
18624 + e820_register_active_regions(0, 0, max_low_pfn);
18625 + num_physpages = max_low_pfn;
18626 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
18628 +#ifdef CONFIG_FLATMEM
18629 + max_mapnr = num_physpages;
18631 + printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
18632 + pages_to_mb(max_low_pfn));
18634 + setup_bootmem_allocator();
18636 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
18638 +static void __init zone_sizes_init(void)
18640 + unsigned long max_zone_pfns[MAX_NR_ZONES];
18641 + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
18642 + max_zone_pfns[ZONE_DMA] =
18643 + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
18644 + max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
18645 +#ifdef CONFIG_HIGHMEM
18646 + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
18649 + free_area_init_nodes(max_zone_pfns);
18652 +void __init setup_bootmem_allocator(void)
18655 + unsigned long bootmap_size, bootmap;
18656 + unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
18659 + * Initialize the boot-time allocator (with low memory only):
18661 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
18662 + bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
18663 + max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
18665 + if (bootmap == -1L)
18666 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
18667 + reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
18669 + /* don't touch min_low_pfn */
18670 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
18671 + min_low_pfn, end_pfn);
18672 + printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
18673 + max_pfn_mapped<<PAGE_SHIFT);
18674 + printk(KERN_INFO " low ram: %08lx - %08lx\n",
18675 + min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
18676 + printk(KERN_INFO " bootmap %08lx - %08lx\n",
18677 + bootmap, bootmap + bootmap_size);
18678 + for_each_online_node(i)
18679 + free_bootmem_with_active_regions(i, end_pfn);
18680 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
18682 + after_init_bootmem = 1;
18685 +static unsigned long __init extend_init_mapping(unsigned long tables_space)
18687 + unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
18688 + + xen_start_info->nr_pt_frames;
18689 + unsigned long start = start_pfn, va;
18695 + /* Kill mapping of low 1MB. */
18696 + for (va = PAGE_OFFSET; va < (unsigned long)&_text; va += PAGE_SIZE)
18697 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
18700 + /* Ensure init mappings cover kernel text/data and initial tables. */
18701 + while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
18702 + pgd = pgd_offset_k(va);
18703 + pud = pud_offset(pgd, va);
18704 + pmd = pmd_offset(pud, va);
18705 + if (pmd_none(*pmd)) {
18706 + unsigned long pa = start_pfn++ << PAGE_SHIFT;
18708 + memset(__va(pa), 0, PAGE_SIZE);
18709 + make_lowmem_page_readonly(__va(pa),
18710 + XENFEAT_writable_page_tables);
18711 + xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
18713 + pte = pte_offset_kernel(pmd, va);
18714 + if (pte_none(*pte)) {
18715 + pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
18717 + if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
18723 + /* Finally, blow away any spurious initial mappings. */
18725 + pgd = pgd_offset_k(va);
18726 + pud = pud_offset(pgd, va);
18727 + pmd = pmd_offset(pud, va);
18728 + if (pmd_none(*pmd))
18730 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
18735 + if (start_pfn > start)
18736 + reserve_early(start << PAGE_SHIFT,
18737 + start_pfn << PAGE_SHIFT, "INITMAP");
18739 + return start_pfn;
18742 +static void __init find_early_table_space(unsigned long end)
18744 + unsigned long puds, pmds, ptes, tables;
18746 + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
18747 + tables = PAGE_ALIGN(puds * sizeof(pud_t));
18749 + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
18750 + tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
18752 + if (cpu_has_pse) {
18753 + unsigned long extra;
18755 + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
18756 + extra += PMD_SIZE;
18757 + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
18759 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
18761 + tables += PAGE_ALIGN(ptes * sizeof(pte_t));
18764 + tables += PAGE_SIZE
18765 + * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK)
18766 + - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK))
18769 + table_start = extend_init_mapping(tables);
18771 + table_end = table_start;
18772 + table_top = table_start + (tables>>PAGE_SHIFT);
18774 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
18775 + end, table_start << PAGE_SHIFT,
18776 + (table_start << PAGE_SHIFT) + tables);
18779 +unsigned long __init_refok init_memory_mapping(unsigned long start,
18780 + unsigned long end)
18782 + pgd_t *pgd_base = swapper_pg_dir;
18783 + unsigned long start_pfn, end_pfn;
18784 + unsigned long big_page_start;
18787 + * Find space for the kernel direct mapping tables.
18789 + if (!after_init_bootmem)
18790 + find_early_table_space(end);
18792 #ifdef CONFIG_X86_PAE
18795 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
18798 + /* Enable PSE if available */
18800 + set_in_cr4(X86_CR4_PSE);
18802 + /* Enable PGE if available */
18803 + if (cpu_has_pge) {
18804 + set_in_cr4(X86_CR4_PGE);
18805 + __supported_pte_mask |= _PAGE_GLOBAL;
18809 + * Don't use a large page for the first 2/4MB of memory
18810 + * because there are often fixed size MTRRs in there
18811 + * and overlapping MTRRs into large pages can cause
18814 + big_page_start = PMD_SIZE;
18816 + if (start < big_page_start) {
18817 + start_pfn = start >> PAGE_SHIFT;
18818 + end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
18820 + /* head is not big page alignment ? */
18821 + start_pfn = start >> PAGE_SHIFT;
18822 + end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18823 + << (PMD_SHIFT - PAGE_SHIFT);
18825 + if (start_pfn < end_pfn)
18826 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
18828 + /* big page range */
18829 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18830 + << (PMD_SHIFT - PAGE_SHIFT);
18831 + if (start_pfn < (big_page_start >> PAGE_SHIFT))
18832 + start_pfn = big_page_start >> PAGE_SHIFT;
18833 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
18834 + if (start_pfn < end_pfn)
18835 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
18838 + /* tail is not big page alignment ? */
18839 + start_pfn = end_pfn;
18840 + if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
18841 + end_pfn = end >> PAGE_SHIFT;
18842 + if (start_pfn < end_pfn)
18843 + kernel_physical_mapping_init(pgd_base, start_pfn,
18847 + early_ioremap_page_table_range_init(pgd_base);
18849 + __flush_tlb_all();
18851 + if (!after_init_bootmem)
18852 + reserve_early(table_start << PAGE_SHIFT,
18853 + table_end << PAGE_SHIFT, "PGTABLE");
18855 + if (!after_init_bootmem)
18856 + early_memtest(start, end);
18858 + return end >> PAGE_SHIFT;
18863 + * paging_init() sets up the page tables - note that the first 8MB are
18864 + * already mapped by head.S.
18866 + * This routines also unmaps the page at virtual kernel address 0, so
18867 + * that we can trap those pesky NULL-reference errors in the kernel.
18869 +void __init paging_init(void)
18877 - /* Switch to the real shared_info page, and clear the
18879 - set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
18880 - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
18881 - memset(empty_zero_page, 0, sizeof(empty_zero_page));
18883 - /* Setup mapping of lower 1st MB */
18884 - for (i = 0; i < NR_FIX_ISAMAPS; i++)
18885 - if (is_initial_xendomain())
18886 - set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
18888 - __set_fixmap(FIX_ISAMAP_BEGIN - i,
18889 - virt_to_machine(empty_zero_page),
18892 + * NOTE: at this point the bootmem allocator is fully available.
18895 + zone_sizes_init();
18899 @@ -598,7 +947,7 @@ static struct kcore_list kcore_mem, kcor
18900 void __init mem_init(void)
18902 int codesize, reservedpages, datasize, initsize;
18903 - int tmp, bad_ppro;
18908 @@ -606,19 +955,6 @@ void __init mem_init(void)
18909 #ifdef CONFIG_FLATMEM
18912 - bad_ppro = ppro_with_ram_bug();
18914 -#ifdef CONFIG_HIGHMEM
18915 - /* check that fixmap and pkmap do not overlap */
18916 - if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
18918 - "fixmap and kmap areas overlap - this will crash\n");
18919 - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
18920 - PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
18925 /* this will put all low memory onto the freelists */
18926 totalram_pages += free_all_bootmem();
18927 /* XEN: init and count low-mem pages outside initial allocation. */
18928 @@ -636,7 +972,7 @@ void __init mem_init(void)
18929 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
18932 - set_highmem_pages_init(bad_ppro);
18933 + set_highmem_pages_init();
18935 codesize = (unsigned long) &_etext - (unsigned long) &_text;
18936 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
18937 @@ -657,7 +993,6 @@ void __init mem_init(void)
18938 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
18941 -#if 1 /* double-sanity-check paranoia */
18942 printk(KERN_INFO "virtual kernel memory layout:\n"
18943 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
18944 #ifdef CONFIG_HIGHMEM
18945 @@ -698,7 +1033,6 @@ void __init mem_init(void)
18947 BUG_ON(VMALLOC_START > VMALLOC_END);
18948 BUG_ON((unsigned long)high_memory > VMALLOC_START);
18949 -#endif /* double-sanity-check paranoia */
18951 if (boot_cpu_data.wp_works_ok < 0)
18953 @@ -755,6 +1089,8 @@ void mark_rodata_ro(void)
18954 unsigned long start = PFN_ALIGN(_text);
18955 unsigned long size = PFN_ALIGN(_etext) - start;
18957 +#ifndef CONFIG_DYNAMIC_FTRACE
18958 + /* Dynamic tracing modifies the kernel text section */
18959 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
18960 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
18962 @@ -767,6 +1103,8 @@ void mark_rodata_ro(void)
18963 printk(KERN_INFO "Testing CPA: write protecting again\n");
18964 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
18966 +#endif /* CONFIG_DYNAMIC_FTRACE */
18969 size = (unsigned long)__end_rodata - start;
18970 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
18971 @@ -829,3 +1167,9 @@ void free_initrd_mem(unsigned long start
18972 free_init_pages("initrd memory", start, end);
18976 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
18979 + return reserve_bootmem(phys, len, flags);
18981 Index: head-2008-12-01/arch/x86/mm/init_64-xen.c
18982 ===================================================================
18983 --- head-2008-12-01.orig/arch/x86/mm/init_64-xen.c 2008-12-01 11:44:55.000000000 +0100
18984 +++ head-2008-12-01/arch/x86/mm/init_64-xen.c 2008-12-01 11:49:07.000000000 +0100
18986 #include <linux/swap.h>
18987 #include <linux/smp.h>
18988 #include <linux/init.h>
18989 +#include <linux/initrd.h>
18990 #include <linux/pagemap.h>
18991 #include <linux/bootmem.h>
18992 #include <linux/proc_fs.h>
18995 #include <xen/features.h>
18998 + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
18999 + * The direct mapping extends to max_pfn_mapped, so that we can directly access
19000 + * apertures, ACPI and other tables without having to play with fixmaps.
19002 +unsigned long max_low_pfn_mapped;
19003 +unsigned long max_pfn_mapped;
19005 #if CONFIG_XEN_COMPAT <= 0x030002
19006 unsigned int __kernel_page_user;
19007 EXPORT_SYMBOL(__kernel_page_user);
19008 @@ -60,12 +69,11 @@ EXPORT_SYMBOL(__kernel_page_user);
19011 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
19012 -extern unsigned long start_pfn;
19014 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
19015 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
19017 -int direct_gbpages __meminitdata
19018 +int direct_gbpages
19019 #ifdef CONFIG_DIRECT_GBPAGES
19022 @@ -145,55 +153,23 @@ void __meminit early_make_page_readonly(
19023 * around without checking the pgd every time.
19026 -void show_mem(void)
19028 - long i, total = 0, reserved = 0;
19029 - long shared = 0, cached = 0;
19030 - struct page *page;
19031 - pg_data_t *pgdat;
19033 - printk(KERN_INFO "Mem-info:\n");
19034 - show_free_areas();
19035 - for_each_online_pgdat(pgdat) {
19036 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
19038 - * This loop can take a while with 256 GB and
19039 - * 4k pages so defer the NMI watchdog:
19041 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
19042 - touch_nmi_watchdog();
19044 - if (!pfn_valid(pgdat->node_start_pfn + i))
19047 - page = pfn_to_page(pgdat->node_start_pfn + i);
19049 - if (PageReserved(page))
19051 - else if (PageSwapCache(page))
19053 - else if (page_count(page))
19054 - shared += page_count(page) - 1;
19057 - printk(KERN_INFO "%lu pages of RAM\n", total);
19058 - printk(KERN_INFO "%lu reserved pages\n", reserved);
19059 - printk(KERN_INFO "%lu pages shared\n", shared);
19060 - printk(KERN_INFO "%lu pages swap cached\n", cached);
19063 static unsigned long __meminitdata table_start;
19064 +static unsigned long __meminitdata table_cur;
19065 static unsigned long __meminitdata table_end;
19067 -static __init void *spp_getpage(void)
19069 + * NOTE: This function is marked __ref because it calls __init function
19070 + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
19072 +static __ref void *spp_getpage(void)
19077 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
19078 - else if (start_pfn < table_end) {
19079 - ptr = __va(start_pfn << PAGE_SHIFT);
19081 + else if (table_cur < table_end) {
19082 + ptr = __va(table_cur << PAGE_SHIFT);
19084 memset(ptr, 0, PAGE_SIZE);
19086 ptr = alloc_bootmem_pages(PAGE_SIZE);
19087 @@ -208,30 +184,18 @@ static __init void *spp_getpage(void)
19091 -#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
19092 -#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
19094 -static __init void
19095 -set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
19097 +set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
19102 - pte_t *pte, new_pte;
19104 - pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
19107 - pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
19108 - if (pgd_none(*pgd)) {
19110 - "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19113 - pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
19114 + pud = pud_page + pud_index(vaddr);
19115 if (pud_none(*pud)) {
19116 pmd = (pmd_t *) spp_getpage();
19117 make_page_readonly(pmd, XENFEAT_writable_page_tables);
19118 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19119 + pud_populate(&init_mm, pud, pmd);
19120 if (pmd != pmd_offset(pud, 0)) {
19121 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19122 pmd, pmd_offset(pud, 0));
19123 @@ -242,19 +206,20 @@ set_pte_phys(unsigned long vaddr, unsign
19124 if (pmd_none(*pmd)) {
19125 pte = (pte_t *) spp_getpage();
19126 make_page_readonly(pte, XENFEAT_writable_page_tables);
19127 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19128 + pmd_populate_kernel(&init_mm, pmd, pte);
19129 if (pte != pte_offset_kernel(pmd, 0)) {
19130 printk(KERN_ERR "PAGETABLE BUG #02!\n");
19134 - if (pgprot_val(prot))
19135 - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
19137 - new_pte = __pte(0);
19139 pte = pte_offset_kernel(pmd, vaddr);
19140 if (!pte_none(*pte) && __pte_val(new_pte) &&
19141 +#ifdef CONFIG_ACPI
19142 + /* __acpi_map_table() fails to properly call clear_fixmap() */
19143 + (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19144 + vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19146 __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19148 set_pte(pte, new_pte);
19149 @@ -266,15 +231,13 @@ set_pte_phys(unsigned long vaddr, unsign
19150 __flush_tlb_one(vaddr);
19153 -static __init void
19154 -set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
19156 +set_pte_vaddr(unsigned long vaddr, pte_t pteval)
19161 - pte_t *pte, new_pte;
19164 - pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
19165 + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
19167 pgd = pgd_offset_k(vaddr);
19168 if (pgd_none(*pgd)) {
19169 @@ -282,47 +245,51 @@ set_pte_phys_ma(unsigned long vaddr, uns
19170 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19173 - pud = pud_offset(pgd, vaddr);
19174 - if (pud_none(*pud)) {
19175 - pmd = (pmd_t *) spp_getpage();
19176 - make_page_readonly(pmd, XENFEAT_writable_page_tables);
19177 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19178 - if (pmd != pmd_offset(pud, 0)) {
19179 - printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19180 - pmd, pmd_offset(pud, 0));
19181 + pud_page = (pud_t*)pgd_page_vaddr(*pgd);
19182 + set_pte_vaddr_pud(pud_page, vaddr, pteval);
19185 +#ifndef CONFIG_XEN
19187 + * Create large page table mappings for a range of physical addresses.
19189 +static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
19196 + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
19197 + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
19198 + pgd = pgd_offset_k((unsigned long)__va(phys));
19199 + if (pgd_none(*pgd)) {
19200 + pud = (pud_t *) spp_getpage();
19201 + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
19205 - pmd = pmd_offset(pud, vaddr);
19206 - if (pmd_none(*pmd)) {
19207 - pte = (pte_t *) spp_getpage();
19208 - make_page_readonly(pte, XENFEAT_writable_page_tables);
19209 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19210 - if (pte != pte_offset_kernel(pmd, 0)) {
19211 - printk(KERN_ERR "PAGETABLE BUG #02!\n");
19213 + pud = pud_offset(pgd, (unsigned long)__va(phys));
19214 + if (pud_none(*pud)) {
19215 + pmd = (pmd_t *) spp_getpage();
19216 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
19219 + pmd = pmd_offset(pud, phys);
19220 + BUG_ON(!pmd_none(*pmd));
19221 + set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
19223 - new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
19226 - pte = pte_offset_kernel(pmd, vaddr);
19227 - if (!pte_none(*pte) && __pte_val(new_pte) &&
19228 -#ifdef CONFIG_ACPI
19229 - /* __acpi_map_table() fails to properly call clear_fixmap() */
19230 - (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19231 - vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19233 - __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19235 - set_pte(pte, new_pte);
19236 +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
19238 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
19242 - * It's enough to flush this one mapping.
19243 - * (PGE mappings get flushed as well)
19245 - __flush_tlb_one(vaddr);
19246 +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
19248 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
19251 -#ifndef CONFIG_XEN
19253 * The head.S code sets up the kernel high mapping:
19255 @@ -352,33 +319,9 @@ void __init cleanup_highmap(void)
19259 -/* NOTE: this is meant to be run only at boot */
19260 -void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
19262 - unsigned long address = __fix_to_virt(idx);
19264 - if (idx >= __end_of_fixed_addresses) {
19265 - printk(KERN_ERR "Invalid __set_fixmap\n");
19269 - case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
19270 - set_pte_phys(address, phys, prot, 0);
19271 - set_pte_phys(address, phys, prot, 1);
19273 - case FIX_EARLYCON_MEM_BASE:
19274 - xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
19275 - pfn_pte_ma(phys >> PAGE_SHIFT, prot));
19278 - set_pte_phys_ma(address, phys, prot);
19283 static __meminit void *alloc_static_page(unsigned long *phys)
19285 - unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
19286 + unsigned long va = (table_cur << PAGE_SHIFT) + __START_KERNEL_map;
19288 if (after_bootmem) {
19289 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
19290 @@ -387,13 +330,12 @@ static __meminit void *alloc_static_page
19294 - *phys = start_pfn << PAGE_SHIFT;
19296 - memset((void *)va, 0, PAGE_SIZE);
19297 - return (void *)va;
19298 + BUG_ON(!table_cur);
19299 + *phys = table_cur++ << PAGE_SHIFT;
19300 + return memset((void *)va, 0, PAGE_SIZE);
19303 -#define PTE_SIZE PAGE_SIZE
19304 +#define unmap_low_page(p) ((void)(p))
19306 static inline int __meminit make_readonly(unsigned long paddr)
19308 @@ -408,7 +350,7 @@ static inline int __meminit make_readonl
19309 /* Make old page tables read-only. */
19310 if (!xen_feature(XENFEAT_writable_page_tables)
19311 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
19312 - && (paddr < (start_pfn << PAGE_SHIFT)))
19313 + && (paddr < (table_cur << PAGE_SHIFT)))
19317 @@ -425,118 +367,129 @@ static inline int __meminit make_readonl
19321 -#ifndef CONFIG_XEN
19322 -/* Must run before zap_low_mappings */
19323 -__meminit void *early_ioremap(unsigned long addr, unsigned long size)
19324 +static unsigned long __meminit
19325 +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
19327 - pmd_t *pmd, *last_pmd;
19328 - unsigned long vaddr;
19331 - pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19332 - vaddr = __START_KERNEL_map;
19333 - pmd = level2_kernel_pgt;
19334 - last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
19336 - for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
19337 - for (i = 0; i < pmds; i++) {
19338 - if (pmd_present(pmd[i]))
19339 - goto continue_outer_loop;
19341 - vaddr += addr & ~PMD_MASK;
19342 - addr &= PMD_MASK;
19343 + unsigned pages = 0;
19344 + unsigned long last_map_addr = end;
19347 - for (i = 0; i < pmds; i++, addr += PMD_SIZE)
19348 - set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
19349 - __flush_tlb_all();
19351 - return (void *)vaddr;
19352 -continue_outer_loop:
19354 + pte_t *pte = pte_page + pte_index(addr);
19356 + for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
19357 + unsigned long pteval = addr | __PAGE_KERNEL;
19359 + if (addr >= (after_bootmem
19361 + : xen_start_info->nr_pages << PAGE_SHIFT))
19364 + if (__pte_val(*pte))
19367 + if (make_readonly(addr))
19368 + pteval &= ~_PAGE_RW;
19370 + printk(" pte=%p addr=%lx pte=%016lx\n",
19371 + pte, addr, pteval);
19372 + if (!after_bootmem)
19373 + *pte = __pte(pteval & __supported_pte_mask);
19375 + set_pte(pte, __pte(pteval & __supported_pte_mask));
19376 + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
19379 - printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
19381 + update_page_count(PG_LEVEL_4K, pages);
19383 + return last_map_addr;
19387 - * To avoid virtual aliases later:
19389 -__meminit void early_iounmap(void *addr, unsigned long size)
19390 +static unsigned long __meminit
19391 +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
19393 - unsigned long vaddr;
19397 - vaddr = (unsigned long)addr;
19398 - pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19399 - pmd = level2_kernel_pgt + pmd_index(vaddr);
19400 + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
19402 - for (i = 0; i < pmds; i++)
19403 - pmd_clear(pmd + i);
19405 - __flush_tlb_all();
19406 + BUG_ON(!after_bootmem);
19407 + return phys_pte_init(pte, address, end);
19411 static unsigned long __meminit
19412 -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
19413 +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
19414 + unsigned long page_size_mask)
19416 + unsigned long pages = 0;
19417 + unsigned long last_map_addr = end;
19418 + unsigned long start = address;
19420 int i = pmd_index(address);
19422 - for (; i < PTRS_PER_PMD; i++) {
19423 + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
19424 unsigned long pte_phys;
19425 - pmd_t *pmd = pmd_page + i;
19426 - pte_t *pte, *pte_save;
19428 + pmd_t *pmd = pmd_page + pmd_index(address);
19431 if (address >= end)
19434 if (__pmd_val(*pmd)) {
19435 - address += PMD_SIZE;
19436 + if (!pmd_large(*pmd)) {
19437 + spin_lock(&init_mm.page_table_lock);
19438 + last_map_addr = phys_pte_update(pmd, address,
19440 + spin_unlock(&init_mm.page_table_lock);
19442 + /* Count entries we're using from level2_ident_pgt */
19448 - pte = alloc_static_page(&pte_phys);
19450 - for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
19451 - unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
19453 - if (address >= (after_bootmem
19455 - : xen_start_info->nr_pages << PAGE_SHIFT))
19457 - else if (make_readonly(address))
19458 - pteval &= ~_PAGE_RW;
19459 - set_pte(pte, __pte(pteval & __supported_pte_mask));
19460 + if (page_size_mask & (1<<PG_LEVEL_2M)) {
19462 + spin_lock(&init_mm.page_table_lock);
19463 + set_pte((pte_t *)pmd,
19464 + pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19465 + spin_unlock(&init_mm.page_table_lock);
19466 + last_map_addr = (address & PMD_MASK) + PMD_SIZE;
19470 + pte = alloc_static_page(&pte_phys);
19471 + last_map_addr = phys_pte_init(pte, address, end);
19472 + unmap_low_page(pte);
19474 if (!after_bootmem) {
19475 - early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19476 - *pmd = __pmd(pte_phys | _KERNPG_TABLE);
19477 + early_make_page_readonly(pte, XENFEAT_writable_page_tables);
19478 + *pmd = __pmd(pte_phys | _PAGE_TABLE);
19480 - make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19481 - set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
19482 + make_page_readonly(pte, XENFEAT_writable_page_tables);
19483 + spin_lock(&init_mm.page_table_lock);
19484 + pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
19485 + spin_unlock(&init_mm.page_table_lock);
19489 + update_page_count(PG_LEVEL_2M, pages);
19490 + return last_map_addr;
19493 static unsigned long __meminit
19494 -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
19495 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
19496 + unsigned long page_size_mask)
19498 pmd_t *pmd = pmd_offset(pud, 0);
19499 unsigned long last_map_addr;
19501 - spin_lock(&init_mm.page_table_lock);
19502 - last_map_addr = phys_pmd_init(pmd, address, end);
19503 - spin_unlock(&init_mm.page_table_lock);
19504 + BUG_ON(!after_bootmem);
19505 + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
19507 return last_map_addr;
19510 static unsigned long __meminit
19511 -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
19512 +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
19513 + unsigned long page_size_mask)
19515 + unsigned long pages = 0;
19516 unsigned long last_map_addr = end;
19517 int i = pud_index(addr);
19519 @@ -550,29 +503,59 @@ phys_pud_init(pud_t *pud_page, unsigned
19521 if (__pud_val(*pud)) {
19522 if (!pud_large(*pud))
19523 - last_map_addr = phys_pmd_update(pud, addr, end);
19524 + last_map_addr = phys_pmd_update(pud, addr, end,
19529 - if (direct_gbpages) {
19530 + if (page_size_mask & (1<<PG_LEVEL_1G)) {
19532 + spin_lock(&init_mm.page_table_lock);
19533 set_pte((pte_t *)pud,
19534 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19535 + spin_unlock(&init_mm.page_table_lock);
19536 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
19540 pmd = alloc_static_page(&pmd_phys);
19541 + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
19542 + unmap_low_page(pmd);
19544 - spin_lock(&init_mm.page_table_lock);
19545 - *pud = __pud(pmd_phys | _KERNPG_TABLE);
19546 - last_map_addr = phys_pmd_init(pmd, addr, end);
19547 - spin_unlock(&init_mm.page_table_lock);
19549 - early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
19550 + if (!after_bootmem) {
19551 + early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
19552 + if (page_size_mask & (1 << PG_LEVEL_NUM))
19553 + xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
19555 + *pud = __pud(pmd_phys | _PAGE_TABLE);
19557 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
19558 + spin_lock(&init_mm.page_table_lock);
19559 + pud_populate(&init_mm, pud, __va(pmd_phys));
19560 + spin_unlock(&init_mm.page_table_lock);
19564 + update_page_count(PG_LEVEL_1G, pages);
19566 - return last_map_addr >> PAGE_SHIFT;
19567 + return last_map_addr;
19570 +static unsigned long __meminit
19571 +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
19572 + unsigned long page_size_mask)
19576 + if (!after_bootmem) {
19577 + unsigned long addr = __pgd_val(*pgd), *page;
19579 + addr_to_page(addr, page);
19580 + pud = (pud_t *)page;
19582 + pud = (pud_t *)pgd_page_vaddr(*pgd);
19584 + return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
19587 void __init xen_init_pt(void)
19588 @@ -654,7 +637,7 @@ void __init xen_init_pt(void)
19589 static void __init extend_init_mapping(unsigned long tables_space)
19591 unsigned long va = __START_KERNEL_map;
19592 - unsigned long start = start_pfn;
19593 + unsigned long start = table_cur;
19594 unsigned long phys, addr, *pte_page;
19596 pte_t *pte, new_pte;
19597 @@ -674,7 +657,7 @@ static void __init extend_init_mapping(u
19599 /* Ensure init mappings cover kernel text/data and initial tables. */
19600 while (va < (__START_KERNEL_map
19601 - + (start_pfn << PAGE_SHIFT)
19602 + + (table_cur << PAGE_SHIFT)
19604 pmd = (pmd_t *)&page[pmd_index(va)];
19605 if (pmd_none(*pmd)) {
19606 @@ -706,9 +689,9 @@ static void __init extend_init_mapping(u
19610 - if (start_pfn > start)
19611 + if (table_cur > start)
19612 reserve_early(start << PAGE_SHIFT,
19613 - start_pfn << PAGE_SHIFT, "INITMAP");
19614 + table_cur << PAGE_SHIFT, "INITMAP");
19617 static void __init find_early_table_space(unsigned long end)
19618 @@ -717,23 +700,25 @@ static void __init find_early_table_spac
19620 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
19621 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
19622 - ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
19623 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
19625 tables = round_up(puds * 8, PAGE_SIZE) +
19626 round_up(pmds * 8, PAGE_SIZE) +
19627 round_up(ptes * 8, PAGE_SIZE);
19629 + table_cur = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
19630 + xen_start_info->nr_pt_frames;
19632 extend_init_mapping(tables);
19634 - table_start = start_pfn;
19635 + table_start = table_cur;
19636 table_end = table_start + (tables>>PAGE_SHIFT);
19638 - early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
19639 - end, table_start << PAGE_SHIFT,
19640 - (table_start << PAGE_SHIFT) + tables);
19641 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
19642 + end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
19645 -static void __init xen_finish_init_mapping(void)
19646 +static void __init xen_finish_init_mapping(bool reserve)
19648 unsigned long i, start, end;
19650 @@ -762,7 +747,8 @@ static void __init xen_finish_init_mappi
19651 if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
19654 - /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
19655 + /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
19656 + start = table_cur;
19660 @@ -789,8 +775,11 @@ static void __init xen_finish_init_mappi
19664 - /* Disable the 'start_pfn' allocator. */
19665 - table_end = start_pfn;
19666 + /* Disable the 'table_cur' allocator. */
19667 + table_end = table_cur;
19668 + if (reserve && table_cur > start)
19669 + reserve_early(start << PAGE_SHIFT,
19670 + table_cur << PAGE_SHIFT, "FIXMAP");
19673 static void __init init_gbpages(void)
19674 @@ -801,126 +790,89 @@ static void __init init_gbpages(void)
19675 direct_gbpages = 0;
19678 -#ifdef CONFIG_MEMTEST_BOOTPARAM
19680 -static void __init memtest(unsigned long start_phys, unsigned long size,
19681 - unsigned pattern)
19682 +static unsigned long __init kernel_physical_mapping_init(unsigned long start,
19683 + unsigned long end,
19684 + unsigned long page_size_mask)
19687 - unsigned long *start;
19688 - unsigned long start_bad;
19689 - unsigned long last_bad;
19690 - unsigned long val;
19691 - unsigned long start_phys_aligned;
19692 - unsigned long count;
19693 - unsigned long incr;
19695 - switch (pattern) {
19703 - val = 0x5555555555555555UL;
19706 - val = 0xaaaaaaaaaaaaaaaaUL;
19712 - incr = sizeof(unsigned long);
19713 - start_phys_aligned = ALIGN(start_phys, incr);
19714 - count = (size - (start_phys_aligned - start_phys))/incr;
19715 - start = __va(start_phys_aligned);
19719 - for (i = 0; i < count; i++)
19721 - for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
19722 - if (*start != val) {
19723 - if (start_phys_aligned == last_bad + incr) {
19724 - last_bad += incr;
19727 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19728 - val, start_bad, last_bad + incr);
19729 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19731 - start_bad = last_bad = start_phys_aligned;
19736 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19737 - val, start_bad, last_bad + incr);
19738 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19743 -static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
19745 -static int __init parse_memtest(char *arg)
19748 - memtest_pattern = simple_strtoul(arg, NULL, 0);
19751 + unsigned long next, last_map_addr = end;
19753 -early_param("memtest", parse_memtest);
19754 + start = (unsigned long)__va(start);
19755 + end = (unsigned long)__va(end);
19757 -static void __init early_memtest(unsigned long start, unsigned long end)
19759 - u64 t_start, t_size;
19760 - unsigned pattern;
19761 + for (; start < end; start = next) {
19762 + pgd_t *pgd = pgd_offset_k(start);
19763 + unsigned long pud_phys;
19766 - if (!memtest_pattern)
19768 + next = (start + PGDIR_SIZE) & PGDIR_MASK;
19772 - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
19773 - for (pattern = 0; pattern < memtest_pattern; pattern++) {
19776 - while (t_start < end) {
19777 - t_start = find_e820_area_size(t_start, &t_size, 1);
19780 - if (t_start >= end)
19782 - if (t_start + t_size > end)
19783 - t_size = end - t_start;
19785 - printk(KERN_CONT "\n %016llx - %016llx pattern %d",
19786 - (unsigned long long)t_start,
19787 - (unsigned long long)t_start + t_size, pattern);
19788 + if (__pgd_val(*pgd)) {
19789 + last_map_addr = phys_pud_update(pgd, __pa(start),
19790 + __pa(end), page_size_mask);
19794 - memtest(t_start, t_size, pattern);
19795 + pud = alloc_static_page(&pud_phys);
19796 + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
19798 + unmap_low_page(pud);
19800 - t_start += t_size;
19801 + if(!after_bootmem) {
19802 + early_make_page_readonly(pud, XENFEAT_writable_page_tables);
19803 + xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
19805 + make_page_readonly(pud, XENFEAT_writable_page_tables);
19806 + spin_lock(&init_mm.page_table_lock);
19807 + pgd_populate(&init_mm, pgd, __va(pud_phys));
19808 + spin_unlock(&init_mm.page_table_lock);
19811 - printk(KERN_CONT "\n");
19813 + return last_map_addr;
19816 -static void __init early_memtest(unsigned long start, unsigned long end)
19818 +struct map_range {
19819 + unsigned long start;
19820 + unsigned long end;
19821 + unsigned page_size_mask;
19824 +#define NR_RANGE_MR 5
19826 +static int save_mr(struct map_range *mr, int nr_range,
19827 + unsigned long start_pfn, unsigned long end_pfn,
19828 + unsigned long page_size_mask)
19831 + if (start_pfn < end_pfn) {
19832 + if (nr_range >= NR_RANGE_MR)
19833 + panic("run out of range for init_memory_mapping\n");
19834 + mr[nr_range].start = start_pfn<<PAGE_SHIFT;
19835 + mr[nr_range].end = end_pfn<<PAGE_SHIFT;
19836 + mr[nr_range].page_size_mask = page_size_mask;
19845 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
19846 * This runs before bootmem is initialized and gets pages directly from
19847 * the physical memory. To access them they are temporarily mapped.
19849 -unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
19850 +unsigned long __init_refok init_memory_mapping(unsigned long start,
19851 + unsigned long end)
19853 - unsigned long next, last_map_addr = end;
19854 - unsigned long start_phys = start, end_phys = end;
19855 + unsigned long last_map_addr = 0;
19856 + unsigned long page_size_mask = 0;
19857 + unsigned long start_pfn, end_pfn;
19858 + bool first = !table_start;
19859 + struct map_range mr[NR_RANGE_MR];
19862 printk(KERN_INFO "init_memory_mapping\n");
19864 @@ -931,51 +883,123 @@ unsigned long __init_refok init_memory_m
19865 * memory mapped. Unfortunately this is done currently before the
19866 * nodes are discovered.
19868 - if (!after_bootmem) {
19869 + if (!after_bootmem)
19871 - find_early_table_space(end);
19874 - start = (unsigned long)__va(start);
19875 - end = (unsigned long)__va(end);
19876 + if (direct_gbpages)
19877 + page_size_mask |= 1 << PG_LEVEL_1G;
19879 + page_size_mask |= 1 << PG_LEVEL_2M;
19881 + memset(mr, 0, sizeof(mr));
19884 + /* head if not big page alignment ?*/
19885 + start_pfn = start >> PAGE_SHIFT;
19886 + end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
19887 + << (PMD_SHIFT - PAGE_SHIFT);
19888 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
19890 + /* big page (2M) range*/
19891 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
19892 + << (PMD_SHIFT - PAGE_SHIFT);
19893 + end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
19894 + << (PUD_SHIFT - PAGE_SHIFT);
19895 + if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
19896 + end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
19897 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
19898 + page_size_mask & (1<<PG_LEVEL_2M));
19900 + /* big page (1G) range */
19901 + start_pfn = end_pfn;
19902 + end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
19903 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
19905 + ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
19907 + /* tail is not big page (1G) alignment */
19908 + start_pfn = end_pfn;
19909 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
19910 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
19911 + page_size_mask & (1<<PG_LEVEL_2M));
19913 + /* tail is not big page (2M) alignment */
19914 + start_pfn = end_pfn;
19915 + end_pfn = end>>PAGE_SHIFT;
19916 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
19918 + /* try to merge same page size and continuous */
19919 + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
19920 + unsigned long old_start;
19921 + if (mr[i].end != mr[i+1].start ||
19922 + mr[i].page_size_mask != mr[i+1].page_size_mask)
19925 + old_start = mr[i].start;
19926 + memmove(&mr[i], &mr[i+1],
19927 + (nr_range - 1 - i) * sizeof (struct map_range));
19928 + mr[i--].start = old_start;
19932 - for (; start < end; start = next) {
19933 - pgd_t *pgd = pgd_offset_k(start);
19934 - unsigned long pud_phys;
19936 + for (i = 0; i < nr_range; i++)
19937 + printk(KERN_DEBUG " %010lx - %010lx page %s\n",
19938 + mr[i].start, mr[i].end,
19939 + (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
19940 + (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
19942 - if (after_bootmem)
19943 - pud = pud_offset(pgd, start & PGDIR_MASK);
19945 - pud = alloc_static_page(&pud_phys);
19946 - next = start + PGDIR_SIZE;
19949 - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
19950 - if (!after_bootmem) {
19951 - early_make_page_readonly(pud, XENFEAT_writable_page_tables);
19952 - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
19956 + find_early_table_space(end);
19958 - if (!after_bootmem) {
19959 - BUG_ON(start_pfn != table_end);
19960 - xen_finish_init_mapping();
19961 + for (i = 0; i < nr_range; i++)
19962 + last_map_addr = kernel_physical_mapping_init(
19963 + mr[i].start, mr[i].end,
19964 + mr[i].page_size_mask);
19966 + BUG_ON(table_cur > table_end);
19967 + if (start < (table_start << PAGE_SHIFT)) {
19968 + WARN_ON(table_cur != table_end);
19969 + xen_finish_init_mapping(!first);
19974 - if (!after_bootmem)
19975 + if (first && table_end > table_start)
19976 reserve_early(table_start << PAGE_SHIFT,
19977 table_end << PAGE_SHIFT, "PGTABLE");
19979 + printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
19980 + last_map_addr, end);
19982 if (!after_bootmem)
19983 - early_memtest(start_phys, end_phys);
19984 + early_memtest(start, end);
19986 - return last_map_addr;
19987 + return last_map_addr >> PAGE_SHIFT;
19990 #ifndef CONFIG_NUMA
19991 +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
19993 + unsigned long bootmap_size, bootmap;
19995 + e820_register_active_regions(0, start_pfn, end_pfn);
19997 + if (end_pfn > xen_start_info->nr_pages)
19998 + end_pfn = xen_start_info->nr_pages;
20000 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
20001 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
20003 + if (bootmap == -1L)
20004 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
20005 + /* don't touch min_low_pfn */
20006 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
20008 + free_bootmem_with_active_regions(0, end_pfn);
20009 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
20010 + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
20013 void __init paging_init(void)
20015 unsigned long max_zone_pfns[MAX_NR_ZONES];
20016 @@ -983,9 +1007,9 @@ void __init paging_init(void)
20017 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
20018 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
20019 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
20020 - max_zone_pfns[ZONE_NORMAL] = end_pfn;
20021 + max_zone_pfns[ZONE_NORMAL] = max_pfn;
20023 - memory_present(0, 0, end_pfn);
20024 + memory_present(0, 0, max_pfn);
20026 free_area_init_nodes(max_zone_pfns);
20028 @@ -1076,8 +1100,8 @@ void __init mem_init(void)
20029 init_page_count(pfn_to_page(pfn));
20032 - reservedpages = end_pfn - totalram_pages -
20033 - absent_pages_in_range(0, end_pfn);
20034 + reservedpages = max_pfn - totalram_pages -
20035 + absent_pages_in_range(0, max_pfn);
20038 codesize = (unsigned long) &_etext - (unsigned long) &_text;
20039 @@ -1096,7 +1120,7 @@ void __init mem_init(void)
20040 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
20041 "%ldk reserved, %ldk data, %ldk init)\n",
20042 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
20043 - end_pfn << (PAGE_SHIFT-10),
20044 + max_pfn << (PAGE_SHIFT-10),
20046 reservedpages << (PAGE_SHIFT-10),
20048 @@ -1159,6 +1183,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
20049 void mark_rodata_ro(void)
20051 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
20052 + unsigned long rodata_start =
20053 + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20055 +#ifdef CONFIG_DYNAMIC_FTRACE
20056 + /* Dynamic tracing modifies the kernel text section */
20057 + start = rodata_start;
20060 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
20061 (end - start) >> 10);
20062 @@ -1168,8 +1199,7 @@ void mark_rodata_ro(void)
20063 * The rodata section (but not the kernel text!) should also be
20066 - start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20067 - set_memory_nx(start, (end - start) >> PAGE_SHIFT);
20068 + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
20072 @@ -1191,24 +1221,26 @@ void free_initrd_mem(unsigned long start
20076 -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
20077 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
20084 unsigned long pfn = phys >> PAGE_SHIFT;
20086 - if (pfn >= end_pfn) {
20087 + if (pfn >= max_pfn) {
20089 * This can happen with kdump kernels when accessing
20092 if (pfn < max_pfn_mapped)
20096 - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
20097 + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
20103 /* Should check here against the e820 map to avoid double free */
20104 @@ -1216,9 +1248,13 @@ void __init reserve_bootmem_generic(unsi
20105 nid = phys_to_nid(phys);
20106 next_nid = phys_to_nid(phys + len - 1);
20107 if (nid == next_nid)
20108 - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
20109 + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
20111 - reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20112 + ret = reserve_bootmem(phys, len, flags);
20118 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20120 @@ -1231,6 +1267,8 @@ void __init reserve_bootmem_generic(unsi
20121 set_dma_reserve(dma_reserve);
20128 int kern_addr_valid(unsigned long addr)
20129 @@ -1335,7 +1373,7 @@ vmemmap_populate(struct page *start_page
20132 for (; addr < end; addr = next) {
20133 - next = pmd_addr_end(addr, end);
20136 pgd = vmemmap_pgd_populate(addr, node);
20138 @@ -1345,33 +1383,51 @@ vmemmap_populate(struct page *start_page
20142 - pmd = pmd_offset(pud, addr);
20143 - if (pmd_none(*pmd)) {
20146 + if (!cpu_has_pse) {
20147 + next = (addr + PAGE_SIZE) & PAGE_MASK;
20148 + pmd = vmemmap_pmd_populate(pud, addr, node);
20153 + p = vmemmap_pte_populate(pmd, addr, node);
20155 - p = vmemmap_alloc_block(PMD_SIZE, node);
20159 - entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20160 - PAGE_KERNEL_LARGE);
20161 - set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20163 - /* check to see if we have contiguous blocks */
20164 - if (p_end != p || node_start != node) {
20166 - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20167 - addr_start, addr_end-1, p_start, p_end-1, node_start);
20168 - addr_start = addr;
20169 - node_start = node;
20172 - addr_end = addr + PMD_SIZE;
20173 - p_end = p + PMD_SIZE;
20174 + addr_end = addr + PAGE_SIZE;
20175 + p_end = p + PAGE_SIZE;
20177 - vmemmap_verify((pte_t *)pmd, node, addr, next);
20178 + next = pmd_addr_end(addr, end);
20180 + pmd = pmd_offset(pud, addr);
20181 + if (pmd_none(*pmd)) {
20184 + p = vmemmap_alloc_block(PMD_SIZE, node);
20188 + entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20189 + PAGE_KERNEL_LARGE);
20190 + set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20192 + /* check to see if we have contiguous blocks */
20193 + if (p_end != p || node_start != node) {
20195 + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20196 + addr_start, addr_end-1, p_start, p_end-1, node_start);
20197 + addr_start = addr;
20198 + node_start = node;
20202 + addr_end = addr + PMD_SIZE;
20203 + p_end = p + PMD_SIZE;
20205 + vmemmap_verify((pte_t *)pmd, node, addr, next);
20211 Index: head-2008-12-01/arch/x86/mm/ioremap-xen.c
20212 ===================================================================
20213 --- head-2008-12-01.orig/arch/x86/mm/ioremap-xen.c 2008-12-01 11:44:55.000000000 +0100
20214 +++ head-2008-12-01/arch/x86/mm/ioremap-xen.c 2008-12-01 11:49:07.000000000 +0100
20216 #include <linux/pfn.h>
20217 #include <linux/slab.h>
20218 #include <linux/vmalloc.h>
20219 +#include <linux/mmiotrace.h>
20221 #include <asm/cacheflush.h>
20222 #include <asm/e820.h>
20223 @@ -274,7 +275,8 @@ int ioremap_check_change_attr(unsigned l
20224 for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
20225 unsigned long pfn = mfn_to_local_pfn(mfn);
20227 - if (pfn >= max_pfn_mapped)
20228 + if (pfn >= max_low_pfn_mapped &&
20229 + (pfn < (1UL<<(32 - PAGE_SHIFT)) || pfn >= max_pfn_mapped))
20231 rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
20232 PAGE_SIZE, prot_val);
20233 @@ -297,11 +299,14 @@ static void __iomem *__ioremap_caller(re
20235 unsigned long mfn, offset, vaddr;
20236 resource_size_t last_addr;
20237 + const resource_size_t unaligned_phys_addr = phys_addr;
20238 + const unsigned long unaligned_size = size;
20239 struct vm_struct *area;
20240 unsigned long new_prot_val;
20243 domid_t domid = DOMID_IO;
20244 + void __iomem *ret_addr;
20246 /* Don't allow wraparound or zero size */
20247 last_addr = phys_addr + size - 1;
20248 @@ -318,7 +323,7 @@ static void __iomem *__ioremap_caller(re
20250 * Don't remap the low PCI/ISA area, it's always mapped..
20252 - if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
20253 + if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
20254 return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
20257 @@ -342,7 +347,7 @@ static void __iomem *__ioremap_caller(re
20258 phys_addr &= PAGE_MASK;
20259 size = PAGE_ALIGN(last_addr+1) - phys_addr;
20261 - retval = reserve_memtype(phys_addr, phys_addr + size,
20262 + retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
20263 prot_val, &new_prot_val);
20265 pr_debug("Warning: reserve_memtype returned %d\n", retval);
20266 @@ -410,7 +415,10 @@ static void __iomem *__ioremap_caller(re
20270 - return (void __iomem *) (vaddr + offset);
20271 + ret_addr = (void __iomem *) (vaddr + offset);
20272 + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
20278 @@ -438,7 +446,7 @@ void __iomem *ioremap_nocache(resource_s
20281 * Ideally, this should be:
20282 - * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20283 + * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20285 * Till we fix all X drivers to use ioremap_wc(), we will use
20287 @@ -462,7 +470,7 @@ EXPORT_SYMBOL(ioremap_nocache);
20289 void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
20291 - if (pat_wc_enabled)
20293 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
20294 __builtin_return_address(0));
20296 @@ -502,6 +510,14 @@ static void __iomem *ioremap_default(res
20300 +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
20301 + unsigned long prot_val)
20303 + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
20304 + __builtin_return_address(0));
20306 +EXPORT_SYMBOL(ioremap_prot);
20309 * iounmap - Free a IO remapping
20310 * @addr: virtual address from ioremap_*
20311 @@ -526,6 +542,8 @@ void iounmap(volatile void __iomem *addr
20312 addr = (volatile void __iomem *)
20313 (PAGE_MASK & (unsigned long __force)addr);
20315 + mmiotrace_iounmap(addr);
20317 /* Use the vm area unlocked, assuming the caller
20318 ensures there isn't another iounmap for the same address
20319 in parallel. Reuse of the virtual address is prevented by
20320 @@ -533,7 +551,7 @@ void iounmap(volatile void __iomem *addr
20321 cpa takes care of the direct mappings. */
20322 read_lock(&vmlist_lock);
20323 for (p = vmlist; p; p = p->next) {
20324 - if (p->addr == addr)
20325 + if (p->addr == (void __force *)addr)
20328 read_unlock(&vmlist_lock);
20329 @@ -547,7 +565,7 @@ void iounmap(volatile void __iomem *addr
20330 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
20332 /* Finally remove it */
20333 - o = remove_vm_area((void *)addr);
20334 + o = remove_vm_area((void __force *)addr);
20335 BUG_ON(p != o || o == NULL);
20338 @@ -567,7 +585,7 @@ void *xlate_dev_mem_ptr(unsigned long ph
20339 if (page_is_ram(start >> PAGE_SHIFT))
20342 - addr = (void *)ioremap_default(start, PAGE_SIZE);
20343 + addr = (void __force *)ioremap_default(start, PAGE_SIZE);
20345 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
20347 @@ -595,8 +613,7 @@ static int __init early_ioremap_debug_se
20348 early_param("early_ioremap_debug", early_ioremap_debug_setup);
20350 static __initdata int after_paging_init;
20351 -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
20352 - __section(.bss.page_aligned);
20353 +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
20355 #ifdef CONFIG_X86_32
20356 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
20357 @@ -693,10 +710,11 @@ static void __init __early_set_fixmap(en
20360 pte = early_ioremap_pte(addr);
20362 if (pgprot_val(flags))
20363 set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
20365 - pte_clear(NULL, addr, pte);
20366 + pte_clear(&init_mm, addr, pte);
20367 __flush_tlb_one(addr);
20370 @@ -724,13 +742,11 @@ static int __init check_early_ioremap_le
20372 if (!early_ioremap_nested)
20375 - printk(KERN_WARNING
20376 + WARN(1, KERN_WARNING
20377 "Debug warning: early ioremap leak of %d areas detected.\n",
20378 - early_ioremap_nested);
20379 + early_ioremap_nested);
20380 printk(KERN_WARNING
20381 - "please boot with early_ioremap_debug and report the dmesg.\n");
20383 + "please boot with early_ioremap_debug and report the dmesg.\n");
20387 Index: head-2008-12-01/arch/x86/mm/pageattr-xen.c
20388 ===================================================================
20389 --- head-2008-12-01.orig/arch/x86/mm/pageattr-xen.c 2008-12-01 12:19:27.000000000 +0100
20390 +++ head-2008-12-01/arch/x86/mm/pageattr-xen.c 2008-12-01 11:49:07.000000000 +0100
20391 @@ -34,6 +34,47 @@ struct cpa_data {
20392 unsigned force_split : 1;
20395 +#ifdef CONFIG_PROC_FS
20396 +static unsigned long direct_pages_count[PG_LEVEL_NUM];
20398 +void update_page_count(int level, unsigned long pages)
20400 + unsigned long flags;
20402 + /* Protect against CPA */
20403 + spin_lock_irqsave(&pgd_lock, flags);
20404 + direct_pages_count[level] += pages;
20405 + spin_unlock_irqrestore(&pgd_lock, flags);
20408 +static void split_page_count(int level)
20410 + direct_pages_count[level]--;
20411 + direct_pages_count[level - 1] += PTRS_PER_PTE;
20414 +int arch_report_meminfo(char *page)
20416 + int n = sprintf(page, "DirectMap4k: %8lu kB\n",
20417 + direct_pages_count[PG_LEVEL_4K] << 2);
20418 +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
20419 + n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
20420 + direct_pages_count[PG_LEVEL_2M] << 11);
20422 + n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
20423 + direct_pages_count[PG_LEVEL_2M] << 12);
20425 +#ifdef CONFIG_X86_64
20426 + if (direct_gbpages)
20427 + n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
20428 + direct_pages_count[PG_LEVEL_1G] << 20);
20433 +static inline void split_page_count(int level) { }
20436 #ifdef CONFIG_X86_64
20438 static inline unsigned long highmap_start_pfn(void)
20439 @@ -106,7 +147,7 @@ static void cpa_flush_all(unsigned long
20441 BUG_ON(irqs_disabled());
20443 - on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
20444 + on_each_cpu(__cpa_flush_all, (void *) cache, 1);
20447 static void __cpa_flush_range(void *arg)
20448 @@ -127,7 +168,7 @@ static void cpa_flush_range(unsigned lon
20449 BUG_ON(irqs_disabled());
20450 WARN_ON(PAGE_ALIGN(start) != start);
20452 - on_each_cpu(__cpa_flush_range, NULL, 1, 1);
20453 + on_each_cpu(__cpa_flush_range, NULL, 1);
20457 @@ -229,6 +270,7 @@ pte_t *lookup_address(unsigned long addr
20459 return pte_offset_kernel(pmd, address);
20461 +EXPORT_SYMBOL_GPL(lookup_address);
20464 * Set the new pmd in all the pgds we know about:
20465 @@ -509,6 +551,16 @@ static int split_large_page(pte_t *kpte,
20469 + if (address >= (unsigned long)__va(0) &&
20470 + address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
20471 + split_page_count(level);
20473 +#ifdef CONFIG_X86_64
20474 + if (address >= (unsigned long)__va(1UL<<32) &&
20475 + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
20476 + split_page_count(level);
20480 * Get the target mfn from the original entry:
20482 @@ -565,10 +617,9 @@ repeat:
20483 if (!__pte_val(old_pte)) {
20486 - printk(KERN_WARNING "CPA: called for zero pte. "
20487 + WARN(1, KERN_WARNING "CPA: called for zero pte. "
20488 "vaddr = %lx cpa->vaddr = %lx\n", address,
20494 @@ -633,15 +684,24 @@ static int cpa_process_alias(struct cpa_
20495 struct cpa_data alias_cpa;
20498 - if (cpa->pfn > max_pfn_mapped)
20499 + if (cpa->pfn >= max_pfn_mapped)
20502 +#ifdef CONFIG_X86_64
20503 + if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
20507 * No need to redo, when the primary call touched the direct
20510 - if (!within(cpa->vaddr, PAGE_OFFSET,
20511 - PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
20512 + if (!(within(cpa->vaddr, PAGE_OFFSET,
20513 + PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
20514 +#ifdef CONFIG_X86_64
20515 + || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
20516 + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
20521 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
20522 @@ -809,7 +869,7 @@ int set_memory_uc(unsigned long addr, in
20524 * for now UC MINUS. see comments in ioremap_nocache()
20526 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20527 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20528 _PAGE_CACHE_UC_MINUS, NULL))
20531 @@ -825,10 +885,10 @@ int _set_memory_wc(unsigned long addr, i
20533 int set_memory_wc(unsigned long addr, int numpages)
20535 - if (!pat_wc_enabled)
20536 + if (!pat_enabled)
20537 return set_memory_uc(addr, numpages);
20539 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20540 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20541 _PAGE_CACHE_WC, NULL))
20544 @@ -844,7 +904,7 @@ int _set_memory_wb(unsigned long addr, i
20546 int set_memory_wb(unsigned long addr, int numpages)
20548 - free_memtype(addr, addr + numpages * PAGE_SIZE);
20549 + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
20551 return _set_memory_wb(addr, numpages);
20553 Index: head-2008-12-01/arch/x86/mm/pat-xen.c
20554 ===================================================================
20555 --- head-2008-12-01.orig/arch/x86/mm/pat-xen.c 2008-12-01 11:44:55.000000000 +0100
20556 +++ head-2008-12-01/arch/x86/mm/pat-xen.c 2008-12-01 11:49:07.000000000 +0100
20558 #include <linux/gfp.h>
20559 #include <linux/fs.h>
20560 #include <linux/bootmem.h>
20561 +#include <linux/debugfs.h>
20562 +#include <linux/seq_file.h>
20564 #include <asm/msr.h>
20565 #include <asm/tlbflush.h>
20566 @@ -26,11 +28,11 @@
20567 #include <asm/io.h>
20569 #ifdef CONFIG_X86_PAT
20570 -int __read_mostly pat_wc_enabled = 1;
20571 +int __read_mostly pat_enabled = 1;
20573 void __cpuinit pat_disable(char *reason)
20575 - pat_wc_enabled = 0;
20577 printk(KERN_INFO "%s\n", reason);
20580 @@ -42,6 +44,19 @@ static int __init nopat(char *str)
20581 early_param("nopat", nopat);
20585 +static int debug_enable;
20586 +static int __init pat_debug_setup(char *str)
20588 + debug_enable = 1;
20591 +__setup("debugpat", pat_debug_setup);
20593 +#define dprintk(fmt, arg...) \
20594 + do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
20597 static u64 __read_mostly boot_pat_state;
20600 @@ -53,24 +68,25 @@ enum {
20601 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
20604 -#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
20605 +#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
20607 void pat_init(void)
20611 - if (!pat_wc_enabled)
20612 + if (!pat_enabled)
20615 /* Paranoia check. */
20616 - if (!cpu_has_pat) {
20617 - printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
20618 + if (!cpu_has_pat && boot_pat_state) {
20620 - * Panic if this happens on the secondary CPU, and we
20621 + * If this happens we are on a secondary CPU, but
20622 * switched to PAT on the boot CPU. We have no way to
20625 - BUG_ON(boot_pat_state);
20627 + printk(KERN_ERR "PAT enabled, "
20628 + "but not supported by secondary CPU\n");
20633 @@ -87,8 +103,8 @@ void pat_init(void)
20634 * 011 UC _PAGE_CACHE_UC
20637 - pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
20638 - PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
20639 + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
20640 + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
20642 /* Boot CPU check */
20643 if (!boot_pat_state)
20644 @@ -113,13 +129,13 @@ void pat_init(void)
20645 static char *cattr_name(unsigned long flags)
20647 switch (flags & _PAGE_CACHE_MASK) {
20648 - case _PAGE_CACHE_UC: return "uncached";
20649 - case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20650 - case _PAGE_CACHE_WB: return "write-back";
20651 - case _PAGE_CACHE_WC: return "write-combining";
20652 - case _PAGE_CACHE_WP: return "write-protected";
20653 - case _PAGE_CACHE_WT: return "write-through";
20654 - default: return "broken";
20655 + case _PAGE_CACHE_UC: return "uncached";
20656 + case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20657 + case _PAGE_CACHE_WB: return "write-back";
20658 + case _PAGE_CACHE_WC: return "write-combining";
20659 + case _PAGE_CACHE_WP: return "write-protected";
20660 + case _PAGE_CACHE_WT: return "write-through";
20661 + default: return "broken";
20665 @@ -157,49 +173,55 @@ static DEFINE_SPINLOCK(memtype_lock); /
20666 * The intersection is based on "Effective Memory Type" tables in IA-32
20669 -static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
20670 - unsigned long *ret_prot)
20671 +static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
20673 - unsigned long pat_type;
20676 - pat_type = prot & _PAGE_CACHE_MASK;
20677 - prot &= (~_PAGE_CACHE_MASK);
20680 - * We return the PAT request directly for types where PAT takes
20681 - * precedence with respect to MTRR and for UC_MINUS.
20682 - * Consistency checks with other PAT requests is done later
20683 - * while going through memtype list.
20685 - if (pat_type == _PAGE_CACHE_WC) {
20686 - *ret_prot = prot | _PAGE_CACHE_WC;
20688 - } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
20689 - *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
20691 - } else if (pat_type == _PAGE_CACHE_UC) {
20692 - *ret_prot = prot | _PAGE_CACHE_UC;
20697 * Look for MTRR hint to get the effective type in case where PAT
20698 * request is for WB.
20700 - mtrr_type = mtrr_type_lookup(start, end);
20701 + if (req_type == _PAGE_CACHE_WB) {
20704 - if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
20705 - *ret_prot = prot | _PAGE_CACHE_UC;
20706 - } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
20707 - *ret_prot = prot | _PAGE_CACHE_WC;
20709 - *ret_prot = prot | _PAGE_CACHE_WB;
20710 + mtrr_type = mtrr_type_lookup(start, end);
20711 + if (mtrr_type == MTRR_TYPE_UNCACHABLE)
20712 + return _PAGE_CACHE_UC;
20713 + if (mtrr_type == MTRR_TYPE_WRCOMB)
20714 + return _PAGE_CACHE_WC;
20720 +static int chk_conflict(struct memtype *new, struct memtype *entry,
20721 + unsigned long *type)
20723 + if (new->type != entry->type) {
20725 + new->type = entry->type;
20726 + *type = entry->type;
20731 + /* check overlaps with more than one entry in the list */
20732 + list_for_each_entry_continue(entry, &memtype_list, nd) {
20733 + if (new->end <= entry->start)
20735 + else if (new->type != entry->type)
20741 + printk(KERN_INFO "%s:%d conflicting memory types "
20742 + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
20743 + new->end, cattr_name(new->type), cattr_name(entry->type));
20747 +static struct memtype *cached_entry;
20748 +static u64 cached_start;
20751 * req_type typically has one of the:
20753 @@ -210,37 +232,36 @@ static int pat_x_mtrr_type(u64 start, u6
20754 * req_type will have a special case value '-1', when requester want to inherit
20755 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
20757 - * If ret_type is NULL, function will return an error if it cannot reserve the
20758 - * region with req_type. If ret_type is non-null, function will return
20759 - * available type in ret_type in case of no error. In case of any error
20760 + * If new_type is NULL, function will return an error if it cannot reserve the
20761 + * region with req_type. If new_type is non-NULL, function will return
20762 + * available type in new_type in case of no error. In case of any error
20763 * it will return a negative return value.
20765 int reserve_memtype(u64 start, u64 end, unsigned long req_type,
20766 - unsigned long *ret_type)
20767 + unsigned long *new_type)
20769 - struct memtype *new_entry = NULL;
20770 - struct memtype *parse;
20771 + struct memtype *new, *entry;
20772 unsigned long actual_type;
20773 + struct list_head *where;
20776 - /* Only track when pat_wc_enabled */
20777 - if (!pat_wc_enabled) {
20778 + BUG_ON(start >= end); /* end is exclusive */
20780 + if (!pat_enabled) {
20781 /* This is identical to page table setting without PAT */
20783 - if (req_type == -1) {
20784 - *ret_type = _PAGE_CACHE_WB;
20786 - *ret_type = req_type;
20789 + if (req_type == -1)
20790 + *new_type = _PAGE_CACHE_WB;
20792 + *new_type = req_type & _PAGE_CACHE_MASK;
20797 /* Low ISA region is always mapped WB in page table. No need to track */
20798 - if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
20800 - *ret_type = _PAGE_CACHE_WB;
20802 + if (is_ISA_range(start, end - 1)) {
20804 + *new_type = _PAGE_CACHE_WB;
20808 @@ -253,206 +274,118 @@ int reserve_memtype(u64 start, u64 end,
20810 u8 mtrr_type = mtrr_type_lookup(start, end);
20812 - if (mtrr_type == MTRR_TYPE_WRBACK) {
20813 - req_type = _PAGE_CACHE_WB;
20814 + if (mtrr_type == MTRR_TYPE_WRBACK)
20815 actual_type = _PAGE_CACHE_WB;
20817 - req_type = _PAGE_CACHE_UC_MINUS;
20819 actual_type = _PAGE_CACHE_UC_MINUS;
20822 - req_type &= _PAGE_CACHE_MASK;
20823 - err = pat_x_mtrr_type(start, end, req_type, &actual_type);
20828 - *ret_type = actual_type;
20830 + actual_type = pat_x_mtrr_type(start, end,
20831 + req_type & _PAGE_CACHE_MASK);
20836 - new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
20838 + new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
20842 - new_entry->start = start;
20843 - new_entry->end = end;
20844 - new_entry->type = actual_type;
20845 + new->start = start;
20847 + new->type = actual_type;
20850 - *ret_type = actual_type;
20852 + *new_type = actual_type;
20854 spin_lock(&memtype_lock);
20856 - /* Search for existing mapping that overlaps the current range */
20857 - list_for_each_entry(parse, &memtype_list, nd) {
20858 - struct memtype *saved_ptr;
20859 + if (cached_entry && start >= cached_start)
20860 + entry = cached_entry;
20862 + entry = list_entry(&memtype_list, struct memtype, nd);
20864 - if (parse->start >= end) {
20865 - pr_debug("New Entry\n");
20866 - list_add(&new_entry->nd, parse->nd.prev);
20867 - new_entry = NULL;
20868 + /* Search for existing mapping that overlaps the current range */
20870 + list_for_each_entry_continue(entry, &memtype_list, nd) {
20871 + if (end <= entry->start) {
20872 + where = entry->nd.prev;
20873 + cached_entry = list_entry(where, struct memtype, nd);
20877 - if (start <= parse->start && end >= parse->start) {
20878 - if (actual_type != parse->type && ret_type) {
20879 - actual_type = parse->type;
20880 - *ret_type = actual_type;
20881 - new_entry->type = actual_type;
20884 - if (actual_type != parse->type) {
20886 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20887 - current->comm, current->pid,
20889 - cattr_name(actual_type),
20890 - cattr_name(parse->type));
20895 - saved_ptr = parse;
20897 - * Check to see whether the request overlaps more
20898 - * than one entry in the list
20900 - list_for_each_entry_continue(parse, &memtype_list, nd) {
20901 - if (end <= parse->start) {
20905 - if (actual_type != parse->type) {
20907 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20908 - current->comm, current->pid,
20910 - cattr_name(actual_type),
20911 - cattr_name(parse->type));
20919 + } else if (start <= entry->start) { /* end > entry->start */
20920 + err = chk_conflict(new, entry, new_type);
20922 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
20923 + entry->start, entry->end);
20924 + where = entry->nd.prev;
20925 + cached_entry = list_entry(where,
20926 + struct memtype, nd);
20929 - pr_debug("Overlap at 0x%Lx-0x%Lx\n",
20930 - saved_ptr->start, saved_ptr->end);
20931 - /* No conflict. Go ahead and add this new entry */
20932 - list_add(&new_entry->nd, saved_ptr->nd.prev);
20933 - new_entry = NULL;
20937 - if (start < parse->end) {
20938 - if (actual_type != parse->type && ret_type) {
20939 - actual_type = parse->type;
20940 - *ret_type = actual_type;
20941 - new_entry->type = actual_type;
20944 - if (actual_type != parse->type) {
20946 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20947 - current->comm, current->pid,
20949 - cattr_name(actual_type),
20950 - cattr_name(parse->type));
20955 - saved_ptr = parse;
20957 - * Check to see whether the request overlaps more
20958 - * than one entry in the list
20960 - list_for_each_entry_continue(parse, &memtype_list, nd) {
20961 - if (end <= parse->start) {
20965 - if (actual_type != parse->type) {
20967 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
20968 - current->comm, current->pid,
20970 - cattr_name(actual_type),
20971 - cattr_name(parse->type));
20974 + } else if (start < entry->end) { /* start > entry->start */
20975 + err = chk_conflict(new, entry, new_type);
20977 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
20978 + entry->start, entry->end);
20979 + cached_entry = list_entry(entry->nd.prev,
20980 + struct memtype, nd);
20983 + * Move to right position in the linked
20984 + * list to add this new entry
20986 + list_for_each_entry_continue(entry,
20987 + &memtype_list, nd) {
20988 + if (start <= entry->start) {
20989 + where = entry->nd.prev;
20999 - pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
21000 - saved_ptr->start, saved_ptr->end);
21001 - /* No conflict. Go ahead and add this new entry */
21002 - list_add(&new_entry->nd, &saved_ptr->nd);
21003 - new_entry = NULL;
21010 - "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
21011 - start, end, cattr_name(new_entry->type),
21012 - cattr_name(req_type));
21013 - kfree(new_entry);
21014 + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
21015 + "track %s, req %s\n",
21016 + start, end, cattr_name(new->type), cattr_name(req_type));
21018 spin_unlock(&memtype_lock);
21023 - /* No conflict. Not yet added to the list. Add to the tail */
21024 - list_add_tail(&new_entry->nd, &memtype_list);
21025 - pr_debug("New Entry\n");
21027 + cached_start = start;
21031 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21032 - start, end, cattr_name(actual_type),
21033 - cattr_name(req_type), cattr_name(*ret_type));
21036 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
21037 - start, end, cattr_name(actual_type),
21038 - cattr_name(req_type));
21041 + list_add(&new->nd, where);
21043 + list_add_tail(&new->nd, &memtype_list);
21045 spin_unlock(&memtype_lock);
21047 + dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21048 + start, end, cattr_name(new->type), cattr_name(req_type),
21049 + new_type ? cattr_name(*new_type) : "-");
21054 int free_memtype(u64 start, u64 end)
21056 - struct memtype *ml;
21057 + struct memtype *entry;
21060 - /* Only track when pat_wc_enabled */
21061 - if (!pat_wc_enabled) {
21062 + if (!pat_enabled)
21066 /* Low ISA region is always mapped WB. No need to track */
21067 - if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
21068 + if (is_ISA_range(start, end - 1))
21072 spin_lock(&memtype_lock);
21073 - list_for_each_entry(ml, &memtype_list, nd) {
21074 - if (ml->start == start && ml->end == end) {
21075 - list_del(&ml->nd);
21077 + list_for_each_entry(entry, &memtype_list, nd) {
21078 + if (entry->start == start && entry->end == end) {
21079 + if (cached_entry == entry || cached_start == start)
21080 + cached_entry = NULL;
21082 + list_del(&entry->nd);
21087 @@ -464,27 +397,19 @@ int free_memtype(u64 start, u64 end)
21088 current->comm, current->pid, start, end);
21091 - pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21092 + dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21098 - * /dev/mem mmap interface. The memtype used for mapping varies:
21099 - * - Use UC for mappings with O_SYNC flag
21100 - * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
21101 - * inherit the memtype from existing mapping.
21102 - * - Else use UC_MINUS memtype (for backward compatibility with existing
21105 pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
21106 unsigned long size, pgprot_t vma_prot)
21111 -#ifdef CONFIG_NONPROMISC_DEVMEM
21112 -/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
21113 +#ifdef CONFIG_STRICT_DEVMEM
21114 +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
21115 static inline int range_is_allowed(unsigned long mfn, unsigned long size)
21118 @@ -508,20 +433,20 @@ static inline int range_is_allowed(unsig
21122 -#endif /* CONFIG_NONPROMISC_DEVMEM */
21123 +#endif /* CONFIG_STRICT_DEVMEM */
21125 int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
21126 unsigned long size, pgprot_t *vma_prot)
21128 u64 addr = (u64)mfn << PAGE_SHIFT;
21129 - unsigned long flags = _PAGE_CACHE_UC_MINUS;
21130 + unsigned long flags = -1;
21133 if (!range_is_allowed(mfn, size))
21136 if (file->f_flags & O_SYNC) {
21137 - flags = _PAGE_CACHE_UC;
21138 + flags = _PAGE_CACHE_UC_MINUS;
21141 #ifndef CONFIG_X86_32
21142 @@ -534,25 +459,26 @@ int phys_mem_access_prot_allowed(struct
21143 * caching for the high addresses through the KEN pin, but
21144 * we maintain the tradition of paranoia in this code.
21146 - if (!pat_wc_enabled &&
21147 - ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
21148 - test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
21149 - test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
21150 - test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
21151 - (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21152 + if (!pat_enabled &&
21153 + !(boot_cpu_has(X86_FEATURE_MTRR) ||
21154 + boot_cpu_has(X86_FEATURE_K6_MTRR) ||
21155 + boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
21156 + boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
21157 + (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21158 flags = _PAGE_CACHE_UC;
21164 - * With O_SYNC, we can only take UC mapping. Fail if we cannot.
21165 + * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
21167 * Without O_SYNC, we want to get
21168 * - WB for WB-able memory and no other conflicting mappings
21169 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
21170 * - Inherit from confliting mappings otherwise
21172 - if (flags != _PAGE_CACHE_UC_MINUS) {
21173 + if (flags != -1) {
21174 retval = reserve_memtype(addr, addr + size, flags, NULL);
21176 retval = reserve_memtype(addr, addr + size, -1, &flags);
21177 @@ -600,3 +526,88 @@ void unmap_devmem(unsigned long mfn, uns
21178 free_memtype(addr, addr + size);
21181 +#if defined(CONFIG_DEBUG_FS)
21183 +/* get Nth element of the linked list */
21184 +static struct memtype *memtype_get_idx(loff_t pos)
21186 + struct memtype *list_node, *print_entry;
21189 + print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21190 + if (!print_entry)
21193 + spin_lock(&memtype_lock);
21194 + list_for_each_entry(list_node, &memtype_list, nd) {
21196 + *print_entry = *list_node;
21197 + spin_unlock(&memtype_lock);
21198 + return print_entry;
21202 + spin_unlock(&memtype_lock);
21203 + kfree(print_entry);
21207 +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
21211 + seq_printf(seq, "PAT memtype list:\n");
21214 + return memtype_get_idx(*pos);
21217 +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
21220 + return memtype_get_idx(*pos);
21223 +static void memtype_seq_stop(struct seq_file *seq, void *v)
21227 +static int memtype_seq_show(struct seq_file *seq, void *v)
21229 + struct memtype *print_entry = (struct memtype *)v;
21231 + seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
21232 + print_entry->start, print_entry->end);
21233 + kfree(print_entry);
21237 +static struct seq_operations memtype_seq_ops = {
21238 + .start = memtype_seq_start,
21239 + .next = memtype_seq_next,
21240 + .stop = memtype_seq_stop,
21241 + .show = memtype_seq_show,
21244 +static int memtype_seq_open(struct inode *inode, struct file *file)
21246 + return seq_open(file, &memtype_seq_ops);
21249 +static const struct file_operations memtype_fops = {
21250 + .open = memtype_seq_open,
21251 + .read = seq_read,
21252 + .llseek = seq_lseek,
21253 + .release = seq_release,
21256 +static int __init pat_memtype_list_init(void)
21258 + debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
21259 + NULL, &memtype_fops);
21263 +late_initcall(pat_memtype_list_init);
21265 +#endif /* CONFIG_DEBUG_FS */
21266 Index: head-2008-12-01/arch/x86/mm/pgtable-xen.c
21267 ===================================================================
21268 --- head-2008-12-01.orig/arch/x86/mm/pgtable-xen.c 2008-12-01 11:46:22.000000000 +0100
21269 +++ head-2008-12-01/arch/x86/mm/pgtable-xen.c 2008-12-01 11:49:07.000000000 +0100
21271 #include <asm/pgalloc.h>
21272 #include <asm/pgtable.h>
21273 #include <asm/tlb.h>
21274 +#include <asm/fixmap.h>
21275 #include <asm/hypervisor.h>
21276 #include <asm/mmu_context.h>
21278 @@ -410,15 +411,9 @@ static inline void pgd_list_del(pgd_t *p
21279 static void pgd_ctor(void *p)
21282 - unsigned long flags;
21284 pgd_test_and_unpin(pgd);
21286 - /* Clear usermode parts of PGD */
21287 - memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
21289 - spin_lock_irqsave(&pgd_lock, flags);
21291 /* If the pgd points to a shared pagetable level (either the
21292 ptes in non-PAE, or shared PMD in PAE), then just copy the
21293 references from swapper_pg_dir. */
21294 @@ -440,13 +435,9 @@ static void pgd_ctor(void *p)
21295 __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
21298 -#ifndef CONFIG_X86_PAE
21299 /* list required to sync kernel mapping updates */
21300 if (!SHARED_KERNEL_PMD)
21304 - spin_unlock_irqrestore(&pgd_lock, flags);
21307 static void pgd_dtor(void *pgd)
21308 @@ -475,33 +466,6 @@ static void pgd_dtor(void *pgd)
21310 #ifdef CONFIG_X86_PAE
21312 - * Mop up any pmd pages which may still be attached to the pgd.
21313 - * Normally they will be freed by munmap/exit_mmap, but any pmd we
21314 - * preallocate which never got a corresponding vma will need to be
21315 - * freed manually.
21317 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21321 - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
21322 - pgd_t pgd = pgdp[i];
21324 - if (__pgd_val(pgd) != 0) {
21325 - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21327 - pgdp[i] = xen_make_pgd(0);
21329 - paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21330 - pmd_free(mm, pmd);
21334 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21335 - xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21339 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
21340 * updating the top-level pagetable entries to guarantee the
21341 * processor notices the update. Since this is expensive, and
21342 @@ -512,61 +476,7 @@ static void pgd_mop_up_pmds(struct mm_st
21343 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
21344 * and initialize the kernel pmds here.
21346 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21349 - pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
21350 - unsigned long addr, flags;
21354 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
21355 - * allocation). We therefore store virtual addresses of pmds as they
21356 - * do not change across save/restore, and poke the machine addresses
21357 - * into the pgdir under the pgd_lock.
21359 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
21360 - pmds[i] = pmd_alloc_one(mm, addr);
21365 - spin_lock_irqsave(&pgd_lock, flags);
21367 - /* Protect against save/restore: move below 4GB under pgd_lock. */
21368 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21369 - && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21370 - spin_unlock_irqrestore(&pgd_lock, flags);
21373 - pmd_free(mm, pmds[i]);
21377 - /* Copy kernel pmd contents and write-protect the new pmds. */
21378 - pud = pud_offset(pgd, 0);
21379 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
21380 - i++, pud++, addr += PUD_SIZE) {
21381 - if (i >= KERNEL_PGD_BOUNDARY) {
21383 - (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21384 - sizeof(pmd_t) * PTRS_PER_PMD);
21385 - make_lowmem_page_readonly(
21386 - pmds[i], XENFEAT_writable_page_tables);
21389 - /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21390 - pud_populate(mm, pud, pmds[i]);
21393 - /* List required to sync kernel mapping updates and
21394 - * to pin/unpin on save/restore. */
21395 - pgd_list_add(pgd);
21397 - spin_unlock_irqrestore(&pgd_lock, flags);
21401 +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
21403 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
21405 @@ -596,16 +506,97 @@ void pud_populate(struct mm_struct *mm,
21408 #else /* !CONFIG_X86_PAE */
21410 /* No need to prepopulate any pagetable entries in non-PAE modes. */
21411 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21412 +#define PREALLOCATED_PMDS 0
21414 +#endif /* CONFIG_X86_PAE */
21416 +static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
21422 + xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
21424 + for(i = 0; i < PREALLOCATED_PMDS; i++)
21426 + pmd_free(mm, pmds[i]);
21429 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
21430 +static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
21433 + bool failed = false;
21435 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21436 + pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
21443 + free_pmds(pmds, mm, false);
21451 + * Mop up any pmd pages which may still be attached to the pgd.
21452 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
21453 + * preallocate which never got a corresponding vma will need to be
21454 + * freed manually.
21456 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21460 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21461 + pgd_t pgd = pgdp[i];
21463 + if (__pgd_val(pgd) != 0) {
21464 + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21466 + pgdp[i] = xen_make_pgd(0);
21468 + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21469 + pmd_free(mm, pmd);
21473 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21474 + xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21477 +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
21480 + unsigned long addr;
21483 + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
21486 + pud = pud_offset(pgd, 0);
21487 + for (addr = i = 0; i < PREALLOCATED_PMDS;
21488 + i++, pud++, addr += PUD_SIZE) {
21489 + pmd_t *pmd = pmds[i];
21491 + if (i >= KERNEL_PGD_BOUNDARY) {
21493 + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21494 + sizeof(pmd_t) * PTRS_PER_PMD);
21495 + make_lowmem_page_readonly(
21496 + pmd, XENFEAT_writable_page_tables);
21499 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21500 + pud_populate(mm, pud, pmd);
21503 -#endif /* CONFIG_X86_PAE */
21505 #ifdef CONFIG_X86_64
21506 /* We allocate two contiguous pages for kernel and user. */
21507 @@ -616,19 +607,52 @@ static void pgd_mop_up_pmds(struct mm_st
21509 pgd_t *pgd_alloc(struct mm_struct *mm)
21511 - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21513 + pmd_t *pmds[PREALLOCATED_PMDS];
21514 + unsigned long flags;
21516 + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21521 - /* so that alloc_pd can use it */
21526 - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
21527 - free_pages((unsigned long)pgd, PGD_ORDER);
21529 + if (preallocate_pmds(pmds, mm) != 0)
21530 + goto out_free_pgd;
21532 + if (paravirt_pgd_alloc(mm) != 0)
21533 + goto out_free_pmds;
21536 + * Make sure that pre-populating the pmds is atomic with
21537 + * respect to anything walking the pgd_list, so that they
21538 + * never see a partially populated pgd.
21540 + spin_lock_irqsave(&pgd_lock, flags);
21542 +#ifdef CONFIG_X86_PAE
21543 + /* Protect against save/restore: move below 4GB under pgd_lock. */
21544 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21545 + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21546 + spin_unlock_irqrestore(&pgd_lock, flags);
21547 + goto out_free_pmds;
21552 + pgd_prepopulate_pmd(mm, pgd, pmds);
21554 + spin_unlock_irqrestore(&pgd_lock, flags);
21559 + free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
21561 + free_pages((unsigned long)pgd, PGD_ORDER);
21566 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
21567 @@ -644,6 +668,7 @@ void pgd_free(struct mm_struct *mm, pgd_
21570 pgd_mop_up_pmds(mm, pgd);
21571 + paravirt_pgd_free(mm, pgd);
21572 free_pages((unsigned long)pgd, PGD_ORDER);
21575 @@ -685,7 +710,7 @@ int ptep_test_and_clear_young(struct vm_
21577 if (pte_young(*ptep))
21578 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
21580 + (unsigned long *) &ptep->pte);
21583 pte_update(vma->vm_mm, addr, ptep);
21584 @@ -707,3 +732,42 @@ int ptep_clear_flush_young(struct vm_are
21591 +void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21593 + unsigned long address = __fix_to_virt(idx);
21596 + if (idx >= __end_of_fixed_addresses) {
21602 +#ifdef CONFIG_X86_64
21603 + extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
21605 + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
21606 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21607 + set_pte_vaddr_pud(level3_user_pgt, address, pte);
21609 + case FIX_EARLYCON_MEM_BASE:
21610 + xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
21611 + pfn_pte_ma(phys >> PAGE_SHIFT, flags));
21615 + case FIX_WP_TEST:
21617 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21621 + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21624 + set_pte_vaddr(address, pte);
21627 Index: head-2008-12-01/arch/x86/mm/pgtable_32-xen.c
21628 ===================================================================
21629 --- head-2008-12-01.orig/arch/x86/mm/pgtable_32-xen.c 2008-12-01 11:44:55.000000000 +0100
21630 +++ head-2008-12-01/arch/x86/mm/pgtable_32-xen.c 2008-12-01 11:49:07.000000000 +0100
21631 @@ -25,51 +25,49 @@
21632 #include <xen/features.h>
21633 #include <asm/hypervisor.h>
21635 -void show_mem(void)
21637 + * Associate a virtual page frame with a given physical page frame
21638 + * and protection flags for that frame.
21640 +void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
21642 - int total = 0, reserved = 0;
21643 - int shared = 0, cached = 0;
21645 - struct page *page;
21646 - pg_data_t *pgdat;
21648 - unsigned long flags;
21650 - printk(KERN_INFO "Mem-info:\n");
21651 - show_free_areas();
21652 - for_each_online_pgdat(pgdat) {
21653 - pgdat_resize_lock(pgdat, &flags);
21654 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
21655 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
21656 - touch_nmi_watchdog();
21657 - page = pgdat_page_nr(pgdat, i);
21659 - if (PageHighMem(page))
21661 - if (PageReserved(page))
21663 - else if (PageSwapCache(page))
21665 - else if (page_count(page))
21666 - shared += page_count(page) - 1;
21668 - pgdat_resize_unlock(pgdat, &flags);
21670 - printk(KERN_INFO "%d pages of RAM\n", total);
21671 - printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
21672 - printk(KERN_INFO "%d reserved pages\n", reserved);
21673 - printk(KERN_INFO "%d pages shared\n", shared);
21674 - printk(KERN_INFO "%d pages swap cached\n", cached);
21676 - printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
21677 - printk(KERN_INFO "%lu pages writeback\n",
21678 - global_page_state(NR_WRITEBACK));
21679 - printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
21680 - printk(KERN_INFO "%lu pages slab\n",
21681 - global_page_state(NR_SLAB_RECLAIMABLE) +
21682 - global_page_state(NR_SLAB_UNRECLAIMABLE));
21683 - printk(KERN_INFO "%lu pages pagetables\n",
21684 - global_page_state(NR_PAGETABLE));
21685 +#ifndef CONFIG_XEN
21691 + pgd = swapper_pg_dir + pgd_index(vaddr);
21692 + if (pgd_none(*pgd)) {
21696 + pud = pud_offset(pgd, vaddr);
21697 + if (pud_none(*pud)) {
21701 + pmd = pmd_offset(pud, vaddr);
21702 + if (pmd_none(*pmd)) {
21706 + pte = pte_offset_kernel(pmd, vaddr);
21707 + if (pte_val(pteval))
21708 + set_pte_present(&init_mm, vaddr, pte, pteval);
21710 + pte_clear(&init_mm, vaddr, pte);
21713 + * It's enough to flush this one mapping.
21714 + * (PGE mappings get flushed as well)
21716 + __flush_tlb_one(vaddr);
21718 + if (HYPERVISOR_update_va_mapping(vaddr, pteval,
21719 + UVMF_INVLPG|UVMF_ALL))
21725 @@ -107,35 +105,10 @@ void set_pmd_pfn(unsigned long vaddr, un
21726 __flush_tlb_one(vaddr);
21729 -static int fixmaps;
21730 unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
21731 unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
21732 EXPORT_SYMBOL(__FIXADDR_TOP);
21734 -void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21736 - unsigned long address = __fix_to_virt(idx);
21739 - if (idx >= __end_of_fixed_addresses) {
21744 - case FIX_WP_TEST:
21746 - pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21749 - pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21752 - if (HYPERVISOR_update_va_mapping(address, pte,
21753 - UVMF_INVLPG|UVMF_ALL))
21759 * reserve_top_address - reserves a hole in the top of kernel address space
21760 * @reserve - size of hole to reserve
21761 @@ -145,13 +118,48 @@ void __set_fixmap (enum fixed_addresses
21763 void __init reserve_top_address(unsigned long reserve)
21765 - BUG_ON(fixmaps > 0);
21766 + BUG_ON(fixmaps_set > 0);
21767 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
21769 __FIXADDR_TOP = -reserve - PAGE_SIZE;
21770 __VMALLOC_RESERVE += reserve;
21774 + * vmalloc=size forces the vmalloc area to be exactly 'size'
21775 + * bytes. This can be used to increase (or decrease) the
21776 + * vmalloc area - the default is 128m.
21778 +static int __init parse_vmalloc(char *arg)
21783 + __VMALLOC_RESERVE = memparse(arg, &arg);
21786 +early_param("vmalloc", parse_vmalloc);
21788 +#ifndef CONFIG_XEN
21790 + * reservetop=size reserves a hole at the top of the kernel address space which
21791 + * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
21792 + * so relocating the fixmap can be done before paging initialization.
21794 +static int __init parse_reservetop(char *arg)
21796 + unsigned long address;
21801 + address = memparse(arg, &arg);
21802 + reserve_top_address(address);
21805 +early_param("reservetop", parse_reservetop);
21808 void make_lowmem_page_readonly(void *va, unsigned int feature)
21811 Index: head-2008-12-01/arch/x86/pci/amd_bus.c
21812 ===================================================================
21813 --- head-2008-12-01.orig/arch/x86/pci/amd_bus.c 2008-12-03 15:48:43.000000000 +0100
21814 +++ head-2008-12-01/arch/x86/pci/amd_bus.c 2008-12-01 11:49:07.000000000 +0100
21815 @@ -607,6 +607,14 @@ static int __init pci_io_ecs_init(void)
21816 for_each_online_cpu(cpu)
21817 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
21818 (void *)(long)cpu);
21822 + rdmsrl(MSR_AMD64_NB_CFG, reg);
21823 + if (!(reg & ENABLE_CF8_EXT_CFG))
21827 pci_probe |= PCI_HAS_IO_ECS;
21830 @@ -614,6 +622,10 @@ static int __init pci_io_ecs_init(void)
21832 static int __init amd_postcore_init(void)
21835 + if (!is_initial_xendomain())
21838 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
21841 Index: head-2008-12-01/arch/x86/pci/irq-xen.c
21842 ===================================================================
21843 --- head-2008-12-01.orig/arch/x86/pci/irq-xen.c 2008-12-01 11:44:55.000000000 +0100
21844 +++ head-2008-12-01/arch/x86/pci/irq-xen.c 2008-12-01 11:49:07.000000000 +0100
21846 #include <linux/slab.h>
21847 #include <linux/interrupt.h>
21848 #include <linux/dmi.h>
21849 -#include <asm/io.h>
21850 -#include <asm/smp.h>
21851 +#include <linux/io.h>
21852 +#include <linux/smp.h>
21853 #include <asm/io_apic.h>
21854 #include <linux/irq.h>
21855 #include <linux/acpi.h>
21856 @@ -45,7 +45,8 @@ struct irq_router {
21858 u16 vendor, device;
21859 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
21860 - int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
21861 + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
21865 struct irq_router_handler {
21866 @@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_d
21867 * and perform checksum verification.
21870 -static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
21871 +static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
21873 struct irq_routing_table *rt;
21875 @@ -74,10 +75,11 @@ static inline struct irq_routing_table *
21876 rt->size < sizeof(struct irq_routing_table))
21879 - for (i=0; i < rt->size; i++)
21880 + for (i = 0; i < rt->size; i++)
21883 - DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
21884 + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
21889 @@ -104,7 +106,9 @@ static struct irq_routing_table * __init
21891 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
21893 - for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
21894 + for (addr = (u8 *) isa_bus_to_virt(0xf0000);
21895 + addr < (u8 *) isa_bus_to_virt(0x100000);
21897 rt = pirq_check_routing_table(addr);
21900 @@ -126,20 +130,20 @@ static void __init pirq_peer_trick(void)
21901 struct irq_info *e;
21903 memset(busmap, 0, sizeof(busmap));
21904 - for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
21905 + for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
21910 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
21911 - for(j=0; j<4; j++)
21912 + for (j = 0; j < 4; j++)
21913 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
21917 busmap[e->bus] = 1;
21919 - for(i = 1; i < 256; i++) {
21920 + for (i = 1; i < 256; i++) {
21922 if (!busmap[i] || pci_find_bus(0, i))
21924 @@ -187,7 +191,8 @@ static unsigned int read_config_nybble(s
21925 return (nr & 1) ? (x >> 4) : (x & 0xf);
21928 -static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
21929 +static void write_config_nybble(struct pci_dev *router, unsigned offset,
21930 + unsigned nr, unsigned int val)
21933 unsigned reg = offset + (nr >> 1);
21934 @@ -289,7 +294,7 @@ static int pirq_ite_get(struct pci_dev *
21935 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
21937 WARN_ON_ONCE(pirq > 4);
21938 - return read_config_nybble(router,0x43, pirqmap[pirq-1]);
21939 + return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
21942 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21943 @@ -318,7 +323,7 @@ static int pirq_opti_set(struct pci_dev
21946 * Cyrix: nibble offset 0x5C
21947 - * 0x5C bits 7:4 is INTB bits 3:0 is INTA
21948 + * 0x5C bits 7:4 is INTB bits 3:0 is INTA
21949 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
21951 static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
21952 @@ -354,7 +359,7 @@ static int pirq_cyrix_set(struct pci_dev
21953 * Apparently there are systems implementing PCI routing table using
21954 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
21955 * We try our best to handle both link mappings.
21958 * Currently (2003-05-21) it appears most SiS chipsets follow the
21959 * definition of routing registers from the SiS-5595 southbridge.
21960 * According to the SiS 5595 datasheets the revision id's of the
21961 @@ -374,7 +379,7 @@ static int pirq_cyrix_set(struct pci_dev
21964 * bit 6 OHCI function disabled (0), enabled (1)
21967 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
21969 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
21970 @@ -437,7 +442,7 @@ static int pirq_vlsi_get(struct pci_dev
21972 WARN_ON_ONCE(pirq >= 9);
21974 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
21975 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
21978 return read_config_nybble(router, 0x74, pirq-1);
21979 @@ -447,7 +452,7 @@ static int pirq_vlsi_set(struct pci_dev
21981 WARN_ON_ONCE(pirq >= 9);
21983 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
21984 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
21987 write_config_nybble(router, 0x74, pirq-1, irq);
21988 @@ -471,7 +476,8 @@ static int pirq_serverworks_get(struct p
21989 return inb(0xc01) & 0xf;
21992 -static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21993 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
21994 + int pirq, int irq)
21998 @@ -491,22 +497,20 @@ static int pirq_amd756_get(struct pci_de
22003 irq = read_config_nybble(router, 0x56, pirq - 1);
22005 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
22006 - dev->vendor, dev->device, pirq, irq);
22007 + dev_info(&dev->dev,
22008 + "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
22009 + dev->vendor, dev->device, pirq, irq);
22013 static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22015 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
22016 - dev->vendor, dev->device, pirq, irq);
22017 + dev_info(&dev->dev,
22018 + "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
22019 + dev->vendor, dev->device, pirq, irq);
22022 write_config_nybble(router, 0x56, pirq - 1, irq);
22027 @@ -553,50 +557,51 @@ static __init int intel_router_probe(str
22028 if (pci_dev_present(pirq_440gx))
22033 - case PCI_DEVICE_ID_INTEL_82371FB_0:
22034 - case PCI_DEVICE_ID_INTEL_82371SB_0:
22035 - case PCI_DEVICE_ID_INTEL_82371AB_0:
22036 - case PCI_DEVICE_ID_INTEL_82371MX:
22037 - case PCI_DEVICE_ID_INTEL_82443MX_0:
22038 - case PCI_DEVICE_ID_INTEL_82801AA_0:
22039 - case PCI_DEVICE_ID_INTEL_82801AB_0:
22040 - case PCI_DEVICE_ID_INTEL_82801BA_0:
22041 - case PCI_DEVICE_ID_INTEL_82801BA_10:
22042 - case PCI_DEVICE_ID_INTEL_82801CA_0:
22043 - case PCI_DEVICE_ID_INTEL_82801CA_12:
22044 - case PCI_DEVICE_ID_INTEL_82801DB_0:
22045 - case PCI_DEVICE_ID_INTEL_82801E_0:
22046 - case PCI_DEVICE_ID_INTEL_82801EB_0:
22047 - case PCI_DEVICE_ID_INTEL_ESB_1:
22048 - case PCI_DEVICE_ID_INTEL_ICH6_0:
22049 - case PCI_DEVICE_ID_INTEL_ICH6_1:
22050 - case PCI_DEVICE_ID_INTEL_ICH7_0:
22051 - case PCI_DEVICE_ID_INTEL_ICH7_1:
22052 - case PCI_DEVICE_ID_INTEL_ICH7_30:
22053 - case PCI_DEVICE_ID_INTEL_ICH7_31:
22054 - case PCI_DEVICE_ID_INTEL_ESB2_0:
22055 - case PCI_DEVICE_ID_INTEL_ICH8_0:
22056 - case PCI_DEVICE_ID_INTEL_ICH8_1:
22057 - case PCI_DEVICE_ID_INTEL_ICH8_2:
22058 - case PCI_DEVICE_ID_INTEL_ICH8_3:
22059 - case PCI_DEVICE_ID_INTEL_ICH8_4:
22060 - case PCI_DEVICE_ID_INTEL_ICH9_0:
22061 - case PCI_DEVICE_ID_INTEL_ICH9_1:
22062 - case PCI_DEVICE_ID_INTEL_ICH9_2:
22063 - case PCI_DEVICE_ID_INTEL_ICH9_3:
22064 - case PCI_DEVICE_ID_INTEL_ICH9_4:
22065 - case PCI_DEVICE_ID_INTEL_ICH9_5:
22066 - case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22067 - case PCI_DEVICE_ID_INTEL_ICH10_0:
22068 - case PCI_DEVICE_ID_INTEL_ICH10_1:
22069 - case PCI_DEVICE_ID_INTEL_ICH10_2:
22070 - case PCI_DEVICE_ID_INTEL_ICH10_3:
22071 - r->name = "PIIX/ICH";
22072 - r->get = pirq_piix_get;
22073 - r->set = pirq_piix_set;
22075 + switch (device) {
22076 + case PCI_DEVICE_ID_INTEL_82371FB_0:
22077 + case PCI_DEVICE_ID_INTEL_82371SB_0:
22078 + case PCI_DEVICE_ID_INTEL_82371AB_0:
22079 + case PCI_DEVICE_ID_INTEL_82371MX:
22080 + case PCI_DEVICE_ID_INTEL_82443MX_0:
22081 + case PCI_DEVICE_ID_INTEL_82801AA_0:
22082 + case PCI_DEVICE_ID_INTEL_82801AB_0:
22083 + case PCI_DEVICE_ID_INTEL_82801BA_0:
22084 + case PCI_DEVICE_ID_INTEL_82801BA_10:
22085 + case PCI_DEVICE_ID_INTEL_82801CA_0:
22086 + case PCI_DEVICE_ID_INTEL_82801CA_12:
22087 + case PCI_DEVICE_ID_INTEL_82801DB_0:
22088 + case PCI_DEVICE_ID_INTEL_82801E_0:
22089 + case PCI_DEVICE_ID_INTEL_82801EB_0:
22090 + case PCI_DEVICE_ID_INTEL_ESB_1:
22091 + case PCI_DEVICE_ID_INTEL_ICH6_0:
22092 + case PCI_DEVICE_ID_INTEL_ICH6_1:
22093 + case PCI_DEVICE_ID_INTEL_ICH7_0:
22094 + case PCI_DEVICE_ID_INTEL_ICH7_1:
22095 + case PCI_DEVICE_ID_INTEL_ICH7_30:
22096 + case PCI_DEVICE_ID_INTEL_ICH7_31:
22097 + case PCI_DEVICE_ID_INTEL_ESB2_0:
22098 + case PCI_DEVICE_ID_INTEL_ICH8_0:
22099 + case PCI_DEVICE_ID_INTEL_ICH8_1:
22100 + case PCI_DEVICE_ID_INTEL_ICH8_2:
22101 + case PCI_DEVICE_ID_INTEL_ICH8_3:
22102 + case PCI_DEVICE_ID_INTEL_ICH8_4:
22103 + case PCI_DEVICE_ID_INTEL_ICH9_0:
22104 + case PCI_DEVICE_ID_INTEL_ICH9_1:
22105 + case PCI_DEVICE_ID_INTEL_ICH9_2:
22106 + case PCI_DEVICE_ID_INTEL_ICH9_3:
22107 + case PCI_DEVICE_ID_INTEL_ICH9_4:
22108 + case PCI_DEVICE_ID_INTEL_ICH9_5:
22109 + case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22110 + case PCI_DEVICE_ID_INTEL_ICH10_0:
22111 + case PCI_DEVICE_ID_INTEL_ICH10_1:
22112 + case PCI_DEVICE_ID_INTEL_ICH10_2:
22113 + case PCI_DEVICE_ID_INTEL_ICH10_3:
22114 + case PCI_DEVICE_ID_INTEL_PCH_0:
22115 + case PCI_DEVICE_ID_INTEL_PCH_1:
22116 + r->name = "PIIX/ICH";
22117 + r->get = pirq_piix_get;
22118 + r->set = pirq_piix_set;
22123 @@ -610,7 +615,7 @@ static __init int via_router_probe(struc
22124 * workarounds for some buggy BIOSes
22126 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
22127 - switch(router->device) {
22128 + switch (router->device) {
22129 case PCI_DEVICE_ID_VIA_82C686:
22131 * Asus k7m bios wrongly reports 82C686A
22132 @@ -635,7 +640,7 @@ static __init int via_router_probe(struc
22137 + switch (device) {
22138 case PCI_DEVICE_ID_VIA_82C586_0:
22140 r->get = pirq_via586_get;
22141 @@ -658,28 +663,27 @@ static __init int via_router_probe(struc
22143 static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22147 - case PCI_DEVICE_ID_VLSI_82C534:
22148 - r->name = "VLSI 82C534";
22149 - r->get = pirq_vlsi_get;
22150 - r->set = pirq_vlsi_set;
22152 + switch (device) {
22153 + case PCI_DEVICE_ID_VLSI_82C534:
22154 + r->name = "VLSI 82C534";
22155 + r->get = pirq_vlsi_get;
22156 + r->set = pirq_vlsi_set;
22163 -static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22164 +static __init int serverworks_router_probe(struct irq_router *r,
22165 + struct pci_dev *router, u16 device)
22169 - case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22170 - case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22171 - r->name = "ServerWorks";
22172 - r->get = pirq_serverworks_get;
22173 - r->set = pirq_serverworks_set;
22175 + switch (device) {
22176 + case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22177 + case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22178 + r->name = "ServerWorks";
22179 + r->get = pirq_serverworks_get;
22180 + r->set = pirq_serverworks_set;
22185 @@ -688,7 +692,7 @@ static __init int sis_router_probe(struc
22187 if (device != PCI_DEVICE_ID_SI_503)
22192 r->get = pirq_sis_get;
22193 r->set = pirq_sis_set;
22194 @@ -697,50 +701,45 @@ static __init int sis_router_probe(struc
22196 static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22200 - case PCI_DEVICE_ID_CYRIX_5520:
22201 - r->name = "NatSemi";
22202 - r->get = pirq_cyrix_get;
22203 - r->set = pirq_cyrix_set;
22205 + switch (device) {
22206 + case PCI_DEVICE_ID_CYRIX_5520:
22207 + r->name = "NatSemi";
22208 + r->get = pirq_cyrix_get;
22209 + r->set = pirq_cyrix_set;
22215 static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22219 - case PCI_DEVICE_ID_OPTI_82C700:
22220 - r->name = "OPTI";
22221 - r->get = pirq_opti_get;
22222 - r->set = pirq_opti_set;
22224 + switch (device) {
22225 + case PCI_DEVICE_ID_OPTI_82C700:
22226 + r->name = "OPTI";
22227 + r->get = pirq_opti_get;
22228 + r->set = pirq_opti_set;
22234 static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22238 - case PCI_DEVICE_ID_ITE_IT8330G_0:
22240 - r->get = pirq_ite_get;
22241 - r->set = pirq_ite_set;
22243 + switch (device) {
22244 + case PCI_DEVICE_ID_ITE_IT8330G_0:
22246 + r->get = pirq_ite_get;
22247 + r->set = pirq_ite_set;
22253 static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22257 + switch (device) {
22258 case PCI_DEVICE_ID_AL_M1533:
22259 case PCI_DEVICE_ID_AL_M1563:
22260 - printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
22262 r->get = pirq_ali_get;
22263 r->set = pirq_ali_set;
22264 @@ -751,25 +750,24 @@ static __init int ali_router_probe(struc
22266 static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22270 - case PCI_DEVICE_ID_AMD_VIPER_740B:
22271 - r->name = "AMD756";
22273 - case PCI_DEVICE_ID_AMD_VIPER_7413:
22274 - r->name = "AMD766";
22276 - case PCI_DEVICE_ID_AMD_VIPER_7443:
22277 - r->name = "AMD768";
22281 + switch (device) {
22282 + case PCI_DEVICE_ID_AMD_VIPER_740B:
22283 + r->name = "AMD756";
22285 + case PCI_DEVICE_ID_AMD_VIPER_7413:
22286 + r->name = "AMD766";
22288 + case PCI_DEVICE_ID_AMD_VIPER_7443:
22289 + r->name = "AMD768";
22294 r->get = pirq_amd756_get;
22295 r->set = pirq_amd756_set;
22300 static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22303 @@ -811,7 +809,7 @@ static struct pci_dev *pirq_router_dev;
22304 * FIXME: should we have an option to say "generic for
22309 static void __init pirq_find_router(struct irq_router *r)
22311 struct irq_routing_table *rt = pirq_table;
22312 @@ -830,7 +828,7 @@ static void __init pirq_find_router(stru
22313 r->name = "default";
22318 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
22319 rt->rtr_vendor, rt->rtr_device);
22321 @@ -841,19 +839,19 @@ static void __init pirq_find_router(stru
22325 - for( h = pirq_routers; h->vendor; h++) {
22326 + for (h = pirq_routers; h->vendor; h++) {
22327 /* First look for a router match */
22328 - if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
22329 + if (rt->rtr_vendor == h->vendor &&
22330 + h->probe(r, pirq_router_dev, rt->rtr_device))
22332 /* Fall back to a device match */
22333 - if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
22334 + if (pirq_router_dev->vendor == h->vendor &&
22335 + h->probe(r, pirq_router_dev, pirq_router_dev->device))
22338 - printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
22339 - pirq_router.name,
22340 - pirq_router_dev->vendor,
22341 - pirq_router_dev->device,
22342 - pci_name(pirq_router_dev));
22343 + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
22344 + pirq_router.name,
22345 + pirq_router_dev->vendor, pirq_router_dev->device);
22347 /* The device remains referenced for the kernel lifetime */
22349 @@ -861,11 +859,13 @@ static void __init pirq_find_router(stru
22350 static struct irq_info *pirq_get_info(struct pci_dev *dev)
22352 struct irq_routing_table *rt = pirq_table;
22353 - int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
22354 + int entries = (rt->size - sizeof(struct irq_routing_table)) /
22355 + sizeof(struct irq_info);
22356 struct irq_info *info;
22358 for (info = rt->slots; entries--; info++)
22359 - if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22360 + if (info->bus == dev->bus->number &&
22361 + PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22365 @@ -884,7 +884,7 @@ static int pcibios_lookup_irq(struct pci
22367 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
22369 - DBG(KERN_DEBUG " -> no interrupt pin\n");
22370 + dev_dbg(&dev->dev, "no interrupt pin\n");
22374 @@ -893,20 +893,21 @@ static int pcibios_lookup_irq(struct pci
22379 - DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
22381 info = pirq_get_info(dev);
22383 - DBG(" -> not found in routing table\n" KERN_DEBUG);
22384 + dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
22388 pirq = info->irq[pin].link;
22389 mask = info->irq[pin].bitmap;
22391 - DBG(" -> not routed\n" KERN_DEBUG);
22392 + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
22395 - DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
22396 + dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
22397 + 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
22398 mask &= pcibios_irq_mask;
22400 /* Work around broken HP Pavilion Notebooks which assign USB to
22401 @@ -919,7 +920,8 @@ static int pcibios_lookup_irq(struct pci
22404 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
22405 - if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
22406 + if (acer_tm360_irqrouting && dev->irq == 11 &&
22407 + dev->vendor == PCI_VENDOR_ID_O2) {
22410 dev->irq = r->get(pirq_router_dev, dev, pirq);
22411 @@ -932,51 +934,50 @@ static int pcibios_lookup_irq(struct pci
22414 if (newirq && !((1 << newirq) & mask)) {
22415 - if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
22416 - else printk("\n" KERN_WARNING
22417 - "PCI: IRQ %i for device %s doesn't match PIRQ mask "
22418 - "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
22420 + if (pci_probe & PCI_USE_PIRQ_MASK)
22423 + dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
22424 + "%#x; try pci=usepirqmask\n", newirq, mask);
22426 if (!newirq && assign) {
22427 for (i = 0; i < 16; i++) {
22428 if (!(mask & (1 << i)))
22430 - if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
22431 + if (pirq_penalty[i] < pirq_penalty[newirq] &&
22432 + can_request_irq(i, IRQF_SHARED))
22436 - DBG(" -> newirq=%d", newirq);
22437 + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
22439 /* Check if it is hardcoded */
22440 if ((pirq & 0xf0) == 0xf0) {
22442 - DBG(" -> hardcoded IRQ %d\n", irq);
22443 - msg = "Hardcoded";
22444 - } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22445 - ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
22446 - DBG(" -> got IRQ %d\n", irq);
22448 + msg = "hardcoded";
22449 + } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22450 + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
22452 eisa_set_level_irq(irq);
22453 - } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22454 - DBG(" -> assigning IRQ %d", newirq);
22455 + } else if (newirq && r->set &&
22456 + (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22457 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
22458 eisa_set_level_irq(newirq);
22459 - DBG(" ... OK\n");
22460 - msg = "Assigned";
22461 + msg = "assigned";
22467 - DBG(" ... failed\n");
22468 if (newirq && mask == (1 << newirq)) {
22474 + dev_dbg(&dev->dev, "can't route interrupt\n");
22478 - printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
22479 + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
22481 /* Update IRQ for all devices with the same pirq value */
22482 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
22483 @@ -988,20 +989,25 @@ static int pcibios_lookup_irq(struct pci
22486 if (info->irq[pin].link == pirq) {
22487 - /* We refuse to override the dev->irq information. Give a warning! */
22488 - if ( dev2->irq && dev2->irq != irq && \
22490 + * We refuse to override the dev->irq
22491 + * information. Give a warning!
22493 + if (dev2->irq && dev2->irq != irq && \
22494 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
22495 - ((1 << dev2->irq) & mask)) ) {
22496 + ((1 << dev2->irq) & mask))) {
22497 #ifndef CONFIG_PCI_MSI
22498 - printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
22499 - pci_name(dev2), dev2->irq, irq);
22500 + dev_info(&dev2->dev, "IRQ routing conflict: "
22501 + "have IRQ %d, want IRQ %d\n",
22509 pirq_penalty[irq]++;
22511 - printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
22512 + dev_info(&dev->dev, "sharing IRQ %d with %s\n",
22513 + irq, pci_name(dev2));
22517 @@ -1015,15 +1021,20 @@ static void __init pcibios_fixup_irqs(vo
22518 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
22519 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
22521 - * If the BIOS has set an out of range IRQ number, just ignore it.
22522 - * Also keep track of which IRQ's are already in use.
22523 + * If the BIOS has set an out of range IRQ number, just
22524 + * ignore it. Also keep track of which IRQ's are
22525 + * already in use.
22527 if (dev->irq >= 16) {
22528 - DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
22529 + dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
22532 - /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
22533 - if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
22535 + * If the IRQ is already assigned to a PCI device,
22536 + * ignore its ISA use penalty
22538 + if (pirq_penalty[dev->irq] >= 100 &&
22539 + pirq_penalty[dev->irq] < 100000)
22540 pirq_penalty[dev->irq] = 0;
22541 pirq_penalty[dev->irq]++;
22543 @@ -1035,13 +1046,17 @@ static void __init pcibios_fixup_irqs(vo
22545 * Recalculate IRQ numbers if we use the I/O APIC.
22547 - if (io_apic_assign_pci_irqs)
22549 + if (io_apic_assign_pci_irqs) {
22553 - pin--; /* interrupt pins are numbered starting from 1 */
22554 - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
22556 + * interrupt pins are numbered starting
22560 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
22561 + PCI_SLOT(dev->devfn), pin);
22563 * Busses behind bridges are typically not listed in the MP-table.
22564 * In this case we have to look up the IRQ based on the parent bus,
22565 @@ -1049,18 +1064,18 @@ static void __init pcibios_fixup_irqs(vo
22566 * busses itself so we should get into this branch reliably.
22568 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22569 - struct pci_dev * bridge = dev->bus->self;
22570 + struct pci_dev *bridge = dev->bus->self;
22572 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22573 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22574 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22575 PCI_SLOT(bridge->devfn), pin);
22577 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22578 - pci_name(bridge), 'A' + pin, irq);
22579 + dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
22580 + pci_name(bridge),
22584 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22585 - pci_name(dev), 'A' + pin, irq);
22586 + dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
22590 @@ -1082,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq
22592 if (!broken_hp_bios_irq9) {
22593 broken_hp_bios_irq9 = 1;
22594 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22595 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22600 @@ -1095,7 +1111,8 @@ static int __init fix_acer_tm360_irqrout
22602 if (!acer_tm360_irqrouting) {
22603 acer_tm360_irqrouting = 1;
22604 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22605 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22610 @@ -1107,7 +1124,8 @@ static struct dmi_system_id __initdata p
22612 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
22613 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
22614 - DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
22615 + DMI_MATCH(DMI_PRODUCT_VERSION,
22616 + "HP Pavilion Notebook Model GE"),
22617 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
22620 @@ -1122,7 +1140,7 @@ static struct dmi_system_id __initdata p
22624 -static int __init pcibios_irq_init(void)
22625 +int __init pcibios_irq_init(void)
22627 DBG(KERN_DEBUG "PCI: IRQ init\n");
22629 @@ -1142,11 +1160,14 @@ static int __init pcibios_irq_init(void)
22630 pirq_find_router(&pirq_router);
22631 if (pirq_table->exclusive_irqs) {
22633 - for (i=0; i<16; i++)
22634 + for (i = 0; i < 16; i++)
22635 if (!(pirq_table->exclusive_irqs & (1 << i)))
22636 pirq_penalty[i] += 100;
22638 - /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
22640 + * If we're using the I/O APIC, avoid using the PCI IRQ
22643 if (io_apic_assign_pci_irqs)
22646 @@ -1157,9 +1178,6 @@ static int __init pcibios_irq_init(void)
22650 -subsys_initcall(pcibios_irq_init);
22653 static void pirq_penalize_isa_irq(int irq, int active)
22656 @@ -1193,7 +1211,7 @@ static int pirq_enable_irq(struct pci_de
22657 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
22660 - pin--; /* interrupt pins are numbered starting from 1 */
22661 + pin--; /* interrupt pins are numbered starting from 1 */
22663 if (io_apic_assign_pci_irqs) {
22665 @@ -1207,35 +1225,41 @@ static int pirq_enable_irq(struct pci_de
22668 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22669 - struct pci_dev * bridge = dev->bus->self;
22670 + struct pci_dev *bridge = dev->bus->self;
22672 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22673 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22674 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22675 PCI_SLOT(bridge->devfn), pin);
22677 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22678 - pci_name(bridge), 'A' + pin, irq);
22679 + dev_warn(&dev->dev, "using bridge %s "
22680 + "INT %c to get IRQ %d\n",
22681 + pci_name(bridge), 'A' + pin,
22687 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22688 - pci_name(dev), 'A' + pin, irq);
22689 + dev_info(&dev->dev, "PCI->APIC IRQ transform: "
22690 + "INT %c -> IRQ %d\n", 'A' + pin, irq);
22694 - msg = " Probably buggy MP table.";
22695 + msg = "; probably buggy MP table";
22696 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
22699 - msg = " Please try using pci=biosirq.";
22700 + msg = "; please try using pci=biosirq";
22702 - /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
22703 - if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
22705 + * With IDE legacy devices the IRQ lookup failure is not
22708 + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
22709 + !(dev->class & 0x5))
22712 - printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
22713 - 'A' + pin, pci_name(dev), msg);
22714 + dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
22719 Index: head-2008-12-01/arch/x86/vdso/Makefile
22720 ===================================================================
22721 --- head-2008-12-01.orig/arch/x86/vdso/Makefile 2008-12-01 11:37:10.000000000 +0100
22722 +++ head-2008-12-01/arch/x86/vdso/Makefile 2008-12-01 11:49:07.000000000 +0100
22723 @@ -65,9 +65,7 @@ obj-$(VDSO32-y) += vdso32-syms.lds
22724 vdso32.so-$(VDSO32-y) += int80
22725 vdso32.so-$(CONFIG_COMPAT) += syscall
22726 vdso32.so-$(VDSO32-y) += sysenter
22727 -xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
22728 -xen-vdso32-$(CONFIG_X86_32) += syscall
22729 -vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
22730 +vdso32.so-$(CONFIG_X86_XEN) += syscall
22732 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
22734 Index: head-2008-12-01/arch/x86/vdso/vdso32.S
22735 ===================================================================
22736 --- head-2008-12-01.orig/arch/x86/vdso/vdso32.S 2008-12-01 11:37:10.000000000 +0100
22737 +++ head-2008-12-01/arch/x86/vdso/vdso32.S 2008-12-01 11:49:07.000000000 +0100
22738 @@ -9,7 +9,7 @@ vdso32_int80_end:
22740 .globl vdso32_syscall_start, vdso32_syscall_end
22741 vdso32_syscall_start:
22742 -#ifdef CONFIG_COMPAT
22743 +#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
22744 .incbin "arch/x86/vdso/vdso32-syscall.so"
22746 vdso32_syscall_end:
22747 @@ -19,16 +19,4 @@ vdso32_sysenter_start:
22748 .incbin "arch/x86/vdso/vdso32-sysenter.so"
22749 vdso32_sysenter_end:
22751 -#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
22752 - .globl vdso32_int80_start, vdso32_int80_end
22753 -vdso32_int80_start:
22754 - .incbin "arch/x86/vdso/vdso32-int80.so"
22756 -#elif defined(CONFIG_X86_XEN)
22757 - .globl vdso32_syscall_start, vdso32_syscall_end
22758 -vdso32_syscall_start:
22759 - .incbin "arch/x86/vdso/vdso32-syscall.so"
22760 -vdso32_syscall_end:
22764 Index: head-2008-12-01/arch/x86/vdso/vdso32-setup-xen.c
22765 ===================================================================
22766 --- head-2008-12-01.orig/arch/x86/vdso/vdso32-setup-xen.c 2008-12-01 11:44:55.000000000 +0100
22767 +++ head-2008-12-01/arch/x86/vdso/vdso32-setup-xen.c 2008-12-01 11:49:07.000000000 +0100
22768 @@ -195,50 +195,28 @@ static __init void relocate_vdso(Elf32_E
22773 - * These symbols are defined by vdso32.S to mark the bounds
22774 - * of the ELF DSO images included therein.
22776 -extern const char vdso32_default_start, vdso32_default_end;
22777 -extern const char vdso32_sysenter_start, vdso32_sysenter_end;
22778 static struct page *vdso32_pages[1];
22780 #ifdef CONFIG_X86_64
22782 -#if CONFIG_XEN_COMPAT < 0x030200
22783 -static int use_int80 = 1;
22785 -static int use_sysenter __read_mostly = -1;
22787 -#define vdso32_sysenter() (use_sysenter > 0)
22788 +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
22789 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
22791 -/* May not be __init: called during resume */
22792 -void syscall32_cpu_init(void)
22793 +void __cpuinit syscall32_cpu_init(void)
22795 - static const struct callback_register cstar = {
22796 + static /*const*/ struct callback_register __cpuinitdata cstar = {
22797 .type = CALLBACKTYPE_syscall32,
22798 .address = (unsigned long)ia32_cstar_target
22800 - static const struct callback_register sysenter = {
22801 + static /*const*/ struct callback_register __cpuinitdata sysenter = {
22802 .type = CALLBACKTYPE_sysenter,
22803 .address = (unsigned long)ia32_sysenter_target
22806 - if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
22807 - (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
22808 -#if CONFIG_XEN_COMPAT < 0x030200
22815 - if (use_sysenter < 0) {
22816 - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
22817 - use_sysenter = 1;
22818 - if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
22819 - use_sysenter = 1;
22821 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
22822 + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
22823 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)
22824 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
22827 #define compat_uses_vma 1
22828 @@ -250,6 +228,7 @@ static inline void map_compat_vdso(int m
22829 #else /* CONFIG_X86_32 */
22831 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
22832 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
22834 extern asmlinkage void ia32pv_cstar_target(void);
22835 static /*const*/ struct callback_register __cpuinitdata cstar = {
22836 @@ -265,13 +244,13 @@ void __cpuinit enable_sep_cpu(void)
22837 .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
22840 - if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
22841 + if (vdso32_syscall()) {
22842 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
22847 - if (!boot_cpu_has(X86_FEATURE_SEP))
22848 + if (!vdso32_sysenter())
22851 if (xen_feature(XENFEAT_supervisor_mode_kernel))
22852 @@ -341,34 +320,26 @@ int __init sysenter_setup(void)
22854 #ifdef CONFIG_X86_32
22858 -#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
22860 - extern const char vdso32_int80_start, vdso32_int80_end;
22862 - vsyscall = &vdso32_int80_start;
22863 - vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
22865 -#elif defined(CONFIG_X86_32)
22866 - if (boot_cpu_has(X86_FEATURE_SYSCALL)
22867 - && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
22868 - || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
22869 - setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
22870 - barrier(); /* until clear_bit()'s constraints are correct ... */
22871 if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
22872 - extern const char vdso32_syscall_start, vdso32_syscall_end;
22874 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
22875 + && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
22876 + setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
22878 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
22879 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
22883 + if (vdso32_syscall()) {
22884 vsyscall = &vdso32_syscall_start;
22885 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
22888 - if (!vdso32_sysenter()) {
22889 - vsyscall = &vdso32_default_start;
22890 - vsyscall_len = &vdso32_default_end - &vdso32_default_start;
22892 + } else if (vdso32_sysenter()){
22893 vsyscall = &vdso32_sysenter_start;
22894 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
22896 + vsyscall = &vdso32_int80_start;
22897 + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
22900 memcpy(syscall_page, vsyscall, vsyscall_len);
22901 Index: head-2008-12-01/arch/x86/xen/Kconfig
22902 ===================================================================
22903 --- head-2008-12-01.orig/arch/x86/xen/Kconfig 2008-12-01 11:36:47.000000000 +0100
22904 +++ head-2008-12-01/arch/x86/xen/Kconfig 2008-12-01 11:49:07.000000000 +0100
22905 @@ -17,7 +17,7 @@ config XEN_MAX_DOMAIN_MEMORY
22906 int "Maximum allowed size of a domain in gigabytes"
22907 default 8 if X86_32
22908 default 32 if X86_64
22910 + depends on PARAVIRT_XEN
22912 The pseudo-physical to machine address array is sized
22913 according to the maximum possible memory size of a Xen
22914 @@ -26,5 +26,5 @@ config XEN_MAX_DOMAIN_MEMORY
22916 config XEN_SAVE_RESTORE
22919 + depends on PARAVIRT_XEN && PM
22921 \ No newline at end of file
22922 Index: head-2008-12-01/drivers/acpi/processor_core.c
22923 ===================================================================
22924 --- head-2008-12-01.orig/drivers/acpi/processor_core.c 2008-12-01 11:44:55.000000000 +0100
22925 +++ head-2008-12-01/drivers/acpi/processor_core.c 2008-12-01 11:49:07.000000000 +0100
22926 @@ -721,9 +721,11 @@ static int __cpuinit acpi_processor_star
22930 - sysdev = get_cpu_sysdev(pr->id);
22931 - if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
22933 + if (pr->id != -1) {
22934 + sysdev = get_cpu_sysdev(pr->id);
22935 + if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
22939 status = acpi_install_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
22940 acpi_processor_notify, pr);
22941 @@ -895,7 +897,8 @@ static int acpi_processor_remove(struct
22942 status = acpi_remove_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
22943 acpi_processor_notify);
22945 - sysfs_remove_link(&device->dev.kobj, "sysdev");
22946 + if (pr->id != -1)
22947 + sysfs_remove_link(&device->dev.kobj, "sysdev");
22949 acpi_processor_remove_fs(device);
22951 Index: head-2008-12-01/drivers/char/tpm/tpm_vtpm.c
22952 ===================================================================
22953 --- head-2008-12-01.orig/drivers/char/tpm/tpm_vtpm.c 2008-12-03 15:48:43.000000000 +0100
22954 +++ head-2008-12-01/drivers/char/tpm/tpm_vtpm.c 2008-12-01 11:49:07.000000000 +0100
22955 @@ -347,7 +347,7 @@ static int _vtpm_send_queued(struct tpm_
22960 + unsigned long flags;
22961 unsigned char buffer[1];
22962 struct vtpm_state *vtpms;
22963 vtpms = (struct vtpm_state *)chip_get_private(chip);
22964 Index: head-2008-12-01/drivers/misc/Kconfig
22965 ===================================================================
22966 --- head-2008-12-01.orig/drivers/misc/Kconfig 2008-12-03 15:48:43.000000000 +0100
22967 +++ head-2008-12-01/drivers/misc/Kconfig 2008-12-01 11:49:07.000000000 +0100
22968 @@ -438,7 +438,7 @@ config ENCLOSURE_SERVICES
22970 tristate "Support communication between SGI SSIs"
22972 - depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP
22973 + depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP && !XEN
22974 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
22975 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
22976 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
22977 @@ -465,7 +465,7 @@ config HP_ILO
22980 tristate "SGI GRU driver"
22981 - depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP
22982 + depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP && !XEN
22984 select MMU_NOTIFIER
22986 Index: head-2008-12-01/drivers/pci/msi-xen.c
22987 ===================================================================
22988 --- head-2008-12-01.orig/drivers/pci/msi-xen.c 2008-12-01 11:44:55.000000000 +0100
22989 +++ head-2008-12-01/drivers/pci/msi-xen.c 2008-12-01 11:49:07.000000000 +0100
22990 @@ -90,12 +90,10 @@ arch_teardown_msi_irqs(struct pci_dev *d
22994 -static void msi_set_enable(struct pci_dev *dev, int enable)
22995 +static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
23000 - pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
23002 pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
23003 control &= ~PCI_MSI_FLAGS_ENABLE;
23004 @@ -105,6 +103,11 @@ static void msi_set_enable(struct pci_de
23008 +static void msi_set_enable(struct pci_dev *dev, int enable)
23010 + __msi_set_enable(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), enable);
23013 static void msix_set_enable(struct pci_dev *dev, int enable)
23016 @@ -573,9 +576,8 @@ int pci_enable_msi(struct pci_dev* dev)
23018 /* Check whether driver already requested for MSI-X irqs */
23019 if (dev->msix_enabled) {
23020 - printk(KERN_INFO "PCI: %s: Can't enable MSI. "
23021 - "Device already has MSI-X enabled\n",
23023 + dev_info(&dev->dev, "can't enable MSI "
23024 + "(MSI-X already enabled)\n");
23028 @@ -707,9 +709,8 @@ int pci_enable_msix(struct pci_dev* dev,
23030 /* Check whether driver already requested for MSI vector */
23031 if (dev->msi_enabled) {
23032 - printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
23033 - "Device already has an MSI irq assigned\n",
23035 + dev_info(&dev->dev, "can't enable MSI-X "
23036 + "(MSI IRQ already assigned)\n");
23040 Index: head-2008-12-01/drivers/pci/quirks.c
23041 ===================================================================
23042 --- head-2008-12-01.orig/drivers/pci/quirks.c 2008-12-03 15:48:43.000000000 +0100
23043 +++ head-2008-12-01/drivers/pci/quirks.c 2008-12-01 11:49:07.000000000 +0100
23044 @@ -42,9 +42,7 @@ static void __devinit quirk_release_reso
23045 /* PCI Host Bridge isn't a target device */
23049 - "PCI: Disable device and release resources [%s].\n",
23051 + dev_info(&dev->dev, "disable device and release resources\n");
23052 pci_disable_device(dev);
23054 for (i=0; i < PCI_NUM_RESOURCES; i++) {
23055 Index: head-2008-12-01/drivers/pci/setup-res.c
23056 ===================================================================
23057 --- head-2008-12-01.orig/drivers/pci/setup-res.c 2008-12-03 15:48:43.000000000 +0100
23058 +++ head-2008-12-01/drivers/pci/setup-res.c 2008-12-01 11:50:17.000000000 +0100
23059 @@ -129,7 +129,7 @@ int pci_claim_resource(struct pci_dev *d
23060 #ifdef CONFIG_PCI_REASSIGN
23061 void pci_disable_bridge_window(struct pci_dev *dev)
23063 - printk(KERN_DEBUG "PCI: Disable bridge window on %s\n", pci_name(dev));
23064 + dev_dbg(&dev->dev, "disable bridge window\n");
23066 /* MMIO Base/Limit */
23067 pci_write_config_dword(dev, PCI_MEMORY_BASE, 0x0000fff0);
23068 @@ -190,8 +190,8 @@ int pci_assign_resource(struct pci_dev *
23069 res->flags &= ~IORESOURCE_STARTALIGN;
23070 if (resno < PCI_BRIDGE_RESOURCES) {
23071 #ifdef CONFIG_PCI_REASSIGN
23072 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23073 - "%016llx - %016llx\n", resno, pci_name(dev),
23074 + dev_dbg(&dev->dev, "assign resource(%d) "
23075 + "%016llx - %016llx\n", resno,
23076 (unsigned long long)res->start,
23077 (unsigned long long)res->end);
23079 @@ -235,8 +235,8 @@ int pci_assign_resource_fixed(struct pci
23080 (unsigned long long)res->end);
23081 } else if (resno < PCI_BRIDGE_RESOURCES) {
23082 #ifdef CONFIG_PCI_REASSIGN
23083 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23084 - "%016llx - %016llx\n", resno, pci_name(dev),
23085 + dev_dbg(&dev->dev, "assign resource(%d) "
23086 + "%016llx - %016llx\n", resno,
23087 (unsigned long long)res->start,
23088 (unsigned long long)res->end);
23090 Index: head-2008-12-01/drivers/xen/Makefile
23091 ===================================================================
23092 --- head-2008-12-01.orig/drivers/xen/Makefile 2008-12-01 11:44:55.000000000 +0100
23093 +++ head-2008-12-01/drivers/xen/Makefile 2008-12-01 11:49:07.000000000 +0100
23095 -obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
23096 +obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o
23097 xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
23098 xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
23100 Index: head-2008-12-01/drivers/xen/balloon/sysfs.c
23101 ===================================================================
23102 --- head-2008-12-01.orig/drivers/xen/balloon/sysfs.c 2008-12-01 11:37:10.000000000 +0100
23103 +++ head-2008-12-01/drivers/xen/balloon/sysfs.c 2008-12-01 11:49:07.000000000 +0100
23106 #define BALLOON_SHOW(name, format, args...) \
23107 static ssize_t show_##name(struct sys_device *dev, \
23108 + struct sysdev_attribute *attr, \
23111 return sprintf(buf, format, ##args); \
23112 @@ -59,14 +60,15 @@ BALLOON_SHOW(hard_limit_kb,
23113 (bs.hard_limit!=~0UL) ? PAGES2KB(bs.hard_limit) : 0);
23114 BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
23116 -static ssize_t show_target_kb(struct sys_device *dev, char *buf)
23117 +static ssize_t show_target_kb(struct sys_device *dev,
23118 + struct sysdev_attribute *attr, char *buf)
23120 return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
23123 static ssize_t store_target_kb(struct sys_device *dev,
23126 + struct sysdev_attribute *attr,
23127 + const char *buf, size_t count)
23129 char memstring[64], *endchar;
23130 unsigned long long target_bytes;
23131 Index: head-2008-12-01/drivers/xen/blktap/blktap.c
23132 ===================================================================
23133 --- head-2008-12-01.orig/drivers/xen/blktap/blktap.c 2008-12-01 11:44:55.000000000 +0100
23134 +++ head-2008-12-01/drivers/xen/blktap/blktap.c 2008-12-01 11:49:07.000000000 +0100
23136 #include <linux/gfp.h>
23137 #include <linux/poll.h>
23138 #include <linux/delay.h>
23139 +#include <linux/nsproxy.h>
23140 #include <asm/tlbflush.h>
23142 #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
23143 @@ -483,7 +484,7 @@ found:
23145 if ((class = get_xen_class()) != NULL)
23146 device_create(class, NULL, MKDEV(blktap_major, minor),
23147 - "blktap%d", minor);
23148 + NULL, "blktap%d", minor);
23152 @@ -1686,7 +1687,8 @@ static int __init blkif_init(void)
23153 * We only create the device when a request of a new device is
23156 - device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
23157 + device_create(class, NULL, MKDEV(blktap_major, 0), NULL,
23160 /* this is bad, but not fatal */
23161 WPRINTK("blktap: sysfs xen_class not created\n");
23162 Index: head-2008-12-01/drivers/xen/char/mem.c
23163 ===================================================================
23164 --- head-2008-12-01.orig/drivers/xen/char/mem.c 2008-12-01 11:44:55.000000000 +0100
23165 +++ head-2008-12-01/drivers/xen/char/mem.c 2008-12-01 11:49:07.000000000 +0100
23166 @@ -35,7 +35,7 @@ static inline int uncached_access(struct
23168 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
23170 -#ifdef CONFIG_NONPROMISC_DEVMEM
23171 +#ifdef CONFIG_STRICT_DEVMEM
23172 u64 from = ((u64)pfn) << PAGE_SHIFT;
23173 u64 to = from + size;
23175 @@ -172,7 +172,10 @@ static void mmap_mem_close(struct vm_are
23177 static struct vm_operations_struct mmap_mem_ops = {
23178 .open = mmap_mem_open,
23179 - .close = mmap_mem_close
23180 + .close = mmap_mem_close,
23181 +#ifdef CONFIG_HAVE_IOREMAP_PROT
23182 + .access = generic_access_phys
23186 static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
23187 Index: head-2008-12-01/drivers/xen/console/console.c
23188 ===================================================================
23189 --- head-2008-12-01.orig/drivers/xen/console/console.c 2008-12-01 11:44:55.000000000 +0100
23190 +++ head-2008-12-01/drivers/xen/console/console.c 2008-12-01 11:49:07.000000000 +0100
23191 @@ -416,9 +416,7 @@ static void __xencons_tx_flush(void)
23193 if (work_done && (xencons_tty != NULL)) {
23194 wake_up_interruptible(&xencons_tty->write_wait);
23195 - if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
23196 - (xencons_tty->ldisc.write_wakeup != NULL))
23197 - (xencons_tty->ldisc.write_wakeup)(xencons_tty);
23198 + tty_wakeup(xencons_tty);
23202 @@ -619,8 +617,8 @@ static void xencons_close(struct tty_str
23204 tty_wait_until_sent(tty, 0);
23205 tty_driver_flush_buffer(tty);
23206 - if (tty->ldisc.flush_buffer != NULL)
23207 - tty->ldisc.flush_buffer(tty);
23208 + if (tty->ldisc.ops->flush_buffer != NULL)
23209 + tty->ldisc.ops->flush_buffer(tty);
23211 spin_lock_irqsave(&xencons_lock, flags);
23212 xencons_tty = NULL;
23213 Index: head-2008-12-01/drivers/xen/core/evtchn.c
23214 ===================================================================
23215 --- head-2008-12-01.orig/drivers/xen/core/evtchn.c 2008-12-01 11:37:10.000000000 +0100
23216 +++ head-2008-12-01/drivers/xen/core/evtchn.c 2008-12-03 15:53:53.000000000 +0100
23217 @@ -744,9 +744,9 @@ static struct irq_chip dynirq_chip = {
23220 /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
23221 -static int pirq_eoi_does_unmask;
23222 +static bool pirq_eoi_does_unmask;
23223 static DECLARE_BITMAP(pirq_needs_eoi, ALIGN(NR_PIRQS, PAGE_SIZE * 8))
23224 - __attribute__ ((__section__(".bss.page_aligned"), __aligned__(PAGE_SIZE)));
23225 + __page_aligned_bss;
23227 static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
23229 @@ -1002,6 +1002,7 @@ void xen_poll_irq(int irq)
23233 +#ifdef CONFIG_PM_SLEEP
23234 static void restore_cpu_virqs(unsigned int cpu)
23236 struct evtchn_bind_virq bind_virq;
23237 @@ -1094,6 +1095,7 @@ void irq_resume(void)
23243 #if defined(CONFIG_X86_IO_APIC)
23244 #define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
23245 @@ -1175,7 +1177,7 @@ void __init xen_init_IRQ(void)
23246 BUG_ON(!bitmap_empty(pirq_needs_eoi, PAGE_SIZE * 8));
23247 eoi_mfn.mfn = virt_to_bus(pirq_needs_eoi) >> PAGE_SHIFT;
23248 if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_mfn, &eoi_mfn) == 0)
23249 - pirq_eoi_does_unmask = 1;
23250 + pirq_eoi_does_unmask = true;
23252 /* No event channels are 'live' right now. */
23253 for (i = 0; i < NR_EVENT_CHANNELS; i++)
23254 Index: head-2008-12-01/drivers/xen/core/gnttab.c
23255 ===================================================================
23256 --- head-2008-12-01.orig/drivers/xen/core/gnttab.c 2008-12-03 15:48:43.000000000 +0100
23257 +++ head-2008-12-01/drivers/xen/core/gnttab.c 2008-12-02 09:26:17.000000000 +0100
23258 @@ -449,6 +449,7 @@ static int map_pte_fn(pte_t *pte, struct
23262 +#ifdef CONFIG_PM_SLEEP
23263 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
23264 unsigned long addr, void *data)
23266 @@ -456,6 +457,7 @@ static int unmap_pte_fn(pte_t *pte, stru
23267 set_pte_at(&init_mm, addr, pte, __pte(0));
23272 void *arch_gnttab_alloc_shared(unsigned long *frames)
23274 @@ -633,6 +635,75 @@ void __gnttab_dma_map_page(struct page *
23275 } while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
23278 +#ifdef __HAVE_ARCH_PTE_SPECIAL
23280 +static unsigned int GNTMAP_pte_special;
23282 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
23283 + unsigned int count)
23287 + if (unlikely(cmd != GNTTABOP_map_grant_ref))
23290 + for (i = 0; i < count; ++i, ++map) {
23291 + if (!(map->flags & GNTMAP_host_map)
23292 + || !(map->flags & GNTMAP_application_map))
23294 + if (GNTMAP_pte_special)
23295 + map->flags |= GNTMAP_pte_special;
23297 + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
23304 +EXPORT_SYMBOL(gnttab_pre_map_adjust);
23306 +#if CONFIG_XEN_COMPAT < 0x030400
23307 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
23312 + for (i = 0; i < count && rc == 0; ++i, ++map) {
23315 + if (!(map->flags & GNTMAP_host_map)
23316 + || !(map->flags & GNTMAP_application_map))
23320 + pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
23321 + | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
23323 + & __supported_pte_mask);
23325 +#error Architecture not yet supported.
23327 + if (!(map->flags & GNTMAP_readonly))
23328 + pte = pte_mkwrite(pte);
23330 + if (map->flags & GNTMAP_contains_pte) {
23333 + u.ptr = map->host_addr;
23334 + u.val = __pte_val(pte);
23335 + rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
23337 + rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
23342 +EXPORT_SYMBOL(gnttab_post_map_adjust);
23345 +#endif /* __HAVE_ARCH_PTE_SPECIAL */
23347 int gnttab_resume(void)
23349 if (max_nr_grant_frames() < nr_grant_frames)
23350 @@ -640,6 +711,7 @@ int gnttab_resume(void)
23351 return gnttab_map(0, nr_grant_frames - 1);
23354 +#ifdef CONFIG_PM_SLEEP
23355 int gnttab_suspend(void)
23358 @@ -649,6 +721,7 @@ int gnttab_suspend(void)
23364 #else /* !CONFIG_XEN */
23366 @@ -759,6 +832,18 @@ int __devinit gnttab_init(void)
23367 gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
23368 gnttab_free_head = NR_RESERVED_ENTRIES;
23370 +#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
23371 + if (!xen_feature(XENFEAT_auto_translated_physmap)
23372 + && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
23374 + GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
23375 + >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
23377 +#error Architecture not yet supported.
23385 Index: head-2008-12-01/drivers/xen/core/machine_kexec.c
23386 ===================================================================
23387 --- head-2008-12-01.orig/drivers/xen/core/machine_kexec.c 2008-12-01 11:44:55.000000000 +0100
23388 +++ head-2008-12-01/drivers/xen/core/machine_kexec.c 2008-12-01 11:49:07.000000000 +0100
23389 @@ -90,7 +90,7 @@ void __init xen_machine_kexec_setup_reso
23390 xen_hypervisor_res.start = range.start;
23391 xen_hypervisor_res.end = range.start + range.size - 1;
23392 xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
23393 -#ifdef CONFIG_X86_64
23395 insert_resource(&iomem_resource, &xen_hypervisor_res);
23398 @@ -105,7 +105,7 @@ void __init xen_machine_kexec_setup_reso
23400 crashk_res.start = range.start;
23401 crashk_res.end = range.start + range.size - 1;
23402 -#ifdef CONFIG_X86_64
23404 insert_resource(&iomem_resource, &crashk_res);
23407 @@ -152,7 +152,7 @@ void __init xen_machine_kexec_setup_reso
23411 -#ifndef CONFIG_X86_64
23412 +#ifndef CONFIG_X86
23413 void __init xen_machine_kexec_register_resources(struct resource *res)
23415 request_resource(res, &xen_hypervisor_res);
23416 Index: head-2008-12-01/drivers/xen/core/machine_reboot.c
23417 ===================================================================
23418 --- head-2008-12-01.orig/drivers/xen/core/machine_reboot.c 2008-12-01 11:44:55.000000000 +0100
23419 +++ head-2008-12-01/drivers/xen/core/machine_reboot.c 2008-12-01 11:49:07.000000000 +0100
23420 @@ -65,6 +65,7 @@ EXPORT_SYMBOL(machine_restart);
23421 EXPORT_SYMBOL(machine_halt);
23422 EXPORT_SYMBOL(machine_power_off);
23424 +#ifdef CONFIG_PM_SLEEP
23425 static void pre_suspend(void)
23427 HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
23428 @@ -119,6 +120,7 @@ static void post_suspend(int suspend_can
23429 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
23430 virt_to_mfn(pfn_to_mfn_frame_list_list);
23434 #else /* !(defined(__i386__) || defined(__x86_64__)) */
23436 @@ -137,6 +139,7 @@ static void post_suspend(int suspend_can
23440 +#ifdef CONFIG_PM_SLEEP
23443 void (*resume_notifier)(int);
23444 @@ -230,7 +233,8 @@ int __xen_suspend(int fast_suspend, void
23446 if (fast_suspend) {
23448 - err = stop_machine_run(take_machine_down, &suspend, 0);
23449 + err = stop_machine(take_machine_down, &suspend,
23450 + &cpumask_of_cpu(0));
23452 xenbus_suspend_cancel();
23454 @@ -253,3 +257,4 @@ int __xen_suspend(int fast_suspend, void
23459 Index: head-2008-12-01/drivers/xen/core/reboot.c
23460 ===================================================================
23461 --- head-2008-12-01.orig/drivers/xen/core/reboot.c 2008-12-01 11:36:47.000000000 +0100
23462 +++ head-2008-12-01/drivers/xen/core/reboot.c 2008-12-01 11:49:07.000000000 +0100
23463 @@ -29,17 +29,12 @@ MODULE_LICENSE("Dual BSD/GPL");
23464 /* Ignore multiple shutdown requests. */
23465 static int shutting_down = SHUTDOWN_INVALID;
23467 -/* Was last suspend request cancelled? */
23468 -static int suspend_cancelled;
23470 /* Can we leave APs online when we suspend? */
23471 static int fast_suspend;
23473 static void __shutdown_handler(struct work_struct *unused);
23474 static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
23476 -static int setup_suspend_evtchn(void);
23478 int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
23480 static int shutdown_process(void *__unused)
23481 @@ -69,6 +64,13 @@ static int shutdown_process(void *__unus
23485 +#ifdef CONFIG_PM_SLEEP
23487 +static int setup_suspend_evtchn(void);
23489 +/* Was last suspend request cancelled? */
23490 +static int suspend_cancelled;
23492 static void xen_resume_notifier(int _suspend_cancelled)
23494 int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
23495 @@ -118,6 +120,10 @@ static int xen_suspend(void *__unused)
23500 +# define xen_suspend NULL
23503 static void switch_shutdown_state(int new_state)
23505 int prev_state, old_state = SHUTDOWN_INVALID;
23506 @@ -194,8 +200,10 @@ static void shutdown_handler(struct xenb
23507 new_state = SHUTDOWN_POWEROFF;
23508 else if (strcmp(str, "reboot") == 0)
23510 +#ifdef CONFIG_PM_SLEEP
23511 else if (strcmp(str, "suspend") == 0)
23512 new_state = SHUTDOWN_SUSPEND;
23514 else if (strcmp(str, "halt") == 0)
23515 new_state = SHUTDOWN_HALT;
23517 @@ -247,6 +255,7 @@ static struct xenbus_watch sysrq_watch =
23518 .callback = sysrq_handler
23521 +#ifdef CONFIG_PM_SLEEP
23522 static irqreturn_t suspend_int(int irq, void* dev_id)
23524 switch_shutdown_state(SHUTDOWN_SUSPEND);
23525 @@ -274,6 +283,9 @@ static int setup_suspend_evtchn(void)
23530 +#define setup_suspend_evtchn() 0
23533 static int setup_shutdown_watcher(void)
23535 Index: head-2008-12-01/drivers/xen/core/smpboot.c
23536 ===================================================================
23537 --- head-2008-12-01.orig/drivers/xen/core/smpboot.c 2008-12-01 11:44:55.000000000 +0100
23538 +++ head-2008-12-01/drivers/xen/core/smpboot.c 2008-12-01 11:49:07.000000000 +0100
23541 extern irqreturn_t smp_reschedule_interrupt(int, void *);
23542 extern irqreturn_t smp_call_function_interrupt(int, void *);
23543 +extern irqreturn_t smp_call_function_single_interrupt(int, void *);
23545 extern int local_setup_timer(unsigned int cpu);
23546 extern void local_teardown_timer(unsigned int cpu);
23547 @@ -54,8 +55,10 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
23549 static DEFINE_PER_CPU(int, resched_irq);
23550 static DEFINE_PER_CPU(int, callfunc_irq);
23551 +static DEFINE_PER_CPU(int, call1func_irq);
23552 static char resched_name[NR_CPUS][15];
23553 static char callfunc_name[NR_CPUS][15];
23554 +static char call1func_name[NR_CPUS][15];
23556 #ifdef CONFIG_X86_LOCAL_APIC
23557 #define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
23558 @@ -77,8 +80,10 @@ void __init prefill_possible_map(void)
23560 for (i = 0; i < NR_CPUS; i++) {
23561 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
23564 cpu_set(i, cpu_possible_map);
23565 + nr_cpu_ids = i + 1;
23570 @@ -114,7 +119,8 @@ static int __cpuinit xen_smp_intr_init(u
23574 - per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
23575 + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) =
23576 + per_cpu(call1func_irq, cpu) = -1;
23578 sprintf(resched_name[cpu], "resched%u", cpu);
23579 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
23580 @@ -138,6 +144,17 @@ static int __cpuinit xen_smp_intr_init(u
23582 per_cpu(callfunc_irq, cpu) = rc;
23584 + sprintf(call1func_name[cpu], "call1func%u", cpu);
23585 + rc = bind_ipi_to_irqhandler(CALL_FUNC_SINGLE_VECTOR,
23587 + smp_call_function_single_interrupt,
23588 + IRQF_DISABLED|IRQF_NOBALANCING,
23589 + call1func_name[cpu],
23593 + per_cpu(call1func_irq, cpu) = rc;
23595 rc = xen_spinlock_init(cpu);
23598 @@ -152,6 +169,8 @@ static int __cpuinit xen_smp_intr_init(u
23599 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23600 if (per_cpu(callfunc_irq, cpu) >= 0)
23601 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23602 + if (per_cpu(call1func_irq, cpu) >= 0)
23603 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23604 xen_spinlock_cleanup(cpu);
23607 @@ -164,6 +183,7 @@ static void __cpuexit xen_smp_intr_exit(
23609 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23610 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23611 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23612 xen_spinlock_cleanup(cpu);
23615 @@ -171,11 +191,7 @@ static void __cpuexit xen_smp_intr_exit(
23616 void __cpuinit cpu_bringup(void)
23620 identify_secondary_cpu(¤t_cpu_data);
23622 - identify_cpu(¤t_cpu_data);
23624 touch_softlockup_watchdog();
23626 local_irq_enable();
23627 @@ -255,9 +271,6 @@ void __init smp_prepare_cpus(unsigned in
23628 struct task_struct *idle;
23630 struct vcpu_get_physid cpu_id;
23632 - struct desc_ptr *gdt_descr;
23637 @@ -270,7 +283,7 @@ void __init smp_prepare_cpus(unsigned in
23639 current_thread_info()->cpu = 0;
23641 - for (cpu = 0; cpu < NR_CPUS; cpu++) {
23642 + for_each_possible_cpu (cpu) {
23643 cpus_clear(per_cpu(cpu_sibling_map, cpu));
23644 cpus_clear(per_cpu(cpu_core_map, cpu));
23646 @@ -297,21 +310,10 @@ void __init smp_prepare_cpus(unsigned in
23648 panic("failed fork for CPU %d", cpu);
23651 - gdt_descr = &cpu_gdt_descr[cpu];
23652 - gdt_descr->address = get_zeroed_page(GFP_KERNEL);
23653 - if (unlikely(!gdt_descr->address)) {
23654 - printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
23658 - gdt_descr->size = GDT_SIZE;
23659 - memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
23660 - gdt_addr = (void *)gdt_descr->address;
23664 - gdt_addr = get_cpu_gdt_table(cpu);
23666 + gdt_addr = get_cpu_gdt_table(cpu);
23667 make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
23670 Index: head-2008-12-01/drivers/xen/core/spinlock.c
23671 ===================================================================
23672 --- head-2008-12-01.orig/drivers/xen/core/spinlock.c 2008-12-01 11:37:10.000000000 +0100
23673 +++ head-2008-12-01/drivers/xen/core/spinlock.c 2008-12-01 11:51:53.000000000 +0100
23674 @@ -73,9 +73,9 @@ int xen_spin_wait(raw_spinlock_t *lock,
23675 /* announce we're spinning */
23676 spinning.ticket = token;
23677 spinning.lock = lock;
23678 - spinning.prev = __get_cpu_var(spinning);
23679 + spinning.prev = x86_read_percpu(spinning);
23681 - __get_cpu_var(spinning) = &spinning;
23682 + x86_write_percpu(spinning, &spinning);
23684 /* clear pending */
23685 xen_clear_irq_pending(irq);
23686 @@ -102,7 +102,7 @@ int xen_spin_wait(raw_spinlock_t *lock,
23687 kstat_this_cpu.irqs[irq] += !rc;
23689 /* announce we're done */
23690 - __get_cpu_var(spinning) = spinning.prev;
23691 + x86_write_percpu(spinning, spinning.prev);
23692 rm_lock = &__get_cpu_var(spinning_rm_lock);
23693 raw_local_irq_save(flags);
23694 __raw_write_lock(rm_lock);
23695 Index: head-2008-12-01/drivers/xen/fbfront/xenfb.c
23696 ===================================================================
23697 --- head-2008-12-01.orig/drivers/xen/fbfront/xenfb.c 2008-12-01 11:44:55.000000000 +0100
23698 +++ head-2008-12-01/drivers/xen/fbfront/xenfb.c 2008-12-01 11:49:07.000000000 +0100
23703 +#include <linux/console.h>
23704 #include <linux/kernel.h>
23705 #include <linux/errno.h>
23706 #include <linux/fb.h>
23707 @@ -544,6 +545,28 @@ static unsigned long vmalloc_to_mfn(void
23708 return pfn_to_mfn(vmalloc_to_pfn(address));
23711 +static __devinit void
23712 +xenfb_make_preferred_console(void)
23714 + struct console *c;
23716 + if (console_set_on_cmdline)
23719 + acquire_console_sem();
23720 + for (c = console_drivers; c; c = c->next) {
23721 + if (!strcmp(c->name, "tty") && c->index == 0)
23724 + release_console_sem();
23726 + unregister_console(c);
23727 + c->flags |= CON_CONSDEV;
23728 + c->flags &= ~CON_PRINTBUFFER; /* don't print again */
23729 + register_console(c);
23733 static int __devinit xenfb_probe(struct xenbus_device *dev,
23734 const struct xenbus_device_id *id)
23736 @@ -673,6 +696,7 @@ static int __devinit xenfb_probe(struct
23740 + xenfb_make_preferred_console();
23744 @@ -881,4 +905,5 @@ static void __exit xenfb_cleanup(void)
23745 module_init(xenfb_init);
23746 module_exit(xenfb_cleanup);
23748 +MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
23749 MODULE_LICENSE("GPL");
23750 Index: head-2008-12-01/drivers/xen/fbfront/xenkbd.c
23751 ===================================================================
23752 --- head-2008-12-01.orig/drivers/xen/fbfront/xenkbd.c 2008-12-01 11:36:07.000000000 +0100
23753 +++ head-2008-12-01/drivers/xen/fbfront/xenkbd.c 2008-12-01 11:49:07.000000000 +0100
23754 @@ -350,4 +350,5 @@ static void __exit xenkbd_cleanup(void)
23755 module_init(xenkbd_init);
23756 module_exit(xenkbd_cleanup);
23758 +MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
23759 MODULE_LICENSE("GPL");
23760 Index: head-2008-12-01/drivers/xen/gntdev/gntdev.c
23761 ===================================================================
23762 --- head-2008-12-01.orig/drivers/xen/gntdev/gntdev.c 2008-12-01 11:44:55.000000000 +0100
23763 +++ head-2008-12-01/drivers/xen/gntdev/gntdev.c 2008-12-01 11:49:07.000000000 +0100
23764 @@ -418,7 +418,7 @@ static int __init gntdev_init(void)
23767 device = device_create(class, NULL, MKDEV(gntdev_major, 0),
23769 + NULL, GNTDEV_NAME);
23770 if (IS_ERR(device)) {
23771 printk(KERN_ERR "Error creating gntdev device in xen_class\n");
23772 printk(KERN_ERR "gntdev created with major number = %d\n",
23773 Index: head-2008-12-01/drivers/xen/netfront/accel.c
23774 ===================================================================
23775 --- head-2008-12-01.orig/drivers/xen/netfront/accel.c 2008-12-01 11:36:55.000000000 +0100
23776 +++ head-2008-12-01/drivers/xen/netfront/accel.c 2008-12-01 11:49:07.000000000 +0100
23781 +#include <linux/version.h>
23782 #include <linux/netdevice.h>
23783 #include <linux/skbuff.h>
23784 #include <linux/list.h>
23785 Index: head-2008-12-01/drivers/xen/netfront/netfront.c
23786 ===================================================================
23787 --- head-2008-12-01.orig/drivers/xen/netfront/netfront.c 2008-12-01 11:44:55.000000000 +0100
23788 +++ head-2008-12-01/drivers/xen/netfront/netfront.c 2008-12-01 11:49:07.000000000 +0100
23789 @@ -640,7 +640,7 @@ static int network_open(struct net_devic
23791 spin_unlock_bh(&np->rx_lock);
23793 - network_maybe_wake_tx(dev);
23794 + netif_start_queue(dev);
23798 Index: head-2008-12-01/drivers/xen/sfc_netback/accel.h
23799 ===================================================================
23800 --- head-2008-12-01.orig/drivers/xen/sfc_netback/accel.h 2008-12-03 15:48:43.000000000 +0100
23801 +++ head-2008-12-01/drivers/xen/sfc_netback/accel.h 2008-12-01 11:49:07.000000000 +0100
23803 #ifndef NETBACK_ACCEL_H
23804 #define NETBACK_ACCEL_H
23806 +#include <linux/version.h>
23807 #include <linux/slab.h>
23808 #include <linux/ip.h>
23809 #include <linux/tcp.h>
23810 Index: head-2008-12-01/drivers/xen/sfc_netfront/accel.h
23811 ===================================================================
23812 --- head-2008-12-01.orig/drivers/xen/sfc_netfront/accel.h 2008-12-01 11:29:05.000000000 +0100
23813 +++ head-2008-12-01/drivers/xen/sfc_netfront/accel.h 2008-12-01 11:49:07.000000000 +0100
23815 #include <xen/evtchn.h>
23817 #include <linux/kernel.h>
23818 +#include <linux/version.h>
23819 #include <linux/list.h>
23821 enum netfront_accel_post_status {
23822 Index: head-2008-12-01/drivers/xen/xenbus/xenbus_comms.c
23823 ===================================================================
23824 --- head-2008-12-01.orig/drivers/xen/xenbus/xenbus_comms.c 2008-12-01 11:36:47.000000000 +0100
23825 +++ head-2008-12-01/drivers/xen/xenbus/xenbus_comms.c 2008-12-01 11:49:07.000000000 +0100
23826 @@ -228,14 +228,11 @@ int xb_init_comms(void)
23827 intf->rsp_cons = intf->rsp_prod;
23830 +#if defined(CONFIG_XEN) || defined(MODULE)
23832 unbind_from_irqhandler(xenbus_irq, &xb_waitq);
23834 -#if defined(CONFIG_XEN) || defined(MODULE)
23835 err = bind_caller_port_to_irqhandler(
23837 - err = bind_evtchn_to_irqhandler(
23839 xen_store_evtchn, wake_waiting,
23840 0, "xenbus", &xb_waitq);
23842 @@ -244,6 +241,20 @@ int xb_init_comms(void)
23847 + if (xenbus_irq) {
23848 + /* Already have an irq; assume we're resuming */
23849 + rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
23851 + err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
23852 + 0, "xenbus", &xb_waitq);
23854 + printk(KERN_ERR "XENBUS request irq failed %i\n", err);
23857 + xenbus_irq = err;
23863 Index: head-2008-12-01/drivers/xen/xenbus/xenbus_probe.c
23864 ===================================================================
23865 --- head-2008-12-01.orig/drivers/xen/xenbus/xenbus_probe.c 2008-12-01 11:44:55.000000000 +0100
23866 +++ head-2008-12-01/drivers/xen/xenbus/xenbus_probe.c 2008-12-01 11:49:07.000000000 +0100
23868 __FUNCTION__, __LINE__, ##args)
23870 #include <linux/kernel.h>
23871 +#include <linux/version.h>
23872 #include <linux/err.h>
23873 #include <linux/string.h>
23874 #include <linux/ctype.h>
23875 Index: head-2008-12-01/fs/aio.c
23876 ===================================================================
23877 --- head-2008-12-01.orig/fs/aio.c 2008-12-01 11:44:55.000000000 +0100
23878 +++ head-2008-12-01/fs/aio.c 2008-12-01 11:49:07.000000000 +0100
23879 @@ -1319,7 +1319,7 @@ static int make_aio_fd(struct kioctx *io
23883 - fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
23884 + fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
23888 Index: head-2008-12-01/include/asm-generic/pgtable.h
23889 ===================================================================
23890 --- head-2008-12-01.orig/include/asm-generic/pgtable.h 2008-12-01 11:29:05.000000000 +0100
23891 +++ head-2008-12-01/include/asm-generic/pgtable.h 2008-12-01 11:49:07.000000000 +0100
23892 @@ -99,10 +99,6 @@ static inline void ptep_set_wrprotect(st
23896 -#ifndef arch_change_pte_range
23897 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
23900 #ifndef __HAVE_ARCH_PTE_SAME
23901 #define pte_same(A,B) (pte_val(A) == pte_val(B))
23903 Index: head-2008-12-01/include/asm-x86/dma-mapping.h
23904 ===================================================================
23905 --- head-2008-12-01.orig/include/asm-x86/dma-mapping.h 2008-12-01 11:44:55.000000000 +0100
23906 +++ head-2008-12-01/include/asm-x86/dma-mapping.h 2008-12-01 11:49:07.000000000 +0100
23907 @@ -74,7 +74,7 @@ static inline struct dma_mapping_ops *ge
23908 /* Make sure we keep the same behaviour */
23909 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
23911 -#ifdef CONFIG_X86_32
23912 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
23915 struct dma_mapping_ops *ops = get_dma_ops(dev);
23916 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/desc.h
23917 ===================================================================
23918 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/desc.h 2008-12-01 11:44:55.000000000 +0100
23919 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/desc.h 2008-12-01 11:49:07.000000000 +0100
23920 @@ -31,11 +31,17 @@ extern struct desc_ptr idt_descr;
23921 extern gate_desc idt_table[];
23925 + struct desc_struct gdt[GDT_ENTRIES];
23926 +} __attribute__((aligned(PAGE_SIZE)));
23927 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
23929 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
23931 + return per_cpu(gdt_page, cpu).gdt;
23934 #ifdef CONFIG_X86_64
23935 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
23936 -extern struct desc_ptr cpu_gdt_descr[];
23937 -/* the cpu gdt accessor */
23938 -#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
23940 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
23941 unsigned dpl, unsigned ist, unsigned seg)
23942 @@ -53,16 +59,6 @@ static inline void pack_gate(gate_desc *
23947 - struct desc_struct gdt[GDT_ENTRIES];
23948 -} __attribute__((aligned(PAGE_SIZE)));
23949 -DECLARE_PER_CPU(struct gdt_page, gdt_page);
23951 -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
23953 - return per_cpu(gdt_page, cpu).gdt;
23956 static inline void pack_gate(gate_desc *gate, unsigned char type,
23957 unsigned long base, unsigned dpl, unsigned flags,
23958 unsigned short seg)
23959 @@ -333,6 +329,28 @@ static inline void set_intr_gate(unsigne
23960 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
23963 +#define SYS_VECTOR_FREE 0
23964 +#define SYS_VECTOR_ALLOCED 1
23966 +extern int first_system_vector;
23967 +extern char system_vectors[];
23969 +static inline void alloc_system_vector(int vector)
23971 + if (system_vectors[vector] == SYS_VECTOR_FREE) {
23972 + system_vectors[vector] = SYS_VECTOR_ALLOCED;
23973 + if (first_system_vector > vector)
23974 + first_system_vector = vector;
23979 +static inline void alloc_intr_gate(unsigned int n, void *addr)
23981 + alloc_system_vector(n);
23982 + set_intr_gate(n, addr);
23986 * This routine sets up an interrupt gate at directory privilege level 3.
23988 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap.h
23989 ===================================================================
23990 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/fixmap.h 2008-12-01 11:44:55.000000000 +0100
23991 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap.h 2008-12-01 11:49:07.000000000 +0100
23993 # include "fixmap_64.h"
23996 +extern int fixmaps_set;
23998 +void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
24000 +static inline void __set_fixmap(enum fixed_addresses idx,
24001 + maddr_t phys, pgprot_t flags)
24003 + xen_set_fixmap(idx, phys, flags);
24006 +#define set_fixmap(idx, phys) \
24007 + __set_fixmap(idx, phys, PAGE_KERNEL)
24010 + * Some hardware wants to get fixmapped without caching.
24012 +#define set_fixmap_nocache(idx, phys) \
24013 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24015 #define clear_fixmap(idx) \
24016 __set_fixmap(idx, 0, __pgprot(0))
24018 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24019 +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24021 +extern void __this_fixmap_does_not_exist(void);
24024 + * 'index to address' translation. If anyone tries to use the idx
24025 + * directly without translation, we catch the bug with a NULL-deference
24026 + * kernel oops. Illegal ranges of incoming indices are caught too.
24028 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24031 + * this branch gets completely eliminated after inlining,
24032 + * except when someone tries to use fixaddr indices in an
24033 + * illegal way. (such as mixing up address types or using
24034 + * out-of-range indices).
24036 + * If it doesn't get removed, the linker will complain
24037 + * loudly with a reasonably clear error message..
24039 + if (idx >= __end_of_fixed_addresses)
24040 + __this_fixmap_does_not_exist();
24042 + return __fix_to_virt(idx);
24045 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
24047 + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24048 + return __virt_to_fix(vaddr);
24051 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_32.h
24052 ===================================================================
24053 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2008-12-01 11:44:55.000000000 +0100
24054 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_32.h 2008-12-01 11:49:07.000000000 +0100
24055 @@ -58,10 +58,17 @@ enum fixed_addresses {
24056 #ifdef CONFIG_X86_LOCAL_APIC
24057 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24059 -#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
24060 +#ifndef CONFIG_XEN
24061 +#ifdef CONFIG_X86_IO_APIC
24062 FIX_IO_APIC_BASE_0,
24063 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
24067 +#define NR_FIX_ISAMAPS 256
24069 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24071 #ifdef CONFIG_X86_VISWS_APIC
24072 FIX_CO_CPU, /* Cobalt timer */
24073 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
24074 @@ -78,51 +85,38 @@ enum fixed_addresses {
24075 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
24076 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
24078 -#ifdef CONFIG_ACPI
24080 - FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24082 #ifdef CONFIG_PCI_MMCONFIG
24085 #ifdef CONFIG_PARAVIRT
24086 FIX_PARAVIRT_BOOTMAP,
24089 -#define NR_FIX_ISAMAPS 256
24091 - FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24092 __end_of_permanent_fixed_addresses,
24094 * 256 temporary boot-time mappings, used by early_ioremap(),
24095 * before ioremap() is functional.
24097 - * We round it up to the next 512 pages boundary so that we
24098 + * We round it up to the next 256 pages boundary so that we
24099 * can have a single pgd entry and a single pte table:
24101 #define NR_FIX_BTMAPS 64
24102 #define FIX_BTMAPS_NESTING 4
24103 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24104 - (__end_of_permanent_fixed_addresses & 511),
24105 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
24106 + (__end_of_permanent_fixed_addresses & 255),
24107 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24109 +#ifdef CONFIG_ACPI
24111 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24113 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24116 __end_of_fixed_addresses
24119 -extern void __set_fixmap(enum fixed_addresses idx,
24120 - maddr_t phys, pgprot_t flags);
24121 extern void reserve_top_address(unsigned long reserve);
24123 -#define set_fixmap(idx, phys) \
24124 - __set_fixmap(idx, phys, PAGE_KERNEL)
24126 - * Some hardware wants to get fixmapped without caching.
24128 -#define set_fixmap_nocache(idx, phys) \
24129 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24131 #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
24133 @@ -131,38 +125,5 @@ extern void reserve_top_address(unsigned
24134 #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
24135 #define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
24137 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24138 -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24140 -extern void __this_fixmap_does_not_exist(void);
24143 - * 'index to address' translation. If anyone tries to use the idx
24144 - * directly without tranlation, we catch the bug with a NULL-deference
24145 - * kernel oops. Illegal ranges of incoming indices are caught too.
24147 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24150 - * this branch gets completely eliminated after inlining,
24151 - * except when someone tries to use fixaddr indices in an
24152 - * illegal way. (such as mixing up address types or using
24153 - * out-of-range indices).
24155 - * If it doesn't get removed, the linker will complain
24156 - * loudly with a reasonably clear error message..
24158 - if (idx >= __end_of_fixed_addresses)
24159 - __this_fixmap_does_not_exist();
24161 - return __fix_to_virt(idx);
24164 -static inline unsigned long virt_to_fix(const unsigned long vaddr)
24166 - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24167 - return __virt_to_fix(vaddr);
24170 #endif /* !__ASSEMBLY__ */
24172 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_64.h
24173 ===================================================================
24174 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2008-12-01 11:44:55.000000000 +0100
24175 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_64.h 2008-12-01 11:49:07.000000000 +0100
24177 #define _ASM_FIXMAP_64_H
24179 #include <linux/kernel.h>
24180 +#include <asm/acpi.h>
24181 #include <asm/apicdef.h>
24182 #include <asm/page.h>
24183 #include <asm/vsyscall.h>
24184 @@ -40,7 +41,6 @@ enum fixed_addresses {
24187 FIX_EARLYCON_MEM_BASE,
24189 #ifdef CONFIG_X86_LOCAL_APIC
24190 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24192 @@ -53,14 +53,21 @@ enum fixed_addresses {
24193 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
24194 + MAX_EFI_IO_PAGES - 1,
24196 +#ifdef CONFIG_PARAVIRT
24197 + FIX_PARAVIRT_BOOTMAP,
24203 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24206 #define NR_FIX_ISAMAPS 256
24208 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24209 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24210 + FIX_OHCI1394_BASE,
24212 __end_of_permanent_fixed_addresses,
24214 * 256 temporary boot-time mappings, used by early_ioremap(),
24215 @@ -71,27 +78,12 @@ enum fixed_addresses {
24217 #define NR_FIX_BTMAPS 64
24218 #define FIX_BTMAPS_NESTING 4
24220 - __end_of_permanent_fixed_addresses + 512 -
24221 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24222 (__end_of_permanent_fixed_addresses & 511),
24223 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24224 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24225 - FIX_OHCI1394_BASE,
24227 __end_of_fixed_addresses
24230 -extern void __set_fixmap(enum fixed_addresses idx,
24231 - unsigned long phys, pgprot_t flags);
24233 -#define set_fixmap(idx, phys) \
24234 - __set_fixmap(idx, phys, PAGE_KERNEL)
24236 - * Some hardware wants to get fixmapped without caching.
24238 -#define set_fixmap_nocache(idx, phys) \
24239 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24241 #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
24242 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
24243 #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
24244 @@ -100,30 +92,4 @@ extern void __set_fixmap(enum fixed_addr
24245 #define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
24246 #define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
24248 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24250 -extern void __this_fixmap_does_not_exist(void);
24253 - * 'index to address' translation. If anyone tries to use the idx
24254 - * directly without translation, we catch the bug with a NULL-deference
24255 - * kernel oops. Illegal ranges of incoming indices are caught too.
24257 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24260 - * this branch gets completely eliminated after inlining,
24261 - * except when someone tries to use fixaddr indices in an
24262 - * illegal way. (such as mixing up address types or using
24263 - * out-of-range indices).
24265 - * If it doesn't get removed, the linker will complain
24266 - * loudly with a reasonably clear error message..
24268 - if (idx >= __end_of_fixed_addresses)
24269 - __this_fixmap_does_not_exist();
24271 - return __fix_to_virt(idx);
24275 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/highmem.h
24276 ===================================================================
24277 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/highmem.h 2008-12-01 11:44:55.000000000 +0100
24278 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/highmem.h 2008-12-01 11:49:07.000000000 +0100
24279 @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *p
24281 #define flush_cache_kmaps() do { } while (0)
24283 +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
24284 + unsigned long end_pfn);
24286 void clear_highpage(struct page *);
24287 static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
24289 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/hypercall.h
24290 ===================================================================
24291 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/hypercall.h 2008-12-01 11:36:55.000000000 +0100
24292 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/hypercall.h 2008-12-01 11:49:07.000000000 +0100
24293 @@ -323,9 +323,19 @@ static inline int __must_check
24294 HYPERVISOR_grant_table_op(
24295 unsigned int cmd, void *uop, unsigned int count)
24297 + bool fixup = false;
24300 if (arch_use_lazy_mmu_mode())
24301 xen_multicall_flush(false);
24302 - return _hypercall3(int, grant_table_op, cmd, uop, count);
24303 +#ifdef GNTTABOP_map_grant_ref
24304 + if (cmd == GNTTABOP_map_grant_ref)
24306 + fixup = gnttab_pre_map_adjust(cmd, uop, count);
24307 + rc = _hypercall3(int, grant_table_op, cmd, uop, count);
24308 + if (rc == 0 && fixup)
24309 + rc = gnttab_post_map_adjust(uop, count);
24313 static inline int __must_check
24314 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/hypervisor.h
24315 ===================================================================
24316 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2008-12-01 11:37:10.000000000 +0100
24317 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/hypervisor.h 2008-12-01 11:49:07.000000000 +0100
24320 #include <linux/types.h>
24321 #include <linux/kernel.h>
24322 -#include <linux/version.h>
24323 #include <linux/errno.h>
24324 #include <xen/interface/xen.h>
24325 #include <xen/interface/platform.h>
24326 @@ -171,6 +170,20 @@ static inline void arch_flush_lazy_mmu_m
24330 +struct gnttab_map_grant_ref;
24331 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
24332 + unsigned int count);
24333 +#if CONFIG_XEN_COMPAT < 0x030400
24334 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
24336 +static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
24337 + unsigned int count)
24344 #else /* CONFIG_XEN */
24346 static inline void xen_multicall_flush(bool ignore) {}
24347 @@ -179,6 +192,9 @@ static inline void xen_multicall_flush(b
24348 #define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; })
24349 #define xen_multi_mmuext_op(...) ({ BUG(); -ENOSYS; })
24351 +#define gnttab_pre_map_adjust(...) false
24352 +#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
24354 #endif /* CONFIG_XEN */
24356 #if defined(CONFIG_X86_64)
24357 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/io.h
24358 ===================================================================
24359 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/io.h 2008-12-01 11:44:55.000000000 +0100
24360 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/io.h 2008-12-01 11:49:07.000000000 +0100
24363 #define ARCH_HAS_IOREMAP_WC
24365 +#include <linux/compiler.h>
24368 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24369 + * mappings, before the real ioremap() is functional.
24370 + * A boot-time mapping is currently limited to at most 16 pages.
24372 +#ifndef __ASSEMBLY__
24373 +extern void early_ioremap_init(void);
24374 +extern void early_ioremap_clear(void);
24375 +extern void early_ioremap_reset(void);
24376 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24377 +extern void early_iounmap(void *addr, unsigned long size);
24378 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24381 +#define build_mmio_read(name, size, type, reg, barrier) \
24382 +static inline type name(const volatile void __iomem *addr) \
24383 +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
24384 +:"m" (*(volatile type __force *)addr) barrier); return ret; }
24386 +#define build_mmio_write(name, size, type, reg, barrier) \
24387 +static inline void name(type val, volatile void __iomem *addr) \
24388 +{ asm volatile("mov" size " %0,%1": :reg (val), \
24389 +"m" (*(volatile type __force *)addr) barrier); }
24391 +build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
24392 +build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
24393 +build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
24395 +build_mmio_read(__readb, "b", unsigned char, "=q", )
24396 +build_mmio_read(__readw, "w", unsigned short, "=r", )
24397 +build_mmio_read(__readl, "l", unsigned int, "=r", )
24399 +build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
24400 +build_mmio_write(writew, "w", unsigned short, "r", :"memory")
24401 +build_mmio_write(writel, "l", unsigned int, "r", :"memory")
24403 +build_mmio_write(__writeb, "b", unsigned char, "q", )
24404 +build_mmio_write(__writew, "w", unsigned short, "r", )
24405 +build_mmio_write(__writel, "l", unsigned int, "r", )
24407 +#define readb_relaxed(a) __readb(a)
24408 +#define readw_relaxed(a) __readw(a)
24409 +#define readl_relaxed(a) __readl(a)
24410 +#define __raw_readb __readb
24411 +#define __raw_readw __readw
24412 +#define __raw_readl __readl
24414 +#define __raw_writeb __writeb
24415 +#define __raw_writew __writew
24416 +#define __raw_writel __writel
24418 +#define mmiowb() barrier()
24420 +#ifdef CONFIG_X86_64
24421 +build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
24422 +build_mmio_read(__readq, "q", unsigned long, "=r", )
24423 +build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
24424 +build_mmio_write(__writeq, "q", unsigned long, "r", )
24426 +#define readq_relaxed(a) __readq(a)
24427 +#define __raw_readq __readq
24428 +#define __raw_writeq writeq
24430 +/* Let people know we have them */
24431 +#define readq readq
24432 +#define writeq writeq
24435 #ifdef CONFIG_X86_32
24436 # include "io_32.h"
24438 @@ -19,4 +89,17 @@ extern int ioremap_check_change_attr(uns
24439 unsigned long prot_val);
24440 extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
24443 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24444 + * mappings, before the real ioremap() is functional.
24445 + * A boot-time mapping is currently limited to at most 16 pages.
24447 +extern void early_ioremap_init(void);
24448 +extern void early_ioremap_clear(void);
24449 +extern void early_ioremap_reset(void);
24450 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24451 +extern void early_iounmap(void *addr, unsigned long size);
24452 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24455 #endif /* _ASM_X86_IO_H */
24456 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/io_32.h
24457 ===================================================================
24458 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/io_32.h 2008-12-01 11:44:55.000000000 +0100
24459 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/io_32.h 2008-12-01 11:49:07.000000000 +0100
24460 @@ -123,6 +123,8 @@ static inline void *phys_to_virt(unsigne
24462 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
24463 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
24464 +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
24465 + unsigned long prot_val);
24468 * The default ioremap() behavior is non-cached:
24469 @@ -135,18 +137,6 @@ static inline void __iomem *ioremap(reso
24470 extern void iounmap(volatile void __iomem *addr);
24473 - * early_ioremap() and early_iounmap() are for temporary early boot-time
24474 - * mappings, before the real ioremap() is functional.
24475 - * A boot-time mapping is currently limited to at most 16 pages.
24477 -extern void early_ioremap_init(void);
24478 -extern void early_ioremap_clear(void);
24479 -extern void early_ioremap_reset(void);
24480 -extern void *early_ioremap(unsigned long offset, unsigned long size);
24481 -extern void early_iounmap(void *addr, unsigned long size);
24482 -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24485 * ISA I/O bus memory addresses are 1:1 with the physical address.
24487 #define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
24488 @@ -162,55 +152,6 @@ extern void __iomem *fix_ioremap(unsigne
24489 #define virt_to_bus(_x) phys_to_machine(__pa(_x))
24490 #define bus_to_virt(_x) __va(machine_to_phys(_x))
24493 - * readX/writeX() are used to access memory mapped devices. On some
24494 - * architectures the memory mapped IO stuff needs to be accessed
24495 - * differently. On the x86 architecture, we just read/write the
24496 - * memory location directly.
24499 -static inline unsigned char readb(const volatile void __iomem *addr)
24501 - return *(volatile unsigned char __force *)addr;
24504 -static inline unsigned short readw(const volatile void __iomem *addr)
24506 - return *(volatile unsigned short __force *)addr;
24509 -static inline unsigned int readl(const volatile void __iomem *addr)
24511 - return *(volatile unsigned int __force *) addr;
24514 -#define readb_relaxed(addr) readb(addr)
24515 -#define readw_relaxed(addr) readw(addr)
24516 -#define readl_relaxed(addr) readl(addr)
24517 -#define __raw_readb readb
24518 -#define __raw_readw readw
24519 -#define __raw_readl readl
24521 -static inline void writeb(unsigned char b, volatile void __iomem *addr)
24523 - *(volatile unsigned char __force *)addr = b;
24526 -static inline void writew(unsigned short b, volatile void __iomem *addr)
24528 - *(volatile unsigned short __force *)addr = b;
24531 -static inline void writel(unsigned int b, volatile void __iomem *addr)
24533 - *(volatile unsigned int __force *)addr = b;
24535 -#define __raw_writeb writeb
24536 -#define __raw_writew writew
24537 -#define __raw_writel writel
24542 memset_io(volatile void __iomem *addr, unsigned char val, int count)
24544 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/io_64.h
24545 ===================================================================
24546 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/io_64.h 2008-12-01 11:44:55.000000000 +0100
24547 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/io_64.h 2008-12-01 11:49:07.000000000 +0100
24548 @@ -190,6 +190,8 @@ extern void early_iounmap(void *addr, un
24550 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
24551 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
24552 +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
24553 + unsigned long prot_val);
24556 * The default ioremap() behavior is non-cached:
24557 @@ -220,77 +222,6 @@ extern void __iomem *fix_ioremap(unsigne
24558 #define virt_to_bus(_x) phys_to_machine(__pa(_x))
24559 #define bus_to_virt(_x) __va(machine_to_phys(_x))
24562 - * readX/writeX() are used to access memory mapped devices. On some
24563 - * architectures the memory mapped IO stuff needs to be accessed
24564 - * differently. On the x86 architecture, we just read/write the
24565 - * memory location directly.
24568 -static inline __u8 __readb(const volatile void __iomem *addr)
24570 - return *(__force volatile __u8 *)addr;
24573 -static inline __u16 __readw(const volatile void __iomem *addr)
24575 - return *(__force volatile __u16 *)addr;
24578 -static __always_inline __u32 __readl(const volatile void __iomem *addr)
24580 - return *(__force volatile __u32 *)addr;
24583 -static inline __u64 __readq(const volatile void __iomem *addr)
24585 - return *(__force volatile __u64 *)addr;
24588 -#define readb(x) __readb(x)
24589 -#define readw(x) __readw(x)
24590 -#define readl(x) __readl(x)
24591 -#define readq(x) __readq(x)
24592 -#define readb_relaxed(a) readb(a)
24593 -#define readw_relaxed(a) readw(a)
24594 -#define readl_relaxed(a) readl(a)
24595 -#define readq_relaxed(a) readq(a)
24596 -#define __raw_readb readb
24597 -#define __raw_readw readw
24598 -#define __raw_readl readl
24599 -#define __raw_readq readq
24603 -static inline void __writel(__u32 b, volatile void __iomem *addr)
24605 - *(__force volatile __u32 *)addr = b;
24608 -static inline void __writeq(__u64 b, volatile void __iomem *addr)
24610 - *(__force volatile __u64 *)addr = b;
24613 -static inline void __writeb(__u8 b, volatile void __iomem *addr)
24615 - *(__force volatile __u8 *)addr = b;
24618 -static inline void __writew(__u16 b, volatile void __iomem *addr)
24620 - *(__force volatile __u16 *)addr = b;
24623 -#define writeq(val, addr) __writeq((val), (addr))
24624 -#define writel(val, addr) __writel((val), (addr))
24625 -#define writew(val, addr) __writew((val), (addr))
24626 -#define writeb(val, addr) __writeb((val), (addr))
24627 -#define __raw_writeb writeb
24628 -#define __raw_writew writew
24629 -#define __raw_writel writel
24630 -#define __raw_writeq writeq
24632 void __memcpy_fromio(void *, unsigned long, unsigned);
24633 void __memcpy_toio(unsigned long, const void *, unsigned);
24635 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/irq_vectors.h
24636 ===================================================================
24637 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
24638 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/irq_vectors.h 2008-12-01 11:49:07.000000000 +0100
24640 +#ifndef _ASM_IRQ_VECTORS_H
24641 +#define _ASM_IRQ_VECTORS_H
24643 +#ifdef CONFIG_X86_32
24644 +# define SYSCALL_VECTOR 0x80
24646 +# define IA32_SYSCALL_VECTOR 0x80
24649 +#define RESCHEDULE_VECTOR 0
24650 +#define CALL_FUNCTION_VECTOR 1
24651 +#define CALL_FUNC_SINGLE_VECTOR 2
24652 +#define SPIN_UNLOCK_VECTOR 3
24656 + * The maximum number of vectors supported by i386 processors
24657 + * is limited to 256. For processors other than i386, NR_VECTORS
24658 + * should be changed accordingly.
24660 +#define NR_VECTORS 256
24662 +#define FIRST_VM86_IRQ 3
24663 +#define LAST_VM86_IRQ 15
24664 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
24667 + * The flat IRQ space is divided into two regions:
24668 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
24669 + * if we have physical device-access privilege. This region is at the
24670 + * start of the IRQ space so that existing device drivers do not need
24671 + * to be modified to translate physical IRQ numbers into our IRQ space.
24672 + * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
24673 + * are bound using the provided bind/unbind functions.
24676 +#define PIRQ_BASE 0
24677 +#if !defined(MAX_IO_APICS)
24678 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
24679 +#elif NR_CPUS < MAX_IO_APICS
24680 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
24682 +# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
24685 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
24686 +#define NR_DYNIRQS 256
24688 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
24689 +#define NR_IRQ_VECTORS NR_IRQS
24691 +#endif /* _ASM_IRQ_VECTORS_H */
24692 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/irqflags.h
24693 ===================================================================
24694 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/irqflags.h 2008-12-01 11:44:55.000000000 +0100
24695 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/irqflags.h 2008-12-01 11:49:07.000000000 +0100
24696 @@ -118,7 +118,7 @@ static inline void halt(void)
24698 #ifndef CONFIG_X86_64
24699 #define INTERRUPT_RETURN iret
24700 -#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
24701 +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
24702 sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
24704 jnz 14f /* process more events if necessary... */ ; \
24705 @@ -175,18 +175,6 @@ static inline void trace_hardirqs_fixup_
24708 #ifdef CONFIG_X86_64
24710 - * Currently paravirt can't handle swapgs nicely when we
24711 - * don't have a stack we can rely on (such as a user space
24712 - * stack). So we either find a way around these or just fault
24713 - * and emulate if a guest tries to call swapgs directly.
24715 - * Either way, this is a good way to document that we don't
24716 - * have a reliable stack. x86_64 only.
24718 -#define SWAPGS_UNSAFE_STACK swapgs
24719 -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
24720 -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
24721 #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
24722 #define ARCH_LOCKDEP_SYS_EXIT_IRQ \
24724 @@ -198,24 +186,6 @@ static inline void trace_hardirqs_fixup_
24728 -#define ARCH_TRACE_IRQS_ON \
24732 - call trace_hardirqs_on; \
24737 -#define ARCH_TRACE_IRQS_OFF \
24741 - call trace_hardirqs_off; \
24746 #define ARCH_LOCKDEP_SYS_EXIT \
24749 @@ -229,8 +199,8 @@ static inline void trace_hardirqs_fixup_
24752 #ifdef CONFIG_TRACE_IRQFLAGS
24753 -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
24754 -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
24755 +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
24756 +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
24758 # define TRACE_IRQS_ON
24759 # define TRACE_IRQS_OFF
24760 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context.h
24761 ===================================================================
24762 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/mmu_context.h 2008-12-01 11:36:55.000000000 +0100
24763 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context.h 2008-12-01 11:49:07.000000000 +0100
24765 +#ifndef __ASM_X86_MMU_CONTEXT_H
24766 +#define __ASM_X86_MMU_CONTEXT_H
24768 +#include <asm/desc.h>
24769 +#include <asm/atomic.h>
24770 +#include <asm/pgalloc.h>
24771 +#include <asm/tlbflush.h>
24773 +void arch_exit_mmap(struct mm_struct *mm);
24774 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24776 +void mm_pin(struct mm_struct *mm);
24777 +void mm_unpin(struct mm_struct *mm);
24778 +void mm_pin_all(void);
24780 +static inline void xen_activate_mm(struct mm_struct *prev,
24781 + struct mm_struct *next)
24783 + if (!PagePinned(virt_to_page(next->pgd)))
24788 + * Used for LDT copy/destruction.
24790 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24791 +void destroy_context(struct mm_struct *mm);
24793 #ifdef CONFIG_X86_32
24794 # include "mmu_context_32.h"
24796 # include "mmu_context_64.h"
24799 +#define activate_mm(prev, next) \
24801 + xen_activate_mm(prev, next); \
24802 + switch_mm((prev), (next), NULL); \
24806 +#endif /* __ASM_X86_MMU_CONTEXT_H */
24807 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_32.h
24808 ===================================================================
24809 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2008-12-01 11:44:55.000000000 +0100
24810 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_32.h 2008-12-01 11:49:07.000000000 +0100
24812 #ifndef __I386_SCHED_H
24813 #define __I386_SCHED_H
24815 -#include <asm/desc.h>
24816 -#include <asm/atomic.h>
24817 -#include <asm/pgalloc.h>
24818 -#include <asm/tlbflush.h>
24820 -void arch_exit_mmap(struct mm_struct *mm);
24821 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24823 -void mm_pin(struct mm_struct *mm);
24824 -void mm_unpin(struct mm_struct *mm);
24825 -void mm_pin_all(void);
24827 -static inline void xen_activate_mm(struct mm_struct *prev,
24828 - struct mm_struct *next)
24830 - if (!PagePinned(virt_to_page(next->pgd)))
24835 - * Used for LDT copy/destruction.
24837 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24838 -void destroy_context(struct mm_struct *mm);
24841 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24843 #if 0 /* XEN: no lazy tlb */
24844 @@ -107,10 +81,4 @@ static inline void switch_mm(struct mm_s
24845 #define deactivate_mm(tsk, mm) \
24846 asm("movl %0,%%gs": :"r" (0));
24848 -#define activate_mm(prev, next) \
24850 - xen_activate_mm(prev, next); \
24851 - switch_mm((prev), (next), NULL); \
24855 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_64.h
24856 ===================================================================
24857 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2008-12-01 11:44:55.000000000 +0100
24858 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_64.h 2008-12-01 11:49:07.000000000 +0100
24860 #ifndef __X86_64_MMU_CONTEXT_H
24861 #define __X86_64_MMU_CONTEXT_H
24863 -#include <asm/desc.h>
24864 -#include <asm/atomic.h>
24865 -#include <asm/pgalloc.h>
24866 -#include <asm/page.h>
24867 -#include <asm/pda.h>
24868 -#include <asm/pgtable.h>
24869 -#include <asm/tlbflush.h>
24871 -void arch_exit_mmap(struct mm_struct *mm);
24872 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24875 - * possibly do the LDT unload here?
24877 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24878 -void destroy_context(struct mm_struct *mm);
24880 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24882 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
24883 @@ -58,10 +41,6 @@ static inline void __prepare_arch_switch
24887 -extern void mm_pin(struct mm_struct *mm);
24888 -extern void mm_unpin(struct mm_struct *mm);
24889 -void mm_pin_all(void);
24891 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
24892 struct task_struct *tsk)
24894 @@ -124,11 +103,4 @@ do { \
24895 asm volatile("movl %0,%%fs"::"r"(0)); \
24898 -static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
24900 - if (!PagePinned(virt_to_page(next->pgd)))
24902 - switch_mm(prev, next, NULL);
24906 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/page.h
24907 ===================================================================
24908 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/page.h 2008-12-01 11:44:55.000000000 +0100
24909 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/page.h 2008-12-01 11:49:07.000000000 +0100
24911 * below. The preprocessor will warn if the two definitions aren't identical.
24913 #define _PAGE_BIT_PRESENT 0
24914 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
24915 -#define _PAGE_BIT_IO 9
24916 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
24917 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
24918 +#define _PAGE_BIT_IO 11
24919 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
24921 #define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
24922 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
24924 (ie, 32-bit PAE). */
24925 #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
24927 -/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
24928 -#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
24929 +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
24930 +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
24932 +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
24933 +#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
24935 #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
24936 #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
24938 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
24939 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
24941 -/* to align the pointer to the (next) page boundary */
24942 -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
24943 +#define HUGE_MAX_HSTATE 2
24945 #ifndef __ASSEMBLY__
24946 #include <linux/types.h>
24949 #ifndef __ASSEMBLY__
24951 +typedef struct { pgdval_t pgd; } pgd_t;
24952 +typedef struct { pgprotval_t pgprot; } pgprot_t;
24954 extern int page_is_ram(unsigned long pagenr);
24955 extern int devmem_is_allowed(unsigned long pagenr);
24956 +extern void map_devmem(unsigned long pfn, unsigned long size,
24957 + pgprot_t vma_prot);
24958 +extern void unmap_devmem(unsigned long pfn, unsigned long size,
24959 + pgprot_t vma_prot);
24961 +extern unsigned long max_low_pfn_mapped;
24962 extern unsigned long max_pfn_mapped;
24965 @@ -84,15 +94,11 @@ static inline void copy_user_page(void *
24966 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
24967 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
24969 -typedef struct { pgprotval_t pgprot; } pgprot_t;
24971 #define pgprot_val(x) ((x).pgprot)
24972 #define __pgprot(x) ((pgprot_t) { (x) } )
24974 #include <asm/maddr.h>
24976 -typedef struct { pgdval_t pgd; } pgd_t;
24978 #define __pgd_ma(x) ((pgd_t) { (x) } )
24979 static inline pgd_t xen_make_pgd(pgdval_t val)
24981 @@ -196,6 +202,11 @@ static inline pteval_t xen_pte_val(pte_t
24985 +static inline pteval_t xen_pte_flags(pte_t pte)
24987 + return __pte_val(pte) & PTE_FLAGS_MASK;
24990 #define pgd_val(x) xen_pgd_val(x)
24991 #define __pgd(x) xen_make_pgd(x)
24993 @@ -210,6 +221,7 @@ static inline pteval_t xen_pte_val(pte_t
24996 #define pte_val(x) xen_pte_val(x)
24997 +#define pte_flags(x) xen_pte_flags(x)
24998 #define __pte(x) xen_make_pte(x)
25000 #define __pa(x) __phys_addr((unsigned long)(x))
25001 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/page_64.h
25002 ===================================================================
25003 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/page_64.h 2008-12-01 11:44:55.000000000 +0100
25004 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/page_64.h 2008-12-01 11:49:07.000000000 +0100
25006 #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
25007 #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
25010 + * Set __PAGE_OFFSET to the most negative possible address +
25011 + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
25012 + * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
25013 + * what Xen requires.
25015 #define __PAGE_OFFSET _AC(0xffff880000000000, UL)
25017 #define __PHYSICAL_START CONFIG_PHYSICAL_START
25019 void clear_page(void *page);
25020 void copy_page(void *to, void *from);
25022 -extern unsigned long end_pfn;
25023 +/* duplicated to the one in bootmem.h */
25024 +extern unsigned long max_pfn;
25026 static inline unsigned long __phys_addr(unsigned long x)
25028 @@ -91,6 +98,11 @@ typedef union { pteval_t pte; unsigned i
25029 extern unsigned long init_memory_mapping(unsigned long start,
25030 unsigned long end);
25032 +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
25034 +extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
25035 +extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
25037 #endif /* !__ASSEMBLY__ */
25039 #ifdef CONFIG_FLATMEM
25040 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pci.h
25041 ===================================================================
25042 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pci.h 2008-12-01 11:44:55.000000000 +0100
25043 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pci.h 2008-12-01 11:49:07.000000000 +0100
25044 @@ -21,6 +21,8 @@ struct pci_sysdata {
25048 +extern int pci_routeirq;
25050 /* scan a bus after allocating a pci_sysdata for it */
25051 extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
25053 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pci_32.h
25054 ===================================================================
25055 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pci_32.h 2008-12-01 11:36:55.000000000 +0100
25056 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pci_32.h 2008-12-01 11:49:07.000000000 +0100
25057 @@ -38,12 +38,14 @@ struct pci_dev;
25058 #define PCI_DMA_BUS_IS_PHYS (1)
25060 /* pci_unmap_{page,single} is a nop so... */
25061 -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
25062 -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
25063 -#define pci_unmap_addr(PTR, ADDR_NAME) (0)
25064 -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
25065 -#define pci_unmap_len(PTR, LEN_NAME) (0)
25066 -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
25067 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
25068 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
25069 +#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
25070 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
25071 + do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
25072 +#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
25073 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
25074 + do { break; } while (pci_unmap_len(PTR, LEN_NAME))
25078 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgalloc.h
25079 ===================================================================
25080 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2008-12-01 11:44:55.000000000 +0100
25081 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgalloc.h 2008-12-01 11:49:07.000000000 +0100
25084 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
25086 +static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
25087 +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
25089 static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
25090 static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
25091 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
25092 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable.h
25093 ===================================================================
25094 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable.h 2008-12-01 11:44:55.000000000 +0100
25095 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable.h 2008-12-01 11:49:07.000000000 +0100
25096 @@ -13,11 +13,12 @@
25097 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
25098 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
25099 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
25100 -#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
25101 +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
25102 +#define _PAGE_BIT_UNUSED2 10
25103 +#define _PAGE_BIT_IO 11 /* Mapped page is I/O or foreign and
25104 * has no associated page struct. */
25105 -#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
25106 -#define _PAGE_BIT_UNUSED3 11
25107 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
25108 +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
25109 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
25111 /* If _PAGE_BIT_PRESENT is clear, we use these: */
25112 @@ -28,34 +29,31 @@
25113 /* if the user mapped it with PROT_NONE; pte_present gives true */
25114 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
25117 - * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
25118 - * sign-extended value on 32-bit with all 1's in the upper word,
25119 - * which preserves the upper pte values on 64-bit ptes:
25121 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25122 -#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
25123 -#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
25124 -#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
25125 -#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
25126 -#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
25127 -#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
25128 -#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
25129 -#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
25130 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25131 -#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
25132 -#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
25133 -#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
25134 -#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
25135 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25136 +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
25137 +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
25138 +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
25139 +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
25140 +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
25141 +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
25142 +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
25143 +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
25144 +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
25145 +#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2)
25146 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25147 +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
25148 +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
25149 +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
25150 +#define __HAVE_ARCH_PTE_SPECIAL
25152 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
25153 -#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
25154 +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
25156 -#define _PAGE_NX 0
25157 +#define _PAGE_NX (_AT(pteval_t, 0))
25160 -#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
25161 -#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
25162 +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
25163 +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
25165 #ifndef __ASSEMBLY__
25166 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
25167 @@ -71,8 +69,8 @@ extern unsigned int __kernel_page_user;
25168 _PAGE_DIRTY | __kernel_page_user)
25170 /* Set of bits not changed in pte_modify */
25171 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25172 - _PAGE_ACCESSED | _PAGE_DIRTY)
25173 +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25174 + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
25177 * PAT settings are part of the hypervisor interface, which sets the
25178 @@ -102,19 +100,9 @@ extern unsigned int __kernel_page_user;
25179 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
25182 -#ifdef CONFIG_X86_32
25183 -#define _PAGE_KERNEL_EXEC \
25184 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
25185 -#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
25187 -#ifndef __ASSEMBLY__
25188 -extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
25189 -#endif /* __ASSEMBLY__ */
25191 #define __PAGE_KERNEL_EXEC \
25192 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
25193 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
25196 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
25197 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
25198 @@ -125,25 +113,22 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
25199 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
25200 #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
25201 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
25202 +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
25203 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
25206 - * We don't support GLOBAL page in xenolinux64
25208 -#define MAKE_GLOBAL(x) __pgprot((x))
25210 -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
25211 -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
25212 -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
25213 -#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
25214 -#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
25215 -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
25216 -#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
25217 -#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
25218 -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
25219 -#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
25220 -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
25221 -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25222 +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
25223 +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
25224 +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
25225 +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
25226 +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
25227 +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
25228 +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
25229 +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
25230 +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
25231 +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
25232 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
25233 +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
25234 +#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25237 #define __P000 PAGE_NONE
25238 @@ -182,27 +167,27 @@ extern struct list_head pgd_list;
25240 static inline int pte_dirty(pte_t pte)
25242 - return __pte_val(pte) & _PAGE_DIRTY;
25243 + return pte_flags(pte) & _PAGE_DIRTY;
25246 static inline int pte_young(pte_t pte)
25248 - return __pte_val(pte) & _PAGE_ACCESSED;
25249 + return pte_flags(pte) & _PAGE_ACCESSED;
25252 static inline int pte_write(pte_t pte)
25254 - return __pte_val(pte) & _PAGE_RW;
25255 + return pte_flags(pte) & _PAGE_RW;
25258 static inline int pte_file(pte_t pte)
25260 - return __pte_val(pte) & _PAGE_FILE;
25261 + return pte_flags(pte) & _PAGE_FILE;
25264 static inline int pte_huge(pte_t pte)
25266 - return __pte_val(pte) & _PAGE_PSE;
25267 + return pte_flags(pte) & _PAGE_PSE;
25270 static inline int pte_global(pte_t pte)
25271 @@ -212,12 +197,12 @@ static inline int pte_global(pte_t pte)
25273 static inline int pte_exec(pte_t pte)
25275 - return !(__pte_val(pte) & _PAGE_NX);
25276 + return !(pte_flags(pte) & _PAGE_NX);
25279 static inline int pte_special(pte_t pte)
25282 + return pte_flags(pte) & _PAGE_SPECIAL;
25285 static inline int pmd_large(pmd_t pte)
25286 @@ -228,22 +213,22 @@ static inline int pmd_large(pmd_t pte)
25288 static inline pte_t pte_mkclean(pte_t pte)
25290 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
25291 + return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
25294 static inline pte_t pte_mkold(pte_t pte)
25296 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
25297 + return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
25300 static inline pte_t pte_wrprotect(pte_t pte)
25302 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
25303 + return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
25306 static inline pte_t pte_mkexec(pte_t pte)
25308 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
25309 + return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
25312 static inline pte_t pte_mkdirty(pte_t pte)
25313 @@ -268,7 +253,7 @@ static inline pte_t pte_mkhuge(pte_t pte
25315 static inline pte_t pte_clrhuge(pte_t pte)
25317 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
25318 + return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
25321 static inline pte_t pte_mkglobal(pte_t pte)
25322 @@ -283,35 +268,46 @@ static inline pte_t pte_clrglobal(pte_t
25324 static inline pte_t pte_mkspecial(pte_t pte)
25327 + return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
25330 extern pteval_t __supported_pte_mask;
25332 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
25334 - return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
25335 - pgprot_val(pgprot)) & __supported_pte_mask);
25336 + pgprotval_t prot = pgprot_val(pgprot);
25338 + if (prot & _PAGE_PRESENT)
25339 + prot &= __supported_pte_mask;
25340 + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25343 static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
25345 - return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
25346 - pgprot_val(pgprot)) & __supported_pte_mask);
25347 + pgprotval_t prot = pgprot_val(pgprot);
25349 + if (prot & _PAGE_PRESENT)
25350 + prot &= __supported_pte_mask;
25351 + return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25354 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
25356 - return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
25357 - pgprot_val(pgprot)) & __supported_pte_mask);
25358 + pgprotval_t prot = pgprot_val(pgprot);
25360 + if (prot & _PAGE_PRESENT)
25361 + prot &= __supported_pte_mask;
25362 + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25365 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
25367 - pteval_t val = pte_val(pte);
25368 + pgprotval_t prot = pgprot_val(newprot);
25369 + pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
25371 - val &= _PAGE_CHG_MASK;
25372 - val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
25373 + if (prot & _PAGE_PRESENT)
25374 + prot &= __supported_pte_mask;
25375 + val |= prot & ~_PAGE_CHG_MASK;
25379 @@ -325,9 +321,11 @@ static inline pgprot_t pgprot_modify(pgp
25380 return __pgprot(preservebits | addbits);
25383 -#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
25384 +#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
25386 -#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
25387 +#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \
25388 + ? pgprot_val(p) & __supported_pte_mask \
25391 #ifndef __ASSEMBLY__
25392 #define __HAVE_PHYS_MEM_ACCESS_PROT
25393 @@ -338,6 +336,17 @@ int phys_mem_access_prot_allowed(struct
25394 unsigned long size, pgprot_t *vma_prot);
25397 +/* Install a pte for a particular vaddr in kernel space. */
25398 +void set_pte_vaddr(unsigned long vaddr, pte_t pte);
25400 +#ifndef CONFIG_XEN
25401 +extern void native_pagetable_setup_start(pgd_t *base);
25402 +extern void native_pagetable_setup_done(pgd_t *base);
25404 +static inline void xen_pagetable_setup_start(pgd_t *base) {}
25405 +static inline void xen_pagetable_setup_done(pgd_t *base) {}
25408 #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
25409 #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
25411 @@ -373,6 +382,26 @@ int phys_mem_access_prot_allowed(struct
25412 # include "pgtable_64.h"
25416 + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25418 + * this macro returns the index of the entry in the pgd page which would
25419 + * control the given virtual address
25421 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25424 + * pgd_offset() returns a (pgd_t *)
25425 + * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25427 +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25429 + * a shortcut which implies the use of the kernel's pgd, instead
25432 +#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25435 #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
25436 #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
25438 @@ -383,8 +412,15 @@ enum {
25445 +#ifdef CONFIG_PROC_FS
25446 +extern void update_page_count(int level, unsigned long pages);
25448 +static inline void update_page_count(int level, unsigned long pages) { }
25452 * Helper function that returns the kernel pagetable entry controlling
25453 * the virtual address 'address'. NULL means no pagetable entry present.
25454 @@ -441,6 +477,8 @@ static inline void xen_pte_clear(struct
25455 * race with other CPU's that might be updating the dirty
25456 * bit at the same time.
25458 +struct vm_area_struct;
25460 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
25461 extern int ptep_set_access_flags(struct vm_area_struct *vma,
25462 unsigned long address, pte_t *ptep,
25463 @@ -523,9 +561,6 @@ static inline void clone_pgd_range(pgd_t
25464 memcpy(dst, src, count * sizeof(pgd_t));
25467 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
25468 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
25470 #define arbitrary_virt_to_machine(va) \
25472 unsigned int __lvl; \
25473 @@ -535,6 +570,44 @@ static inline void clone_pgd_range(pgd_t
25474 | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
25477 +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
25478 +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
25481 +#if CONFIG_XEN_COMPAT < 0x030300
25482 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
25483 + return ptep_get_and_clear(mm, addr, ptep);
25488 +#ifdef CONFIG_HIGHPTE
25489 +extern void *high_memory;
25492 +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
25493 + pte_t *ptep, pte_t pte)
25497 +#if CONFIG_XEN_COMPAT < 0x030300
25498 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
25499 + set_pte_at(mm, addr, ptep, pte);
25503 +#ifdef CONFIG_HIGHPTE
25504 + if ((void *)ptep > high_memory)
25505 + u.ptr = arbitrary_virt_to_machine(ptep)
25506 + | MMU_PT_UPDATE_PRESERVE_AD;
25509 + u.ptr = virt_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
25510 + u.val = __pte_val(pte);
25511 + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
25515 #include <asm-generic/pgtable.h>
25517 #include <xen/features.h>
25518 @@ -563,10 +636,6 @@ int touch_pte_range(struct mm_struct *mm
25519 unsigned long address,
25520 unsigned long size);
25522 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
25523 - unsigned long addr, unsigned long end, pgprot_t newprot,
25524 - int dirty_accountable);
25526 #endif /* __ASSEMBLY__ */
25528 #endif /* _ASM_X86_PGTABLE_H */
25529 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable-3level.h
25530 ===================================================================
25531 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-12-01 11:44:55.000000000 +0100
25532 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-12-01 11:49:07.000000000 +0100
25533 @@ -14,11 +14,11 @@
25534 #define pmd_ERROR(e) \
25535 printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
25536 __FILE__, __LINE__, &(e), __pmd_val(e), \
25537 - (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25538 + (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25539 #define pgd_ERROR(e) \
25540 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
25541 __FILE__, __LINE__, &(e), __pgd_val(e), \
25542 - (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25543 + (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25545 static inline int pud_none(pud_t pud)
25547 @@ -27,7 +27,7 @@ static inline int pud_none(pud_t pud)
25549 static inline int pud_bad(pud_t pud)
25551 - return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25552 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25555 static inline int pud_present(pud_t pud)
25556 @@ -102,9 +102,9 @@ static inline void pud_clear(pud_t *pudp
25560 -#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
25561 +#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
25563 -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
25564 +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
25567 /* Find an entry in the second-level page table.. */
25568 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_32.h
25569 ===================================================================
25570 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-12-01 11:44:55.000000000 +0100
25571 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-12-01 11:49:07.000000000 +0100
25572 @@ -89,10 +89,10 @@ extern unsigned long pg0[];
25573 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
25574 can temporarily clear it. */
25575 #define pmd_present(x) (__pmd_val(x))
25576 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25577 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25579 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
25580 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25581 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25585 @@ -119,26 +119,6 @@ extern unsigned long pg0[];
25587 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
25590 - * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25592 - * this macro returns the index of the entry in the pgd page which would
25593 - * control the given virtual address
25595 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25596 -#define pgd_index_k(addr) pgd_index((addr))
25599 - * pgd_offset() returns a (pgd_t *)
25600 - * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25602 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25605 - * a shortcut which implies the use of the kernel's pgd, instead
25608 -#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25610 static inline int pud_large(pud_t pud) { return 0; }
25612 @@ -165,7 +145,7 @@ static inline int pud_large(pud_t pud) {
25613 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25615 #define pmd_page_vaddr(pmd) \
25616 - ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
25617 + ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
25619 #if defined(CONFIG_HIGHPTE)
25620 #define pte_offset_map(dir, address) \
25621 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_64.h
25622 ===================================================================
25623 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-12-01 11:44:55.000000000 +0100
25624 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-12-01 11:49:07.000000000 +0100
25625 @@ -23,6 +23,8 @@ extern void xen_init_pt(void);
25626 extern pud_t level3_kernel_pgt[512];
25627 extern pud_t level3_ident_pgt[512];
25628 extern pmd_t level2_kernel_pgt[512];
25629 +extern pmd_t level2_fixmap_pgt[512];
25630 +extern pmd_t level2_ident_pgt[512];
25631 extern pgd_t init_level4_pgt[];
25633 #define swapper_pg_dir init_level4_pgt
25634 @@ -79,6 +81,9 @@ extern void paging_init(void);
25638 +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
25641 #define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
25643 static inline void xen_set_pte(pte_t *ptep, pte_t pte)
25644 @@ -150,24 +155,24 @@ static inline void xen_pgd_clear(pgd_t *
25645 #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
25646 #define VMEMMAP_START _AC(0xffffe20000000000, UL)
25647 #define MODULES_VADDR _AC(0xffffffffa0000000, UL)
25648 -#define MODULES_END _AC(0xfffffffffff00000, UL)
25649 +#define MODULES_END _AC(0xffffffffff000000, UL)
25650 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
25652 #ifndef __ASSEMBLY__
25654 static inline int pgd_bad(pgd_t pgd)
25656 - return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25657 + return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25660 static inline int pud_bad(pud_t pud)
25662 - return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25663 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25666 static inline int pmd_bad(pmd_t pmd)
25668 - return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25669 + return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25672 #define pte_none(x) (!(x).pte)
25673 @@ -175,7 +180,7 @@ static inline int pmd_bad(pmd_t pmd)
25675 #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
25677 -#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
25678 +#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
25679 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
25680 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
25681 #define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
25682 @@ -200,11 +205,8 @@ static inline int pmd_bad(pmd_t pmd)
25685 #define pgd_page_vaddr(pgd) \
25686 - ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
25687 + ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
25688 #define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
25689 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25690 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25691 -#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
25692 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
25693 static inline int pgd_large(pgd_t pgd) { return 0; }
25694 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
25695 @@ -226,7 +228,7 @@ static inline int pud_large(pud_t pte)
25698 /* PMD - Level 2 access */
25699 -#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
25700 +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
25701 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25703 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
25704 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/processor.h
25705 ===================================================================
25706 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/processor.h 2008-12-01 11:44:55.000000000 +0100
25707 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/processor.h 2008-12-01 11:49:07.000000000 +0100
25708 @@ -134,7 +134,7 @@ extern __u32 cleared_cpu_caps[NCAPINTS
25710 DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25711 #define cpu_data(cpu) per_cpu(cpu_info, cpu)
25712 -#define current_cpu_data cpu_data(smp_processor_id())
25713 +#define current_cpu_data __get_cpu_var(cpu_info)
25715 #define cpu_data(cpu) boot_cpu_data
25716 #define current_cpu_data boot_cpu_data
25717 @@ -153,7 +153,7 @@ static inline int hlt_works(int cpu)
25719 extern void cpu_detect(struct cpuinfo_x86 *c);
25721 -extern void identify_cpu(struct cpuinfo_x86 *);
25722 +extern void early_cpu_init(void);
25723 extern void identify_boot_cpu(void);
25724 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25725 extern void print_cpu_info(struct cpuinfo_x86 *);
25726 @@ -267,15 +267,11 @@ struct tss_struct {
25727 struct thread_struct *io_bitmap_owner;
25730 - * Pad the TSS to be cacheline-aligned (size is 0x100):
25732 - unsigned long __cacheline_filler[35];
25734 * .. and then another 0x100 bytes for the emergency kernel stack:
25736 unsigned long stack[64];
25738 -} __attribute__((packed));
25739 +} ____cacheline_aligned;
25741 DECLARE_PER_CPU(struct tss_struct, init_tss);
25743 @@ -668,11 +664,36 @@ static inline void __sti_mwait(unsigned
25745 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25747 -extern int force_mwait;
25749 extern void select_idle_routine(const struct cpuinfo_x86 *c);
25751 extern unsigned long boot_option_idle_override;
25752 +extern unsigned long idle_halt;
25753 +extern unsigned long idle_nomwait;
25755 +#ifndef CONFIG_XEN
25757 + * on systems with caches, caches must be flashed as the absolute
25758 + * last instruction before going into a suspended halt. Otherwise,
25759 + * dirty data can linger in the cache and become stale on resume,
25760 + * leading to strange errors.
25762 + * perform a variety of operations to guarantee that the compiler
25763 + * will not reorder instructions. wbinvd itself is serializing
25764 + * so the processor will not reorder.
25766 + * Systems without cache can just go into halt.
25768 +static inline void wbinvd_halt(void)
25771 + /* check for clflush to determine if wbinvd is legal */
25772 + if (cpu_has_clflush)
25773 + asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
25780 extern void enable_sep_cpu(void);
25781 extern int sysenter_setup(void);
25782 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/segment.h
25783 ===================================================================
25784 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/segment.h 2008-12-01 11:44:55.000000000 +0100
25785 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/segment.h 2008-12-01 11:49:07.000000000 +0100
25787 #ifndef _ASM_X86_SEGMENT_H_
25788 #define _ASM_X86_SEGMENT_H_
25790 +/* Constructor for a conventional segment GDT (or LDT) entry */
25791 +/* This is a macro so it can be used in initializers */
25792 +#define GDT_ENTRY(flags, base, limit) \
25793 + ((((base) & 0xff000000ULL) << (56-24)) | \
25794 + (((flags) & 0x0000f0ffULL) << 40) | \
25795 + (((limit) & 0x000f0000ULL) << (48-16)) | \
25796 + (((base) & 0x00ffffffULL) << 16) | \
25797 + (((limit) & 0x0000ffffULL)))
25799 /* Simple and small GDT entries for booting only */
25801 #define GDT_ENTRY_BOOT_CS 2
25802 @@ -61,18 +70,14 @@
25803 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
25805 #define GDT_ENTRY_DEFAULT_USER_CS 14
25806 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
25808 #define GDT_ENTRY_DEFAULT_USER_DS 15
25809 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
25811 #define GDT_ENTRY_KERNEL_BASE 12
25813 #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
25814 -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25816 #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
25817 -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25819 #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
25820 #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
25821 @@ -143,10 +148,11 @@
25823 #include <asm/cache.h>
25825 -#define __KERNEL_CS 0x10
25826 -#define __KERNEL_DS 0x18
25827 +#define GDT_ENTRY_KERNEL32_CS 1
25828 +#define GDT_ENTRY_KERNEL_CS 2
25829 +#define GDT_ENTRY_KERNEL_DS 3
25831 -#define __KERNEL32_CS 0x08
25832 +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
25835 * we cannot use the same code segment descriptor for user and kernel
25836 @@ -154,10 +160,10 @@
25837 * The segment offset needs to contain a RPL. Grr. -AK
25838 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
25841 -#define __USER32_CS 0x23 /* 4*8+3 */
25842 -#define __USER_DS 0x2b /* 5*8+3 */
25843 -#define __USER_CS 0x33 /* 6*8+3 */
25844 +#define GDT_ENTRY_DEFAULT_USER32_CS 4
25845 +#define GDT_ENTRY_DEFAULT_USER_DS 5
25846 +#define GDT_ENTRY_DEFAULT_USER_CS 6
25847 +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
25848 #define __USER32_DS __USER_DS
25850 #define GDT_ENTRY_TSS 8 /* needs two entries */
25851 @@ -179,6 +185,11 @@
25855 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25856 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25857 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
25858 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
25860 /* User mode is privilege level 3 */
25861 #define USER_RPL 0x3
25862 /* LDT segment has TI set, GDT has it cleared */
25863 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/smp.h
25864 ===================================================================
25865 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/smp.h 2008-12-01 11:44:55.000000000 +0100
25866 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/smp.h 2008-12-01 11:49:07.000000000 +0100
25867 @@ -25,23 +25,16 @@ extern cpumask_t cpu_initialized;
25868 extern void (*mtrr_hook)(void);
25869 extern void zap_low_mappings(void);
25871 +extern int __cpuinit get_local_pda(int cpu);
25873 extern int smp_num_siblings;
25874 extern unsigned int num_processors;
25875 extern cpumask_t cpu_initialized;
25877 -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
25878 -extern u16 x86_cpu_to_apicid_init[];
25879 -extern u16 x86_bios_cpu_apicid_init[];
25880 -extern void *x86_cpu_to_apicid_early_ptr;
25881 -extern void *x86_bios_cpu_apicid_early_ptr;
25883 -#define x86_cpu_to_apicid_early_ptr NULL
25884 -#define x86_bios_cpu_apicid_early_ptr NULL
25887 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
25888 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
25889 DECLARE_PER_CPU(u16, cpu_llc_id);
25891 DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
25892 DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
25894 @@ -63,9 +56,9 @@ struct smp_ops {
25896 void (*smp_send_stop)(void);
25897 void (*smp_send_reschedule)(int cpu);
25898 - int (*smp_call_function_mask)(cpumask_t mask,
25899 - void (*func)(void *info), void *info,
25902 + void (*send_call_func_ipi)(cpumask_t mask);
25903 + void (*send_call_func_single_ipi)(int cpu);
25906 /* Globals due to paravirt */
25907 @@ -106,11 +99,14 @@ static inline void smp_send_reschedule(i
25908 smp_ops.smp_send_reschedule(cpu);
25911 -static inline int smp_call_function_mask(cpumask_t mask,
25912 - void (*func) (void *info), void *info,
25914 +static inline void arch_send_call_function_single_ipi(int cpu)
25916 - return smp_ops.smp_call_function_mask(mask, func, info, wait);
25917 + smp_ops.send_call_func_single_ipi(cpu);
25920 +static inline void arch_send_call_function_ipi(cpumask_t mask)
25922 + smp_ops.send_call_func_ipi(mask);
25925 void native_smp_prepare_boot_cpu(void);
25926 @@ -122,23 +118,19 @@ int native_cpu_up(unsigned int cpunum);
25928 void xen_smp_send_stop(void);
25929 void xen_smp_send_reschedule(int cpu);
25930 -int xen_smp_call_function_mask(cpumask_t mask,
25931 - void (*func) (void *info), void *info,
25933 +void xen_send_call_func_ipi(cpumask_t mask);
25934 +void xen_send_call_func_single_ipi(int cpu);
25936 #define smp_send_stop xen_smp_send_stop
25937 #define smp_send_reschedule xen_smp_send_reschedule
25938 -#define smp_call_function_mask xen_smp_call_function_mask
25940 -extern void prefill_possible_map(void);
25941 +#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
25942 +#define arch_send_call_function_ipi xen_send_call_func_ipi
25944 #endif /* CONFIG_XEN */
25946 extern int __cpu_disable(void);
25947 extern void __cpu_die(unsigned int cpu);
25949 -extern void prefill_possible_map(void);
25951 void smp_store_cpu_info(int id);
25952 #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
25954 @@ -149,6 +141,14 @@ static inline int num_booting_cpus(void)
25956 #endif /* CONFIG_SMP */
25958 +#if defined(CONFIG_SMP) && (defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_XEN))
25959 +extern void prefill_possible_map(void);
25961 +static inline void prefill_possible_map(void)
25966 extern unsigned disabled_cpus __cpuinitdata;
25968 #ifdef CONFIG_X86_32_SMP
25969 @@ -216,12 +216,8 @@ static inline int hard_smp_processor_id(
25970 #endif /* CONFIG_X86_LOCAL_APIC */
25972 #ifdef CONFIG_HOTPLUG_CPU
25973 -extern void cpu_exit_clear(void);
25974 extern void cpu_uninit(void);
25977 -extern void smp_alloc_memory(void);
25978 -extern void lock_ipi_call_lock(void);
25979 -extern void unlock_ipi_call_lock(void);
25980 #endif /* __ASSEMBLY__ */
25982 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/spinlock.h
25983 ===================================================================
25984 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/spinlock.h 2008-12-01 11:44:55.000000000 +0100
25985 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/spinlock.h 2008-12-01 11:49:07.000000000 +0100
25986 @@ -65,14 +65,14 @@ extern void xen_spin_kick(raw_spinlock_t
25988 #if (NR_CPUS < 256)
25989 #define TICKET_SHIFT 8
25990 -#define __raw_spin_lock_preamble \
25991 +#define __ticket_spin_lock_preamble \
25992 asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
25993 "cmpb %h0, %b0\n\t" \
25995 : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
25998 -#define __raw_spin_lock_body \
25999 +#define __ticket_spin_lock_body \
26001 "cmpb %h0, %b0\n\t" \
26003 @@ -88,7 +88,7 @@ extern void xen_spin_kick(raw_spinlock_t
26007 -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26008 +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26012 @@ -107,7 +107,7 @@ static __always_inline int __raw_spin_tr
26016 -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26017 +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26019 unsigned int token;
26020 unsigned char kick;
26021 @@ -124,7 +124,7 @@ static __always_inline void __raw_spin_u
26024 #define TICKET_SHIFT 16
26025 -#define __raw_spin_lock_preamble \
26026 +#define __ticket_spin_lock_preamble \
26028 unsigned int tmp; \
26029 asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
26030 @@ -136,7 +136,7 @@ static __always_inline void __raw_spin_u
26031 : "0" (0x00010000) \
26032 : "memory", "cc"); \
26034 -#define __raw_spin_lock_body \
26035 +#define __ticket_spin_lock_body \
26037 unsigned int tmp; \
26038 asm("shldl $16, %0, %2\n" \
26039 @@ -155,7 +155,7 @@ static __always_inline void __raw_spin_u
26040 : "memory", "cc"); \
26043 -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26044 +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26048 @@ -177,7 +177,7 @@ static __always_inline int __raw_spin_tr
26052 -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26053 +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26055 unsigned int token, tmp;
26057 @@ -195,49 +195,145 @@ static __always_inline void __raw_spin_u
26061 -static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26062 +static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
26064 int tmp = ACCESS_ONCE(lock->slock);
26066 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
26069 -static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26070 +static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
26072 int tmp = ACCESS_ONCE(lock->slock);
26074 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
26077 -static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26078 +static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
26080 unsigned int token, count;
26083 - __raw_spin_lock_preamble;
26084 + __ticket_spin_lock_preamble;
26085 if (unlikely(!free))
26086 token = xen_spin_adjust(lock, token);
26089 - __raw_spin_lock_body;
26090 + __ticket_spin_lock_body;
26091 } while (unlikely(!count) && !xen_spin_wait(lock, token));
26094 -static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26095 - unsigned long flags)
26096 +static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
26097 + unsigned long flags)
26099 unsigned int token, count;
26102 - __raw_spin_lock_preamble;
26103 + __ticket_spin_lock_preamble;
26104 if (unlikely(!free))
26105 token = xen_spin_adjust(lock, token);
26108 - __raw_spin_lock_body;
26109 + __ticket_spin_lock_body;
26110 } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
26113 +#ifdef CONFIG_PARAVIRT
26115 + * Define virtualization-friendly old-style lock byte lock, for use in
26116 + * pv_lock_ops if desired.
26118 + * This differs from the pre-2.6.24 spinlock by always using xchgb
26119 + * rather than decb to take the lock; this allows it to use a
26120 + * zero-initialized lock structure. It also maintains a 1-byte
26121 + * contention counter, so that we can implement
26122 + * __byte_spin_is_contended.
26124 +struct __byte_spinlock {
26129 +static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
26131 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26132 + return bl->lock != 0;
26135 +static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
26137 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26138 + return bl->spinners != 0;
26141 +static inline void __byte_spin_lock(raw_spinlock_t *lock)
26143 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26146 + asm("1: xchgb %1, %0\n"
26149 + " " LOCK_PREFIX "incb %2\n"
26153 + " " LOCK_PREFIX "decb %2\n"
26156 + : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
26159 +static inline int __byte_spin_trylock(raw_spinlock_t *lock)
26161 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26164 + asm("xchgb %1,%0"
26165 + : "+m" (bl->lock), "+q" (old) : : "memory");
26170 +static inline void __byte_spin_unlock(raw_spinlock_t *lock)
26172 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26176 +#else /* !CONFIG_PARAVIRT */
26177 +static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26179 + return __ticket_spin_is_locked(lock);
26182 +static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26184 + return __ticket_spin_is_contended(lock);
26187 +static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26189 + __ticket_spin_lock(lock);
26192 +static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26193 + unsigned long flags)
26195 + __ticket_spin_lock_flags(lock, flags);
26198 +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26200 + return __ticket_spin_trylock(lock);
26203 +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26205 + __ticket_spin_unlock(lock);
26207 +#endif /* CONFIG_PARAVIRT */
26209 static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
26211 while (__raw_spin_is_locked(lock))
26212 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/system.h
26213 ===================================================================
26214 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/system.h 2008-12-01 11:44:55.000000000 +0100
26215 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/system.h 2008-12-01 11:49:07.000000000 +0100
26216 @@ -137,7 +137,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
26217 #define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
26218 #define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
26220 -extern void load_gs_index(unsigned);
26221 +extern void xen_load_gs_index(unsigned);
26224 * Load a segment. Fall back on loading the zero
26225 @@ -154,14 +154,14 @@ extern void load_gs_index(unsigned);
26228 _ASM_EXTABLE(1b,3b) \
26229 - : :"r" (value), "r" (0))
26230 + : :"r" (value), "r" (0) : "memory")
26234 * Save a segment register away
26236 #define savesegment(seg, value) \
26237 - asm volatile("mov %%" #seg ",%0":"=rm" (value))
26238 + asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
26240 static inline unsigned long get_limit(unsigned long segment)
26242 @@ -269,6 +269,7 @@ static inline void xen_wbinvd(void)
26243 #ifdef CONFIG_X86_64
26244 #define read_cr8() (xen_read_cr8())
26245 #define write_cr8(x) (xen_write_cr8(x))
26246 +#define load_gs_index xen_load_gs_index
26249 /* Clear the 'TS' bit */
26250 @@ -287,13 +288,12 @@ static inline void clflush(volatile void
26251 void disable_hlt(void);
26252 void enable_hlt(void);
26254 -extern int es7000_plat;
26255 void cpu_idle_wait(void);
26257 extern unsigned long arch_align_stack(unsigned long sp);
26258 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
26260 -void default_idle(void);
26261 +void xen_idle(void);
26264 * Force strict CPU ordering.
26265 Index: head-2008-12-01/include/asm-x86/mach-xen/asm/xor_64.h
26266 ===================================================================
26267 --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/xor_64.h 2008-12-01 11:44:55.000000000 +0100
26268 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/xor_64.h 2008-12-01 11:49:07.000000000 +0100
26270 +#ifndef ASM_X86__XOR_64_H
26271 +#define ASM_X86__XOR_64_H
26274 * x86-64 changes / gcc fixes from Andi Kleen.
26275 * Copyright 2002 Andi Kleen, SuSE Labs.
26276 @@ -330,3 +333,5 @@ do { \
26277 We may also be able to load into the L1 only depending on how the cpu
26278 deals with a load to a line that is being prefetched. */
26279 #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
26281 +#endif /* ASM_X86__XOR_64_H */
26282 Index: head-2008-12-01/include/asm-x86/mach-xen/irq_vectors.h
26283 ===================================================================
26284 --- head-2008-12-01.orig/include/asm-x86/mach-xen/irq_vectors.h 2008-12-01 11:37:10.000000000 +0100
26285 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26288 - * This file should contain #defines for all of the interrupt vector
26289 - * numbers used by this architecture.
26291 - * In addition, there are some standard defines:
26293 - * FIRST_EXTERNAL_VECTOR:
26294 - * The first free place for external interrupts
26296 - * SYSCALL_VECTOR:
26297 - * The IRQ vector a syscall makes the user to kernel transition
26301 - * The IRQ number the timer interrupt comes in at.
26304 - * The total number of interrupt vectors (including all the
26305 - * architecture specific interrupts) needed.
26308 -#ifndef _ASM_IRQ_VECTORS_H
26309 -#define _ASM_IRQ_VECTORS_H
26312 - * IDT vectors usable for external interrupt sources start
26315 -#define FIRST_EXTERNAL_VECTOR 0x20
26317 -#define SYSCALL_VECTOR 0x80
26320 - * Vectors 0x20-0x2f are used for ISA interrupts.
26325 - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
26327 - * some of the following vectors are 'rare', they are merged
26328 - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
26329 - * TLB, reschedule and local APIC vectors are performance-critical.
26331 - * Vectors 0xf0-0xfa are free (reserved for future Linux use).
26333 -#define SPURIOUS_APIC_VECTOR 0xff
26334 -#define ERROR_APIC_VECTOR 0xfe
26335 -#define INVALIDATE_TLB_VECTOR 0xfd
26336 -#define RESCHEDULE_VECTOR 0xfc
26337 -#define CALL_FUNCTION_VECTOR 0xfb
26339 -#define THERMAL_APIC_VECTOR 0xf0
26341 - * Local APIC timer IRQ vector is on a different priority level,
26342 - * to work around the 'lost local interrupt if more than 2 IRQ
26343 - * sources per level' errata.
26345 -#define LOCAL_TIMER_VECTOR 0xef
26348 -#define SPURIOUS_APIC_VECTOR 0xff
26349 -#define ERROR_APIC_VECTOR 0xfe
26352 - * First APIC vector available to drivers: (vectors 0x30-0xee)
26353 - * we start at 0x31 to spread out vectors evenly between priority
26354 - * levels. (0x80 is the syscall vector)
26356 -#define FIRST_DEVICE_VECTOR 0x31
26357 -#define FIRST_SYSTEM_VECTOR 0xef
26360 - * 16 8259A IRQ's, 208 potential APIC interrupt sources.
26361 - * Right now the APIC is mostly only used for SMP.
26362 - * 256 vectors is an architectural limit. (we can have
26363 - * more than 256 devices theoretically, but they will
26364 - * have to use shared interrupts)
26365 - * Since vectors 0x00-0x1f are used/reserved for the CPU,
26366 - * the usable vector space is 0x20-0xff (224 vectors)
26369 -#define RESCHEDULE_VECTOR 0
26370 -#define CALL_FUNCTION_VECTOR 1
26371 -#define SPIN_UNLOCK_VECTOR 2
26375 - * The maximum number of vectors supported by i386 processors
26376 - * is limited to 256. For processors other than i386, NR_VECTORS
26377 - * should be changed accordingly.
26379 -#define NR_VECTORS 256
26381 -#define FPU_IRQ 13
26383 -#define FIRST_VM86_IRQ 3
26384 -#define LAST_VM86_IRQ 15
26385 -#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
26388 - * The flat IRQ space is divided into two regions:
26389 - * 1. A one-to-one mapping of real physical IRQs. This space is only used
26390 - * if we have physical device-access privilege. This region is at the
26391 - * start of the IRQ space so that existing device drivers do not need
26392 - * to be modified to translate physical IRQ numbers into our IRQ space.
26393 - * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
26394 - * are bound using the provided bind/unbind functions.
26397 -#define PIRQ_BASE 0
26398 -#if !defined(MAX_IO_APICS)
26399 -# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26400 -#elif NR_CPUS < MAX_IO_APICS
26401 -# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26403 -# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
26406 -#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
26407 -#define NR_DYNIRQS 256
26409 -#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
26410 -#define NR_IRQ_VECTORS NR_IRQS
26412 -#endif /* _ASM_IRQ_VECTORS_H */
26413 Index: head-2008-12-01/include/asm-x86/mach-xen/setup_arch_post.h
26414 ===================================================================
26415 --- head-2008-12-01.orig/include/asm-x86/mach-xen/setup_arch_post.h 2008-12-03 15:48:43.000000000 +0100
26416 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26419 - * machine_specific_* - Hooks for machine specific setup.
26422 - * This is included late in kernel/setup.c so that it can make
26423 - * use of all of the static functions.
26426 -#include <xen/interface/callback.h>
26428 -extern void hypervisor_callback(void);
26429 -extern void failsafe_callback(void);
26430 -extern void nmi(void);
26432 -static void __init machine_specific_arch_setup(void)
26435 - static struct callback_register __initdata event = {
26436 - .type = CALLBACKTYPE_event,
26437 - .address = (unsigned long) hypervisor_callback,
26439 - static struct callback_register __initdata failsafe = {
26440 - .type = CALLBACKTYPE_failsafe,
26441 - .address = (unsigned long)failsafe_callback,
26443 - static struct callback_register __initdata syscall = {
26444 - .type = CALLBACKTYPE_syscall,
26445 - .address = (unsigned long)system_call,
26447 -#ifdef CONFIG_X86_LOCAL_APIC
26448 - static struct callback_register __initdata nmi_cb = {
26449 - .type = CALLBACKTYPE_nmi,
26450 - .address = (unsigned long)nmi,
26454 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
26456 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
26458 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
26459 -#if CONFIG_XEN_COMPAT <= 0x030002
26460 - if (ret == -ENOSYS)
26461 - ret = HYPERVISOR_set_callbacks(
26463 - failsafe.address,
26464 - syscall.address);
26468 -#ifdef CONFIG_X86_LOCAL_APIC
26469 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
26470 -#if CONFIG_XEN_COMPAT <= 0x030002
26471 - if (ret == -ENOSYS) {
26472 - static struct xennmi_callback __initdata cb = {
26473 - .handler_address = (unsigned long)nmi
26476 - HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
26481 Index: head-2008-12-01/include/asm-x86/mach-xen/setup_arch_pre.h
26482 ===================================================================
26483 --- head-2008-12-01.orig/include/asm-x86/mach-xen/setup_arch_pre.h 2008-12-03 15:48:43.000000000 +0100
26484 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26486 -/* Hook to call BIOS initialisation function */
26488 -#define ARCH_SETUP machine_specific_arch_setup();
26490 -static void __init machine_specific_arch_setup(void);
26491 Index: head-2008-12-01/include/asm-x86/traps.h
26492 ===================================================================
26493 --- head-2008-12-01.orig/include/asm-x86/traps.h 2008-12-03 15:48:43.000000000 +0100
26494 +++ head-2008-12-01/include/asm-x86/traps.h 2008-12-01 11:49:07.000000000 +0100
26495 @@ -23,6 +23,9 @@ asmlinkage void spurious_interrupt_bug(v
26496 #ifdef CONFIG_X86_MCE
26497 asmlinkage void machine_check(void);
26498 #endif /* CONFIG_X86_MCE */
26499 +#ifdef CONFIG_X86_XEN
26500 +asmlinkage void fixup_4gb_segment(void);
26503 void do_divide_error(struct pt_regs *, long);
26504 void do_overflow(struct pt_regs *, long);
26505 @@ -48,6 +51,9 @@ void math_error(void __user *);
26506 void do_coprocessor_error(struct pt_regs *, long);
26507 void do_simd_coprocessor_error(struct pt_regs *, long);
26508 void do_spurious_interrupt_bug(struct pt_regs *, long);
26510 +void do_fixup_4gb_segment(struct pt_regs *, long);
26512 unsigned long patch_espfix_desc(unsigned long, unsigned long);
26513 asmlinkage void math_emulate(long);
26515 Index: head-2008-12-01/include/asm-x86/xen/hypercall.h
26516 ===================================================================
26517 --- head-2008-12-01.orig/include/asm-x86/xen/hypercall.h 2008-12-03 15:48:43.000000000 +0100
26518 +++ head-2008-12-01/include/asm-x86/xen/hypercall.h 2008-12-01 11:49:07.000000000 +0100
26519 @@ -264,7 +264,7 @@ HYPERVISOR_fpu_taskswitch(int set)
26521 HYPERVISOR_sched_op(int cmd, void *arg)
26523 - return _hypercall2(int, sched_op_new, cmd, arg);
26524 + return _hypercall2(int, sched_op, cmd, arg);
26528 Index: head-2008-12-01/include/asm-x86/xen/interface_64.h
26529 ===================================================================
26530 --- head-2008-12-01.orig/include/asm-x86/xen/interface_64.h 2008-12-03 15:48:43.000000000 +0100
26531 +++ head-2008-12-01/include/asm-x86/xen/interface_64.h 2008-12-01 11:49:07.000000000 +0100
26532 @@ -136,7 +136,7 @@ struct cpu_user_regs {
26533 uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
26534 uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
26536 -DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
26537 +DEFINE_XEN_GUEST_HANDLE_STRUCT(cpu_user_regs);
26541 Index: head-2008-12-01/include/linux/page-flags.h
26542 ===================================================================
26543 --- head-2008-12-01.orig/include/linux/page-flags.h 2008-12-01 11:48:52.000000000 +0100
26544 +++ head-2008-12-01/include/linux/page-flags.h 2008-12-01 11:49:07.000000000 +0100
26545 @@ -109,9 +109,11 @@ enum pageflags {
26547 PG_checked = PG_owner_priv_1,
26549 +#ifdef CONFIG_PARAVIRT_XEN
26551 PG_pinned = PG_owner_priv_1,
26552 PG_savepinned = PG_dirty,
26556 PG_slob_page = PG_active,
26557 @@ -185,8 +187,12 @@ PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU,
26558 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
26559 __PAGEFLAG(Slab, slab)
26560 PAGEFLAG(Checked, checked) /* Used by some filesystems */
26561 +#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
26562 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
26564 +#ifdef CONFIG_PARAVIRT_XEN
26565 PAGEFLAG(SavePinned, savepinned); /* Xen */
26567 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
26568 PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
26569 __SETPAGEFLAG(Private, private)
26570 Index: head-2008-12-01/include/xen/interface/memory.h
26571 ===================================================================
26572 --- head-2008-12-01.orig/include/xen/interface/memory.h 2008-12-01 11:44:55.000000000 +0100
26573 +++ head-2008-12-01/include/xen/interface/memory.h 2008-12-01 11:49:07.000000000 +0100
26574 @@ -82,6 +82,7 @@ struct xen_memory_reservation {
26578 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_memory_reservation);
26579 typedef struct xen_memory_reservation xen_memory_reservation_t;
26580 DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
26582 @@ -167,11 +168,7 @@ struct xen_machphys_mfn_list {
26583 * any large discontiguities in the machine address space, 2MB gaps in
26584 * the machphys table will be represented by an MFN base of zero.
26586 -#ifndef CONFIG_PARAVIRT_XEN
26587 XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
26589 - ulong extent_start;
26593 * Number of extents written to the above array. This will be smaller
26594 @@ -179,6 +176,7 @@ struct xen_machphys_mfn_list {
26596 unsigned int nr_extents;
26598 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
26599 typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
26600 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
26602 @@ -218,6 +216,7 @@ struct xen_add_to_physmap {
26603 /* GPFN where the source mapping page should appear. */
26606 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
26607 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
26608 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
26610 @@ -250,21 +249,13 @@ struct xen_translate_gpfn_list {
26611 xen_ulong_t nr_gpfns;
26613 /* List of GPFNs to translate. */
26614 -#ifndef CONFIG_PARAVIRT_XEN
26615 XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
26621 * Output list to contain MFN translations. May be the same as the input
26622 * list (in which case each input GPFN is overwritten with the output MFN).
26624 -#ifndef CONFIG_PARAVIRT_XEN
26625 XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
26630 DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
26631 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
26632 Index: head-2008-12-01/kernel/hrtimer.c
26633 ===================================================================
26634 --- head-2008-12-01.orig/kernel/hrtimer.c 2008-12-03 15:48:43.000000000 +0100
26635 +++ head-2008-12-01/kernel/hrtimer.c 2008-12-01 11:49:07.000000000 +0100
26636 @@ -1084,7 +1084,7 @@ ktime_t hrtimer_get_remaining(const stru
26638 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
26640 -#ifdef CONFIG_NO_HZ
26641 +#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26643 * hrtimer_get_next_event - get the time until next expiry event
26645 Index: head-2008-12-01/kernel/timer.c
26646 ===================================================================
26647 --- head-2008-12-01.orig/kernel/timer.c 2008-12-03 15:48:43.000000000 +0100
26648 +++ head-2008-12-01/kernel/timer.c 2008-12-01 11:49:07.000000000 +0100
26649 @@ -815,7 +815,7 @@ static inline void __run_timers(struct t
26650 spin_unlock_irq(&base->lock);
26653 -#ifdef CONFIG_NO_HZ
26654 +#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26656 * Find out when the next timer event is due to happen. This
26657 * is used on S/390 to stop all activity when a cpus is idle.
26658 Index: head-2008-12-01/lib/swiotlb-xen.c
26659 ===================================================================
26660 --- head-2008-12-01.orig/lib/swiotlb-xen.c 2008-12-01 11:44:55.000000000 +0100
26661 +++ head-2008-12-01/lib/swiotlb-xen.c 2008-12-01 11:49:07.000000000 +0100
26662 @@ -750,7 +750,7 @@ swiotlb_sync_sg_for_device(struct device
26666 -swiotlb_dma_mapping_error(dma_addr_t dma_addr)
26667 +swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
26669 return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
26671 Index: head-2008-12-01/mm/mprotect.c
26672 ===================================================================
26673 --- head-2008-12-01.orig/mm/mprotect.c 2008-12-01 11:29:05.000000000 +0100
26674 +++ head-2008-12-01/mm/mprotect.c 2008-12-01 11:49:07.000000000 +0100
26675 @@ -92,8 +92,6 @@ static inline void change_pmd_range(stru
26676 next = pmd_addr_end(addr, end);
26677 if (pmd_none_or_clear_bad(pmd))
26679 - if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
26681 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
26682 } while (pmd++, addr = next, addr != end);