From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: [PATCH] Linux: Update to 2.6.27
This patch contains the differences between Linux 2.6.26 and 2.6.27.
Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.27" by xen-port-patches.py
--- sle11-2009-10-16.orig/arch/x86/Kconfig	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/arch/x86/Kconfig	2009-06-04 10:21:39.000000000 +0200
@@ -594,7 +594,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 bool "AMD IOMMU support"
- depends on X86_64 && PCI && ACPI
+ depends on X86_64 && PCI && ACPI && !X86_64_XEN
 With this option you can enable support for AMD IOMMU hardware in
 your system. An IOMMU is a hardware component which provides
@@ -629,8 +629,10 @@ config MAXSMP
 int "Maximum number of CPUs (2-4096)"
+ default "32" if MAXSMP && XEN
 default "4096" if MAXSMP
 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
 default "16" if X86_64_XEN
@@ -1227,7 +1229,7 @@ config MTRR
 prompt "MTRR cleanup support"
+ depends on MTRR && !XEN
 Convert MTRR layout from continuous to discrete, so X drivers can
 add writeback entries.
--- sle11-2009-10-16.orig/arch/x86/Kconfig.debug	2009-03-16 16:33:40.000000000 +0100
+++ sle11-2009-10-16/arch/x86/Kconfig.debug	2009-06-04 10:21:39.000000000 +0200
@@ -25,6 +25,7 @@ config STRICT_DEVMEM
 config X86_VERBOSE_BOOTUP
 bool "Enable verbose x86 bootup info messages"
 Enables the informational output from the decompression stage
 (e.g. bzImage) of the boot. If you disable this you will still
@@ -179,7 +180,7 @@ config MMIOTRACE_HOOKS
 bool "Memory mapped IO tracing"
- depends on DEBUG_KERNEL && PCI
+ depends on DEBUG_KERNEL && PCI && !XEN
 select MMIOTRACE_HOOKS
--- sle11-2009-10-16.orig/arch/x86/Makefile	2009-02-16 16:18:36.000000000 +0100
+++ sle11-2009-10-16/arch/x86/Makefile	2009-06-04 10:21:39.000000000 +0200
@@ -116,8 +116,8 @@ mflags-$(CONFIG_X86_VOYAGER) := -Iinclud
 mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
-mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen
-mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/
+mflags-$(CONFIG_XEN) := -Iinclude/asm-x86/mach-xen
+mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
 # generic subarchitecture
 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
@@ -128,8 +128,10 @@ mcore-$(CONFIG_X86_GENERICARCH) := arch/
 mflags-y += -Iinclude/asm-x86/mach-default
 # 64 bit does not support subarch support - clear sub arch variables
+ifneq ($(CONFIG_XEN),y)
 fcore-$(CONFIG_X86_64) :=
 mcore-$(CONFIG_X86_64) :=
 KBUILD_CFLAGS += $(mflags-y)
 KBUILD_AFLAGS += $(mflags-y)
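For context, the mflags- switch above is purely an include-path change: with -Iinclude/asm-x86/mach-xen on the compiler command line, subarchitecture headers resolve to their Xen variants before the mach-default copies appended later via mflags-y. A minimal sketch of the effect (the usual <mach_apic.h> layout is assumed here):

    /*
     * With KBUILD_CFLAGS += -Iinclude/asm-x86/mach-xen this resolves to
     * include/asm-x86/mach-xen/mach_apic.h; without the flag, the search
     * falls through to include/asm-x86/mach-default/mach_apic.h.
     */
    #include <mach_apic.h>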
--- sle11-2009-10-16.orig/arch/x86/ia32/ia32entry-xen.S	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/arch/x86/ia32/ia32entry-xen.S	2009-10-16 14:51:56.000000000 +0200
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE 0x40000000
+#ifndef CONFIG_AUDITSYSCALL
+#define sysexit_audit int_ret_from_sys_call
+#define sysretl_audit int_ret_from_sys_call
 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
 .macro IA32_ARG_FIXUP noebp=0
+ * Reload arg registers from stack in case ptrace changed them.
+ * We don't reload %eax because syscall_trace_enter() returned
+ * the value it wants us to use in the table lookup.
 .macro LOAD_ARGS32 offset
 movl \offset(%rsp),%r11d
 movl \offset+8(%rsp),%r10d
 movl \offset+48(%rsp),%edx
 movl \offset+56(%rsp),%esi
 movl \offset+64(%rsp),%edi
- movl \offset+72(%rsp),%eax
 .macro CFI_STARTPROC32 simple
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret32)
+ENDPROC(native_usergs_sysret32)
+ENTRY(native_irq_enable_sysexit)
+ENDPROC(native_irq_enable_sysexit)
 * 32bit SYSENTER instruction entry.
@@ -98,7 +125,7 @@ ENTRY(ia32_sysenter_target)
 movl %ebp,%ebp /* zero extension */
- movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
+ movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
 movl $__USER32_DS,40(%rsp)
 movl $__USER32_CS,16(%rsp)
@@ -113,19 +140,75 @@ ENTRY(ia32_sysenter_target)
 GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 jnz sysenter_tracesys
 cmpl $(IA32_NR_syscalls-1),%eax
 call *ia32_sys_call_table(,%rax,8)
 movq %rax,RAX-ARGOFFSET(%rsp)
+ GET_THREAD_INFO(%r10)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
 jmp int_ret_from_sys_call
+#ifdef CONFIG_AUDITSYSCALL
+ .macro auditsys_entry_common
+ movl %esi,%r9d /* 6th arg: 4th syscall arg */
+ movl %edx,%r8d /* 5th arg: 3rd syscall arg */
+ /* (already in %ecx) 4th arg: 2nd syscall arg */
+ movl %ebx,%edx /* 3rd arg: 1st syscall arg */
+ movl %eax,%esi /* 2nd arg: syscall number */
+ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
+ call audit_syscall_entry
+ movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
+ cmpl $(IA32_NR_syscalls-1),%eax
+ movl %ebx,%edi /* reload 1st syscall arg */
+ movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
+ movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
+ movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
+ movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
+ .macro auditsys_exit exit,ebpsave=RBP
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ jnz int_ret_from_sys_call
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ movl %eax,%esi /* second arg, syscall return value */
+ cmpl $0,%eax /* is it < 0? */
+ setl %al /* 1 if so, 0 if not */
+ movzbl %al,%edi /* zero-extend that into %edi */
+ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+ call audit_syscall_exit
+ movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
+ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ auditsys_entry_common
+ movl %ebp,%r9d /* reload 6th syscall arg */
+ jmp sysenter_dispatch
+ auditsys_exit sysexit_from_sys_call
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ jz sysenter_auditsys
@@ -186,18 +269,38 @@ ENTRY(ia32_cstar_target)
 GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 cmpl $IA32_NR_syscalls-1,%eax
 call *ia32_sys_call_table(,%rax,8)
 movq %rax,RAX-ARGOFFSET(%rsp)
+ GET_THREAD_INFO(%r10)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
 jmp int_ret_from_sys_call
+#ifdef CONFIG_AUDITSYSCALL
+ movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
+ auditsys_entry_common
+ movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
+ auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
@@ -263,8 +366,8 @@ ENTRY(ia32_syscall)
 this could be a problem. */
 GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 cmpl $(IA32_NR_syscalls-1),%eax
@@ -309,13 +412,11 @@ quiet_ni_syscall:
 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
- PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
 PTREGSCALL stub32_execve, sys32_execve, %rcx
 PTREGSCALL stub32_fork, sys_fork, %rdi
 PTREGSCALL stub32_clone, sys32_clone, %rdx
 PTREGSCALL stub32_vfork, sys_vfork, %rdi
 PTREGSCALL stub32_iopl, sys_iopl, %rsi
- PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
 ENTRY(ia32_ptregs_common)
@@ -415,7 +516,7 @@ ia32_sys_call_table:
 .quad sys_setreuid16 /* 70 */
- .quad stub32_sigsuspend
+ .quad sys32_sigsuspend
 .quad compat_sys_sigpending
 .quad sys_sethostname
 .quad compat_sys_setrlimit /* 75 */
@@ -522,7 +623,7 @@ ia32_sys_call_table:
 .quad sys32_rt_sigpending
 .quad compat_sys_rt_sigtimedwait
 .quad sys32_rt_sigqueueinfo
- .quad stub32_rt_sigsuspend
+ .quad sys_rt_sigsuspend
 .quad sys32_pread /* 180 */
@@ -670,4 +771,10 @@ ia32_sys_call_table:
 .quad sys32_fallocate
 .quad compat_sys_timerfd_settime /* 325 */
 .quad compat_sys_timerfd_gettime
+ .quad compat_sys_signalfd4
+ .quad sys_epoll_create1
+ .quad sys_dup3 /* 330 */
+ .quad sys_inotify_init1
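The auditsys_entry_common/auditsys_exit macros above do nothing more than marshal the saved pt_regs into the C calling convention of the audit hooks. Expressed as a C sketch (do_syscall32 is a hypothetical stand-in for the table dispatch, not a real symbol):

    static long audit_wrapped_syscall(long nr, long a1, long a2, long a3, long a4)
    {
    	long rc;

    	/* what auditsys_entry_common loads into %edi..%r9d */
    	audit_syscall_entry(AUDIT_ARCH_I386, nr, a1, a2, a3, a4);
    	rc = do_syscall32(nr, a1, a2, a3, a4);	/* hypothetical dispatch */
    	/* what auditsys_exit's cmpl/setl/inc sequence computes */
    	audit_syscall_exit(rc < 0 ? AUDITSC_FAILURE : AUDITSC_SUCCESS, rc);
    	return rc;
    }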
--- sle11-2009-10-16.orig/arch/x86/kernel/Makefile	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/arch/x86/kernel/Makefile	2009-06-04 10:21:39.000000000 +0200
@@ -120,9 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
- obj-$(CONFIG_XEN) += nmi_64.o
+ obj-$(CONFIG_XEN) += nmi.o
 time_64-$(CONFIG_XEN) += time_32.o
-disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
- pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
+disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \
+ i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \
+ tlb_$(BITS).o tsc.o tsc_sync.o vsmp_64.o
--- sle11-2009-10-16.orig/arch/x86/kernel/acpi/boot.c	2009-08-26 11:55:26.000000000 +0200
+++ sle11-2009-10-16/arch/x86/kernel/acpi/boot.c	2009-08-26 12:03:49.000000000 +0200
@@ -949,7 +949,9 @@ void __init mp_register_ioapic(int id, u
 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
 mp_ioapics[idx].mp_apicaddr = address;
 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
@@ -1106,7 +1108,7 @@ int mp_register_gsi(u32 gsi, int trigger
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
 #define MAX_GSI_NUM 4096
 #define IRQ_COMPRESSION_START 64
@@ -1154,7 +1156,7 @@ int mp_register_gsi(u32 gsi, int trigger
 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
 pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
@@ -1162,7 +1164,7 @@ int mp_register_gsi(u32 gsi, int trigger
 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
 * For GSI >= 64, use IRQ compression
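For reference, the 32-bit-only IRQ compression these #ifdefs now compile out on Xen identity-maps GSIs below IRQ_COMPRESSION_START and hands out dense IRQ slots above it through gsi_to_irq[]; under Xen the hypervisor owns the GSI-to-IRQ binding, so that table would be stale. A deliberately simplified sketch of the idea (the real code also special-cases level-triggered PCI interrupts):

    /* Simplified sketch of the mapping being bypassed, not new code. */
    static int compressed_irq(u32 gsi)
    {
    	static int next_irq = IRQ_COMPRESSION_START;

    	if (gsi < IRQ_COMPRESSION_START)
    		return gsi;			/* low GSIs map 1:1 */
    	if (!gsi_to_irq[gsi])
    		gsi_to_irq[gsi] = next_irq++;	/* next free dense slot */
    	return gsi_to_irq[gsi];
    }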
--- sle11-2009-10-16.orig/arch/x86/kernel/acpi/sleep-xen.c	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/arch/x86/kernel/acpi/sleep-xen.c	2009-06-04 10:21:39.000000000 +0200
 #include <linux/bootmem.h>
 #include <linux/dmi.h>
 #include <linux/cpumask.h>
+#include <asm/segment.h>
 #include "realmode/wakeup.h"
@@ -20,7 +21,7 @@ unsigned long acpi_realmode_flags;
 /* address in low memory of the wakeup routine. */
 static unsigned long acpi_realmode;
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
 static char temp_stack[10240];
@@ -54,18 +55,27 @@ int acpi_save_state_mem(void)
 header->video_mode = saved_video_mode;
 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
+ * Set up the wakeup GDT. We set these up as Big Real Mode,
+ * that is, with limits set to 4 GB. At least the Lenovo
+ * Thinkpad X61 is known to need this for the video BIOS
+ * initialization quirk to work; this is likely to also
+ * be the case for other laptops or integrated video devices.
 /* GDT[0]: GDT self-pointer */
 header->wakeup_gdt[0] =
 (u64)(sizeof(header->wakeup_gdt) - 1) +
 ((u64)(acpi_wakeup_address +
 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
- /* GDT[1]: real-mode-like code segment */
- header->wakeup_gdt[1] = (0x009bULL << 40) +
- ((u64)acpi_wakeup_address << 16) + 0xffff;
- /* GDT[2]: real-mode-like data segment */
- header->wakeup_gdt[2] = (0x0093ULL << 40) +
- ((u64)acpi_wakeup_address << 16) + 0xffff;
+ /* GDT[1]: big real mode-like code segment */
+ header->wakeup_gdt[1] =
+ GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
+ /* GDT[2]: big real mode-like data segment */
+ header->wakeup_gdt[2] =
+ GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
 store_gdt((struct desc_ptr *)&header->pmode_gdt);
@@ -79,7 +89,7 @@ int acpi_save_state_mem(void)
 #endif /* !CONFIG_64BIT */
 header->pmode_cr0 = read_cr0();
- header->pmode_cr4 = read_cr4();
+ header->pmode_cr4 = read_cr4_safe();
 header->realmode_flags = acpi_realmode_flags;
 header->real_magic = 0x12345678;
@@ -89,7 +99,9 @@ int acpi_save_state_mem(void)
 saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
 header->trampoline_segment = setup_trampoline() >> 4;
- init_rsp = (unsigned long)temp_stack + 4096;
+ stack_start.sp = temp_stack + 4096;
 initial_code = (unsigned long)wakeup_long64;
 saved_magic = 0x123456789abcdef0;
 #endif /* CONFIG_64BIT */
@@ -145,6 +157,12 @@ static int __init acpi_sleep_setup(char
 acpi_realmode_flags |= 2;
 if (strncmp(str, "s3_beep", 7) == 0)
 acpi_realmode_flags |= 4;
+#ifdef CONFIG_HIBERNATION
+ if (strncmp(str, "s4_nohwsig", 10) == 0)
+ acpi_no_s4_hw_signature();
+ if (strncmp(str, "old_ordering", 12) == 0)
+ acpi_old_suspend_ordering();
 str = strchr(str, ',');
 str += strspn(str, ", \t");
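The GDT_ENTRY() form replaces the hand-rolled 64-bit constants with the standard descriptor packing; flags 0x809b/0x8093 are the usual code/data access bytes plus the granularity bit, so the 0xfffff limit is counted in 4 KiB units and yields the 4 GiB "big real mode" segments the comment describes. The packing itself, sketched (this mirrors the generic x86 descriptor layout, not code from this patch):

    /* Sketch: pack base/limit/flags into a 64-bit x86 segment descriptor. */
    #define GDT_ENTRY_SKETCH(flags, base, limit)		\
    	((((base)  & 0xff000000ULL) << (56 - 24)) |	\
    	 (((flags) & 0x0000f0ffULL) << 40) |		\
    	 (((limit) & 0x000f0000ULL) << (48 - 16)) |	\
    	 (((base)  & 0x00ffffffULL) << 16) |		\
    	  ((limit) & 0x0000ffffULL))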
--- sle11-2009-10-16.orig/arch/x86/kernel/apic_32-xen.c	2009-03-16 16:33:40.000000000 +0100
+++ sle11-2009-10-16/arch/x86/kernel/apic_32-xen.c	2009-06-04 10:21:39.000000000 +0200
@@ -59,7 +59,10 @@ static cpumask_t timer_bcast_ipi;
 * Debug level, exported for io_apic.c
+unsigned int apic_verbosity;
+/* Have we found an MP table */
+int smp_found_config;
 static int modern_apic(void)
--- sle11-2009-10-16.orig/arch/x86/kernel/apic_64-xen.c	2009-03-16 16:33:40.000000000 +0100
+++ sle11-2009-10-16/arch/x86/kernel/apic_64-xen.c	2009-06-04 10:21:39.000000000 +0200
@@ -39,7 +39,10 @@ int disable_apic;
 * Debug level, exported for io_apic.c
+unsigned int apic_verbosity;
+/* Have we found an MP table */
+int smp_found_config;
 * The guts of the apic timer interrupt
--- sle11-2009-10-16.orig/arch/x86/kernel/asm-offsets_64.c	2008-11-25 12:35:54.000000000 +0100
+++ sle11-2009-10-16/arch/x86/kernel/asm-offsets_64.c	2009-06-04 10:21:39.000000000 +0200
@@ -138,7 +138,7 @@ int main(void)
 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+#ifdef CONFIG_PARAVIRT_XEN
 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
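asm-offsets_64.c never links into the kernel; DEFINE()/OFFSET() emit marker lines into the generated assembly, which the build scrapes into asm-offsets.h so assembler code (like the TI_flags/TI_status uses in ia32entry-xen.S above) can see C structure offsets. The mechanism in miniature:

    /* Sketch of the asm-offsets mechanism. */
    #define DEFINE(sym, val) \
    	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
    #define OFFSET(sym, str, mem) DEFINE(sym, offsetof(struct str, mem))

    /*
     * OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask) leaves a
     * "->XEN_vcpu_info_mask <offset>" line in the .s file, which a sed
     * script turns into "#define XEN_vcpu_info_mask <offset>".
     */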
--- sle11-2009-10-16.orig/arch/x86/kernel/cpu/amd_64.c	2009-10-28 14:55:02.000000000 +0100
+++ sle11-2009-10-16/arch/x86/kernel/cpu/amd_64.c	2009-06-04 10:21:39.000000000 +0200
@@ -193,6 +193,7 @@ static void __cpuinit init_amd(struct cp
 fam10h_check_enable_mmcfg();
 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
 unsigned long long tseg;
@@ -211,6 +212,7 @@ static void __cpuinit init_amd(struct cp
 set_memory_4k((unsigned long)__va(tseg), 1);
 static struct cpu_dev amd_cpu_dev __cpuinitdata = {
--- sle11-2009-10-16.orig/arch/x86/kernel/cpu/bugs_64.c	2009-10-28 14:55:02.000000000 +0100
+++ sle11-2009-10-16/arch/x86/kernel/cpu/bugs_64.c	2009-06-04 10:21:39.000000000 +0200
@@ -20,6 +20,7 @@ void __init check_bugs(void)
 alternative_instructions();
 * Make sure the first 2MB area is not mapped by huge pages
 * There are typically fixed size MTRRs in there and overlapping
@@ -30,4 +31,5 @@ void __init check_bugs(void)
 set_memory_4k((unsigned long)__va(0), 1);
--- sle11-2009-10-16.orig/arch/x86/kernel/cpu/common-xen.c	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/arch/x86/kernel/cpu/common-xen.c	2009-06-04 10:21:39.000000000 +0200
 #include <asm/mtrr.h>
+#include <asm/asm.h>
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/mpspec.h>
 #include <asm/apic.h>
@@ -341,11 +342,24 @@ static void __init early_cpu_detect(void
 get_cpu_vendor(c, 1);
 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
 cpu_devs[c->x86_vendor]->c_early_init)
 cpu_devs[c->x86_vendor]->c_early_init(c);
+ * The NOPL instruction is supposed to exist on all CPUs with
+ * family >= 6; unfortunately, that's not true in practice because
+ * of early VIA chips and (more importantly) broken virtualizers that
+ * are not easy to detect. In the latter case it doesn't even *fail*
+ * reliably, so probing for it doesn't even work. Disable it completely
+ * unless we can find a reliable way to detect all the broken cases.
+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
+ clear_cpu_cap(c, X86_FEATURE_NOPL);
 static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -402,8 +416,8 @@ static void __cpuinit generic_identify(s
 init_scattered_cpuid_features(c);
 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
@@ -434,7 +448,7 @@ __setup("serialnumber", x86_serial_nr_se
 * This does the hard work of actually picking apart the CPU stuff...
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
@@ -448,6 +462,8 @@ void __cpuinit identify_cpu(struct cpuin
 c->x86_max_cores = 1;
 c->x86_clflush_size = 32;
 memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ if (boot_cpu_has(X86_FEATURE_SYSCALL32))
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
 if (!have_cpuid_p()) {
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11-2009-10-16/arch/x86/kernel/cpu/common_64-xen.c	2009-06-04 10:21:39.000000000 +0200
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/kgdb.h>
+#include <linux/topology.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <asm/i387.h>
+#include <asm/msr.h>
+#include <asm/linkage.h>
+#include <asm/mmu_context.h>
+#include <asm/mtrr.h>
+#include <asm/mce.h>
+#include <asm/pat.h>
+#include <asm/asm.h>
+#include <asm/numa.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <mach_apic.h>
+#elif defined(CONFIG_XEN)
+#include <mach_apic.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/genapic.h>
+/* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+/* The TLS descriptors are currently at a different place compared to i386.
+ Hopefully nobody expects them at a fixed place (Wine?) */
+DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
+ struct desc_ptr gdt_descr;
+ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+ void *va, *gdt_addr = get_cpu_gdt_table(smp_processor_id());
+ unsigned long frames[16];
+ unsigned int f = 0;
+ for (va = gdt_addr; va < gdt_addr + GDT_SIZE; va += PAGE_SIZE) {
+ frames[f++] = virt_to_mfn(va);
+ make_page_readonly(va, XENFEAT_writable_descriptor_tables);
+ if (HYPERVISOR_set_gdt(frames, GDT_SIZE / sizeof(struct desc_struct)))
+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+ display_cacheinfo(c);
+static struct cpu_dev __cpuinitdata default_cpu = {
+ .c_init = default_init,
+ .c_vendor = "Unknown",
+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+ if (c->extended_cpuid_level < 0x80000004)
+ v = (unsigned int *) c->x86_model_id;
+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+ c->x86_model_id[48] = 0;
+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+ unsigned int n, dummy, ebx, ecx, edx;
+ n = c->extended_cpuid_level;
+ if (n >= 0x80000005) {
+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
+ "D cache %dK (%d bytes/line)\n",
+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+ c->x86_cache_size = (ecx>>24) + (edx>>24);
+ /* On K8 L1 TLB is inclusive, so don't count it */
+ c->x86_tlbsize = 0;
+ if (n >= 0x80000006) {
+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+ ecx = cpuid_ecx(0x80000006);
+ c->x86_cache_size = ecx >> 16;
+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+ c->x86_cache_size, ecx & 0xFF);
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+ u32 eax, ebx, ecx, edx;
+ int index_msb, core_bits;
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+ if (!cpu_has(c, X86_FEATURE_HT))
+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ smp_num_siblings = (ebx & 0xff0000) >> 16;
+ if (smp_num_siblings == 1) {
+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
+ } else if (smp_num_siblings > 1) {
+ if (smp_num_siblings > NR_CPUS) {
+ printk(KERN_WARNING "CPU: Unsupported number of "
+ "siblings %d", smp_num_siblings);
+ smp_num_siblings = 1;
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = phys_pkg_id(index_msb);
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+ index_msb = get_count_order(smp_num_siblings);
+ core_bits = get_count_order(c->x86_max_cores);
+ c->cpu_core_id = phys_pkg_id(index_msb) &
+ ((1 << core_bits) - 1);
+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+ char *v = c->x86_vendor_id;
+ static int printed;
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+ (cpu_devs[i]->c_ident[1] &&
+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+ this_cpu = cpu_devs[i];
+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+ printk(KERN_ERR "CPU: Your system may be unstable.\n");
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+static void __init early_cpu_support_print(void)
+ struct cpu_dev *cpu_devx;
+ printk("KERNEL supported cpus:\n");
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ cpu_devx = cpu_devs[i];
+ for (j = 0; j < 2; j++) {
+ if (!cpu_devx->c_ident[j])
+ printk(" %s %s\n", cpu_devx->c_vendor,
+ cpu_devx->c_ident[j]);
+ * The NOPL instruction is supposed to exist on all CPUs with
+ * family >= 6, unfortunately, that's not true in practice because
+ * of early VIA chips and (more importantly) broken virtualizers that
+ * are not easy to detect. Hence, probe for it based on first
+ * Note: no 64-bit chip is known to lack these, but put the code here
+ * for consistency with 32 bits, and to make it utterly trivial to
+ * diagnose the problem should it ever surface.
+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
+ const u32 nopl_signature = 0x888c53b1; /* Random number */
+ u32 has_nopl = nopl_signature;
+ clear_cpu_cap(c, X86_FEATURE_NOPL);
+ "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
+ " .section .fixup,\"ax\"\n"
+ _ASM_EXTABLE(1b,3b)
+ : "+a" (has_nopl));
+ if (has_nopl == nopl_signature)
+ set_cpu_cap(c, X86_FEATURE_NOPL);
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
+void __init early_cpu_init(void)
+ struct cpu_vendor_dev *cvdev;
+ for (cvdev = __x86cpuvendor_start ;
+ cvdev < __x86cpuvendor_end ;
+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+ early_cpu_support_print();
+ early_identify_cpu(&boot_cpu_data);
+/* Do some early cpuid on the boot CPU to get some parameter that are
+ needed before check_bugs. Everything advanced is in identify_cpu
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
+ c->loops_per_jiffy = loops_per_jiffy;
+ c->x86_cache_size = -1;
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
+ c->x86_vendor_id[0] = '\0'; /* Unset */
+ c->x86_model_id[0] = '\0'; /* Unset */
+ c->x86_clflush_size = 64;
+ c->x86_cache_alignment = c->x86_clflush_size;
+ c->x86_max_cores = 1;
+ c->x86_coreid_bits = 0;
+ c->extended_cpuid_level = 0;
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ /* Get vendor name */
+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+ /* Initialize the standard set of capabilities */
+ /* Note that the vendor-specific code below might override */
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
+ &c->x86_capability[0]);
+ c->x86 = (tfms >> 8) & 0xf;
+ c->x86_model = (tfms >> 4) & 0xf;
+ c->x86_mask = tfms & 0xf;
+ c->x86 += (tfms >> 20) & 0xff;
+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ /* Have CPUID level 0 only - unheard of */
+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
+ c->phys_proc_id = c->initial_apicid;
+ /* AMD-defined flags: level 0x80000001 */
+ xlvl = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = xlvl;
+ if ((xlvl & 0xffff0000) == 0x80000000) {
+ if (xlvl >= 0x80000001) {
+ c->x86_capability[1] = cpuid_edx(0x80000001);
+ c->x86_capability[6] = cpuid_ecx(0x80000001);
+ if (xlvl >= 0x80000004)
+ get_model_name(c); /* Default name */
+ /* Transmeta-defined flags: level 0x80860001 */
+ xlvl = cpuid_eax(0x80860000);
+ if ((xlvl & 0xffff0000) == 0x80860000) {
+ /* Don't set x86_cpuid_level here for now to not confuse. */
+ if (xlvl >= 0x80860001)
+ c->x86_capability[2] = cpuid_edx(0x80860001);
+ if (c->extended_cpuid_level >= 0x80000007)
+ c->x86_power = cpuid_edx(0x80000007);
+ if (c->extended_cpuid_level >= 0x80000008) {
+ u32 eax = cpuid_eax(0x80000008);
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+ cpu_devs[c->x86_vendor]->c_early_init)
+ cpu_devs[c->x86_vendor]->c_early_init(c);
+ validate_pat_support(c);
+ * This does the hard work of actually picking apart the CPU stuff...
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+ early_identify_cpu(c);
+ init_scattered_cpuid_features(c);
+ c->apicid = phys_pkg_id(0);
+ * Vendor-specific initialization. In this section we
+ * canonicalize the feature flags, meaning if there are
+ * features a certain CPU supports which CPUID doesn't
+ * tell us, CPUID claiming incorrect flags, or other bugs,
+ * we handle them here.
+ * At the end of this section, c->x86_capability better
+ * indicate the features this CPU genuinely supports!
+ if (this_cpu->c_init)
+ this_cpu->c_init(c);
+ * On SMP, boot_cpu_data holds the common feature set between
+ * all CPUs; so make sure that we indicate which features are
+ * common between the CPUs. The first time this routine gets
+ * executed, c == &boot_cpu_data.
+ if (c != &boot_cpu_data) {
+ /* AND the already accumulated flags with these */
+ for (i = 0; i < NCAPINTS; i++)
+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+ /* Clear all flags overriden by options */
+ for (i = 0; i < NCAPINTS; i++)
+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
+#ifdef CONFIG_X86_MCE
+ select_idle_routine(c);
+ numa_add_cpu(smp_processor_id());
+void __cpuinit identify_boot_cpu(void)
+ identify_cpu(&boot_cpu_data);
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+ BUG_ON(c == &boot_cpu_data);
+static __init int setup_noclflush(char *arg)
+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+__setup("noclflush", setup_noclflush);
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+ if (c->x86_model_id[0])
+ printk(KERN_CONT "%s", c->x86_model_id);
+ if (c->x86_mask || c->cpuid_level >= 0)
+ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+ printk(KERN_CONT "\n");
+static __init int setup_disablecpuid(char *arg)
+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+ setup_clear_cpu_cap(bit);
+__setup("clearcpuid=", setup_disablecpuid);
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
+struct x8664_pda **_cpu_pda __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+#ifndef CONFIG_X86_NO_IDT
+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+unsigned long __supported_pte_mask __read_mostly = ~0UL;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+static int do_not_nx __cpuinitdata;
+Control non executable mappings for 64bit processes.
+static int __init nonx_setup(char *str)
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ } else if (!strncmp(str, "off", 3)) {
+ __supported_pte_mask &= ~_PAGE_NX;
+early_param("noexec", nonx_setup);
+int force_personality32;
+Control non executable heap for 32bit processes.
+To control the stack too use noexec=off
+on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
+off PROT_READ implies PROT_EXEC
+static int __init nonx32_setup(char *str)
+ if (!strcmp(str, "on"))
+ force_personality32 &= ~READ_IMPLIES_EXEC;
+ else if (!strcmp(str, "off"))
+ force_personality32 |= READ_IMPLIES_EXEC;
+__setup("noexec32=", nonx32_setup);
+static void __init_refok switch_pt(int cpu)
+ xen_pt_switch(__pa_symbol(init_level4_pgt));
+ xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
+void pda_init(int cpu)
+ struct x8664_pda *pda = cpu_pda(cpu);
+ /* Setup up data that may be needed in __get_free_pages early */
+ loadsegment(fs, 0);
+ loadsegment(gs, 0);
+ /* Memory clobbers used to order PDA accessed */
+ wrmsrl(MSR_GS_BASE, pda);
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
+ (unsigned long)pda))
+ pda->cpunumber = cpu;
+ pda->irqcount = -1;
+ pda->kernelstack = (unsigned long)stack_thread_info() -
+ PDA_STACKOFFSET + THREAD_SIZE;
+ pda->active_mm = &init_mm;
+ pda->mmu_state = 0;
+ /* others are initialized in smpboot.c */
+ pda->pcurrent = &init_task;
+ pda->irqstackptr = boot_cpu_stack;
+ pda->irqstackptr += IRQSTACKSIZE - 64;
+ if (!pda->irqstackptr) {
+ pda->irqstackptr = (char *)
+ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+ if (!pda->irqstackptr)
+ panic("cannot allocate irqstack for cpu %d",
+ pda->irqstackptr += IRQSTACKSIZE - 64;
+ if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
+ pda->nodenumber = cpu_to_node(cpu);
+#ifndef CONFIG_X86_NO_TSS
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+ DEBUG_STKSZ] __page_aligned_bss;
+extern asmlinkage void ignore_sysret(void);
+void __cpuinit syscall_init(void)
+ * LSTAR and STAR live in a bit strange symbiosis.
+ * They both write to the same internal register. STAR allows to
+ * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
+ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
+ wrmsrl(MSR_LSTAR, system_call);
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+ /* Flags to clear on syscall */
+ wrmsrl(MSR_SYSCALL_MASK,
+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+#ifdef CONFIG_IA32_EMULATION
+ syscall32_cpu_init();
+ static const struct callback_register __cpuinitconst cstar = {
+ .type = CALLBACKTYPE_syscall32,
+ .address = (unsigned long)ignore_sysret
+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
+ printk(KERN_WARNING "Unable to register CSTAR callback\n");
+void __cpuinit check_efer(void)
+ unsigned long efer;
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || do_not_nx)
+ __supported_pte_mask &= ~_PAGE_NX;
+unsigned long kernel_eflags;
+#ifndef CONFIG_X86_NO_TSS
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init.
+void __cpuinit cpu_init(void)
+ int cpu = stack_smp_processor_id();
+#ifndef CONFIG_X86_NO_TSS
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+ char *estacks = NULL;
+ struct task_struct *me;
+ /* CPU 0 is initialised in head64.c */
+#ifndef CONFIG_X86_NO_TSS
+ estacks = boot_exception_stacks;
+ if (cpu_test_and_set(cpu, cpu_initialized))
+ panic("CPU#%d already initialized!\n", cpu);
+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ * Initialize the per-CPU GDT with the boot GDT,
+ * and set up the GDT descriptor:
+ switch_to_new_gdt();
+#ifndef CONFIG_X86_NO_IDT
+ load_idt((const struct desc_ptr *)&idt_descr);
+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+#ifndef CONFIG_X86_NO_TSS
+ * set up and load the per-CPU TSS
+ if (!orig_ist->ist[0]) {
+ static const unsigned int order[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
+ panic("Cannot allocate exception "
+ "stack %ld %d\n", v, cpu);
+ estacks += PAGE_SIZE << order[v];
+ orig_ist->ist[v] = t->x86_tss.ist[v] =
+ (unsigned long)estacks;
+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ * <= is required because the CPU will access up to
+ * 8 bits beyond the end of the IO permission bitmap.
+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
+ t->io_bitmap[i] = ~0UL;
+ atomic_inc(&init_mm.mm_count);
+ me->active_mm = &init_mm;
+ enter_lazy_tlb(&init_mm, me);
+ load_sp0(t, &current->thread);
+#ifndef CONFIG_X86_NO_TSS
+ set_tss_desc(cpu, t);
+ load_LDT(&init_mm.context);
+ * If the kgdb is connected no debug regs should be altered. This
+ * is only applicable when KGDB and a KGDB I/O module are built
+ * into the kernel and you are using early debugging with
+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ * Clear all 6 debug registers:
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+ set_debugreg(0UL, 6);
+ set_debugreg(0UL, 7);
+ /* If the kgdb is connected no debug regs should be altered. */
+ asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
+ if (raw_irqs_disabled())
+ kernel_eflags &= ~X86_EFLAGS_IF;
+ if (is_uv_system())
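The STAR/LSTAR "symbiosis" that syscall_init() mentions comes down to bit layout: STAR[47:32] selects the kernel CS/SS pair loaded on SYSCALL, STAR[63:48] is the base from which SYSRET derives the user CS/SS, and LSTAR supplies the full 64-bit entry RIP that STAR's legacy 32-bit EIP field cannot. The wrmsrl above is equivalent to this sketch:

    static inline u64 star_msr_value(void)
    {
    	/*
    	 * [63:48] user segment base: SYSRET loads CS/SS relative to it
    	 * (+16/+8 when returning to long mode, which dictates the GDT
    	 * ordering of the user segments);
    	 * [47:32] kernel segment base used by SYSCALL.
    	 */
    	return ((u64)__USER32_CS << 48) | ((u64)__KERNEL_CS << 32);
    }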
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11-2009-10-16/arch/x86/kernel/e820-xen.c	2009-06-04 10:21:39.000000000 +0200
+ * Handle the memory map.
+ * The functions here do the job until bootmem takes over.
+ * Getting sanitize_e820_map() in sync with i386 version by applying change:
+ * - Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/suspend.h>
+#include <linux/firmware-map.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <xen/interface/memory.h>
+ * The e820 map is the map that gets modified e.g. with command line parameters
+ * and that is also registered with modifications in the kernel resource tree
+ * with the iomem_resource as parent.
+ * The e820_saved is directly saved after the BIOS-provided memory map is
+ * copied. It doesn't get modified afterwards. It's registered for the
+ * /sys/firmware/memmap interface.
+ * That memory map is not modified and is used as base for kexec. The kexec'd
+ * kernel should get the same memory map as the firmware provides. Then the
+ * user can e.g. boot the original kernel with mem=1G while still booting the
+ * next kernel with full memory.
+struct e820map e820;
+struct e820map e820_saved;
+static struct e820map machine_e820;
+#define e820_saved machine_e820
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0xaeedbabe;
+EXPORT_SYMBOL(pci_mem_start);
+ * This function checks if any part of the range <start,end> is mapped
+e820_any_mapped(u64 start, u64 end, unsigned type)
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ if (!is_initial_xendomain())
+ for (i = 0; i < machine_e820.nr_map; ++i) {
+ const struct e820entry *ei = &machine_e820.map[i];
+ if (type && ei->type != type)
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+EXPORT_SYMBOL_GPL(e820_any_mapped);
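e820_any_mapped() answers whether any byte of [start,end) carries the given type (type == 0 meaning any entry at all), with the Xen twist that the initial domain consults the machine map rather than the pseudo-physical one. A typical, purely hypothetical caller would look like:

    /* Hypothetical use: refuse to place an MMIO window over RAM. */
    if (e820_any_mapped(base, base + len, E820_RAM))
    	return -EBUSY;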
+ * This function checks if the entire range <start,end> is mapped with type.
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+int __init e820_all_mapped(u64 start, u64 end, unsigned type)
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ if (!is_initial_xendomain())
+ for (i = 0; i < machine_e820.nr_map; ++i) {
+ const struct e820entry *ei = &machine_e820.map[i];
+ if (type && ei->type != type)
+ /* is the region (part) in overlap with the current region ?*/
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ /* if the region is at the beginning of <start,end> we move
+ * start to the end of the region since it's ok until there
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+ * if start is now at or beyond end, we're done, full
+ * Add a memory region to the kernel e820 map.
+void __init e820_add_region(u64 start, u64 size, int type)
+ int x = e820.nr_map;
+ if (x == ARRAY_SIZE(e820.map)) {
+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ e820.map[x].addr = start;
+ e820.map[x].size = size;
+ e820.map[x].type = type;
+void __init e820_print_map(char *who)
+ for (i = 0; i < e820.nr_map; i++) {
+ printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+ (unsigned long long) e820.map[i].addr,
+ (unsigned long long)
+ (e820.map[i].addr + e820.map[i].size));
+ switch (e820.map[i].type) {
+ case E820_RESERVED_KERN:
+ printk(KERN_CONT "(usable)\n");
+ case E820_RESERVED:
+ printk(KERN_CONT "(reserved)\n");
+ printk(KERN_CONT "(ACPI data)\n");
+ printk(KERN_CONT "(ACPI NVS)\n");
+ printk(KERN_CONT "type %u\n", e820.map[i].type);
+ * Sanitize the BIOS e820 map.
+ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps,
+ * and resolving conflicting memory types in favor of highest
+ * The input parameter biosmap points to an array of 'struct
+ * e820entry' which on entry has elements in the range [0, *pnr_map)
+ * valid, and which has space for up to max_nr_map entries.
+ * On return, the resulting sanitized e820 map entries will be in
+ * overwritten in the same location, starting at biosmap.
+ * The integer pointed to by pnr_map must be valid on entry (the
+ * current number of valid entries located at biosmap) and will
+ * be updated on return, with the new number of valid entries
+ * (something no more than max_nr_map.)
+ * The return value from sanitize_e820_map() is zero if it
+ * successfully 'sanitized' the map entries passed in, and is -1
+ * if it did nothing, which can happen if either of (1) it was
+ * only passed one map entry, or (2) any of the input map entries
+ * were invalid (start + size < start, meaning that the size was
+ * so big the described memory range wrapped around through zero.)
+ * Visually we're performing the following
+ * (1,2,3,4 = memory types)...
+ * Sample memory map (w/overlaps):
+ * ____22__________________
+ * ______________________4_
+ * ____1111________________
+ * _44_____________________
+ * 11111111________________
+ * ____________________33__
+ * ___________44___________
+ * __________33333_________
+ * ______________22________
+ * ___________________2222_
+ * _________111111111______
+ * _____________________11_
+ * _________________4______
+ * Sanitized equivalent (no overlap):
+ * 1_______________________
+ * _44_____________________
+ * ___1____________________
+ * ____22__________________
+ * ______11________________
+ * _________1______________
+ * __________3_____________
+ * ___________44___________
+ * _____________33_________
+ * _______________2________
+ * ________________1_______
+ * _________________4______
+ * ___________________2____
+ * ____________________33__
+ * ______________________4_
+int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
+ struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+ static struct change_member change_point_list[2*E820_X_MAX] __initdata;
+ static struct change_member *change_point[2*E820_X_MAX] __initdata;
+ static struct e820entry *overlap_list[E820_X_MAX] __initdata;
+ static struct e820entry new_bios[E820_X_MAX] __initdata;
+ struct change_member *change_tmp;
+ unsigned long current_type, last_type;
+ unsigned long long last_addr;
+ int chgidx, still_changing;
+ int overlap_entries;
+ int new_bios_entry;
+ int old_nr, new_nr, chg_nr;
+ /* if there's only one memory region, don't bother */
+ if (*pnr_map == 1)
+ old_nr = *pnr_map;
+ BUG_ON(old_nr > max_nr_map);
+ /* bail out if we find any unreasonable addresses in bios map */
+ for (i = 0; i < old_nr; i++)
+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+ /* create pointers for initial change-point information (for sorting) */
+ for (i = 0; i < 2 * old_nr; i++)
+ change_point[i] = &change_point_list[i];
+ /* record all known change-points (starting and ending addresses),
+ omitting those that are for empty memory regions */
+ for (i = 0; i < old_nr; i++) {
+ if (biosmap[i].size != 0) {
+ change_point[chgidx]->addr = biosmap[i].addr;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ change_point[chgidx]->addr = biosmap[i].addr +
+ change_point[chgidx++]->pbios = &biosmap[i];
+ /* sort change-point list by memory addresses (low -> high) */
+ still_changing = 1;
+ while (still_changing) {
+ still_changing = 0;
+ for (i = 1; i < chg_nr; i++) {
+ unsigned long long curaddr, lastaddr;
+ unsigned long long curpbaddr, lastpbaddr;
+ curaddr = change_point[i]->addr;
+ lastaddr = change_point[i - 1]->addr;
+ curpbaddr = change_point[i]->pbios->addr;
+ lastpbaddr = change_point[i - 1]->pbios->addr;
+ * swap entries, when:
+ * curaddr > lastaddr or
+ * curaddr == lastaddr and curaddr == curpbaddr and
+ * lastaddr != lastpbaddr
+ if (curaddr < lastaddr ||
+ (curaddr == lastaddr && curaddr == curpbaddr &&
+ lastaddr != lastpbaddr)) {
+ change_tmp = change_point[i];
+ change_point[i] = change_point[i-1];
+ change_point[i-1] = change_tmp;
+ still_changing = 1;
+ /* create a new bios memory map, removing overlaps */
+ overlap_entries = 0; /* number of entries in the overlap table */
+ new_bios_entry = 0; /* index for creating new bios map entries */
+ last_type = 0; /* start with undefined memory type */
+ last_addr = 0; /* start with 0 as last starting address */
+ /* loop through change-points, determining affect on the new bios map */
+ for (chgidx = 0; chgidx < chg_nr; chgidx++) {
+ /* keep track of all overlapping bios entries */
+ if (change_point[chgidx]->addr ==
+ change_point[chgidx]->pbios->addr) {
+ * add map entry to overlap list (> 1 entry
+ * implies an overlap)
+ overlap_list[overlap_entries++] =
+ change_point[chgidx]->pbios;
+ * remove entry from list (order independent,
+ * so swap with last)
+ for (i = 0; i < overlap_entries; i++) {
+ if (overlap_list[i] ==
+ change_point[chgidx]->pbios)
+ overlap_list[overlap_entries-1];
+ overlap_entries--;
+ * if there are overlapping entries, decide which
+ * "type" to use (larger value takes precedence --
+ * 1=usable, 2,3,4,4+=unusable)
+ for (i = 0; i < overlap_entries; i++)
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+ * continue building up new bios map based on this
+ if (current_type != last_type) {
+ if (last_type != 0) {
+ new_bios[new_bios_entry].size =
+ change_point[chgidx]->addr - last_addr;
+ * move forward only if the new size
+ if (new_bios[new_bios_entry].size != 0)
+ * no more space left for new
+ if (++new_bios_entry >= max_nr_map)
+ if (current_type != 0) {
+ new_bios[new_bios_entry].addr =
+ change_point[chgidx]->addr;
+ new_bios[new_bios_entry].type = current_type;
+ last_addr = change_point[chgidx]->addr;
+ last_type = current_type;
+ /* retain count for new bios entries */
+ new_nr = new_bios_entry;
+ /* copy new bios mapping into original location */
+ memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
+ *pnr_map = new_nr;
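In practice sanitize_e820_map() runs in place over a firmware-supplied array and shrinks/updates the entry count through *pnr_map, e.g. (sketch; the field names follow the usual struct boot_params layout of this era):

    int nr = boot_params.e820_entries;

    sanitize_e820_map(boot_params.e820_map,
    		  ARRAY_SIZE(boot_params.e820_map), &nr);
    boot_params.e820_entries = nr;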
+static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
+ u64 start = biosmap->addr;
+ u64 size = biosmap->size;
+ u64 end = start + size;
+ u32 type = biosmap->type;
+ /* Overflow in 64 bits? Ignore the memory map. */
+ e820_add_region(start, size, type);
+ * Copy the BIOS e820 map into a safe place.
+ * Sanity-check it while we're at it..
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
+ /* Only one memory region (or negative)? Ignore it */
+ BUG_ON(nr_map < 1);
+ return __append_e820_map(biosmap, nr_map);
+static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+ u64 size, unsigned old_type,
+ unsigned new_type)
+ unsigned int i, x;
+ u64 real_updated_size = 0;
+ BUG_ON(old_type == new_type);
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+ for (i = 0; i < e820x->nr_map; i++) {
+ struct e820entry *ei = &e820x->map[i];
+ u64 final_start, final_end;
+ if (ei->type != old_type)
+ /* totally covered? */
+ if (ei->addr >= start &&
+ (ei->addr + ei->size) <= (start + size)) {
+ ei->type = new_type;
+ real_updated_size += ei->size;
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ x = e820x->nr_map;
+ if (x == ARRAY_SIZE(e820x->map)) {
+ printk(KERN_ERR "Too many memory map entries!\n");
+ e820x->map[x].addr = final_start;
+ e820x->map[x].size = final_end - final_start;
+ e820x->map[x].type = new_type;
+ real_updated_size += final_end - final_start;
+ if (ei->addr < final_start)
+ ei->addr = final_end;
+ ei->size -= final_end - final_start;
+ return real_updated_size;
+u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
+ unsigned new_type)
+ return e820_update_range_map(&e820, start, size, old_type, new_type);
+static u64 __init e820_update_range_saved(u64 start, u64 size,
+ unsigned old_type, unsigned new_type)
+ if (is_initial_xendomain())
+ return e820_update_range_map(&machine_e820,
+ phys_to_machine(start), size,
+ old_type, new_type);
+ return e820_update_range_map(&e820_saved, start, size, old_type,
+/* make e820 not cover the range */
+u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
+ u64 real_removed_size = 0;
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 final_start, final_end;
+ if (checktype && ei->type != old_type)
+ /* totally covered? */
+ if (ei->addr >= start &&
+ (ei->addr + ei->size) <= (start + size)) {
+ real_removed_size += ei->size;
+ memset(ei, 0, sizeof(struct e820entry));
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ real_removed_size += final_end - final_start;
+ ei->size -= final_end - final_start;
+ if (ei->addr < final_start)
+ ei->addr = final_end;
+ return real_removed_size;
+void __init update_e820(void)
+ nr_map = e820.nr_map;
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
+ e820.nr_map = nr_map;
+ printk(KERN_INFO "modified physical RAM map:\n");
+ e820_print_map("modified");
+static void __init update_e820_saved(void)
+ nr_map = e820_saved.nr_map;
+ if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
+ e820_saved.nr_map = nr_map;
+#define e820 machine_e820
+#define MAX_GAP_END 0x100000000ull
+ * Search for a gap in the e820 memory space from start_addr to end_addr.
+__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
+ unsigned long start_addr, unsigned long long end_addr)
+ unsigned long long last;
+ int i = e820.nr_map;
+ last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
+#ifdef CONFIG_X86_64
+ if (start_addr >= MAX_GAP_END)
+ last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
+ while (--i >= 0) {
+ unsigned long long start = e820.map[i].addr;
+ unsigned long long end = start + e820.map[i].size;
+ if (end < start_addr)
+ * Since "last" is at most 4GB, we know we'll
+ * fit in 32 bits if this condition is true
+ unsigned long gap = last - end;
+ if (gap >= *gapsize) {
+ * Search for the biggest gap in the low 32 bits of the e820
+ * memory space. We pass this space to PCI to assign MMIO resources
+ * for hotplug or unconfigured devices in.
+ * Hopefully the BIOS let enough space left.
+__init void e820_setup_gap(void)
+ unsigned long gapstart, gapsize, round;
+ gapstart = 0x10000000;
+ gapsize = 0x400000;
+ found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
+#ifdef CONFIG_X86_64
+ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
+ KERN_ERR "PCI: Unassigned devices with 32bit resource "
+ "registers may break!\n");
+ found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
+ * See how much we want to round up: start off with
+ * rounding to the next 1MB area.
+ while ((gapsize >> 4) > round)
+ /* Fun with two's complement */
+ pci_mem_start = (gapstart + round) & -round;
+ "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
+ pci_mem_start, gapstart, gapsize);
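The "fun with two's complement" is the standard align-up idiom: for a power-of-two round, -round has all bits at and above the alignment set, so (x + round) & -round always advances x to the next round boundary (unlike the usual (x + round - 1) & -round, which leaves an already-aligned x alone). Worked example:

    unsigned long round = 0x100000;		/* 1 MiB, a power of two */
    unsigned long start = (0x10234567UL + round) & -round;
    /* start == 0x10300000UL: advanced to the next 1 MiB boundary */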
2045 + * Because of the size limitation of struct boot_params, only first
2046 + * 128 E820 memory entries are passed to kernel via
2047 + * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
2048 + * linked list of struct setup_data, which is parsed here.
2050 +void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
2054 + struct e820entry *extmap;
2056 + entries = sdata->len / sizeof(struct e820entry);
2057 + map_len = sdata->len + sizeof(struct setup_data);
2058 + if (map_len > PAGE_SIZE)
2059 + sdata = early_ioremap(pa_data, map_len);
2060 + extmap = (struct e820entry *)(sdata->data);
2061 + __append_e820_map(extmap, entries);
2062 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
2063 + if (map_len > PAGE_SIZE)
2064 + early_iounmap(sdata, map_len);
2065 + printk(KERN_INFO "extended physical RAM map:\n");
2066 + e820_print_map("extended");
2069 +#if defined(CONFIG_X86_64) || \
2070 + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
2072 + * Find the ranges of physical addresses that do not correspond to
2073 + * e820 RAM areas and mark the corresponding pages as nosave for
2074 + * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
2076 + * This function requires the e820 map to be sorted and without any
2077 + * overlapping entries and assumes the first e820 area to be RAM.
2079 +void __init e820_mark_nosave_regions(unsigned long limit_pfn)
2082 + unsigned long pfn;
2084 + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
2085 + for (i = 1; i < e820.nr_map; i++) {
2086 + struct e820entry *ei = &e820.map[i];
2088 + if (pfn < PFN_UP(ei->addr))
2089 + register_nosave_region(pfn, PFN_UP(ei->addr));
2091 + pfn = PFN_DOWN(ei->addr + ei->size);
2092 + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
2093 + register_nosave_region(PFN_UP(ei->addr), pfn);
2095 + if (pfn >= limit_pfn)
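
The nosave loop above depends on the rounding directions of the PFN helpers; for reference, their standard <linux/pfn.h> definitions with a worked 4K-page example:

        #define PFN_UP(x)       (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
        #define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)

        /*
         * With PAGE_SHIFT == 12: PFN_DOWN(0x5fff) == 5, PFN_UP(0x5001) == 6,
         * so a partial page between two e820 entries is registered as
         * nosave rather than being treated as fully usable RAM.
         */
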
2103 + * Early reserved memory areas.
2105 +#define MAX_EARLY_RES 20
2112 +static struct early_res early_res[MAX_EARLY_RES] __initdata = {
2114 + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
2115 +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
2116 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
2118 +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
2120 + * But first pinch a few for the stack/trampoline stuff
2121 + * FIXME: Don't need the extra page at 4K, but need to fix
2122 + * trampoline before removing it. (see the GDT stuff)
2124 + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
2126 + * Has to be in very low memory so we can execute
2127 + * real-mode AP code.
2129 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
2135 +static int __init find_overlapped_early(u64 start, u64 end)
2138 + struct early_res *r;
2140 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2141 + r = &early_res[i];
2142 + if (end > r->start && start < r->end)
2150 + * Drop the i-th range from the early reservation map,
2151 + * by copying any higher ranges down one over it, and
2152 + * clearing what had been the last slot.
2154 +static void __init drop_range(int i)
2158 + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
2161 + memmove(&early_res[i], &early_res[i + 1],
2162 + (j - 1 - i) * sizeof(struct early_res));
2164 + early_res[j - 1].end = 0;
2168 + * Split any existing ranges that:
2169 + * 1) are marked 'overlap_ok', and
2170 + * 2) overlap with the stated range [start, end)
2171 + * into whatever portion (if any) of the existing range is entirely
2172 + * below or entirely above the stated range. Drop the portion
2173 + * of the existing range that overlaps with the stated range,
2174 + * which will allow the caller of this routine to then add that
2175 + * stated range without conflicting with any existing range.
2177 +static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
2180 + struct early_res *r;
2181 + u64 lower_start, lower_end;
2182 + u64 upper_start, upper_end;
2185 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2186 + r = &early_res[i];
2188 + /* Continue past non-overlapping ranges */
2189 + if (end <= r->start || start >= r->end)
2193 + * Leave non-ok overlaps as is; let caller
2194 + * panic "Overlapping early reservations"
2195 + * when it hits this overlap.
2197 + if (!r->overlap_ok)
2201 + * We have an ok overlap. We will drop it from the early
2202 + * reservation map, and add back in any non-overlapping
2203 + * portions (lower or upper) as separate, overlap_ok,
2204 + * non-overlapping ranges.
2207 + /* 1. Note any non-overlapping (lower or upper) ranges. */
2208 + strncpy(name, r->name, sizeof(name) - 1);
2210 + lower_start = lower_end = 0;
2211 + upper_start = upper_end = 0;
2212 + if (r->start < start) {
2213 + lower_start = r->start;
2214 + lower_end = start;
2216 + if (r->end > end) {
2217 + upper_start = end;
2218 + upper_end = r->end;
2221 + /* 2. Drop the original ok overlapping range */
2224 + i--; /* resume for-loop on copied down entry */
2226 + /* 3. Add back in any non-overlapping ranges. */
2228 + reserve_early_overlap_ok(lower_start, lower_end, name);
2230 + reserve_early_overlap_ok(upper_start, upper_end, name);
2234 +static void __init __reserve_early(u64 start, u64 end, char *name,
2238 + struct early_res *r;
2240 + i = find_overlapped_early(start, end);
2241 + if (i >= MAX_EARLY_RES)
2242 + panic("Too many early reservations");
2243 + r = &early_res[i];
2245 + panic("Overlapping early reservations "
2246 + "%llx-%llx %s to %llx-%llx %s\n",
2247 + start, end - 1, name?name:"", r->start,
2248 + r->end - 1, r->name);
2251 + r->overlap_ok = overlap_ok;
2253 + strncpy(r->name, name, sizeof(r->name) - 1);
2257 + * A few early reservations come here.
2259 + * The 'overlap_ok' in the name of this routine does -not- mean it
2260 + * is ok for these reservations to overlap an earlier reservation.
2261 + * Rather it means that it is ok for subsequent reservations to
2262 + * overlap this one.
2264 + * Use this entry point to reserve early ranges when you are doing
2265 + * so out of "Paranoia", reserving perhaps more memory than you need,
2266 + * just in case, and don't mind a subsequent overlapping reservation
2267 + * that is known to be needed.
2269 + * The drop_overlaps_that_are_ok() call here isn't really needed.
2270 + * It would be needed if we had two colliding 'overlap_ok'
2271 + * reservations, so that the second such would not panic on the
2272 + * overlap with the first. We don't have any such as of this
2273 + * writing, but might as well tolerate such if it happens in
2276 +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
2278 + drop_overlaps_that_are_ok(start, end);
2279 + __reserve_early(start, end, name, 1);
2283 + * Most early reservations come here.
2285 + * We first have drop_overlaps_that_are_ok() drop any pre-existing
2286 + * 'overlap_ok' ranges, so that we can then reserve this memory
2287 + * range without risk of panicking on an overlapping overlap_ok
2288 + * early reservation.
2290 +void __init reserve_early(u64 start, u64 end, char *name)
2292 + drop_overlaps_that_are_ok(start, end);
2293 + __reserve_early(start, end, name, 0);
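
A hypothetical sequence showing how the two entry points interact (addresses and names invented for illustration):

        /* Speculative wide reservation, safe to overlap later: */
        reserve_early_overlap_ok(0x200000, 0x600000, "EX");

        /*
         * A later hard reservation inside it does not panic:
         * drop_overlaps_that_are_ok() first replaces "EX" with the two
         * non-overlapping pieces [0x200000,0x300000) and [0x400000,0x600000),
         * then __reserve_early() records "NEW" with overlap_ok == 0.
         */
        reserve_early(0x300000, 0x400000, "NEW");
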
2296 +void __init free_early(u64 start, u64 end)
2298 + struct early_res *r;
2301 + i = find_overlapped_early(start, end);
2302 + r = &early_res[i];
2303 + if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
2304 + panic("free_early on not reserved area: %llx-%llx!",
2310 +void __init early_res_to_bootmem(u64 start, u64 end)
2313 + u64 final_start, final_end;
2316 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
2319 + printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
2320 + count, start, end);
2321 + for (i = 0; i < count; i++) {
2322 + struct early_res *r = &early_res[i];
2323 + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
2324 + r->start, r->end, r->name);
2325 + final_start = max(start, r->start);
2326 + final_end = min(end, r->end);
2327 + if (final_start >= final_end) {
2328 + printk(KERN_CONT "\n");
2331 + printk(KERN_CONT " ==> [%010llx - %010llx]\n",
2332 + final_start, final_end);
2333 + reserve_bootmem_generic(final_start, final_end - final_start,
2338 +/* Check for already reserved areas */
2339 +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
2342 + u64 addr = *addrp;
2344 + struct early_res *r;
2346 + i = find_overlapped_early(addr, addr + size);
2347 + r = &early_res[i];
2348 + if (i < MAX_EARLY_RES && r->end) {
2349 + *addrp = addr = round_up(r->end, align);
2356 +/* Check for already reserved areas */
2357 +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
2360 + u64 addr = *addrp, last;
2361 + u64 size = *sizep;
2364 + last = addr + size;
2365 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2366 + struct early_res *r = &early_res[i];
2367 + if (last > r->start && addr < r->start) {
2368 + size = r->start - addr;
2372 + if (last > r->end && addr < r->end) {
2373 + addr = round_up(r->end, align);
2374 + size = last - addr;
2378 + if (last <= r->end && addr >= r->start) {
2391 + * Find a free area with specified alignment in a specific range.
2393 +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
2397 + for (i = 0; i < e820.nr_map; i++) {
2398 + struct e820entry *ei = &e820.map[i];
2402 + if (ei->type != E820_RAM)
2404 + addr = round_up(ei->addr, align);
2405 + ei_last = ei->addr + ei->size;
2407 + addr = round_up(start, align);
2408 + if (addr >= ei_last)
2410 + while (bad_addr(&addr, size, align) && addr+size <= ei_last)
2412 + last = addr + size;
2413 + if (last > ei_last)
2423 + * Find next free range after *start
2425 +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
2429 + for (i = 0; i < e820.nr_map; i++) {
2430 + struct e820entry *ei = &e820.map[i];
2434 + if (ei->type != E820_RAM)
2436 + addr = round_up(ei->addr, align);
2437 + ei_last = ei->addr + ei->size;
2439 + addr = round_up(start, align);
2440 + if (addr >= ei_last)
2442 + *sizep = ei_last - addr;
2443 + while (bad_addr_size(&addr, sizep, align) &&
2444 + addr + *sizep <= ei_last)
2446 + last = addr + *sizep;
2447 + if (last > ei_last)
2456 + * Pre-allocate 4K and reserve it in the e820 map.
2458 +u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
2464 + unsigned int order = get_order(sizet);
2466 + if (is_initial_xendomain()) {
2467 + sizet = PAGE_SIZE << order;
2468 + if (align < PAGE_SIZE)
2469 + align = PAGE_SIZE;
2472 + for (start = startt; ; start += size) {
2473 + start = find_e820_area_size(start, &size, align);
2476 + if (size >= sizet)
2480 +#ifdef CONFIG_X86_32
2481 + if (start >= MAXMEM)
2483 + if (start + size > MAXMEM)
2484 + size = MAXMEM - start;
2487 + if ((start >> PAGE_SHIFT) >= xen_start_info->nr_pages)
2489 + if (PFN_UP(start + size) > xen_start_info->nr_pages)
2490 + size = ((u64)xen_start_info->nr_pages << PAGE_SHIFT) - start;
2493 + addr = round_down(start + size - sizet, align);
2497 + if (is_initial_xendomain()) {
2499 + unsigned long max_initmap_pfn;
2501 + max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base))
2502 + + xen_start_info->nr_pt_frames
2503 + + 1 + (1 << (19 - PAGE_SHIFT)),
2504 + 1UL << (22 - PAGE_SHIFT));
2505 +#ifdef CONFIG_X86_32
2506 + if ((addr >> PAGE_SHIFT)
2507 + < max(max_initmap_pfn, max_pfn_mapped))
2508 + rc = xen_create_contiguous_region((unsigned long)
2512 + if ((addr >> PAGE_SHIFT) < max_pfn_mapped)
2513 + rc = xen_create_contiguous_region((unsigned long)
2516 + else if ((addr >> PAGE_SHIFT) < max_initmap_pfn)
2517 + rc = xen_create_contiguous_region(__START_KERNEL_map
2522 + rc = early_create_contiguous_region(addr >> PAGE_SHIFT,
2528 + e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
2529 + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
2530 + printk(KERN_INFO "update e820 for early_reserve_e820\n");
2532 + update_e820_saved();
2537 +#ifdef CONFIG_X86_32
2538 +# ifdef CONFIG_X86_PAE
2539 +# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT))
2541 +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
2543 +#else /* CONFIG_X86_32 */
2544 +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
2548 + * Find the highest page frame number we have available
2550 +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
2553 + unsigned long last_pfn = 0;
2554 + unsigned long max_arch_pfn = MAX_ARCH_PFN;
2556 + for (i = 0; i < e820.nr_map; i++) {
2557 + struct e820entry *ei = &e820.map[i];
2558 + unsigned long start_pfn;
2559 + unsigned long end_pfn;
2561 + if (ei->type != type)
2564 + start_pfn = ei->addr >> PAGE_SHIFT;
2565 + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
2567 + if (start_pfn >= limit_pfn)
2569 + if (end_pfn > limit_pfn) {
2570 + last_pfn = limit_pfn;
2573 + if (end_pfn > last_pfn)
2574 + last_pfn = end_pfn;
2577 + if (last_pfn > max_arch_pfn)
2578 + last_pfn = max_arch_pfn;
2580 + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
2581 + last_pfn, max_arch_pfn);
2584 +unsigned long __init e820_end_of_ram_pfn(void)
2586 + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
2589 +unsigned long __init e820_end_of_low_ram_pfn(void)
2591 + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
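
The limit argument is a page frame number; a hypothetical helper spelling out why `1UL << (32 - PAGE_SHIFT)` is the PFN of the first byte above 4GB:

        /* Hypothetical helper, for illustration only. */
        static inline unsigned long pfn_limit_4g(void)
        {
                /* PAGE_SHIFT == 12 -> 0x100000 frames * 4K == 4GB */
                return 1UL << (32 - PAGE_SHIFT);
        }
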
2594 + * Finds an active region in the address range from start_pfn to last_pfn and
2595 + * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
2597 +int __init e820_find_active_region(const struct e820entry *ei,
2598 + unsigned long start_pfn,
2599 + unsigned long last_pfn,
2600 + unsigned long *ei_startpfn,
2601 + unsigned long *ei_endpfn)
2603 + u64 align = PAGE_SIZE;
2605 + *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
2606 + *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
2608 + /* Skip map entries smaller than a page */
2609 + if (*ei_startpfn >= *ei_endpfn)
2612 + /* Skip if map is outside the node */
2613 + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
2614 + *ei_startpfn >= last_pfn)
2617 + /* Check for overlaps */
2618 + if (*ei_startpfn < start_pfn)
2619 + *ei_startpfn = start_pfn;
2620 + if (*ei_endpfn > last_pfn)
2621 + *ei_endpfn = last_pfn;
2626 +/* Walk the e820 map and register active regions within a node */
2627 +void __init e820_register_active_regions(int nid, unsigned long start_pfn,
2628 + unsigned long last_pfn)
2630 + unsigned long ei_startpfn;
2631 + unsigned long ei_endpfn;
2634 + for (i = 0; i < e820.nr_map; i++)
2635 + if (e820_find_active_region(&e820.map[i],
2636 + start_pfn, last_pfn,
2637 + &ei_startpfn, &ei_endpfn))
2638 + add_active_range(nid, ei_startpfn, ei_endpfn);
2642 + * Find the hole size (in bytes) in the memory range.
2643 + * @start: starting address of the memory range to scan
2644 + * @end: ending address of the memory range to scan
2646 +u64 __init e820_hole_size(u64 start, u64 end)
2648 + unsigned long start_pfn = start >> PAGE_SHIFT;
2649 + unsigned long last_pfn = end >> PAGE_SHIFT;
2650 + unsigned long ei_startpfn, ei_endpfn, ram = 0;
2653 + for (i = 0; i < e820.nr_map; i++) {
2654 + if (e820_find_active_region(&e820.map[i],
2655 + start_pfn, last_pfn,
2656 + &ei_startpfn, &ei_endpfn))
2657 + ram += ei_endpfn - ei_startpfn;
2659 + return end - start - ((u64)ram << PAGE_SHIFT);
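
A worked example of the hole arithmetic, assuming a hypothetical map with a single E820_RAM entry covering [1MB, 9MB):

        /*
         * ram  = (9MB - 1MB) >> PAGE_SHIFT       = 2048 pages
         * hole = 16MB - 0 - (2048 << PAGE_SHIFT) = 8MB
         */
        u64 hole = e820_hole_size(0, 16 << 20); /* 8MB with the map above */
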
2662 +static void early_panic(char *msg)
2664 + early_printk(msg);
2668 +static int userdef __initdata;
2670 +/* "mem=nopentium" disables the 4MB page tables. */
2671 +static int __init parse_memopt(char *p)
2673 + u64 mem_size, current_end;
2679 +#ifdef CONFIG_X86_32
2680 + if (!strcmp(p, "nopentium")) {
2681 + setup_clear_cpu_cap(X86_FEATURE_PSE);
2687 + mem_size = memparse(p, &p);
2688 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2690 + i = e820.nr_map - 1;
2691 + current_end = e820.map[i].addr + e820.map[i].size;
2692 + if (current_end < mem_size) {
2694 + * The e820 map ends before our requested size so
2695 + * extend the final entry to the requested address.
2697 + if (e820.map[i].type == E820_RAM)
2698 + e820.map[i].size = mem_size - e820.map[i].addr;
2700 + e820_add_region(current_end, mem_size - current_end, E820_RAM);
2705 +early_param("mem", parse_memopt);
2708 +static int __init parse_memmap_opt(char *p)
2711 + u64 start_at, mem_size;
2716 + if (!strncmp(p, "exactmap", 8)) {
2717 +#ifdef CONFIG_CRASH_DUMP
2719 + * If we are doing a crash dump, we still need to know
2720 + * the real mem size before original memory map is
2723 + saved_max_pfn = e820_end_of_ram_pfn();
2731 + mem_size = memparse(p, &p);
2737 + start_at = memparse(p+1, &p);
2738 + e820_add_region(start_at, mem_size, E820_RAM);
2739 + } else if (*p == '#') {
2740 + start_at = memparse(p+1, &p);
2741 + e820_add_region(start_at, mem_size, E820_ACPI);
2742 + } else if (*p == '$') {
2743 + start_at = memparse(p+1, &p);
2744 + e820_add_region(start_at, mem_size, E820_RESERVED);
2746 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2748 + return *p == '\0' ? 0 : -EINVAL;
2750 +early_param("memmap", parse_memmap_opt);
2753 +void __init finish_e820_parsing(void)
2756 + int nr = e820.nr_map;
2758 + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
2759 + early_panic("Invalid user supplied memory map");
2762 + printk(KERN_INFO "user-defined physical RAM map:\n");
2763 + e820_print_map("user");
2767 +static inline const char *e820_type_to_string(int e820_type)
2769 + switch (e820_type) {
2770 + case E820_RESERVED_KERN:
2771 + case E820_RAM: return "System RAM";
2772 + case E820_ACPI: return "ACPI Tables";
2773 + case E820_NVS: return "ACPI Non-volatile Storage";
2774 + default: return "reserved";
2779 +#define e820 machine_e820
2783 + * Mark e820 reserved areas as busy for the resource manager.
2785 +void __init e820_reserve_resources(void)
2788 + struct resource *res;
2791 + res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
2792 + for (i = 0; i < e820.nr_map; i++) {
2793 + end = e820.map[i].addr + e820.map[i].size - 1;
2794 +#ifndef CONFIG_RESOURCES_64BIT
2795 + if (end > 0x100000000ULL) {
2800 + res->name = e820_type_to_string(e820.map[i].type);
2801 + res->start = e820.map[i].addr;
2804 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
2805 + insert_resource(&iomem_resource, res);
2809 + for (i = 0; i < e820_saved.nr_map; i++) {
2810 + struct e820entry *entry = &e820_saved.map[i];
2811 + firmware_map_add_early(entry->addr,
2812 + entry->addr + entry->size - 1,
2813 + e820_type_to_string(entry->type));
2820 +char *__init default_machine_specific_memory_setup(void)
2822 + char *who = "BIOS-e820";
2825 + * Try to copy the BIOS-supplied E820-map.
2827 + * Otherwise fake a memory map; one section from 0k->640k,
2828 + * the next section from 1mb->appropriate_mem_k
2830 + new_nr = boot_params.e820_entries;
2831 + sanitize_e820_map(boot_params.e820_map,
2832 + ARRAY_SIZE(boot_params.e820_map),
2834 + boot_params.e820_entries = new_nr;
2835 + if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
2839 + /* compare results from other methods and take the greater */
2840 + if (boot_params.alt_mem_k
2841 + < boot_params.screen_info.ext_mem_k) {
2842 + mem_size = boot_params.screen_info.ext_mem_k;
2845 + mem_size = boot_params.alt_mem_k;
2846 + who = "BIOS-e801";
2850 + e820_add_region(0, LOWMEMSIZE(), E820_RAM);
2851 + e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
2854 + /* In case someone cares... */
2858 +char *__init __attribute__((weak)) machine_specific_memory_setup(void)
2860 + if (x86_quirks->arch_memory_setup) {
2861 + char *who = x86_quirks->arch_memory_setup();
2866 + return default_machine_specific_memory_setup();
2870 +char * __init memory_setup(void)
2873 + struct xen_memory_map memmap;
2875 + * This is rather large for a stack variable but this early in
2876 + * the boot process we know we have plenty of slack space.
2878 + struct e820entry map[E820MAX];
2880 + memmap.nr_entries = E820MAX;
2881 + set_xen_guest_handle(memmap.buffer, map);
2883 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
2884 + if (rc == -ENOSYS) {
2885 + memmap.nr_entries = 1;
2886 + map[0].addr = 0ULL;
2887 + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
2888 + /* 8MB slack (to balance backend allocations). */
2889 + map[0].size += 8ULL << 20;
2890 + map[0].type = E820_RAM;
2895 + nr_map = memmap.nr_entries;
2896 + sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
2898 + if (append_e820_map(map, nr_map) < 0)
2902 + if (is_initial_xendomain()) {
2903 + memmap.nr_entries = E820MAX;
2904 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
2906 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
2908 + machine_e820.nr_map = memmap.nr_entries;
2915 +void __init setup_memory_map(void)
2919 + who = memory_setup();
2921 + if (!is_initial_xendomain())
2923 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
2924 + printk(KERN_INFO "Xen-provided physical RAM map:\n");
2925 + e820_print_map(who);
2927 --- sle11-2009-10-16.orig/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:38:05.000000000 +0100
2928 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
2930 -#include <linux/kernel.h>
2931 -#include <linux/types.h>
2932 -#include <linux/init.h>
2933 -#include <linux/bootmem.h>
2934 -#include <linux/ioport.h>
2935 -#include <linux/string.h>
2936 -#include <linux/kexec.h>
2937 -#include <linux/module.h>
2938 -#include <linux/mm.h>
2939 -#include <linux/pfn.h>
2940 -#include <linux/uaccess.h>
2941 -#include <linux/suspend.h>
2943 -#include <asm/pgtable.h>
2944 -#include <asm/page.h>
2945 -#include <asm/e820.h>
2946 -#include <asm/setup.h>
2947 -#include <xen/interface/memory.h>
2949 -struct e820map e820;
2950 -struct change_member {
2951 - struct e820entry *pbios; /* pointer to original bios entry */
2952 - unsigned long long addr; /* address for this change point */
2954 -static struct change_member change_point_list[2*E820MAX] __initdata;
2955 -static struct change_member *change_point[2*E820MAX] __initdata;
2956 -static struct e820entry *overlap_list[E820MAX] __initdata;
2957 -static struct e820entry new_bios[E820MAX] __initdata;
2958 -/* For PCI or other memory-mapped resources */
2959 -unsigned long pci_mem_start = 0x10000000;
2961 -EXPORT_SYMBOL(pci_mem_start);
2963 -extern int user_defined_memmap;
2965 -static struct resource system_rom_resource = {
2966 - .name = "System ROM",
2969 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2972 -static struct resource extension_rom_resource = {
2973 - .name = "Extension ROM",
2976 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2979 -static struct resource adapter_rom_resources[] = { {
2980 - .name = "Adapter ROM",
2983 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2985 - .name = "Adapter ROM",
2988 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2990 - .name = "Adapter ROM",
2993 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2995 - .name = "Adapter ROM",
2998 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3000 - .name = "Adapter ROM",
3003 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3005 - .name = "Adapter ROM",
3008 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3011 -static struct resource video_rom_resource = {
3012 - .name = "Video ROM",
3015 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3018 -#define ROMSIGNATURE 0xaa55
3020 -static int __init romsignature(const unsigned char *rom)
3022 - const unsigned short * const ptr = (const unsigned short *)rom;
3023 - unsigned short sig;
3025 - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
3028 -static int __init romchecksum(const unsigned char *rom, unsigned long length)
3030 - unsigned char sum, c;
3032 - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
3034 - return !length && !sum;
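
romchecksum() accepts a ROM only if all its bytes sum to zero modulo 256; an equivalent plain-C sketch without the fault-safe probe_kernel_address() accessor:

        /* Illustrative equivalent (no fault handling). */
        static int checksum_ok(const unsigned char *rom, unsigned long length)
        {
                unsigned char sum = 0;

                while (length--)
                        sum += *rom++;
                return sum == 0;        /* valid option ROMs sum to 0 mod 256 */
        }
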
3037 -static void __init probe_roms(void)
3039 - const unsigned char *rom;
3040 - unsigned long start, length, upper;
3045 - /* Nothing to do if not running in dom0. */
3046 - if (!is_initial_xendomain())
3051 - upper = adapter_rom_resources[0].start;
3052 - for (start = video_rom_resource.start; start < upper; start += 2048) {
3053 - rom = isa_bus_to_virt(start);
3054 - if (!romsignature(rom))
3057 - video_rom_resource.start = start;
3059 - if (probe_kernel_address(rom + 2, c) != 0)
3062 - /* 0 < length <= 0x7f * 512, historically */
3065 - /* if checksum okay, trust length byte */
3066 - if (length && romchecksum(rom, length))
3067 - video_rom_resource.end = start + length - 1;
3069 - request_resource(&iomem_resource, &video_rom_resource);
3073 - start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
3074 - if (start < upper)
3078 - request_resource(&iomem_resource, &system_rom_resource);
3079 - upper = system_rom_resource.start;
3081 - /* check for extension rom (ignore length byte!) */
3082 - rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
3083 - if (romsignature(rom)) {
3084 - length = extension_rom_resource.end - extension_rom_resource.start + 1;
3085 - if (romchecksum(rom, length)) {
3086 - request_resource(&iomem_resource, &extension_rom_resource);
3087 - upper = extension_rom_resource.start;
3091 - /* check for adapter roms on 2k boundaries */
3092 - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
3093 - rom = isa_bus_to_virt(start);
3094 - if (!romsignature(rom))
3097 - if (probe_kernel_address(rom + 2, c) != 0)
3100 - /* 0 < length <= 0x7f * 512, historically */
3103 - /* but accept any length that fits if checksum okay */
3104 - if (!length || start + length > upper || !romchecksum(rom, length))
3107 - adapter_rom_resources[i].start = start;
3108 - adapter_rom_resources[i].end = start + length - 1;
3109 - request_resource(&iomem_resource, &adapter_rom_resources[i]);
3111 - start = adapter_rom_resources[i++].end & ~2047UL;
3116 -static struct e820map machine_e820;
3117 -#define e820 machine_e820
3121 - * Request address space for all standard RAM and ROM resources
3122 - * and also for regions reported as reserved by the e820.
3124 -void __init init_iomem_resources(struct resource *code_resource,
3125 - struct resource *data_resource,
3126 - struct resource *bss_resource)
3131 - for (i = 0; i < e820.nr_map; i++) {
3132 - struct resource *res;
3133 -#ifndef CONFIG_RESOURCES_64BIT
3134 - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
3137 - res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
3138 - switch (e820.map[i].type) {
3139 - case E820_RAM: res->name = "System RAM"; break;
3140 - case E820_ACPI: res->name = "ACPI Tables"; break;
3141 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
3142 - default: res->name = "reserved";
3144 - res->start = e820.map[i].addr;
3145 - res->end = res->start + e820.map[i].size - 1;
3146 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3147 - if (request_resource(&iomem_resource, res)) {
3151 - if (e820.map[i].type == E820_RAM) {
3153 - * We don't know which RAM region contains kernel data,
3154 - * so we try it repeatedly and let the resource manager
3158 - request_resource(res, code_resource);
3159 - request_resource(res, data_resource);
3160 - request_resource(res, bss_resource);
3162 -#ifdef CONFIG_KEXEC
3163 - if (crashk_res.start != crashk_res.end)
3164 - request_resource(res, &crashk_res);
3166 - xen_machine_kexec_register_resources(res);
3175 -#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
3177 - * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
3178 - * correspond to e820 RAM areas and mark the corresponding pages as nosave for
3181 - * This function requires the e820 map to be sorted and without any
3182 - * overlapping entries and assumes the first e820 area to be RAM.
3184 -void __init e820_mark_nosave_regions(void)
3187 - unsigned long pfn;
3189 - pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
3190 - for (i = 1; i < e820.nr_map; i++) {
3191 - struct e820entry *ei = &e820.map[i];
3193 - if (pfn < PFN_UP(ei->addr))
3194 - register_nosave_region(pfn, PFN_UP(ei->addr));
3196 - pfn = PFN_DOWN(ei->addr + ei->size);
3197 - if (ei->type != E820_RAM)
3198 - register_nosave_region(PFN_UP(ei->addr), pfn);
3200 - if (pfn >= max_low_pfn)
3206 -void __init add_memory_region(unsigned long long start,
3207 - unsigned long long size, int type)
3213 - if (x == E820MAX) {
3214 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
3218 - e820.map[x].addr = start;
3219 - e820.map[x].size = size;
3220 - e820.map[x].type = type;
3222 -} /* add_memory_region */
3225 - * Sanitize the BIOS e820 map.
3227 - * Some e820 responses include overlapping entries. The following
3228 - * replaces the original e820 map with a new one, removing overlaps.
3231 -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
3233 - struct change_member *change_tmp;
3234 - unsigned long current_type, last_type;
3235 - unsigned long long last_addr;
3236 - int chgidx, still_changing;
3237 - int overlap_entries;
3238 - int new_bios_entry;
3239 - int old_nr, new_nr, chg_nr;
3243 - Visually we're performing the following (1,2,3,4 = memory types)...
3245 - Sample memory map (w/overlaps):
3246 - ____22__________________
3247 - ______________________4_
3248 - ____1111________________
3249 - _44_____________________
3250 - 11111111________________
3251 - ____________________33__
3252 - ___________44___________
3253 - __________33333_________
3254 - ______________22________
3255 - ___________________2222_
3256 - _________111111111______
3257 - _____________________11_
3258 - _________________4______
3260 - Sanitized equivalent (no overlap):
3261 - 1_______________________
3262 - _44_____________________
3263 - ___1____________________
3264 - ____22__________________
3265 - ______11________________
3266 - _________1______________
3267 - __________3_____________
3268 - ___________44___________
3269 - _____________33_________
3270 - _______________2________
3271 - ________________1_______
3272 - _________________4______
3273 - ___________________2____
3274 - ____________________33__
3275 - ______________________4_
3277 - /* if there's only one memory region, don't bother */
3278 - if (*pnr_map < 2) {
3282 - old_nr = *pnr_map;
3284 - /* bail out if we find any unreasonable addresses in bios map */
3285 - for (i=0; i<old_nr; i++)
3286 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
3290 - /* create pointers for initial change-point information (for sorting) */
3291 - for (i=0; i < 2*old_nr; i++)
3292 - change_point[i] = &change_point_list[i];
3294 - /* record all known change-points (starting and ending addresses),
3295 - omitting those that are for empty memory regions */
3297 - for (i=0; i < old_nr; i++) {
3298 - if (biosmap[i].size != 0) {
3299 - change_point[chgidx]->addr = biosmap[i].addr;
3300 - change_point[chgidx++]->pbios = &biosmap[i];
3301 - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
3302 - change_point[chgidx++]->pbios = &biosmap[i];
3305 - chg_nr = chgidx; /* true number of change-points */
3307 - /* sort change-point list by memory addresses (low -> high) */
3308 - still_changing = 1;
3309 - while (still_changing) {
3310 - still_changing = 0;
3311 - for (i=1; i < chg_nr; i++) {
3312 - /* if <current_addr> > <last_addr>, swap */
3313 - /* or, if current=<start_addr> & last=<end_addr>, swap */
3314 - if ((change_point[i]->addr < change_point[i-1]->addr) ||
3315 - ((change_point[i]->addr == change_point[i-1]->addr) &&
3316 - (change_point[i]->addr == change_point[i]->pbios->addr) &&
3317 - (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
3320 - change_tmp = change_point[i];
3321 - change_point[i] = change_point[i-1];
3322 - change_point[i-1] = change_tmp;
3328 - /* create a new bios memory map, removing overlaps */
3329 - overlap_entries=0; /* number of entries in the overlap table */
3330 - new_bios_entry=0; /* index for creating new bios map entries */
3331 - last_type = 0; /* start with undefined memory type */
3332 - last_addr = 0; /* start with 0 as last starting address */
3333 - /* loop through change-points, determining effect on the new bios map */
3334 - for (chgidx=0; chgidx < chg_nr; chgidx++)
3336 - /* keep track of all overlapping bios entries */
3337 - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
3339 - /* add map entry to overlap list (> 1 entry implies an overlap) */
3340 - overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
3344 - /* remove entry from list (order independent, so swap with last) */
3345 - for (i=0; i<overlap_entries; i++)
3347 - if (overlap_list[i] == change_point[chgidx]->pbios)
3348 - overlap_list[i] = overlap_list[overlap_entries-1];
3350 - overlap_entries--;
3352 - /* if there are overlapping entries, decide which "type" to use */
3353 - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
3355 - for (i=0; i<overlap_entries; i++)
3356 - if (overlap_list[i]->type > current_type)
3357 - current_type = overlap_list[i]->type;
3358 - /* continue building up new bios map based on this information */
3359 - if (current_type != last_type) {
3360 - if (last_type != 0) {
3361 - new_bios[new_bios_entry].size =
3362 - change_point[chgidx]->addr - last_addr;
3363 - /* move forward only if the new size was non-zero */
3364 - if (new_bios[new_bios_entry].size != 0)
3365 - if (++new_bios_entry >= E820MAX)
3366 - break; /* no more space left for new bios entries */
3368 - if (current_type != 0) {
3369 - new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
3370 - new_bios[new_bios_entry].type = current_type;
3371 - last_addr=change_point[chgidx]->addr;
3373 - last_type = current_type;
3376 - new_nr = new_bios_entry; /* retain count for new bios entries */
3378 - /* copy new bios mapping into original location */
3379 - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
3380 - *pnr_map = new_nr;
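
A small worked example of the sanitizer, using a hypothetical input map with two overlapping entries; the higher-valued type wins inside the overlap:

        /*
         *   in:  0x0000-0x3000 type 1 (usable RAM)
         *        0x2000-0x4000 type 2 (reserved)
         *
         *   out: 0x0000-0x2000 type 1 (usable RAM)
         *        0x2000-0x4000 type 2 (reserved)
         */
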
3386 - * Copy the BIOS e820 map into a safe place.
3388 - * Sanity-check it while we're at it.
3390 - * If we're lucky and live on a modern system, the setup code
3391 - * will have given us a memory map that we can use to properly
3392 - * set up memory. If we aren't, we'll fake a memory map.
3394 - * We check to see that the memory map contains at least 2 elements
3395 - * before we'll use it, because the detection code in setup.S may
3396 - * not be perfect and almost every PC known to man has two memory
3397 - * regions: one from 0 to 640k, and one from 1mb up. (The IBM
3398 - * ThinkPad 560X, for example, does not cooperate with the memory
3399 - * detection code.)
3401 -int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
3404 - /* Only one memory region (or negative)? Ignore it */
3408 - BUG_ON(nr_map < 1);
3412 - u64 start = biosmap->addr;
3413 - u64 size = biosmap->size;
3414 - u64 end = start + size;
3415 - u32 type = biosmap->type;
3417 - /* Overflow in 64 bits? Ignore the memory map. */
3421 - add_memory_region(start, size, type);
3422 - } while (biosmap++, --nr_map);
3425 - if (is_initial_xendomain()) {
3426 - struct xen_memory_map memmap;
3428 - memmap.nr_entries = E820MAX;
3429 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3431 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3433 - machine_e820.nr_map = memmap.nr_entries;
3435 - machine_e820 = e820;
3442 - * Find the highest page frame number we have available
3444 -void __init propagate_e820_map(void)
3450 - for (i = 0; i < e820.nr_map; i++) {
3451 - unsigned long start, end;
3453 - if (e820.map[i].type != E820_RAM)
3455 - start = PFN_UP(e820.map[i].addr);
3456 - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3459 - if (end > max_pfn)
3461 - memory_present(0, start, end);
3466 - * Register fully available low RAM pages with the bootmem allocator.
3468 -void __init register_bootmem_low_pages(unsigned long max_low_pfn)
3472 - for (i = 0; i < e820.nr_map; i++) {
3473 - unsigned long curr_pfn, last_pfn, size;
3475 - * Reserve usable low memory
3477 - if (e820.map[i].type != E820_RAM)
3480 - * We are rounding up the start address of usable memory:
3482 - curr_pfn = PFN_UP(e820.map[i].addr);
3483 - if (curr_pfn >= max_low_pfn)
3486 - * ... and at the end of the usable range downwards:
3488 - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3492 - * Truncate to the number of actual pages currently
3495 - if (last_pfn > xen_start_info->nr_pages)
3496 - last_pfn = xen_start_info->nr_pages;
3499 - if (last_pfn > max_low_pfn)
3500 - last_pfn = max_low_pfn;
3503 - * .. finally, did all the rounding and playing
3504 - * around just make the area go away?
3506 - if (last_pfn <= curr_pfn)
3509 - size = last_pfn - curr_pfn;
3510 - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
3514 -void __init e820_register_memory(void)
3516 - unsigned long gapstart, gapsize, round;
3517 - unsigned long long last;
3521 - if (is_initial_xendomain()) {
3522 - struct xen_memory_map memmap;
3524 - memmap.nr_entries = E820MAX;
3525 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3527 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3529 - machine_e820.nr_map = memmap.nr_entries;
3532 - machine_e820 = e820;
3533 -#define e820 machine_e820
3537 - * Search for the biggest gap in the low 32 bits of the e820
3540 - last = 0x100000000ull;
3541 - gapstart = 0x10000000;
3542 - gapsize = 0x400000;
3544 - while (--i >= 0) {
3545 - unsigned long long start = e820.map[i].addr;
3546 - unsigned long long end = start + e820.map[i].size;
3549 - * Since "last" is at most 4GB, we know we'll
3550 - * fit in 32 bits if this condition is true
3553 - unsigned long gap = last - end;
3555 - if (gap > gapsize) {
3566 - * See how much we want to round up: start off with
3567 - * rounding to the next 1MB area.
3570 - while ((gapsize >> 4) > round)
3572 - /* Fun with two's complement */
3573 - pci_mem_start = (gapstart + round) & -round;
3575 - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
3576 - pci_mem_start, gapstart, gapsize);
3579 -void __init print_memory_map(char *who)
3583 - for (i = 0; i < e820.nr_map; i++) {
3584 - printk(" %s: %016Lx - %016Lx ", who,
3586 - e820.map[i].addr + e820.map[i].size);
3587 - switch (e820.map[i].type) {
3588 - case E820_RAM: printk("(usable)\n");
3590 - case E820_RESERVED:
3591 - printk("(reserved)\n");
3594 - printk("(ACPI data)\n");
3597 - printk("(ACPI NVS)\n");
3599 - default: printk("type %u\n", e820.map[i].type);
3605 -void __init limit_regions(unsigned long long size)
3607 - unsigned long long current_addr = 0;
3610 - print_memory_map("limit_regions start");
3611 - for (i = 0; i < e820.nr_map; i++) {
3612 - current_addr = e820.map[i].addr + e820.map[i].size;
3613 - if (current_addr < size)
3616 - if (e820.map[i].type != E820_RAM)
3619 - if (e820.map[i].addr >= size) {
3621 - * This region starts past the end of the
3622 - * requested size, skip it completely.
3626 - e820.nr_map = i + 1;
3627 - e820.map[i].size -= current_addr - size;
3629 - print_memory_map("limit_regions endfor");
3633 - if (current_addr < size) {
3635 - * The e820 map finished before our requested size so
3636 - * extend the final entry to the requested address.
3639 - if (e820.map[i].type == E820_RAM)
3640 - e820.map[i].size -= current_addr - size;
3642 - add_memory_region(current_addr, size - current_addr, E820_RAM);
3645 - print_memory_map("limit_regions endfunc");
3649 - * This function checks if any part of the range <start,end> is mapped
3653 -e820_any_mapped(u64 start, u64 end, unsigned type)
3658 - for (i = 0; i < e820.nr_map; i++) {
3659 - const struct e820entry *ei = &e820.map[i];
3661 - if (!is_initial_xendomain())
3663 - for (i = 0; i < machine_e820.nr_map; ++i) {
3664 - const struct e820entry *ei = &machine_e820.map[i];
3667 - if (type && ei->type != type)
3669 - if (ei->addr >= end || ei->addr + ei->size <= start)
3675 -EXPORT_SYMBOL_GPL(e820_any_mapped);
3678 - * This function checks if the entire range <start,end> is mapped with type.
3680 - * Note: this function only works correctly if the e820 table is sorted
3681 - * and non-overlapping, which is the case
3684 -e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
3691 - for (i = 0; i < e820.nr_map; i++) {
3692 - struct e820entry *ei = &e820.map[i];
3694 - if (!is_initial_xendomain())
3696 - for (i = 0; i < machine_e820.nr_map; ++i) {
3697 - const struct e820entry *ei = &machine_e820.map[i];
3700 - if (type && ei->type != type)
3702 - /* does the region (or part of it) overlap the current region? */
3703 - if (ei->addr >= end || ei->addr + ei->size <= start)
3705 - /* if the region is at the beginning of <start,end> we move
3706 - * start to the end of the region since it's ok until there
3708 - if (ei->addr <= start)
3709 - start = ei->addr + ei->size;
3710 - /* if start is now at or beyond end, we're done, full
3713 - return 1; /* we're done */
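
A concrete illustration of the start-advancing check, assuming a hypothetical map with RAM at [0, 640K) and [1MB, 256MB):

        /*
         * e820_all_mapped(0, 0xA0000, E820_RAM)  -> 1: the first entry
         *     advances start to 640K, which reaches end.
         * e820_all_mapped(0, 0x100000, E820_RAM) -> 0: the legacy hole
         *     [640K, 1MB) is never covered; start stays at 640K < end.
         */
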
3718 -static int __init parse_memmap(char *arg)
3723 - if (strcmp(arg, "exactmap") == 0) {
3724 -#ifdef CONFIG_CRASH_DUMP
3725 - /* If we are doing a crash dump, we
3726 - * still need to know the real mem
3727 - * size before original memory map is
3730 - propagate_e820_map();
3731 - saved_max_pfn = max_pfn;
3734 - user_defined_memmap = 1;
3736 - /* If the user specifies memory size, we
3737 - * limit the BIOS-provided memory map to
3738 - * that size. exactmap can be used to specify
3739 - * the exact map. mem=number can be used to
3740 - * trim the existing memory map.
3742 - unsigned long long start_at, mem_size;
3744 - mem_size = memparse(arg, &arg);
3745 - if (*arg == '@') {
3746 - start_at = memparse(arg+1, &arg);
3747 - add_memory_region(start_at, mem_size, E820_RAM);
3748 - } else if (*arg == '#') {
3749 - start_at = memparse(arg+1, &arg);
3750 - add_memory_region(start_at, mem_size, E820_ACPI);
3751 - } else if (*arg == '$') {
3752 - start_at = memparse(arg+1, &arg);
3753 - add_memory_region(start_at, mem_size, E820_RESERVED);
3755 - limit_regions(mem_size);
3756 - user_defined_memmap = 1;
3761 -early_param("memmap", parse_memmap);
3764 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
3765 - unsigned new_type)
3769 - BUG_ON(old_type == new_type);
3771 - for (i = 0; i < e820.nr_map; i++) {
3772 - struct e820entry *ei = &e820.map[i];
3773 - u64 final_start, final_end;
3774 - if (ei->type != old_type)
3776 - /* totally covered? */
3777 - if (ei->addr >= start && ei->size <= size) {
3778 - ei->type = new_type;
3781 - /* partially covered */
3782 - final_start = max(start, ei->addr);
3783 - final_end = min(start + size, ei->addr + ei->size);
3784 - if (final_start >= final_end)
3786 - add_memory_region(final_start, final_end - final_start,
3791 -void __init update_e820(void)
3795 - nr_map = e820.nr_map;
3796 - if (sanitize_e820_map(e820.map, &nr_map))
3798 - e820.nr_map = nr_map;
3799 - printk(KERN_INFO "modified physical RAM map:\n");
3800 - print_memory_map("modified");
3803 --- sle11-2009-10-16.orig/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:38:05.000000000 +0100
3804 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3807 - * Handle the memory map.
3808 - * The functions here do the job until bootmem takes over.
3810 - * Getting sanitize_e820_map() in sync with i386 version by applying change:
3811 - * - Provisions for empty E820 memory regions (reported by certain BIOSes).
3812 - * Alex Achenbach <xela@slit.de>, December 2002.
3813 - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
3816 -#include <linux/kernel.h>
3817 -#include <linux/types.h>
3818 -#include <linux/init.h>
3819 -#include <linux/bootmem.h>
3820 -#include <linux/ioport.h>
3821 -#include <linux/string.h>
3822 -#include <linux/kexec.h>
3823 -#include <linux/module.h>
3824 -#include <linux/mm.h>
3825 -#include <linux/suspend.h>
3826 -#include <linux/pfn.h>
3828 -#include <asm/pgtable.h>
3829 -#include <asm/page.h>
3830 -#include <asm/e820.h>
3831 -#include <asm/proto.h>
3832 -#include <asm/setup.h>
3833 -#include <asm/sections.h>
3834 -#include <asm/kdebug.h>
3835 -#include <xen/interface/memory.h>
3837 -struct e820map e820 __initdata;
3839 -struct e820map machine_e820;
3843 - * PFN of last memory page.
3845 -unsigned long end_pfn;
3848 - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
3849 - * The direct mapping extends to max_pfn_mapped, so that we can directly access
3850 - * apertures, ACPI and other tables without having to play with fixmaps.
3852 -unsigned long max_pfn_mapped;
3855 - * Last pfn which the user wants to use.
3857 -static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
3860 - * Early reserved memory areas.
3862 -#define MAX_EARLY_RES 20
3865 - unsigned long start, end;
3868 -static struct early_res early_res[MAX_EARLY_RES] __initdata = {
3870 - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
3871 -#ifdef CONFIG_X86_TRAMPOLINE
3872 - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
3878 -void __init reserve_early(unsigned long start, unsigned long end, char *name)
3881 - struct early_res *r;
3882 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3883 - r = &early_res[i];
3884 - if (end > r->start && start < r->end)
3885 - panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
3886 - start, end - 1, name?name:"", r->start, r->end - 1, r->name);
3888 - if (i >= MAX_EARLY_RES)
3889 - panic("Too many early reservations");
3890 - r = &early_res[i];
3894 - strncpy(r->name, name, sizeof(r->name) - 1);
3897 -void __init free_early(unsigned long start, unsigned long end)
3899 - struct early_res *r;
3902 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3903 - r = &early_res[i];
3904 - if (start == r->start && end == r->end)
3907 - if (i >= MAX_EARLY_RES || !early_res[i].end)
3908 - panic("free_early on not reserved area: %lx-%lx!", start, end);
3910 - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
3913 - memmove(&early_res[i], &early_res[i + 1],
3914 - (j - 1 - i) * sizeof(struct early_res));
3916 - early_res[j - 1].end = 0;
3919 -void __init early_res_to_bootmem(unsigned long start, unsigned long end)
3922 - unsigned long final_start, final_end;
3923 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3924 - struct early_res *r = &early_res[i];
3925 - final_start = max(start, r->start);
3926 - final_end = min(end, r->end);
3927 - if (final_start >= final_end)
3929 - printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
3930 - final_start, final_end - 1, r->name);
3931 - reserve_bootmem_generic(final_start, final_end - final_start);
3935 -/* Check for already reserved areas */
3936 -static inline int __init
3937 -bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
3940 - unsigned long addr = *addrp, last;
3943 - last = addr + size;
3944 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3945 - struct early_res *r = &early_res[i];
3946 - if (last >= r->start && addr < r->end) {
3947 - *addrp = addr = round_up(r->end, align);
3955 -/* Check for already reserved areas */
3956 -static inline int __init
3957 -bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
3960 - unsigned long addr = *addrp, last;
3961 - unsigned long size = *sizep;
3964 - last = addr + size;
3965 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3966 - struct early_res *r = &early_res[i];
3967 - if (last > r->start && addr < r->start) {
3968 - size = r->start - addr;
3972 - if (last > r->end && addr < r->end) {
3973 - addr = round_up(r->end, align);
3974 - size = last - addr;
3978 - if (last <= r->end && addr >= r->start) {
3990 - * This function checks if any part of the range <start,end> is mapped
3994 -e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
3999 - for (i = 0; i < e820.nr_map; i++) {
4000 - struct e820entry *ei = &e820.map[i];
4002 - if (!is_initial_xendomain())
4004 - for (i = 0; i < machine_e820.nr_map; i++) {
4005 - const struct e820entry *ei = &machine_e820.map[i];
4008 - if (type && ei->type != type)
4010 - if (ei->addr >= end || ei->addr + ei->size <= start)
4016 -EXPORT_SYMBOL_GPL(e820_any_mapped);
4019 - * This function checks if the entire range <start,end> is mapped with type.
4021 - * Note: this function only works correctly if the e820 table is sorted
4022 - * and non-overlapping, which is the case
4024 -int __init e820_all_mapped(unsigned long start, unsigned long end,
4030 - for (i = 0; i < e820.nr_map; i++) {
4031 - struct e820entry *ei = &e820.map[i];
4033 - if (!is_initial_xendomain())
4035 - for (i = 0; i < machine_e820.nr_map; i++) {
4036 - const struct e820entry *ei = &machine_e820.map[i];
4039 - if (type && ei->type != type)
4041 - /* does the region (or part of it) overlap the current region? */
4042 - if (ei->addr >= end || ei->addr + ei->size <= start)
4045 - /* if the region is at the beginning of <start,end> we move
4046 - * start to the end of the region since it's ok until there
4048 - if (ei->addr <= start)
4049 - start = ei->addr + ei->size;
4051 - * if start is now at or beyond end, we're done, full
4061 - * Find a free area with specified alignment in a specific range.
4063 -unsigned long __init find_e820_area(unsigned long start, unsigned long end,
4064 - unsigned long size, unsigned long align)
4068 - for (i = 0; i < e820.nr_map; i++) {
4069 - struct e820entry *ei = &e820.map[i];
4070 - unsigned long addr, last;
4071 - unsigned long ei_last;
4073 - if (ei->type != E820_RAM)
4075 - addr = round_up(ei->addr, align);
4076 - ei_last = ei->addr + ei->size;
4078 - addr = round_up(start, align);
4079 - if (addr >= ei_last)
4081 - while (bad_addr(&addr, size, align) && addr+size <= ei_last)
4083 - last = addr + size;
4084 - if (last > ei_last)
4094 - * Find next free range after *start
4096 -unsigned long __init find_e820_area_size(unsigned long start,
4097 - unsigned long *sizep,
4098 - unsigned long align)
4102 - for (i = 0; i < e820.nr_map; i++) {
4103 - struct e820entry *ei = &e820.map[i];
4104 - unsigned long addr, last;
4105 - unsigned long ei_last;
4107 - if (ei->type != E820_RAM)
4109 - addr = round_up(ei->addr, align);
4110 - ei_last = ei->addr + ei->size;
4112 - addr = round_up(start, align);
4113 - if (addr >= ei_last)
4115 - *sizep = ei_last - addr;
4116 - while (bad_addr_size(&addr, sizep, align) &&
4117 - addr + *sizep <= ei_last)
4119 - last = addr + *sizep;
4120 - if (last > ei_last)
4128 - * Find the highest page frame number we have available
4130 -unsigned long __init e820_end_of_ram(void)
4132 - unsigned long end_pfn;
4134 - end_pfn = find_max_pfn_with_active_regions();
4136 - if (end_pfn > max_pfn_mapped)
4137 - max_pfn_mapped = end_pfn;
4138 - if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
4139 - max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
4140 - if (end_pfn > end_user_pfn)
4141 - end_pfn = end_user_pfn;
4142 - if (end_pfn > max_pfn_mapped)
4143 - end_pfn = max_pfn_mapped;
4145 - printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
4150 - * Mark e820 reserved areas as busy for the resource manager.
4152 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
4155 - struct resource *res;
4157 - res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
4158 - for (i = 0; i < nr_map; i++) {
4159 - switch (e820[i].type) {
4160 - case E820_RAM: res->name = "System RAM"; break;
4161 - case E820_ACPI: res->name = "ACPI Tables"; break;
4162 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
4163 - default: res->name = "reserved";
4165 - res->start = e820[i].addr;
4166 - res->end = res->start + e820[i].size - 1;
4167 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4168 - insert_resource(&iomem_resource, res);
4175 - * Find the ranges of physical addresses that do not correspond to
4176 - * e820 RAM areas and mark the corresponding pages as nosave for software
4177 - * suspend and suspend to RAM.
4179 - * This function requires the e820 map to be sorted and without any
4180 - * overlapping entries and assumes the first e820 area to be RAM.
4182 -void __init e820_mark_nosave_regions(void)
4185 - unsigned long paddr;
4187 - paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
4188 - for (i = 1; i < e820.nr_map; i++) {
4189 - struct e820entry *ei = &e820.map[i];
4191 - if (paddr < ei->addr)
4192 - register_nosave_region(PFN_DOWN(paddr),
4193 - PFN_UP(ei->addr));
4195 - paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
4196 - if (ei->type != E820_RAM)
4197 - register_nosave_region(PFN_UP(ei->addr),
4200 - if (paddr >= (end_pfn << PAGE_SHIFT))
4207 - * Finds an active region in the address range from start_pfn to end_pfn and
4208 - * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
4210 -static int __init e820_find_active_region(const struct e820entry *ei,
4211 - unsigned long start_pfn,
4212 - unsigned long end_pfn,
4213 - unsigned long *ei_startpfn,
4214 - unsigned long *ei_endpfn)
4216 - *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
4217 - *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
4219 - /* Skip map entries smaller than a page */
4220 - if (*ei_startpfn >= *ei_endpfn)
4223 - /* Check if max_pfn_mapped should be updated */
4224 - if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
4225 - max_pfn_mapped = *ei_endpfn;
4227 - /* Skip if map is outside the node */
4228 - if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
4229 - *ei_startpfn >= end_pfn)
4232 - /* Check for overlaps */
4233 - if (*ei_startpfn < start_pfn)
4234 - *ei_startpfn = start_pfn;
4235 - if (*ei_endpfn > end_pfn)
4236 - *ei_endpfn = end_pfn;
4238 - /* Obey end_user_pfn to save on memmap */
4239 - if (*ei_startpfn >= end_user_pfn)
4241 - if (*ei_endpfn > end_user_pfn)
4242 - *ei_endpfn = end_user_pfn;
4247 -/* Walk the e820 map and register active regions within a node */
4249 -e820_register_active_regions(int nid, unsigned long start_pfn,
4250 - unsigned long end_pfn)
4252 - unsigned long ei_startpfn;
4253 - unsigned long ei_endpfn;
4256 - for (i = 0; i < e820.nr_map; i++)
4257 - if (e820_find_active_region(&e820.map[i],
4258 - start_pfn, end_pfn,
4259 - &ei_startpfn, &ei_endpfn))
4260 - add_active_range(nid, ei_startpfn, ei_endpfn);
4264 - * Add a memory region to the kernel e820 map.
4266 -void __init add_memory_region(unsigned long start, unsigned long size, int type)
4268 - int x = e820.nr_map;
4270 - if (x == E820MAX) {
4271 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
4275 - e820.map[x].addr = start;
4276 - e820.map[x].size = size;
4277 - e820.map[x].type = type;
4282 - * Find the hole size (in bytes) in the memory range.
4283 - * @start: starting address of the memory range to scan
4284 - * @end: ending address of the memory range to scan
4286 -unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
4288 - unsigned long start_pfn = start >> PAGE_SHIFT;
4289 - unsigned long end_pfn = end >> PAGE_SHIFT;
4290 - unsigned long ei_startpfn, ei_endpfn, ram = 0;
4293 - for (i = 0; i < e820.nr_map; i++) {
4294 - if (e820_find_active_region(&e820.map[i],
4295 - start_pfn, end_pfn,
4296 - &ei_startpfn, &ei_endpfn))
4297 - ram += ei_endpfn - ei_startpfn;
4299 - return end - start - (ram << PAGE_SHIFT);
4302 -static void __init e820_print_map(char *who)
4306 - for (i = 0; i < e820.nr_map; i++) {
4307 - printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
4308 - (unsigned long long) e820.map[i].addr,
4309 - (unsigned long long)
4310 - (e820.map[i].addr + e820.map[i].size));
4311 - switch (e820.map[i].type) {
4313 - printk(KERN_CONT "(usable)\n");
4315 - case E820_RESERVED:
4316 - printk(KERN_CONT "(reserved)\n");
4319 - printk(KERN_CONT "(ACPI data)\n");
4322 - printk(KERN_CONT "(ACPI NVS)\n");
4325 - printk(KERN_CONT "type %u\n", e820.map[i].type);
4332 - * Sanitize the BIOS e820 map.
4334 - * Some e820 responses include overlapping entries. The following
4335 - * replaces the original e820 map with a new one, removing overlaps.
4338 -static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
4340 - struct change_member {
4341 - struct e820entry *pbios; /* pointer to original bios entry */
4342 - unsigned long long addr; /* address for this change point */
4344 - static struct change_member change_point_list[2*E820MAX] __initdata;
4345 - static struct change_member *change_point[2*E820MAX] __initdata;
4346 - static struct e820entry *overlap_list[E820MAX] __initdata;
4347 - static struct e820entry new_bios[E820MAX] __initdata;
4348 - struct change_member *change_tmp;
4349 - unsigned long current_type, last_type;
4350 - unsigned long long last_addr;
4351 - int chgidx, still_changing;
4352 - int overlap_entries;
4353 - int new_bios_entry;
4354 - int old_nr, new_nr, chg_nr;
4358 - Visually we're performing the following
4359 - (1,2,3,4 = memory types)...
4361 - Sample memory map (w/overlaps):
4362 - ____22__________________
4363 - ______________________4_
4364 - ____1111________________
4365 - _44_____________________
4366 - 11111111________________
4367 - ____________________33__
4368 - ___________44___________
4369 - __________33333_________
4370 - ______________22________
4371 - ___________________2222_
4372 - _________111111111______
4373 - _____________________11_
4374 - _________________4______
4376 - Sanitized equivalent (no overlap):
4377 - 1_______________________
4378 - _44_____________________
4379 - ___1____________________
4380 - ____22__________________
4381 - ______11________________
4382 - _________1______________
4383 - __________3_____________
4384 - ___________44___________
4385 - _____________33_________
4386 - _______________2________
4387 - ________________1_______
4388 - _________________4______
4389 - ___________________2____
4390 - ____________________33__
4391 - ______________________4_
4394 - /* if there's only one memory region, don't bother */
4398 - old_nr = *pnr_map;
4400 - /* bail out if we find any unreasonable addresses in bios map */
4401 - for (i = 0; i < old_nr; i++)
4402 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
4405 - /* create pointers for initial change-point information (for sorting) */
4406 - for (i = 0; i < 2 * old_nr; i++)
4407 - change_point[i] = &change_point_list[i];
4409 - /* record all known change-points (starting and ending addresses),
4410 - omitting those that are for empty memory regions */
4412 - for (i = 0; i < old_nr; i++) {
4413 - if (biosmap[i].size != 0) {
4414 - change_point[chgidx]->addr = biosmap[i].addr;
4415 - change_point[chgidx++]->pbios = &biosmap[i];
4416 - change_point[chgidx]->addr = biosmap[i].addr +
4418 - change_point[chgidx++]->pbios = &biosmap[i];
4423 - /* sort change-point list by memory addresses (low -> high) */
4424 - still_changing = 1;
4425 - while (still_changing) {
4426 - still_changing = 0;
4427 - for (i = 1; i < chg_nr; i++) {
4428 - unsigned long long curaddr, lastaddr;
4429 - unsigned long long curpbaddr, lastpbaddr;
4431 - curaddr = change_point[i]->addr;
4432 - lastaddr = change_point[i - 1]->addr;
4433 - curpbaddr = change_point[i]->pbios->addr;
4434 - lastpbaddr = change_point[i - 1]->pbios->addr;
4437 - * swap entries, when:
4439 - * curaddr > lastaddr or
4440 - * curaddr == lastaddr and curaddr == curpbaddr and
4441 - * lastaddr != lastpbaddr
4443 - if (curaddr < lastaddr ||
4444 - (curaddr == lastaddr && curaddr == curpbaddr &&
4445 - lastaddr != lastpbaddr)) {
4446 - change_tmp = change_point[i];
4447 - change_point[i] = change_point[i-1];
4448 - change_point[i-1] = change_tmp;
4449 - still_changing = 1;
4454 - /* create a new bios memory map, removing overlaps */
4455 - overlap_entries = 0; /* number of entries in the overlap table */
4456 - new_bios_entry = 0; /* index for creating new bios map entries */
4457 - last_type = 0; /* start with undefined memory type */
4458 - last_addr = 0; /* start with 0 as last starting address */
4460 - /* loop through change-points, determining effect on the new bios map */
4461 - for (chgidx = 0; chgidx < chg_nr; chgidx++) {
4462 - /* keep track of all overlapping bios entries */
4463 - if (change_point[chgidx]->addr ==
4464 - change_point[chgidx]->pbios->addr) {
4466 - * add map entry to overlap list (> 1 entry
4467 - * implies an overlap)
4469 - overlap_list[overlap_entries++] =
4470 - change_point[chgidx]->pbios;
4473 - * remove entry from list (order independent,
4474 - * so swap with last)
4476 - for (i = 0; i < overlap_entries; i++) {
4477 - if (overlap_list[i] ==
4478 - change_point[chgidx]->pbios)
4480 - overlap_list[i] = overlap_list[overlap_entries-1];
4482 - overlap_entries--;
4485 - * if there are overlapping entries, decide which
4486 - * "type" to use (larger value takes precedence --
4487 - * 1=usable, 2,3,4,4+=unusable)
4490 - for (i = 0; i < overlap_entries; i++)
4491 - if (overlap_list[i]->type > current_type)
4492 - current_type = overlap_list[i]->type;
4494 - * continue building up new bios map based on this
4497 - if (current_type != last_type) {
4498 - if (last_type != 0) {
4499 - new_bios[new_bios_entry].size =
4500 - change_point[chgidx]->addr - last_addr;
4502 - * move forward only if the new size
4505 - if (new_bios[new_bios_entry].size != 0)
4507 - * no more space left for new
4510 - if (++new_bios_entry >= E820MAX)
4513 - if (current_type != 0) {
4514 - new_bios[new_bios_entry].addr =
4515 - change_point[chgidx]->addr;
4516 - new_bios[new_bios_entry].type = current_type;
4517 - last_addr = change_point[chgidx]->addr;
4519 - last_type = current_type;
4522 - /* retain count for new bios entries */
4523 - new_nr = new_bios_entry;
4525 - /* copy new bios mapping into original location */
4526 - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
4527 - *pnr_map = new_nr;
4533 - * Copy the BIOS e820 map into a safe place.
4535 - * Sanity-check it while we're at it..
4537 - * If we're lucky and live on a modern system, the setup code
4538 - * will have given us a memory map that we can use to properly
4539 - * set up memory. If we aren't, we'll fake a memory map.
4541 -static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
4544 - /* Only one memory region (or negative)? Ignore it */
4548 - BUG_ON(nr_map < 1);
4552 - u64 start = biosmap->addr;
4553 - u64 size = biosmap->size;
4554 - u64 end = start + size;
4555 - u32 type = biosmap->type;
4557 - /* Overflow in 64 bits? Ignore the memory map. */
4561 - add_memory_region(start, size, type);
4562 - } while (biosmap++, --nr_map);
4565 - if (is_initial_xendomain()) {
4566 - struct xen_memory_map memmap;
4568 - memmap.nr_entries = E820MAX;
4569 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
4571 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
4573 - machine_e820.nr_map = memmap.nr_entries;
4575 - machine_e820 = e820;
4581 -static void early_panic(char *msg)
4583 - early_printk(msg);
4587 -/* We're not void only for x86 32-bit compat */
4588 -char * __init machine_specific_memory_setup(void)
4591 - char *who = "BIOS-e820";
4593 - * Try to copy the BIOS-supplied E820-map.
4595 - * Otherwise fake a memory map; one section from 0k->640k,
4596 - * the next section from 1mb->appropriate_mem_k
4598 - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
4599 - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
4600 - early_panic("Cannot find a valid memory map");
4601 -#else /* CONFIG_XEN */
4602 - char *who = "Xen";
4604 - struct xen_memory_map memmap;
4606 - * This is rather large for a stack variable but this early in
4607 - * the boot process we know we have plenty of slack space.
4609 - struct e820entry map[E820MAX];
4611 - memmap.nr_entries = E820MAX;
4612 - set_xen_guest_handle(memmap.buffer, map);
4614 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
4615 - if ( rc == -ENOSYS ) {
4616 - memmap.nr_entries = 1;
4617 - map[0].addr = 0ULL;
4618 - map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
4619 - /* 8MB slack (to balance backend allocations). */
4620 - map[0].size += 8 << 20;
4621 - map[0].type = E820_RAM;
4626 - sanitize_e820_map(map, (char *)&memmap.nr_entries);
4628 - if (copy_e820_map(map, memmap.nr_entries) < 0)
4629 - early_panic("Cannot find a valid memory map");
4631 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
4632 - e820_print_map(who);
4634 - /* In case someone cares... */
4638 -static int __init parse_memopt(char *p)
4641 - unsigned long current_end;
4642 - unsigned long end;
4646 - end_user_pfn = memparse(p, &p);
4647 - end_user_pfn >>= PAGE_SHIFT;
4649 - end = end_user_pfn<<PAGE_SHIFT;
4650 - i = e820.nr_map-1;
4651 - current_end = e820.map[i].addr + e820.map[i].size;
4653 - if (current_end < end) {
4655 - * The e820 map ends before our requested size so
4656 - * extend the final entry to the requested address.
4658 - if (e820.map[i].type == E820_RAM)
4659 - e820.map[i].size = end - e820.map[i].addr;
4661 - add_memory_region(current_end, end - current_end, E820_RAM);
4666 -early_param("mem", parse_memopt);
4668 -static int userdef __initdata;
4670 -static int __init parse_memmap_opt(char *p)
4673 - unsigned long long start_at, mem_size;
4675 - if (!strcmp(p, "exactmap")) {
4676 -#ifdef CONFIG_CRASH_DUMP
4678 - * If we are doing a crash dump, we still need to know
4679 - * the real mem size before the original memory map is reset.
4682 - e820_register_active_regions(0, 0, -1UL);
4683 - saved_max_pfn = e820_end_of_ram();
4684 - remove_all_active_ranges();
4686 - max_pfn_mapped = 0;
4693 - mem_size = memparse(p, &p);
4699 - start_at = memparse(p+1, &p);
4700 - add_memory_region(start_at, mem_size, E820_RAM);
4701 - } else if (*p == '#') {
4702 - start_at = memparse(p+1, &p);
4703 - add_memory_region(start_at, mem_size, E820_ACPI);
4704 - } else if (*p == '$') {
4705 - start_at = memparse(p+1, &p);
4706 - add_memory_region(start_at, mem_size, E820_RESERVED);
4708 - end_user_pfn = (mem_size >> PAGE_SHIFT);
4710 - return *p == '\0' ? 0 : -EINVAL;
4712 -early_param("memmap", parse_memmap_opt);
4714 -void __init finish_e820_parsing(void)
4717 - char nr = e820.nr_map;
4719 - if (sanitize_e820_map(e820.map, &nr) < 0)
4720 - early_panic("Invalid user supplied memory map");
4723 - printk(KERN_INFO "user-defined physical RAM map:\n");
4724 - e820_print_map("user");
4729 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
4730 - unsigned new_type)
4734 - BUG_ON(old_type == new_type);
4736 - for (i = 0; i < e820.nr_map; i++) {
4737 - struct e820entry *ei = &e820.map[i];
4738 - u64 final_start, final_end;
4739 - if (ei->type != old_type)
4741 - /* totally covered? */
4742 - if (ei->addr >= start && ei->size <= size) {
4743 - ei->type = new_type;
4746 - /* partially covered */
4747 - final_start = max(start, ei->addr);
4748 - final_end = min(start + size, ei->addr + ei->size);
4749 - if (final_start >= final_end)
4751 - add_memory_region(final_start, final_end - final_start,
4756 -void __init update_e820(void)
4760 - nr_map = e820.nr_map;
4761 - if (sanitize_e820_map(e820.map, &nr_map))
4763 - e820.nr_map = nr_map;
4764 - printk(KERN_INFO "modified physical RAM map:\n");
4765 - e820_print_map("modified");
4769 -unsigned long pci_mem_start = 0xaeedbabe;
4770 -EXPORT_SYMBOL(pci_mem_start);
4773 - * Search for the biggest gap in the low 32 bits of the e820
4774 - * memory space. We pass this space to PCI for assigning MMIO resources
4775 - * to hotplug or unconfigured devices.
4776 - * Hopefully the BIOS left enough space.
4778 -__init void e820_setup_gap(struct e820entry *e820, int nr_map)
4780 - unsigned long gapstart, gapsize, round;
4781 - unsigned long last;
4785 - last = 0x100000000ull;
4786 - gapstart = 0x10000000;
4787 - gapsize = 0x400000;
4789 - while (--i >= 0) {
4790 - unsigned long long start = e820[i].addr;
4791 - unsigned long long end = start + e820[i].size;
4794 - * Since "last" is at most 4GB, we know we'll
4795 - * fit in 32 bits if this condition is true
4798 - unsigned long gap = last - end;
4800 - if (gap > gapsize) {
4811 - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
4812 - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
4814 - KERN_ERR "PCI: Unassigned devices with 32bit resource "
4815 - "registers may break!\n");
4819 - * See how much we want to round up: start off with
4820 - * rounding to the next 1MB area.
4823 - while ((gapsize >> 4) > round)
4825 - /* Fun with two's complement */
4826 - pci_mem_start = (gapstart + round) & -round;
4829 - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
4830 - pci_mem_start, gapstart, gapsize);
4833 -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
4837 - if (slot < 0 || slot >= e820.nr_map)
4839 - for (i = slot; i < e820.nr_map; i++) {
4840 - if (e820.map[i].type != E820_RAM)
4844 - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
4846 - *addr = e820.map[i].addr;
4847 - *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
4848 - max_pfn << PAGE_SHIFT) - *addr;
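The block removed above is the classic e820 "change-point" sanitizer; in 2.6.27 it lives on in the unified arch/x86/kernel/e820.c rather than in per-arch copies. For readers tracking the algorithm through the deleted lines, here is a minimal user-space C sketch of the same idea: entry boundaries become sorted events, and between any two events the highest (most restrictive) memory type wins. The array sizes, the demo map, and all local names are illustrative, not the kernel's.

#include <stdio.h>
#include <stdlib.h>

#define MAXE 128

struct entry { unsigned long long addr, size; unsigned type; };
struct event { unsigned long long addr; const struct entry *e; };

/* Sort boundaries; at equal addresses a start sorts before an end,
 * mirroring the swap rule in the bubble sort above. */
static int cmp_event(const void *a, const void *b)
{
	const struct event *x = a, *y = b;
	int xs, ys;

	if (x->addr != y->addr)
		return x->addr < y->addr ? -1 : 1;
	xs = (x->addr == x->e->addr);	/* 1 if start point */
	ys = (y->addr == y->e->addr);
	return ys - xs;
}

int main(void)
{
	struct entry map[] = {
		{ 0x00000, 0xa0000, 1 },	/* RAM */
		{ 0x9f000, 0x01000, 2 },	/* overlapping reserved chunk */
	};
	int n = sizeof(map) / sizeof(map[0]);
	struct event ev[2 * MAXE];
	const struct entry *active[MAXE];
	int i, j, nev = 0, nact = 0;
	unsigned last_type = 0;
	unsigned long long last_addr = 0;

	for (i = 0; i < n; i++) {
		ev[nev].addr = map[i].addr;
		ev[nev++].e = &map[i];
		ev[nev].addr = map[i].addr + map[i].size;
		ev[nev++].e = &map[i];
	}
	qsort(ev, nev, sizeof(*ev), cmp_event);

	for (i = 0; i < nev; i++) {
		unsigned t = 0;

		if (ev[i].addr == ev[i].e->addr) {	/* start point */
			active[nact++] = ev[i].e;
		} else {				/* end point: swap-remove */
			for (j = 0; j < nact; j++)
				if (active[j] == ev[i].e) {
					active[j] = active[--nact];
					break;
				}
		}
		for (j = 0; j < nact; j++)		/* highest type wins */
			if (active[j]->type > t)
				t = active[j]->type;
		if (t != last_type) {
			if (last_type != 0 && ev[i].addr > last_addr)
				printf("%#llx-%#llx type %u\n",
				       last_addr, ev[i].addr, last_type);
			last_addr = ev[i].addr;
			last_type = t;
		}
	}
	return 0;
}

Against the demo map this prints one RAM span up to 0x9f000 and one reserved span from 0x9f000 to 0xa0000, which is the overlap resolution sanitize_e820_map() performs on real firmware maps.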
4851 --- sle11-2009-10-16.orig/arch/x86/kernel/early_printk-xen.c 2009-09-24 10:29:16.000000000 +0200
4852 +++ sle11-2009-10-16/arch/x86/kernel/early_printk-xen.c 2009-06-04 10:21:39.000000000 +0200
4853 @@ -225,7 +225,7 @@ static struct console simnow_console = {
4854 static struct console *early_console = &early_vga_console;
4855 static int early_console_initialized;
4857 -void early_printk(const char *fmt, ...)
4858 +asmlinkage void early_printk(const char *fmt, ...)
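Marking early_printk() asmlinkage matters on 32-bit builds compiled with -mregparm=3: callers that only know the historical stack-based calling convention, including assembly code, keep working. A standalone sketch of the pattern follows; the console write is replaced by stderr purely for illustration (the real function hands the formatted buffer to early_console->write()).

#include <stdarg.h>
#include <stdio.h>

/* regparm(0) pins a varargs function back to stack argument passing
 * on x86; elsewhere the attribute is a no-op or a warning. */
#define asmlinkage __attribute__((regparm(0)))

asmlinkage void early_printk(const char *fmt, ...)
{
	char buf[512];
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	if (n < 0)
		n = 0;
	else if (n >= (int)sizeof(buf))
		n = sizeof(buf) - 1;
	fwrite(buf, 1, n, stderr);	/* early_console->write() in-kernel */
}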
4862 --- sle11-2009-10-16.orig/arch/x86/kernel/entry_32-xen.S 2009-03-16 16:38:05.000000000 +0100
4863 +++ sle11-2009-10-16/arch/x86/kernel/entry_32-xen.S 2009-06-04 10:21:39.000000000 +0200
4865 #include <asm/percpu.h>
4866 #include <asm/dwarf2.h>
4867 #include <asm/processor-flags.h>
4868 -#include "irq_vectors.h"
4869 +#include <asm/ftrace.h>
4870 +#include <asm/irq_vectors.h>
4871 #include <xen/interface/xen.h>
4873 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
4874 +#include <linux/elf-em.h>
4875 +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
4876 +#define __AUDIT_ARCH_LE 0x40000000
4878 +#ifndef CONFIG_AUDITSYSCALL
4879 +#define sysenter_audit syscall_trace_entry
4880 +#define sysexit_audit syscall_exit_work
4884 * We use macros for low-level operations which need to be overridden
4885 * for paravirtualization. The following will never clobber any registers:
4886 * INTERRUPT_RETURN (aka. "iret")
4887 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
4888 - * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
4889 + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
4891 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
4892 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
4893 @@ -277,11 +288,6 @@ END(resume_kernel)
4897 - .macro test_tif ti_reg # system call tracing in operation / emulation
4898 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
4899 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
4902 /* SYSENTER_RETURN points to after the "sysenter" instruction in
4903 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
4905 @@ -338,8 +344,9 @@ sysenter_past_esp:
4908 GET_THREAD_INFO(%ebp)
4910 - jnz syscall_trace_entry
4911 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4912 + jnz sysenter_audit
4914 cmpl $(nr_syscalls), %eax
4916 call *sys_call_table(,%eax,4)
4917 @@ -349,14 +356,54 @@ sysenter_past_esp:
4919 movl TI_flags(%ebp), %ecx
4920 testw $_TIF_ALLWORK_MASK, %cx
4921 - jne syscall_exit_work
4924 /* if something modifies registers it must also disable sysexit */
4925 movl PT_EIP(%esp), %edx
4926 movl PT_OLDESP(%esp), %ecx
4929 1: mov PT_FS(%esp), %fs
4930 - ENABLE_INTERRUPTS_SYSCALL_RET
4931 + ENABLE_INTERRUPTS_SYSEXIT
4933 +#ifdef CONFIG_AUDITSYSCALL
4935 + testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
4936 + jnz syscall_trace_entry
4938 + CFI_ADJUST_CFA_OFFSET -4
4939 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
4940 + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
4941 + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
4942 + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
4943 + movl %eax,%edx /* 2nd arg: syscall number */
4944 + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
4945 + call audit_syscall_entry
4947 + CFI_ADJUST_CFA_OFFSET 4
4948 + movl PT_EAX(%esp),%eax /* reload syscall number */
4949 + jmp sysenter_do_call
4952 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4953 + jne syscall_exit_work
4955 + ENABLE_INTERRUPTS(CLBR_ANY)
4956 + movl %eax,%edx /* second arg, syscall return value */
4957 + cmpl $0,%eax /* is it < 0? */
4958 + setl %al /* 1 if so, 0 if not */
4959 + movzbl %al,%eax /* zero-extend that */
4960 + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
4961 + call audit_syscall_exit
4962 + DISABLE_INTERRUPTS(CLBR_ANY)
4964 + movl TI_flags(%ebp), %ecx
4965 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4966 + jne syscall_exit_work
4967 + movl PT_EAX(%esp),%eax /* reload syscall return value */
4972 .pushsection .fixup,"ax"
4973 2: movl $0,PT_FS(%esp)
4974 @@ -400,7 +447,7 @@ ENTRY(system_call)
4975 CFI_ADJUST_CFA_OFFSET 4
4977 GET_THREAD_INFO(%ebp)
4979 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4980 jnz syscall_trace_entry
4981 cmpl $(nr_syscalls), %eax
4983 @@ -413,10 +460,6 @@ syscall_exit:
4984 # setting need_resched or sigpending
4985 # between sampling and the iret
4987 - testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
4989 - orl $_TIF_SINGLESTEP,TI_flags(%ebp)
4991 movl TI_flags(%ebp), %ecx
4992 testw $_TIF_ALLWORK_MASK, %cx # current->work
4993 jne syscall_exit_work
4994 @@ -588,12 +631,8 @@ END(work_pending)
4995 syscall_trace_entry:
4996 movl $-ENOSYS,PT_EAX(%esp)
4999 - call do_syscall_trace
5001 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5002 - # so must skip actual syscall
5003 - movl PT_ORIG_EAX(%esp), %eax
5004 + call syscall_trace_enter
5005 + /* What it returned is what we'll actually use. */
5006 cmpl $(nr_syscalls), %eax
5009 @@ -602,14 +641,13 @@ END(syscall_trace_entry)
5010 # perform syscall exit tracing
5013 - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
5014 + testb $_TIF_WORK_SYSCALL_EXIT, %cl
5017 - ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
5018 + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
5019 # schedule() instead
5022 - call do_syscall_trace
5023 + call syscall_trace_leave
5024 jmp resume_userspace
5025 END(syscall_exit_work)
5027 @@ -1113,10 +1151,10 @@ ENTRY(native_iret)
5031 -ENTRY(native_irq_enable_syscall_ret)
5032 +ENTRY(native_irq_enable_sysexit)
5035 -END(native_irq_enable_syscall_ret)
5036 +END(native_irq_enable_sysexit)
5040 @@ -1265,6 +1303,77 @@ ENTRY(kernel_thread_helper)
5042 ENDPROC(kernel_thread_helper)
5044 +#ifdef CONFIG_FTRACE
5045 +#ifdef CONFIG_DYNAMIC_FTRACE
5051 + movl 0xc(%esp), %eax
5052 + subl $MCOUNT_INSN_SIZE, %eax
5065 +ENTRY(ftrace_caller)
5069 + movl 0xc(%esp), %eax
5070 + movl 0x4(%ebp), %edx
5071 + subl $MCOUNT_INSN_SIZE, %eax
5086 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5089 + cmpl $ftrace_stub, ftrace_trace_function
5095 + /* taken from glibc */
5100 + movl 0xc(%esp), %eax
5101 + movl 0x4(%ebp), %edx
5102 + subl $MCOUNT_INSN_SIZE, %eax
5104 + call *ftrace_trace_function
5112 +#endif /* CONFIG_DYNAMIC_FTRACE */
5113 +#endif /* CONFIG_FTRACE */
5115 #include <asm/alternative-asm.h>
5117 # pv syscall call handler stub
5118 @@ -1290,7 +1399,7 @@ ENTRY(ia32pv_cstar_target)
5121 GET_THREAD_INFO(%ebp)
5123 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
5124 jnz cstar_trace_entry
5125 cmpl $nr_syscalls,%eax
5127 @@ -1324,29 +1433,21 @@ cstar_trace_entry:
5128 btl %eax,cstar_special
5129 jc .Lcstar_trace_special
5133 orl $_TIF_CSTAR,TI_flags(%ebp)
5134 - call do_syscall_trace
5135 + call syscall_trace_enter
5137 andl $~_TIF_CSTAR,TI_flags(%ebp)
5139 - jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
5140 - # so must skip actual syscall
5141 - movl PT_ORIG_EAX(%esp),%eax
5142 + /* What it returned is what we'll actually use. */
5143 cmpl $nr_syscalls,%eax
5146 .Lcstar_trace_special:
5147 movl PT_ECX(%esp),%ecx
5150 movl %ecx,PT_EBP(%esp) # put user EBP back in place
5151 - call do_syscall_trace
5153 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5154 - # so must skip actual syscall
5155 - movl PT_ORIG_EAX(%esp),%eax
5156 + call syscall_trace_enter
5157 + /* What it returned is what we'll actually use. */
5158 cmpl $nr_syscalls,%eax
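Several hunks above replace do_syscall_trace() with syscall_trace_enter()/syscall_trace_leave() and drop the PTRACE_SYSEMU special-casing from the assembly. The contract they rely on: syscall_trace_enter() now returns the syscall number the stub should use, so "what it returned is what we'll actually use". A schematic C rendering, with the dispatch through pt_regs simplified and trace_slow_path() an illustrative name:

#include <errno.h>

struct pt_regs;				/* arch register frame */
extern long syscall_trace_enter(struct pt_regs *regs);
extern unsigned int nr_syscalls;
typedef long (*sys_call_ptr_t)(struct pt_regs *regs);
extern sys_call_ptr_t sys_call_table[];

long trace_slow_path(struct pt_regs *regs)
{
	long nr = syscall_trace_enter(regs);	/* may rewrite the number */

	if ((unsigned long)nr >= nr_syscalls)	/* skip request (SYSEMU) */
		return -ENOSYS;
	return sys_call_table[nr](regs);	/* args still in pt_regs */
}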
5161 --- sle11-2009-10-16.orig/arch/x86/kernel/entry_64.S 2009-10-28 14:55:02.000000000 +0100
5162 +++ sle11-2009-10-16/arch/x86/kernel/entry_64.S 2009-06-04 10:21:39.000000000 +0200
5163 @@ -1409,7 +1409,7 @@ ENTRY(arch_unwind_init_running)
5164 ENDPROC(arch_unwind_init_running)
5168 +#ifdef CONFIG_PARAVIRT_XEN
5169 ENTRY(xen_hypervisor_callback)
5170 zeroentry xen_do_hypervisor_callback
5171 END(xen_hypervisor_callback)
5172 @@ -1507,7 +1507,7 @@ ENTRY(xen_failsafe_callback)
5174 END(xen_failsafe_callback)
5176 -#endif /* CONFIG_XEN */
5177 +#endif /* CONFIG_PARAVIRT_XEN */
5181 --- sle11-2009-10-16.orig/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:38:05.000000000 +0100
5182 +++ sle11-2009-10-16/arch/x86/kernel/entry_64-xen.S 2009-06-04 10:21:39.000000000 +0200
5183 @@ -53,19 +53,130 @@
5184 #include <asm/hw_irq.h>
5185 #include <asm/page.h>
5186 #include <asm/irqflags.h>
5187 +#include <asm/ftrace.h>
5188 #include <asm/errno.h>
5189 #include <xen/interface/xen.h>
5190 #include <xen/interface/features.h>
5192 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
5193 +#include <linux/elf-em.h>
5194 +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
5195 +#define __AUDIT_ARCH_64BIT 0x80000000
5196 +#define __AUDIT_ARCH_LE 0x40000000
5200 +#ifdef CONFIG_FTRACE
5201 +#ifdef CONFIG_DYNAMIC_FTRACE
5206 + movq %rcx, 8(%rsp)
5207 + movq %rdx, 16(%rsp)
5208 + movq %rsi, 24(%rsp)
5209 + movq %rdi, 32(%rsp)
5210 + movq %r8, 40(%rsp)
5211 + movq %r9, 48(%rsp)
5213 + movq 0x38(%rsp), %rdi
5214 + subq $MCOUNT_INSN_SIZE, %rdi
5220 + movq 48(%rsp), %r9
5221 + movq 40(%rsp), %r8
5222 + movq 32(%rsp), %rdi
5223 + movq 24(%rsp), %rsi
5224 + movq 16(%rsp), %rdx
5225 + movq 8(%rsp), %rcx
5232 +ENTRY(ftrace_caller)
5234 + /* taken from glibc */
5237 + movq %rcx, 8(%rsp)
5238 + movq %rdx, 16(%rsp)
5239 + movq %rsi, 24(%rsp)
5240 + movq %rdi, 32(%rsp)
5241 + movq %r8, 40(%rsp)
5242 + movq %r9, 48(%rsp)
5244 + movq 0x38(%rsp), %rdi
5245 + movq 8(%rbp), %rsi
5246 + subq $MCOUNT_INSN_SIZE, %rdi
5252 + movq 48(%rsp), %r9
5253 + movq 40(%rsp), %r8
5254 + movq 32(%rsp), %rdi
5255 + movq 24(%rsp), %rsi
5256 + movq 16(%rsp), %rdx
5257 + movq 8(%rsp), %rcx
5266 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5268 + cmpq $ftrace_stub, ftrace_trace_function
5275 + /* taken from glibc */
5278 + movq %rcx, 8(%rsp)
5279 + movq %rdx, 16(%rsp)
5280 + movq %rsi, 24(%rsp)
5281 + movq %rdi, 32(%rsp)
5282 + movq %r8, 40(%rsp)
5283 + movq %r9, 48(%rsp)
5285 + movq 0x38(%rsp), %rdi
5286 + movq 8(%rbp), %rsi
5287 + subq $MCOUNT_INSN_SIZE, %rdi
5289 + call *ftrace_trace_function
5291 + movq 48(%rsp), %r9
5292 + movq 40(%rsp), %r8
5293 + movq 32(%rsp), %rdi
5294 + movq 24(%rsp), %rsi
5295 + movq 16(%rsp), %rdx
5296 + movq 8(%rsp), %rcx
5302 +#endif /* CONFIG_DYNAMIC_FTRACE */
5303 +#endif /* CONFIG_FTRACE */
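Both entry files grow the same CONFIG_FTRACE machinery. With -pg, gcc plants a call to mcount() at every function entry; the stubs above save the argument registers, convert the return address into the call-site address by subtracting MCOUNT_INSN_SIZE (5 bytes, the size of a call instruction on x86), and invoke the current tracer. A C model of the non-dynamic variant, with mcount_model() an illustrative name for what the assembly stub does minus the register juggling:

typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);

static void ftrace_stub_fn(unsigned long ip, unsigned long parent_ip) { }

/* Swapped by register_ftrace_function() in the real implementation. */
static ftrace_func_t ftrace_trace_function = ftrace_stub_fn;

#define MCOUNT_INSN_SIZE 5	/* sizeof("call mcount") on x86 */

/* ret_ip: return address pushed by the call to mcount
 * parent: return address of the traced function (8(%rbp) above) */
static void mcount_model(unsigned long ret_ip, unsigned long parent)
{
	if (ftrace_trace_function != ftrace_stub_fn)
		ftrace_trace_function(ret_ip - MCOUNT_INSN_SIZE, parent);
}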
5305 #ifndef CONFIG_PREEMPT
5306 #define retint_kernel retint_restore_args
5309 #ifdef CONFIG_PARAVIRT
5310 -ENTRY(native_irq_enable_syscall_ret)
5311 - movq %gs:pda_oldrsp,%rsp
5312 +ENTRY(native_usergs_sysret64)
5315 #endif /* CONFIG_PARAVIRT */
5316 @@ -102,7 +213,7 @@ NMI_MASK = 0x80000000
5317 .macro FAKE_STACK_FRAME child_rip
5318 /* push in order ss, rsp, eflags, cs, rip */
5320 - pushq %rax /* ss */
5321 + pushq $__KERNEL_DS /* ss */
5322 CFI_ADJUST_CFA_OFFSET 8
5323 /*CFI_REL_OFFSET ss,0*/
5324 pushq %rax /* rsp */
5325 @@ -197,13 +308,13 @@ ENTRY(ret_from_fork)
5326 CFI_ADJUST_CFA_OFFSET -4
5328 GET_THREAD_INFO(%rcx)
5329 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
5330 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5334 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
5335 je int_ret_from_sys_call
5336 - testl $_TIF_IA32,threadinfo_flags(%rcx)
5337 + testl $_TIF_IA32,TI_flags(%rcx)
5338 jnz int_ret_from_sys_call
5339 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
5340 jmp ret_from_sys_call
5341 @@ -265,8 +376,9 @@ ENTRY(system_call)
5343 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
5344 GET_THREAD_INFO(%rcx)
5345 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
5346 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
5348 +system_call_fastpath:
5349 cmpq $__NR_syscall_max,%rax
5352 @@ -284,7 +396,7 @@ sysret_check:
5353 GET_THREAD_INFO(%rcx)
5354 DISABLE_INTERRUPTS(CLBR_NONE)
5356 - movl threadinfo_flags(%rcx),%edx
5357 + movl TI_flags(%rcx),%edx
5361 @@ -315,16 +427,16 @@ sysret_careful:
5364 ENABLE_INTERRUPTS(CLBR_NONE)
5365 - testl $_TIF_DO_NOTIFY_MASK,%edx
5368 - /* Really a signal */
5369 +#ifdef CONFIG_AUDITSYSCALL
5370 + bt $TIF_SYSCALL_AUDIT,%edx
5373 /* edx: work flags (arg3) */
5374 leaq do_notify_resume(%rip),%rax
5375 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
5376 xorl %esi,%esi # oldset -> arg2
5377 call ptregscall_common
5378 -1: movl $_TIF_NEED_RESCHED,%edi
5379 + movl $_TIF_WORK_MASK,%edi
5380 /* Use IRET because user could have changed frame. This
5381 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
5382 DISABLE_INTERRUPTS(CLBR_NONE)
5383 @@ -335,14 +447,56 @@ badsys:
5384 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
5385 jmp ret_from_sys_call
5387 +#ifdef CONFIG_AUDITSYSCALL
5389 + * Fast path for syscall audit without full syscall trace.
5390 + * We just call audit_syscall_entry() directly, and then
5391 + * jump back to the normal fast path.
5394 + movq %r10,%r9 /* 6th arg: 4th syscall arg */
5395 + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
5396 + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
5397 + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
5398 + movq %rax,%rsi /* 2nd arg: syscall number */
5399 + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
5400 + call audit_syscall_entry
5401 + LOAD_ARGS 0 /* reload call-clobbered registers */
5402 + jmp system_call_fastpath
5405 + * Return fast path for syscall audit. Call audit_syscall_exit()
5406 + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
5410 + movq %rax,%rsi /* second arg, syscall return value */
5411 + cmpq $0,%rax /* is it < 0? */
5412 + setl %al /* 1 if so, 0 if not */
5413 + movzbl %al,%edi /* zero-extend that into %edi */
5414 + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
5415 + call audit_syscall_exit
5416 + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
5418 +#endif /* CONFIG_AUDITSYSCALL */
5420 /* Do syscall tracing */
5422 +#ifdef CONFIG_AUDITSYSCALL
5423 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5427 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
5428 FIXUP_TOP_OF_STACK %rdi
5430 call syscall_trace_enter
5431 - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
5433 + * Reload arg registers from stack in case ptrace changed them.
5434 + * We don't reload %rax because syscall_trace_enter() returned
5435 + * the value it wants us to use in the table lookup.
5437 + LOAD_ARGS ARGOFFSET, 1
5439 cmpq $__NR_syscall_max,%rax
5440 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
5441 @@ -356,6 +510,7 @@ tracesys:
5442 * Has correct top of stack, but partial stack frame.
5444 .globl int_ret_from_sys_call
5445 + .globl int_with_check
5446 int_ret_from_sys_call:
5447 DISABLE_INTERRUPTS(CLBR_NONE)
5449 @@ -370,10 +525,10 @@ int_ret_from_sys_call:
5451 LOCKDEP_SYS_EXIT_IRQ
5452 GET_THREAD_INFO(%rcx)
5453 - movl threadinfo_flags(%rcx),%edx
5454 + movl TI_flags(%rcx),%edx
5457 - andl $~TS_COMPAT,threadinfo_status(%rcx)
5458 + andl $~TS_COMPAT,TI_status(%rcx)
5459 jmp retint_restore_args
5461 /* Either reschedule or signal or syscall exit tracking needed. */
5462 @@ -399,7 +554,7 @@ int_very_careful:
5463 ENABLE_INTERRUPTS(CLBR_NONE)
5465 /* Check for syscall exit trace */
5466 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
5467 + testl $_TIF_WORK_SYSCALL_EXIT,%edx
5470 CFI_ADJUST_CFA_OFFSET 8
5471 @@ -407,7 +562,7 @@ int_very_careful:
5472 call syscall_trace_leave
5474 CFI_ADJUST_CFA_OFFSET -8
5475 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
5476 + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
5477 jmp int_restore_rest
5480 @@ -416,7 +571,7 @@ int_signal:
5481 movq %rsp,%rdi # &ptregs -> arg1
5482 xorl %esi,%esi # oldset -> arg2
5483 call do_notify_resume
5484 -1: movl $_TIF_NEED_RESCHED,%edi
5485 +1: movl $_TIF_WORK_MASK,%edi
5488 DISABLE_INTERRUPTS(CLBR_NONE)
5489 @@ -443,7 +598,6 @@ END(\label)
5490 PTREGSCALL stub_clone, sys_clone, %r8
5491 PTREGSCALL stub_fork, sys_fork, %rdi
5492 PTREGSCALL stub_vfork, sys_vfork, %rdi
5493 - PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
5494 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
5495 PTREGSCALL stub_iopl, sys_iopl, %rsi
5497 @@ -517,10 +671,12 @@ END(stub_rt_sigreturn)
5502 +retint_with_reschedule:
5503 CFI_DEFAULT_STACK adj=1
5504 + movl $_TIF_WORK_MASK,%edi
5506 LOCKDEP_SYS_EXIT_IRQ
5507 - movl threadinfo_flags(%rcx),%edx
5508 + movl TI_flags(%rcx),%edx
5512 @@ -565,17 +721,16 @@ retint_signal:
5514 DISABLE_INTERRUPTS(CLBR_NONE)
5516 - movl $_TIF_NEED_RESCHED,%edi
5517 GET_THREAD_INFO(%rcx)
5519 + jmp retint_with_reschedule
5521 #ifdef CONFIG_PREEMPT
5522 /* Returning to kernel space. Check if we need preemption */
5523 /* rcx: threadinfo. interrupts off. */
5524 ENTRY(retint_kernel)
5525 - cmpl $0,threadinfo_preempt_count(%rcx)
5526 + cmpl $0,TI_preempt_count(%rcx)
5527 jnz retint_restore_args
5528 - bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
5529 + bt $TIF_NEED_RESCHED,TI_flags(%rcx)
5530 jnc retint_restore_args
5531 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
5532 jnc retint_restore_args
5533 @@ -630,6 +785,9 @@ END(invalidate_interrupt\num)
5534 ENTRY(call_function_interrupt)
5535 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
5536 END(call_function_interrupt)
5537 +ENTRY(call_function_single_interrupt)
5538 + apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
5539 +END(call_function_single_interrupt)
5540 ENTRY(irq_move_cleanup_interrupt)
5541 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
5542 END(irq_move_cleanup_interrupt)
5543 @@ -639,6 +797,10 @@ ENTRY(apic_timer_interrupt)
5544 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
5545 END(apic_timer_interrupt)
5547 +ENTRY(uv_bau_message_intr1)
5548 + apicinterrupt 220,uv_bau_message_interrupt
5549 +END(uv_bau_message_intr1)
5551 ENTRY(error_interrupt)
5552 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
5553 END(error_interrupt)
5554 @@ -752,7 +914,7 @@ paranoid_restore\trace:
5556 paranoid_userspace\trace:
5557 GET_THREAD_INFO(%rcx)
5558 - movl threadinfo_flags(%rcx),%ebx
5559 + movl TI_flags(%rcx),%ebx
5560 andl $_TIF_WORK_MASK,%ebx
5561 jz paranoid_swapgs\trace
5562 movq %rsp,%rdi /* &pt_regs */
5563 @@ -849,7 +1011,7 @@ error_exit:
5564 testb $3,CS-ARGOFFSET(%rsp)
5566 LOCKDEP_SYS_EXIT_IRQ
5567 - movl threadinfo_flags(%rcx),%edx
5568 + movl TI_flags(%rcx),%edx
5569 movl $_TIF_WORK_MASK,%edi
5572 @@ -871,11 +1033,11 @@ error_kernelspace:
5573 iret run with kernel gs again, so don't set the user space flag.
5574 B stepping K8s sometimes report a truncated RIP for IRET
5575 exceptions returning to compat mode. Check for these here too. */
5576 - leaq irq_return(%rip),%rbp
5577 - cmpq %rbp,RIP(%rsp)
5578 + leaq irq_return(%rip),%rcx
5579 + cmpq %rcx,RIP(%rsp)
5581 - movl %ebp,%ebp /* zero extend */
5582 - cmpq %rbp,RIP(%rsp)
5583 + movl %ecx,%ecx /* zero extend */
5584 + cmpq %rcx,RIP(%rsp)
5586 cmpq $gs_change,RIP(%rsp)
5588 @@ -1121,6 +1283,7 @@ END(device_not_available)
5589 /* runs on exception stack */
5592 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5594 CFI_ADJUST_CFA_OFFSET 8 */
5596 @@ -1148,6 +1311,7 @@ END(do_nmi_callback)
5600 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5602 CFI_ADJUST_CFA_OFFSET 8 */
5604 @@ -1171,14 +1335,11 @@ ENTRY(coprocessor_segment_overrun)
5605 zeroentry do_coprocessor_segment_overrun
5606 END(coprocessor_segment_overrun)
5609 - zeroentry do_reserved
5613 /* runs on exception stack */
5616 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5617 paranoidentry do_double_fault
5620 @@ -1196,6 +1357,7 @@ END(segment_not_present)
5621 /* runs on exception stack */
5622 ENTRY(stack_segment)
5624 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5625 paranoidentry do_stack_segment */
5626 errorentry do_stack_segment
5627 /* jmp paranoid_exit1
5628 @@ -1222,6 +1384,7 @@ END(spurious_interrupt_bug)
5629 /* runs on exception stack */
5630 ENTRY(machine_check)
5632 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5634 CFI_ADJUST_CFA_OFFSET 8
5635 paranoidentry do_machine_check
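The sysenter_audit/auditsys entry paths and their exit twins added in these two entry files call straight into the audit core, bypassing the full tracer when only TIF_SYSCALL_AUDIT is set. For reference, the 2.6.27-era C interfaces being called, plus the result encoding that the cmpl/setl/movzbl/inc sequences compute (audit_result() is an illustrative helper, not a kernel function):

extern void audit_syscall_entry(int arch, int major,
				unsigned long a0, unsigned long a1,
				unsigned long a2, unsigned long a3);
/* First argument takes an AUDITSC_* code. */
extern void audit_syscall_exit(int failed, long return_code);

#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2

/* cmpl $0,%eax; setl %al; movzbl %al,%eax; inc %eax */
static int audit_result(long rc)
{
	return (rc < 0) ? AUDITSC_FAILURE : AUDITSC_SUCCESS;
}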
5636 --- sle11-2009-10-16.orig/arch/x86/kernel/fixup.c 2009-03-16 16:33:40.000000000 +0100
5637 +++ sle11-2009-10-16/arch/x86/kernel/fixup.c 2009-06-04 10:21:39.000000000 +0200
5639 #include <linux/kernel.h>
5640 #include <linux/delay.h>
5641 #include <linux/version.h>
5642 +#include <asm/traps.h>
5644 #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
5646 --- sle11-2009-10-16.orig/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
5647 +++ sle11-2009-10-16/arch/x86/kernel/genapic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
5648 @@ -58,7 +58,7 @@ void __init setup_apic_routing(void)
5652 - if (num_possible_cpus() <= 8)
5653 + if (max_physical_apicid < 8)
5654 genapic = &apic_flat;
5656 genapic = &apic_physflat;
5657 @@ -121,4 +121,5 @@ int is_uv_system(void)
5659 return uv_system_type != UV_NONE;
5661 +EXPORT_SYMBOL_GPL(is_uv_system);
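The routing check changes because counting possible CPUs is the wrong test: flat logical mode addresses CPUs through an 8-bit logical-destination bitmap, so what matters is whether every APIC ID fits below 8, not how many CPUs exist. Sparse APIC IDs can exceed the limit even on small systems. Schematically (genapic handling simplified):

struct genapic;
extern struct genapic apic_flat, apic_physflat;
extern struct genapic *genapic;
extern int max_physical_apicid;

static void pick_apic_routing(void)	/* illustrative name */
{
	if (max_physical_apicid < 8)
		genapic = &apic_flat;		/* one LDR bit per CPU */
	else
		genapic = &apic_physflat;	/* physical destinations */
}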
5663 --- sle11-2009-10-16.orig/arch/x86/kernel/genapic_xen_64.c 2009-03-16 16:38:05.000000000 +0100
5664 +++ sle11-2009-10-16/arch/x86/kernel/genapic_xen_64.c 2009-06-04 10:21:39.000000000 +0200
5665 @@ -43,7 +43,7 @@ void xen_send_IPI_shortcut(unsigned int
5666 __send_IPI_one(smp_processor_id(), vector);
5668 case APIC_DEST_ALLBUT:
5669 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5670 + for_each_possible_cpu(cpu) {
5671 if (cpu == smp_processor_id())
5673 if (cpu_isset(cpu, cpu_online_map)) {
5674 @@ -52,7 +52,7 @@ void xen_send_IPI_shortcut(unsigned int
5677 case APIC_DEST_ALLINC:
5678 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5679 + for_each_possible_cpu(cpu) {
5680 if (cpu_isset(cpu, cpu_online_map)) {
5681 __send_IPI_one(cpu, vector);
5683 @@ -81,8 +81,6 @@ static cpumask_t xen_vector_allocation_d
5685 static void xen_init_apic_ldr(void)
5687 - Dprintk("%s\n", __FUNCTION__);
5691 static void xen_send_IPI_allbutself(int vector)
5692 @@ -92,14 +90,12 @@ static void xen_send_IPI_allbutself(int
5693 * we get an APIC send error if we try to broadcast.
5694 * Thus we have to avoid sending IPIs in this case.
5696 - Dprintk("%s\n", __FUNCTION__);
5697 if (num_online_cpus() > 1)
5698 xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
5701 static void xen_send_IPI_all(int vector)
5703 - Dprintk("%s\n", __FUNCTION__);
5704 xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
5707 @@ -109,11 +105,10 @@ static void xen_send_IPI_mask(cpumask_t
5709 unsigned long flags;
5711 - Dprintk("%s\n", __FUNCTION__);
5712 local_irq_save(flags);
5713 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
5715 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5716 + for_each_possible_cpu(cpu) {
5717 if (cpu_isset(cpu, cpumask)) {
5718 __send_IPI_one(cpu, vector);
5720 @@ -125,14 +120,12 @@ static void xen_send_IPI_mask(cpumask_t
5721 static int xen_apic_id_registered(void)
5724 - Dprintk("%s\n", __FUNCTION__);
5725 return physid_isset(smp_processor_id(), phys_cpu_present_map);
5729 static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
5731 - Dprintk("%s\n", __FUNCTION__);
5732 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
5735 @@ -140,7 +133,6 @@ static unsigned int phys_pkg_id(int inde
5739 - Dprintk("%s\n", __FUNCTION__);
5741 return ((ebx >> 24) & 0xFF) >> index_msb;
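The NR_CPUS loops above become for_each_possible_cpu(), which walks only CPU IDs that can ever come online instead of iterating to the compile-time maximum. Its cpumask.h expansion in this era is roughly:

#define for_each_cpu_mask(cpu, mask)			\
	for ((cpu) = first_cpu(mask);			\
	     (cpu) < NR_CPUS;				\
	     (cpu) = next_cpu((cpu), (mask)))

#define for_each_possible_cpu(cpu) \
	for_each_cpu_mask((cpu), cpu_possible_map)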
5743 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5744 +++ sle11-2009-10-16/arch/x86/kernel/head-xen.c 2009-06-04 10:21:39.000000000 +0200
5746 +#include <linux/kernel.h>
5747 +#include <linux/init.h>
5749 +#include <asm/setup.h>
5750 +#include <asm/bios_ebda.h>
5752 +#define BIOS_LOWMEM_KILOBYTES 0x413
5755 + * The BIOS places the EBDA/XBDA at the top of conventional
5756 + * memory, and usually decreases the reported amount of
5757 + * conventional memory (int 0x12) too. This also contains a
5758 + * workaround for Dell systems that neglect to reserve EBDA.
5759 + * The same workaround also avoids a problem with the AMD768MPX
5760 + * chipset: reserve a page before VGA to prevent PCI prefetch
5761 + * into it (errata #56). Usually the page is reserved anyways,
5762 + * unless you have no PS/2 mouse plugged in.
5764 +void __init reserve_ebda_region(void)
5767 + unsigned int lowmem, ebda_addr;
5769 + /* To determine the position of the EBDA and the */
5770 + /* end of conventional memory, we need to look at */
5771 + /* the BIOS data area. In a paravirtual environment */
5772 + /* that area is absent. We'll just have to assume */
5773 + /* that the paravirt case can handle memory setup */
5774 + /* correctly, without our help. */
5775 + if (paravirt_enabled())
5778 + /* end of low (conventional) memory */
5779 + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5782 + /* start of EBDA area */
5783 + ebda_addr = get_bios_ebda();
5785 + /* Fixup: bios puts an EBDA in the top 64K segment */
5786 + /* of conventional memory, but does not adjust lowmem. */
5787 + if ((lowmem - ebda_addr) <= 0x10000)
5788 + lowmem = ebda_addr;
5790 + /* Fixup: bios does not report an EBDA at all. */
5791 + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5792 + if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5795 + /* Paranoia: should never happen, but... */
5796 + if ((lowmem == 0) || (lowmem >= 0x100000))
5799 + /* reserve all memory between lowmem and the 1MB mark */
5800 + reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
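The new file centralizes the EBDA reservation for 32- and 64-bit (the 64-bit copy is deleted from head64-xen.c below). The helper it builds on reads the real-mode pointer at physical address 0x40E; its asm/bios_ebda.h definition in this era is essentially:

static inline unsigned int get_bios_ebda(void)
{
	/* 0x40E holds a real-mode segment; shift left 4 bits to
	 * turn paragraphs into a physical byte address. */
	unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
	address <<= 4;
	return address;
}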
5803 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5804 +++ sle11-2009-10-16/arch/x86/kernel/head32-xen.c 2009-06-04 10:21:39.000000000 +0200
5807 + * linux/arch/i386/kernel/head32.c -- prepare to run common code
5809 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5810 + * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
5813 +#include <linux/init.h>
5814 +#include <linux/start_kernel.h>
5816 +#include <asm/setup.h>
5817 +#include <asm/sections.h>
5818 +#include <asm/e820.h>
5819 +#include <asm/bios_ebda.h>
5821 +void __init i386_start_kernel(void)
5823 + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
5826 +#ifdef CONFIG_BLK_DEV_INITRD
5827 + /* Reserve INITRD */
5828 + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
5829 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
5830 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
5831 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
5832 + reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
5835 + reserve_early(init_pg_tables_start, init_pg_tables_end,
5838 + reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
5839 + __pa(xen_start_info->pt_base)
5840 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
5846 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
5847 + max_cmdline = COMMAND_LINE_SIZE;
5848 + memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
5849 + boot_command_line[max_cmdline-1] = '\0';
5853 + reserve_ebda_region();
5856 + * At this point everything still needed from the boot loader
5857 + * or BIOS or kernel text should be early reserved or marked not
5858 + * RAM in e820. All other memory is free game.
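reserve_early() and the overlap-tolerant variant used for the EBDA above record ranges in a small static table that is honored before bootmem exists and later handed over to the bootmem allocator. A standalone sketch of the bookkeeping, with the table size and the overlap policy simplified away (the kernel version checks for conflicts and panics):

#include <stdint.h>
#include <string.h>

struct early_res {
	uint64_t start, end;
	char name[16];
};

static struct early_res early_res[32];
static int nr_early_res;

static void reserve_early(uint64_t start, uint64_t end, const char *name)
{
	struct early_res *r;

	if (nr_early_res >= 32)
		return;			/* real code panics when full */
	r = &early_res[nr_early_res++];
	r->start = start;
	r->end = end;
	strncpy(r->name, name, sizeof(r->name) - 1);
	r->name[sizeof(r->name) - 1] = '\0';
}

Early allocators then simply skip any range recorded here, which is why the text/RAMDISK/page-table reservations above must happen before the first allocation.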
5863 --- sle11-2009-10-16.orig/arch/x86/kernel/head64-xen.c 2009-03-16 16:38:05.000000000 +0100
5864 +++ sle11-2009-10-16/arch/x86/kernel/head64-xen.c 2009-06-04 10:21:39.000000000 +0200
5866 #include <asm/e820.h>
5867 #include <asm/bios_ebda.h>
5869 -unsigned long start_pfn;
5871 +static struct x8664_pda _boot_cpu_pda __read_mostly;
5875 + * We install an empty cpu_pda pointer table to indicate to early users
5876 + * (numa_set_node) that the cpu_pda pointer table for cpus other than
5877 + * the boot cpu is not yet setup.
5879 +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
5881 +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
5884 +void __init x86_64_init_pda(void)
5886 + _cpu_pda = __cpu_pda;
5887 + cpu_pda(0) = &_boot_cpu_pda;
5892 static void __init zap_identity_mappings(void)
5893 @@ -77,83 +96,10 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
5894 unsigned int machine_to_phys_order;
5895 EXPORT_SYMBOL(machine_to_phys_order);
5897 -#define BIOS_LOWMEM_KILOBYTES 0x413
5900 - * The BIOS places the EBDA/XBDA at the top of conventional
5901 - * memory, and usually decreases the reported amount of
5902 - * conventional memory (int 0x12) too. This also contains a
5903 - * workaround for Dell systems that neglect to reserve EBDA.
5904 - * The same workaround also avoids a problem with the AMD768MPX
5905 - * chipset: reserve a page before VGA to prevent PCI prefetch
5906 - * into it (errata #56). Usually the page is reserved anyways,
5907 - * unless you have no PS/2 mouse plugged in.
5909 -static void __init reserve_ebda_region(void)
5912 - unsigned int lowmem, ebda_addr;
5914 - /* To determine the position of the EBDA and the */
5915 - /* end of conventional memory, we need to look at */
5916 - /* the BIOS data area. In a paravirtual environment */
5917 - /* that area is absent. We'll just have to assume */
5918 - /* that the paravirt case can handle memory setup */
5919 - /* correctly, without our help. */
5920 - if (paravirt_enabled())
5923 - /* end of low (conventional) memory */
5924 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5927 - /* start of EBDA area */
5928 - ebda_addr = get_bios_ebda();
5930 - /* Fixup: bios puts an EBDA in the top 64K segment */
5931 - /* of conventional memory, but does not adjust lowmem. */
5932 - if ((lowmem - ebda_addr) <= 0x10000)
5933 - lowmem = ebda_addr;
5935 - /* Fixup: bios does not report an EBDA at all. */
5936 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5937 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5940 - /* Paranoia: should never happen, but... */
5941 - if ((lowmem == 0) || (lowmem >= 0x100000))
5944 - /* reserve all memory between lowmem and the 1MB mark */
5945 - reserve_early(lowmem, 0x100000, "BIOS reserved");
5949 -static void __init reserve_setup_data(void)
5952 - struct setup_data *data;
5953 - unsigned long pa_data;
5956 - if (boot_params.hdr.version < 0x0209)
5958 - pa_data = boot_params.hdr.setup_data;
5960 - data = early_ioremap(pa_data, sizeof(*data));
5961 - sprintf(buf, "setup data %x", data->type);
5962 - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
5963 - pa_data = data->next;
5964 - early_iounmap(data, sizeof(*data));
5969 void __init x86_64_start_kernel(char * real_mode_data)
5971 struct xen_machphys_mapping mapping;
5972 unsigned long machine_to_phys_nr_ents;
5976 * Build-time sanity checks on the kernel image and module
5977 @@ -167,6 +113,7 @@ void __init x86_64_start_kernel(char * r
5978 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
5979 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
5980 (__START_KERNEL & PGDIR_MASK)));
5981 + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
5983 xen_setup_features();
5985 @@ -174,8 +121,6 @@ void __init x86_64_start_kernel(char * r
5986 if (!xen_feature(XENFEAT_auto_translated_physmap))
5987 phys_to_machine_mapping =
5988 (unsigned long *)xen_start_info->mfn_list;
5989 - start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
5990 - xen_start_info->nr_pt_frames;
5992 machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
5993 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
5994 @@ -208,19 +153,23 @@ void __init x86_64_start_kernel(char * r
5996 early_printk("Kernel alive\n");
5998 - for (i = 0; i < NR_CPUS; i++)
5999 - cpu_pda(i) = &boot_cpu_pda[i];
6000 + x86_64_init_pda();
6003 + early_printk("Kernel really alive\n");
6005 + x86_64_start_reservations(real_mode_data);
6008 +void __init x86_64_start_reservations(char *real_mode_data)
6010 copy_bootdata(__va(real_mode_data));
6012 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
6014 reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
6015 - start_pfn << PAGE_SHIFT, "Xen provided");
6017 - reserve_ebda_region();
6018 - reserve_setup_data();
6019 + __pa(xen_start_info->pt_base)
6020 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
6024 * At this point everything still needed from the boot loader
6025 --- sle11-2009-10-16.orig/arch/x86/kernel/head_64-xen.S 2009-02-16 16:17:21.000000000 +0100
6026 +++ sle11-2009-10-16/arch/x86/kernel/head_64-xen.S 2009-06-04 10:21:39.000000000 +0200
6027 @@ -95,53 +95,6 @@ NEXT_PAGE(hypercall_page)
6034 - .globl cpu_gdt_descr
6036 - .word gdt_end-cpu_gdt_table-1
6038 - .quad cpu_gdt_table
6046 -/* We need valid kernel segments for data and code in long mode too
6047 - * IRET will check the segment types kkeil 2000/10/28
6048 - * Also sysret mandates a special GDT layout
6051 - .section .data.page_aligned, "aw"
6054 -/* The TLS descriptors are currently at a different place compared to i386.
6055 - Hopefully nobody expects them at a fixed place (Wine?) */
6057 -ENTRY(cpu_gdt_table)
6058 - .quad 0x0000000000000000 /* NULL descriptor */
6059 - .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
6060 - .quad 0x00af9b000000ffff /* __KERNEL_CS */
6061 - .quad 0x00cf93000000ffff /* __KERNEL_DS */
6062 - .quad 0x00cffb000000ffff /* __USER32_CS */
6063 - .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
6064 - .quad 0x00affb000000ffff /* __USER_CS */
6065 - .quad 0x0 /* unused */
6066 - .quad 0,0 /* TSS */
6067 - .quad 0,0 /* LDT */
6068 - .quad 0,0,0 /* three TLS descriptors */
6069 - .quad 0x0000f40000000000 /* node/CPU stored in limit */
6071 - /* asm/segment.h:GDT_ENTRIES must match this */
6072 - /* This should be a multiple of the cache line size */
6073 - /* GDTs of other CPUs are now dynamically allocated */
6075 - /* zero the remaining page */
6076 - .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
6078 .section .bss.page_aligned, "aw", @nobits
6080 ENTRY(empty_zero_page)
6081 --- sle11-2009-10-16.orig/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:38:05.000000000 +0100
6082 +++ sle11-2009-10-16/arch/x86/kernel/io_apic_32-xen.c 2009-06-04 10:21:39.000000000 +0200
6084 #include <linux/init.h>
6085 #include <linux/delay.h>
6086 #include <linux/sched.h>
6087 +#include <linux/bootmem.h>
6088 #include <linux/mc146818rtc.h>
6089 #include <linux/compiler.h>
6090 #include <linux/acpi.h>
6091 @@ -75,7 +76,7 @@ static struct { int pin, apic; } ioapic_
6092 static DEFINE_SPINLOCK(ioapic_lock);
6093 static DEFINE_SPINLOCK(vector_lock);
6095 -int timer_over_8254 __initdata = 1;
6096 +int timer_through_8259 __initdata;
6099 * Is the SiS APIC rmw bug present ?
6100 @@ -89,15 +90,21 @@ int sis_apic_bug = -1;
6101 int nr_ioapic_registers[MAX_IO_APICS];
6103 /* I/O APIC entries */
6104 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6105 +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
6108 /* MP IRQ source entries */
6109 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6110 +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6112 /* # of MP IRQ source entries */
6115 +#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
6116 +int mp_bus_id_to_type[MAX_MP_BUSSES];
6119 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
6121 static int disable_timer_pin_1 __initdata;
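The mpc_* to mp_* switch in this file replaces the raw MP-table records (struct mpc_config_ioapic/mpc_config_intsrc) with firmware-neutral copies, so ACPI-discovered and MPS-discovered interrupt sources share one in-kernel representation. The replacement structures look roughly like this (field list abridged from the 2.6.27 headers):

struct mp_config_ioapic {
	unsigned long mp_apicaddr;
	unsigned short mp_apicid;
	unsigned char mp_type;
	unsigned char mp_apicver;
	unsigned char mp_flags;
};

struct mp_config_intsrc {
	unsigned int mp_dstapic;
	unsigned char mp_type;
	unsigned char mp_irqtype;
	unsigned short mp_irqflag;
	unsigned char mp_srcbus;
	unsigned char mp_srcbusirq;
	unsigned char mp_dstirq;
};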
6124 @@ -128,7 +135,7 @@ struct io_apic {
6125 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
6127 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
6128 - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
6129 + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
6133 @@ -142,7 +149,7 @@ static inline unsigned int io_apic_read(
6134 struct physdev_apic apic_op;
6137 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6138 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6140 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
6142 @@ -160,7 +167,7 @@ static inline void io_apic_write(unsigne
6144 struct physdev_apic apic_op;
6146 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6147 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6149 apic_op.value = value;
6150 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
6151 @@ -288,7 +295,7 @@ static void __init replace_pin_at_irq(un
6155 -static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
6156 +static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
6158 struct irq_pin_list *entry = irq_2_pin + irq;
6159 unsigned int pin, reg;
6160 @@ -308,30 +315,32 @@ static void __modify_IO_APIC_irq (unsign
6164 -static void __mask_IO_APIC_irq (unsigned int irq)
6165 +static void __mask_IO_APIC_irq(unsigned int irq)
6167 - __modify_IO_APIC_irq(irq, 0x00010000, 0);
6168 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
6172 -static void __unmask_IO_APIC_irq (unsigned int irq)
6173 +static void __unmask_IO_APIC_irq(unsigned int irq)
6175 - __modify_IO_APIC_irq(irq, 0, 0x00010000);
6176 + __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
6179 /* mask = 1, trigger = 0 */
6180 -static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
6181 +static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
6183 - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
6184 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
6185 + IO_APIC_REDIR_LEVEL_TRIGGER);
6188 /* mask = 0, trigger = 1 */
6189 -static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
6190 +static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
6192 - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
6193 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
6194 + IO_APIC_REDIR_MASKED);
6197 -static void mask_IO_APIC_irq (unsigned int irq)
6198 +static void mask_IO_APIC_irq(unsigned int irq)
6200 unsigned long flags;
6202 @@ -340,7 +349,7 @@ static void mask_IO_APIC_irq (unsigned i
6203 spin_unlock_irqrestore(&ioapic_lock, flags);
6206 -static void unmask_IO_APIC_irq (unsigned int irq)
6207 +static void unmask_IO_APIC_irq(unsigned int irq)
6209 unsigned long flags;
6211 @@ -352,7 +361,7 @@ static void unmask_IO_APIC_irq (unsigned
6212 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
6214 struct IO_APIC_route_entry entry;
6217 /* Check delivery_mode to be sure we're not clearing an SMI pin */
6218 entry = ioapic_read_entry(apic, pin);
6219 if (entry.delivery_mode == dest_SMI)
6220 @@ -364,7 +373,7 @@ static void clear_IO_APIC_pin(unsigned i
6221 ioapic_mask_entry(apic, pin);
6224 -static void clear_IO_APIC (void)
6225 +static void clear_IO_APIC(void)
6229 @@ -381,7 +390,7 @@ static void set_ioapic_affinity_irq(unsi
6230 struct irq_pin_list *entry = irq_2_pin + irq;
6231 unsigned int apicid_value;
6235 cpus_and(tmp, cpumask, cpu_online_map);
6236 if (cpus_empty(tmp))
6238 @@ -410,7 +419,7 @@ static void set_ioapic_affinity_irq(unsi
6239 # include <linux/kernel_stat.h> /* kstat */
6240 # include <linux/slab.h> /* kmalloc() */
6241 # include <linux/timer.h>
6244 #define IRQBALANCE_CHECK_ARCH -999
6245 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
6246 #define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
6247 @@ -422,14 +431,14 @@ static int physical_balance __read_mostl
6248 static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
6250 static struct irq_cpu_info {
6251 - unsigned long * last_irq;
6252 - unsigned long * irq_delta;
6253 + unsigned long *last_irq;
6254 + unsigned long *irq_delta;
6256 } irq_cpu_data[NR_CPUS];
6258 #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
6259 -#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
6260 -#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
6261 +#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
6262 +#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
6264 #define IDLE_ENOUGH(cpu,now) \
6265 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
6266 @@ -468,8 +477,8 @@ inside:
6270 - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
6271 - (search_idle && !IDLE_ENOUGH(cpu,now)));
6272 + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
6273 + (search_idle && !IDLE_ENOUGH(cpu, now)));
6277 @@ -479,15 +488,14 @@ static inline void balance_irq(int cpu,
6278 unsigned long now = jiffies;
6279 cpumask_t allowed_mask;
6280 unsigned int new_cpu;
6283 if (irqbalance_disabled)
6287 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
6288 new_cpu = move(cpu, allowed_mask, now, 1);
6289 - if (cpu != new_cpu) {
6290 + if (cpu != new_cpu)
6291 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
6295 static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
6296 @@ -499,14 +507,14 @@ static inline void rotate_irqs_among_cpu
6297 if (!irq_desc[j].action)
6299 /* Is it a significant load ? */
6300 - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
6301 + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
6302 useful_load_threshold)
6307 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6308 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6309 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6313 @@ -535,22 +543,22 @@ static void do_irq_balance(void)
6314 /* Is this an active IRQ or balancing disabled ? */
6315 if (!irq_desc[j].action || irq_balancing_disabled(j))
6317 - if ( package_index == i )
6318 - IRQ_DELTA(package_index,j) = 0;
6319 + if (package_index == i)
6320 + IRQ_DELTA(package_index, j) = 0;
6321 /* Determine the total count per processor per IRQ */
6322 value_now = (unsigned long) kstat_cpu(i).irqs[j];
6324 /* Determine the activity per processor per IRQ */
6325 - delta = value_now - LAST_CPU_IRQ(i,j);
6326 + delta = value_now - LAST_CPU_IRQ(i, j);
6328 /* Update last_cpu_irq[][] for the next time */
6329 - LAST_CPU_IRQ(i,j) = value_now;
6330 + LAST_CPU_IRQ(i, j) = value_now;
6332 /* Ignore IRQs whose rate is less than the clock */
6333 if (delta < useful_load_threshold)
6335 /* update the load for the processor or package total */
6336 - IRQ_DELTA(package_index,j) += delta;
6337 + IRQ_DELTA(package_index, j) += delta;
6339 /* Keep track of the higher numbered sibling as well */
6340 if (i != package_index)
6341 @@ -576,7 +584,8 @@ static void do_irq_balance(void)
6342 max_cpu_irq = ULONG_MAX;
6345 - /* Look for heaviest loaded processor.
6347 + * Look for heaviest loaded processor.
6348 * We may come back to get the next heaviest loaded processor.
6349 * Skip processors with trivial loads.
6351 @@ -585,7 +594,7 @@ tryanothercpu:
6352 for_each_online_cpu(i) {
6353 if (i != CPU_TO_PACKAGEINDEX(i))
6355 - if (max_cpu_irq <= CPU_IRQ(i))
6356 + if (max_cpu_irq <= CPU_IRQ(i))
6358 if (tmp_cpu_irq < CPU_IRQ(i)) {
6359 tmp_cpu_irq = CPU_IRQ(i);
6360 @@ -594,8 +603,9 @@ tryanothercpu:
6363 if (tmp_loaded == -1) {
6364 - /* In the case of small number of heavy interrupt sources,
6365 - * loading some of the cpus too much. We use Ingo's original
6367 + * In the case of small number of heavy interrupt sources,
6368 + * loading some of the cpus too much. We use Ingo's original
6369 * approach to rotate them around.
6371 if (!first_attempt && imbalance >= useful_load_threshold) {
6372 @@ -604,13 +614,14 @@ tryanothercpu:
6374 goto not_worth_the_effort;
6378 first_attempt = 0; /* heaviest search */
6379 max_cpu_irq = tmp_cpu_irq; /* load */
6380 max_loaded = tmp_loaded; /* processor */
6381 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
6383 - /* if imbalance is less than approx 10% of max load, then
6386 + * if imbalance is less than approx 10% of max load, then
6387 * observe diminishing returns action. - quit
6389 if (imbalance < (max_cpu_irq >> 3))
6390 @@ -626,26 +637,25 @@ tryanotherirq:
6391 /* Is this an active IRQ? */
6392 if (!irq_desc[j].action)
6394 - if (imbalance <= IRQ_DELTA(max_loaded,j))
6395 + if (imbalance <= IRQ_DELTA(max_loaded, j))
6397 /* Try to find the IRQ that is closest to the imbalance
6398 * without going over.
6400 - if (move_this_load < IRQ_DELTA(max_loaded,j)) {
6401 - move_this_load = IRQ_DELTA(max_loaded,j);
6402 + if (move_this_load < IRQ_DELTA(max_loaded, j)) {
6403 + move_this_load = IRQ_DELTA(max_loaded, j);
6407 - if (selected_irq == -1) {
6408 + if (selected_irq == -1)
6412 imbalance = move_this_load;
6415 /* For physical_balance case, we accumulated both load
6416 * values in the one of the siblings cpu_irq[],
6417 * to use the same code for physical and logical processors
6418 - * as much as possible.
6419 + * as much as possible.
6421 * NOTE: the cpu_irq[] array holds the sum of the load for
6422 * sibling A and sibling B in the slot for the lowest numbered
6423 @@ -674,11 +684,11 @@ tryanotherirq:
6424 /* mark for change destination */
6425 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
6427 - /* Since we made a change, come back sooner to
6428 + /* Since we made a change, come back sooner to
6429 * check for more variation.
6431 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6432 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6433 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6437 @@ -689,7 +699,7 @@ not_worth_the_effort:
6440 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
6441 - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6442 + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6446 @@ -728,13 +738,13 @@ static int __init balanced_irq_init(void
6449 cpus_shift_right(tmp, cpu_online_map, 2);
6450 - c = &boot_cpu_data;
6451 + c = &boot_cpu_data;
6452 /* When not overwritten by the command line ask subarchitecture. */
6453 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
6454 irqbalance_disabled = NO_BALANCE_IRQ;
6455 if (irqbalance_disabled)
6459 /* disable irqbalance completely if there is only one processor online */
6460 if (num_online_cpus() < 2) {
6461 irqbalance_disabled = 1;
6462 @@ -748,16 +758,14 @@ static int __init balanced_irq_init(void
6463 physical_balance = 1;
6465 for_each_online_cpu(i) {
6466 - irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6467 - irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6468 + irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6469 + irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6470 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
6471 printk(KERN_ERR "balanced_irq_init: out of memory");
6474 - memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
6475 - memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
6479 printk(KERN_INFO "Starting balanced_irq\n");
6480 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
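The kzalloc() switch above is why the two memset() calls are deleted:
kzalloc() allocates and zeroes in one step. A minimal sketch of the
equivalence (illustrative only; alloc_counters is a made-up name):

	#include <linux/slab.h>

	/* kzalloc(n, flags) behaves like kmalloc(n, flags) followed by
	 * memset(p, 0, n), so a separate clearing pass is redundant. */
	static unsigned long *alloc_counters(unsigned int nr)
	{
		return kzalloc(nr * sizeof(unsigned long), GFP_KERNEL);
	}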
6482 @@ -799,7 +807,7 @@ void send_IPI_self(int vector)
6484 * Send the IPI. The write to APIC_ICR fires this off.
6486 - apic_write_around(APIC_ICR, cfg);
6487 + apic_write(APIC_ICR, cfg);
6490 #endif /* !CONFIG_SMP */
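The apic_write_around() -> apic_write() conversions here and throughout
follow the 2.6.27 removal of the write-around wrapper. From memory, the
old i386 header picked between a plain MMIO store and an atomic
xchg-based write that dodged an erratum on early external APICs,
roughly:

	/* rough shape of the dropped wrapper (old <asm-i386/apic.h>) */
	#ifdef CONFIG_X86_GOOD_APIC
	# define apic_write_around(x, y) apic_write((x), (y))
	#else
	# define apic_write_around(x, y) apic_write_atomic((x), (y))
	#endif

With the workaround no longer considered necessary on these paths, the
direct write is used everywhere.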
6491 @@ -853,10 +861,10 @@ static int find_irq_entry(int apic, int
6494 for (i = 0; i < mp_irq_entries; i++)
6495 - if (mp_irqs[i].mpc_irqtype == type &&
6496 - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
6497 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
6498 - mp_irqs[i].mpc_dstirq == pin)
6499 + if (mp_irqs[i].mp_irqtype == type &&
6500 + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
6501 + mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
6502 + mp_irqs[i].mp_dstirq == pin)
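The mpc_* -> mp_* renames running through the rest of this file track
the 2.6.27 split between the raw BIOS MP-table records (struct
mpc_config_intsrc et al.) and the kernel's own working copies. Roughly
the shape of the working-copy record, with the field list inferred from
the accesses in this patch (exact types may differ):

	struct mp_config_intsrc {
		int mp_type;
		int mp_irqtype;
		unsigned short mp_irqflag;
		int mp_srcbus;
		int mp_srcbusirq;
		int mp_dstapic;
		int mp_dstirq;
	};

Keeping a separate copy lets ACPI-driven setups fill in the same
structures without pretending to have a BIOS MP table.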
6506 @@ -871,13 +879,13 @@ static int __init find_isa_irq_pin(int i
6509 for (i = 0; i < mp_irq_entries; i++) {
6510 - int lbus = mp_irqs[i].mpc_srcbus;
6511 + int lbus = mp_irqs[i].mp_srcbus;
6513 if (test_bit(lbus, mp_bus_not_pci) &&
6514 - (mp_irqs[i].mpc_irqtype == type) &&
6515 - (mp_irqs[i].mpc_srcbusirq == irq))
6516 + (mp_irqs[i].mp_irqtype == type) &&
6517 + (mp_irqs[i].mp_srcbusirq == irq))
6519 - return mp_irqs[i].mpc_dstirq;
6520 + return mp_irqs[i].mp_dstirq;
6524 @@ -887,17 +895,17 @@ static int __init find_isa_irq_apic(int
6527 for (i = 0; i < mp_irq_entries; i++) {
6528 - int lbus = mp_irqs[i].mpc_srcbus;
6529 + int lbus = mp_irqs[i].mp_srcbus;
6531 if (test_bit(lbus, mp_bus_not_pci) &&
6532 - (mp_irqs[i].mpc_irqtype == type) &&
6533 - (mp_irqs[i].mpc_srcbusirq == irq))
6534 + (mp_irqs[i].mp_irqtype == type) &&
6535 + (mp_irqs[i].mp_srcbusirq == irq))
6538 if (i < mp_irq_entries) {
6540 - for(apic = 0; apic < nr_ioapics; apic++) {
6541 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
6542 + for (apic = 0; apic < nr_ioapics; apic++) {
6543 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
6547 @@ -918,28 +926,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6549 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
6550 "slot:%d, pin:%d.\n", bus, slot, pin);
6551 - if (mp_bus_id_to_pci_bus[bus] == -1) {
6552 + if (test_bit(bus, mp_bus_not_pci)) {
6553 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
6556 for (i = 0; i < mp_irq_entries; i++) {
6557 - int lbus = mp_irqs[i].mpc_srcbus;
6558 + int lbus = mp_irqs[i].mp_srcbus;
6560 for (apic = 0; apic < nr_ioapics; apic++)
6561 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
6562 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
6563 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
6564 + mp_irqs[i].mp_dstapic == MP_APIC_ALL)
6567 if (!test_bit(lbus, mp_bus_not_pci) &&
6568 - !mp_irqs[i].mpc_irqtype &&
6569 + !mp_irqs[i].mp_irqtype &&
6571 - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
6572 - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
6573 + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
6574 + int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
6576 if (!(apic || IO_APIC_IRQ(irq)))
6579 - if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
6580 + if (pin == (mp_irqs[i].mp_srcbusirq & 3))
6583 * Use the first all-but-pin matching entry as a
6584 @@ -954,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6585 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
6588 - * This function currently is only a helper for the i386 smp boot process where
6589 + * This function currently is only a helper for the i386 smp boot process where
6590 * we need to reprogram the ioredtbls to cater for the cpus which have come online
6591 * so mask in all cases should simply be TARGET_CPUS
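Note the mp_bus_id_to_pci_bus[] lookup above being replaced by a bitmap
test: bus type is now tracked in mp_bus_not_pci, one bit per possible
MP bus. A short sketch (bus_is_pci is a made-up helper):

	#include <linux/bitmap.h>

	/* set for ISA/EISA/MCA buses, clear for PCI */
	DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);

	static int bus_is_pci(int bus)
	{
		return !test_bit(bus, mp_bus_not_pci);
	}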
6593 @@ -1008,7 +1016,7 @@ static int EISA_ELCR(unsigned int irq)
6594 * EISA conforming in the MP table, that means its trigger type must
6595 * be read in from the ELCR */
6597 -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
6598 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
6599 #define default_EISA_polarity(idx) default_ISA_polarity(idx)
6601 /* PCI interrupts are always polarity one level triggered,
6602 @@ -1025,118 +1033,115 @@ static int EISA_ELCR(unsigned int irq)
6604 static int MPBIOS_polarity(int idx)
6606 - int bus = mp_irqs[idx].mpc_srcbus;
6607 + int bus = mp_irqs[idx].mp_srcbus;
6611 * Determine IRQ line polarity (high active or low active):
6613 - switch (mp_irqs[idx].mpc_irqflag & 3)
6614 + switch (mp_irqs[idx].mp_irqflag & 3) {
6615 + case 0: /* conforms, ie. bus-type dependent polarity */
6617 - case 0: /* conforms, ie. bus-type dependent polarity */
6619 - polarity = test_bit(bus, mp_bus_not_pci)?
6620 - default_ISA_polarity(idx):
6621 - default_PCI_polarity(idx);
6624 - case 1: /* high active */
6629 - case 2: /* reserved */
6631 - printk(KERN_WARNING "broken BIOS!!\n");
6635 - case 3: /* low active */
6640 - default: /* invalid */
6642 - printk(KERN_WARNING "broken BIOS!!\n");
6646 + polarity = test_bit(bus, mp_bus_not_pci)?
6647 + default_ISA_polarity(idx):
6648 + default_PCI_polarity(idx);
6651 + case 1: /* high active */
6656 + case 2: /* reserved */
6658 + printk(KERN_WARNING "broken BIOS!!\n");
6662 + case 3: /* low active */
6667 + default: /* invalid */
6669 + printk(KERN_WARNING "broken BIOS!!\n");
6677 static int MPBIOS_trigger(int idx)
6679 - int bus = mp_irqs[idx].mpc_srcbus;
6680 + int bus = mp_irqs[idx].mp_srcbus;
6684 * Determine IRQ trigger mode (edge or level sensitive):
6686 - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
6687 + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
6688 + case 0: /* conforms, ie. bus-type dependent */
6690 - case 0: /* conforms, ie. bus-type dependent */
6692 - trigger = test_bit(bus, mp_bus_not_pci)?
6693 - default_ISA_trigger(idx):
6694 - default_PCI_trigger(idx);
6695 + trigger = test_bit(bus, mp_bus_not_pci)?
6696 + default_ISA_trigger(idx):
6697 + default_PCI_trigger(idx);
6698 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
6699 - switch (mp_bus_id_to_type[bus])
6701 - case MP_BUS_ISA: /* ISA pin */
6703 - /* set before the switch */
6706 - case MP_BUS_EISA: /* EISA pin */
6708 - trigger = default_EISA_trigger(idx);
6711 - case MP_BUS_PCI: /* PCI pin */
6713 - /* set before the switch */
6716 - case MP_BUS_MCA: /* MCA pin */
6718 - trigger = default_MCA_trigger(idx);
6723 - printk(KERN_WARNING "broken BIOS!!\n");
6729 + switch (mp_bus_id_to_type[bus]) {
6730 + case MP_BUS_ISA: /* ISA pin */
6732 + /* set before the switch */
6735 - case 1: /* edge */
6736 + case MP_BUS_EISA: /* EISA pin */
6739 + trigger = default_EISA_trigger(idx);
6742 - case 2: /* reserved */
6743 + case MP_BUS_PCI: /* PCI pin */
6745 - printk(KERN_WARNING "broken BIOS!!\n");
6747 + /* set before the switch */
6750 - case 3: /* level */
6751 + case MP_BUS_MCA: /* MCA pin */
6754 + trigger = default_MCA_trigger(idx);
6757 - default: /* invalid */
6760 printk(KERN_WARNING "broken BIOS!!\n");
6769 + case 1: /* edge */
6774 + case 2: /* reserved */
6776 + printk(KERN_WARNING "broken BIOS!!\n");
6780 + case 3: /* level */
6785 + default: /* invalid */
6787 + printk(KERN_WARNING "broken BIOS!!\n");
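The two large hunks above are pure reindentation of MPBIOS_polarity()
and MPBIOS_trigger(): the opening brace moves onto the switch line and
the case labels move to the same column as the switch, per
Documentation/CodingStyle. In miniature (polarity_of is illustrative):

	static int polarity_of(unsigned short irqflag)
	{
		switch (irqflag & 3) {
		case 1:			/* high active */
			return 0;
		case 3:			/* low active */
			return 1;
		default:		/* 0 conforms, 2 is reserved */
			return -1;
		}
	}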
6795 @@ -1153,16 +1158,16 @@ static inline int irq_trigger(int idx)
6796 static int pin_2_irq(int idx, int apic, int pin)
6799 - int bus = mp_irqs[idx].mpc_srcbus;
6800 + int bus = mp_irqs[idx].mp_srcbus;
6803 * Debugging check, we are in big trouble if this message pops up!
6805 - if (mp_irqs[idx].mpc_dstirq != pin)
6806 + if (mp_irqs[idx].mp_dstirq != pin)
6807 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
6809 if (test_bit(bus, mp_bus_not_pci))
6810 - irq = mp_irqs[idx].mpc_srcbusirq;
6811 + irq = mp_irqs[idx].mp_srcbusirq;
6814 * PCI IRQs are mapped in order
6815 @@ -1204,8 +1209,8 @@ static inline int IO_APIC_irq_trigger(in
6817 for (apic = 0; apic < nr_ioapics; apic++) {
6818 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6819 - idx = find_irq_entry(apic,pin,mp_INT);
6820 - if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
6821 + idx = find_irq_entry(apic, pin, mp_INT);
6822 + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
6823 return irq_trigger(idx);
6826 @@ -1291,25 +1296,25 @@ static void __init setup_IO_APIC_irqs(vo
6828 * add it to the IO-APIC irq-routing table:
6830 - memset(&entry,0,sizeof(entry));
6831 + memset(&entry, 0, sizeof(entry));
6833 entry.delivery_mode = INT_DELIVERY_MODE;
6834 entry.dest_mode = INT_DEST_MODE;
6835 entry.mask = 0; /* enable IRQ */
6836 - entry.dest.logical.logical_dest =
6837 + entry.dest.logical.logical_dest =
6838 cpu_mask_to_apicid(TARGET_CPUS);
6840 - idx = find_irq_entry(apic,pin,mp_INT);
6841 + idx = find_irq_entry(apic, pin, mp_INT);
6844 apic_printk(APIC_VERBOSE, KERN_DEBUG
6845 " IO-APIC (apicid-pin) %d-%d",
6846 - mp_ioapics[apic].mpc_apicid,
6847 + mp_ioapics[apic].mp_apicid,
6851 apic_printk(APIC_VERBOSE, ", %d-%d",
6852 - mp_ioapics[apic].mpc_apicid, pin);
6853 + mp_ioapics[apic].mp_apicid, pin);
6857 @@ -1343,7 +1348,7 @@ static void __init setup_IO_APIC_irqs(vo
6858 vector = assign_irq_vector(irq);
6859 entry.vector = vector;
6860 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
6863 if (!apic && (irq < 16))
6864 disable_8259A_irq(irq);
6866 @@ -1355,27 +1360,23 @@ static void __init setup_IO_APIC_irqs(vo
6867 apic_printk(APIC_VERBOSE, " not connected.\n");
6872 - * Set up the 8259A-master output pin:
6873 + * Set up the timer pin, possibly with the 8259A-master behind.
6876 -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
6877 +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
6880 struct IO_APIC_route_entry entry;
6882 - memset(&entry,0,sizeof(entry));
6884 - disable_8259A_irq(0);
6887 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6888 + memset(&entry, 0, sizeof(entry));
6891 * We use logical delivery to get the timer IRQ
6894 entry.dest_mode = INT_DEST_MODE;
6895 - entry.mask = 0; /* unmask IRQ now */
6896 + entry.mask = 1; /* mask IRQ now */
6897 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6898 entry.delivery_mode = INT_DELIVERY_MODE;
6900 @@ -1384,17 +1385,14 @@ static void __init setup_ExtINT_IRQ0_pin
6903 * The timer IRQ doesn't have to know that behind the
6904 - * scene we have a 8259A-master in AEOI mode ...
6905 + * scene we may have a 8259A-master in AEOI mode ...
6907 - irq_desc[0].chip = &ioapic_chip;
6908 - set_irq_handler(0, handle_edge_irq);
6909 + ioapic_register_intr(0, vector, IOAPIC_EDGE);
6912 * Add it to the IO-APIC irq-routing table:
6914 ioapic_write_entry(apic, pin, entry);
6916 - enable_8259A_irq(0);
6919 void __init print_IO_APIC(void)
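The renamed setup_timer_IRQ0_pin() above now writes the timer's routing
entry masked (entry.mask = 1) and no longer touches the 8259A itself;
check_timer() decides which route to try and unmasks explicitly. The
entry is composed along these lines (a sketch of the resulting fields,
not a verbatim quote):

	struct IO_APIC_route_entry entry;

	memset(&entry, 0, sizeof(entry));
	entry.mask = 1;				/* start masked */
	entry.dest_mode = INT_DEST_MODE;	/* logical delivery */
	entry.delivery_mode = INT_DELIVERY_MODE;
	entry.vector = vector;
	ioapic_write_entry(apic, pin, entry);	/* unmasked later */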
6920 @@ -1409,10 +1407,10 @@ void __init print_IO_APIC(void)
6921 if (apic_verbosity == APIC_QUIET)
6924 - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6925 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6926 for (i = 0; i < nr_ioapics; i++)
6927 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
6928 - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
6929 + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
6932 * We are a bit conservative about what we expect. We have to
6933 @@ -1431,7 +1429,7 @@ void __init print_IO_APIC(void)
6934 reg_03.raw = io_apic_read(apic, 3);
6935 spin_unlock_irqrestore(&ioapic_lock, flags);
6937 - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
6938 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
6939 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
6940 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
6941 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
6942 @@ -1512,7 +1510,7 @@ void __init print_IO_APIC(void)
6946 -static void print_APIC_bitfield (int base)
6947 +static void print_APIC_bitfield(int base)
6951 @@ -1533,7 +1531,7 @@ static void print_APIC_bitfield (int bas
6955 -void /*__init*/ print_local_APIC(void * dummy)
6956 +void /*__init*/ print_local_APIC(void *dummy)
6958 unsigned int v, ver, maxlvt;
6960 @@ -1542,6 +1540,7 @@ void /*__init*/ print_local_APIC(void *
6962 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
6963 smp_processor_id(), hard_smp_processor_id());
6964 + v = apic_read(APIC_ID);
6965 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
6966 GET_APIC_ID(read_apic_id()));
6967 v = apic_read(APIC_LVR);
6968 @@ -1616,9 +1615,9 @@ void /*__init*/ print_local_APIC(void *
6972 -void print_all_local_APICs (void)
6973 +void print_all_local_APICs(void)
6975 - on_each_cpu(print_local_APIC, NULL, 1, 1);
6976 + on_each_cpu(print_local_APIC, NULL, 1);
6979 void /*__init*/ print_PIC(void)
6980 @@ -1639,11 +1638,11 @@ void /*__init*/ print_PIC(void)
6981 v = inb(0xa0) << 8 | inb(0x20);
6982 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
6988 v = inb(0xa0) << 8 | inb(0x20);
6994 spin_unlock_irqrestore(&i8259A_lock, flags);
6996 @@ -1652,6 +1651,8 @@ void /*__init*/ print_PIC(void)
6997 v = inb(0x4d1) << 8 | inb(0x4d0);
6998 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
7001 +void __init print_IO_APIC(void) {}
7002 #endif /* !CONFIG_XEN */
7004 static void __init enable_IO_APIC(void)
7005 @@ -1681,7 +1682,7 @@ static void __init enable_IO_APIC(void)
7006 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
7009 - for(apic = 0; apic < nr_ioapics; apic++) {
7010 + for (apic = 0; apic < nr_ioapics; apic++) {
7012 /* See if any of the pins is in ExtINT mode */
7013 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
7014 @@ -1774,7 +1775,7 @@ void disable_IO_APIC(void)
7015 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
7018 -#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
7020 static void __init setup_ioapic_ids_from_mpc(void)
7022 union IO_APIC_reg_00 reg_00;
7023 @@ -1784,6 +1785,11 @@ static void __init setup_ioapic_ids_from
7024 unsigned char old_id;
7025 unsigned long flags;
7027 +#ifdef CONFIG_X86_NUMAQ
7033 * Don't check I/O APIC IDs for xAPIC systems. They have
7034 * no meaning without the serial APIC bus.
7035 @@ -1806,15 +1812,15 @@ static void __init setup_ioapic_ids_from
7036 spin_lock_irqsave(&ioapic_lock, flags);
7037 reg_00.raw = io_apic_read(apic, 0);
7038 spin_unlock_irqrestore(&ioapic_lock, flags);
7040 - old_id = mp_ioapics[apic].mpc_apicid;
7042 - if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
7043 + old_id = mp_ioapics[apic].mp_apicid;
7045 + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
7046 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
7047 - apic, mp_ioapics[apic].mpc_apicid);
7048 + apic, mp_ioapics[apic].mp_apicid);
7049 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7051 - mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
7052 + mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
7056 @@ -1823,9 +1829,9 @@ static void __init setup_ioapic_ids_from
7057 * 'stuck on smp_invalidate_needed IPI wait' messages.
7059 if (check_apicid_used(phys_id_present_map,
7060 - mp_ioapics[apic].mpc_apicid)) {
7061 + mp_ioapics[apic].mp_apicid)) {
7062 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
7063 - apic, mp_ioapics[apic].mpc_apicid);
7064 + apic, mp_ioapics[apic].mp_apicid);
7065 for (i = 0; i < get_physical_broadcast(); i++)
7066 if (!physid_isset(i, phys_id_present_map))
7068 @@ -1834,13 +1840,13 @@ static void __init setup_ioapic_ids_from
7069 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7071 physid_set(i, phys_id_present_map);
7072 - mp_ioapics[apic].mpc_apicid = i;
7073 + mp_ioapics[apic].mp_apicid = i;
7076 - tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
7077 + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
7078 apic_printk(APIC_VERBOSE, "Setting %d in the "
7079 "phys_id_present_map\n",
7080 - mp_ioapics[apic].mpc_apicid);
7081 + mp_ioapics[apic].mp_apicid);
7082 physids_or(phys_id_present_map, phys_id_present_map, tmp);
7085 @@ -1849,21 +1855,21 @@ static void __init setup_ioapic_ids_from
7086 * We need to adjust the IRQ routing table
7087 * if the ID changed.
7089 - if (old_id != mp_ioapics[apic].mpc_apicid)
7090 + if (old_id != mp_ioapics[apic].mp_apicid)
7091 for (i = 0; i < mp_irq_entries; i++)
7092 - if (mp_irqs[i].mpc_dstapic == old_id)
7093 - mp_irqs[i].mpc_dstapic
7094 - = mp_ioapics[apic].mpc_apicid;
7095 + if (mp_irqs[i].mp_dstapic == old_id)
7096 + mp_irqs[i].mp_dstapic
7097 + = mp_ioapics[apic].mp_apicid;
7100 * Read the right value from the MPC table and
7101 * write it into the ID register.
7104 apic_printk(APIC_VERBOSE, KERN_INFO
7105 "...changing IO-APIC physical APIC ID to %d ...",
7106 - mp_ioapics[apic].mpc_apicid);
7107 + mp_ioapics[apic].mp_apicid);
7109 - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
7110 + reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
7111 spin_lock_irqsave(&ioapic_lock, flags);
7112 io_apic_write(apic, 0, reg_00.raw);
7113 spin_unlock_irqrestore(&ioapic_lock, flags);
7114 @@ -1874,17 +1880,13 @@ static void __init setup_ioapic_ids_from
7115 spin_lock_irqsave(&ioapic_lock, flags);
7116 reg_00.raw = io_apic_read(apic, 0);
7117 spin_unlock_irqrestore(&ioapic_lock, flags);
7118 - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
7119 + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
7120 printk("could not set ID!\n");
7122 apic_printk(APIC_VERBOSE, " ok.\n");
7126 -static void __init setup_ioapic_ids_from_mpc(void) { }
7130 int no_timer_check __initdata;
7132 static int __init notimercheck(char *s)
7133 @@ -2077,45 +2079,53 @@ static inline void init_IO_APIC_traps(vo
7134 * The local APIC irq-chip implementation:
7137 -static void ack_apic(unsigned int irq)
7138 +static void ack_lapic_irq(unsigned int irq)
7143 -static void mask_lapic_irq (unsigned int irq)
7144 +static void mask_lapic_irq(unsigned int irq)
7148 v = apic_read(APIC_LVT0);
7149 - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
7150 + apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
7153 -static void unmask_lapic_irq (unsigned int irq)
7154 +static void unmask_lapic_irq(unsigned int irq)
7158 v = apic_read(APIC_LVT0);
7159 - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
7160 + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
7163 static struct irq_chip lapic_chip __read_mostly = {
7164 - .name = "local-APIC-edge",
7165 + .name = "local-APIC",
7166 .mask = mask_lapic_irq,
7167 .unmask = unmask_lapic_irq,
7169 + .ack = ack_lapic_irq,
7172 +static void lapic_register_intr(int irq, int vector)
7174 + irq_desc[irq].status &= ~IRQ_LEVEL;
7175 + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
7177 + set_intr_gate(vector, interrupt[irq]);
7180 static void __init setup_nmi(void)
7183 - * Dirty trick to enable the NMI watchdog ...
7184 + * Dirty trick to enable the NMI watchdog ...
7185 * We put the 8259A master into AEOI mode and
7186 * unmask on all local APICs LVT0 as NMI.
7188 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
7189 * is from Maciej W. Rozycki - so we do not have to EOI from
7190 * the NMI handler or the timer interrupt.
7193 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
7195 enable_NMI_through_LVT0();
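lapic_register_intr() above replaces the old open-coded chip
assignment: it marks IRQ0 as edge-type, binds the lapic irq_chip with
the edge flow handler, and points the IDT vector at the IRQ stub.
Annotated, and assuming the name argument elided in the hunk is "edge"
(the label shown in /proc/interrupts):

	static void lapic_register_intr(int irq, int vector)
	{
		irq_desc[irq].status &= ~IRQ_LEVEL;	/* edge, not level */
		set_irq_chip_and_handler_name(irq, &lapic_chip,
					      handle_edge_irq, "edge");
		set_intr_gate(vector, interrupt[irq]);	/* IDT -> stub */
	}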
7196 @@ -2191,11 +2201,16 @@ static inline void __init unlock_ExtINT_
7197 static inline void __init check_timer(void)
7199 int apic1, pin1, apic2, pin2;
7203 unsigned long flags;
7205 local_irq_save(flags);
7207 + ver = apic_read(APIC_LVR);
7208 + ver = GET_APIC_VERSION(ver);
7211 * get/set the timer IRQ vector:
7213 @@ -2204,34 +2219,54 @@ static inline void __init check_timer(vo
7214 set_intr_gate(vector, interrupt[0]);
7217 - * Subtle, code in do_timer_interrupt() expects an AEOI
7218 - * mode for the 8259A whenever interrupts are routed
7219 - * through I/O APICs. Also IRQ0 has to be enabled in
7220 - * the 8259A which implies the virtual wire has to be
7221 - * disabled in the local APIC.
7222 + * As IRQ0 is to be enabled in the 8259A, the virtual
7223 + * wire has to be disabled in the local APIC. Also
7224 + * timer interrupts need to be acknowledged manually in
7225 + * the 8259A for the i82489DX when using the NMI
7226 + * watchdog as that APIC treats NMIs as level-triggered.
7227 + * The AEOI mode will finish them in the 8259A
7230 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7231 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7234 - if (timer_over_8254 > 0)
7235 - enable_8259A_irq(0);
7236 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
7238 pin1 = find_isa_irq_pin(0, mp_INT);
7239 apic1 = find_isa_irq_apic(0, mp_INT);
7240 pin2 = ioapic_i8259.pin;
7241 apic2 = ioapic_i8259.apic;
7243 - printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
7244 - vector, apic1, pin1, apic2, pin2);
7245 + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
7246 + "apic1=%d pin1=%d apic2=%d pin2=%d\n",
7247 + vector, apic1, pin1, apic2, pin2);
7250 + * Some BIOS writers are clueless and report the ExtINTA
7251 + * I/O APIC input from the cascaded 8259A as the timer
7252 + * interrupt input. So just in case, if only one pin
7253 + * was found above, try it both directly and through the
7260 + } else if (pin2 == -1) {
7267 * Ok, does IRQ0 through the IOAPIC work?
7270 + add_pin_to_irq(0, apic1, pin1);
7271 + setup_timer_IRQ0_pin(apic1, pin1, vector);
7273 unmask_IO_APIC_irq(0);
7274 if (timer_irq_works()) {
7275 if (nmi_watchdog == NMI_IO_APIC) {
7276 - disable_8259A_irq(0);
7278 enable_8259A_irq(0);
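Taken together, the check_timer() rework above amounts to a fixed probe
order, visible in the hunks that follow:

	/*
	 * Probe order used by check_timer():
	 *  1. IRQ0 via the I/O APIC pin reported for the timer (pin1)
	 *  2. the same IRQ behind the 8259A cascade (apic2/pin2),
	 *     setting timer_through_8259 on success
	 *  3. the local APIC LVT0 as a Virtual Wire IRQ
	 *  4. LVT0 in ExtINT mode as a last resort
	 */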
7280 @@ -2240,71 +2275,77 @@ static inline void __init check_timer(vo
7283 clear_IO_APIC_pin(apic1, pin1);
7284 - printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
7288 - printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
7290 - printk("\n..... (found pin %d) ...", pin2);
7292 + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
7293 + "8254 timer not connected to IO-APIC\n");
7295 + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
7296 + "(IRQ0) through the 8259A ...\n");
7297 + apic_printk(APIC_QUIET, KERN_INFO
7298 + "..... (found apic %d pin %d) ...\n", apic2, pin2);
7300 * legacy devices should be connected to IO APIC #0
7302 - setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
7303 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7304 + setup_timer_IRQ0_pin(apic2, pin2, vector);
7305 + unmask_IO_APIC_irq(0);
7306 + enable_8259A_irq(0);
7307 if (timer_irq_works()) {
7308 - printk("works.\n");
7310 - replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7312 - add_pin_to_irq(0, apic2, pin2);
7313 + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
7314 + timer_through_8259 = 1;
7315 if (nmi_watchdog == NMI_IO_APIC) {
7316 + disable_8259A_irq(0);
7318 + enable_8259A_irq(0);
7323 * Cleanup, just in case ...
7325 + disable_8259A_irq(0);
7326 clear_IO_APIC_pin(apic2, pin2);
7327 + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
7329 - printk(" failed.\n");
7331 if (nmi_watchdog == NMI_IO_APIC) {
7332 - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
7334 + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
7335 + "through the IO-APIC - disabling NMI Watchdog!\n");
7336 + nmi_watchdog = NMI_NONE;
7340 - printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
7341 + apic_printk(APIC_QUIET, KERN_INFO
7342 + "...trying to set up timer as Virtual Wire IRQ...\n");
7344 - disable_8259A_irq(0);
7345 - set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
7347 - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7348 + lapic_register_intr(0, vector);
7349 + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7350 enable_8259A_irq(0);
7352 if (timer_irq_works()) {
7353 - printk(" works.\n");
7354 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7357 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7358 - printk(" failed.\n");
7359 + disable_8259A_irq(0);
7360 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7361 + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
7363 - printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
7364 + apic_printk(APIC_QUIET, KERN_INFO
7365 + "...trying to set up timer as ExtINT IRQ...\n");
7370 - apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
7371 + apic_write(APIC_LVT0, APIC_DM_EXTINT);
7373 unlock_ExtINT_logic();
7375 if (timer_irq_works()) {
7376 - printk(" works.\n");
7377 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7380 - printk(" failed :(.\n");
7381 + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
7382 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
7383 - "report. Then try booting with the 'noapic' option");
7384 + "report. Then try booting with the 'noapic' option.\n");
7386 local_irq_restore(flags);
7388 @@ -2314,11 +2355,21 @@ int timer_uses_ioapic_pin_0 = 0;
7393 - * IRQ's that are handled by the PIC in the MPS IOAPIC case.
7394 - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
7395 - * Linux doesn't really care, as it's not actually used
7396 - * for any interrupt handling anyway.
7397 + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
7398 + * to devices. However there may be an I/O APIC pin available for
7399 + * this interrupt regardless. The pin may be left unconnected, but
7400 + * typically it will be reused as an ExtINT cascade interrupt for
7401 + * the master 8259A. In the MPS case such a pin will normally be
7402 + * reported as an ExtINT interrupt in the MP table. With ACPI
7403 + * there is no provision for ExtINT interrupts, and in the absence
7404 + * of an override it would be treated as an ordinary ISA I/O APIC
7405 + * interrupt, that is edge-triggered and unmasked by default. We
7406 + * used to do this, but it caused problems on some systems because
7407 + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
7408 + * the same ExtINT cascade interrupt to drive the local APIC of the
7409 + * bootstrap processor. Therefore we refrain from routing IRQ2 to
7410 + * the I/O APIC in all cases now. No actual device should request
7411 + * it anyway. --macro
7413 #define PIC_IRQS (1 << PIC_CASCADE_IR)
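The comment block above (repeated in the 64-bit file further down)
explains why IRQ2 is never routed through the I/O APIC any more.
Concretely, io_apic_irqs is a bitmask of the ISA IRQs the I/O APIC
owns, and with the PIC-mode special case gone it is now unconditionally
~PIC_IRQS:

	#define PIC_CASCADE_IR	2
	#define PIC_IRQS	(1 << PIC_CASCADE_IR)

	/* every ISA IRQ except the 8259A cascade input (IRQ2) */
	unsigned long io_apic_irqs = ~PIC_IRQS;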
7415 @@ -2328,25 +2379,22 @@ void __init setup_IO_APIC(void)
7418 /* Reserve all the system vectors. */
7419 - for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
7420 + for (i = first_system_vector; i < NR_VECTORS; i++)
7421 set_bit(i, used_vectors);
7427 - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
7429 - io_apic_irqs = ~PIC_IRQS;
7430 + io_apic_irqs = ~PIC_IRQS;
7432 printk("ENABLING IO-APIC IRQs\n");
7436 * Set up IO-APIC IRQ routing.
7439 setup_ioapic_ids_from_mpc();
7443 setup_IO_APIC_irqs();
7444 @@ -2356,28 +2404,14 @@ void __init setup_IO_APIC(void)
7448 -static int __init setup_disable_8254_timer(char *s)
7450 - timer_over_8254 = -1;
7453 -static int __init setup_enable_8254_timer(char *s)
7455 - timer_over_8254 = 2;
7459 -__setup("disable_8254_timer", setup_disable_8254_timer);
7460 -__setup("enable_8254_timer", setup_enable_8254_timer);
7463 * Called after all the initialization is done. If we didn't find any
7464 * APIC bugs then we can allow the modify fast path
7468 static int __init io_apic_bug_finalize(void)
7470 - if(sis_apic_bug == -1)
7471 + if (sis_apic_bug == -1)
7473 if (is_initial_xendomain()) {
7474 struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
7475 @@ -2396,17 +2430,17 @@ struct sysfs_ioapic_data {
7476 struct sys_device dev;
7477 struct IO_APIC_route_entry entry[0];
7479 -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
7480 +static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
7482 static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
7484 struct IO_APIC_route_entry *entry;
7485 struct sysfs_ioapic_data *data;
7489 data = container_of(dev, struct sysfs_ioapic_data, dev);
7490 entry = data->entry;
7491 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7492 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7493 entry[i] = ioapic_read_entry(dev->id, i);
7496 @@ -2419,18 +2453,18 @@ static int ioapic_resume(struct sys_devi
7497 unsigned long flags;
7498 union IO_APIC_reg_00 reg_00;
7502 data = container_of(dev, struct sysfs_ioapic_data, dev);
7503 entry = data->entry;
7505 spin_lock_irqsave(&ioapic_lock, flags);
7506 reg_00.raw = io_apic_read(dev->id, 0);
7507 - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
7508 - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
7509 + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
7510 + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
7511 io_apic_write(dev->id, 0, reg_00.raw);
7513 spin_unlock_irqrestore(&ioapic_lock, flags);
7514 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7515 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7516 ioapic_write_entry(dev->id, i, entry[i]);
7519 @@ -2444,24 +2478,23 @@ static struct sysdev_class ioapic_sysdev
7521 static int __init ioapic_init_sysfs(void)
7523 - struct sys_device * dev;
7524 + struct sys_device *dev;
7525 int i, size, error = 0;
7527 error = sysdev_class_register(&ioapic_sysdev_class);
7531 - for (i = 0; i < nr_ioapics; i++ ) {
7532 - size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7533 + for (i = 0; i < nr_ioapics; i++) {
7534 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7535 * sizeof(struct IO_APIC_route_entry);
7536 - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
7537 + mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
7538 if (!mp_ioapic_data[i]) {
7539 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
7542 - memset(mp_ioapic_data[i], 0, size);
7543 dev = &mp_ioapic_data[i]->dev;
7546 dev->cls = &ioapic_sysdev_class;
7547 error = sysdev_register(dev);
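Same kmalloc()+memset() -> kzalloc() fold as in the irq-balancing code
earlier, this time for sysfs_ioapic_data, whose entry[0] trailing array
is sized per I/O APIC:

	/* one allocation: header plus one route entry per register,
	 * pre-zeroed by kzalloc() */
	size = sizeof(struct sys_device) + nr_ioapic_registers[i]
		* sizeof(struct IO_APIC_route_entry);
	mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);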
7549 @@ -2538,7 +2571,7 @@ static int msi_compose_msg(struct pci_de
7552 ((INT_DEST_MODE == 0) ?
7553 - MSI_ADDR_DEST_MODE_PHYSICAL:
7554 +MSI_ADDR_DEST_MODE_PHYSICAL:
7555 MSI_ADDR_DEST_MODE_LOGICAL) |
7556 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7557 MSI_ADDR_REDIRECTION_CPU:
7558 @@ -2549,7 +2582,7 @@ static int msi_compose_msg(struct pci_de
7559 MSI_DATA_TRIGGER_EDGE |
7560 MSI_DATA_LEVEL_ASSERT |
7561 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7562 - MSI_DATA_DELIVERY_FIXED:
7563 +MSI_DATA_DELIVERY_FIXED:
7564 MSI_DATA_DELIVERY_LOWPRI) |
7565 MSI_DATA_VECTOR(vector);
7567 @@ -2720,12 +2753,12 @@ int arch_setup_ht_irq(unsigned int irq,
7568 #endif /* CONFIG_HT_IRQ */
7570 /* --------------------------------------------------------------------------
7571 - ACPI-based IOAPIC Configuration
7572 + ACPI-based IOAPIC Configuration
7573 -------------------------------------------------------------------------- */
7577 -int __init io_apic_get_unique_id (int ioapic, int apic_id)
7578 +int __init io_apic_get_unique_id(int ioapic, int apic_id)
7581 union IO_APIC_reg_00 reg_00;
7582 @@ -2735,10 +2768,10 @@ int __init io_apic_get_unique_id (int io
7586 - * The P4 platform supports up to 256 APIC IDs on two separate APIC
7587 - * buses (one for LAPICs, one for IOAPICs), where predecessors only
7588 + * The P4 platform supports up to 256 APIC IDs on two separate APIC
7589 + * buses (one for LAPICs, one for IOAPICs), where predecessors only
7590 * supports up to 16 on one shared APIC bus.
7593 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
7594 * advantage of new APIC bus architecture.
7596 @@ -2757,7 +2790,7 @@ int __init io_apic_get_unique_id (int io
7600 - * Every APIC in a system must have a unique ID or we get lots of nice
7601 + * Every APIC in a system must have a unique ID or we get lots of nice
7602 * 'stuck on smp_invalidate_needed IPI wait' messages.
7604 if (check_apicid_used(apic_id_map, apic_id)) {
7605 @@ -2774,7 +2807,7 @@ int __init io_apic_get_unique_id (int io
7606 "trying %d\n", ioapic, apic_id, i);
7612 tmp = apicid_to_cpu_present(apic_id);
7613 physids_or(apic_id_map, apic_id_map, tmp);
7614 @@ -2802,7 +2835,7 @@ int __init io_apic_get_unique_id (int io
7618 -int __init io_apic_get_version (int ioapic)
7619 +int __init io_apic_get_version(int ioapic)
7621 union IO_APIC_reg_01 reg_01;
7622 unsigned long flags;
7623 @@ -2815,7 +2848,7 @@ int __init io_apic_get_version (int ioap
7627 -int __init io_apic_get_redir_entries (int ioapic)
7628 +int __init io_apic_get_redir_entries(int ioapic)
7630 union IO_APIC_reg_01 reg_01;
7631 unsigned long flags;
7632 @@ -2828,7 +2861,7 @@ int __init io_apic_get_redir_entries (in
7636 -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
7637 +int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
7639 struct IO_APIC_route_entry entry;
7641 @@ -2844,7 +2877,7 @@ int io_apic_set_pci_routing (int ioapic,
7642 * corresponding device driver registers for this IRQ.
7645 - memset(&entry,0,sizeof(entry));
7646 + memset(&entry, 0, sizeof(entry));
7648 entry.delivery_mode = INT_DELIVERY_MODE;
7649 entry.dest_mode = INT_DEST_MODE;
7650 @@ -2863,7 +2896,7 @@ int io_apic_set_pci_routing (int ioapic,
7652 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
7653 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
7654 - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
7655 + mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
7656 edge_level, active_high_low);
7658 ioapic_register_intr(irq, entry.vector, edge_level);
7659 @@ -2884,8 +2917,8 @@ int acpi_get_override_irq(int bus_irq, i
7662 for (i = 0; i < mp_irq_entries; i++)
7663 - if (mp_irqs[i].mpc_irqtype == mp_INT &&
7664 - mp_irqs[i].mpc_srcbusirq == bus_irq)
7665 + if (mp_irqs[i].mp_irqtype == mp_INT &&
7666 + mp_irqs[i].mp_srcbusirq == bus_irq)
7668 if (i >= mp_irq_entries)
7670 @@ -2918,3 +2951,35 @@ static int __init parse_noapic(char *arg
7673 early_param("noapic", parse_noapic);
7676 +void __init ioapic_init_mappings(void)
7678 + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
7681 + for (i = 0; i < nr_ioapics; i++) {
7682 + if (smp_found_config) {
7683 + ioapic_phys = mp_ioapics[i].mp_apicaddr;
7684 + if (!ioapic_phys) {
7686 + "WARNING: bogus zero IO-APIC "
7687 + "address found in MPTABLE, "
7688 + "disabling IO/APIC support!\n");
7689 + smp_found_config = 0;
7690 + skip_ioapic_setup = 1;
7691 + goto fake_ioapic_page;
7695 + ioapic_phys = (unsigned long)
7696 + alloc_bootmem_pages(PAGE_SIZE);
7697 + ioapic_phys = __pa(ioapic_phys);
7699 + set_fixmap_nocache(idx, ioapic_phys);
7700 + printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
7701 + __fix_to_virt(idx), ioapic_phys);
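The new ioapic_init_mappings() maps each I/O APIC's register window
through a fixmap slot; set_fixmap_nocache() installs an uncached
mapping at a compile-time virtual address. Accessing a register through
such a mapping then looks roughly like this (index/data pair per the
IOAPIC datasheet; sketch only):

	void __iomem *base =
		(void __iomem *)__fix_to_virt(FIX_IO_APIC_BASE_0 + i);
	unsigned int id;

	writel(0, base + 0x00);		/* select register 0 (ID) */
	id = readl(base + 0x10);	/* read it through the window */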
7706 --- sle11-2009-10-16.orig/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
7707 +++ sle11-2009-10-16/arch/x86/kernel/io_apic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
7709 #include <asm/proto.h>
7710 #include <asm/acpi.h>
7711 #include <asm/dma.h>
7712 +#include <asm/i8259.h>
7713 #include <asm/nmi.h>
7714 #include <asm/msidef.h>
7715 #include <asm/hypertransport.h>
7716 @@ -63,10 +64,16 @@ struct irq_cfg {
7719 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
7720 -struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7721 +static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7723 static int assign_irq_vector(int irq, cpumask_t mask);
7726 +int first_system_vector = 0xfe;
7728 +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
7731 #define __apicdebuginit __init
7733 int sis_apic_bug; /* not actually supported, dummy for compile */
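system_vectors above uses GCC's designated range initializer, which
fills a whole index range in one go; first_system_vector starts at 0xfe
and is lowered as system vectors are handed out from the top. The
initializer idiom in isolation (enum values illustrative):

	enum { SYS_VECTOR_FREE = 0, SYS_VECTOR_ALLOCED };

	/* every element starts as SYS_VECTOR_FREE */
	static char vectors[256] = { [0 ... 255] = SYS_VECTOR_FREE };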
7734 @@ -89,14 +96,14 @@ unsigned long io_apic_irqs;
7736 #define clear_IO_APIC() ((void)0)
7738 -int timer_over_8254 __initdata = 1;
7739 +int timer_through_8259 __initdata;
7741 /* Where if anywhere is the i8259 connect in external int mode */
7742 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
7745 static DEFINE_SPINLOCK(ioapic_lock);
7746 -DEFINE_SPINLOCK(vector_lock);
7747 +static DEFINE_SPINLOCK(vector_lock);
7750 * # of IRQ routing registers
7751 @@ -104,15 +111,17 @@ DEFINE_SPINLOCK(vector_lock);
7752 int nr_ioapic_registers[MAX_IO_APICS];
7754 /* I/O APIC entries */
7755 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
7756 +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
7759 /* MP IRQ source entries */
7760 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7761 +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7763 /* # of MP IRQ source entries */
7766 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
7769 * Rough estimation of how many shared IRQs there are, can
7770 * be changed anytime.
7771 @@ -141,7 +150,7 @@ struct io_apic {
7772 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
7774 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
7775 - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
7776 + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
7780 @@ -155,7 +164,7 @@ static inline unsigned int io_apic_read(
7781 struct physdev_apic apic_op;
7784 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7785 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7787 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
7789 @@ -173,7 +182,7 @@ static inline void io_apic_write(unsigne
7791 struct physdev_apic apic_op;
7793 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7794 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7796 apic_op.value = value;
7797 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
7798 @@ -209,7 +218,7 @@ static bool io_apic_level_ack_pending(un
7800 reg = io_apic_read(entry->apic, 0x10 + pin*2);
7801 /* Is the remote IRR bit set? */
7802 - if ((reg >> 14) & 1) {
7803 + if (reg & IO_APIC_REDIR_REMOTE_IRR) {
7804 spin_unlock_irqrestore(&ioapic_lock, flags);
7807 @@ -308,7 +317,7 @@ static void __target_IO_APIC_irq(unsigne
7809 io_apic_write(apic, 0x11 + pin*2, dest);
7810 reg = io_apic_read(apic, 0x10 + pin*2);
7811 - reg &= ~0x000000ff;
7812 + reg &= ~IO_APIC_REDIR_VECTOR_MASK;
7814 io_apic_modify(apic, reg);
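Both hunks trade magic numbers for named redirection-entry bits.
Judging from the values they replace, the constants amount to the
following (the real definitions live in <asm/io_apic.h>):

	#define IO_APIC_REDIR_VECTOR_MASK	0x000000ff
	#define IO_APIC_REDIR_REMOTE_IRR	(1 << 14)
	#define IO_APIC_REDIR_MASKED		(1 << 16)

IO_APIC_REDIR_MASKED shows up again a few hunks further down, replacing
the bare 0x00010000 in the DO_ACTION() mask/unmask helpers.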
7816 @@ -372,6 +381,26 @@ static void add_pin_to_irq(unsigned int
7821 + * Reroute an IRQ to a different pin.
7823 +static void __init replace_pin_at_irq(unsigned int irq,
7824 + int oldapic, int oldpin,
7825 + int newapic, int newpin)
7827 + struct irq_pin_list *entry = irq_2_pin + irq;
7830 + if (entry->apic == oldapic && entry->pin == oldpin) {
7831 + entry->apic = newapic;
7832 + entry->pin = newpin;
7836 + entry = irq_2_pin + entry->next;
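replace_pin_at_irq() walks the irq_2_pin chain for the IRQ and rewrites
any matching (apic, pin) pair. With the loop scaffolding restored, the
whole function reads roughly:

	static void __init replace_pin_at_irq(unsigned int irq,
					      int oldapic, int oldpin,
					      int newapic, int newpin)
	{
		struct irq_pin_list *entry = irq_2_pin + irq;

		while (1) {
			if (entry->apic == oldapic && entry->pin == oldpin) {
				entry->apic = newapic;
				entry->pin = newpin;
			}
			if (!entry->next)
				break;
			entry = irq_2_pin + entry->next;
		}
	}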
7840 #define __DO_ACTION(R, ACTION, FINAL) \
7843 @@ -399,10 +428,11 @@ static void add_pin_to_irq(unsigned int
7844 static void name##_IO_APIC_irq (unsigned int irq) \
7845 __DO_ACTION(R, ACTION, FINAL)
7847 -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
7849 -DO_ACTION( __unmask, 0, &= 0xfffeffff, )
7852 +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
7855 +DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
7857 static void mask_IO_APIC_irq (unsigned int irq)
7859 @@ -465,22 +495,6 @@ static int __init disable_timer_pin_setu
7861 __setup("disable_timer_pin_1", disable_timer_pin_setup);
7864 -static int __init setup_disable_8254_timer(char *s)
7866 - timer_over_8254 = -1;
7869 -static int __init setup_enable_8254_timer(char *s)
7871 - timer_over_8254 = 2;
7875 -__setup("disable_8254_timer", setup_disable_8254_timer);
7876 -__setup("enable_8254_timer", setup_enable_8254_timer);
7877 -#endif /* !CONFIG_XEN */
7881 * Find the IRQ entry number of a certain pin.
7882 @@ -490,10 +504,10 @@ static int find_irq_entry(int apic, int
7885 for (i = 0; i < mp_irq_entries; i++)
7886 - if (mp_irqs[i].mpc_irqtype == type &&
7887 - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
7888 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
7889 - mp_irqs[i].mpc_dstirq == pin)
7890 + if (mp_irqs[i].mp_irqtype == type &&
7891 + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
7892 + mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
7893 + mp_irqs[i].mp_dstirq == pin)
7897 @@ -508,13 +522,13 @@ static int __init find_isa_irq_pin(int i
7900 for (i = 0; i < mp_irq_entries; i++) {
7901 - int lbus = mp_irqs[i].mpc_srcbus;
7902 + int lbus = mp_irqs[i].mp_srcbus;
7904 if (test_bit(lbus, mp_bus_not_pci) &&
7905 - (mp_irqs[i].mpc_irqtype == type) &&
7906 - (mp_irqs[i].mpc_srcbusirq == irq))
7907 + (mp_irqs[i].mp_irqtype == type) &&
7908 + (mp_irqs[i].mp_srcbusirq == irq))
7910 - return mp_irqs[i].mpc_dstirq;
7911 + return mp_irqs[i].mp_dstirq;
7915 @@ -524,17 +538,17 @@ static int __init find_isa_irq_apic(int
7918 for (i = 0; i < mp_irq_entries; i++) {
7919 - int lbus = mp_irqs[i].mpc_srcbus;
7920 + int lbus = mp_irqs[i].mp_srcbus;
7922 if (test_bit(lbus, mp_bus_not_pci) &&
7923 - (mp_irqs[i].mpc_irqtype == type) &&
7924 - (mp_irqs[i].mpc_srcbusirq == irq))
7925 + (mp_irqs[i].mp_irqtype == type) &&
7926 + (mp_irqs[i].mp_srcbusirq == irq))
7929 if (i < mp_irq_entries) {
7931 for(apic = 0; apic < nr_ioapics; apic++) {
7932 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
7933 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
7937 @@ -555,28 +569,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7939 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
7941 - if (mp_bus_id_to_pci_bus[bus] == -1) {
7942 + if (test_bit(bus, mp_bus_not_pci)) {
7943 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
7946 for (i = 0; i < mp_irq_entries; i++) {
7947 - int lbus = mp_irqs[i].mpc_srcbus;
7948 + int lbus = mp_irqs[i].mp_srcbus;
7950 for (apic = 0; apic < nr_ioapics; apic++)
7951 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
7952 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
7953 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
7954 + mp_irqs[i].mp_dstapic == MP_APIC_ALL)
7957 if (!test_bit(lbus, mp_bus_not_pci) &&
7958 - !mp_irqs[i].mpc_irqtype &&
7959 + !mp_irqs[i].mp_irqtype &&
7961 - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
7962 - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
7963 + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
7964 + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
7966 if (!(apic || IO_APIC_IRQ(irq)))
7969 - if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
7970 + if (pin == (mp_irqs[i].mp_srcbusirq & 3))
7973 * Use the first all-but-pin matching entry as a
7974 @@ -604,13 +618,13 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7976 static int MPBIOS_polarity(int idx)
7978 - int bus = mp_irqs[idx].mpc_srcbus;
7979 + int bus = mp_irqs[idx].mp_srcbus;
7983 * Determine IRQ line polarity (high active or low active):
7985 - switch (mp_irqs[idx].mpc_irqflag & 3)
7986 + switch (mp_irqs[idx].mp_irqflag & 3)
7988 case 0: /* conforms, ie. bus-type dependent polarity */
7989 if (test_bit(bus, mp_bus_not_pci))
7990 @@ -646,13 +660,13 @@ static int MPBIOS_polarity(int idx)
7992 static int MPBIOS_trigger(int idx)
7994 - int bus = mp_irqs[idx].mpc_srcbus;
7995 + int bus = mp_irqs[idx].mp_srcbus;
7999 * Determine IRQ trigger mode (edge or level sensitive):
8001 - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
8002 + switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
8004 case 0: /* conforms, ie. bus-type dependent */
8005 if (test_bit(bus, mp_bus_not_pci))
8006 @@ -699,16 +713,16 @@ static inline int irq_trigger(int idx)
8007 static int pin_2_irq(int idx, int apic, int pin)
8010 - int bus = mp_irqs[idx].mpc_srcbus;
8011 + int bus = mp_irqs[idx].mp_srcbus;
8014 * Debugging check, we are in big trouble if this message pops up!
8016 - if (mp_irqs[idx].mpc_dstirq != pin)
8017 + if (mp_irqs[idx].mp_dstirq != pin)
8018 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
8020 if (test_bit(bus, mp_bus_not_pci)) {
8021 - irq = mp_irqs[idx].mpc_srcbusirq;
8022 + irq = mp_irqs[idx].mp_srcbusirq;
8025 * PCI IRQs are mapped in order
8026 @@ -722,6 +736,19 @@ static int pin_2_irq(int idx, int apic,
8030 +void lock_vector_lock(void)
8032 + /* Used so that the online set of cpus does not change
8033 + * during assign_irq_vector.
8035 + spin_lock(&vector_lock);
8038 +void unlock_vector_lock(void)
8040 + spin_unlock(&vector_lock);
8043 static int __assign_irq_vector(int irq, cpumask_t mask)
8045 struct physdev_irq irq_op;
8046 @@ -773,7 +800,7 @@ static void __clear_irq_vector(int irq)
8048 vector = cfg->vector;
8049 cpus_and(mask, cfg->domain, cpu_online_map);
8050 - for_each_cpu_mask(cpu, mask)
8051 + for_each_cpu_mask_nr(cpu, mask)
8052 per_cpu(vector_irq, cpu)[vector] = -1;
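for_each_cpu_mask_nr() is the 2.6.27 variant that iterates set bits
only up to nr_cpu_ids (the number of possible CPUs) rather than the
compile-time NR_CPUS, which matters once NR_CPUS=4096 builds exist.
Conceptually (the real macro skips ahead with find_next_bit instead of
testing every index):

	int cpu;

	for (cpu = 0; cpu < nr_cpu_ids; cpu++)	/* _nr bound, not NR_CPUS */
		if (cpu_isset(cpu, mask))
			per_cpu(vector_irq, cpu)[vector] = -1;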
8055 @@ -842,7 +869,7 @@ static void setup_IO_APIC_irq(int apic,
8056 apic_printk(APIC_VERBOSE,KERN_DEBUG
8057 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
8058 "IRQ %d Mode:%i Active:%i)\n",
8059 - apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
8060 + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
8061 irq, trigger, polarity);
8064 @@ -883,10 +910,10 @@ static void __init setup_IO_APIC_irqs(vo
8065 idx = find_irq_entry(apic,pin,mp_INT);
8068 - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8069 + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
8072 - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8073 + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
8076 if (!first_notcon) {
8077 @@ -908,26 +935,21 @@ static void __init setup_IO_APIC_irqs(vo
8081 - * Set up the 8259A-master output pin as broadcast to all
8083 + * Set up the timer pin, possibly with the 8259A-master behind.
8085 -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
8086 +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
8089 struct IO_APIC_route_entry entry;
8091 memset(&entry, 0, sizeof(entry));
8093 - disable_8259A_irq(0);
8096 - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8099 * We use logical delivery to get the timer IRQ
8102 entry.dest_mode = INT_DEST_MODE;
8103 - entry.mask = 0; /* unmask IRQ now */
8104 + entry.mask = 1; /* mask IRQ now */
8105 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
8106 entry.delivery_mode = INT_DELIVERY_MODE;
8108 @@ -936,7 +958,7 @@ static void __init setup_ExtINT_IRQ0_pin
8111 * The timer IRQ doesn't have to know that behind the
8112 - * scene we have a 8259A-master in AEOI mode ...
8113 + * scene we may have a 8259A-master in AEOI mode ...
8115 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
8117 @@ -944,8 +966,6 @@ static void __init setup_ExtINT_IRQ0_pin
8118 * Add it to the IO-APIC irq-routing table:
8120 ioapic_write_entry(apic, pin, entry);
8122 - enable_8259A_irq(0);
8125 void __apicdebuginit print_IO_APIC(void)
8126 @@ -962,7 +982,7 @@ void __apicdebuginit print_IO_APIC(void)
8127 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
8128 for (i = 0; i < nr_ioapics; i++)
8129 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
8130 - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
8131 + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
8134 * We are a bit conservative about what we expect. We have to
8135 @@ -980,7 +1000,7 @@ void __apicdebuginit print_IO_APIC(void)
8136 spin_unlock_irqrestore(&ioapic_lock, flags);
8139 - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
8140 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
8141 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
8142 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
8144 @@ -1072,6 +1092,7 @@ void __apicdebuginit print_local_APIC(vo
8146 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
8147 smp_processor_id(), hard_smp_processor_id());
8148 + v = apic_read(APIC_ID);
8149 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
8150 v = apic_read(APIC_LVR);
8151 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
8152 @@ -1141,7 +1162,7 @@ void __apicdebuginit print_local_APIC(vo
8154 void print_all_local_APICs (void)
8156 - on_each_cpu(print_local_APIC, NULL, 1, 1);
8157 + on_each_cpu(print_local_APIC, NULL, 1);
8160 void __apicdebuginit print_PIC(void)
8161 @@ -1175,6 +1196,8 @@ void __apicdebuginit print_PIC(void)
8162 v = inb(0x4d1) << 8 | inb(0x4d0);
8163 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
8166 +void __apicdebuginit print_IO_APIC(void) {}
8167 #endif /* !CONFIG_XEN */
8169 void __init enable_IO_APIC(void)
8170 @@ -1359,12 +1382,10 @@ static unsigned int startup_ioapic_irq(u
8171 static int ioapic_retrigger_irq(unsigned int irq)
8173 struct irq_cfg *cfg = &irq_cfg[irq];
8175 unsigned long flags;
8177 spin_lock_irqsave(&vector_lock, flags);
8178 - mask = cpumask_of_cpu(first_cpu(cfg->domain));
8179 - send_IPI_mask(mask, cfg->vector);
8180 + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
8181 spin_unlock_irqrestore(&vector_lock, flags);
8184 @@ -1545,7 +1566,7 @@ static inline void init_IO_APIC_traps(vo
8188 -static void enable_lapic_irq (unsigned int irq)
8189 +static void unmask_lapic_irq(unsigned int irq)
8193 @@ -1553,7 +1574,7 @@ static void enable_lapic_irq (unsigned i
8194 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
8197 -static void disable_lapic_irq (unsigned int irq)
8198 +static void mask_lapic_irq(unsigned int irq)
8202 @@ -1566,19 +1587,20 @@ static void ack_lapic_irq (unsigned int
8206 -static void end_lapic_irq (unsigned int i) { /* nothing */ }
8208 -static struct hw_interrupt_type lapic_irq_type __read_mostly = {
8209 - .name = "local-APIC",
8210 - .typename = "local-APIC-edge",
8211 - .startup = NULL, /* startup_irq() not used for IRQ0 */
8212 - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
8213 - .enable = enable_lapic_irq,
8214 - .disable = disable_lapic_irq,
8215 - .ack = ack_lapic_irq,
8216 - .end = end_lapic_irq,
8217 +static struct irq_chip lapic_chip __read_mostly = {
8218 + .name = "local-APIC",
8219 + .mask = mask_lapic_irq,
8220 + .unmask = unmask_lapic_irq,
8221 + .ack = ack_lapic_irq,
8224 +static void lapic_register_intr(int irq)
8226 + irq_desc[irq].status &= ~IRQ_LEVEL;
8227 + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
8231 static void __init setup_nmi(void)
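This is the 64-bit side of the lapic conversion seen earlier in the
32-bit file: struct hw_interrupt_type is retired in favour of struct
irq_chip plus a flow handler. The field correspondence, roughly:

	/*
	 * old hw_interrupt_type		new irq_chip usage
	 * .startup / .shutdown		defaulted by the genirq core
	 * .enable			.unmask  (unmask_lapic_irq)
	 * .disable			.mask    (mask_lapic_irq)
	 * .ack				.ack     (ack_lapic_irq)
	 * .end				absorbed by handle_edge_irq
	 */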
8234 @@ -1664,6 +1686,7 @@ static inline void __init check_timer(vo
8235 struct irq_cfg *cfg = irq_cfg + 0;
8236 int apic1, pin1, apic2, pin2;
8237 unsigned long flags;
8240 local_irq_save(flags);
8242 @@ -1674,34 +1697,48 @@ static inline void __init check_timer(vo
8243 assign_irq_vector(0, TARGET_CPUS);
8246 - * Subtle, code in do_timer_interrupt() expects an AEOI
8247 - * mode for the 8259A whenever interrupts are routed
8248 - * through I/O APICs. Also IRQ0 has to be enabled in
8249 - * the 8259A which implies the virtual wire has to be
8250 - * disabled in the local APIC.
8251 + * As IRQ0 is to be enabled in the 8259A, the virtual
8252 + * wire has to be disabled in the local APIC.
8254 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8256 - if (timer_over_8254 > 0)
8257 - enable_8259A_irq(0);
8259 pin1 = find_isa_irq_pin(0, mp_INT);
8260 apic1 = find_isa_irq_apic(0, mp_INT);
8261 pin2 = ioapic_i8259.pin;
8262 apic2 = ioapic_i8259.apic;
8264 - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
8265 - cfg->vector, apic1, pin1, apic2, pin2);
8266 + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
8267 + "apic1=%d pin1=%d apic2=%d pin2=%d\n",
8268 + cfg->vector, apic1, pin1, apic2, pin2);
8271 + * Some BIOS writers are clueless and report the ExtINTA
8272 + * I/O APIC input from the cascaded 8259A as the timer
8273 + * interrupt input. So just in case, if only one pin
8274 + * was found above, try it both directly and through the
8281 + } else if (pin2 == -1) {
8288 * Ok, does IRQ0 through the IOAPIC work?
8291 + add_pin_to_irq(0, apic1, pin1);
8292 + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
8294 unmask_IO_APIC_irq(0);
8295 if (!no_timer_check && timer_irq_works()) {
8296 - nmi_watchdog_default();
8297 if (nmi_watchdog == NMI_IO_APIC) {
8298 - disable_8259A_irq(0);
8300 enable_8259A_irq(0);
8302 @@ -1710,54 +1747,62 @@ static inline void __init check_timer(vo
8305 clear_IO_APIC_pin(apic1, pin1);
8306 - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
8307 - "connected to IO-APIC\n");
8310 - apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
8311 - "through the 8259A ... ");
8313 - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
8316 + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
8317 + "8254 timer not connected to IO-APIC\n");
8319 + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
8320 + "(IRQ0) through the 8259A ...\n");
8321 + apic_printk(APIC_QUIET, KERN_INFO
8322 + "..... (found apic %d pin %d) ...\n", apic2, pin2);
8324 * legacy devices should be connected to IO APIC #0
8326 - setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
8327 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
8328 + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
8329 + unmask_IO_APIC_irq(0);
8330 + enable_8259A_irq(0);
8331 if (timer_irq_works()) {
8332 - apic_printk(APIC_VERBOSE," works.\n");
8333 - nmi_watchdog_default();
8334 + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
8335 + timer_through_8259 = 1;
8336 if (nmi_watchdog == NMI_IO_APIC) {
8337 + disable_8259A_irq(0);
8339 + enable_8259A_irq(0);
8344 * Cleanup, just in case ...
8346 + disable_8259A_irq(0);
8347 clear_IO_APIC_pin(apic2, pin2);
8348 + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
8350 - apic_printk(APIC_VERBOSE," failed.\n");
8352 if (nmi_watchdog == NMI_IO_APIC) {
8353 - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
8355 + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
8356 + "through the IO-APIC - disabling NMI Watchdog!\n");
8357 + nmi_watchdog = NMI_NONE;
8360 - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
8361 + apic_printk(APIC_QUIET, KERN_INFO
8362 + "...trying to set up timer as Virtual Wire IRQ...\n");
8364 - disable_8259A_irq(0);
8365 - irq_desc[0].chip = &lapic_irq_type;
8366 + lapic_register_intr(0);
8367 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
8368 enable_8259A_irq(0);
8370 if (timer_irq_works()) {
8371 - apic_printk(APIC_VERBOSE," works.\n");
8372 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8375 + disable_8259A_irq(0);
8376 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
8377 - apic_printk(APIC_VERBOSE," failed.\n");
8378 + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
8380 - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
8381 + apic_printk(APIC_QUIET, KERN_INFO
8382 + "...trying to set up timer as ExtINT IRQ...\n");
8386 @@ -1766,11 +1811,12 @@ static inline void __init check_timer(vo
8387 unlock_ExtINT_logic();
8389 if (timer_irq_works()) {
8390 - apic_printk(APIC_VERBOSE," works.\n");
8391 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8394 - apic_printk(APIC_VERBOSE," failed :(.\n");
8395 - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
8396 + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
8397 + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
8398 + "report. Then try booting with the 'noapic' option.\n");
8400 local_irq_restore(flags);
8402 @@ -1788,10 +1834,21 @@ __setup("no_timer_check", notimercheck);
8406 - * IRQs that are handled by the PIC in the MPS IOAPIC case.
8407 - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
8408 - * Linux doesn't really care, as it's not actually used
8409 - * for any interrupt handling anyway.
8410 + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
8411 + * to devices. However there may be an I/O APIC pin available for
8412 + * this interrupt regardless. The pin may be left unconnected, but
8413 + * typically it will be reused as an ExtINT cascade interrupt for
8414 + * the master 8259A. In the MPS case such a pin will normally be
8415 + * reported as an ExtINT interrupt in the MP table. With ACPI
8416 + * there is no provision for ExtINT interrupts, and in the absence
8417 + * of an override it would be treated as an ordinary ISA I/O APIC
8418 + * interrupt, that is edge-triggered and unmasked by default. We
8419 + * used to do this, but it caused problems on some systems because
8420 + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
8421 + * the same ExtINT cascade interrupt to drive the local APIC of the
8422 + * bootstrap processor. Therefore we refrain from routing IRQ2 to
8423 + * the I/O APIC in all cases now. No actual device should request
8424 + * it anyway. --macro
8426 #define PIC_IRQS (1<<2)
8428 @@ -1799,10 +1856,7 @@ void __init setup_IO_APIC(void)
8433 - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
8435 - io_apic_irqs = ~PIC_IRQS;
8436 + io_apic_irqs = ~PIC_IRQS;
8438 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
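
The now-unconditional io_apic_irqs = ~PIC_IRQS encodes the comment above:
every ISA IRQ except the IRQ2 cascade is handled by the IO-APIC. A standalone
check of the mask arithmetic:

	#include <stdio.h>

	#define PIC_IRQS (1 << 2)	/* only the cascade IRQ stays on the 8259A */

	int main(void)
	{
		unsigned long io_apic_irqs = ~PIC_IRQS;
		int irq;

		for (irq = 0; irq < 16; irq++)
			printf("IRQ%-2d -> %s\n", irq,
			       (io_apic_irqs & (1UL << irq)) ? "IO-APIC" : "8259A");
		return 0;
	}
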
8440 @@ -1851,8 +1905,8 @@ static int ioapic_resume(struct sys_devi
8442 spin_lock_irqsave(&ioapic_lock, flags);
8443 reg_00.raw = io_apic_read(dev->id, 0);
8444 - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
8445 - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
8446 + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
8447 + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
8448 io_apic_write(dev->id, 0, reg_00.raw);
8450 spin_unlock_irqrestore(&ioapic_lock, flags);
8451 @@ -2254,8 +2308,8 @@ int acpi_get_override_irq(int bus_irq, i
8454 for (i = 0; i < mp_irq_entries; i++)
8455 - if (mp_irqs[i].mpc_irqtype == mp_INT &&
8456 - mp_irqs[i].mpc_srcbusirq == bus_irq)
8457 + if (mp_irqs[i].mp_irqtype == mp_INT &&
8458 + mp_irqs[i].mp_srcbusirq == bus_irq)
8460 if (i >= mp_irq_entries)
8462 @@ -2349,7 +2403,7 @@ void __init ioapic_init_mappings(void)
8463 ioapic_res = ioapic_setup_resources();
8464 for (i = 0; i < nr_ioapics; i++) {
8465 if (smp_found_config) {
8466 - ioapic_phys = mp_ioapics[i].mpc_apicaddr;
8467 + ioapic_phys = mp_ioapics[i].mp_apicaddr;
8469 ioapic_phys = (unsigned long)
8470 alloc_bootmem_pages(PAGE_SIZE);
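
The mpc_apicid/mpc_apicaddr to mp_apicid/mp_apicaddr switches in the hunks
above follow 2.6.27's split between the raw MP-table records
(struct mpc_config_ioapic and friends) and the kernel's own parsed copies.
The field names below match the accesses in this patch; the exact types are
from memory and only meant for orientation:

	/* sketch of the parsed-side types (types approximate) */
	struct mp_config_ioapic {
		unsigned long	mp_apicaddr;
		unsigned int	mp_apicid;
		unsigned char	mp_type;
		unsigned char	mp_apicver;
		unsigned char	mp_flags;
	};

	struct mp_config_intsrc {
		unsigned int	mp_dstapic;
		unsigned char	mp_type;
		unsigned char	mp_irqtype;
		unsigned short	mp_irqflag;
		unsigned char	mp_srcbus;
		unsigned char	mp_srcbusirq;
		unsigned char	mp_dstirq;
	};
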
8471 --- sle11-2009-10-16.orig/arch/x86/kernel/ipi-xen.c 2009-03-16 16:38:05.000000000 +0100
8472 +++ sle11-2009-10-16/arch/x86/kernel/ipi-xen.c 2009-06-04 10:21:39.000000000 +0200
8474 #include <linux/kernel_stat.h>
8475 #include <linux/mc146818rtc.h>
8476 #include <linux/cache.h>
8477 -#include <linux/interrupt.h>
8478 #include <linux/cpu.h>
8479 #include <linux/module.h>
8481 @@ -85,7 +84,7 @@ void __send_IPI_shortcut(unsigned int sh
8483 * Send the IPI. The write to APIC_ICR fires this off.
8485 - apic_write_around(APIC_ICR, cfg);
8486 + apic_write(APIC_ICR, cfg);
8490 @@ -132,7 +131,7 @@ static inline void __send_IPI_dest_field
8491 * prepare target chip field
8493 cfg = __prepare_ICR2(mask);
8494 - apic_write_around(APIC_ICR2, cfg);
8495 + apic_write(APIC_ICR2, cfg);
8499 @@ -142,7 +141,7 @@ static inline void __send_IPI_dest_field
8501 * Send the IPI. The write to APIC_ICR fires this off.
8503 - apic_write_around(APIC_ICR, cfg);
8504 + apic_write(APIC_ICR, cfg);
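
apic_write_around() disappears in 2.6.27 (apic_write() becomes the single
entry point), but the ordering these call sites rely on is unchanged: the
destination must land in ICR2 before the ICR write launches the IPI. A toy
model of that two-register protocol; register layout simplified, helpers
invented for the demo:

	#include <stdio.h>
	#include <stdint.h>

	static uint32_t icr, icr2;	/* toy stand-ins for the APIC registers */

	static void apic_write(uint32_t *reg, uint32_t val)
	{
		*reg = val;
		if (reg == &icr)	/* the ICR write is the trigger */
			printf("IPI: vector 0x%02x -> dest 0x%02x\n",
			       (unsigned)(icr & 0xff), (unsigned)(icr2 >> 24));
	}

	int main(void)
	{
		apic_write(&icr2, 0x0Fu << 24);	/* destination field goes in first */
		apic_write(&icr, 0x30);		/* then the write that fires it off */
		return 0;
	}
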
8508 --- sle11-2009-10-16.orig/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:38:05.000000000 +0100
8509 +++ sle11-2009-10-16/arch/x86/kernel/irq_32-xen.c 2009-06-04 10:21:39.000000000 +0200
8510 @@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
8514 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
8515 +/* Debugging check for stack overflow: is there less than 1KB free? */
8516 +static int check_stack_overflow(void)
8520 + __asm__ __volatile__("andl %%esp,%0" :
8521 + "=r" (sp) : "0" (THREAD_SIZE - 1));
8523 + return sp < (sizeof(struct thread_info) + STACK_WARN);
8526 +static void print_stack_overflow(void)
8528 + printk(KERN_WARNING "low stack detected by irq handler\n");
8533 +static inline int check_stack_overflow(void) { return 0; }
8534 +static inline void print_stack_overflow(void) { }
8537 #ifdef CONFIG_4KSTACKS
8539 * per-CPU IRQ handling contexts (thread information and stack)
8540 @@ -59,48 +82,26 @@ union irq_ctx {
8542 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
8543 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
8547 - * do_IRQ handles all normal device IRQ's (the special
8548 - * SMP cross-CPU interrupts have their own specific
8551 -unsigned int do_IRQ(struct pt_regs *regs)
8553 - struct pt_regs *old_regs;
8554 - /* high bit used in ret_from_ code */
8555 - int irq = ~regs->orig_ax;
8556 - struct irq_desc *desc = irq_desc + irq;
8557 -#ifdef CONFIG_4KSTACKS
8558 - union irq_ctx *curctx, *irqctx;
8562 - if (unlikely((unsigned)irq >= NR_IRQS)) {
8563 - printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8567 +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8568 +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8570 - old_regs = set_irq_regs(regs);
8572 -#ifdef CONFIG_DEBUG_STACKOVERFLOW
8573 - /* Debugging check for stack overflow: is there less than 1KB free? */
8577 - __asm__ __volatile__("andl %%esp,%0" :
8578 - "=r" (sp) : "0" (THREAD_SIZE - 1));
8579 - if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
8580 - printk("do_IRQ: stack overflow: %ld\n",
8581 - sp - sizeof(struct thread_info));
8586 +static void call_on_stack(void *func, void *stack)
8588 + asm volatile("xchgl %%ebx,%%esp \n"
8590 + "movl %%ebx,%%esp \n"
8594 + : "memory", "cc", "edx", "ecx", "eax");
8597 -#ifdef CONFIG_4KSTACKS
8599 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
8601 + union irq_ctx *curctx, *irqctx;
8602 + u32 *isp, arg1, arg2;
8604 curctx = (union irq_ctx *) current_thread_info();
8605 irqctx = hardirq_ctx[smp_processor_id()];
8606 @@ -111,52 +112,39 @@ unsigned int do_IRQ(struct pt_regs *regs
8607 * handler) we can't do that and just have to keep using the
8608 * current stack (which is the irq stack already after all)
8610 - if (curctx != irqctx) {
8611 - int arg1, arg2, bx;
8613 - /* build the stack frame on the IRQ stack */
8614 - isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8615 - irqctx->tinfo.task = curctx->tinfo.task;
8616 - irqctx->tinfo.previous_esp = current_stack_pointer;
8617 + if (unlikely(curctx == irqctx))
8621 - * Copy the softirq bits in preempt_count so that the
8622 - * softirq checks work in the hardirq context.
8624 - irqctx->tinfo.preempt_count =
8625 - (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8626 - (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8629 - " xchgl %%ebx,%%esp \n"
8631 - " movl %%ebx,%%esp \n"
8632 - : "=a" (arg1), "=d" (arg2), "=b" (bx)
8633 - : "0" (irq), "1" (desc), "2" (isp),
8634 - "D" (desc->handle_irq)
8635 - : "memory", "cc", "ecx"
8639 - desc->handle_irq(irq, desc);
8640 + /* build the stack frame on the IRQ stack */
8641 + isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
8642 + irqctx->tinfo.task = curctx->tinfo.task;
8643 + irqctx->tinfo.previous_esp = current_stack_pointer;
8646 - set_irq_regs(old_regs);
8648 + * Copy the softirq bits in preempt_count so that the
8649 + * softirq checks work in the hardirq context.
8651 + irqctx->tinfo.preempt_count =
8652 + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8653 + (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8655 + if (unlikely(overflow))
8656 + call_on_stack(print_stack_overflow, isp);
8658 + asm volatile("xchgl %%ebx,%%esp \n"
8660 + "movl %%ebx,%%esp \n"
8661 + : "=a" (arg1), "=d" (arg2), "=b" (isp)
8662 + : "0" (irq), "1" (desc), "2" (isp),
8663 + "D" (desc->handle_irq)
8664 + : "memory", "cc", "ecx");
8668 -#ifdef CONFIG_4KSTACKS
8670 -static char softirq_stack[NR_CPUS * THREAD_SIZE]
8671 - __attribute__((__section__(".bss.page_aligned")));
8673 -static char hardirq_stack[NR_CPUS * THREAD_SIZE]
8674 - __attribute__((__section__(".bss.page_aligned")));
8677 * allocate per-cpu stacks for hardirq and for softirq processing
8679 -void irq_ctx_init(int cpu)
8680 +void __cpuinit irq_ctx_init(int cpu)
8682 union irq_ctx *irqctx;
8684 @@ -164,25 +152,25 @@ void irq_ctx_init(int cpu)
8687 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
8688 - irqctx->tinfo.task = NULL;
8689 - irqctx->tinfo.exec_domain = NULL;
8690 - irqctx->tinfo.cpu = cpu;
8691 - irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8692 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8693 + irqctx->tinfo.task = NULL;
8694 + irqctx->tinfo.exec_domain = NULL;
8695 + irqctx->tinfo.cpu = cpu;
8696 + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8697 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8699 hardirq_ctx[cpu] = irqctx;
8701 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
8702 - irqctx->tinfo.task = NULL;
8703 - irqctx->tinfo.exec_domain = NULL;
8704 - irqctx->tinfo.cpu = cpu;
8705 - irqctx->tinfo.preempt_count = 0;
8706 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8707 + irqctx->tinfo.task = NULL;
8708 + irqctx->tinfo.exec_domain = NULL;
8709 + irqctx->tinfo.cpu = cpu;
8710 + irqctx->tinfo.preempt_count = 0;
8711 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8713 softirq_ctx[cpu] = irqctx;
8715 - printk("CPU %u irqstacks, hard=%p soft=%p\n",
8716 - cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8717 + printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
8718 + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8721 void irq_ctx_exit(int cpu)
8722 @@ -211,25 +199,56 @@ asmlinkage void do_softirq(void)
8723 /* build the stack frame on the softirq stack */
8724 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8727 - " xchgl %%ebx,%%esp \n"
8728 - " call __do_softirq \n"
8729 - " movl %%ebx,%%esp \n"
8732 - : "memory", "cc", "edx", "ecx", "eax"
8734 + call_on_stack(__do_softirq, isp);
8736 * Shouldn't happen, we returned above if in_interrupt():
8739 WARN_ON_ONCE(softirq_count());
8742 local_irq_restore(flags);
8747 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
8751 + * do_IRQ handles all normal device IRQ's (the special
8752 + * SMP cross-CPU interrupts have their own specific
8755 +unsigned int do_IRQ(struct pt_regs *regs)
8757 + struct pt_regs *old_regs;
8758 + /* high bit used in ret_from_ code */
8759 + int overflow, irq = ~regs->orig_ax;
8760 + struct irq_desc *desc = irq_desc + irq;
8762 + if (unlikely((unsigned)irq >= NR_IRQS)) {
8763 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8768 + old_regs = set_irq_regs(regs);
8771 + overflow = check_stack_overflow();
8773 + if (!execute_on_irq_stack(overflow, desc, irq)) {
8774 + if (unlikely(overflow))
8775 + print_stack_overflow();
8776 + desc->handle_irq(irq, desc);
8780 + set_irq_regs(old_regs);
8785 * Interrupt statistics:
8788 @@ -337,6 +356,42 @@ skip:
8793 + * /proc/stat helpers
8795 +u64 arch_irq_stat_cpu(unsigned int cpu)
8797 + u64 sum = nmi_count(cpu);
8799 +#ifdef CONFIG_X86_LOCAL_APIC
8800 + sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
8803 + sum += per_cpu(irq_stat, cpu).irq_resched_count;
8804 + sum += per_cpu(irq_stat, cpu).irq_call_count;
8806 + sum += per_cpu(irq_stat, cpu).irq_tlb_count;
8809 +#ifdef CONFIG_X86_MCE
8810 + sum += per_cpu(irq_stat, cpu).irq_thermal_count;
8812 +#ifdef CONFIG_X86_LOCAL_APIC
8813 + sum += per_cpu(irq_stat, cpu).irq_spurious_count;
8818 +u64 arch_irq_stat(void)
8820 + u64 sum = atomic_read(&irq_err_count);
8822 +#ifdef CONFIG_X86_IO_APIC
8823 + sum += atomic_read(&irq_mis_count);
8828 #ifdef CONFIG_HOTPLUG_CPU
8830 void fixup_irqs(cpumask_t map)
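
The check_stack_overflow() helper pulled out at the top of this file works
because 32-bit thread stacks are THREAD_SIZE-aligned: masking %esp with
THREAD_SIZE - 1 yields the offset from the stack base, and thread_info sits
at that base. A user-space model of the arithmetic, with 8 KiB stacks and a
64-byte thread_info assumed for the demo (STACK_WARN is THREAD_SIZE/8 in
this era):

	#include <stdio.h>

	#define THREAD_SIZE	 8192UL			/* assumed: 4KSTACKS off */
	#define STACK_WARN	 (THREAD_SIZE / 8)
	#define THREAD_INFO_SIZE 64			/* placeholder for sizeof(struct thread_info) */

	static int check_stack_overflow(unsigned long esp)
	{
		unsigned long sp = esp & (THREAD_SIZE - 1);	/* offset into stack area */

		/* thread_info lives at the bottom; warn when ~1KB or less remains */
		return sp < (THREAD_INFO_SIZE + STACK_WARN);
	}

	int main(void)
	{
		printf("%d\n", check_stack_overflow(0xc0000000UL + 512));	/* deep: 1 */
		printf("%d\n", check_stack_overflow(0xc0000000UL + 4096));	/* fine: 0 */
		return 0;
	}
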
8831 --- sle11-2009-10-16.orig/arch/x86/kernel/irq_64-xen.c 2009-03-16 16:33:40.000000000 +0100
8832 +++ sle11-2009-10-16/arch/x86/kernel/irq_64-xen.c 2009-06-04 10:21:39.000000000 +0200
8833 @@ -163,6 +163,34 @@ skip:
8837 + * /proc/stat helpers
8839 +u64 arch_irq_stat_cpu(unsigned int cpu)
8841 + u64 sum = cpu_pda(cpu)->__nmi_count;
8843 + sum += cpu_pda(cpu)->apic_timer_irqs;
8845 + sum += cpu_pda(cpu)->irq_resched_count;
8846 + sum += cpu_pda(cpu)->irq_call_count;
8848 + sum += cpu_pda(cpu)->irq_tlb_count;
8851 +#ifdef CONFIG_X86_MCE
8852 + sum += cpu_pda(cpu)->irq_thermal_count;
8853 + sum += cpu_pda(cpu)->irq_threshold_count;
8855 + sum += cpu_pda(cpu)->irq_spurious_count;
8859 +u64 arch_irq_stat(void)
8861 + return atomic_read(&irq_err_count);
8865 * do_IRQ handles all normal device IRQ's (the special
8866 * SMP cross-CPU interrupts have their own specific
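
The arch_irq_stat_cpu()/arch_irq_stat() pair added to both irq_32-xen.c and
irq_64-xen.c feeds /proc/stat: the generic code adds the per-CPU term on top
of the ordinary per-IRQ counters, plus one global term at the end. A toy
model of the aggregation, with all counter values invented:

	#include <stdio.h>

	#define NR_CPUS 2

	static unsigned long long device_irqs[NR_CPUS] = { 1000, 2000 };
	static unsigned long long arch_irq_stat_cpu(int cpu) { return 10 + cpu; }	/* timer, resched, ... */
	static unsigned long long arch_irq_stat(void) { return 3; }			/* irq_err_count etc. */

	int main(void)
	{
		unsigned long long sum = 0;
		int cpu;

		for (cpu = 0; cpu < NR_CPUS; cpu++)
			sum += device_irqs[cpu] + arch_irq_stat_cpu(cpu);
		sum += arch_irq_stat();
		printf("intr %llu ...\n", sum);	/* shape of the /proc/stat line */
		return 0;
	}
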
8868 --- sle11-2009-10-16.orig/arch/x86/kernel/ldt-xen.c 2009-03-16 16:33:40.000000000 +0100
8869 +++ sle11-2009-10-16/arch/x86/kernel/ldt-xen.c 2009-06-04 10:21:39.000000000 +0200
8871 #include <asm/mmu_context.h>
8874 -static void flush_ldt(void *null)
8875 +static void flush_ldt(void *current_mm)
8877 - if (current->active_mm)
8878 + if (current->active_mm == current_mm)
8879 load_LDT(&current->active_mm->context);
8882 @@ -62,8 +62,6 @@ static int alloc_ldt(mm_context_t *pc, i
8890 make_pages_readonly(newldt,
8891 @@ -71,9 +69,9 @@ static int alloc_ldt(mm_context_t *pc, i
8892 XENFEAT_writable_descriptor_tables);
8895 - mask = cpumask_of_cpu(smp_processor_id());
8896 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
8897 - smp_call_function(flush_ldt, NULL, 1, 1);
8898 + if (!cpus_equal(current->mm->cpu_vm_mask,
8899 + cpumask_of_cpu(smp_processor_id())))
8900 + smp_call_function(flush_ldt, current->mm, 1);
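
Two things change here at once: 2.6.27's smp_call_function() drops the old
'nonatomic' argument (the call becomes func, info, wait), and the LDT flush
now passes the owning mm as info so that only CPUs actually running that mm
reload their LDT. A toy model of the targeting logic, with the data
structures invented for the demo:

	#include <stdio.h>

	struct mm_struct { int id; };

	static struct mm_struct mm_a = { 1 }, mm_b = { 2 };
	static struct mm_struct *active_mm[2] = { &mm_a, &mm_b };

	static void flush_ldt(int cpu, void *current_mm)
	{
		if (active_mm[cpu] == current_mm)	/* only act on the matching mm */
			printf("cpu%d reloads LDT for mm %d\n", cpu,
			       ((struct mm_struct *)current_mm)->id);
	}

	int main(void)
	{
		int cpu;

		for (cpu = 0; cpu < 2; cpu++)	/* what smp_call_function() fans out */
			flush_ldt(cpu, &mm_a);	/* only cpu0 matches */
		return 0;
	}
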
8904 --- sle11-2009-10-16.orig/arch/x86/kernel/machine_kexec_32.c 2008-11-25 12:35:53.000000000 +0100
8905 +++ sle11-2009-10-16/arch/x86/kernel/machine_kexec_32.c 2009-06-04 10:21:39.000000000 +0200
8906 @@ -68,6 +68,8 @@ void machine_kexec_setup_load_arg(xen_ke
8907 xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
8908 xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
8910 + if (image->type == KEXEC_TYPE_DEFAULT)
8911 + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
8914 int __init machine_kexec_setup_resources(struct resource *hypervisor,
8915 --- sle11-2009-10-16.orig/arch/x86/kernel/microcode-xen.c 2009-03-16 16:38:05.000000000 +0100
8916 +++ sle11-2009-10-16/arch/x86/kernel/microcode-xen.c 2009-06-04 10:21:39.000000000 +0200
8918 * 2006 Shaohua Li <shaohua.li@intel.com>
8920 * This driver allows to upgrade microcode on Intel processors
8921 - * belonging to IA-32 family - PentiumPro, Pentium II,
8922 + * belonging to IA-32 family - PentiumPro, Pentium II,
8923 * Pentium III, Xeon, Pentium 4, etc.
8925 - * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
8926 - * Order Number 245472 or free download from:
8928 - * http://developer.intel.com/design/pentium4/manuals/245472.htm
8929 + * Reference: Section 8.11 of Volume 3a, IA-32 Intel Architecture
8930 + * Software Developer's Manual
8931 + * Order Number 253668 or free download from:
8933 + * http://developer.intel.com/design/pentium4/manuals/253668.htm
8935 * For more information, go to http://www.urbanmyth.org/microcode
8938 #include <linux/kernel.h>
8939 #include <linux/init.h>
8940 #include <linux/sched.h>
8941 +#include <linux/smp_lock.h>
8942 #include <linux/cpumask.h>
8943 #include <linux/module.h>
8944 #include <linux/slab.h>
8945 @@ -86,6 +88,7 @@ static int do_microcode_update (const vo
8947 static int microcode_open (struct inode *unused1, struct file *unused2)
8949 + cycle_kernel_lock();
8950 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
8953 @@ -162,7 +165,7 @@ static int request_microcode(void)
8954 c->x86, c->x86_model, c->x86_mask);
8955 error = request_firmware(&firmware, name, &microcode_pdev->dev);
8957 - pr_debug("microcode: ucode data file %s load failed\n", name);
8958 + pr_debug("microcode: data file %s load failed\n", name);
8962 @@ -183,6 +186,9 @@ static int __init microcode_init (void)
8967 + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8969 error = microcode_dev_init();
8972 @@ -195,8 +201,6 @@ static int __init microcode_init (void)
8974 request_microcode();
8977 - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8981 --- sle11-2009-10-16.orig/arch/x86/kernel/mpparse-xen.c 2009-03-16 16:38:05.000000000 +0100
8982 +++ sle11-2009-10-16/arch/x86/kernel/mpparse-xen.c 2009-06-04 10:21:39.000000000 +0200
8984 #include <asm/proto.h>
8985 #include <asm/acpi.h>
8986 #include <asm/bios_ebda.h>
8987 +#include <asm/e820.h>
8988 +#include <asm/trampoline.h>
8989 +#include <asm/setup.h>
8991 #include <mach_apic.h>
8992 #ifdef CONFIG_X86_32
8994 #include <mach_mpparse.h>
8997 -/* Have we found an MP table */
8998 -int smp_found_config;
9001 - * Various Linux-internal data structures created from the
9004 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9005 -int mp_bus_id_to_type[MAX_MP_BUSSES];
9008 -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
9009 -int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
9011 -static int mp_current_pci_id;
9016 - * Intel MP BIOS table parsing routines:
9018 +static void *_bus_to_virt(unsigned long ma)
9020 + return is_ISA_range(ma, ma) ? isa_bus_to_virt(ma) : bus_to_virt(ma);
9024 * Checksum an MP configuration block.
9025 @@ -68,19 +54,7 @@ static int __init mpf_checksum(unsigned
9029 -#ifdef CONFIG_X86_NUMAQ
9031 - * Have to match translation table entries to main table entries by counter
9032 - * hence the mpc_record variable .... can't see a less disgusting way of
9036 -static int mpc_record;
9037 -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
9041 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
9042 +static void __init MP_processor_info(struct mpc_config_processor *m)
9046 @@ -90,11 +64,12 @@ static void __cpuinit MP_processor_info(
9050 -#ifdef CONFIG_X86_NUMAQ
9051 - apicid = mpc_apic_id(m, translation_table[mpc_record]);
9053 - apicid = m->mpc_apicid;
9056 + if (x86_quirks->mpc_apic_id)
9057 + apicid = x86_quirks->mpc_apic_id(m);
9059 + apicid = m->mpc_apicid;
9061 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
9062 bootup_cpu = " (Bootup-CPU)";
9063 boot_cpu_physical_apicid = m->mpc_apicid;
9064 @@ -107,18 +82,17 @@ static void __cpuinit MP_processor_info(
9068 +#ifdef CONFIG_X86_IO_APIC
9069 static void __init MP_bus_info(struct mpc_config_bus *m)
9073 memcpy(str, m->mpc_bustype, 6);
9076 -#ifdef CONFIG_X86_NUMAQ
9077 - mpc_oem_bus_info(m, str, translation_table[mpc_record]);
9079 - Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
9081 + if (x86_quirks->mpc_oem_bus_info)
9082 + x86_quirks->mpc_oem_bus_info(m, str);
9084 + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
9086 #if MAX_MP_BUSSES < 256
9087 if (m->mpc_busid >= MAX_MP_BUSSES) {
9088 @@ -135,12 +109,10 @@ static void __init MP_bus_info(struct mp
9089 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
9091 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
9092 -#ifdef CONFIG_X86_NUMAQ
9093 - mpc_oem_pci_bus(m, translation_table[mpc_record]);
9095 + if (x86_quirks->mpc_oem_pci_bus)
9096 + x86_quirks->mpc_oem_pci_bus(m);
9098 clear_bit(m->mpc_busid, mp_bus_not_pci);
9099 - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
9100 - mp_current_pci_id++;
9101 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
9102 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
9103 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
9104 @@ -151,6 +123,7 @@ static void __init MP_bus_info(struct mp
9106 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
9110 #ifdef CONFIG_X86_IO_APIC
9112 @@ -180,117 +153,111 @@ static void __init MP_ioapic_info(struct
9113 if (bad_ioapic(m->mpc_apicaddr))
9116 - mp_ioapics[nr_ioapics] = *m;
9117 + mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
9118 + mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
9119 + mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
9120 + mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
9121 + mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
9125 -static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9126 +static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
9128 - mp_irqs[mp_irq_entries] = *m;
9129 - Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
9130 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9131 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9132 m->mpc_irqtype, m->mpc_irqflag & 3,
9133 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
9134 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
9135 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
9136 - panic("Max # of irq sources exceeded!!\n");
9141 -static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9142 +static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
9144 - Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
9145 - " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9146 - m->mpc_irqtype, m->mpc_irqflag & 3,
9147 - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9148 - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9149 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9150 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9151 + mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
9152 + (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
9153 + mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
9156 -#ifdef CONFIG_X86_NUMAQ
9157 -static void __init MP_translation_info(struct mpc_config_translation *m)
9158 +static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
9159 + struct mp_config_intsrc *mp_irq)
9162 - "Translation: record %d, type %d, quad %d, global %d, local %d\n",
9163 - mpc_record, m->trans_type, m->trans_quad, m->trans_global,
9165 + mp_irq->mp_dstapic = m->mpc_dstapic;
9166 + mp_irq->mp_type = m->mpc_type;
9167 + mp_irq->mp_irqtype = m->mpc_irqtype;
9168 + mp_irq->mp_irqflag = m->mpc_irqflag;
9169 + mp_irq->mp_srcbus = m->mpc_srcbus;
9170 + mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
9171 + mp_irq->mp_dstirq = m->mpc_dstirq;
9174 - if (mpc_record >= MAX_MPC_ENTRY)
9175 - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
9177 - translation_table[mpc_record] = m; /* stash this for later */
9178 - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
9179 - node_set_online(m->trans_quad);
9180 +static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
9181 + struct mpc_config_intsrc *m)
9183 + m->mpc_dstapic = mp_irq->mp_dstapic;
9184 + m->mpc_type = mp_irq->mp_type;
9185 + m->mpc_irqtype = mp_irq->mp_irqtype;
9186 + m->mpc_irqflag = mp_irq->mp_irqflag;
9187 + m->mpc_srcbus = mp_irq->mp_srcbus;
9188 + m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
9189 + m->mpc_dstirq = mp_irq->mp_dstirq;
9193 - * Read/parse the MPC oem tables
9195 +static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
9196 + struct mpc_config_intsrc *m)
9198 + if (mp_irq->mp_dstapic != m->mpc_dstapic)
9200 + if (mp_irq->mp_type != m->mpc_type)
9202 + if (mp_irq->mp_irqtype != m->mpc_irqtype)
9204 + if (mp_irq->mp_irqflag != m->mpc_irqflag)
9206 + if (mp_irq->mp_srcbus != m->mpc_srcbus)
9208 + if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
9210 + if (mp_irq->mp_dstirq != m->mpc_dstirq)
9213 -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
9214 - unsigned short oemsize)
9218 +static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9220 - int count = sizeof(*oemtable); /* the header size */
9221 - unsigned char *oemptr = ((unsigned char *)oemtable) + count;
9225 - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
9227 - if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
9228 - printk(KERN_WARNING
9229 - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
9230 - oemtable->oem_signature[0], oemtable->oem_signature[1],
9231 - oemtable->oem_signature[2], oemtable->oem_signature[3]);
9234 - if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
9235 - printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
9238 - while (count < oemtable->oem_length) {
9239 - switch (*oemptr) {
9240 - case MP_TRANSLATION:
9242 - struct mpc_config_translation *m =
9243 - (struct mpc_config_translation *)oemptr;
9244 - MP_translation_info(m);
9245 - oemptr += sizeof(*m);
9246 - count += sizeof(*m);
9252 - printk(KERN_WARNING
9253 - "Unrecognised OEM table entry type! - %d\n",
9258 + print_MP_intsrc_info(m);
9260 + for (i = 0; i < mp_irq_entries; i++) {
9261 + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
9265 + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
9266 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
9267 + panic("Max # of irq sources exceeded!!\n");
9270 -static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
9274 +static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9276 - if (strncmp(oem, "IBM NUMA", 8))
9277 - printk("Warning! May not be a NUMA-Q system!\n");
9278 - if (mpc->mpc_oemptr)
9279 - smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
9280 - mpc->mpc_oemsize);
9281 + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
9282 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9283 + m->mpc_irqtype, m->mpc_irqflag & 3,
9284 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9285 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9287 -#endif /* CONFIG_X86_NUMAQ */
9290 * Read/parse the MPC
9293 -static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9294 +static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
9299 - int count = sizeof(*mpc);
9300 - unsigned char *mpt = ((unsigned char *)mpc) + count;
9302 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
9303 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
9304 @@ -313,19 +280,41 @@ static int __init smp_read_mpc(struct mp
9306 memcpy(oem, mpc->mpc_oem, 8);
9308 - printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
9309 + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
9311 memcpy(str, mpc->mpc_productid, 12);
9313 - printk("Product ID: %s ", str);
9315 -#ifdef CONFIG_X86_32
9316 - mps_oem_check(mpc, oem, str);
9318 - printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
9319 + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
9321 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
9326 +static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9331 + int count = sizeof(*mpc);
9332 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9334 + if (!smp_check_mpc(mpc, oem, str))
9337 +#ifdef CONFIG_X86_32
9339 + * need to make sure summit and es7000's mps_oem_check is safe to be
9340 + * called early via genericarch's mps_oem_check
9343 +#ifdef CONFIG_X86_NUMAQ
9344 + numaq_mps_oem_check(mpc, oem, str);
9347 + mps_oem_check(mpc, oem, str);
9349 /* save the local APIC address, it might be non-default */
9351 mp_lapic_addr = mpc->mpc_lapic;
9352 @@ -333,12 +322,17 @@ static int __init smp_read_mpc(struct mp
9356 + if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
9357 + struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
9358 + x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
9362 * Now process the configuration blocks.
9364 -#ifdef CONFIG_X86_NUMAQ
9367 + if (x86_quirks->mpc_record)
9368 + *x86_quirks->mpc_record = 0;
9370 while (count < mpc->mpc_length) {
9373 @@ -356,7 +350,9 @@ static int __init smp_read_mpc(struct mp
9375 struct mpc_config_bus *m =
9376 (struct mpc_config_bus *)mpt;
9377 +#ifdef CONFIG_X86_IO_APIC
9381 count += sizeof(*m);
9383 @@ -402,10 +398,14 @@ static int __init smp_read_mpc(struct mp
9384 count = mpc->mpc_length;
9387 -#ifdef CONFIG_X86_NUMAQ
9390 + if (x86_quirks->mpc_record)
9391 + (*x86_quirks->mpc_record)++;
9394 +#ifdef CONFIG_X86_GENERICARCH
9395 + generic_bigsmp_probe();
9398 setup_apic_routing();
9399 if (!num_processors)
9400 printk(KERN_ERR "MPTABLE: no processors registered!\n");
9401 @@ -431,7 +431,7 @@ static void __init construct_default_ioi
9402 intsrc.mpc_type = MP_INTSRC;
9403 intsrc.mpc_irqflag = 0; /* conforming */
9404 intsrc.mpc_srcbus = 0;
9405 - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
9406 + intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
9408 intsrc.mpc_irqtype = mp_INT;
9410 @@ -492,40 +492,11 @@ static void __init construct_default_ioi
9411 MP_intsrc_info(&intsrc);
9416 -static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9417 +static void __init construct_ioapic_table(int mpc_default_type)
9419 - struct mpc_config_processor processor;
9420 - struct mpc_config_bus bus;
9421 -#ifdef CONFIG_X86_IO_APIC
9422 struct mpc_config_ioapic ioapic;
9424 - struct mpc_config_lintsrc lintsrc;
9425 - int linttypes[2] = { mp_ExtINT, mp_NMI };
9429 - * local APIC has default address
9431 - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9434 - * 2 CPUs, numbered 0 & 1.
9436 - processor.mpc_type = MP_PROCESSOR;
9437 - /* Either an integrated APIC or a discrete 82489DX. */
9438 - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9439 - processor.mpc_cpuflag = CPU_ENABLED;
9440 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9441 - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9442 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9443 - processor.mpc_reserved[0] = 0;
9444 - processor.mpc_reserved[1] = 0;
9445 - for (i = 0; i < 2; i++) {
9446 - processor.mpc_apicid = i;
9447 - MP_processor_info(&processor);
9449 + struct mpc_config_bus bus;
9451 bus.mpc_type = MP_BUS;
9453 @@ -554,7 +525,6 @@ static inline void __init construct_defa
9457 -#ifdef CONFIG_X86_IO_APIC
9458 ioapic.mpc_type = MP_IOAPIC;
9459 ioapic.mpc_apicid = 2;
9460 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9461 @@ -566,7 +536,42 @@ static inline void __init construct_defa
9462 * We set up most of the low 16 IO-APIC pins according to MPS rules.
9464 construct_default_ioirq_mptable(mpc_default_type);
9467 +static inline void __init construct_ioapic_table(int mpc_default_type) { }
9470 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9472 + struct mpc_config_processor processor;
9473 + struct mpc_config_lintsrc lintsrc;
9474 + int linttypes[2] = { mp_ExtINT, mp_NMI };
9478 + * local APIC has default address
9480 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9483 + * 2 CPUs, numbered 0 & 1.
9485 + processor.mpc_type = MP_PROCESSOR;
9486 + /* Either an integrated APIC or a discrete 82489DX. */
9487 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9488 + processor.mpc_cpuflag = CPU_ENABLED;
9489 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9490 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9491 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9492 + processor.mpc_reserved[0] = 0;
9493 + processor.mpc_reserved[1] = 0;
9494 + for (i = 0; i < 2; i++) {
9495 + processor.mpc_apicid = i;
9496 + MP_processor_info(&processor);
9499 + construct_ioapic_table(mpc_default_type);
9501 lintsrc.mpc_type = MP_LINTSRC;
9502 lintsrc.mpc_irqflag = 0; /* conforming */
9503 lintsrc.mpc_srcbusid = 0;
9504 @@ -584,10 +589,14 @@ static struct intel_mp_floating *mpf_fou
9506 * Scan the memory blocks for an SMP configuration block.
9508 -static void __init __get_smp_config(unsigned early)
9509 +static void __init __get_smp_config(unsigned int early)
9511 struct intel_mp_floating *mpf = mpf_found;
9513 + if (x86_quirks->mach_get_smp_config) {
9514 + if (x86_quirks->mach_get_smp_config(early))
9517 if (acpi_lapic && early)
9520 @@ -604,7 +613,7 @@ static void __init __get_smp_config(unsi
9522 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
9523 mpf->mpf_specification);
9524 -#ifdef CONFIG_X86_32
9525 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
9526 if (mpf->mpf_feature2 & (1 << 7)) {
9527 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
9529 @@ -635,8 +644,10 @@ static void __init __get_smp_config(unsi
9530 * Read the physical hardware table. Anything here will
9531 * override the defaults.
9533 - if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
9534 + if (!smp_read_mpc(_bus_to_virt(mpf->mpf_physptr), early)) {
9535 +#ifdef CONFIG_X86_LOCAL_APIC
9536 smp_found_config = 0;
9539 "BIOS bug, MP table errors detected!...\n");
9540 printk(KERN_ERR "... disabling SMP support. "
9541 @@ -690,10 +701,11 @@ void __init get_smp_config(void)
9542 static int __init smp_scan_config(unsigned long base, unsigned long length,
9545 - unsigned int *bp = isa_bus_to_virt(base);
9546 + unsigned int *bp = _bus_to_virt(base);
9547 struct intel_mp_floating *mpf;
9549 - Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
9550 + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
9552 BUILD_BUG_ON(sizeof(*mpf) != 16);
9554 while (length > 0) {
9555 @@ -703,16 +715,22 @@ static int __init smp_scan_config(unsign
9556 !mpf_checksum((unsigned char *)bp, 16) &&
9557 ((mpf->mpf_specification == 1)
9558 || (mpf->mpf_specification == 4))) {
9560 +#ifdef CONFIG_X86_LOCAL_APIC
9561 smp_found_config = 1;
9564 -#ifdef CONFIG_X86_32
9567 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9568 mpf, virt_to_phys(mpf));
9569 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
9573 + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
9575 if (mpf->mpf_physptr) {
9576 + unsigned long size = PAGE_SIZE;
9577 +#ifdef CONFIG_X86_32
9579 * We cannot access the MPC table to compute
9580 * table size yet, as only a few megabytes from
9581 @@ -722,27 +740,18 @@ static int __init smp_scan_config(unsign
9582 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
9583 * in reserve_bootmem.
9585 - unsigned long size = PAGE_SIZE;
9586 unsigned long end = max_low_pfn * PAGE_SIZE;
9587 if (mpf->mpf_physptr + size > end)
9588 size = end - mpf->mpf_physptr;
9589 - reserve_bootmem(mpf->mpf_physptr, size,
9591 + reserve_bootmem_generic(mpf->mpf_physptr, size,
9595 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9596 - mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
9598 -#elif !defined(CONFIG_XEN)
9602 - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
9603 - if (mpf->mpf_physptr)
9604 - reserve_bootmem_generic(mpf->mpf_physptr,
9606 + mpf, ((void *)bp - _bus_to_virt(base)) + base);
9613 @@ -750,12 +759,16 @@ static int __init smp_scan_config(unsign
9617 -static void __init __find_smp_config(unsigned reserve)
9618 +static void __init __find_smp_config(unsigned int reserve)
9621 unsigned int address;
9624 + if (x86_quirks->mach_find_smp_config) {
9625 + if (x86_quirks->mach_find_smp_config(reserve))
9629 * FIXME: Linux assumes you have 640K of base ram..
9630 * this continues the error...
9631 @@ -802,300 +815,297 @@ void __init find_smp_config(void)
9632 __find_smp_config(1);
9635 -/* --------------------------------------------------------------------------
9636 - ACPI-based MP Configuration
9637 - -------------------------------------------------------------------------- */
9640 - * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
9645 +#ifdef CONFIG_X86_IO_APIC
9646 +static u8 __initdata irq_used[MAX_IRQ_SOURCES];
9648 -#ifdef CONFIG_X86_IO_APIC
9649 +static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
9653 -#define MP_ISA_BUS 0
9654 + if (m->mpc_irqtype != mp_INT)
9657 -extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
9658 + if (m->mpc_irqflag != 0x0f)
9661 -static int mp_find_ioapic(int gsi)
9666 - /* Find the IOAPIC that manages this GSI. */
9667 - for (i = 0; i < nr_ioapics; i++) {
9668 - if ((gsi >= mp_ioapic_routing[i].gsi_base)
9669 - && (gsi <= mp_ioapic_routing[i].gsi_end))
9671 + for (i = 0; i < mp_irq_entries; i++) {
9672 + if (mp_irqs[i].mp_irqtype != mp_INT)
9675 + if (mp_irqs[i].mp_irqflag != 0x0f)
9678 + if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
9680 + if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
9682 + if (irq_used[i]) {
9683 + /* already claimed */
9690 - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
9695 -static u8 __init uniq_ioapic_id(u8 id)
9697 -#ifdef CONFIG_X86_32
9698 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
9699 - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
9700 - return io_apic_get_unique_id(nr_ioapics, id);
9705 - DECLARE_BITMAP(used, 256);
9706 - bitmap_zero(used, 256);
9707 - for (i = 0; i < nr_ioapics; i++) {
9708 - struct mpc_config_ioapic *ia = &mp_ioapics[i];
9709 - __set_bit(ia->mpc_apicid, used);
9711 - if (!test_bit(id, used))
9713 - return find_first_zero_bit(used, 256);
9714 +#define SPARE_SLOT_NUM 20
9716 +static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
9720 -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
9721 +static int __init replace_intsrc_all(struct mp_config_table *mpc,
9722 + unsigned long mpc_new_phys,
9723 + unsigned long mpc_new_length)
9727 - if (bad_ioapic(address))
9729 +#ifdef CONFIG_X86_IO_APIC
9731 + int nr_m_spare = 0;
9735 + int count = sizeof(*mpc);
9736 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9738 - mp_ioapics[idx].mpc_type = MP_IOAPIC;
9739 - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
9740 - mp_ioapics[idx].mpc_apicaddr = address;
9741 + printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
9742 + while (count < mpc->mpc_length) {
9744 + case MP_PROCESSOR:
9746 + struct mpc_config_processor *m =
9747 + (struct mpc_config_processor *)mpt;
9748 + mpt += sizeof(*m);
9749 + count += sizeof(*m);
9754 + struct mpc_config_bus *m =
9755 + (struct mpc_config_bus *)mpt;
9756 + mpt += sizeof(*m);
9757 + count += sizeof(*m);
9762 + mpt += sizeof(struct mpc_config_ioapic);
9763 + count += sizeof(struct mpc_config_ioapic);
9768 +#ifdef CONFIG_X86_IO_APIC
9769 + struct mpc_config_intsrc *m =
9770 + (struct mpc_config_intsrc *)mpt;
9773 - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
9774 + apic_printk(APIC_VERBOSE, "OLD ");
9775 + print_MP_intsrc_info(m);
9776 + i = get_MP_intsrc_index(m);
9778 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9779 + apic_printk(APIC_VERBOSE, "NEW ");
9780 + print_mp_irq_info(&mp_irqs[i]);
9782 + /* legacy, do nothing */
9783 + } else if (nr_m_spare < SPARE_SLOT_NUM) {
9785 + * not found (-1), or duplicated (-2)
9786 + * are invalid entries,
9787 + * we need to use the slot later
9789 + m_spare[nr_m_spare] = m;
9793 - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
9794 -#ifdef CONFIG_X86_32
9795 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
9797 - mp_ioapics[idx].mpc_apicver = 0;
9798 + mpt += sizeof(struct mpc_config_intsrc);
9799 + count += sizeof(struct mpc_config_intsrc);
9804 + struct mpc_config_lintsrc *m =
9805 + (struct mpc_config_lintsrc *)mpt;
9806 + mpt += sizeof(*m);
9807 + count += sizeof(*m);
9811 + /* wrong mptable */
9812 + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
9813 + printk(KERN_ERR "type %x\n", *mpt);
9814 + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
9815 + 1, mpc, mpc->mpc_length, 1);
9820 +#ifdef CONFIG_X86_IO_APIC
9821 + for (i = 0; i < mp_irq_entries; i++) {
9825 + if (mp_irqs[i].mp_irqtype != mp_INT)
9828 + if (mp_irqs[i].mp_irqflag != 0x0f)
9831 + if (nr_m_spare > 0) {
9832 + apic_printk(APIC_VERBOSE, "*NEW* found\n");
9834 + assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
9835 + m_spare[nr_m_spare] = NULL;
9837 + struct mpc_config_intsrc *m =
9838 + (struct mpc_config_intsrc *)mpt;
9839 + count += sizeof(struct mpc_config_intsrc);
9840 + if (!mpc_new_phys) {
9841 + printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
9843 + if (count <= mpc_new_length)
9844 + printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
9846 + printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
9850 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9851 + mpc->mpc_length = count;
9852 + mpt += sizeof(struct mpc_config_intsrc);
9854 + print_mp_irq_info(&mp_irqs[i]);
9858 - * Build basic GSI lookup table to facilitate gsi->io_apic lookups
9859 - * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
9861 - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
9862 - mp_ioapic_routing[idx].gsi_base = gsi_base;
9863 - mp_ioapic_routing[idx].gsi_end = gsi_base +
9864 - io_apic_get_redir_entries(idx);
9866 - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
9867 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
9868 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
9869 - mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
9871 + /* update checksum */
9872 + mpc->mpc_checksum = 0;
9873 + mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
9880 -void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
9882 - struct mpc_config_intsrc intsrc;
9887 - * Convert 'gsi' to 'ioapic.pin'.
9889 - ioapic = mp_find_ioapic(gsi);
9892 - pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9893 +static int __initdata enable_update_mptable;
9896 - * TBD: This check is for faulty timer entries, where the override
9897 - * erroneously sets the trigger to level, resulting in a HUGE
9898 - * increase of timer interrupts!
9900 - if ((bus_irq == 0) && (trigger == 3))
9902 +static int __init update_mptable_setup(char *str)
9904 + enable_update_mptable = 1;
9907 +early_param("update_mptable", update_mptable_setup);
9909 - intsrc.mpc_type = MP_INTSRC;
9910 - intsrc.mpc_irqtype = mp_INT;
9911 - intsrc.mpc_irqflag = (trigger << 2) | polarity;
9912 - intsrc.mpc_srcbus = MP_ISA_BUS;
9913 - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
9914 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
9915 - intsrc.mpc_dstirq = pin; /* INTIN# */
9916 +static unsigned long __initdata mpc_new_phys;
9917 +static unsigned long mpc_new_length __initdata = 4096;
9919 - MP_intsrc_info(&intsrc);
9920 +/* alloc_mptable or alloc_mptable=4k */
9921 +static int __initdata alloc_mptable;
9922 +static int __init parse_alloc_mptable_opt(char *p)
9924 + enable_update_mptable = 1;
9925 + alloc_mptable = 1;
9928 + mpc_new_length = memparse(p, &p);
9931 +early_param("alloc_mptable", parse_alloc_mptable_opt);
9933 -void __init mp_config_acpi_legacy_irqs(void)
9934 +void __init early_reserve_e820_mpc_new(void)
9936 - struct mpc_config_intsrc intsrc;
9940 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9942 - * Fabricate the legacy ISA bus (bus #31).
9944 - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
9945 + if (enable_update_mptable && alloc_mptable) {
9947 +#ifdef CONFIG_X86_TRAMPOLINE
9948 + startt = TRAMPOLINE_BASE;
9950 - set_bit(MP_ISA_BUS, mp_bus_not_pci);
9951 - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
9954 - * Older generations of ES7000 have no legacy identity mappings
9956 - if (es7000_plat == 1)
9960 - * Locate the IOAPIC that manages the ISA IRQs (0-15).
9962 - ioapic = mp_find_ioapic(0);
9966 - intsrc.mpc_type = MP_INTSRC;
9967 - intsrc.mpc_irqflag = 0; /* Conforming */
9968 - intsrc.mpc_srcbus = MP_ISA_BUS;
9969 -#ifdef CONFIG_X86_IO_APIC
9970 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
9973 - * Use the default configuration for the IRQs 0-15. Unless
9974 - * overridden by (MADT) interrupt source override entries.
9976 - for (i = 0; i < 16; i++) {
9979 - for (idx = 0; idx < mp_irq_entries; idx++) {
9980 - struct mpc_config_intsrc *irq = mp_irqs + idx;
9982 - /* Do we already have a mapping for this ISA IRQ? */
9983 - if (irq->mpc_srcbus == MP_ISA_BUS
9984 - && irq->mpc_srcbusirq == i)
9987 - /* Do we already have a mapping for this IOAPIC pin */
9988 - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
9989 - (irq->mpc_dstirq == i))
9993 - if (idx != mp_irq_entries) {
9994 - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
9995 - continue; /* IRQ already used */
9998 - intsrc.mpc_irqtype = mp_INT;
9999 - intsrc.mpc_srcbusirq = i; /* Identity mapped */
10000 - intsrc.mpc_dstirq = i;
10002 - MP_intsrc_info(&intsrc);
10003 + mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
10007 -int mp_register_gsi(u32 gsi, int triggering, int polarity)
10008 +static int __init update_mp_table(void)
10012 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10013 -#define MAX_GSI_NUM 4096
10014 -#define IRQ_COMPRESSION_START 64
10017 + struct intel_mp_floating *mpf;
10018 + struct mp_config_table *mpc;
10019 + struct mp_config_table *mpc_new;
10021 + if (!enable_update_mptable)
10028 - static int pci_irq = IRQ_COMPRESSION_START;
10030 - * Mapping between Global System Interrupts, which
10031 - * represent all possible interrupts, and IRQs
10032 - * assigned to actual devices.
10033 + * Now see if we need to go further.
10035 - static int gsi_to_irq[MAX_GSI_NUM];
10037 + if (mpf->mpf_feature1 != 0)
10040 - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
10043 + if (!mpf->mpf_physptr)
10046 - /* Don't set up the ACPI SCI because it's already set up */
10047 - if (acpi_gbl_FADT.sci_interrupt == gsi)
10049 + mpc = _bus_to_virt(mpf->mpf_physptr);
10051 - ioapic = mp_find_ioapic(gsi);
10052 - if (ioapic < 0) {
10053 - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
10056 + if (!smp_check_mpc(mpc, oem, str))
10059 - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
10060 + printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf));
10061 + printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
10063 -#ifndef CONFIG_X86_32
10064 - if (ioapic_renumber_irq)
10065 - gsi = ioapic_renumber_irq(ioapic, gsi);
10067 + if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
10068 + mpc_new_phys = 0;
10069 + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
10073 + if (!mpc_new_phys) {
10074 + unsigned char old, new;
10075 + /* check if we can change the position */
10076 + mpc->mpc_checksum = 0;
10077 + old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10078 + mpc->mpc_checksum = 0xff;
10079 + new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10080 + if (old == new) {
10081 + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
10084 + printk(KERN_INFO "use in-position replacing\n");
10086 + maddr_t mpc_new_bus;
10089 - * Avoid pin reprogramming. PRTs typically include entries
10090 - * with redundant pin->gsi mappings (but unique PCI devices);
10091 - * we only program the IOAPIC on the first.
10093 - if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
10094 - printk(KERN_ERR "Invalid reference to IOAPIC pin "
10095 - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
10099 - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
10100 - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
10101 - mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
10102 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10103 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
10107 + mpc_new_bus = phys_to_machine(mpc_new_phys);
10108 + mpf->mpf_physptr = mpc_new_bus;
10109 + mpc_new = phys_to_virt(mpc_new_phys);
10110 + memcpy(mpc_new, mpc, mpc->mpc_length);
10112 + /* check if we can modify that */
10113 + if (mpc_new_bus - mpf->mpf_physptr) {
10114 + struct intel_mp_floating *mpf_new;
10115 + /* steal 16 bytes from [0, 1k) */
10116 + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
10117 + mpf_new = isa_bus_to_virt(0x400 - 16);
10118 + memcpy(mpf_new, mpf, 16);
10120 + mpf->mpf_physptr = mpc_new_bus;
10122 + mpf->mpf_checksum = 0;
10123 + mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
10124 + printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
10127 - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
10128 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10130 - * For GSI >= 64, use IRQ compression
10131 + * only replace the one with mp_INT and
10132 + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
10133 + * already in mp_irqs, stored by ... and mp_config_acpi_gsi,
10134 + * may need pci=routeirq for all coverage
10136 - if ((gsi >= IRQ_COMPRESSION_START)
10137 - && (triggering == ACPI_LEVEL_SENSITIVE)) {
10139 - * For PCI devices assign IRQs in order, avoiding gaps
10140 - * due to unused I/O APIC pins.
10143 - if (gsi < MAX_GSI_NUM) {
10145 - * Retain the VIA chipset work-around (gsi > 15), but
10146 - * avoid a problem where the 8254 timer (IRQ0) is setup
10147 - * via an override (so it's not on pin 0 of the ioapic),
10148 - * and at the same time, the pin 0 interrupt is a PCI
10149 - * type. The gsi > 15 test could cause these two pins
10150 - * to be shared as IRQ0, and they are not shareable.
10151 - * So test for this condition, and if necessary, avoid
10152 - * the pin collision.
10156 - * Don't assign IRQ used by ACPI SCI
10158 - if (gsi == acpi_gbl_FADT.sci_interrupt)
10160 - gsi_to_irq[irq] = gsi;
10162 - printk(KERN_ERR "GSI %u is too high\n", gsi);
10167 - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
10168 - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
10169 - polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
10171 + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
10176 -#endif /* CONFIG_X86_IO_APIC */
10177 -#endif /* CONFIG_ACPI */
10178 +late_initcall(update_mp_table);
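
The checksum rewrites above ("zero the field, then store the negated sum of
everything else") rely on the MP spec's rule that all bytes of a table must
sum to 0 mod 256. A standalone check of that identity; the table contents
are invented and byte 7 is arbitrarily chosen as the checksum slot:

	#include <stdio.h>

	static unsigned char mpf_checksum(const unsigned char *p, int len)
	{
		unsigned char sum = 0;

		while (len--)
			sum += *p++;
		return sum;
	}

	int main(void)
	{
		unsigned char table[16] = { '_', 'M', 'P', '_', 1, 2, 3, 4 };

		table[7] = 0;					/* zero the checksum slot */
		table[7] = -mpf_checksum(table, sizeof(table));	/* store the negated sum */
		printf("total mod 256 = %d\n", mpf_checksum(table, sizeof(table)));	/* -> 0 */
		return 0;
	}
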
10179 --- sle11-2009-10-16.orig/arch/x86/kernel/nmi.c 2009-10-28 14:55:02.000000000 +0100
10180 +++ sle11-2009-10-16/arch/x86/kernel/nmi.c 2009-06-04 10:21:39.000000000 +0200
10182 #include <linux/kdebug.h>
10183 #include <linux/smp.h>
10185 +#ifndef CONFIG_XEN
10186 #include <asm/i8259.h>
10188 #include <asm/io_apic.h>
10189 #include <asm/smp.h>
10190 #include <asm/nmi.h>
10191 @@ -179,8 +181,10 @@ int __init check_nmi_watchdog(void)
10192 kfree(prev_nmi_count);
10195 +#ifndef CONFIG_XEN
10196 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
10197 disable_8259A_irq(0);
10199 #ifdef CONFIG_X86_32
10202 --- sle11-2009-10-16.orig/arch/x86/kernel/pci-dma-xen.c 2009-10-22 11:31:59.000000000 +0200
10203 +++ sle11-2009-10-16/arch/x86/kernel/pci-dma-xen.c 2009-06-04 10:21:39.000000000 +0200
10206 #include <asm/proto.h>
10207 #include <asm/dma.h>
10208 -#include <asm/gart.h>
10209 +#include <asm/iommu.h>
10210 #include <asm/calgary.h>
10211 +#include <asm/amd_iommu.h>
10213 -int forbid_dac __read_mostly;
10214 -EXPORT_SYMBOL(forbid_dac);
10215 +static int forbid_dac __read_mostly;
10217 -const struct dma_mapping_ops *dma_ops;
10218 +struct dma_mapping_ops *dma_ops;
10219 EXPORT_SYMBOL(dma_ops);
10221 static int iommu_sac_force __read_mostly;
10222 @@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_si
10223 void __init dma32_reserve_bootmem(void)
10225 unsigned long size, align;
10226 - if (end_pfn <= MAX_DMA32_PFN)
10227 + if (max_pfn <= MAX_DMA32_PFN)
10231 + * check aperture_64.c allocate_aperture() for reason about
10232 + * using 512M as goal
10235 size = round_up(dma32_bootmem_size, align);
10236 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
10237 - __pa(MAX_DMA_ADDRESS));
10239 if (dma32_bootmem_ptr)
10240 dma32_bootmem_size = size;
10242 @@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void)
10244 static void __init dma32_free_bootmem(void)
10248 - if (end_pfn <= MAX_DMA32_PFN)
10249 + if (max_pfn <= MAX_DMA32_PFN)
10252 if (!dma32_bootmem_ptr)
10255 - for_each_online_node(node)
10256 - free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
10257 - dma32_bootmem_size);
10258 + free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
10260 dma32_bootmem_ptr = NULL;
10261 dma32_bootmem_size = 0;
10262 @@ -107,7 +108,7 @@ static void __init dma32_free_bootmem(vo
10263 #define dma32_free_bootmem() ((void)0)
10266 -static const struct dma_mapping_ops swiotlb_dma_ops = {
10267 +static struct dma_mapping_ops swiotlb_dma_ops = {
10268 .mapping_error = swiotlb_dma_mapping_error,
10269 .map_single = swiotlb_map_single_phys,
10270 .unmap_single = swiotlb_unmap_single,
10271 @@ -130,25 +131,31 @@ void __init pci_iommu_alloc(void)
10272 * The order of these functions is important for
10273 * fall-back/fail-over reasons
10275 -#ifdef CONFIG_GART_IOMMU
10276 gart_iommu_hole_init();
10279 -#ifdef CONFIG_CALGARY_IOMMU
10283 detect_intel_iommu();
10285 -#ifdef CONFIG_SWIOTLB
10286 + amd_iommu_detect();
10290 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
10291 dma_ops = &swiotlb_dma_ops;
10296 +#ifndef CONFIG_XEN
10297 +unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
10299 + unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
10301 + return size >> PAGE_SHIFT;
10303 +EXPORT_SYMBOL(iommu_num_pages);
10307 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
10309 @@ -201,9 +208,7 @@ static __init int iommu_setup(char *p)
10313 -#ifdef CONFIG_GART_IOMMU
10314 gart_parse_options(p);
10317 #ifdef CONFIG_CALGARY_IOMMU
10318 if (!strncmp(p, "calgary", 7))
10319 @@ -245,136 +250,19 @@ int range_straddles_page_boundary(paddr_
10320 !check_pages_physically_contiguous(pfn, offset, size));
10323 -#ifdef CONFIG_X86_32
10324 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
10325 - dma_addr_t device_addr, size_t size, int flags)
10327 - void __iomem *mem_base = NULL;
10328 - int pages = size >> PAGE_SHIFT;
10329 - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
10331 - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
10335 - if (dev->dma_mem)
10338 - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
10340 - mem_base = ioremap(bus_addr, size);
10344 - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
10345 - if (!dev->dma_mem)
10347 - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
10348 - if (!dev->dma_mem->bitmap)
10351 - dev->dma_mem->virt_base = mem_base;
10352 - dev->dma_mem->device_base = device_addr;
10353 - dev->dma_mem->size = pages;
10354 - dev->dma_mem->flags = flags;
10356 - if (flags & DMA_MEMORY_MAP)
10357 - return DMA_MEMORY_MAP;
10359 - return DMA_MEMORY_IO;
10362 - kfree(dev->dma_mem);
10365 - iounmap(mem_base);
10368 -EXPORT_SYMBOL(dma_declare_coherent_memory);
10370 -void dma_release_declared_memory(struct device *dev)
10372 - struct dma_coherent_mem *mem = dev->dma_mem;
10376 - dev->dma_mem = NULL;
10377 - iounmap(mem->virt_base);
10378 - kfree(mem->bitmap);
10381 -EXPORT_SYMBOL(dma_release_declared_memory);
10383 -void *dma_mark_declared_memory_occupied(struct device *dev,
10384 - dma_addr_t device_addr, size_t size)
10386 - struct dma_coherent_mem *mem = dev->dma_mem;
10388 - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
10390 - pages >>= PAGE_SHIFT;
10393 - return ERR_PTR(-EINVAL);
10395 - pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
10396 - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
10398 - return ERR_PTR(err);
10399 - return mem->virt_base + (pos << PAGE_SHIFT);
10401 -EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
10403 -static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
10404 - dma_addr_t *dma_handle, void **ret)
10406 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10407 - int order = get_order(size);
10410 - int page = bitmap_find_free_region(mem->bitmap, mem->size,
10413 - *dma_handle = mem->device_base + (page << PAGE_SHIFT);
10414 - *ret = mem->virt_base + (page << PAGE_SHIFT);
10415 - memset(*ret, 0, size);
10417 - if (mem->flags & DMA_MEMORY_EXCLUSIVE)
10420 - return (mem != NULL);
10423 -static int dma_release_coherent(struct device *dev, int order, void *vaddr)
10425 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10427 - if (mem && vaddr >= mem->virt_base && vaddr <
10428 - (mem->virt_base + (mem->size << PAGE_SHIFT))) {
10429 - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
10431 - bitmap_release_region(mem->bitmap, page, order);
10437 -#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
10438 -#define dma_release_coherent(dev, order, vaddr) (0)
10439 -#endif /* CONFIG_X86_32 */
10441 int dma_supported(struct device *dev, u64 mask)
10443 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10446 if (mask > 0xffffffff && forbid_dac > 0) {
10447 - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
10449 + dev_info(dev, "PCI: Disallowing DAC for device\n");
10454 - if (dma_ops->dma_supported)
10455 - return dma_ops->dma_supported(dev, mask);
10456 + if (ops->dma_supported)
10457 + return ops->dma_supported(dev, mask);
10459 /* Copied from i386. Doesn't make much sense, because it will
10460 only work for pci_alloc_coherent.
10461 @@ -395,8 +283,7 @@ int dma_supported(struct device *dev, u6
10462 type. Normally this doesn't make any difference, but gives
10463 more gentle handling of IOMMU overflow. */
10464 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
10465 - printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
10466 - dev->bus_id, mask);
10467 + dev_info(dev, "Force SAC with mask %Lx\n", mask);
10471 @@ -422,6 +309,9 @@ void *
10472 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
10475 +#ifndef CONFIG_XEN
10476 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10478 void *memory = NULL;
10480 unsigned long dma_mask = 0;
10481 @@ -431,7 +321,7 @@ dma_alloc_coherent(struct device *dev, s
10482 /* ignore region specifiers */
10483 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
10485 - if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
10486 + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
10490 @@ -491,8 +381,8 @@ dma_alloc_coherent(struct device *dev, s
10491 /* Let low level make its own zone decisions */
10492 gfp &= ~(GFP_DMA32|GFP_DMA);
10494 - if (dma_ops->alloc_coherent)
10495 - return dma_ops->alloc_coherent(dev, size,
10496 + if (ops->alloc_coherent)
10497 + return ops->alloc_coherent(dev, size,
10501 @@ -504,14 +394,14 @@ dma_alloc_coherent(struct device *dev, s
10505 - if (dma_ops->alloc_coherent) {
10506 + if (ops->alloc_coherent) {
10507 free_pages((unsigned long)memory, order);
10508 gfp &= ~(GFP_DMA|GFP_DMA32);
10509 - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
10510 + return ops->alloc_coherent(dev, size, dma_handle, gfp);
10513 - if (dma_ops->map_simple) {
10514 - *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
10515 + if (ops->map_simple) {
10516 + *dma_handle = ops->map_simple(dev, virt_to_bus(memory),
10518 PCI_DMA_BIDIRECTIONAL);
10519 if (*dma_handle != bad_dma_address)
10520 @@ -542,13 +432,17 @@ EXPORT_SYMBOL(dma_alloc_coherent);
10521 void dma_free_coherent(struct device *dev, size_t size,
10522 void *vaddr, dma_addr_t bus)
10524 +#ifndef CONFIG_XEN
10525 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10528 int order = get_order(size);
10529 WARN_ON(irqs_disabled()); /* for portability */
10530 - if (dma_release_coherent(dev, order, vaddr))
10531 + if (dma_release_from_coherent(dev, order, vaddr))
10534 - if (dma_ops->unmap_single)
10535 - dma_ops->unmap_single(dev, bus, size, 0);
10536 + if (ops->unmap_single)
10537 + ops->unmap_single(dev, bus, size, 0);
10539 xen_destroy_contiguous_region((unsigned long)vaddr, order);
10540 free_pages((unsigned long)vaddr, order);
10541 @@ -557,15 +451,13 @@ EXPORT_SYMBOL(dma_free_coherent);
10543 static int __init pci_iommu_init(void)
10545 -#ifdef CONFIG_CALGARY_IOMMU
10546 calgary_iommu_init();
10549 intel_iommu_init();
10551 -#ifdef CONFIG_GART_IOMMU
10552 + amd_iommu_init();
10559 --- sle11-2009-10-16.orig/arch/x86/kernel/pci-nommu-xen.c 2009-03-16 16:38:05.000000000 +0100
10560 +++ sle11-2009-10-16/arch/x86/kernel/pci-nommu-xen.c 2009-06-04 10:21:39.000000000 +0200
10561 @@ -79,18 +79,12 @@ gnttab_unmap_single(struct device *dev,
10562 gnttab_dma_unmap_page(dma_addr);
10565 -static int nommu_mapping_error(dma_addr_t dma_addr)
10567 - return (dma_addr == bad_dma_address);
10570 -static const struct dma_mapping_ops nommu_dma_ops = {
10571 +static struct dma_mapping_ops nommu_dma_ops = {
10572 .map_single = gnttab_map_single,
10573 .unmap_single = gnttab_unmap_single,
10574 .map_sg = gnttab_map_sg,
10575 .unmap_sg = gnttab_unmap_sg,
10576 .dma_supported = swiotlb_dma_supported,
10577 - .mapping_error = nommu_mapping_error
10580 void __init no_iommu_init(void)
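[Note: dropping nommu_mapping_error above matches the 2.6.27 rework of dma_mapping_error(), which now takes the device and, when an ops structure supplies no hook, falls back to the bad_dma_address comparison the deleted helper performed. Roughly (a sketch of the 2.6.27 x86 inline, not code in this patch):

	static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
	{
		struct dma_mapping_ops *ops = get_dma_ops(dev);

		if (ops->mapping_error)
			return ops->mapping_error(dev, dma_addr);

		return (dma_addr == bad_dma_address);
	}]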
10581 --- sle11-2009-10-16.orig/arch/x86/kernel/probe_roms_32.c 2009-10-28 14:55:02.000000000 +0100
10582 +++ sle11-2009-10-16/arch/x86/kernel/probe_roms_32.c 2009-06-04 10:21:39.000000000 +0200
10583 @@ -99,6 +99,11 @@ void __init probe_roms(void)
10588 + if (!is_initial_xendomain())
10593 upper = adapter_rom_resources[0].start;
10594 for (start = video_rom_resource.start; start < upper; start += 2048) {
10595 @@ -131,7 +136,7 @@ void __init probe_roms(void)
10596 upper = system_rom_resource.start;
10598 /* check for extension rom (ignore length byte!) */
10599 - rom = isa_bus_to_virt(extension_rom_resource.start);
10600 + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
10601 if (romsignature(rom)) {
10602 length = extension_rom_resource.end - extension_rom_resource.start + 1;
10603 if (romchecksum(rom, length)) {
10604 --- sle11-2009-10-16.orig/arch/x86/kernel/process-xen.c 2009-03-16 16:38:05.000000000 +0100
10605 +++ sle11-2009-10-16/arch/x86/kernel/process-xen.c 2009-06-04 10:21:39.000000000 +0200
10607 #include <linux/sched.h>
10608 #include <linux/module.h>
10609 #include <linux/pm.h>
10610 +#include <linux/clockchips.h>
10611 +#include <asm/system.h>
10613 +unsigned long idle_halt;
10614 +EXPORT_SYMBOL(idle_halt);
10615 +unsigned long idle_nomwait;
10616 +EXPORT_SYMBOL(idle_nomwait);
10618 struct kmem_cache *task_xstate_cachep;
10620 @@ -45,6 +52,70 @@ void arch_task_cache_init(void)
10625 + * Idle related variables and functions
10627 +unsigned long boot_option_idle_override = 0;
10628 +EXPORT_SYMBOL(boot_option_idle_override);
10631 + * Power-management idle function, if any.
10633 +void (*pm_idle)(void);
10634 +EXPORT_SYMBOL(pm_idle);
10636 +#ifdef CONFIG_X86_32
10638 + * This halt magic was a workaround for ancient floppy DMA
10639 + * wreckage. It should be safe to remove.
10641 +static int hlt_counter;
10642 +void disable_hlt(void)
10646 +EXPORT_SYMBOL(disable_hlt);
10648 +void enable_hlt(void)
10652 +EXPORT_SYMBOL(enable_hlt);
10654 +static inline int hlt_use_halt(void)
10656 + return (!hlt_counter && boot_cpu_data.hlt_works_ok);
10659 +static inline int hlt_use_halt(void)
10666 + * We use this if we don't have any better idle routine.
10669 +void xen_idle(void)
10671 + current_thread_info()->status &= ~TS_POLLING;
10673 + * TS_POLLING-cleared state must be visible before we
10674 + * test NEED_RESCHED:
10678 + if (!need_resched())
10679 + safe_halt(); /* enables interrupts racelessly */
10681 + local_irq_enable();
10682 + current_thread_info()->status |= TS_POLLING;
10684 +#ifdef CONFIG_APM_MODULE
10685 +EXPORT_SYMBOL(default_idle);
10688 static void do_nothing(void *unused)
10691 @@ -61,7 +132,7 @@ void cpu_idle_wait(void)
10694 /* kick all the CPUs so that they exit out of pm_idle */
10695 - smp_call_function(do_nothing, NULL, 0, 1);
10696 + smp_call_function(do_nothing, NULL, 1);
10698 EXPORT_SYMBOL_GPL(cpu_idle_wait);
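[Note: the extra argument disappears because 2.6.27 dropped the nonatomic/retry parameter from the cross-CPU call API; the prototype is now:

	int smp_call_function(void (*func)(void *info), void *info, int wait);]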
10700 @@ -125,60 +196,175 @@ static void poll_idle(void)
10702 * idle=mwait overrides this decision and forces the usage of mwait.
10704 +static int __cpuinitdata force_mwait;
10706 +#define MWAIT_INFO 0x05
10707 +#define MWAIT_ECX_EXTENDED_INFO 0x01
10708 +#define MWAIT_EDX_C1 0xf0
10710 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
10712 + u32 eax, ebx, ecx, edx;
10717 - if (c->x86_vendor == X86_VENDOR_AMD) {
10722 + if (c->cpuid_level < MWAIT_INFO)
10725 + cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
10726 + /* Check whether EDX has extended info about MWAIT */
10727 + if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
10731 + * edx enumerates MONITOR/MWAIT extensions. Check whether
10732 + * C1 supports MWAIT
10734 + return (edx & MWAIT_EDX_C1);
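[Note: the same leaf-5 check can be reproduced from user space with GCC's <cpuid.h>; ECX bit 0 gates the EDX enumeration, and EDX bits 7:4 count the C1 MWAIT sub-states, matching the 0xf0 mask above. A minimal sketch:

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (!__get_cpuid(5, &eax, &ebx, &ecx, &edx))
			return 1;		/* CPUID leaf 5 not supported */
		if (!(ecx & 1)) {		/* no extended MWAIT enumeration */
			puts("no extended MWAIT info");
			return 1;
		}
		printf("C1 MWAIT sub-states: %u\n", (edx >> 4) & 0xf);
		return 0;
	}]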
10738 + * Check for AMD CPUs, which potentially have C1E support
10740 +static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
10742 + if (c->x86_vendor != X86_VENDOR_AMD)
10745 + if (c->x86 < 0x0F)
10748 + /* Family 0x0f models < rev F do not have C1E */
10749 + if (c->x86 == 0x0f && c->x86_model < 0x40)
10755 +static cpumask_t c1e_mask = CPU_MASK_NONE;
10756 +static int c1e_detected;
10758 +void c1e_remove_cpu(int cpu)
10760 + cpu_clear(cpu, c1e_mask);
10764 + * C1E aware idle routine. We check for C1E active in the interrupt
10765 + * pending message MSR. If we detect C1E, then we handle it the same
10766 + * way as C3 power states (local apic timer and TSC stop)
10768 +static void c1e_idle(void)
10770 + if (need_resched())
10773 + if (!c1e_detected) {
10776 + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
10777 + if (lo & K8_INTP_C1E_ACTIVE_MASK) {
10778 + c1e_detected = 1;
10779 + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
10780 + mark_tsc_unstable("TSC halt in AMD C1E");
10781 + printk(KERN_INFO "System has AMD C1E enabled\n");
10782 + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
10787 + if (c1e_detected) {
10788 + int cpu = smp_processor_id();
10790 + if (!cpu_isset(cpu, c1e_mask)) {
10791 + cpu_set(cpu, c1e_mask);
10793 + * Force broadcast so ACPI cannot interfere. Needs
10794 + * to run with interrupts enabled as it uses
10795 + * smp_call_function.
10797 + local_irq_enable();
10798 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
10800 + printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
10802 + local_irq_disable();
10804 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
10809 + * The switch back from broadcast mode needs to be
10810 + * called with interrupts disabled.
10812 + local_irq_disable();
10813 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
10814 + local_irq_enable();
10820 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
10823 - static int selected;
10827 #ifdef CONFIG_X86_SMP
10828 if (pm_idle == poll_idle && smp_num_siblings > 1) {
10829 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
10830 " performance may degrade.\n");
10836 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
10838 - * Skip, if setup has overridden idle.
10839 * One CPU supports mwait => All CPUs support mwait
10842 - printk(KERN_INFO "using mwait in idle threads.\n");
10843 - pm_idle = mwait_idle;
10847 + printk(KERN_INFO "using mwait in idle threads.\n");
10848 + pm_idle = mwait_idle;
10849 + } else if (check_c1e_idle(c)) {
10850 + printk(KERN_INFO "using C1E aware idle routine\n");
10851 + pm_idle = c1e_idle;
10853 + pm_idle = default_idle;
10857 static int __init idle_setup(char *str)
10862 if (!strcmp(str, "poll")) {
10863 printk("using polling idle threads.\n");
10864 pm_idle = poll_idle;
10867 - else if (!strcmp(str, "mwait"))
10868 + } else if (!strcmp(str, "mwait"))
10870 + else if (!strcmp(str, "halt")) {
10872 + * When idle=halt is given on the command line, halt is
10873 + * forced as the CPU idle routine, so the C2/C3 power
10874 + * states won't be used again.
10875 + * Leave boot_option_idle_override untouched so the CPU
10876 + * idle driver can still be loaded.
10878 + pm_idle = default_idle;
10881 + } else if (!strcmp(str, "nomwait")) {
10883 + * When idle=nomwait is given, mwait is disabled for the
10884 + * CPU C2/C3 states; boot_option_idle_override is left
10885 + * untouched.
10888 + idle_nomwait = 1;
10895 boot_option_idle_override = 1;
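[Note: taken together, idle_setup() now understands four boot parameters, for example:

	idle=poll	# busy-wait; lowest wakeup latency, burns power
	idle=mwait	# force MWAIT in the idle loop
	idle=halt	# force HLT; C2/C3 states won't be entered
	idle=nomwait	# keep the default routine, but never use MWAIT for C2/C3]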
10896 --- sle11-2009-10-16.orig/arch/x86/kernel/process_32-xen.c 2009-03-16 16:38:05.000000000 +0100
10897 +++ sle11-2009-10-16/arch/x86/kernel/process_32-xen.c 2009-06-04 10:21:39.000000000 +0200
10898 @@ -59,15 +59,11 @@
10899 #include <asm/tlbflush.h>
10900 #include <asm/cpu.h>
10901 #include <asm/kdebug.h>
10902 +#include <asm/idle.h>
10904 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
10905 asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
10907 -static int hlt_counter;
10909 -unsigned long boot_option_idle_override = 0;
10910 -EXPORT_SYMBOL(boot_option_idle_override);
10912 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
10913 EXPORT_PER_CPU_SYMBOL(current_task);
10915 @@ -82,46 +78,27 @@ unsigned long thread_saved_pc(struct tas
10916 return ((unsigned long *)tsk->thread.sp)[3];
10920 - * Powermanagement idle function, if any..
10922 -void (*pm_idle)(void);
10923 -EXPORT_SYMBOL(pm_idle);
10924 +#ifdef CONFIG_HOTPLUG_CPU
10925 +#ifndef CONFIG_XEN
10926 +#include <asm/nmi.h>
10928 -void disable_hlt(void)
10929 +static void cpu_exit_clear(void)
10933 + int cpu = raw_smp_processor_id();
10935 -EXPORT_SYMBOL(disable_hlt);
10937 -void enable_hlt(void)
10941 + idle_task_exit();
10943 -EXPORT_SYMBOL(enable_hlt);
10945 + irq_ctx_exit(cpu);
10947 -static void xen_idle(void)
10949 - current_thread_info()->status &= ~TS_POLLING;
10951 - * TS_POLLING-cleared state must be visible before we
10952 - * test NEED_RESCHED:
10955 + cpu_clear(cpu, cpu_callout_map);
10956 + cpu_clear(cpu, cpu_callin_map);
10958 - if (!need_resched())
10959 - safe_halt(); /* enables interrupts racelessly */
10961 - local_irq_enable();
10962 - current_thread_info()->status |= TS_POLLING;
10963 + numa_remove_cpu(cpu);
10964 + c1e_remove_cpu(cpu);
10966 -#ifdef CONFIG_APM_MODULE
10967 -EXPORT_SYMBOL(default_idle);
10970 -#ifdef CONFIG_HOTPLUG_CPU
10971 static inline void play_dead(void)
10974 @@ -152,13 +129,11 @@ void cpu_idle(void)
10976 /* endless idle loop with no priority at all */
10978 - tick_nohz_stop_sched_tick();
10979 + tick_nohz_stop_sched_tick(1);
10980 while (!need_resched()) {
10981 - void (*idle)(void);
10985 - idle = xen_idle; /* no alternatives */
10987 if (rcu_pending(cpu))
10988 rcu_check_callbacks(cpu, 0);
10989 @@ -168,7 +143,10 @@ void cpu_idle(void)
10991 local_irq_disable();
10992 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
10994 + /* Don't trace irqs off for idle */
10995 + stop_critical_timings();
10997 + start_critical_timings();
10999 tick_nohz_restart_sched_tick();
11000 preempt_enable_no_resched();
11001 --- sle11-2009-10-16.orig/arch/x86/kernel/process_64-xen.c 2009-03-16 16:38:05.000000000 +0100
11002 +++ sle11-2009-10-16/arch/x86/kernel/process_64-xen.c 2009-06-04 10:21:39.000000000 +0200
11003 @@ -65,15 +65,6 @@ asmlinkage extern void ret_from_fork(voi
11005 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
11007 -unsigned long boot_option_idle_override = 0;
11008 -EXPORT_SYMBOL(boot_option_idle_override);
11011 - * Powermanagement idle function, if any..
11013 -void (*pm_idle)(void);
11014 -EXPORT_SYMBOL(pm_idle);
11016 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
11018 void idle_notifier_register(struct notifier_block *n)
11019 @@ -103,25 +94,13 @@ void exit_idle(void)
11023 -static void xen_idle(void)
11025 - current_thread_info()->status &= ~TS_POLLING;
11027 - * TS_POLLING-cleared state must be visible before we
11028 - * test NEED_RESCHED:
11031 - if (!need_resched())
11032 - safe_halt(); /* enables interrupts racelessly */
11034 - local_irq_enable();
11035 - current_thread_info()->status |= TS_POLLING;
11038 #ifdef CONFIG_HOTPLUG_CPU
11039 static inline void play_dead(void)
11042 +#ifndef CONFIG_XEN
11043 + c1e_remove_cpu(raw_smp_processor_id());
11045 local_irq_disable();
11046 cpu_clear(smp_processor_id(), cpu_initialized);
11047 preempt_enable_no_resched();
11048 @@ -146,12 +125,11 @@ void cpu_idle(void)
11049 current_thread_info()->status |= TS_POLLING;
11050 /* endless idle loop with no priority at all */
11052 - tick_nohz_stop_sched_tick();
11053 + tick_nohz_stop_sched_tick(1);
11054 while (!need_resched()) {
11055 - void (*idle)(void);
11058 - idle = xen_idle; /* no alternatives */
11060 if (cpu_is_offline(smp_processor_id()))
11063 @@ -161,7 +139,10 @@ void cpu_idle(void)
11065 local_irq_disable();
11068 + /* Don't trace irqs off for idle */
11069 + stop_critical_timings();
11071 + start_critical_timings();
11072 /* In many cases the interrupt that ended idle
11073 has already called exit_idle. But some idle
11074 loops can be woken up without interrupt. */
11075 @@ -271,7 +252,7 @@ void exit_thread(void)
11079 -void load_gs_index(unsigned gs)
11080 +void xen_load_gs_index(unsigned gs)
11082 WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
11084 @@ -373,10 +354,10 @@ int copy_thread(int nr, unsigned long cl
11085 p->thread.fs = me->thread.fs;
11086 p->thread.gs = me->thread.gs;
11088 - asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
11089 - asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
11090 - asm("mov %%es,%0" : "=m" (p->thread.es));
11091 - asm("mov %%ds,%0" : "=m" (p->thread.ds));
11092 + savesegment(gs, p->thread.gsindex);
11093 + savesegment(fs, p->thread.fsindex);
11094 + savesegment(es, p->thread.es);
11095 + savesegment(ds, p->thread.ds);
11097 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
11098 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
11099 @@ -417,7 +398,9 @@ out:
11101 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
11103 - asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
11104 + loadsegment(fs, 0);
11105 + loadsegment(es, 0);
11106 + loadsegment(ds, 0);
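[Note: the savesegment()/loadsegment() conversions here and below replace open-coded inline asm with the macros from include/asm-x86/system.h. A simplified sketch of what they expand to; the in-tree loadsegment() additionally carries a .fixup entry that reloads the selector with 0 if the move faults:

	#define savesegment(seg, value) \
		asm("mov %%" #seg ",%0" : "=r" (value) : : "memory")

	/* simplified; the real macro wraps this in an exception fixup */
	#define loadsegment(seg, value) \
		asm volatile("movl %k0,%%" #seg : : "r" (value))]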
11110 @@ -557,8 +540,8 @@ static inline void __switch_to_xtra(stru
11111 struct task_struct *
11112 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
11114 - struct thread_struct *prev = &prev_p->thread,
11115 - *next = &next_p->thread;
11116 + struct thread_struct *prev = &prev_p->thread;
11117 + struct thread_struct *next = &next_p->thread;
11118 int cpu = smp_processor_id();
11119 #ifndef CONFIG_X86_NO_TSS
11120 struct tss_struct *tss = &per_cpu(init_tss, cpu);
11121 @@ -661,12 +644,25 @@ __switch_to(struct task_struct *prev_p,
11123 if (unlikely(next->es))
11124 loadsegment(es, next->es);
11127 if (unlikely(next->ds))
11128 loadsegment(ds, next->ds);
11130 + /*
11131 + * Leave lazy mode, flushing any hypercalls made here.
11132 + * This must be done before restoring TLS segments so
11133 + * the GDT and LDT are properly updated, and must be
11134 + * done before math_state_restore, so the TS bit is up
11135 + * to date.
11136 + */
11137 + arch_leave_lazy_cpu_mode();
11140 * Switch FS and GS.
11142 + * Segment register != 0 always requires a reload. Also
11143 + * reload when it has changed. When prev process used 64bit
11144 + * base always reload to avoid an information leak.
11146 if (unlikely(next->fsindex))
11147 loadsegment(fs, next->fsindex);
11148 @@ -687,7 +683,8 @@ __switch_to(struct task_struct *prev_p,
11149 write_pda(oldrsp, next->usersp);
11150 write_pda(pcurrent, next_p);
11151 write_pda(kernelstack,
11152 - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
11153 + (unsigned long)task_stack_page(next_p) +
11154 + THREAD_SIZE - PDA_STACKOFFSET);
11155 #ifdef CONFIG_CC_STACKPROTECTOR
11156 write_pda(stack_canary, next_p->stack_canary);
11158 @@ -848,7 +845,7 @@ long do_arch_prctl(struct task_struct *t
11159 set_32bit_tls(task, FS_TLS, addr);
11161 load_TLS(&task->thread, cpu);
11162 - asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
11163 + loadsegment(fs, FS_TLS_SEL);
11165 task->thread.fsindex = FS_TLS_SEL;
11166 task->thread.fs = 0;
11167 @@ -858,7 +855,7 @@ long do_arch_prctl(struct task_struct *t
11169 /* set the selector to 0 to not confuse
11171 - asm volatile("movl %0,%%fs" :: "r" (0));
11172 + loadsegment(fs, 0);
11173 ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
11176 @@ -882,7 +879,7 @@ long do_arch_prctl(struct task_struct *t
11177 if (task->thread.gsindex == GS_TLS_SEL)
11178 base = read_32bit_tls(task, GS_TLS);
11180 - asm("movl %%gs,%0" : "=r" (gsindex));
11181 + savesegment(gs, gsindex);
11183 rdmsrl(MSR_KERNEL_GS_BASE, base);
11185 --- sle11-2009-10-16.orig/arch/x86/kernel/quirks-xen.c 2009-03-16 16:33:40.000000000 +0100
11186 +++ sle11-2009-10-16/arch/x86/kernel/quirks-xen.c 2009-06-04 10:21:39.000000000 +0200
11187 @@ -63,6 +63,7 @@ static enum {
11188 ICH_FORCE_HPET_RESUME,
11189 VT8237_FORCE_HPET_RESUME,
11190 NVIDIA_FORCE_HPET_RESUME,
11191 + ATI_FORCE_HPET_RESUME,
11192 } force_hpet_resume_type;
11194 static void __iomem *rcba_base;
11195 @@ -156,6 +157,8 @@ static void ich_force_enable_hpet(struct
11197 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
11198 ich_force_enable_hpet);
11199 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
11200 + ich_force_enable_hpet);
11201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
11202 ich_force_enable_hpet);
11203 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
11204 @@ -172,6 +175,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
11206 static struct pci_dev *cached_dev;
11208 +static void hpet_print_force_info(void)
11210 + printk(KERN_INFO "HPET not enabled in BIOS. "
11211 + "You might try hpet=force boot option\n");
11214 static void old_ich_force_hpet_resume(void)
11217 @@ -251,8 +260,12 @@ static void old_ich_force_enable_hpet_us
11219 if (hpet_force_user)
11220 old_ich_force_enable_hpet(dev);
11222 + hpet_print_force_info();
11225 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
11226 + old_ich_force_enable_hpet_user);
11227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
11228 old_ich_force_enable_hpet_user);
11229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
11230 @@ -288,9 +301,14 @@ static void vt8237_force_enable_hpet(str
11232 u32 uninitialized_var(val);
11234 - if (!hpet_force_user || hpet_address || force_hpet_address)
11235 + if (hpet_address || force_hpet_address)
11238 + if (!hpet_force_user) {
11239 + hpet_print_force_info();
11243 pci_read_config_dword(dev, 0x68, &val);
11245 * Bit 7 is HPET enable bit.
11246 @@ -328,6 +346,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_V
11247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
11248 vt8237_force_enable_hpet);
11250 +static void ati_force_hpet_resume(void)
11252 + pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
11253 + printk(KERN_DEBUG "Force enabled HPET at resume\n");
11256 +static void ati_force_enable_hpet(struct pci_dev *dev)
11258 + u32 uninitialized_var(val);
11260 + if (hpet_address || force_hpet_address)
11263 + if (!hpet_force_user) {
11264 + hpet_print_force_info();
11268 + pci_write_config_dword(dev, 0x14, 0xfed00000);
11269 + pci_read_config_dword(dev, 0x14, &val);
11270 + force_hpet_address = val;
11271 + force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
11272 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
11273 + force_hpet_address);
11274 + cached_dev = dev;
11277 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
11278 + ati_force_enable_hpet);
11281 * Undocumented chipset feature taken from LinuxBIOS.
11283 @@ -341,8 +389,13 @@ static void nvidia_force_enable_hpet(str
11285 u32 uninitialized_var(val);
11287 - if (!hpet_force_user || hpet_address || force_hpet_address)
11288 + if (hpet_address || force_hpet_address)
11291 + if (!hpet_force_user) {
11292 + hpet_print_force_info();
11296 pci_write_config_dword(dev, 0x44, 0xfed00001);
11297 pci_read_config_dword(dev, 0x44, &val);
11298 @@ -395,6 +448,9 @@ void force_hpet_resume(void)
11299 case NVIDIA_FORCE_HPET_RESUME:
11300 nvidia_force_hpet_resume();
11302 + case ATI_FORCE_HPET_RESUME:
11303 + ati_force_hpet_resume();
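[Note: all of these quirks remain gated on hpet_force_user, i.e. they only poke the chipset when the user explicitly boots with the force option, e.g.

	linux ... hpet=force

otherwise the new hpet_print_force_info() merely prints the hint.]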
11308 --- sle11-2009-10-16.orig/arch/x86/kernel/setup-xen.c 2009-03-16 16:38:05.000000000 +0100
11309 +++ sle11-2009-10-16/arch/x86/kernel/setup-xen.c 2009-06-04 10:21:39.000000000 +0200
11310 @@ -1,141 +1,1132 @@
11311 -#include <linux/kernel.h>
11313 + * Copyright (C) 1995 Linus Torvalds
11315 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11317 + * Memory region support
11318 + * David Parsons <orc@pell.chi.il.us>, July-August 1999
11320 + * Added E820 sanitization routine (removes overlapping memory regions);
11321 + * Brian Moyle <bmoyle@mvista.com>, February 2001
11323 + * Moved CPU detection code to cpu/${cpu}.c
11324 + * Patrick Mochel <mochel@osdl.org>, March 2002
11326 + * Provisions for empty E820 memory regions (reported by certain BIOSes).
11327 + * Alex Achenbach <xela@slit.de>, December 2002.
11332 + * This file handles the architecture-dependent parts of initialization
11335 +#include <linux/sched.h>
11336 +#include <linux/mm.h>
11337 +#include <linux/mmzone.h>
11338 +#include <linux/screen_info.h>
11339 +#include <linux/ioport.h>
11340 +#include <linux/acpi.h>
11341 +#include <linux/apm_bios.h>
11342 +#include <linux/initrd.h>
11343 +#include <linux/bootmem.h>
11344 +#include <linux/seq_file.h>
11345 +#include <linux/console.h>
11346 +#include <linux/mca.h>
11347 +#include <linux/root_dev.h>
11348 +#include <linux/highmem.h>
11349 #include <linux/module.h>
11350 +#include <linux/efi.h>
11351 #include <linux/init.h>
11352 -#include <linux/bootmem.h>
11353 +#include <linux/edd.h>
11354 +#include <linux/iscsi_ibft.h>
11355 +#include <linux/nodemask.h>
11356 +#include <linux/kexec.h>
11357 +#include <linux/dmi.h>
11358 +#include <linux/pfn.h>
11359 +#include <linux/pci.h>
11360 +#include <asm/pci-direct.h>
11361 +#include <linux/init_ohci1394_dma.h>
11362 +#include <linux/kvm_para.h>
11364 +#include <linux/errno.h>
11365 +#include <linux/kernel.h>
11366 +#include <linux/stddef.h>
11367 +#include <linux/unistd.h>
11368 +#include <linux/ptrace.h>
11369 +#include <linux/slab.h>
11370 +#include <linux/user.h>
11371 +#include <linux/delay.h>
11373 +#include <linux/kallsyms.h>
11374 +#include <linux/cpufreq.h>
11375 +#include <linux/dma-mapping.h>
11376 +#include <linux/ctype.h>
11377 +#include <linux/uaccess.h>
11379 #include <linux/percpu.h>
11380 -#include <asm/smp.h>
11381 -#include <asm/percpu.h>
11382 +#include <linux/crash_dump.h>
11384 +#include <video/edid.h>
11386 +#include <asm/mtrr.h>
11387 +#include <asm/apic.h>
11388 +#include <asm/e820.h>
11389 +#include <asm/mpspec.h>
11390 +#include <asm/setup.h>
11391 +#include <asm/arch_hooks.h>
11392 +#include <asm/efi.h>
11393 #include <asm/sections.h>
11394 +#include <asm/dmi.h>
11395 +#include <asm/io_apic.h>
11396 +#include <asm/ist.h>
11397 +#include <asm/vmi.h>
11398 +#include <setup_arch.h>
11399 +#include <asm/bios_ebda.h>
11400 +#include <asm/cacheflush.h>
11401 #include <asm/processor.h>
11402 -#include <asm/setup.h>
11403 +#include <asm/bugs.h>
11405 +#include <asm/system.h>
11406 +#include <asm/vsyscall.h>
11407 +#include <asm/smp.h>
11408 +#include <asm/desc.h>
11409 +#include <asm/dma.h>
11410 +#include <asm/iommu.h>
11411 +#include <asm/mmu_context.h>
11412 +#include <asm/proto.h>
11414 +#include <mach_apic.h>
11415 +#include <asm/paravirt.h>
11417 +#include <asm/percpu.h>
11418 #include <asm/topology.h>
11419 -#include <asm/mpspec.h>
11420 #include <asm/apicdef.h>
11421 +#ifdef CONFIG_X86_64
11422 +#include <asm/numa_64.h>
11426 +#include <asm/hypervisor.h>
11427 +#include <xen/interface/kexec.h>
11428 +#include <xen/interface/memory.h>
11429 +#include <xen/interface/nmi.h>
11430 +#include <xen/interface/physdev.h>
11431 +#include <xen/features.h>
11432 +#include <xen/firmware.h>
11433 +#include <xen/xencons.h>
11435 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
11436 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
11438 -#ifdef CONFIG_X86_LOCAL_APIC
11439 -unsigned int num_processors;
11440 -unsigned disabled_cpus __cpuinitdata;
11441 -/* Processor that is doing the boot up */
11442 -unsigned int boot_cpu_physical_apicid = -1U;
11443 -EXPORT_SYMBOL(boot_cpu_physical_apicid);
11444 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
11445 +static struct notifier_block xen_panic_block = {
11446 + xen_panic_event, NULL, 0 /* try to go last */
11449 -DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
11450 -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
11451 +unsigned long *phys_to_machine_mapping;
11452 +EXPORT_SYMBOL(phys_to_machine_mapping);
11454 -/* Bitmask of physically existing CPUs */
11455 -physid_mask_t phys_cpu_present_map;
11456 +unsigned long *pfn_to_mfn_frame_list_list,
11457 +#ifdef CONFIG_X86_64
11458 + *pfn_to_mfn_frame_list[512];
11460 + *pfn_to_mfn_frame_list[128];
11463 +/* Raw start-of-day parameters from the hypervisor. */
11464 +start_info_t *xen_start_info;
11465 +EXPORT_SYMBOL(xen_start_info);
11468 +#ifndef ARCH_SETUP
11469 +#define ARCH_SETUP
11472 +#ifndef CONFIG_XEN
11473 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
11474 +struct boot_params __initdata boot_params;
11476 +struct boot_params boot_params;
11480 -#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
11482 - * Copy data used in early init routines from the initial arrays to the
11483 - * per cpu data areas. These arrays then become expendable and the
11484 - * *_early_ptr's are zeroed indicating that the static arrays are gone.
11485 + * Machine setup..
11487 -static void __init setup_per_cpu_maps(void)
11488 +static struct resource data_resource = {
11489 + .name = "Kernel data",
11492 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11495 +static struct resource code_resource = {
11496 + .name = "Kernel code",
11499 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11502 +static struct resource bss_resource = {
11503 + .name = "Kernel bss",
11506 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11510 +#ifdef CONFIG_X86_32
11511 +#ifndef CONFIG_XEN
11512 +/* This value is set up by the early boot code to point to the value
11513 + immediately after the boot time page tables. It contains a *physical*
11514 + address, and must not be in the .bss segment! */
11515 +unsigned long init_pg_tables_start __initdata = ~0UL;
11516 +unsigned long init_pg_tables_end __initdata = ~0UL;
11519 +static struct resource video_ram_resource = {
11520 + .name = "Video RAM area",
11521 + .start = 0xa0000,
11523 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11526 +/* cpu data as detected by the assembly code in head.S */
11527 +struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11528 +/* common cpu data for all cpus */
11529 +struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11530 +EXPORT_SYMBOL(boot_cpu_data);
11531 +#ifndef CONFIG_XEN
11532 +static void set_mca_bus(int x)
11539 +unsigned int def_to_bigsmp;
11541 +/* for MCA, but anyone else can use it if they want */
11542 +unsigned int machine_id;
11543 +unsigned int machine_submodel_id;
11544 +unsigned int BIOS_revision;
11546 +struct apm_info apm_info;
11547 +EXPORT_SYMBOL(apm_info);
11550 +#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
11551 +struct ist_info ist_info;
11552 +EXPORT_SYMBOL(ist_info);
11553 +#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
11554 +struct ist_info ist_info;
11558 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
11559 +EXPORT_SYMBOL(boot_cpu_data);
11563 +#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
11564 +unsigned long mmu_cr4_features;
11566 +unsigned long mmu_cr4_features = X86_CR4_PAE;
11569 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
11570 +int bootloader_type;
11573 + * Early DMI memory
11575 +int dmi_alloc_index;
11576 +char dmi_alloc_data[DMI_MAX_DATA];
11581 +struct screen_info screen_info;
11582 +EXPORT_SYMBOL(screen_info);
11583 +struct edid_info edid_info;
11584 +EXPORT_SYMBOL_GPL(edid_info);
11586 +extern int root_mountflags;
11588 +unsigned long saved_video_mode;
11590 +#define RAMDISK_IMAGE_START_MASK 0x07FF
11591 +#define RAMDISK_PROMPT_FLAG 0x8000
11592 +#define RAMDISK_LOAD_FLAG 0x4000
11594 +static char __initdata command_line[COMMAND_LINE_SIZE];
11596 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
11598 +#ifdef CONFIG_EDD_MODULE
11599 +EXPORT_SYMBOL(edd);
11601 +#ifndef CONFIG_XEN
11603 + * copy_edd() - Copy the BIOS EDD information
11604 + * from boot_params into a safe place.
11607 +static inline void copy_edd(void)
11609 + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
11610 + sizeof(edd.mbr_signature));
11611 + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
11612 + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
11613 + edd.edd_info_nr = boot_params.eddbuf_entries;
11617 +static inline void copy_edd(void)
11622 +#ifdef CONFIG_BLK_DEV_INITRD
11624 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11626 +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
11627 +static void __init relocate_initrd(void)
11630 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11631 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11632 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11633 + u64 ramdisk_here;
11634 + unsigned long slop, clen, mapaddr;
11637 + /* We need to move the initrd down into lowmem */
11638 + ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
11641 + if (ramdisk_here == -1ULL)
11642 + panic("Cannot find place for new RAMDISK of size %lld\n",
11645 + /* Note: this includes all the lowmem currently occupied by
11646 + the initrd, we rely on that fact to keep the data intact. */
11647 + reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
11649 + initrd_start = ramdisk_here + PAGE_OFFSET;
11650 + initrd_end = initrd_start + ramdisk_size;
11651 + printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
11652 + ramdisk_here, ramdisk_here + ramdisk_size);
11654 + q = (char *)initrd_start;
11656 + /* Copy any lowmem portion of the initrd */
11657 + if (ramdisk_image < end_of_lowmem) {
11658 + clen = end_of_lowmem - ramdisk_image;
11659 + p = (char *)__va(ramdisk_image);
11660 + memcpy(q, p, clen);
11662 + ramdisk_image += clen;
11663 + ramdisk_size -= clen;
11666 + /* Copy the highmem portion of the initrd */
11667 + while (ramdisk_size) {
11668 + slop = ramdisk_image & ~PAGE_MASK;
11669 + clen = ramdisk_size;
11670 + if (clen > MAX_MAP_CHUNK-slop)
11671 + clen = MAX_MAP_CHUNK-slop;
11672 + mapaddr = ramdisk_image & PAGE_MASK;
11673 + p = early_ioremap(mapaddr, clen+slop);
11674 + memcpy(q, p+slop, clen);
11675 + early_iounmap(p, clen+slop);
11677 + ramdisk_image += clen;
11678 + ramdisk_size -= clen;
11680 + /* high pages are not converted by early_res_to_bootmem */
11681 + ramdisk_image = boot_params.hdr.ramdisk_image;
11682 + ramdisk_size = boot_params.hdr.ramdisk_size;
11683 + printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
11684 + " %08llx - %08llx\n",
11685 + ramdisk_image, ramdisk_image + ramdisk_size - 1,
11686 + ramdisk_here, ramdisk_here + ramdisk_size - 1);
11690 +static void __init reserve_initrd(void)
11694 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11695 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11696 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
11697 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11699 + if (!boot_params.hdr.type_of_loader ||
11700 + !ramdisk_image || !ramdisk_size)
11701 + return; /* No initrd provided by bootloader */
11703 + unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
11704 + unsigned long ramdisk_size = xen_start_info->mod_len;
11705 + unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
11706 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11708 - for_each_possible_cpu(cpu) {
11709 - per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
11710 - per_cpu(x86_bios_cpu_apicid, cpu) =
11711 - x86_bios_cpu_apicid_init[cpu];
11712 -#ifdef CONFIG_NUMA
11713 - per_cpu(x86_cpu_to_node_map, cpu) =
11714 - x86_cpu_to_node_map_init[cpu];
11715 + if (!xen_start_info->mod_start || !ramdisk_size)
11716 + return; /* No initrd provided by bootloader */
11719 + initrd_start = 0;
11721 + if (ramdisk_size >= (end_of_lowmem>>1)) {
11722 + free_early(ramdisk_image, ramdisk_end);
11723 + printk(KERN_ERR "initrd too large to handle, "
11724 + "disabling initrd\n");
11728 - /* indicate the early static arrays will soon be gone */
11729 - x86_cpu_to_apicid_early_ptr = NULL;
11730 - x86_bios_cpu_apicid_early_ptr = NULL;
11731 -#ifdef CONFIG_NUMA
11732 - x86_cpu_to_node_map_early_ptr = NULL;
11733 + printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
11737 + if (ramdisk_end <= end_of_lowmem) {
11738 + /* All in lowmem, easy case */
11740 + * don't need to reserve again, already reserved early
11741 + * in i386_start_kernel
11743 + initrd_start = ramdisk_image + PAGE_OFFSET;
11744 + initrd_end = initrd_start + ramdisk_size;
11745 +#ifdef CONFIG_X86_64_XEN
11746 + initrd_below_start_ok = 1;
11751 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11752 + relocate_initrd();
11754 + printk(KERN_ERR "initrd extends beyond end of memory "
11755 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
11756 + ramdisk_end, end_of_lowmem);
11757 + initrd_start = 0;
11759 + free_early(ramdisk_image, ramdisk_end);
11762 +static void __init reserve_initrd(void)
11765 +#endif /* CONFIG_BLK_DEV_INITRD */
11767 +static void __init parse_setup_data(void)
11769 +#ifndef CONFIG_XEN
11770 + struct setup_data *data;
11773 -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
11774 -cpumask_t *cpumask_of_cpu_map __read_mostly;
11775 -EXPORT_SYMBOL(cpumask_of_cpu_map);
11776 + if (boot_params.hdr.version < 0x0209)
11778 + pa_data = boot_params.hdr.setup_data;
11779 + while (pa_data) {
11780 + data = early_ioremap(pa_data, PAGE_SIZE);
11781 + switch (data->type) {
11782 + case SETUP_E820_EXT:
11783 + parse_e820_ext(data, pa_data);
11788 + pa_data = data->next;
11789 + early_iounmap(data, PAGE_SIZE);
11794 -/* requires nr_cpu_ids to be initialized */
11795 -static void __init setup_cpumask_of_cpu(void)
11796 +static void __init e820_reserve_setup_data(void)
11799 +#ifndef CONFIG_XEN
11800 + struct setup_data *data;
11804 + if (boot_params.hdr.version < 0x0209)
11806 + pa_data = boot_params.hdr.setup_data;
11807 + while (pa_data) {
11808 + data = early_ioremap(pa_data, sizeof(*data));
11809 + e820_update_range(pa_data, sizeof(*data)+data->len,
11810 + E820_RAM, E820_RESERVED_KERN);
11812 + pa_data = data->next;
11813 + early_iounmap(data, sizeof(*data));
11818 - /* alloc_bootmem zeroes memory */
11819 - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
11820 - for (i = 0; i < nr_cpu_ids; i++)
11821 - cpu_set(i, cpumask_of_cpu_map[i]);
11822 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
11823 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
11824 + printk(KERN_INFO "extended physical RAM map:\n");
11825 + e820_print_map("reserve setup_data");
11829 -static inline void setup_cpumask_of_cpu(void) { }
11831 +static void __init reserve_early_setup_data(void)
11833 +#ifndef CONFIG_XEN
11834 + struct setup_data *data;
11838 + if (boot_params.hdr.version < 0x0209)
11840 + pa_data = boot_params.hdr.setup_data;
11841 + while (pa_data) {
11842 + data = early_ioremap(pa_data, sizeof(*data));
11843 + sprintf(buf, "setup data %x", data->type);
11844 + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
11845 + pa_data = data->next;
11846 + early_iounmap(data, sizeof(*data));
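[Note: all three helpers above walk the same singly linked list that a v2.09+ boot-protocol loader hands over in boot_params.hdr.setup_data, mapping each node only long enough to read it. The node layout, as defined in include/asm-x86/bootparam.h:

	struct setup_data {
		__u64 next;	/* physical address of the next node; 0 ends the list */
		__u32 type;	/* e.g. SETUP_E820_EXT */
		__u32 len;	/* length of data[] */
		__u8  data[0];
	};]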
11851 -#ifdef CONFIG_X86_32
11853 - * Great future not-so-futuristic plan: make i386 and x86_64 do it
11855 + * --------- Crashkernel reservation ------------------------------
11858 +#ifdef CONFIG_KEXEC
11860 +#ifndef CONFIG_XEN
11862 + * Reserve @size bytes of crashkernel memory at any suitable offset.
11864 + * @size: Size of the crashkernel memory to reserve.
11865 + * Returns the base address on success, and -1ULL on failure.
11867 +unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
11869 + const unsigned long long alignment = 16<<20; /* 16M */
11870 + unsigned long long start = 0LL;
11875 + start = find_e820_area(start, ULONG_MAX, size, alignment);
11876 + if (start == -1ULL)
11879 + /* try to reserve it */
11880 + ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
11884 + start += alignment;
11888 +static inline unsigned long long get_total_mem(void)
11890 + unsigned long long total;
11892 + total = max_low_pfn - min_low_pfn;
11893 +#ifdef CONFIG_HIGHMEM
11894 + total += highend_pfn - highstart_pfn;
11897 + return total << PAGE_SHIFT;
11900 +static void __init reserve_crashkernel(void)
11902 + unsigned long long total_mem;
11903 + unsigned long long crash_size, crash_base;
11906 + total_mem = get_total_mem();
11908 + ret = parse_crashkernel(boot_command_line, total_mem,
11909 + &crash_size, &crash_base);
11910 + if (ret != 0 || crash_size <= 0)
11913 + /* 0 means: find the address automatically */
11914 + if (crash_base <= 0) {
11915 + crash_base = find_and_reserve_crashkernel(crash_size);
11916 + if (crash_base == -1ULL) {
11917 + pr_info("crashkernel reservation failed. "
11918 + "No suitable area found.\n");
11922 + ret = reserve_bootmem_generic(crash_base, crash_size,
11923 + BOOTMEM_EXCLUSIVE);
11925 + pr_info("crashkernel reservation failed - "
11926 + "memory is in use\n");
11931 + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
11932 + "for crashkernel (System RAM: %ldMB)\n",
11933 + (unsigned long)(crash_size >> 20),
11934 + (unsigned long)(crash_base >> 20),
11935 + (unsigned long)(total_mem >> 20));
11937 + crashk_res.start = crash_base;
11938 + crashk_res.end = crash_base + crash_size - 1;
11939 + insert_resource(&iomem_resource, &crashk_res);
11942 +#define reserve_crashkernel xen_machine_kexec_setup_resources
11945 +static void __init reserve_crashkernel(void)
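[Note: the user-visible syntax is crashkernel=size[KMG][@offset[KMG]]; with this version the offset may be omitted, in which case find_and_reserve_crashkernel() probes upward in 16 MiB steps until reserve_bootmem_generic() succeeds. For example:

	crashkernel=64M@16M	# fixed placement
	crashkernel=128M	# let the kernel pick a suitable area]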
11950 +static struct resource standard_io_resources[] = {
11951 + { .name = "dma1", .start = 0x00, .end = 0x1f,
11952 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11953 + { .name = "pic1", .start = 0x20, .end = 0x21,
11954 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11955 + { .name = "timer0", .start = 0x40, .end = 0x43,
11956 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11957 + { .name = "timer1", .start = 0x50, .end = 0x53,
11958 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11959 + { .name = "keyboard", .start = 0x60, .end = 0x60,
11960 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11961 + { .name = "keyboard", .start = 0x64, .end = 0x64,
11962 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11963 + { .name = "dma page reg", .start = 0x80, .end = 0x8f,
11964 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11965 + { .name = "pic2", .start = 0xa0, .end = 0xa1,
11966 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11967 + { .name = "dma2", .start = 0xc0, .end = 0xdf,
11968 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11969 + { .name = "fpu", .start = 0xf0, .end = 0xff,
11970 + .flags = IORESOURCE_BUSY | IORESOURCE_IO }
11973 +static void __init reserve_standard_io_resources(void)
11977 + /* Nothing to do if not running in dom0. */
11978 + if (!is_initial_xendomain())
11981 + /* request I/O space for devices used on all i[345]86 PCs */
11982 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
11983 + request_resource(&ioport_resource, &standard_io_resources[i]);
11987 +#ifdef CONFIG_PROC_VMCORE
11988 +/* elfcorehdr= specifies the location of elf core header
11989 + * stored by the crashed kernel. This option will be passed
11990 + * by the kexec loader to the capture kernel.
11992 -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
11993 -EXPORT_SYMBOL(__per_cpu_offset);
11994 +static int __init setup_elfcorehdr(char *arg)
11999 + elfcorehdr_addr = memparse(arg, &end);
12000 + return end > arg ? 0 : -EINVAL;
12002 +early_param("elfcorehdr", setup_elfcorehdr);
12005 +static struct x86_quirks default_x86_quirks __initdata;
12007 +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
12010 + * Determine if we were loaded by an EFI loader. If so, then we have also been
12011 + * passed the efi memmap, systab, etc., so we should use these data structures
12012 + * for initialization. Note, the efi init code path is determined by the
12013 + * global efi_enabled. This allows the same kernel image to be used on existing
12014 + * systems (with a traditional BIOS) as well as on EFI systems.
12017 - * Great future plan:
12018 - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
12019 - * Always point %gs to its beginning
12020 + * setup_arch - architecture-specific boot-time initializations
12022 + * Note: On x86_64, fixmaps are ready for use even before this is called.
12024 -void __init setup_per_cpu_areas(void)
12026 +void __init setup_arch(char **cmdline_p)
12028 - int i, highest_cpu = 0;
12029 - unsigned long size;
12032 + unsigned long p2m_pages;
12033 + struct physdev_set_iopl set_iopl;
12035 -#ifdef CONFIG_HOTPLUG_CPU
12036 - prefill_possible_map();
12037 +#ifdef CONFIG_X86_32
12038 + /* Force a quick death if the kernel panics (not domain 0). */
12039 + extern int panic_timeout;
12040 + if (!panic_timeout && !is_initial_xendomain())
12041 + panic_timeout = 1;
12044 - /* Copy section for each CPU (we discard the original) */
12045 - size = PERCPU_ENOUGH_ROOM;
12046 - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
12049 - for_each_possible_cpu(i) {
12051 -#ifndef CONFIG_NEED_MULTIPLE_NODES
12052 - ptr = alloc_bootmem_pages(size);
12054 - int node = early_cpu_to_node(i);
12055 - if (!node_online(node) || !NODE_DATA(node)) {
12056 - ptr = alloc_bootmem_pages(size);
12058 - "cpu %d has no node or node-local memory\n", i);
12061 - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
12062 + /* Register a call for panic conditions. */
12063 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
12065 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12066 + VMASST_TYPE_writable_pagetables));
12067 +#ifdef CONFIG_X86_32
12068 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12069 + VMASST_TYPE_4gb_segments));
12071 +#endif /* CONFIG_XEN */
12073 +#ifdef CONFIG_X86_32
12074 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
12075 + visws_early_detect();
12076 + pre_setup_arch_hook();
12078 + printk(KERN_INFO "Command line: %s\n", boot_command_line);
12081 + early_cpu_init();
12082 + early_ioremap_init();
12084 +#ifndef CONFIG_XEN
12085 + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
12086 + screen_info = boot_params.screen_info;
12087 + edid_info = boot_params.edid_info;
12088 +#ifdef CONFIG_X86_32
12089 + apm_info.bios = boot_params.apm_bios_info;
12090 + ist_info = boot_params.ist_info;
12091 + if (boot_params.sys_desc_table.length != 0) {
12092 + set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
12093 + machine_id = boot_params.sys_desc_table.table[0];
12094 + machine_submodel_id = boot_params.sys_desc_table.table[1];
12095 + BIOS_revision = boot_params.sys_desc_table.table[2];
12098 + saved_video_mode = boot_params.hdr.vid_mode;
12099 + bootloader_type = boot_params.hdr.type_of_loader;
12101 +#ifdef CONFIG_BLK_DEV_RAM
12102 + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
12103 + rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
12104 + rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
12107 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
12108 +#ifdef CONFIG_X86_32
12114 - panic("Cannot allocate cpu data for CPU %d\n", i);
12117 + efi_reserve_early();
12120 +#else /* CONFIG_XEN */
12121 +#ifdef CONFIG_X86_32
12122 + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
12123 + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
12125 + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
12127 + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
12129 + if (is_initial_xendomain()) {
12130 + const struct dom0_vga_console_info *info =
12131 + (void *)((char *)xen_start_info +
12132 + xen_start_info->console.dom0.info_off);
12134 + dom0_init_screen_info(info,
12135 + xen_start_info->console.dom0.info_size);
12136 + xen_start_info->console.domU.mfn = 0;
12137 + xen_start_info->console.domU.evtchn = 0;
12139 + screen_info.orig_video_isVGA = 0;
12141 +#endif /* CONFIG_XEN */
12145 + setup_memory_map();
12146 + parse_setup_data();
12147 + /* update the e820_saved too */
12148 + e820_reserve_setup_data();
12152 +#ifndef CONFIG_XEN
12153 + if (!boot_params.hdr.root_flags)
12154 + root_mountflags &= ~MS_RDONLY;
12156 + init_mm.start_code = (unsigned long) _text;
12157 + init_mm.end_code = (unsigned long) _etext;
12158 + init_mm.end_data = (unsigned long) _edata;
12159 +#ifdef CONFIG_X86_32
12160 +#ifndef CONFIG_XEN
12161 + init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
12163 + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
12164 + xen_start_info->nr_pt_frames) << PAGE_SHIFT;
12167 + init_mm.brk = (unsigned long) &_end;
12170 + code_resource.start = virt_to_phys(_text);
12171 + code_resource.end = virt_to_phys(_etext)-1;
12172 + data_resource.start = virt_to_phys(_etext);
12173 + data_resource.end = virt_to_phys(_edata)-1;
12174 + bss_resource.start = virt_to_phys(&__bss_start);
12175 + bss_resource.end = virt_to_phys(&__bss_stop)-1;
12177 + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
12178 + *cmdline_p = command_line;
12180 + parse_early_param();
12182 #ifdef CONFIG_X86_64
12183 - cpu_pda(i)->data_offset = ptr - __per_cpu_start;
12187 +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
12189 + * Must be before kernel pagetables are setup
12190 + * or fixmap area is touched.
12195 + /* after early param, so could get panic from serial */
12196 + reserve_early_setup_data();
12198 + if (acpi_mps_check()) {
12199 +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
12200 + disable_apic = 1;
12202 + setup_clear_cpu_cap(X86_FEATURE_APIC);
12206 + if (pci_early_dump_regs)
12207 + early_dump_pci_devices();
12210 + finish_e820_parsing();
12212 +#ifdef CONFIG_X86_32
12216 +#ifndef CONFIG_XEN
12217 + /* after parse_early_param, so could debug it */
12218 + insert_resource(&iomem_resource, &code_resource);
12219 + insert_resource(&iomem_resource, &data_resource);
12220 + insert_resource(&iomem_resource, &bss_resource);
12225 +#ifdef CONFIG_X86_32
12226 + if (ppro_with_ram_bug()) {
12227 + e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
12229 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
12230 + printk(KERN_INFO "fixed physical RAM map:\n");
12231 + e820_print_map("bad_ppro");
12234 - __per_cpu_offset[i] = ptr - __per_cpu_start;
12235 + early_gart_iommu_check();
12237 - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
12238 +#endif /* CONFIG_XEN */
12242 + * partially used pages are not usable - thus
12243 + * we are rounding upwards:
12245 + max_pfn = e820_end_of_ram_pfn();
12247 + /* preallocate 4k for mptable mpc */
12248 + early_reserve_e820_mpc_new();
12249 + /* update e820 for memory not covered by WB MTRRs */
12251 +#ifndef CONFIG_XEN
12252 + if (mtrr_trim_uncached_memory(max_pfn))
12253 + max_pfn = e820_end_of_ram_pfn();
12256 +#ifdef CONFIG_X86_32
12257 + /* max_low_pfn gets updated here */
12258 + find_low_pfn_range();
12260 + num_physpages = max_pfn;
12261 + max_mapnr = max_pfn;
12264 + /* How many end-of-memory variables you have, grandma! */
12265 + /* need this before calling reserve_initrd */
12266 + if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
12267 + max_low_pfn = e820_end_of_low_ram_pfn();
12269 + max_low_pfn = max_pfn;
12271 + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
12274 + /* max_pfn_mapped is updated here */
12275 + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
12276 + max_pfn_mapped = max_low_pfn_mapped;
12278 +#ifdef CONFIG_X86_64
12279 + if (max_pfn > max_low_pfn) {
12280 + max_pfn_mapped = init_memory_mapping(1UL<<32,
12281 + max_pfn<<PAGE_SHIFT);
12282 + /* can we preserve max_low_pfn? */
12283 + max_low_pfn = max_pfn;
12287 - nr_cpu_ids = highest_cpu + 1;
12288 - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
12290 + * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
12293 - /* Setup percpu data maps */
12294 - setup_per_cpu_maps();
12295 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
12296 + if (init_ohci1394_dma_early)
12297 + init_ohci1394_dma_on_all_controllers();
12300 - /* Setup cpumask_of_cpu map */
12301 - setup_cpumask_of_cpu();
12303 + reserve_initrd();
12305 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12309 + if (is_initial_xendomain())
12310 + dmi_scan_machine();
12314 +#ifdef CONFIG_ACPI
12315 + if (!is_initial_xendomain()) {
12316 + printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
12322 + * Parse the ACPI tables for possible boot-time SMP configuration.
12324 + acpi_boot_table_init();
12326 +#ifdef CONFIG_ACPI_NUMA
12328 + * Parse SRAT to discover nodes.
12330 + acpi_numa_init();
12333 + initmem_init(0, max_pfn);
12335 +#ifdef CONFIG_ACPI_SLEEP
12337 + * Reserve low memory region for sleep support.
12339 + acpi_reserve_bootmem();
12341 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
12343 + * Find and reserve possible boot-time SMP configuration:
12345 + find_smp_config();
12347 + reserve_crashkernel();
12349 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12351 + * dma32_reserve_bootmem() allocates bootmem which may conflict
12352 + * with the crashkernel command line, so do that after
12353 + * reserve_crashkernel()
12355 + dma32_reserve_bootmem();
12358 + reserve_ibft_region();
12360 +#ifdef CONFIG_KVM_CLOCK
12364 + xen_pagetable_setup_start(swapper_pg_dir);
12366 + xen_pagetable_setup_done(swapper_pg_dir);
12367 + paravirt_post_allocator_init();
12369 +#ifdef CONFIG_X86_64
12374 + p2m_pages = max_pfn;
12375 + if (xen_start_info->nr_pages > max_pfn) {
12377 + * the max_pfn was shrunk (probably by mem= or highmem=
12378 + * kernel parameter); shrink reservation with the HV
12380 + struct xen_memory_reservation reservation = {
12381 + .address_bits = 0,
12382 + .extent_order = 0,
12383 + .domid = DOMID_SELF
12385 + unsigned int difference;
12388 + difference = xen_start_info->nr_pages - max_pfn;
12390 + set_xen_guest_handle(reservation.extent_start,
12391 + ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
12392 + reservation.nr_extents = difference;
12393 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
12395 + BUG_ON(ret != difference);
12397 + else if (max_pfn > xen_start_info->nr_pages)
12398 + p2m_pages = xen_start_info->nr_pages;
12400 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
12401 + unsigned long i, j;
12402 + unsigned int k, fpp;
12404 + /* Make sure we have a large enough P->M table. */
12405 + phys_to_machine_mapping = alloc_bootmem_pages(
12406 + max_pfn * sizeof(unsigned long));
12407 + memset(phys_to_machine_mapping, ~0,
12408 + max_pfn * sizeof(unsigned long));
12409 + memcpy(phys_to_machine_mapping,
12410 + (unsigned long *)xen_start_info->mfn_list,
12411 + p2m_pages * sizeof(unsigned long));
12413 + __pa(xen_start_info->mfn_list),
12414 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
12415 + sizeof(unsigned long))));
12418 + * Initialise the list of the frames that specify the list of
12419 + * frames that make up the p2m table. Used by save/restore.
12421 + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
12423 + fpp = PAGE_SIZE/sizeof(unsigned long);
12424 + for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
12429 + BUG_ON(k >= ARRAY_SIZE(pfn_to_mfn_frame_list));
12430 + pfn_to_mfn_frame_list[k] =
12431 + alloc_bootmem_pages(PAGE_SIZE);
12432 + pfn_to_mfn_frame_list_list[k] =
12433 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
12435 + pfn_to_mfn_frame_list[k][j] =
12436 + virt_to_mfn(&phys_to_machine_mapping[i]);
12438 + HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
12439 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
12440 + virt_to_mfn(pfn_to_mfn_frame_list_list);
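[Note: some illustrative arithmetic on the two-level structure just built, assuming 4 KiB pages:

	/*
	 * x86_64: fpp = 4096 / sizeof(unsigned long) = 512;
	 *   one p2m page maps 512 pfns (2 MiB of pseudo-physical memory);
	 *   one frame-list page holds 512 p2m-page MFNs, i.e. 1 GiB per
	 *   entry of pfn_to_mfn_frame_list_list; 512 entries -> 512 GiB max.
	 * x86_32: fpp = 1024, so 4 GiB per frame-list page; 128 entries
	 *   again bound the table at 512 GiB.
	 */]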
12443 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
12444 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
12445 + if (i != 4 && request_dma(i, "xen") != 0)
12447 +#endif /* CONFIG_XEN */
12449 +#ifdef CONFIG_X86_GENERICARCH
12450 + generic_apic_probe();
12453 +#ifndef CONFIG_XEN
12458 + * Read APIC and some other early information from ACPI tables.
12460 + acpi_boot_init();
12462 +#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
12464 + * get boot-time SMP configuration:
12466 + if (smp_found_config)
12467 + get_smp_config();
12470 + prefill_possible_map();
12471 +#ifdef CONFIG_X86_64
12472 + init_cpu_to_node();
12475 +#ifndef CONFIG_XEN
12476 + init_apic_mappings();
12477 + ioapic_init_mappings();
12479 + kvm_guest_init();
12481 + e820_reserve_resources();
12482 + e820_mark_nosave_regions(max_low_pfn);
12484 + if (is_initial_xendomain())
12485 + e820_reserve_resources();
12488 +#ifdef CONFIG_X86_32
12489 + if (is_initial_xendomain())
12490 + request_resource(&iomem_resource, &video_ram_resource);
12492 + reserve_standard_io_resources();
12494 +#ifndef CONFIG_XEN
12495 + e820_setup_gap();
12498 +#if defined(CONFIG_VGA_CONSOLE)
12499 + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
12500 + conswitchp = &vga_con;
12501 +#elif defined(CONFIG_DUMMY_CONSOLE)
12502 + conswitchp = &dummy_con;
12505 +#else /* CONFIG_XEN */
12506 + if (is_initial_xendomain())
12507 + e820_setup_gap();
12509 + set_iopl.iopl = 1;
12510 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
12513 +#ifdef CONFIG_DUMMY_CONSOLE
12514 + conswitchp = &dummy_con;
12516 +#ifdef CONFIG_VGA_CONSOLE
12517 + if (is_initial_xendomain())
12518 + conswitchp = &vga_con;
12521 +#endif /* CONFIG_XEN */
12526 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
12528 + HYPERVISOR_shutdown(SHUTDOWN_crash);
12529 + /* we're never actually going to get here... */
12530 + return NOTIFY_DONE;
12532 +#endif /* !CONFIG_XEN */
12533 --- sle11-2009-10-16.orig/arch/x86/kernel/setup64-xen.c 2009-03-16 16:38:05.000000000 +0100
12534 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12537 - * X86-64 specific CPU setup.
12538 - * Copyright (C) 1995 Linus Torvalds
12539 - * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
12540 - * See setup.c for older changelog.
12542 - * Jun Nakajima <jun.nakajima@intel.com>
12543 - * Modified for Xen
12546 -#include <linux/init.h>
12547 -#include <linux/kernel.h>
12548 -#include <linux/sched.h>
12549 -#include <linux/string.h>
12550 -#include <linux/bootmem.h>
12551 -#include <linux/bitops.h>
12552 -#include <linux/module.h>
12553 -#include <linux/kgdb.h>
12554 -#include <asm/pda.h>
12555 -#include <asm/pgtable.h>
12556 -#include <asm/processor.h>
12557 -#include <asm/desc.h>
12558 -#include <asm/atomic.h>
12559 -#include <asm/mmu_context.h>
12560 -#include <asm/smp.h>
12561 -#include <asm/i387.h>
12562 -#include <asm/percpu.h>
12563 -#include <asm/proto.h>
12564 -#include <asm/sections.h>
12565 -#include <asm/setup.h>
12566 -#include <asm/genapic.h>
12568 -#include <asm/hypervisor.h>
12571 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
12572 -struct boot_params __initdata boot_params;
12574 -struct boot_params boot_params;
12577 -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
12579 -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
12580 -EXPORT_SYMBOL(_cpu_pda);
12581 -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
12583 -#ifndef CONFIG_X86_NO_IDT
12584 -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
12587 -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
12589 -unsigned long __supported_pte_mask __read_mostly = ~0UL;
12590 -EXPORT_SYMBOL(__supported_pte_mask);
12592 -static int do_not_nx __cpuinitdata = 0;
12595 -Control non-executable mappings for 64-bit processes.
12597 -on	Enable (default)
12600 -static int __init nonx_setup(char *str)
12604 - if (!strncmp(str, "on", 2)) {
12605 - __supported_pte_mask |= _PAGE_NX;
12607 - } else if (!strncmp(str, "off", 3)) {
12609 - __supported_pte_mask &= ~_PAGE_NX;
12613 -early_param("noexec", nonx_setup);
12615 -int force_personality32 = 0;
12617 -/* noexec32=on|off
12618 -Control non-executable heap for 32-bit processes.
12619 -To control the stack too, use noexec=off
12621 -on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
12622 -off	PROT_READ implies PROT_EXEC
12624 -static int __init nonx32_setup(char *str)
12626 - if (!strcmp(str, "on"))
12627 - force_personality32 &= ~READ_IMPLIES_EXEC;
12628 - else if (!strcmp(str, "off"))
12629 - force_personality32 |= READ_IMPLIES_EXEC;
12632 -__setup("noexec32=", nonx32_setup);
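
Both option parsers above use the kernel's boot-option hooks: early_param() registers a handler that runs before memory setup (which is why noexec= can still affect __supported_pte_mask), while __setup() handlers run later during start_kernel(). A minimal sketch of the registration pattern, using a hypothetical "myopt=" option:

	#include <linux/init.h>
	#include <linux/string.h>

	static int myopt_enabled __initdata = 1;	/* hypothetical flag */

	/* "myopt=on|off" - parsed very early from the boot command line */
	static int __init parse_myopt(char *str)
	{
		if (!str)
			return -EINVAL;
		if (!strcmp(str, "off"))
			myopt_enabled = 0;
		else if (!strcmp(str, "on"))
			myopt_enabled = 1;
		return 0;
	}
	early_param("myopt", parse_myopt);
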
12635 -static void __init_refok switch_pt(int cpu)
12639 - xen_pt_switch(__pa_symbol(init_level4_pgt));
12640 - xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
12642 -#define switch_pt() switch_pt(cpu)
12644 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12646 - unsigned long frames[16];
12647 - unsigned long va;
12650 - for (va = gdt_descr->address, f = 0;
12651 - va < gdt_descr->address + gdt_descr->size;
12652 - va += PAGE_SIZE, f++) {
12653 - frames[f] = virt_to_mfn(va);
12654 - make_page_readonly(
12655 - (void *)va, XENFEAT_writable_descriptor_tables);
12657 - if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
12658 - sizeof (struct desc_struct)))
12662 -static void switch_pt(void)
12664 - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
12667 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12669 - load_gdt(gdt_descr);
12670 - load_idt(idt_descr);
12674 -void pda_init(int cpu)
12676 - struct x8664_pda *pda = cpu_pda(cpu);
12678 -	/* Set up data that may be needed in __get_free_pages early */
12679 - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
12680 -#ifndef CONFIG_XEN
12681 -	/* Memory clobbers used to order PDA accesses */
12683 - wrmsrl(MSR_GS_BASE, pda);
12686 - if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
12687 - (unsigned long)pda))
12690 - pda->cpunumber = cpu;
12691 - pda->irqcount = -1;
12692 - pda->kernelstack =
12693 - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
12694 - pda->active_mm = &init_mm;
12695 - pda->mmu_state = 0;
12698 - /* others are initialized in smpboot.c */
12699 - pda->pcurrent = &init_task;
12700 - pda->irqstackptr = boot_cpu_stack;
12702 - pda->irqstackptr = (char *)
12703 - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
12704 - if (!pda->irqstackptr)
12705 - panic("cannot allocate irqstack for cpu %d", cpu);
12710 - pda->irqstackptr += IRQSTACKSIZE-64;
12713 -#ifndef CONFIG_X86_NO_TSS
12714 -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
12715 -__attribute__((section(".bss.page_aligned")));
12718 -extern asmlinkage void ignore_sysret(void);
12720 -/* May not be marked __init: used by software suspend */
12721 -void syscall_init(void)
12723 -#ifndef CONFIG_XEN
12725 -	 * LSTAR and STAR live in a somewhat strange symbiosis.
12726 -	 * They both write to the same internal register. STAR allows setting CS/DS,
12727 -	 * but only a 32-bit target; LSTAR sets the 64-bit rip.
12729 - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
12730 - wrmsrl(MSR_LSTAR, system_call);
12731 - wrmsrl(MSR_CSTAR, ignore_sysret);
12733 - /* Flags to clear on syscall */
12734 - wrmsrl(MSR_SYSCALL_MASK,
12735 - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
12737 -#ifdef CONFIG_IA32_EMULATION
12738 - syscall32_cpu_init ();
12741 - static const struct callback_register cstar = {
12742 - .type = CALLBACKTYPE_syscall32,
12743 - .address = (unsigned long)ignore_sysret
12745 - if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
12746 - printk(KERN_WARNING "Unable to register CSTAR callback\n");
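
On native hardware, syscall_init() programs the SYSCALL entry MSRs: MSR_STAR packs segment-selector bases into its upper 32 bits, MSR_LSTAR holds the 64-bit entry rip, and MSR_SYSCALL_MASK lists the rflags bits cleared on entry. Under Xen the guest cannot write those MSRs, so the 32-bit path is registered as a CALLBACKTYPE_syscall32 callback instead. A small sketch of the STAR packing, with the shift values taken straight from the wrmsrl(MSR_STAR, ...) line above:

	#include <linux/types.h>

	/*
	 * MSR_STAR, upper half:
	 *   bits 63..48 - selector base used by SYSRET (user CS/SS)
	 *   bits 47..32 - selector base used by SYSCALL (kernel CS/SS)
	 */
	static inline u64 make_star(u16 user32_cs, u16 kernel_cs)
	{
		return ((u64)user32_cs << 48) | ((u64)kernel_cs << 32);
	}
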
12751 -void __cpuinit check_efer(void)
12753 - unsigned long efer;
12755 - rdmsrl(MSR_EFER, efer);
12756 - if (!(efer & EFER_NX) || do_not_nx) {
12757 - __supported_pte_mask &= ~_PAGE_NX;
12761 -unsigned long kernel_eflags;
12763 -#ifndef CONFIG_X86_NO_TSS
12765 - * Copies of the original ist values from the tss are only accessed during
12766 - * debugging, no special alignment required.
12768 -DEFINE_PER_CPU(struct orig_ist, orig_ist);
12772 - * cpu_init() initializes state that is per-CPU. Some data is already
12773 - * initialized (naturally) in the bootstrap process, such as the GDT
12774 - * and IDT. We reload them nevertheless, this function acts as a
12775 - * 'CPU state barrier', nothing should get across.
12776 - * A lot of state is already set up in PDA init.
12778 -void __cpuinit cpu_init (void)
12780 - int cpu = stack_smp_processor_id();
12781 -#ifndef CONFIG_X86_NO_TSS
12782 - struct tss_struct *t = &per_cpu(init_tss, cpu);
12783 - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
12785 - char *estacks = NULL;
12788 - struct task_struct *me;
12790 - /* CPU 0 is initialised in head64.c */
12794 -#ifndef CONFIG_X86_NO_TSS
12796 - estacks = boot_exception_stacks;
12801 - if (cpu_test_and_set(cpu, cpu_initialized))
12802 - panic("CPU#%d already initialized!\n", cpu);
12804 - printk("Initializing CPU#%d\n", cpu);
12806 - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
12809 - * Initialize the per-CPU GDT with the boot GDT,
12810 - * and set up the GDT descriptor:
12812 -#ifndef CONFIG_XEN
12814 - memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
12817 - cpu_gdt_descr[cpu].size = GDT_SIZE;
12818 - cpu_gdt_init(&cpu_gdt_descr[cpu]);
12820 - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
12823 - wrmsrl(MSR_FS_BASE, 0);
12824 - wrmsrl(MSR_KERNEL_GS_BASE, 0);
12829 -#ifndef CONFIG_X86_NO_TSS
12831 - * set up and load the per-CPU TSS
12833 - for (v = 0; v < N_EXCEPTION_STACKS; v++) {
12834 - static const unsigned int order[N_EXCEPTION_STACKS] = {
12835 - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
12836 - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
12839 - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
12841 - panic("Cannot allocate exception stack %ld %d\n",
12844 - estacks += PAGE_SIZE << order[v];
12845 - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
12848 - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
12850 - * <= is required because the CPU will access up to
12851 - * 8 bits beyond the end of the IO permission bitmap.
12853 - for (i = 0; i <= IO_BITMAP_LONGS; i++)
12854 - t->io_bitmap[i] = ~0UL;
12857 - atomic_inc(&init_mm.mm_count);
12858 - me->active_mm = &init_mm;
12861 - enter_lazy_tlb(&init_mm, me);
12863 -#ifndef CONFIG_X86_NO_TSS
12864 - set_tss_desc(cpu, t);
12866 -#ifndef CONFIG_XEN
12869 - load_LDT(&init_mm.context);
12871 -#ifdef CONFIG_KGDB
12873 - * If the kgdb is connected no debug regs should be altered. This
12874 - * is only applicable when KGDB and a KGDB I/O module are built
12875 - * into the kernel and you are using early debugging with
12876 - * kgdbwait. KGDB will control the kernel HW breakpoint registers.
12878 - if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
12879 - arch_kgdb_ops.correct_hw_break();
12883 - * Clear all 6 debug registers:
12886 - set_debugreg(0UL, 0);
12887 - set_debugreg(0UL, 1);
12888 - set_debugreg(0UL, 2);
12889 - set_debugreg(0UL, 3);
12890 - set_debugreg(0UL, 6);
12891 - set_debugreg(0UL, 7);
12892 -#ifdef CONFIG_KGDB
12893 - /* If the kgdb is connected no debug regs should be altered. */
12899 - asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
12900 - if (raw_irqs_disabled())
12901 - kernel_eflags &= ~X86_EFLAGS_IF;
12903 - if (is_uv_system())
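
The cpu_init() listing above sets up per-CPU IST exception stacks: each of the N_EXCEPTION_STACKS slots gets an allocation of EXCEPTION_STACK_ORDER pages (DEBUG_STACK gets DEBUG_STACK_ORDER), and the address stored in the TSS is the top of the allocation, since x86 stacks grow down. A condensed sketch of one such allocation, under the same assumptions:

	#include <linux/mm.h>
	#include <linux/kernel.h>

	/* Allocate one IST stack and return its top for the TSS. */
	static unsigned long alloc_ist_stack(unsigned int order)
	{
		char *stk = (char *)__get_free_pages(GFP_ATOMIC, order);

		if (!stk)
			panic("cannot allocate IST stack");
		/* stacks grow down: hand the TSS the highest address */
		return (unsigned long)(stk + (PAGE_SIZE << order));
	}
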
12906 --- sle11-2009-10-16.orig/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:38:05.000000000 +0100
12907 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12910 - * Copyright (C) 1995 Linus Torvalds
12912 - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
12914 - * Memory region support
12915 - * David Parsons <orc@pell.chi.il.us>, July-August 1999
12917 - * Added E820 sanitization routine (removes overlapping memory regions);
12918 - * Brian Moyle <bmoyle@mvista.com>, February 2001
12920 - * Moved CPU detection code to cpu/${cpu}.c
12921 - * Patrick Mochel <mochel@osdl.org>, March 2002
12923 - * Provisions for empty E820 memory regions (reported by certain BIOSes).
12924 - * Alex Achenbach <xela@slit.de>, December 2002.
12929 - * This file handles the architecture-dependent parts of initialization
12932 -#include <linux/sched.h>
12933 -#include <linux/mm.h>
12934 -#include <linux/mmzone.h>
12935 -#include <linux/screen_info.h>
12936 -#include <linux/ioport.h>
12937 -#include <linux/acpi.h>
12938 -#include <linux/apm_bios.h>
12939 -#include <linux/initrd.h>
12940 -#include <linux/bootmem.h>
12941 -#include <linux/seq_file.h>
12942 -#include <linux/console.h>
12943 -#include <linux/mca.h>
12944 -#include <linux/root_dev.h>
12945 -#include <linux/highmem.h>
12946 -#include <linux/module.h>
12947 -#include <linux/efi.h>
12948 -#include <linux/init.h>
12949 -#include <linux/edd.h>
12950 -#include <linux/iscsi_ibft.h>
12951 -#include <linux/nodemask.h>
12952 -#include <linux/kernel.h>
12953 -#include <linux/percpu.h>
12954 -#include <linux/notifier.h>
12955 -#include <linux/kexec.h>
12956 -#include <linux/crash_dump.h>
12957 -#include <linux/dmi.h>
12958 -#include <linux/pfn.h>
12959 -#include <linux/pci.h>
12960 -#include <linux/init_ohci1394_dma.h>
12961 -#include <linux/kvm_para.h>
12963 -#include <video/edid.h>
12965 -#include <asm/mtrr.h>
12966 -#include <asm/apic.h>
12967 -#include <asm/e820.h>
12968 -#include <asm/mpspec.h>
12969 -#include <asm/mmzone.h>
12970 -#include <asm/setup.h>
12971 -#include <asm/arch_hooks.h>
12972 -#include <asm/sections.h>
12973 -#include <asm/io_apic.h>
12974 -#include <asm/ist.h>
12975 -#include <asm/io.h>
12976 -#include <asm/hypervisor.h>
12977 -#include <xen/interface/physdev.h>
12978 -#include <xen/interface/memory.h>
12979 -#include <xen/features.h>
12980 -#include <xen/firmware.h>
12981 -#include <xen/xencons.h>
12982 -#include <setup_arch.h>
12983 -#include <asm/bios_ebda.h>
12984 -#include <asm/cacheflush.h>
12985 -#include <asm/processor.h>
12988 -#include <xen/interface/kexec.h>
12991 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
12992 -static struct notifier_block xen_panic_block = {
12993 - xen_panic_event, NULL, 0 /* try to go last */
12997 - * Machine setup.
12999 -static struct resource data_resource = {
13000 - .name = "Kernel data",
13003 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13006 -static struct resource code_resource = {
13007 - .name = "Kernel code",
13010 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13013 -static struct resource bss_resource = {
13014 - .name = "Kernel bss",
13017 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13020 -static struct resource video_ram_resource = {
13021 - .name = "Video RAM area",
13022 - .start = 0xa0000,
13024 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13027 -static struct resource standard_io_resources[] = { {
13031 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13036 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13038 - .name = "timer0",
13041 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13043 - .name = "timer1",
13046 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13048 - .name = "keyboard",
13051 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13053 - .name = "keyboard",
13056 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13058 - .name = "dma page reg",
13061 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13066 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13071 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13076 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13079 -/* cpu data as detected by the assembly code in head.S */
13080 -struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13081 -/* common cpu data for all cpus */
13082 -struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13083 -EXPORT_SYMBOL(boot_cpu_data);
13085 -unsigned int def_to_bigsmp;
13087 -#ifndef CONFIG_X86_PAE
13088 -unsigned long mmu_cr4_features;
13090 -unsigned long mmu_cr4_features = X86_CR4_PAE;
13093 -/* for MCA, but anyone else can use it if they want */
13094 -unsigned int machine_id;
13095 -unsigned int machine_submodel_id;
13096 -unsigned int BIOS_revision;
13098 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
13099 -int bootloader_type;
13101 -/* user-defined highmem size */
13102 -static unsigned int highmem_pages = -1;
13107 -struct screen_info screen_info;
13108 -EXPORT_SYMBOL(screen_info);
13109 -struct apm_info apm_info;
13110 -EXPORT_SYMBOL(apm_info);
13111 -struct edid_info edid_info;
13112 -EXPORT_SYMBOL_GPL(edid_info);
13113 -#ifndef CONFIG_XEN
13114 -#define copy_edid() (edid_info = boot_params.edid_info)
13116 -struct ist_info ist_info;
13117 -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
13118 - defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
13119 -EXPORT_SYMBOL(ist_info);
13122 -extern void early_cpu_init(void);
13123 -extern int root_mountflags;
13125 -unsigned long saved_video_mode;
13127 -#define RAMDISK_IMAGE_START_MASK 0x07FF
13128 -#define RAMDISK_PROMPT_FLAG 0x8000
13129 -#define RAMDISK_LOAD_FLAG 0x4000
13131 -static char __initdata command_line[COMMAND_LINE_SIZE];
13133 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
13134 -struct boot_params __initdata boot_params;
13136 -struct boot_params boot_params;
13140 - * Point at the empty zero page to start with. We map the real shared_info
13141 - * page as soon as fixmap is up and running.
13143 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
13144 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
13146 -unsigned long *phys_to_machine_mapping;
13147 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
13148 -EXPORT_SYMBOL(phys_to_machine_mapping);
13150 -/* Raw start-of-day parameters from the hypervisor. */
13151 -start_info_t *xen_start_info;
13152 -EXPORT_SYMBOL(xen_start_info);
13154 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
13156 -#ifdef CONFIG_EDD_MODULE
13157 -EXPORT_SYMBOL(edd);
13159 -#ifndef CONFIG_XEN
13161 - * copy_edd() - Copy the BIOS EDD information
13162 - * from boot_params into a safe place.
13165 -static inline void copy_edd(void)
13167 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
13168 - sizeof(edd.mbr_signature));
13169 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
13170 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
13171 - edd.edd_info_nr = boot_params.eddbuf_entries;
13175 -static inline void copy_edd(void)
13180 -int __initdata user_defined_memmap;
13183 - * "mem=nopentium" disables the 4MB page tables.
13184 - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
13185 - * to <mem>, overriding the bios size.
13186 - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
13187 - * <start> to <start>+<mem>, overriding the bios size.
13189 - * HPA tells me bootloaders need to parse mem=, so no new
13190 - * option should be mem= [also see Documentation/i386/boot.txt]
13192 -static int __init parse_mem(char *arg)
13197 - if (strcmp(arg, "nopentium") == 0) {
13198 - setup_clear_cpu_cap(X86_FEATURE_PSE);
13200 - /* If the user specifies memory size, we
13201 - * limit the BIOS-provided memory map to
13202 - * that size. exactmap can be used to specify
13203 - * the exact map. mem=number can be used to
13204 - * trim the existing memory map.
13206 - unsigned long long mem_size;
13208 - mem_size = memparse(arg, &arg);
13209 - limit_regions(mem_size);
13210 - user_defined_memmap = 1;
13214 -early_param("mem", parse_mem);
13216 -#ifdef CONFIG_PROC_VMCORE
13217 -/* elfcorehdr= specifies the location of the ELF core header
13218 - * stored by the crashed kernel.
13220 -static int __init parse_elfcorehdr(char *arg)
13225 - elfcorehdr_addr = memparse(arg, &arg);
13228 -early_param("elfcorehdr", parse_elfcorehdr);
13229 -#endif /* CONFIG_PROC_VMCORE */
13232 - * highmem=size forces highmem to be exactly 'size' bytes.
13233 - * This works even on boxes that have no highmem otherwise.
13234 - * This also works to reduce highmem size on bigger boxes.
13236 -static int __init parse_highmem(char *arg)
13241 - highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
13244 -early_param("highmem", parse_highmem);
13247 - * vmalloc=size forces the vmalloc area to be exactly 'size'
13248 - * bytes. This can be used to increase (or decrease) the
13249 - * vmalloc area - the default is 128m.
13251 -static int __init parse_vmalloc(char *arg)
13256 - __VMALLOC_RESERVE = memparse(arg, &arg);
13259 -early_param("vmalloc", parse_vmalloc);
13261 -#ifndef CONFIG_XEN
13263 - * reservetop=size reserves a hole at the top of the kernel address space which
13264 - * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
13265 - * so relocating the fixmap can be done before paging initialization.
13267 -static int __init parse_reservetop(char *arg)
13269 - unsigned long address;
13274 - address = memparse(arg, &arg);
13275 - reserve_top_address(address);
13278 -early_param("reservetop", parse_reservetop);
13282 - * Determine low and high memory ranges:
13284 -unsigned long __init find_max_low_pfn(void)
13286 - unsigned long max_low_pfn;
13288 - max_low_pfn = max_pfn;
13289 - if (max_low_pfn > MAXMEM_PFN) {
13290 - if (highmem_pages == -1)
13291 - highmem_pages = max_pfn - MAXMEM_PFN;
13292 - if (highmem_pages + MAXMEM_PFN < max_pfn)
13293 - max_pfn = MAXMEM_PFN + highmem_pages;
13294 - if (highmem_pages + MAXMEM_PFN > max_pfn) {
13295 - printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
13296 - highmem_pages = 0;
13298 - max_low_pfn = MAXMEM_PFN;
13299 -#ifndef CONFIG_HIGHMEM
13300 - /* Maximum memory usable is what is directly addressable */
13301 - printk(KERN_WARNING "Warning only %ldMB will be used.\n",
13303 - if (max_pfn > MAX_NONPAE_PFN)
13304 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13306 - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
13307 - max_pfn = MAXMEM_PFN;
13308 -#else /* !CONFIG_HIGHMEM */
13309 -#ifndef CONFIG_HIGHMEM64G
13310 - if (max_pfn > MAX_NONPAE_PFN) {
13311 - max_pfn = MAX_NONPAE_PFN;
13312 - printk(KERN_WARNING "Warning only 4GB will be used.\n");
13313 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13315 -#endif /* !CONFIG_HIGHMEM64G */
13316 -#endif /* !CONFIG_HIGHMEM */
13318 - if (highmem_pages == -1)
13319 - highmem_pages = 0;
13320 -#ifdef CONFIG_HIGHMEM
13321 - if (highmem_pages >= max_pfn) {
13322 - printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
13323 - highmem_pages = 0;
13325 - if (highmem_pages) {
13326 - if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
13327 - printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
13328 - highmem_pages = 0;
13330 - max_low_pfn -= highmem_pages;
13333 - if (highmem_pages)
13334 - printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
13337 - return max_low_pfn;
13340 -#ifndef CONFIG_XEN
13341 -#define BIOS_LOWMEM_KILOBYTES 0x413
13344 - * The BIOS places the EBDA/XBDA at the top of conventional
13345 - * memory, and usually decreases the reported amount of
13346 - * conventional memory (int 0x12) too. This also contains a
13347 - * workaround for Dell systems that neglect to reserve EBDA.
13348 - * The same workaround also avoids a problem with the AMD768MPX
13349 - * chipset: reserve a page before VGA to prevent PCI prefetch
13350 - * into it (errata #56). Usually the page is reserved anyways,
13351 - * unless you have no PS/2 mouse plugged in.
13353 -static void __init reserve_ebda_region(void)
13355 - unsigned int lowmem, ebda_addr;
13357 - /* To determine the position of the EBDA and the */
13358 - /* end of conventional memory, we need to look at */
13359 - /* the BIOS data area. In a paravirtual environment */
13360 - /* that area is absent. We'll just have to assume */
13361 - /* that the paravirt case can handle memory setup */
13362 - /* correctly, without our help. */
13363 - if (paravirt_enabled())
13366 - /* end of low (conventional) memory */
13367 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
13370 - /* start of EBDA area */
13371 - ebda_addr = get_bios_ebda();
13373 - /* Fixup: bios puts an EBDA in the top 64K segment */
13374 - /* of conventional memory, but does not adjust lowmem. */
13375 - if ((lowmem - ebda_addr) <= 0x10000)
13376 - lowmem = ebda_addr;
13378 - /* Fixup: bios does not report an EBDA at all. */
13379 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
13380 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
13381 - lowmem = 0x9f000;
13383 - /* Paranoia: should never happen, but... */
13384 - if ((lowmem == 0) || (lowmem >= 0x100000))
13385 - lowmem = 0x9f000;
13387 - /* reserve all memory between lowmem and the 1MB mark */
13388 - reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
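
reserve_ebda_region() reads the size of conventional memory, in KiB, from the BIOS data area word at 0x413 (the elided line shifts it left by 10 to get a byte address) and then keeps everything from there up to the 1MB mark out of the bootmem allocator. A worked example with assumed, illustrative values, for a BIOS that reports 639KB and places the EBDA at 0x9fc00:

	#include <linux/init.h>
	#include <linux/bootmem.h>

	static void __init ebda_example(void)
	{
		unsigned int lowmem    = 639 << 10;	/* 0x9fc00: end of conventional RAM */
		unsigned int ebda_addr = 0x9fc00;	/* typical EBDA start */

		/* EBDA in the top 64KB but lowmem not adjusted: clamp down */
		if ((lowmem - ebda_addr) <= 0x10000)
			lowmem = ebda_addr;
		/* everything in [lowmem, 1MB) stays out of bootmem */
		reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
	}
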
13392 -#ifndef CONFIG_NEED_MULTIPLE_NODES
13393 -static void __init setup_bootmem_allocator(void);
13394 -static unsigned long __init setup_memory(void)
13397 - * partially used pages are not usable - thus
13398 - * we are rounding upwards:
13400 - min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
13401 - xen_start_info->nr_pt_frames;
13403 - max_low_pfn = find_max_low_pfn();
13405 -#ifdef CONFIG_HIGHMEM
13406 - highstart_pfn = highend_pfn = max_pfn;
13407 - if (max_pfn > max_low_pfn) {
13408 - highstart_pfn = max_low_pfn;
13410 - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
13411 - pages_to_mb(highend_pfn - highstart_pfn));
13412 - num_physpages = highend_pfn;
13413 - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
13415 - num_physpages = max_low_pfn;
13416 - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
13418 -#ifdef CONFIG_FLATMEM
13419 - max_mapnr = num_physpages;
13421 - printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
13422 - pages_to_mb(max_low_pfn));
13424 - setup_bootmem_allocator();
13426 - return max_low_pfn;
13429 -static void __init zone_sizes_init(void)
13431 - unsigned long max_zone_pfns[MAX_NR_ZONES];
13432 - memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
13433 - max_zone_pfns[ZONE_DMA] =
13434 - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
13435 - max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
13436 -#ifdef CONFIG_HIGHMEM
13437 - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
13438 - add_active_range(0, 0, highend_pfn);
13440 - add_active_range(0, 0, max_low_pfn);
13443 - free_area_init_nodes(max_zone_pfns);
13446 -extern unsigned long __init setup_memory(void);
13447 -extern void zone_sizes_init(void);
13448 -#endif /* !CONFIG_NEED_MULTIPLE_NODES */
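
zone_sizes_init() above only records the highest pfn of each zone; free_area_init_nodes() then derives every zone's span from those ceilings. Worked numbers for a hypothetical 1GB i386 box with 4KB pages, assuming the usual 16MB ISA DMA limit and ~896MB of lowmem:

	#include <linux/mmzone.h>

	/*
	 * ZONE_DMA      ends at  16MB -> pfn   4096
	 * ZONE_NORMAL   ends at 896MB -> pfn 229376
	 * ZONE_HIGHMEM  ends at   1GB -> pfn 262144  (CONFIG_HIGHMEM only)
	 */
	static unsigned long example_zone_pfns[MAX_NR_ZONES] = {
		[ZONE_DMA]	=   16UL << (20 - PAGE_SHIFT),
		[ZONE_NORMAL]	=  896UL << (20 - PAGE_SHIFT),
	#ifdef CONFIG_HIGHMEM
		[ZONE_HIGHMEM]	= 1024UL << (20 - PAGE_SHIFT),
	#endif
	};
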
13450 -static inline unsigned long long get_total_mem(void)
13452 - unsigned long long total;
13454 - total = max_low_pfn - min_low_pfn;
13455 -#ifdef CONFIG_HIGHMEM
13456 - total += highend_pfn - highstart_pfn;
13459 - return total << PAGE_SHIFT;
13462 -#ifdef CONFIG_KEXEC
13463 -#ifndef CONFIG_XEN
13464 -static void __init reserve_crashkernel(void)
13466 - unsigned long long total_mem;
13467 - unsigned long long crash_size, crash_base;
13470 - total_mem = get_total_mem();
13472 - ret = parse_crashkernel(boot_command_line, total_mem,
13473 - &crash_size, &crash_base);
13474 - if (ret == 0 && crash_size > 0) {
13475 - if (crash_base > 0) {
13476 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
13477 - "for crashkernel (System RAM: %ldMB)\n",
13478 - (unsigned long)(crash_size >> 20),
13479 - (unsigned long)(crash_base >> 20),
13480 - (unsigned long)(total_mem >> 20));
13482 - if (reserve_bootmem(crash_base, crash_size,
13483 - BOOTMEM_EXCLUSIVE) < 0) {
13484 - printk(KERN_INFO "crashkernel reservation "
13485 - "failed - memory is in use\n");
13489 - crashk_res.start = crash_base;
13490 - crashk_res.end = crash_base + crash_size - 1;
13492 - printk(KERN_INFO "crashkernel reservation failed - "
13493 - "you have to specify a base address\n");
13497 -#define reserve_crashkernel xen_machine_kexec_setup_resources
13500 -static inline void __init reserve_crashkernel(void)
13504 -#ifdef CONFIG_BLK_DEV_INITRD
13506 -static bool do_relocate_initrd = false;
13508 -static void __init reserve_initrd(void)
13510 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
13511 - unsigned long ramdisk_size = xen_start_info->mod_len;
13512 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
13513 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13514 - unsigned long ramdisk_here;
13516 - initrd_start = 0;
13518 - if (!xen_start_info->mod_start || !ramdisk_size)
13519 - return; /* No initrd provided by bootloader */
13521 - if (ramdisk_end < ramdisk_image) {
13522 - printk(KERN_ERR "initrd wraps around end of memory, "
13523 - "disabling initrd\n");
13526 - if (ramdisk_size >= end_of_lowmem/2) {
13527 - printk(KERN_ERR "initrd too large to handle, "
13528 - "disabling initrd\n");
13531 - if (ramdisk_end <= end_of_lowmem) {
13532 - /* All in lowmem, easy case */
13533 - reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
13534 - initrd_start = ramdisk_image + PAGE_OFFSET;
13535 - initrd_end = initrd_start+ramdisk_size;
13539 - /* We need to move the initrd down into lowmem */
13540 - ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
13542 - /* Note: this includes all the lowmem currently occupied by
13543 -	   the initrd; we rely on that fact to keep the data intact. */
13544 - reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
13545 - initrd_start = ramdisk_here + PAGE_OFFSET;
13546 - initrd_end = initrd_start + ramdisk_size;
13548 - do_relocate_initrd = true;
13551 -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
13553 -static void __init relocate_initrd(void)
13555 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
13556 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
13557 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13558 - unsigned long ramdisk_here;
13559 - unsigned long slop, clen, mapaddr;
13562 - if (!do_relocate_initrd)
13565 - ramdisk_here = initrd_start - PAGE_OFFSET;
13567 - q = (char *)initrd_start;
13569 - /* Copy any lowmem portion of the initrd */
13570 - if (ramdisk_image < end_of_lowmem) {
13571 - clen = end_of_lowmem - ramdisk_image;
13572 - p = (char *)__va(ramdisk_image);
13573 - memcpy(q, p, clen);
13575 - ramdisk_image += clen;
13576 - ramdisk_size -= clen;
13579 - /* Copy the highmem portion of the initrd */
13580 - while (ramdisk_size) {
13581 - slop = ramdisk_image & ~PAGE_MASK;
13582 - clen = ramdisk_size;
13583 - if (clen > MAX_MAP_CHUNK-slop)
13584 - clen = MAX_MAP_CHUNK-slop;
13585 - mapaddr = ramdisk_image & PAGE_MASK;
13586 - p = early_ioremap(mapaddr, clen+slop);
13587 - memcpy(q, p+slop, clen);
13588 - early_iounmap(p, clen+slop);
13590 - ramdisk_image += clen;
13591 - ramdisk_size -= clen;
13595 -#endif /* CONFIG_BLK_DEV_INITRD */
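
relocate_initrd() above copies the highmem part of the ramdisk through a small temporary mapping: each pass maps at most MAX_MAP_CHUNK bytes with early_ioremap(), copies, and unmaps again, with the slop term keeping the mapping page-aligned. A stripped-down sketch of the same chunked-copy idiom, assuming the early_ioremap()/early_iounmap() pair used above:

	#include <linux/kernel.h>
	#include <linux/string.h>
	#include <asm/io.h>	/* early_ioremap(), early_iounmap() */

	/* Copy size bytes from physical address phys into dst, chunk-wise. */
	static void __init copy_from_phys_chunked(char *dst, unsigned long phys,
						  unsigned long size,
						  unsigned long max_chunk)
	{
		while (size) {
			unsigned long slop = phys & ~PAGE_MASK;
			unsigned long clen = min(size, max_chunk - slop);
			char *p = early_ioremap(phys & PAGE_MASK, clen + slop);

			memcpy(dst, p + slop, clen);
			early_iounmap(p, clen + slop);
			dst  += clen;
			phys += clen;
			size -= clen;
		}
	}
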
13597 -void __init setup_bootmem_allocator(void)
13599 - unsigned long bootmap_size;
13601 - * Initialize the boot-time allocator (with low memory only):
13603 - bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
13605 - register_bootmem_low_pages(max_low_pfn);
13608 - * Reserve the bootmem bitmap itself as well. We do this in two
13609 - * steps (first step was init_bootmem()) because this catches
13610 - * the (very unlikely) case of us accidentally initializing the
13611 - * bootmem allocator with an invalid RAM area.
13613 - reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
13614 - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
13615 - BOOTMEM_DEFAULT);
13617 -#ifndef CONFIG_XEN
13619 - * reserve physical page 0 - it's a special BIOS page on many boxes,
13620 - * enabling clean reboots, SMP operation, laptop functions.
13622 - reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
13624 - /* reserve EBDA region */
13625 - reserve_ebda_region();
13629 - * But first pinch a few for the stack/trampoline stuff
13630 - * FIXME: Don't need the extra page at 4K, but need to fix
13631 - * trampoline before removing it. (see the GDT stuff)
13633 - reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
13635 -#ifdef CONFIG_ACPI_SLEEP
13637 - * Reserve low memory region for sleep support.
13639 - acpi_reserve_bootmem();
13641 -#endif /* !CONFIG_XEN */
13643 -#ifdef CONFIG_BLK_DEV_INITRD
13644 - reserve_initrd();
13646 - numa_kva_reserve();
13647 - reserve_crashkernel();
13649 - reserve_ibft_region();
13653 - * The node 0 pgdat is initialized before all of these because
13654 - * it's needed for bootmem. node>0 pgdats have their virtual
13655 - * space allocated before the pagetables are in place to access
13656 - * them, so they can't be cleared then.
13658 - * This should all compile down to nothing when NUMA is off.
13660 -static void __init remapped_pgdat_init(void)
13664 - for_each_online_node(nid) {
13666 - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
13671 -static void set_mca_bus(int x)
13676 -static void set_mca_bus(int x) { }
13679 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
13680 -char * __init __attribute__((weak)) memory_setup(void)
13682 - return machine_specific_memory_setup();
13685 -#ifdef CONFIG_NUMA
13687 - * In the golden day, when i386 and x86_64 are fully
13688 - * integrated, this will not live here.
13690 -void *x86_cpu_to_node_map_early_ptr;
13691 -int x86_cpu_to_node_map_init[NR_CPUS] = {
13692 - [0 ... NR_CPUS-1] = NUMA_NO_NODE
13694 -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
13698 - * Determine if we were loaded by an EFI loader. If so, then we have also been
13699 - * passed the efi memmap, systab, etc., so we should use these data structures
13700 - * for initialization. Note, the efi init code path is determined by the
13701 - * global efi_enabled. This allows the same kernel image to be used on existing
13702 - * systems (with a traditional BIOS) as well as on EFI systems.
13704 -void __init setup_arch(char **cmdline_p)
13706 - int i, j, k, fpp;
13707 - struct physdev_set_iopl set_iopl;
13708 - unsigned long max_low_pfn;
13709 - unsigned long p2m_pages;
13711 - /* Force a quick death if the kernel panics (not domain 0). */
13712 - extern int panic_timeout;
13713 - if (!panic_timeout && !is_initial_xendomain())
13714 - panic_timeout = 1;
13716 - /* Register a call for panic conditions. */
13717 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
13719 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13720 - VMASST_TYPE_4gb_segments));
13721 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13722 - VMASST_TYPE_writable_pagetables));
13724 - memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
13725 - pre_setup_arch_hook();
13726 - early_cpu_init();
13727 - early_ioremap_init();
13729 - prefill_possible_map();
13733 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
13738 - /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
13739 - properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
13741 - ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
13742 - screen_info = boot_params.screen_info;
13744 - apm_info.bios = boot_params.apm_bios_info;
13745 - ist_info = boot_params.ist_info;
13746 - saved_video_mode = boot_params.hdr.vid_mode;
13747 - if( boot_params.sys_desc_table.length != 0 ) {
13748 - set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
13749 - machine_id = boot_params.sys_desc_table.table[0];
13750 - machine_submodel_id = boot_params.sys_desc_table.table[1];
13751 - BIOS_revision = boot_params.sys_desc_table.table[2];
13753 - bootloader_type = boot_params.hdr.type_of_loader;
13755 - if (is_initial_xendomain()) {
13756 - const struct dom0_vga_console_info *info =
13757 - (void *)((char *)xen_start_info +
13758 - xen_start_info->console.dom0.info_off);
13760 - dom0_init_screen_info(info,
13761 - xen_start_info->console.dom0.info_size);
13762 - xen_start_info->console.domU.mfn = 0;
13763 - xen_start_info->console.domU.evtchn = 0;
13765 - screen_info.orig_video_isVGA = 0;
13767 -#ifdef CONFIG_BLK_DEV_RAM
13768 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
13769 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
13770 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
13775 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
13776 - print_memory_map(memory_setup());
13780 - if (!boot_params.hdr.root_flags)
13781 - root_mountflags &= ~MS_RDONLY;
13782 - init_mm.start_code = (unsigned long) _text;
13783 - init_mm.end_code = (unsigned long) _etext;
13784 - init_mm.end_data = (unsigned long) _edata;
13785 - init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
13786 - xen_start_info->nr_pt_frames) << PAGE_SHIFT;
13788 - code_resource.start = virt_to_phys(_text);
13789 - code_resource.end = virt_to_phys(_etext)-1;
13790 - data_resource.start = virt_to_phys(_etext);
13791 - data_resource.end = virt_to_phys(_edata)-1;
13792 - bss_resource.start = virt_to_phys(&__bss_start);
13793 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
13795 - if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
13796 - i = COMMAND_LINE_SIZE;
13797 - memcpy(boot_command_line, xen_start_info->cmd_line, i);
13798 - boot_command_line[i - 1] = '\0';
13799 - parse_early_param();
13801 - if (user_defined_memmap) {
13802 - printk(KERN_INFO "user-defined physical RAM map:\n");
13803 - print_memory_map("user");
13806 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
13807 - *cmdline_p = command_line;
13812 - /* update e820 for memory not covered by WB MTRRs */
13813 - propagate_e820_map();
13815 -#ifndef CONFIG_XEN
13816 - if (mtrr_trim_uncached_memory(max_pfn))
13817 - propagate_e820_map();
13820 - max_low_pfn = setup_memory();
13822 -#ifdef CONFIG_KVM_CLOCK
13828 - * Must be after max_low_pfn is determined, and before kernel
13829 - * pagetables are setup.
13833 - kvm_guest_init();
13836 - * NOTE: before this point _nobody_ is allowed to allocate
13837 - * any memory using the bootmem allocator. Although the
13838 -	 * allocator is now initialised, only the first 8MB of the kernel
13839 - * virtual address space has been mapped. All allocations before
13840 - * paging_init() has completed must use the alloc_bootmem_low_pages()
13841 - * variant (which allocates DMA'able memory) and care must be taken
13842 -	 * not to exceed the 8MB limit.
13846 - smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
13851 - * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
13854 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
13855 - if (init_ohci1394_dma_early)
13856 - init_ohci1394_dma_on_all_controllers();
13859 - remapped_pgdat_init();
13861 - zone_sizes_init();
13863 -#ifdef CONFIG_X86_FIND_SMP_CONFIG
13865 - * Find and reserve possible boot-time SMP configuration:
13867 - find_smp_config();
13870 - p2m_pages = max_pfn;
13871 - if (xen_start_info->nr_pages > max_pfn) {
13873 - * the max_pfn was shrunk (probably by mem= or highmem=
13874 - * kernel parameter); shrink reservation with the HV
13876 - struct xen_memory_reservation reservation = {
13877 - .address_bits = 0,
13878 - .extent_order = 0,
13879 - .domid = DOMID_SELF
13881 - unsigned int difference;
13884 - difference = xen_start_info->nr_pages - max_pfn;
13886 - set_xen_guest_handle(reservation.extent_start,
13887 - ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
13888 - reservation.nr_extents = difference;
13889 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
13891 - BUG_ON (ret != difference);
13893 - else if (max_pfn > xen_start_info->nr_pages)
13894 - p2m_pages = xen_start_info->nr_pages;
13896 - /* Make sure we have a correctly sized P->M table. */
13897 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
13898 - phys_to_machine_mapping = alloc_bootmem_low_pages(
13899 - max_pfn * sizeof(unsigned long));
13900 - memset(phys_to_machine_mapping, ~0,
13901 - max_pfn * sizeof(unsigned long));
13902 - memcpy(phys_to_machine_mapping,
13903 - (unsigned long *)xen_start_info->mfn_list,
13904 - p2m_pages * sizeof(unsigned long));
13906 - __pa(xen_start_info->mfn_list),
13907 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
13908 - sizeof(unsigned long))));
13911 -	 * Initialise the top-level list of frames that records the
13912 -	 * frames making up the p2m table. Used by save/restore.
13914 - pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
13916 - fpp = PAGE_SIZE/sizeof(unsigned long);
13917 - for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
13918 - if ((j % fpp) == 0) {
13921 - pfn_to_mfn_frame_list[k] =
13922 - alloc_bootmem_low_pages(PAGE_SIZE);
13923 - pfn_to_mfn_frame_list_list[k] =
13924 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
13927 - pfn_to_mfn_frame_list[k][j] =
13928 - virt_to_mfn(&phys_to_machine_mapping[i]);
13930 - HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
13931 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
13932 - virt_to_mfn(pfn_to_mfn_frame_list_list);
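
The loop above publishes the p2m table to the hypervisor as a two-level structure: each pfn_to_mfn_frame_list[] page holds the machine frame numbers of fpp consecutive p2m pages, and the single pfn_to_mfn_frame_list_list page holds the MFNs of those list pages. A short sketch of the resulting geometry, worked for the 32-bit case (the 16-entry frame-list array matches the declaration earlier in this file):

	/* Geometry of the two-level p2m frame list (32-bit example). */
	#define EX_PAGE_SIZE	4096UL
	#define EX_FPP		(EX_PAGE_SIZE / 4)	/* 1024 entries per page */

	/*
	 * One frame-list page covers EX_FPP p2m pages
	 *   = EX_FPP * EX_FPP pfns = 1M pfns = 4GB of guest memory,
	 * so the 16 list pages declared above cover up to 64GB,
	 * matching the PAE physical-address limit.
	 */
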
13935 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
13936 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
13937 - if (i != 4 && request_dma(i, "xen") != 0)
13941 - * NOTE: at this point the bootmem allocator is fully available.
13944 -#ifdef CONFIG_BLK_DEV_INITRD
13945 - relocate_initrd();
13948 - paravirt_post_allocator_init();
13950 - if (is_initial_xendomain())
13951 - dmi_scan_machine();
13955 -#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
13957 - * setup to use the early static init tables during kernel startup
13958 - * X86_SMP will exclude sub-arches that don't deal well with it.
13960 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
13961 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
13962 -#ifdef CONFIG_NUMA
13963 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
13967 -#ifdef CONFIG_X86_GENERICARCH
13968 - generic_apic_probe();
13971 - set_iopl.iopl = 1;
13972 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
13974 -#ifdef CONFIG_ACPI
13975 - if (!is_initial_xendomain()) {
13976 - printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
13977 - acpi_disabled = 1;
13982 - * Parse the ACPI tables for possible boot-time SMP configuration.
13984 - acpi_boot_table_init();
13987 -#ifndef CONFIG_XEN
13991 -#ifdef CONFIG_ACPI
13992 - acpi_boot_init();
13994 -#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
13995 - if (def_to_bigsmp)
13996 - printk(KERN_WARNING "More than 8 CPUs detected and "
13997 - "CONFIG_X86_PC cannot handle it.\nUse "
13998 - "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
14001 -#ifdef CONFIG_X86_LOCAL_APIC
14002 - if (smp_found_config)
14003 - get_smp_config();
14006 - e820_register_memory();
14007 - e820_mark_nosave_regions();
14009 - if (is_initial_xendomain()) {
14011 -#if defined(CONFIG_VGA_CONSOLE)
14012 - if (!efi_enabled ||
14013 - (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14014 - conswitchp = &vga_con;
14015 -#elif defined(CONFIG_DUMMY_CONSOLE)
14016 - conswitchp = &dummy_con;
14020 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14021 - conswitchp = &dummy_con;
14027 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14029 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14030 - /* we're never actually going to get here... */
14031 - return NOTIFY_DONE;
14035 - * Request address space for all standard resources
14037 - * This is called just before pcibios_init(), which is also a
14038 - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
14040 -static int __init request_standard_resources(void)
14044 - /* Nothing to do if not running in dom0. */
14045 - if (!is_initial_xendomain())
14048 - printk(KERN_INFO "Setting up standard PCI resources\n");
14049 - init_iomem_resources(&code_resource, &data_resource, &bss_resource);
14051 - request_resource(&iomem_resource, &video_ram_resource);
14053 - /* request I/O space for devices used on all i[345]86 PCs */
14054 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14055 - request_resource(&ioport_resource, &standard_io_resources[i]);
14059 -subsys_initcall(request_standard_resources);
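
request_standard_resources() is deliberately a subsys_initcall so the resource tree is populated before PCI probing claims overlapping ranges; each legacy port range is parented under ioport_resource with request_resource(). A minimal sketch of claiming one such range:

	#include <linux/ioport.h>
	#include <linux/init.h>

	static struct resource example_timer_res = {
		.name	= "timer0",
		.start	= 0x40,
		.end	= 0x43,
		.flags	= IORESOURCE_BUSY | IORESOURCE_IO,
	};

	static int __init claim_example_resource(void)
	{
		/* fails with -EBUSY if 0x40-0x43 is already owned */
		return request_resource(&ioport_resource, &example_timer_res);
	}
	subsys_initcall(claim_example_resource);
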
14060 --- sle11-2009-10-16.orig/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:38:05.000000000 +0100
14061 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
14064 - * Copyright (C) 1995 Linus Torvalds
14068 - * This file handles the architecture-dependent parts of initialization
14071 -#include <linux/errno.h>
14072 -#include <linux/sched.h>
14073 -#include <linux/kernel.h>
14074 -#include <linux/mm.h>
14075 -#include <linux/stddef.h>
14076 -#include <linux/unistd.h>
14077 -#include <linux/ptrace.h>
14078 -#include <linux/slab.h>
14079 -#include <linux/user.h>
14080 -#include <linux/screen_info.h>
14081 -#include <linux/ioport.h>
14082 -#include <linux/delay.h>
14083 -#include <linux/init.h>
14084 -#include <linux/initrd.h>
14085 -#include <linux/highmem.h>
14086 -#include <linux/bootmem.h>
14087 -#include <linux/module.h>
14088 -#include <asm/processor.h>
14089 -#include <linux/console.h>
14090 -#include <linux/seq_file.h>
14091 -#include <linux/crash_dump.h>
14092 -#include <linux/root_dev.h>
14093 -#include <linux/pci.h>
14094 -#include <asm/pci-direct.h>
14095 -#include <linux/efi.h>
14096 -#include <linux/acpi.h>
14097 -#include <linux/kallsyms.h>
14098 -#include <linux/edd.h>
14099 -#include <linux/iscsi_ibft.h>
14100 -#include <linux/mmzone.h>
14101 -#include <linux/kexec.h>
14102 -#include <linux/cpufreq.h>
14103 -#include <linux/dmi.h>
14104 -#include <linux/dma-mapping.h>
14105 -#include <linux/ctype.h>
14106 -#include <linux/sort.h>
14107 -#include <linux/uaccess.h>
14108 -#include <linux/init_ohci1394_dma.h>
14109 -#include <linux/kvm_para.h>
14111 -#include <asm/mtrr.h>
14112 -#include <asm/uaccess.h>
14113 -#include <asm/system.h>
14114 -#include <asm/vsyscall.h>
14115 -#include <asm/io.h>
14116 -#include <asm/smp.h>
14117 -#include <asm/msr.h>
14118 -#include <asm/desc.h>
14119 -#include <video/edid.h>
14120 -#include <asm/e820.h>
14121 -#include <asm/dma.h>
14122 -#include <asm/gart.h>
14123 -#include <asm/mpspec.h>
14124 -#include <asm/mmu_context.h>
14125 -#include <asm/proto.h>
14126 -#include <asm/setup.h>
14127 -#include <asm/numa.h>
14128 -#include <asm/sections.h>
14129 -#include <asm/dmi.h>
14130 -#include <asm/cacheflush.h>
14131 -#include <asm/mce.h>
14132 -#include <asm/ds.h>
14133 -#include <asm/topology.h>
14134 -#include <asm/pat.h>
14136 -#include <mach_apic.h>
14138 -#include <linux/percpu.h>
14139 -#include <xen/interface/physdev.h>
14140 -#include "setup_arch_pre.h"
14141 -#include <asm/hypervisor.h>
14142 -#include <xen/interface/nmi.h>
14143 -#include <xen/features.h>
14144 -#include <xen/firmware.h>
14145 -#include <xen/xencons.h>
14146 -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
14147 -#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
14148 -#include <asm/mach-xen/setup_arch_post.h>
14149 -#include <xen/interface/memory.h>
14152 -#include <xen/interface/kexec.h>
14155 -extern unsigned long start_pfn;
14156 -extern struct edid_info edid_info;
14158 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
14159 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
14161 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
14162 -static struct notifier_block xen_panic_block = {
14163 - xen_panic_event, NULL, 0 /* try to go last */
14166 -unsigned long *phys_to_machine_mapping;
14167 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
14169 -EXPORT_SYMBOL(phys_to_machine_mapping);
14171 -DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
14172 -DEFINE_PER_CPU(int, nr_multicall_ents);
14174 -/* Raw start-of-day parameters from the hypervisor. */
14175 -start_info_t *xen_start_info;
14176 -EXPORT_SYMBOL(xen_start_info);
14180 - * Machine setup.
14183 -struct cpuinfo_x86 boot_cpu_data __read_mostly;
14184 -EXPORT_SYMBOL(boot_cpu_data);
14186 -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
14188 -unsigned long mmu_cr4_features;
14190 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
14191 -int bootloader_type;
14193 -unsigned long saved_video_mode;
14195 -int force_mwait __cpuinitdata;
14198 - * Early DMI memory
14200 -int dmi_alloc_index;
14201 -char dmi_alloc_data[DMI_MAX_DATA];
14206 -struct screen_info screen_info;
14207 -EXPORT_SYMBOL(screen_info);
14208 -struct sys_desc_table_struct {
14209 - unsigned short length;
14210 - unsigned char table[0];
14213 -struct edid_info edid_info;
14214 -EXPORT_SYMBOL_GPL(edid_info);
14216 -extern int root_mountflags;
14218 -char __initdata command_line[COMMAND_LINE_SIZE];
14220 -static struct resource standard_io_resources[] = {
14221 - { .name = "dma1", .start = 0x00, .end = 0x1f,
14222 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14223 - { .name = "pic1", .start = 0x20, .end = 0x21,
14224 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14225 - { .name = "timer0", .start = 0x40, .end = 0x43,
14226 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14227 - { .name = "timer1", .start = 0x50, .end = 0x53,
14228 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14229 - { .name = "keyboard", .start = 0x60, .end = 0x60,
14230 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14231 - { .name = "keyboard", .start = 0x64, .end = 0x64,
14232 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14233 - { .name = "dma page reg", .start = 0x80, .end = 0x8f,
14234 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14235 - { .name = "pic2", .start = 0xa0, .end = 0xa1,
14236 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14237 - { .name = "dma2", .start = 0xc0, .end = 0xdf,
14238 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14239 - { .name = "fpu", .start = 0xf0, .end = 0xff,
14240 - .flags = IORESOURCE_BUSY | IORESOURCE_IO }
14243 -#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
14245 -static struct resource data_resource = {
14246 - .name = "Kernel data",
14249 - .flags = IORESOURCE_RAM,
14251 -static struct resource code_resource = {
14252 - .name = "Kernel code",
14255 - .flags = IORESOURCE_RAM,
14257 -static struct resource bss_resource = {
14258 - .name = "Kernel bss",
14261 - .flags = IORESOURCE_RAM,
14264 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
14266 -#ifdef CONFIG_PROC_VMCORE
14267 -/* elfcorehdr= specifies the location of the ELF core header
14268 - * stored by the crashed kernel. This option will be passed
14269 - * by the kexec loader to the capture kernel.
14271 -static int __init setup_elfcorehdr(char *arg)
14276 - elfcorehdr_addr = memparse(arg, &end);
14277 - return end > arg ? 0 : -EINVAL;
14279 -early_param("elfcorehdr", setup_elfcorehdr);
14282 -#ifndef CONFIG_NUMA
14283 -static void __init
14284 -contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
14286 - unsigned long bootmap_size, bootmap;
14288 - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
14289 - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
14291 - if (bootmap == -1L)
14292 - panic("Cannot find bootmem map of size %ld\n", bootmap_size);
14293 - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
14294 - e820_register_active_regions(0, start_pfn, end_pfn);
14296 - free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
14297 - early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
14299 - free_bootmem_with_active_regions(0, end_pfn);
14300 - early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
14302 - reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
14306 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
14308 -#ifdef CONFIG_EDD_MODULE
14309 -EXPORT_SYMBOL(edd);
14311 -#ifndef CONFIG_XEN
14313 - * copy_edd() - Copy the BIOS EDD information
14314 - * from boot_params into a safe place.
14317 -static inline void copy_edd(void)
14319 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
14320 - sizeof(edd.mbr_signature));
14321 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
14322 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
14323 - edd.edd_info_nr = boot_params.eddbuf_entries;
14327 -static inline void copy_edd(void)
14332 -#ifdef CONFIG_KEXEC
14333 -#ifndef CONFIG_XEN
14334 -static void __init reserve_crashkernel(void)
14336 - unsigned long long total_mem;
14337 - unsigned long long crash_size, crash_base;
14340 - total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
14342 - ret = parse_crashkernel(boot_command_line, total_mem,
14343 - &crash_size, &crash_base);
14344 - if (ret == 0 && crash_size) {
14345 - if (crash_base <= 0) {
14346 - printk(KERN_INFO "crashkernel reservation failed - "
14347 - "you have to specify a base address\n");
14351 - if (reserve_bootmem(crash_base, crash_size,
14352 - BOOTMEM_EXCLUSIVE) < 0) {
14353 - printk(KERN_INFO "crashkernel reservation failed - "
14354 - "memory is in use\n");
14358 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
14359 - "for crashkernel (System RAM: %ldMB)\n",
14360 - (unsigned long)(crash_size >> 20),
14361 - (unsigned long)(crash_base >> 20),
14362 - (unsigned long)(total_mem >> 20));
14363 - crashk_res.start = crash_base;
14364 - crashk_res.end = crash_base + crash_size - 1;
14365 - insert_resource(&iomem_resource, &crashk_res);
14369 -#define reserve_crashkernel xen_machine_kexec_setup_resources
14372 -static inline void __init reserve_crashkernel(void)
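
Both reserve_crashkernel() variants rely on parse_crashkernel() to decode the "crashkernel=size[@base]" boot option against total system RAM; the reservation only proceeds when a size and an explicit base were both given, since these kernels do not pick a base on their own. A small usage sketch with hypothetical values:

	#include <linux/kexec.h>
	#include <linux/kernel.h>

	/* e.g. booted with "crashkernel=64M@16M" on a 2GB machine */
	static void __init crashkernel_example(char *cmdline)
	{
		unsigned long long crash_size, crash_base;
		unsigned long long total_mem = 2ULL << 30;

		if (parse_crashkernel(cmdline, total_mem,
				      &crash_size, &crash_base) == 0 &&
		    crash_size && crash_base)
			pr_info("would reserve %lluMB at %lluMB\n",
				crash_size >> 20, crash_base >> 20);
	}
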
14376 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
14377 -void __attribute__((weak)) __init memory_setup(void)
14379 - machine_specific_memory_setup();
14382 -static void __init parse_setup_data(void)
14384 - struct setup_data *data;
14385 - unsigned long pa_data;
14387 - if (boot_params.hdr.version < 0x0209)
14389 - pa_data = boot_params.hdr.setup_data;
14390 - while (pa_data) {
14391 - data = early_ioremap(pa_data, PAGE_SIZE);
14392 - switch (data->type) {
14396 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
14397 - free_early(pa_data, pa_data+sizeof(*data)+data->len);
14399 - pa_data = data->next;
14400 - early_iounmap(data, PAGE_SIZE);
14404 -#ifdef CONFIG_PCI_MMCONFIG
14405 -extern void __cpuinit fam10h_check_enable_mmcfg(void);
14406 -extern void __init check_enable_amd_mmconf_dmi(void);
14408 -void __cpuinit fam10h_check_enable_mmcfg(void)
14411 -void __init check_enable_amd_mmconf_dmi(void)
14417 - * setup_arch - architecture-specific boot-time initializations
14419 - * Note: On x86_64, fixmaps are ready for use even before this is called.
14421 -void __init setup_arch(char **cmdline_p)
14426 - extern struct e820map machine_e820;
14428 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14430 - /* Register a call for panic conditions. */
14431 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
14433 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
14434 - VMASST_TYPE_writable_pagetables));
14436 - early_ioremap_init();
14438 - ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
14439 - screen_info = boot_params.screen_info;
14441 - if (is_initial_xendomain()) {
14442 - const struct dom0_vga_console_info *info =
14443 - (void *)((char *)xen_start_info +
14444 - xen_start_info->console.dom0.info_off);
14446 - dom0_init_screen_info(info,
14447 - xen_start_info->console.dom0.info_size);
14448 - xen_start_info->console.domU.mfn = 0;
14449 - xen_start_info->console.domU.evtchn = 0;
14451 - screen_info.orig_video_isVGA = 0;
14455 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14457 - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
14458 - screen_info = boot_params.screen_info;
14459 - edid_info = boot_params.edid_info;
14460 -#endif /* !CONFIG_XEN */
14461 - saved_video_mode = boot_params.hdr.vid_mode;
14462 - bootloader_type = boot_params.hdr.type_of_loader;
14464 -#ifdef CONFIG_BLK_DEV_RAM
14465 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
14466 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
14467 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
14470 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
14480 - if (!boot_params.hdr.root_flags)
14481 - root_mountflags &= ~MS_RDONLY;
14482 - init_mm.start_code = (unsigned long) &_text;
14483 - init_mm.end_code = (unsigned long) &_etext;
14484 - init_mm.end_data = (unsigned long) &_edata;
14485 - init_mm.brk = (unsigned long) &_end;
14487 - code_resource.start = virt_to_phys(&_text);
14488 - code_resource.end = virt_to_phys(&_etext)-1;
14489 - data_resource.start = virt_to_phys(&_etext);
14490 - data_resource.end = virt_to_phys(&_edata)-1;
14491 - bss_resource.start = virt_to_phys(&__bss_start);
14492 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
14494 - early_identify_cpu(&boot_cpu_data);
14496 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
14497 - *cmdline_p = command_line;
14499 - parse_setup_data();
14501 - parse_early_param();
14503 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
14504 - if (init_ohci1394_dma_early)
14505 - init_ohci1394_dma_on_all_controllers();
14508 - finish_e820_parsing();
14510 -#ifndef CONFIG_XEN
14511 - /* after parse_early_param, so could debug it */
14512 - insert_resource(&iomem_resource, &code_resource);
14513 - insert_resource(&iomem_resource, &data_resource);
14514 - insert_resource(&iomem_resource, &bss_resource);
14517 - early_gart_iommu_check();
14519 - e820_register_active_regions(0, 0, -1UL);
14521 - * partially used pages are not usable - thus
14522 - * we are rounding upwards:
14524 - end_pfn = e820_end_of_ram();
14525 - /* update e820 for memory not covered by WB MTRRs */
14527 -#ifndef CONFIG_XEN
14528 - if (mtrr_trim_uncached_memory(end_pfn)) {
14529 - e820_register_active_regions(0, 0, -1UL);
14530 - end_pfn = e820_end_of_ram();
14534 - num_physpages = end_pfn;
14535 - max_mapnr = end_pfn;
14539 - max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
14543 -#ifndef CONFIG_XEN
14547 - if (is_initial_xendomain())
14548 - dmi_scan_machine();
14552 -#ifdef CONFIG_KVM_CLOCK
14556 -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
14557 - /* setup to use the early static init tables during kernel startup */
14558 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
14559 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
14560 -#ifdef CONFIG_NUMA
14561 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
14565 - /* How many end-of-memory variables you have, grandma! */
14566 - max_low_pfn = end_pfn;
14567 - max_pfn = end_pfn;
14568 - high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
14570 - /* Remove active ranges so rediscovery with NUMA-awareness happens */
14571 - remove_all_active_ranges();
14573 -#ifdef CONFIG_ACPI_NUMA
14575 - * Parse SRAT to discover nodes.
14577 - acpi_numa_init();
14580 -#ifdef CONFIG_NUMA
14581 - numa_initmem_init(0, end_pfn);
14583 - contig_initmem_init(0, end_pfn);
14586 -#ifndef CONFIG_XEN
14587 - dma32_reserve_bootmem();
14589 -#ifdef CONFIG_ACPI_SLEEP
14591 - * Reserve low memory region for sleep support.
14593 - acpi_reserve_bootmem();
14597 - efi_reserve_bootmem();
14600 -#ifdef CONFIG_BLK_DEV_INITRD
14602 - if (xen_start_info->mod_start) {
14603 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
14604 - unsigned long ramdisk_size = xen_start_info->mod_len;
14606 - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
14607 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
14608 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
14610 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
14611 - unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
14613 - if (ramdisk_end <= end_of_mem) {
14615 -		 * don't need to reserve again; already reserved early
14616 -		 * in x86_64_start_kernel, and early_res_to_bootmem
14617 -		 * converts that to reserved in bootmem
14619 - initrd_start = ramdisk_image + PAGE_OFFSET;
14620 - initrd_end = initrd_start+ramdisk_size;
14622 - initrd_below_start_ok = 1;
14625 - free_bootmem(ramdisk_image, ramdisk_size);
14626 - printk(KERN_ERR "initrd extends beyond end of memory "
14627 - "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
14628 - ramdisk_end, end_of_mem);
14629 - initrd_start = 0;
14633 - reserve_crashkernel();
14635 - reserve_ibft_region();
14639 -#ifdef CONFIG_X86_LOCAL_APIC
14641 - * Find and reserve possible boot-time SMP configuration:
14643 - find_smp_config();
14647 - int i, j, k, fpp;
14648 - unsigned long p2m_pages;
14650 - p2m_pages = end_pfn;
14651 - if (xen_start_info->nr_pages > end_pfn) {
14653 - * the end_pfn was shrunk (probably by mem= or highmem=
14654 - * kernel parameter); shrink reservation with the HV
14656 - struct xen_memory_reservation reservation = {
14657 - .address_bits = 0,
14658 - .extent_order = 0,
14659 - .domid = DOMID_SELF
14661 - unsigned int difference;
14664 - difference = xen_start_info->nr_pages - end_pfn;
14666 - set_xen_guest_handle(reservation.extent_start,
14667 - ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
14668 - reservation.nr_extents = difference;
14669 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
14671 - BUG_ON (ret != difference);
14673 - else if (end_pfn > xen_start_info->nr_pages)
14674 - p2m_pages = xen_start_info->nr_pages;
14676 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
14677 - /* Make sure we have a large enough P->M table. */
14678 - phys_to_machine_mapping = alloc_bootmem_pages(
14679 - end_pfn * sizeof(unsigned long));
14680 - memset(phys_to_machine_mapping, ~0,
14681 - end_pfn * sizeof(unsigned long));
14682 - memcpy(phys_to_machine_mapping,
14683 - (unsigned long *)xen_start_info->mfn_list,
14684 - p2m_pages * sizeof(unsigned long));
14686 - __pa(xen_start_info->mfn_list),
14687 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
14688 - sizeof(unsigned long))));
14691 - * Initialise the list of the frames that specify the
14692 - * list of frames that make up the p2m table. Used by
14695 - pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
14697 - fpp = PAGE_SIZE/sizeof(unsigned long);
14698 - for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
14699 - if ((j % fpp) == 0) {
14702 - pfn_to_mfn_frame_list[k] =
14703 - alloc_bootmem_pages(PAGE_SIZE);
14704 - pfn_to_mfn_frame_list_list[k] =
14705 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
14708 - pfn_to_mfn_frame_list[k][j] =
14709 - virt_to_mfn(&phys_to_machine_mapping[i]);
14711 - HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
14712 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
14713 - virt_to_mfn(pfn_to_mfn_frame_list_list);
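The loop above builds a two-level lookup: one page holds fpp = PAGE_SIZE/sizeof(unsigned long) entries, so covering end_pfn PFNs takes roundup(end_pfn/fpp) frame-list entries, and every fpp of those entries needs a fresh frame-list page recorded in pfn_to_mfn_frame_list_list. A minimal user-space sketch of that sizing arithmetic (illustrative only, not kernel code):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        unsigned long end_pfn = 1UL << 20;  /* hypothetical 4GB domain */
        unsigned long fpp = PAGE_SIZE / sizeof(unsigned long);
        /* one frame-list entry per page of p2m entries */
        unsigned long p2m_frames = (end_pfn + fpp - 1) / fpp;
        /* one frame-list page per fpp frame-list entries */
        unsigned long list_pages = (p2m_frames + fpp - 1) / fpp;

        printf("%lu PFNs -> %lu frame-list entries in %lu page(s)\n",
               end_pfn, p2m_frames, list_pages);
        return 0;
    }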
14716 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
14717 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
14718 - if (i != 4 && request_dma(i, "xen") != 0)
14722 -#ifdef CONFIG_ACPI
14723 - if (!is_initial_xendomain()) {
14724 - acpi_disabled = 1;
14730 -#ifndef CONFIG_XEN
14734 -#ifdef CONFIG_ACPI
14736 - * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
14737 - * Call this early for SRAT node setup.
14739 - acpi_boot_table_init();
14742 - * Read APIC and some other early information from ACPI tables.
14744 - acpi_boot_init();
14747 - init_cpu_to_node();
14749 -#ifdef CONFIG_X86_LOCAL_APIC
14751 - * get boot-time SMP configuration:
14753 - if (smp_found_config)
14754 - get_smp_config();
14755 -#ifndef CONFIG_XEN
14756 - init_apic_mappings();
14757 - ioapic_init_mappings();
14760 -#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
14761 - prefill_possible_map();
14764 - kvm_guest_init();
14767 - * We trust e820 completely. No explicit ROM probing in memory.
14770 - if (is_initial_xendomain())
14771 - e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
14773 - e820_reserve_resources(e820.map, e820.nr_map);
14774 - e820_mark_nosave_regions();
14777 - /* request I/O space for devices used on all i[345]86 PCs */
14778 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14779 - request_resource(&ioport_resource, &standard_io_resources[i]);
14782 - if (is_initial_xendomain())
14783 - e820_setup_gap(machine_e820.map, machine_e820.nr_map);
14785 - e820_setup_gap(e820.map, e820.nr_map);
14790 - struct physdev_set_iopl set_iopl;
14792 - set_iopl.iopl = 1;
14793 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
14795 - if (is_initial_xendomain()) {
14797 -#if defined(CONFIG_VGA_CONSOLE)
14798 - conswitchp = &vga_con;
14799 -#elif defined(CONFIG_DUMMY_CONSOLE)
14800 - conswitchp = &dummy_con;
14804 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14805 - conswitchp = &dummy_con;
14809 -#else /* CONFIG_XEN */
14812 -#if defined(CONFIG_VGA_CONSOLE)
14813 - if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14814 - conswitchp = &vga_con;
14815 -#elif defined(CONFIG_DUMMY_CONSOLE)
14816 - conswitchp = &dummy_con;
14820 -#endif /* !CONFIG_XEN */
14822 - /* do this before identify_cpu for boot cpu */
14823 - check_enable_amd_mmconf_dmi();
14828 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14830 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14831 - /* we're never actually going to get here... */
14832 - return NOTIFY_DONE;
14834 -#endif /* !CONFIG_XEN */
14837 -static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
14841 - if (c->extended_cpuid_level < 0x80000004)
14844 - v = (unsigned int *) c->x86_model_id;
14845 - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
14846 - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
14847 - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
14848 - c->x86_model_id[48] = 0;
14853 -static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
14855 - unsigned int n, dummy, eax, ebx, ecx, edx;
14857 - n = c->extended_cpuid_level;
14859 - if (n >= 0x80000005) {
14860 - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
14861 - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
14862 - "D cache %dK (%d bytes/line)\n",
14863 - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
14864 - c->x86_cache_size = (ecx>>24) + (edx>>24);
14865 - /* On K8 L1 TLB is inclusive, so don't count it */
14866 - c->x86_tlbsize = 0;
14869 - if (n >= 0x80000006) {
14870 - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
14871 - ecx = cpuid_ecx(0x80000006);
14872 - c->x86_cache_size = ecx >> 16;
14873 - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
14875 - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
14876 - c->x86_cache_size, ecx & 0xFF);
14878 - if (n >= 0x80000008) {
14879 - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
14880 - c->x86_virt_bits = (eax >> 8) & 0xff;
14881 - c->x86_phys_bits = eax & 0xff;
14885 -#ifdef CONFIG_NUMA
14886 -static int __cpuinit nearby_node(int apicid)
14890 - for (i = apicid - 1; i >= 0; i--) {
14891 - node = apicid_to_node[i];
14892 - if (node != NUMA_NO_NODE && node_online(node))
14895 - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
14896 - node = apicid_to_node[i];
14897 - if (node != NUMA_NO_NODE && node_online(node))
14900 - return first_node(node_online_map); /* Shouldn't happen */
14905 - * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
14906 - * Assumes number of cores is a power of two.
14908 -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
14912 -#ifdef CONFIG_NUMA
14913 - int cpu = smp_processor_id();
14915 - unsigned apicid = hard_smp_processor_id();
14917 - bits = c->x86_coreid_bits;
14919 - /* Low order bits define the core id (index of core in socket) */
14920 - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
14921 - /* Convert the initial APIC ID into the socket ID */
14922 - c->phys_proc_id = c->initial_apicid >> bits;
14924 -#ifdef CONFIG_NUMA
14925 - node = c->phys_proc_id;
14926 - if (apicid_to_node[apicid] != NUMA_NO_NODE)
14927 - node = apicid_to_node[apicid];
14928 - if (!node_online(node)) {
14929 - /* Two possibilities here:
14930 - - The CPU is missing memory and no node was created.
14931 - In that case try picking one from a nearby CPU
14932 - - The APIC IDs differ from the HyperTransport node IDs
14933 - which the K8 northbridge parsing fills in.
14934 - Assume they are all increased by a constant offset,
14935 - but in the same order as the HT nodeids.
14936 - If that doesn't result in a usable node fall back to the
14937 - path for the previous case. */
14939 - int ht_nodeid = c->initial_apicid;
14941 - if (ht_nodeid >= 0 &&
14942 - apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
14943 - node = apicid_to_node[ht_nodeid];
14944 - /* Pick a nearby node */
14945 - if (!node_online(node))
14946 - node = nearby_node(apicid);
14948 - numa_set_node(cpu, node);
14950 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
14955 -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
14958 - unsigned bits, ecx;
14960 - /* Multi core CPU? */
14961 - if (c->extended_cpuid_level < 0x80000008)
14964 - ecx = cpuid_ecx(0x80000008);
14966 - c->x86_max_cores = (ecx & 0xff) + 1;
14968 - /* CPU telling us the core id bits shift? */
14969 - bits = (ecx >> 12) & 0xF;
14971 - /* Otherwise recompute */
14973 - while ((1 << bits) < c->x86_max_cores)
14977 - c->x86_coreid_bits = bits;
14982 -#define ENABLE_C1E_MASK 0x18000000
14983 -#define CPUID_PROCESSOR_SIGNATURE 1
14984 -#define CPUID_XFAM 0x0ff00000
14985 -#define CPUID_XFAM_K8 0x00000000
14986 -#define CPUID_XFAM_10H 0x00100000
14987 -#define CPUID_XFAM_11H 0x00200000
14988 -#define CPUID_XMOD 0x000f0000
14989 -#define CPUID_XMOD_REV_F 0x00040000
14991 -#ifndef CONFIG_XEN
14992 -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
14993 -static __cpuinit int amd_apic_timer_broken(void)
14995 - u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
14997 - switch (eax & CPUID_XFAM) {
14998 - case CPUID_XFAM_K8:
14999 - if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
15001 - case CPUID_XFAM_10H:
15002 - case CPUID_XFAM_11H:
15003 - rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
15004 - if (lo & ENABLE_C1E_MASK)
15008 - /* err on the side of caution */
15015 -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
15017 - early_init_amd_mc(c);
15019 - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
15020 - if (c->x86_power & (1<<8))
15021 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15024 -static void __cpuinit init_amd(struct cpuinfo_x86 *c)
15029 - unsigned long value;
15032 - * Disable TLB flush filter by setting HWCR.FFDIS on K8
15033 - * bit 6 of msr C001_0015
15035 - * Errata 63 for SH-B3 steppings
15036 - * Errata 122 for all steppings (F+ have it disabled by default)
15038 - if (c->x86 == 15) {
15039 - rdmsrl(MSR_K8_HWCR, value);
15041 - wrmsrl(MSR_K8_HWCR, value);
15045 - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
15046 - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
15047 - clear_cpu_cap(c, 0*32+31);
15049 - /* On C+ stepping K8 rep microcode works well for copy/memset */
15050 - level = cpuid_eax(1);
15051 - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
15052 - level >= 0x0f58))
15053 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15054 - if (c->x86 == 0x10 || c->x86 == 0x11)
15055 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15057 - /* Enable workaround for FXSAVE leak */
15059 - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
15061 - level = get_model_name(c);
15063 - switch (c->x86) {
15065 - /* Should distinguish Models here, but this is only
15066 - a fallback anyways. */
15067 - strcpy(c->x86_model_id, "Hammer");
15071 - display_cacheinfo(c);
15073 - /* Multi core CPU? */
15074 - if (c->extended_cpuid_level >= 0x80000008)
15075 - amd_detect_cmp(c);
15077 - if (c->extended_cpuid_level >= 0x80000006 &&
15078 - (cpuid_edx(0x80000006) & 0xf000))
15079 - num_cache_leaves = 4;
15081 - num_cache_leaves = 3;
15083 - if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
15084 - set_cpu_cap(c, X86_FEATURE_K8);
15086 - /* MFENCE stops RDTSC speculation */
15087 - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
15089 - if (c->x86 == 0x10)
15090 - fam10h_check_enable_mmcfg();
15092 -#ifndef CONFIG_XEN
15093 - if (amd_apic_timer_broken())
15094 - disable_apic_timer = 1;
15096 - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
15097 - unsigned long long tseg;
15100 - * Split up direct mapping around the TSEG SMM area.
15101 - * Don't do it for gbpages because there seems very little
15102 - * benefit in doing so.
15104 - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
15105 - (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
15106 - set_memory_4k((unsigned long)__va(tseg), 1);
15111 -void __cpuinit detect_ht(struct cpuinfo_x86 *c)
15114 - u32 eax, ebx, ecx, edx;
15115 - int index_msb, core_bits;
15117 - cpuid(1, &eax, &ebx, &ecx, &edx);
15120 - if (!cpu_has(c, X86_FEATURE_HT))
15122 - if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
15125 - smp_num_siblings = (ebx & 0xff0000) >> 16;
15127 - if (smp_num_siblings == 1) {
15128 - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
15129 - } else if (smp_num_siblings > 1) {
15131 - if (smp_num_siblings > NR_CPUS) {
15132 - printk(KERN_WARNING "CPU: Unsupported number of "
15133 - "siblings %d", smp_num_siblings);
15134 - smp_num_siblings = 1;
15138 - index_msb = get_count_order(smp_num_siblings);
15139 - c->phys_proc_id = phys_pkg_id(index_msb);
15141 - smp_num_siblings = smp_num_siblings / c->x86_max_cores;
15143 - index_msb = get_count_order(smp_num_siblings);
15145 - core_bits = get_count_order(c->x86_max_cores);
15147 - c->cpu_core_id = phys_pkg_id(index_msb) &
15148 - ((1 << core_bits) - 1);
15151 - if ((c->x86_max_cores * smp_num_siblings) > 1) {
15152 - printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
15153 - c->phys_proc_id);
15154 - printk(KERN_INFO "CPU: Processor Core ID: %d\n",
15162 - * find out the number of processor cores on the die
15164 -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
15166 - unsigned int eax, t;
15168 - if (c->cpuid_level < 4)
15171 - cpuid_count(4, 0, &eax, &t, &t, &t);
15174 - return ((eax >> 26) + 1);
15179 -static void __cpuinit srat_detect_node(void)
15181 -#ifdef CONFIG_NUMA
15183 - int cpu = smp_processor_id();
15184 - int apicid = hard_smp_processor_id();
15186 - /* Don't do the funky fallback heuristics the AMD version employs
15188 - node = apicid_to_node[apicid];
15189 - if (node == NUMA_NO_NODE || !node_online(node))
15190 - node = first_node(node_online_map);
15191 - numa_set_node(cpu, node);
15193 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
15197 -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
15199 - if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
15200 - (c->x86 == 0x6 && c->x86_model >= 0x0e))
15201 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15204 -static void __cpuinit init_intel(struct cpuinfo_x86 *c)
15206 - /* Cache sizes */
15209 - init_intel_cacheinfo(c);
15210 - if (c->cpuid_level > 9) {
15211 - unsigned eax = cpuid_eax(10);
15212 - /* Check for version and the number of counters */
15213 - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
15214 - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
15217 - if (cpu_has_ds) {
15218 - unsigned int l1, l2;
15219 - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
15220 - if (!(l1 & (1<<11)))
15221 - set_cpu_cap(c, X86_FEATURE_BTS);
15222 - if (!(l1 & (1<<12)))
15223 - set_cpu_cap(c, X86_FEATURE_PEBS);
15228 - ds_init_intel(c);
15230 - n = c->extended_cpuid_level;
15231 - if (n >= 0x80000008) {
15232 - unsigned eax = cpuid_eax(0x80000008);
15233 - c->x86_virt_bits = (eax >> 8) & 0xff;
15234 - c->x86_phys_bits = eax & 0xff;
15235 - /* CPUID workaround for Intel 0F34 CPU */
15236 - if (c->x86_vendor == X86_VENDOR_INTEL &&
15237 - c->x86 == 0xF && c->x86_model == 0x3 &&
15238 - c->x86_mask == 0x4)
15239 - c->x86_phys_bits = 36;
15242 - if (c->x86 == 15)
15243 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15245 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15246 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15247 - c->x86_max_cores = intel_num_cpu_cores(c);
15249 - srat_detect_node();
15252 -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
15254 - if (c->x86 == 0x6 && c->x86_model >= 0xf)
15255 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15258 -static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
15260 - /* Cache sizes */
15263 - n = c->extended_cpuid_level;
15264 - if (n >= 0x80000008) {
15265 - unsigned eax = cpuid_eax(0x80000008);
15266 - c->x86_virt_bits = (eax >> 8) & 0xff;
15267 - c->x86_phys_bits = eax & 0xff;
15270 - if (c->x86 == 0x6 && c->x86_model >= 0xf) {
15271 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15272 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15273 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15275 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15278 -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
15280 - char *v = c->x86_vendor_id;
15282 - if (!strcmp(v, "AuthenticAMD"))
15283 - c->x86_vendor = X86_VENDOR_AMD;
15284 - else if (!strcmp(v, "GenuineIntel"))
15285 - c->x86_vendor = X86_VENDOR_INTEL;
15286 - else if (!strcmp(v, "CentaurHauls"))
15287 - c->x86_vendor = X86_VENDOR_CENTAUR;
15289 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15292 -/* Do some early cpuid on the boot CPU to get some parameters that are
15293 - needed before check_bugs. Everything advanced is in identify_cpu
15295 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
15299 - c->loops_per_jiffy = loops_per_jiffy;
15300 - c->x86_cache_size = -1;
15301 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15302 - c->x86_model = c->x86_mask = 0; /* So far unknown... */
15303 - c->x86_vendor_id[0] = '\0'; /* Unset */
15304 - c->x86_model_id[0] = '\0'; /* Unset */
15305 - c->x86_clflush_size = 64;
15306 - c->x86_cache_alignment = c->x86_clflush_size;
15307 - c->x86_max_cores = 1;
15308 - c->x86_coreid_bits = 0;
15309 - c->extended_cpuid_level = 0;
15310 - memset(&c->x86_capability, 0, sizeof c->x86_capability);
15312 - /* Get vendor name */
15313 - cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
15314 - (unsigned int *)&c->x86_vendor_id[0],
15315 - (unsigned int *)&c->x86_vendor_id[8],
15316 - (unsigned int *)&c->x86_vendor_id[4]);
15318 - get_cpu_vendor(c);
15320 - /* Initialize the standard set of capabilities */
15321 - /* Note that the vendor-specific code below might override */
15323 - /* Intel-defined flags: level 0x00000001 */
15324 - if (c->cpuid_level >= 0x00000001) {
15326 - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
15327 - &c->x86_capability[0]);
15328 - c->x86 = (tfms >> 8) & 0xf;
15329 - c->x86_model = (tfms >> 4) & 0xf;
15330 - c->x86_mask = tfms & 0xf;
15331 - if (c->x86 == 0xf)
15332 - c->x86 += (tfms >> 20) & 0xff;
15333 - if (c->x86 >= 0x6)
15334 - c->x86_model += ((tfms >> 16) & 0xF) << 4;
15335 - if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
15336 - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
15338 - /* Have CPUID level 0 only - unheard of */
15342 - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
15344 - c->phys_proc_id = c->initial_apicid;
15346 - /* AMD-defined flags: level 0x80000001 */
15347 - xlvl = cpuid_eax(0x80000000);
15348 - c->extended_cpuid_level = xlvl;
15349 - if ((xlvl & 0xffff0000) == 0x80000000) {
15350 - if (xlvl >= 0x80000001) {
15351 - c->x86_capability[1] = cpuid_edx(0x80000001);
15352 - c->x86_capability[6] = cpuid_ecx(0x80000001);
15354 - if (xlvl >= 0x80000004)
15355 - get_model_name(c); /* Default name */
15358 - /* Transmeta-defined flags: level 0x80860001 */
15359 - xlvl = cpuid_eax(0x80860000);
15360 - if ((xlvl & 0xffff0000) == 0x80860000) {
15361 - /* Don't set x86_cpuid_level here for now to not confuse. */
15362 - if (xlvl >= 0x80860001)
15363 - c->x86_capability[2] = cpuid_edx(0x80860001);
15366 - c->extended_cpuid_level = cpuid_eax(0x80000000);
15367 - if (c->extended_cpuid_level >= 0x80000007)
15368 - c->x86_power = cpuid_edx(0x80000007);
15370 - switch (c->x86_vendor) {
15371 - case X86_VENDOR_AMD:
15372 - early_init_amd(c);
15374 - case X86_VENDOR_INTEL:
15375 - early_init_intel(c);
15377 - case X86_VENDOR_CENTAUR:
15378 - early_init_centaur(c);
15382 - validate_pat_support(c);
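The family/model/stepping decode in early_identify_cpu() above follows the standard CPUID leaf-1 layout: the extended family byte is added in only when the base family reads 0xf, and the extended model nibble is prepended from family 6 upwards. A runnable illustration of just that arithmetic (the sample EAX value is a typical Nehalem signature, given here only as an example):

    #include <stdio.h>

    int main(void)
    {
        unsigned tfms = 0x000106a5;  /* sample EAX from CPUID leaf 1 */
        unsigned family = (tfms >> 8) & 0xf;
        unsigned model = (tfms >> 4) & 0xf;
        unsigned stepping = tfms & 0xf;

        if (family == 0xf)
            family += (tfms >> 20) & 0xff;       /* extended family */
        if (family >= 0x6)
            model += ((tfms >> 16) & 0xf) << 4;  /* extended model */

        printf("family 0x%x, model 0x%x, stepping 0x%x\n",
               family, model, stepping);         /* 0x6, 0x1a, 0x5 */
        return 0;
    }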
15386 - * This does the hard work of actually picking apart the CPU stuff...
15388 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
15392 - early_identify_cpu(c);
15394 - init_scattered_cpuid_features(c);
15396 - c->apicid = phys_pkg_id(0);
15399 - * Vendor-specific initialization. In this section we
15400 - * canonicalize the feature flags, meaning if there are
15401 - * features a certain CPU supports which CPUID doesn't
15402 - * tell us, CPUID claiming incorrect flags, or other bugs,
15403 - * we handle them here.
15405 - * At the end of this section, c->x86_capability better
15406 - * indicate the features this CPU genuinely supports!
15408 - switch (c->x86_vendor) {
15409 - case X86_VENDOR_AMD:
15413 - case X86_VENDOR_INTEL:
15417 - case X86_VENDOR_CENTAUR:
15421 - case X86_VENDOR_UNKNOWN:
15423 - display_cacheinfo(c);
15430 - * On SMP, boot_cpu_data holds the common feature set between
15431 - * all CPUs; so make sure that we indicate which features are
15432 - * common between the CPUs. The first time this routine gets
15433 - * executed, c == &boot_cpu_data.
15435 - if (c != &boot_cpu_data) {
15436 - /* AND the already accumulated flags with these */
15437 - for (i = 0; i < NCAPINTS; i++)
15438 - boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
15441 - /* Clear all flags overridden by options */
15442 - for (i = 0; i < NCAPINTS; i++)
15443 - c->x86_capability[i] &= ~cleared_cpu_caps[i];
15445 -#ifdef CONFIG_X86_MCE
15448 - select_idle_routine(c);
15450 -#ifdef CONFIG_NUMA
15451 - numa_add_cpu(smp_processor_id());
15456 -void __cpuinit identify_boot_cpu(void)
15458 - identify_cpu(&boot_cpu_data);
15461 -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
15463 - BUG_ON(c == &boot_cpu_data);
15468 -static __init int setup_noclflush(char *arg)
15470 - setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
15473 -__setup("noclflush", setup_noclflush);
15475 -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
15477 - if (c->x86_model_id[0])
15478 - printk(KERN_CONT "%s", c->x86_model_id);
15480 - if (c->x86_mask || c->cpuid_level >= 0)
15481 - printk(KERN_CONT " stepping %02x\n", c->x86_mask);
15483 - printk(KERN_CONT "\n");
15486 -static __init int setup_disablecpuid(char *arg)
15489 - if (get_option(&arg, &bit) && bit < NCAPINTS*32)
15490 - setup_clear_cpu_cap(bit);
15495 -__setup("clearcpuid=", setup_disablecpuid);
15496 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15497 +++ sle11-2009-10-16/arch/x86/kernel/setup_percpu-xen.c 2009-06-04 10:21:39.000000000 +0200
15499 +#include <linux/kernel.h>
15500 +#include <linux/module.h>
15501 +#include <linux/init.h>
15502 +#include <linux/bootmem.h>
15503 +#include <linux/percpu.h>
15504 +#include <linux/kexec.h>
15505 +#include <linux/crash_dump.h>
15506 +#include <asm/smp.h>
15507 +#include <asm/percpu.h>
15508 +#include <asm/sections.h>
15509 +#include <asm/processor.h>
15510 +#include <asm/setup.h>
15511 +#include <asm/topology.h>
15512 +#include <asm/mpspec.h>
15513 +#include <asm/apicdef.h>
15514 +#include <asm/highmem.h>
15516 +#ifdef CONFIG_X86_LOCAL_APIC
15517 +unsigned int num_processors;
15518 +unsigned disabled_cpus __cpuinitdata;
15519 +/* Processor that is doing the boot up */
15520 +unsigned int boot_cpu_physical_apicid = -1U;
15521 +unsigned int max_physical_apicid;
15522 +EXPORT_SYMBOL(boot_cpu_physical_apicid);
15524 +/* Bitmask of physically existing CPUs */
15525 +physid_mask_t phys_cpu_present_map;
15528 +/* map cpu index to physical APIC ID */
15529 +#ifndef CONFIG_XEN
15530 +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
15531 +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
15532 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15533 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
15535 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
15536 +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15539 +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
15540 +#define X86_64_NUMA 1
15542 +/* map cpu index to node index */
15543 +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
15544 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
15546 +/* which logical CPUs are on which nodes */
15547 +cpumask_t *node_to_cpumask_map;
15548 +EXPORT_SYMBOL(node_to_cpumask_map);
15550 +/* setup node_to_cpumask_map */
15551 +static void __init setup_node_to_cpumask_map(void);
15554 +static inline void setup_node_to_cpumask_map(void) { }
15557 +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
15559 + * Copy data used in early init routines from the initial arrays to the
15560 + * per cpu data areas. These arrays then become expendable and the
15561 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
15563 +static void __init setup_per_cpu_maps(void)
15565 +#ifndef CONFIG_XEN
15568 + for_each_possible_cpu(cpu) {
15569 + per_cpu(x86_cpu_to_apicid, cpu) =
15570 + early_per_cpu_map(x86_cpu_to_apicid, cpu);
15571 + per_cpu(x86_bios_cpu_apicid, cpu) =
15572 + early_per_cpu_map(x86_bios_cpu_apicid, cpu);
15573 +#ifdef X86_64_NUMA
15574 + per_cpu(x86_cpu_to_node_map, cpu) =
15575 + early_per_cpu_map(x86_cpu_to_node_map, cpu);
15579 + /* indicate the early static arrays will soon be gone */
15580 + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
15581 + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
15582 +#ifdef X86_64_NUMA
15583 + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
15588 +#ifdef CONFIG_X86_32
15590 + * Great future not-so-futuristic plan: make i386 and x86_64 do it
15593 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
15594 +EXPORT_SYMBOL(__per_cpu_offset);
15595 +static inline void setup_cpu_pda_map(void) { }
15597 +#elif !defined(CONFIG_SMP)
15598 +static inline void setup_cpu_pda_map(void) { }
15600 +#else /* CONFIG_SMP && CONFIG_X86_64 */
15603 + * Allocate cpu_pda pointer table and array via alloc_bootmem.
15605 +static void __init setup_cpu_pda_map(void)
15608 + struct x8664_pda **new_cpu_pda;
15609 + unsigned long size;
15612 + size = roundup(sizeof(struct x8664_pda), cache_line_size());
15614 + /* allocate cpu_pda array and pointer table */
15616 + unsigned long tsize = nr_cpu_ids * sizeof(void *);
15617 + unsigned long asize = size * (nr_cpu_ids - 1);
15619 + tsize = roundup(tsize, cache_line_size());
15620 + new_cpu_pda = alloc_bootmem(tsize + asize);
15621 + pda = (char *)new_cpu_pda + tsize;
15624 + /* initialize pointer table to static pda's */
15625 + for_each_possible_cpu(cpu) {
15627 + /* leave boot cpu pda in place */
15628 + new_cpu_pda[0] = cpu_pda(0);
15631 + new_cpu_pda[cpu] = (struct x8664_pda *)pda;
15632 + new_cpu_pda[cpu]->in_bootmem = 1;
15636 + /* point to new pointer table */
15637 + _cpu_pda = new_cpu_pda;
15642 + * Great future plan:
15643 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
15644 + * Always point %gs to its beginning
15646 +void __init setup_per_cpu_areas(void)
15648 + ssize_t size = PERCPU_ENOUGH_ROOM;
15652 + /* Setup cpu_pda map */
15653 + setup_cpu_pda_map();
15655 + /* Copy section for each CPU (we discard the original) */
15656 + size = PERCPU_ENOUGH_ROOM;
15657 + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
15660 + for_each_possible_cpu(cpu) {
15661 +#ifndef CONFIG_NEED_MULTIPLE_NODES
15662 + ptr = alloc_bootmem_pages(size);
15664 + int node = early_cpu_to_node(cpu);
15665 + if (!node_online(node) || !NODE_DATA(node)) {
15666 + ptr = alloc_bootmem_pages(size);
15668 + "cpu %d has no node %d or node-local memory\n",
15672 + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
15674 + per_cpu_offset(cpu) = ptr - __per_cpu_start;
15675 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
15679 + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
15680 + NR_CPUS, nr_cpu_ids, nr_node_ids);
15682 + /* Setup percpu data maps */
15683 + setup_per_cpu_maps();
15685 + /* Setup node to cpumask map */
15686 + setup_node_to_cpumask_map();
15691 +#ifdef X86_64_NUMA
15694 + * Allocate node_to_cpumask_map based on number of available nodes
15695 + * Requires node_possible_map to be valid.
15697 + * Note: node_to_cpumask() is not valid until after this is done.
15699 +static void __init setup_node_to_cpumask_map(void)
15701 + unsigned int node, num = 0;
15704 + /* setup nr_node_ids if not done yet */
15705 + if (nr_node_ids == MAX_NUMNODES) {
15706 + for_each_node_mask(node, node_possible_map)
15708 + nr_node_ids = num + 1;
15711 + /* allocate the map */
15712 + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
15714 + pr_debug("Node to cpumask map at %p for %d nodes\n",
15715 + map, nr_node_ids);
15717 + /* node_to_cpumask() will now work */
15718 + node_to_cpumask_map = map;
15721 +void __cpuinit numa_set_node(int cpu, int node)
15723 + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
15725 + if (cpu_pda(cpu) && node != NUMA_NO_NODE)
15726 + cpu_pda(cpu)->nodenumber = node;
15728 + if (cpu_to_node_map)
15729 + cpu_to_node_map[cpu] = node;
15731 + else if (per_cpu_offset(cpu))
15732 + per_cpu(x86_cpu_to_node_map, cpu) = node;
15735 + pr_debug("Setting node for non-present cpu %d\n", cpu);
15738 +void __cpuinit numa_clear_node(int cpu)
15740 + numa_set_node(cpu, NUMA_NO_NODE);
15743 +#ifndef CONFIG_DEBUG_PER_CPU_MAPS
15745 +void __cpuinit numa_add_cpu(int cpu)
15747 + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
15750 +void __cpuinit numa_remove_cpu(int cpu)
15752 + cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
15755 +#else /* CONFIG_DEBUG_PER_CPU_MAPS */
15758 + * --------- debug versions of the numa functions ---------
15760 +static void __cpuinit numa_set_cpumask(int cpu, int enable)
15762 + int node = cpu_to_node(cpu);
15766 + if (node_to_cpumask_map == NULL) {
15767 + printk(KERN_ERR "node_to_cpumask_map NULL\n");
15772 + mask = &node_to_cpumask_map[node];
15774 + cpu_set(cpu, *mask);
15776 + cpu_clear(cpu, *mask);
15778 + cpulist_scnprintf(buf, sizeof(buf), *mask);
15779 + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
15780 + enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
15783 +void __cpuinit numa_add_cpu(int cpu)
15785 + numa_set_cpumask(cpu, 1);
15788 +void __cpuinit numa_remove_cpu(int cpu)
15790 + numa_set_cpumask(cpu, 0);
15793 +int cpu_to_node(int cpu)
15795 + if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
15796 + printk(KERN_WARNING
15797 + "cpu_to_node(%d): usage too early!\n", cpu);
15799 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15801 + return per_cpu(x86_cpu_to_node_map, cpu);
15803 +EXPORT_SYMBOL(cpu_to_node);
15806 + * Same function as cpu_to_node() but used if called before the
15807 + * per_cpu areas are setup.
15809 +int early_cpu_to_node(int cpu)
15811 + if (early_per_cpu_ptr(x86_cpu_to_node_map))
15812 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15814 + if (!per_cpu_offset(cpu)) {
15815 + printk(KERN_WARNING
15816 + "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
15818 + return NUMA_NO_NODE;
15820 + return per_cpu(x86_cpu_to_node_map, cpu);
15824 +/* empty cpumask */
15825 +static const cpumask_t cpu_mask_none;
15828 + * Returns a pointer to the bitmask of CPUs on Node 'node'.
15830 +const cpumask_t *_node_to_cpumask_ptr(int node)
15832 + if (node_to_cpumask_map == NULL) {
15833 + printk(KERN_WARNING
15834 + "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
15837 + return (const cpumask_t *)&cpu_online_map;
15839 + if (node >= nr_node_ids) {
15840 + printk(KERN_WARNING
15841 + "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
15842 + node, nr_node_ids);
15844 + return &cpu_mask_none;
15846 + return &node_to_cpumask_map[node];
15848 +EXPORT_SYMBOL(_node_to_cpumask_ptr);
15851 + * Returns a bitmask of CPUs on Node 'node'.
15853 + * Side note: this function creates the returned cpumask on the stack
15854 + * so with a high NR_CPUS count, excessive stack space is used. The
15855 + * node_to_cpumask_ptr function should be used whenever possible.
15857 +cpumask_t node_to_cpumask(int node)
15859 + if (node_to_cpumask_map == NULL) {
15860 + printk(KERN_WARNING
15861 + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
15863 + return cpu_online_map;
15865 + if (node >= nr_node_ids) {
15866 + printk(KERN_WARNING
15867 + "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
15868 + node, nr_node_ids);
15870 + return cpu_mask_none;
15872 + return node_to_cpumask_map[node];
15874 +EXPORT_SYMBOL(node_to_cpumask);
15877 + * --------- end of debug versions of the numa functions ---------
15880 +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
15882 +#endif /* X86_64_NUMA */
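The cpu_to_node()/early_cpu_to_node() pair in the new file follows one pattern throughout: consult the boot-time static array while it still exists, then fall back to the per-cpu copy once setup_per_cpu_maps() has run and zeroed the early pointer. A simplified user-space sketch of that hand-off (all names illustrative, not kernel API):

    #include <stdio.h>
    #include <stddef.h>

    #define NR_CPUS 4

    static int early_map[NR_CPUS] = { 0, 0, 1, 1 };
    static int *early_ptr = early_map;  /* zeroed once per-cpu areas exist */
    static int percpu_map[NR_CPUS];

    static int demo_cpu_to_node(int cpu)
    {
        if (early_ptr)              /* before the per-cpu copy */
            return early_ptr[cpu];
        return percpu_map[cpu];     /* after it */
    }

    int main(void)
    {
        int cpu;

        /* what setup_per_cpu_maps() does: copy, then retire the array */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
            percpu_map[cpu] = early_map[cpu];
        early_ptr = NULL;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
            printf("cpu %d -> node %d\n", cpu, demo_cpu_to_node(cpu));
        return 0;
    }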
15884 --- sle11-2009-10-16.orig/arch/x86/kernel/smp-xen.c 2009-03-16 16:38:05.000000000 +0100
15885 +++ sle11-2009-10-16/arch/x86/kernel/smp-xen.c 2009-06-04 10:21:39.000000000 +0200
15886 @@ -121,132 +121,14 @@ void xen_smp_send_reschedule(int cpu)
15887 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
15891 - * Structure and data for smp_call_function(). This is designed to minimise
15892 - * static memory requirements. It also looks cleaner.
15894 -static DEFINE_SPINLOCK(call_lock);
15896 -struct call_data_struct {
15897 - void (*func) (void *info);
15899 - atomic_t started;
15900 - atomic_t finished;
15904 -void lock_ipi_call_lock(void)
15905 +void xen_send_call_func_single_ipi(int cpu)
15907 - spin_lock_irq(&call_lock);
15908 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR);
15911 -void unlock_ipi_call_lock(void)
15912 +void xen_send_call_func_ipi(cpumask_t mask)
15914 - spin_unlock_irq(&call_lock);
15917 -static struct call_data_struct *call_data;
15919 -static void __smp_call_function(void (*func) (void *info), void *info,
15920 - int nonatomic, int wait)
15922 - struct call_data_struct data;
15923 - int cpus = num_online_cpus() - 1;
15928 - data.func = func;
15929 - data.info = info;
15930 - atomic_set(&data.started, 0);
15931 - data.wait = wait;
15933 - atomic_set(&data.finished, 0);
15935 - call_data = &data;
15938 - /* Send a message to all other CPUs and wait for them to respond */
15939 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
15941 - /* Wait for response */
15942 - while (atomic_read(&data.started) != cpus)
15946 - while (atomic_read(&data.finished) != cpus)
15952 - * smp_call_function_mask(): Run a function on a set of other CPUs.
15953 - * @mask: The set of cpus to run on. Must not include the current cpu.
15954 - * @func: The function to run. This must be fast and non-blocking.
15955 - * @info: An arbitrary pointer to pass to the function.
15956 - * @wait: If true, wait (atomically) until function has completed on other CPUs.
15958 - * Returns 0 on success, else a negative status code.
15960 - * If @wait is true, then returns once @func has returned; otherwise
15961 - * it returns just before the target cpu calls @func.
15963 - * You must not call this function with disabled interrupts or from a
15964 - * hardware interrupt handler or from a bottom half handler.
15967 -xen_smp_call_function_mask(cpumask_t mask,
15968 - void (*func)(void *), void *info,
15971 - struct call_data_struct data;
15972 - cpumask_t allbutself;
15975 - /* Can deadlock when called with interrupts disabled */
15976 - WARN_ON(irqs_disabled());
15978 - /* Holding any lock stops cpus from going down. */
15979 - spin_lock(&call_lock);
15981 - allbutself = cpu_online_map;
15982 - cpu_clear(smp_processor_id(), allbutself);
15984 - cpus_and(mask, mask, allbutself);
15985 - cpus = cpus_weight(mask);
15988 - spin_unlock(&call_lock);
15992 - data.func = func;
15993 - data.info = info;
15994 - atomic_set(&data.started, 0);
15995 - data.wait = wait;
15997 - atomic_set(&data.finished, 0);
15999 - call_data = &data;
16002 - /* Send a message to other CPUs */
16003 - if (cpus_equal(mask, allbutself) &&
16004 - cpus_equal(cpu_online_map, cpu_callout_map))
16005 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
16007 - send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16009 - /* Wait for response */
16010 - while (atomic_read(&data.started) != cpus)
16014 - while (atomic_read(&data.finished) != cpus)
16016 - spin_unlock(&call_lock);
16019 + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16022 static void stop_this_cpu(void *dummy)
16023 @@ -268,15 +150,10 @@ static void stop_this_cpu(void *dummy)
16025 void xen_smp_send_stop(void)
16028 unsigned long flags;
16030 - /* Don't deadlock on the call lock in panic */
16031 - nolock = !spin_trylock(&call_lock);
16032 + smp_call_function(stop_this_cpu, NULL, 0);
16033 local_irq_save(flags);
16034 - __smp_call_function(stop_this_cpu, NULL, 0, 0);
16036 - spin_unlock(&call_lock);
16037 disable_all_local_evtchn();
16038 local_irq_restore(flags);
16040 @@ -298,21 +175,8 @@ irqreturn_t smp_reschedule_interrupt(int
16042 irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
16044 - void (*func) (void *info) = call_data->func;
16045 - void *info = call_data->info;
16046 - int wait = call_data->wait;
16049 - * Notify initiating CPU that I've grabbed the data and am
16050 - * about to execute the function
16053 - atomic_inc(&call_data->started);
16055 - * At this point the info structure may be out of scope unless wait==1
16059 + generic_smp_call_function_interrupt();
16060 #ifdef CONFIG_X86_32
16061 __get_cpu_var(irq_stat).irq_call_count++;
16063 @@ -320,10 +184,19 @@ irqreturn_t smp_call_function_interrupt(
16069 - atomic_inc(&call_data->finished);
16071 + return IRQ_HANDLED;
16074 +irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id)
16077 + generic_smp_call_function_single_interrupt();
16078 +#ifdef CONFIG_X86_32
16079 + __get_cpu_var(irq_stat).irq_call_count++;
16081 + add_pda(irq_call_count, 1);
16085 return IRQ_HANDLED;
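The hunk above deletes the driver-private call_lock/call_data machinery because 2.6.27's generic kernel/smp.c now provides it: the initiator queues a (func, info) pair and raises a vector, and the target drains the queue from its interrupt handler via generic_smp_call_function_interrupt(). A single-threaded user-space analogy of that data flow (all names illustrative):

    #include <stdio.h>

    struct call_single_data {
        void (*func)(void *info);
        void *info;
    };

    static struct call_single_data queue[8];
    static int head, tail;

    static void send_ipi(void) { /* would raise CALL_FUNC_SINGLE_VECTOR */ }

    static void call_single_demo(void (*func)(void *), void *info)
    {
        queue[tail].func = func;    /* initiator side: enqueue... */
        queue[tail].info = info;
        tail++;
        send_ipi();                 /* ...and kick the target CPU */
    }

    static void call_single_interrupt_demo(void)
    {
        while (head < tail) {       /* target side: drain the queue */
            queue[head].func(queue[head].info);
            head++;
        }
    }

    static void say(void *info) { printf("hello from %s\n", (char *)info); }

    int main(void)
    {
        call_single_demo(say, "the initiator");
        call_single_interrupt_demo();
        return 0;
    }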
16087 --- sle11-2009-10-16.orig/arch/x86/kernel/time_32-xen.c 2009-10-28 14:58:12.000000000 +0100
16088 +++ sle11-2009-10-16/arch/x86/kernel/time_32-xen.c 2009-10-28 14:58:19.000000000 +0100
16089 @@ -468,7 +468,7 @@ irqreturn_t timer_interrupt(int irq, voi
16091 /* Keep nmi watchdog up to date */
16093 - per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
16094 + x86_add_percpu(irq_stat.irq0_irqs, 1);
16096 add_pda(irq0_irqs, 1);
16098 @@ -747,9 +747,7 @@ void __init time_init(void)
16100 update_wallclock();
16102 -#ifndef CONFIG_X86_64
16106 /* Cannot request_irq() until kmem is initialised. */
16107 late_time_init = setup_cpu0_timer_irq;
16108 @@ -806,7 +804,8 @@ static void stop_hz_timer(void)
16110 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
16111 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
16112 - (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
16113 + (j = get_next_timer_interrupt(jiffies),
16114 + time_before_eq(j, jiffies))) {
16115 cpu_clear(cpu, nohz_cpu_mask);
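The stop_hz_timer() test above depends on wraparound-safe jiffies comparison: time_before_eq() reduces to a signed subtraction, so it stays correct when get_next_timer_interrupt() returns a value past the counter wrap. A compact demonstration (user space, macro expanded by hand):

    #include <stdio.h>

    #define time_before_eq(a, b) ((long)((a) - (b)) <= 0)

    int main(void)
    {
        unsigned long jiffies = -16UL;      /* counter about to wrap */
        unsigned long next = jiffies + 32;  /* 32 ticks away, past the wrap */

        printf("naive next <= jiffies: %d (wrong)\n", next <= jiffies);
        printf("time_before_eq(next, jiffies): %d (right)\n",
               time_before_eq(next, jiffies));
        return 0;
    }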
16118 --- sle11-2009-10-16.orig/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:38:05.000000000 +0100
16119 +++ sle11-2009-10-16/arch/x86/kernel/traps_32-xen.c 2009-06-04 10:21:39.000000000 +0200
16122 * Copyright (C) 1991, 1992 Linus Torvalds
16123 + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
16125 * Pentium III FXSR, SSE support
16126 * Gareth Hughes <gareth@valinux.com>, May 2000
16127 @@ -57,11 +58,10 @@
16128 #include <asm/nmi.h>
16129 #include <asm/smp.h>
16130 #include <asm/io.h>
16131 +#include <asm/traps.h>
16133 #include "mach_traps.h"
16135 -int panic_on_unrecovered_nmi;
16138 DECLARE_BITMAP(used_vectors, NR_VECTORS);
16139 EXPORT_SYMBOL_GPL(used_vectors);
16140 @@ -82,43 +82,22 @@ gate_desc idt_table[256]
16141 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
16144 -asmlinkage void divide_error(void);
16145 -asmlinkage void debug(void);
16146 -asmlinkage void nmi(void);
16147 -asmlinkage void int3(void);
16148 -asmlinkage void overflow(void);
16149 -asmlinkage void bounds(void);
16150 -asmlinkage void invalid_op(void);
16151 -asmlinkage void device_not_available(void);
16152 -asmlinkage void coprocessor_segment_overrun(void);
16153 -asmlinkage void invalid_TSS(void);
16154 -asmlinkage void segment_not_present(void);
16155 -asmlinkage void stack_segment(void);
16156 -asmlinkage void general_protection(void);
16157 -asmlinkage void page_fault(void);
16158 -asmlinkage void coprocessor_error(void);
16159 -asmlinkage void simd_coprocessor_error(void);
16160 -asmlinkage void alignment_check(void);
16161 -#ifndef CONFIG_XEN
16162 -asmlinkage void spurious_interrupt_bug(void);
16164 -asmlinkage void fixup_4gb_segment(void);
16166 -asmlinkage void machine_check(void);
16168 +int panic_on_unrecovered_nmi;
16169 int kstack_depth_to_print = 24;
16170 static unsigned int code_bytes = 64;
16171 +static int ignore_nmis;
16172 +static int die_counter;
16174 void printk_address(unsigned long address, int reliable)
16176 #ifdef CONFIG_KALLSYMS
16177 - char namebuf[KSYM_NAME_LEN];
16178 unsigned long offset = 0;
16179 unsigned long symsize;
16180 const char *symname;
16181 - char reliab[4] = "";
16182 - char *delim = ":";
16184 + char *delim = ":";
16185 + char namebuf[KSYM_NAME_LEN];
16186 + char reliab[4] = "";
16188 symname = kallsyms_lookup(address, &symsize, &offset,
16189 &modname, namebuf);
16190 @@ -138,22 +117,23 @@ void printk_address(unsigned long addres
16194 -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
16195 +static inline int valid_stack_ptr(struct thread_info *tinfo,
16196 + void *p, unsigned int size)
16198 - return p > (void *)tinfo &&
16199 - p <= (void *)tinfo + THREAD_SIZE - size;
16200 + void *t = tinfo;
16201 + return p > t && p <= t + THREAD_SIZE - size;
16204 /* The form of the top of the frame on the stack */
16205 struct stack_frame {
16206 - struct stack_frame *next_frame;
16207 - unsigned long return_address;
16208 + struct stack_frame *next_frame;
16209 + unsigned long return_address;
16212 static inline unsigned long
16213 print_context_stack(struct thread_info *tinfo,
16214 - unsigned long *stack, unsigned long bp,
16215 - const struct stacktrace_ops *ops, void *data)
16216 + unsigned long *stack, unsigned long bp,
16217 + const struct stacktrace_ops *ops, void *data)
16219 struct stack_frame *frame = (struct stack_frame *)bp;
16221 @@ -175,8 +155,6 @@ print_context_stack(struct thread_info *
16225 -#define MSG(msg) ops->warning(data, msg)
16227 void dump_trace(struct task_struct *task, struct pt_regs *regs,
16228 unsigned long *stack, unsigned long bp,
16229 const struct stacktrace_ops *ops, void *data)
16230 @@ -186,7 +164,6 @@ void dump_trace(struct task_struct *task
16233 unsigned long dummy;
16236 if (task != current)
16237 stack = (unsigned long *)task->thread.sp;
16238 @@ -204,7 +181,7 @@ void dump_trace(struct task_struct *task
16244 struct thread_info *context;
16246 context = (struct thread_info *)
16247 @@ -256,15 +233,15 @@ static void print_trace_address(void *da
16250 static const struct stacktrace_ops print_trace_ops = {
16251 - .warning = print_trace_warning,
16252 - .warning_symbol = print_trace_warning_symbol,
16253 - .stack = print_trace_stack,
16254 - .address = print_trace_address,
16255 + .warning = print_trace_warning,
16256 + .warning_symbol = print_trace_warning_symbol,
16257 + .stack = print_trace_stack,
16258 + .address = print_trace_address,
16262 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16263 - unsigned long *stack, unsigned long bp, char *log_lvl)
16264 + unsigned long *stack, unsigned long bp, char *log_lvl)
16266 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16267 printk("%s =======================\n", log_lvl);
16268 @@ -359,15 +336,14 @@ void show_registers(struct pt_regs *regs
16269 printk(KERN_EMERG "Code: ");
16271 ip = (u8 *)regs->ip - code_prologue;
16272 - if (ip < (u8 *)PAGE_OFFSET ||
16273 - probe_kernel_address(ip, c)) {
16274 + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
16275 /* try starting at EIP */
16276 ip = (u8 *)regs->ip;
16277 code_len = code_len - code_prologue + 1;
16279 for (i = 0; i < code_len; i++, ip++) {
16280 if (ip < (u8 *)PAGE_OFFSET ||
16281 - probe_kernel_address(ip, c)) {
16282 + probe_kernel_address(ip, c)) {
16283 printk(" Bad EIP value.");
16286 @@ -392,7 +368,53 @@ int is_valid_bugaddr(unsigned long ip)
16287 return ud2 == 0x0b0f;
16290 -static int die_counter;
16291 +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
16292 +static int die_owner = -1;
16293 +static unsigned int die_nest_count;
16295 +unsigned __kprobes long oops_begin(void)
16297 + unsigned long flags;
16301 + if (die_owner != raw_smp_processor_id()) {
16302 + console_verbose();
16303 + raw_local_irq_save(flags);
16304 + __raw_spin_lock(&die_lock);
16305 + die_owner = smp_processor_id();
16306 + die_nest_count = 0;
16307 + bust_spinlocks(1);
16309 + raw_local_irq_save(flags);
16311 + die_nest_count++;
16315 +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
16317 + bust_spinlocks(0);
16319 + add_taint(TAINT_DIE);
16320 + __raw_spin_unlock(&die_lock);
16321 + raw_local_irq_restore(flags);
16326 + if (kexec_should_crash(current))
16327 + crash_kexec(regs);
16329 + if (in_interrupt())
16330 + panic("Fatal exception in interrupt");
16332 + if (panic_on_oops)
16333 + panic("Fatal exception");
16339 int __kprobes __die(const char *str, struct pt_regs *regs, long err)
16341 @@ -410,26 +432,22 @@ int __kprobes __die(const char *str, str
16342 printk("DEBUG_PAGEALLOC");
16346 if (notify_die(DIE_OOPS, str, regs, err,
16347 - current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
16348 + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
16351 - show_registers(regs);
16352 - /* Executive summary in case the oops scrolled away */
16353 - sp = (unsigned long) (&regs->sp);
16354 - savesegment(ss, ss);
16355 - if (user_mode(regs)) {
16357 - ss = regs->ss & 0xffff;
16359 - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16360 - print_symbol("%s", regs->ip);
16361 - printk(" SS:ESP %04x:%08lx\n", ss, sp);
16367 + show_registers(regs);
16368 + /* Executive summary in case the oops scrolled away */
16369 + sp = (unsigned long) (&regs->sp);
16370 + savesegment(ss, ss);
16371 + if (user_mode(regs)) {
16373 + ss = regs->ss & 0xffff;
16375 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16376 + print_symbol("%s", regs->ip);
16377 + printk(" SS:ESP %04x:%08lx\n", ss, sp);
16382 @@ -438,31 +456,9 @@ int __kprobes __die(const char *str, str
16384 void die(const char *str, struct pt_regs *regs, long err)
16387 - raw_spinlock_t lock;
16389 - int lock_owner_depth;
16391 - .lock = __RAW_SPIN_LOCK_UNLOCKED,
16392 - .lock_owner = -1,
16393 - .lock_owner_depth = 0
16395 - unsigned long flags;
16398 + unsigned long flags = oops_begin();
16400 - if (die.lock_owner != raw_smp_processor_id()) {
16401 - console_verbose();
16402 - raw_local_irq_save(flags);
16403 - __raw_spin_lock(&die.lock);
16404 - die.lock_owner = smp_processor_id();
16405 - die.lock_owner_depth = 0;
16406 - bust_spinlocks(1);
16408 - raw_local_irq_save(flags);
16411 - if (++die.lock_owner_depth < 3) {
16412 + if (die_nest_count < 3) {
16413 report_bug(regs->ip, regs);
16415 if (__die(str, regs, err))
16416 @@ -471,26 +467,7 @@ void die(const char *str, struct pt_regs
16417 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
16420 - bust_spinlocks(0);
16421 - die.lock_owner = -1;
16422 - add_taint(TAINT_DIE);
16423 - __raw_spin_unlock(&die.lock);
16424 - raw_local_irq_restore(flags);
16429 - if (kexec_should_crash(current))
16430 - crash_kexec(regs);
16432 - if (in_interrupt())
16433 - panic("Fatal exception in interrupt");
16435 - if (panic_on_oops)
16436 - panic("Fatal exception");
16439 - do_exit(SIGSEGV);
16440 + oops_end(flags, regs, SIGSEGV);
16444 @@ -554,7 +531,7 @@ void do_##name(struct pt_regs *regs, lon
16446 trace_hardirqs_fixup(); \
16447 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16448 - == NOTIFY_STOP) \
16449 + == NOTIFY_STOP) \
16451 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
16453 @@ -570,7 +547,7 @@ void do_##name(struct pt_regs *regs, lon
16454 info.si_code = sicode; \
16455 info.si_addr = (void __user *)siaddr; \
16456 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16457 - == NOTIFY_STOP) \
16458 + == NOTIFY_STOP) \
16460 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
16462 @@ -579,7 +556,7 @@ void do_##name(struct pt_regs *regs, lon
16463 void do_##name(struct pt_regs *regs, long error_code) \
16465 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16466 - == NOTIFY_STOP) \
16467 + == NOTIFY_STOP) \
16469 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
16471 @@ -594,28 +571,29 @@ void do_##name(struct pt_regs *regs, lon
16472 info.si_addr = (void __user *)siaddr; \
16473 trace_hardirqs_fixup(); \
16474 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16475 - == NOTIFY_STOP) \
16476 + == NOTIFY_STOP) \
16478 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
16481 -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16482 +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16483 #ifndef CONFIG_KPROBES
16484 DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
16486 DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
16487 DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
16488 -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16489 -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16490 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16491 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16492 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
16493 -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16494 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16495 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16496 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16497 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
16498 DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
16500 -void __kprobes do_general_protection(struct pt_regs * regs,
16503 +do_general_protection(struct pt_regs *regs, long error_code)
16505 + struct task_struct *tsk;
16506 struct thread_struct *thread;
16508 thread = &current->thread;
16509 @@ -623,23 +601,24 @@ void __kprobes do_general_protection(str
16510 if (regs->flags & X86_VM_MASK)
16514 if (!user_mode(regs))
16517 - current->thread.error_code = error_code;
16518 - current->thread.trap_no = 13;
16519 + tsk->thread.error_code = error_code;
16520 + tsk->thread.trap_no = 13;
16522 - if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
16523 - printk_ratelimit()) {
16524 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
16525 + printk_ratelimit()) {
16527 - "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16528 - current->comm, task_pid_nr(current),
16529 - regs->ip, regs->sp, error_code);
16530 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16531 + tsk->comm, task_pid_nr(tsk),
16532 + regs->ip, regs->sp, error_code);
16533 print_vma_addr(" in ", regs->ip);
16537 - force_sig(SIGSEGV, current);
16538 + force_sig(SIGSEGV, tsk);
16542 @@ -648,14 +627,15 @@ gp_in_vm86:
16546 - if (!fixup_exception(regs)) {
16547 - current->thread.error_code = error_code;
16548 - current->thread.trap_no = 13;
16549 - if (notify_die(DIE_GPF, "general protection fault", regs,
16550 + if (fixup_exception(regs))
16553 + tsk->thread.error_code = error_code;
16554 + tsk->thread.trap_no = 13;
16555 + if (notify_die(DIE_GPF, "general protection fault", regs,
16556 error_code, 13, SIGSEGV) == NOTIFY_STOP)
16558 - die("general protection fault", regs, error_code);
16561 + die("general protection fault", regs, error_code);
16564 static notrace __kprobes void
16565 @@ -722,9 +702,9 @@ unknown_nmi_error(unsigned char reason,
16567 static DEFINE_SPINLOCK(nmi_print_lock);
16569 -void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
16570 +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
16572 - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16573 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16576 spin_lock(&nmi_print_lock);
16577 @@ -733,10 +713,12 @@ void notrace __kprobes die_nmi(struct pt
16578 * to get a message out:
16581 - printk(KERN_EMERG "%s", msg);
16582 + printk(KERN_EMERG "%s", str);
16583 printk(" on CPU%d, ip %08lx, registers:\n",
16584 smp_processor_id(), regs->ip);
16585 show_registers(regs);
16587 + panic("Non maskable interrupt");
16589 spin_unlock(&nmi_print_lock);
16591 @@ -756,14 +738,17 @@ void notrace __kprobes die_nmi(struct pt
16592 static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
16594 unsigned char reason = 0;
16597 - /* Only the BSP gets external NMIs from the system: */
16598 - if (!smp_processor_id())
16599 + cpu = smp_processor_id();
16601 + /* Only the BSP gets external NMIs from the system. */
16603 reason = get_nmi_reason();
16605 if (!(reason & 0xc0)) {
16606 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
16610 #ifdef CONFIG_X86_LOCAL_APIC
16612 @@ -772,7 +757,7 @@ static notrace __kprobes void default_do
16614 if (nmi_watchdog_tick(regs, reason))
16616 - if (!do_nmi_callback(regs, smp_processor_id()))
16617 + if (!do_nmi_callback(regs, cpu))
16618 unknown_nmi_error(reason, regs);
16620 unknown_nmi_error(reason, regs);
16621 @@ -782,6 +767,8 @@ static notrace __kprobes void default_do
16623 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
16626 + /* AK: following checks seem to be broken on modern chipsets. FIXME */
16628 mem_parity_error(reason, regs);
16630 @@ -793,8 +780,6 @@ static notrace __kprobes void default_do
16634 -static int ignore_nmis;
16636 notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
16639 @@ -879,7 +864,7 @@ void __kprobes do_debug(struct pt_regs *
16640 tsk->thread.debugctlmsr = 0;
16642 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
16643 - SIGTRAP) == NOTIFY_STOP)
16644 + SIGTRAP) == NOTIFY_STOP)
16646 /* It's safe to allow irq's after DR6 has been saved */
16647 if (regs->flags & X86_EFLAGS_IF)
16648 @@ -940,9 +925,8 @@ clear_TF_reenable:
16649 void math_error(void __user *ip)
16651 struct task_struct *task;
16652 - unsigned short cwd;
16653 - unsigned short swd;
16655 + unsigned short cwd, swd;
16658 * Save the info for the exception handler and clear the error.
16659 @@ -961,7 +945,7 @@ void math_error(void __user *ip)
16660 * C1 reg you need in case of a stack fault, 0x040 is the stack
16661 * fault bit. We should only be taking one exception at a time,
16662 * so if this combination doesn't produce any single exception,
16663 - * then we have a bad program that isn't syncronizing its FPU usage
16664 + * then we have a bad program that isn't synchronizing its FPU usage
16665 * and it will suffer the consequences since we won't be able to
16666 * fully reproduce the context of the exception
16668 @@ -970,7 +954,7 @@ void math_error(void __user *ip)
16669 switch (swd & ~cwd & 0x3f) {
16670 case 0x000: /* No unmasked exception */
16672 - default: /* Multiple exceptions */
16673 + default: /* Multiple exceptions */
16675 case 0x001: /* Invalid Op */
16677 @@ -1006,8 +990,8 @@ void do_coprocessor_error(struct pt_regs
16678 static void simd_math_error(void __user *ip)
16680 struct task_struct *task;
16681 - unsigned short mxcsr;
16683 + unsigned short mxcsr;
16686 * Save the info for the exception handler and clear the error.
16687 @@ -1084,7 +1068,7 @@ void do_spurious_interrupt_bug(struct pt
16689 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
16691 - struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
16692 + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
16693 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
16694 unsigned long new_kesp = kesp - base;
16695 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
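The oops_begin()/oops_end() pair factored out above replaces the old per-die() lock block with a CPU-owner scheme, so a recursive oops on the owning CPU re-enters instead of self-deadlocking, with die_nest_count bounding the recursion. A simplified single-threaded sketch of the owner/nesting idea (the real version also juggles IRQ flags and the raw spinlock, which this omits):

    #include <stdio.h>

    static int die_owner = -1;
    static unsigned int die_nest_count;

    static void oops_begin_demo(int cpu)
    {
        if (die_owner != cpu) {
            /* __raw_spin_lock(&die_lock) would go here */
            die_owner = cpu;
            die_nest_count = 0;
        }
        die_nest_count++;  /* same CPU may nest without blocking */
    }

    static void oops_end_demo(void)
    {
        if (--die_nest_count == 0) {
            die_owner = -1;
            /* __raw_spin_unlock(&die_lock) would go here */
        }
    }

    int main(void)
    {
        oops_begin_demo(0);
        oops_begin_demo(0);  /* oops inside an oops, same CPU */
        printf("nest count: %u\n", die_nest_count);
        oops_end_demo();
        oops_end_demo();
        printf("owner after unwind: %d\n", die_owner);
        return 0;
    }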
16696 --- sle11-2009-10-16.orig/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:38:05.000000000 +0100
16697 +++ sle11-2009-10-16/arch/x86/kernel/traps_64-xen.c 2009-06-04 10:21:39.000000000 +0200
16698 @@ -10,73 +10,56 @@
16699 * 'Traps.c' handles hardware traps and faults after we have saved some
16700 * state in 'entry.S'.
16702 -#include <linux/sched.h>
16703 +#include <linux/moduleparam.h>
16704 +#include <linux/interrupt.h>
16705 +#include <linux/kallsyms.h>
16706 +#include <linux/spinlock.h>
16707 +#include <linux/kprobes.h>
16708 +#include <linux/uaccess.h>
16709 +#include <linux/utsname.h>
16710 +#include <linux/kdebug.h>
16711 #include <linux/kernel.h>
16712 +#include <linux/module.h>
16713 +#include <linux/ptrace.h>
16714 #include <linux/string.h>
16715 +#include <linux/unwind.h>
16716 +#include <linux/delay.h>
16717 #include <linux/errno.h>
16718 -#include <linux/ptrace.h>
16719 +#include <linux/kexec.h>
16720 +#include <linux/sched.h>
16721 #include <linux/timer.h>
16722 -#include <linux/mm.h>
16723 #include <linux/init.h>
16724 -#include <linux/delay.h>
16725 -#include <linux/spinlock.h>
16726 -#include <linux/interrupt.h>
16727 -#include <linux/kallsyms.h>
16728 -#include <linux/module.h>
16729 -#include <linux/moduleparam.h>
16730 -#include <linux/nmi.h>
16731 -#include <linux/kprobes.h>
16732 -#include <linux/kexec.h>
16733 -#include <linux/unwind.h>
16734 -#include <linux/uaccess.h>
16735 #include <linux/bug.h>
16736 -#include <linux/kdebug.h>
16737 -#include <linux/utsname.h>
16739 -#include <mach_traps.h>
16740 +#include <linux/nmi.h>
16741 +#include <linux/mm.h>
16743 #if defined(CONFIG_EDAC)
16744 #include <linux/edac.h>
16747 -#include <asm/system.h>
16748 -#include <asm/io.h>
16749 -#include <asm/atomic.h>
16750 +#include <asm/stacktrace.h>
16751 +#include <asm/processor.h>
16752 #include <asm/debugreg.h>
16753 +#include <asm/atomic.h>
16754 +#include <asm/system.h>
16755 +#include <asm/unwind.h>
16756 #include <asm/desc.h>
16757 #include <asm/i387.h>
16758 -#include <asm/processor.h>
16759 -#include <asm/unwind.h>
16760 +#include <asm/nmi.h>
16761 #include <asm/smp.h>
16762 +#include <asm/io.h>
16763 #include <asm/pgalloc.h>
16764 -#include <asm/pda.h>
16765 #include <asm/proto.h>
16766 -#include <asm/nmi.h>
16767 -#include <asm/stacktrace.h>
16768 +#include <asm/pda.h>
16769 +#include <asm/traps.h>
16771 -asmlinkage void divide_error(void);
16772 -asmlinkage void debug(void);
16773 -asmlinkage void nmi(void);
16774 -asmlinkage void int3(void);
16775 -asmlinkage void overflow(void);
16776 -asmlinkage void bounds(void);
16777 -asmlinkage void invalid_op(void);
16778 -asmlinkage void device_not_available(void);
16779 -asmlinkage void double_fault(void);
16780 -asmlinkage void coprocessor_segment_overrun(void);
16781 -asmlinkage void invalid_TSS(void);
16782 -asmlinkage void segment_not_present(void);
16783 -asmlinkage void stack_segment(void);
16784 -asmlinkage void general_protection(void);
16785 -asmlinkage void page_fault(void);
16786 -asmlinkage void coprocessor_error(void);
16787 -asmlinkage void simd_coprocessor_error(void);
16788 -asmlinkage void reserved(void);
16789 -asmlinkage void alignment_check(void);
16790 -asmlinkage void machine_check(void);
16791 -asmlinkage void spurious_interrupt_bug(void);
16792 +#include <mach_traps.h>
16794 +int panic_on_unrecovered_nmi;
16795 +int kstack_depth_to_print = 12;
16796 static unsigned int code_bytes = 64;
16797 +static int ignore_nmis;
16798 +static int die_counter;
16800 static inline void conditional_sti(struct pt_regs *regs)
16802 @@ -100,34 +83,9 @@ static inline void preempt_conditional_c
16803 dec_preempt_count();
16806 -int kstack_depth_to_print = 12;
16808 void printk_address(unsigned long address, int reliable)
16810 -#ifdef CONFIG_KALLSYMS
16811 - unsigned long offset = 0, symsize;
16812 - const char *symname;
16814 - char *delim = ":";
16815 - char namebuf[KSYM_NAME_LEN];
16816 - char reliab[4] = "";
16818 - symname = kallsyms_lookup(address, &symsize, &offset,
16819 - &modname, namebuf);
16821 - printk(" [<%016lx>]\n", address);
16825 - strcpy(reliab, "? ");
16828 - modname = delim = "";
16829 - printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
16830 - address, reliab, delim, modname, delim, symname, offset, symsize);
16832 - printk(" [<%016lx>]\n", address);
16834 + printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
16837 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
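
The rewritten printk_address() just above shows why the whole kallsyms block could go: printk grew a %pS pointer extension in this release which does the symbol+offset lookup itself. A kernel-context sketch of the same idiom (not part of the patch):

/* %pS symbolizes a text address, printing e.g.
 *      [<ffffffff8021d1f0>] ? do_page_fault+0x45/0x120
 * where the "? " prefix marks frames the unwinder is unsure about. */
static void print_frame(unsigned long address, int reliable)
{
        printk(" [<%016lx>] %s%pS\n", address,
               reliable ? "" : "? ", (void *)address);
}
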
16838 @@ -206,8 +164,6 @@ static unsigned long *in_exception_stack
16842 -#define MSG(txt) ops->warning(data, txt)
16845 * x86-64 can have up to three kernel stacks:
16847 @@ -234,11 +190,11 @@ struct stack_frame {
16848 unsigned long return_address;
16852 -static inline unsigned long print_context_stack(struct thread_info *tinfo,
16853 - unsigned long *stack, unsigned long bp,
16854 - const struct stacktrace_ops *ops, void *data,
16855 - unsigned long *end)
16856 +static inline unsigned long
16857 +print_context_stack(struct thread_info *tinfo,
16858 + unsigned long *stack, unsigned long bp,
16859 + const struct stacktrace_ops *ops, void *data,
16860 + unsigned long *end)
16862 struct stack_frame *frame = (struct stack_frame *)bp;
16864 @@ -260,7 +216,7 @@ static inline unsigned long print_contex
16868 -void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
16869 +void dump_trace(struct task_struct *task, struct pt_regs *regs,
16870 unsigned long *stack, unsigned long bp,
16871 const struct stacktrace_ops *ops, void *data)
16873 @@ -269,36 +225,34 @@ void dump_trace(struct task_struct *tsk,
16875 struct thread_info *tinfo;
16879 - tinfo = task_thread_info(tsk);
16884 unsigned long dummy;
16886 - if (tsk && tsk != current)
16887 - stack = (unsigned long *)tsk->thread.sp;
16888 + if (task && task != current)
16889 + stack = (unsigned long *)task->thread.sp;
16892 #ifdef CONFIG_FRAME_POINTER
16894 - if (tsk == current) {
16895 + if (task == current) {
16896 /* Grab bp right from our regs */
16897 - asm("movq %%rbp, %0" : "=r" (bp):);
16898 + asm("movq %%rbp, %0" : "=r" (bp) :);
16900 /* bp is the last reg pushed by switch_to */
16901 - bp = *(unsigned long *) tsk->thread.sp;
16902 + bp = *(unsigned long *) task->thread.sp;
16910 * Print function call entries in all stacks, starting at the
16911 * current stack address. If the stacks consist of nested
16914 + tinfo = task_thread_info(task);
16917 unsigned long *estack_end;
16918 @@ -383,18 +337,24 @@ static const struct stacktrace_ops print
16919 .address = print_trace_address,
16923 -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
16924 - unsigned long bp)
16926 +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16927 + unsigned long *stack, unsigned long bp, char *log_lvl)
16929 printk("\nCall Trace:\n");
16930 - dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
16931 + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16935 +void show_trace(struct task_struct *task, struct pt_regs *regs,
16936 + unsigned long *stack, unsigned long bp)
16938 + show_trace_log_lvl(task, regs, stack, bp, "");
16942 -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
16943 - unsigned long bp)
16944 +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
16945 + unsigned long *sp, unsigned long bp, char *log_lvl)
16947 unsigned long *stack;
16949 @@ -406,14 +366,14 @@ _show_stack(struct task_struct *tsk, str
16950 // back trace for this cpu.
16954 - sp = (unsigned long *)tsk->thread.sp;
16956 + sp = (unsigned long *)task->thread.sp;
16958 sp = (unsigned long *)&sp;
16962 - for(i=0; i < kstack_depth_to_print; i++) {
16963 + for (i = 0; i < kstack_depth_to_print; i++) {
16964 if (stack >= irqstack && stack <= irqstack_end) {
16965 if (stack == irqstack_end) {
16966 stack = (unsigned long *) (irqstack_end[-1]);
16967 @@ -428,12 +388,12 @@ _show_stack(struct task_struct *tsk, str
16968 printk(" %016lx", *stack++);
16969 touch_nmi_watchdog();
16971 - show_trace(tsk, regs, sp, bp);
16972 + show_trace_log_lvl(task, regs, sp, bp, log_lvl);
16975 -void show_stack(struct task_struct *tsk, unsigned long * sp)
16976 +void show_stack(struct task_struct *task, unsigned long *sp)
16978 - _show_stack(tsk, NULL, sp, 0);
16979 + show_stack_log_lvl(task, NULL, sp, 0, "");
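
The show_trace/show_stack refactoring above threads a log-level string through to dump_trace() by reusing the opaque data cookie that struct stacktrace_ops already passes to every callback. The callback-plus-cookie pattern, modelled in plain runnable C (all names here are illustrative, not the kernel's):

#include <stdio.h>

struct walk_ops {
        void (*address)(void *data, unsigned long addr);
};

static void print_address(void *data, unsigned long addr)
{
        /* the cookie plays the role of log_lvl */
        printf("%s [<%016lx>]\n", (const char *)data, addr);
}

static void walk(const unsigned long *stack, int n,
                 const struct walk_ops *ops, void *data)
{
        for (int i = 0; i < n; i++)
                ops->address(data, stack[i]);   /* cookie handed back */
}

int main(void)
{
        unsigned long frames[] = { 0xffffffff80212345UL, 0xffffffff80211100UL };
        struct walk_ops ops = { .address = print_address };

        walk(frames, 2, &ops, "<4>");
        return 0;
}
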
16983 @@ -441,8 +401,8 @@ void show_stack(struct task_struct *tsk,
16985 void dump_stack(void)
16987 - unsigned long dummy;
16988 unsigned long bp = 0;
16989 + unsigned long stack;
16991 #ifdef CONFIG_FRAME_POINTER
16993 @@ -454,7 +414,7 @@ void dump_stack(void)
16994 init_utsname()->release,
16995 (int)strcspn(init_utsname()->version, " "),
16996 init_utsname()->version);
16997 - show_trace(NULL, NULL, &dummy, bp);
16998 + show_trace(NULL, NULL, &stack, bp);
17001 EXPORT_SYMBOL(dump_stack);
17002 @@ -465,12 +425,8 @@ void show_registers(struct pt_regs *regs
17004 const int cpu = smp_processor_id();
17005 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
17007 - unsigned int code_prologue = code_bytes * 43 / 64;
17008 - unsigned int code_len = code_bytes;
17011 - ip = (u8 *) regs->ip - code_prologue;
17012 printk("CPU %d ", cpu);
17014 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
17015 @@ -481,15 +437,22 @@ void show_registers(struct pt_regs *regs
17016 * time of the fault..
17018 if (!user_mode(regs)) {
17019 + unsigned int code_prologue = code_bytes * 43 / 64;
17020 + unsigned int code_len = code_bytes;
17025 - _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
17026 + show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
17030 printk(KERN_EMERG "Code: ");
17032 + ip = (u8 *)regs->ip - code_prologue;
17033 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
17034 /* try starting at RIP */
17035 - ip = (u8 *) regs->ip;
17036 + ip = (u8 *)regs->ip;
17037 code_len = code_len - code_prologue + 1;
17039 for (i = 0; i < code_len; i++, ip++) {
17040 @@ -505,7 +468,7 @@ void show_registers(struct pt_regs *regs
17047 int is_valid_bugaddr(unsigned long ip)
17049 @@ -545,7 +508,7 @@ unsigned __kprobes long oops_begin(void)
17052 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
17058 @@ -563,10 +526,9 @@ void __kprobes oops_end(unsigned long fl
17062 -int __kprobes __die(const char * str, struct pt_regs * regs, long err)
17063 +int __kprobes __die(const char *str, struct pt_regs *regs, long err)
17065 - static int die_counter;
17066 - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
17067 + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
17068 #ifdef CONFIG_PREEMPT
17069 printk("PREEMPT ");
17071 @@ -577,8 +539,10 @@ int __kprobes __die(const char * str, st
17072 printk("DEBUG_PAGEALLOC");
17075 - if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17076 + if (notify_die(DIE_OOPS, str, regs, err,
17077 + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17080 show_registers(regs);
17081 add_taint(TAINT_DIE);
17082 /* Executive summary in case the oops scrolled away */
17083 @@ -590,7 +554,7 @@ int __kprobes __die(const char * str, st
17087 -void die(const char * str, struct pt_regs * regs, long err)
17088 +void die(const char *str, struct pt_regs *regs, long err)
17090 unsigned long flags = oops_begin();
17092 @@ -608,8 +572,7 @@ die_nmi(char *str, struct pt_regs *regs,
17094 unsigned long flags;
17096 - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
17098 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
17101 flags = oops_begin();
17102 @@ -617,7 +580,9 @@ die_nmi(char *str, struct pt_regs *regs,
17103 * We are in trouble anyway, lets at least try
17104 * to get a message out.
17106 - printk(str, smp_processor_id());
17107 + printk(KERN_EMERG "%s", str);
17108 + printk(" on CPU%d, ip %08lx, registers:\n",
17109 + smp_processor_id(), regs->ip);
17110 show_registers(regs);
17111 if (kexec_should_crash(current))
17113 @@ -630,44 +595,44 @@ die_nmi(char *str, struct pt_regs *regs,
17117 -static void __kprobes do_trap(int trapnr, int signr, char *str,
17118 - struct pt_regs * regs, long error_code,
17120 +static void __kprobes
17121 +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
17122 + long error_code, siginfo_t *info)
17124 struct task_struct *tsk = current;
17126 - if (user_mode(regs)) {
17128 - * We want error_code and trap_no set for userspace
17129 - * faults and kernelspace faults which result in
17130 - * die(), but not kernelspace faults which are fixed
17131 - * up. die() gives the process no chance to handle
17132 - * the signal and notice the kernel fault information,
17133 - * so that won't result in polluting the information
17134 - * about previously queued, but not yet delivered,
17135 - * faults. See also do_general_protection below.
17137 - tsk->thread.error_code = error_code;
17138 - tsk->thread.trap_no = trapnr;
17139 + if (!user_mode(regs))
17140 + goto kernel_trap;
17142 - if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17143 - printk_ratelimit()) {
17145 - "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17146 - tsk->comm, tsk->pid, str,
17147 - regs->ip, regs->sp, error_code);
17148 - print_vma_addr(" in ", regs->ip);
17152 + * We want error_code and trap_no set for userspace faults and
17153 + * kernelspace faults which result in die(), but not
17154 + * kernelspace faults which are fixed up. die() gives the
17155 + * process no chance to handle the signal and notice the
17156 + * kernel fault information, so that won't result in polluting
17157 + * the information about previously queued, but not yet
17158 + * delivered, faults. See also do_general_protection below.
17160 + tsk->thread.error_code = error_code;
17161 + tsk->thread.trap_no = trapnr;
17164 - force_sig_info(signr, info, tsk);
17166 - force_sig(signr, tsk);
17168 + if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17169 + printk_ratelimit()) {
17171 + "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17172 + tsk->comm, tsk->pid, str,
17173 + regs->ip, regs->sp, error_code);
17174 + print_vma_addr(" in ", regs->ip);
17179 + force_sig_info(signr, info, tsk);
17181 + force_sig(signr, tsk);
17185 if (!fixup_exception(regs)) {
17186 tsk->thread.error_code = error_code;
17187 tsk->thread.trap_no = trapnr;
17188 @@ -677,41 +642,39 @@ static void __kprobes do_trap(int trapnr
17191 #define DO_ERROR(trapnr, signr, str, name) \
17192 -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17194 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17195 - == NOTIFY_STOP) \
17197 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17199 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17200 + == NOTIFY_STOP) \
17202 conditional_sti(regs); \
17203 - do_trap(trapnr, signr, str, regs, error_code, NULL); \
17204 + do_trap(trapnr, signr, str, regs, error_code, NULL); \
17207 -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17208 -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17210 - siginfo_t info; \
17211 - info.si_signo = signr; \
17212 - info.si_errno = 0; \
17213 - info.si_code = sicode; \
17214 - info.si_addr = (void __user *)siaddr; \
17215 - trace_hardirqs_fixup(); \
17216 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17217 - == NOTIFY_STOP) \
17219 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17220 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17222 + siginfo_t info; \
17223 + info.si_signo = signr; \
17224 + info.si_errno = 0; \
17225 + info.si_code = sicode; \
17226 + info.si_addr = (void __user *)siaddr; \
17227 + trace_hardirqs_fixup(); \
17228 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17229 + == NOTIFY_STOP) \
17231 conditional_sti(regs); \
17232 - do_trap(trapnr, signr, str, regs, error_code, &info); \
17233 + do_trap(trapnr, signr, str, regs, error_code, &info); \
17236 -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17237 -DO_ERROR( 4, SIGSEGV, "overflow", overflow)
17238 -DO_ERROR( 5, SIGSEGV, "bounds", bounds)
17239 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17240 -DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
17241 -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17242 +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17243 +DO_ERROR(4, SIGSEGV, "overflow", overflow)
17244 +DO_ERROR(5, SIGSEGV, "bounds", bounds)
17245 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17246 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17247 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
17248 -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17249 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17250 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
17251 -DO_ERROR(18, SIGSEGV, "reserved", reserved)
17253 /* Runs on IST stack */
17254 asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
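
The DO_ERROR hunk above is pure re-indentation, but the macros are easier to review expanded. Substituting DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) by hand (the return after NOTIFY_STOP is elided in the hunk but implied by the removed lines) gives:

asmlinkage void do_invalid_TSS(struct pt_regs *regs, long error_code)
{
        /* give the die-notifier chain (kprobes, kgdb, ...) first refusal */
        if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code, 10, SIGSEGV)
                        == NOTIFY_STOP)
                return;
        conditional_sti(regs);  /* restore irqs if the trapped context had them on */
        do_trap(10, SIGSEGV, "invalid TSS", regs, error_code, NULL);
}
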
17255 @@ -741,31 +704,34 @@ asmlinkage void do_double_fault(struct p
17256 die(str, regs, error_code);
17259 -asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
17261 +asmlinkage void __kprobes
17262 +do_general_protection(struct pt_regs *regs, long error_code)
17264 - struct task_struct *tsk = current;
17265 + struct task_struct *tsk;
17267 conditional_sti(regs);
17269 - if (user_mode(regs)) {
17270 - tsk->thread.error_code = error_code;
17271 - tsk->thread.trap_no = 13;
17273 + if (!user_mode(regs))
17274 + goto gp_in_kernel;
17276 - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17277 - printk_ratelimit()) {
17279 - "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17280 - tsk->comm, tsk->pid,
17281 - regs->ip, regs->sp, error_code);
17282 - print_vma_addr(" in ", regs->ip);
17285 + tsk->thread.error_code = error_code;
17286 + tsk->thread.trap_no = 13;
17288 - force_sig(SIGSEGV, tsk);
17291 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17292 + printk_ratelimit()) {
17294 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17295 + tsk->comm, tsk->pid,
17296 + regs->ip, regs->sp, error_code);
17297 + print_vma_addr(" in ", regs->ip);
17301 + force_sig(SIGSEGV, tsk);
17305 if (fixup_exception(regs))
17308 @@ -778,14 +744,14 @@ asmlinkage void __kprobes do_general_pro
17311 static notrace __kprobes void
17312 -mem_parity_error(unsigned char reason, struct pt_regs * regs)
17313 +mem_parity_error(unsigned char reason, struct pt_regs *regs)
17315 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
17317 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
17319 #if defined(CONFIG_EDAC)
17320 - if(edac_handler_set()) {
17321 + if (edac_handler_set()) {
17322 edac_atomic_assert_error();
17325 @@ -801,7 +767,7 @@ mem_parity_error(unsigned char reason, s
17328 static notrace __kprobes void
17329 -io_check_error(unsigned char reason, struct pt_regs * regs)
17330 +io_check_error(unsigned char reason, struct pt_regs *regs)
17332 printk("NMI: IOCK error (debug interrupt?)\n");
17333 show_registers(regs);
17334 @@ -827,14 +793,14 @@ unknown_nmi_error(unsigned char reason,
17336 /* Runs on IST stack. This code must keep interrupts off all the time.
17337 Nested NMIs are prevented by the CPU. */
17338 -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17339 +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17341 unsigned char reason = 0;
17344 cpu = smp_processor_id();
17346 - /* Only the BSP gets external NMIs from the system. */
17347 + /* Only the BSP gets external NMIs from the system. */
17349 reason = get_nmi_reason();
17351 @@ -847,33 +813,58 @@ asmlinkage notrace __kprobes void defau
17352 * Ok, so this is none of the documented NMI sources,
17353 * so it must be the NMI watchdog.
17355 - if (nmi_watchdog_tick(regs,reason))
17356 + if (nmi_watchdog_tick(regs, reason))
17359 - if (!do_nmi_callback(regs,cpu))
17360 + if (!do_nmi_callback(regs, cpu))
17361 unknown_nmi_error(reason, regs);
17365 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
17369 /* AK: following checks seem to be broken on modern chipsets. FIXME */
17372 mem_parity_error(reason, regs);
17374 io_check_error(reason, regs);
17377 +asmlinkage notrace __kprobes void
17378 +do_nmi(struct pt_regs *regs, long error_code)
17382 + add_pda(__nmi_count, 1);
17384 + if (!ignore_nmis)
17385 + default_do_nmi(regs);
17390 +void stop_nmi(void)
17392 + acpi_nmi_disable();
17396 +void restart_nmi(void)
17399 + acpi_nmi_enable();
17402 /* runs on IST stack. */
17403 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
17405 trace_hardirqs_fixup();
17407 - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
17408 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
17413 preempt_conditional_sti(regs);
17414 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
17415 preempt_conditional_cli(regs);
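
do_nmi() above now counts the NMI in the PDA and skips default_do_nmi() whenever ignore_nmis is non-zero; stop_nmi()/restart_nmi() are the bracketing helpers (their ignore_nmis increment and decrement are elided in the hunk). A sketch of how a caller would use the pair - the caller itself is an assumption, nothing in this patch uses it yet:

/* Kernel-context sketch: quiesce NMIs around fragile code. */
void fragile_section(void)
{
        stop_nmi();     /* acpi_nmi_disable(), plus ignore_nmis++ */
        /* ... work that must not race with the NMI watchdog ... */
        restart_nmi();  /* ignore_nmis--, then acpi_nmi_enable() */
}
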
17416 @@ -904,8 +895,8 @@ asmlinkage __kprobes struct pt_regs *syn
17417 asmlinkage void __kprobes do_debug(struct pt_regs * regs,
17418 unsigned long error_code)
17420 - unsigned long condition;
17421 struct task_struct *tsk = current;
17422 + unsigned long condition;
17425 trace_hardirqs_fixup();
17426 @@ -926,21 +917,19 @@ asmlinkage void __kprobes do_debug(struc
17428 /* Mask out spurious debug traps due to lazy DR7 setting */
17429 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
17430 - if (!tsk->thread.debugreg7) {
17431 + if (!tsk->thread.debugreg7)
17436 tsk->thread.debugreg6 = condition;
17440 * Single-stepping through TF: make sure we ignore any events in
17441 * kernel space (but re-enable TF when returning to user mode).
17443 if (condition & DR_STEP) {
17444 - if (!user_mode(regs))
17445 - goto clear_TF_reenable;
17446 + if (!user_mode(regs))
17447 + goto clear_TF_reenable;
17450 /* Ok, finally something we can handle */
17451 @@ -953,7 +942,7 @@ asmlinkage void __kprobes do_debug(struc
17452 force_sig_info(SIGTRAP, &info, tsk);
17455 - set_debugreg(0UL, 7);
17456 + set_debugreg(0, 7);
17457 preempt_conditional_cli(regs);
17460 @@ -961,6 +950,7 @@ clear_TF_reenable:
17461 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
17462 regs->flags &= ~X86_EFLAGS_TF;
17463 preempt_conditional_cli(regs);
17467 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
17468 @@ -983,7 +973,7 @@ static int kernel_math_error(struct pt_r
17469 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
17471 void __user *ip = (void __user *)(regs->ip);
17472 - struct task_struct * task;
17473 + struct task_struct *task;
17475 unsigned short cwd, swd;
17477 @@ -1016,30 +1006,30 @@ asmlinkage void do_coprocessor_error(str
17478 cwd = get_fpu_cwd(task);
17479 swd = get_fpu_swd(task);
17480 switch (swd & ~cwd & 0x3f) {
17484 - case 0x001: /* Invalid Op */
17486 - * swd & 0x240 == 0x040: Stack Underflow
17487 - * swd & 0x240 == 0x240: Stack Overflow
17488 - * User must clear the SF bit (0x40) if set
17490 - info.si_code = FPE_FLTINV;
17492 - case 0x002: /* Denormalize */
17493 - case 0x010: /* Underflow */
17494 - info.si_code = FPE_FLTUND;
17496 - case 0x004: /* Zero Divide */
17497 - info.si_code = FPE_FLTDIV;
17499 - case 0x008: /* Overflow */
17500 - info.si_code = FPE_FLTOVF;
17502 - case 0x020: /* Precision */
17503 - info.si_code = FPE_FLTRES;
17505 + case 0x000: /* No unmasked exception */
17506 + default: /* Multiple exceptions */
17508 + case 0x001: /* Invalid Op */
17510 + * swd & 0x240 == 0x040: Stack Underflow
17511 + * swd & 0x240 == 0x240: Stack Overflow
17512 + * User must clear the SF bit (0x40) if set
17514 + info.si_code = FPE_FLTINV;
17516 + case 0x002: /* Denormalize */
17517 + case 0x010: /* Underflow */
17518 + info.si_code = FPE_FLTUND;
17520 + case 0x004: /* Zero Divide */
17521 + info.si_code = FPE_FLTDIV;
17523 + case 0x008: /* Overflow */
17524 + info.si_code = FPE_FLTOVF;
17526 + case 0x020: /* Precision */
17527 + info.si_code = FPE_FLTRES;
17530 force_sig_info(SIGFPE, &info, task);
17532 @@ -1052,7 +1042,7 @@ asmlinkage void bad_intr(void)
17533 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
17535 void __user *ip = (void __user *)(regs->ip);
17536 - struct task_struct * task;
17537 + struct task_struct *task;
17539 unsigned short mxcsr;
17541 @@ -1080,25 +1070,25 @@ asmlinkage void do_simd_coprocessor_erro
17543 mxcsr = get_fpu_mxcsr(task);
17544 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
17548 - case 0x001: /* Invalid Op */
17549 - info.si_code = FPE_FLTINV;
17551 - case 0x002: /* Denormalize */
17552 - case 0x010: /* Underflow */
17553 - info.si_code = FPE_FLTUND;
17555 - case 0x004: /* Zero Divide */
17556 - info.si_code = FPE_FLTDIV;
17558 - case 0x008: /* Overflow */
17559 - info.si_code = FPE_FLTOVF;
17561 - case 0x020: /* Precision */
17562 - info.si_code = FPE_FLTRES;
17567 + case 0x001: /* Invalid Op */
17568 + info.si_code = FPE_FLTINV;
17570 + case 0x002: /* Denormalize */
17571 + case 0x010: /* Underflow */
17572 + info.si_code = FPE_FLTUND;
17574 + case 0x004: /* Zero Divide */
17575 + info.si_code = FPE_FLTDIV;
17577 + case 0x008: /* Overflow */
17578 + info.si_code = FPE_FLTOVF;
17580 + case 0x020: /* Precision */
17581 + info.si_code = FPE_FLTRES;
17584 force_sig_info(SIGFPE, &info, task);
17586 @@ -1118,7 +1108,7 @@ asmlinkage void __attribute__((weak)) mc
17590 - * 'math_state_restore()' saves the current math information in the
17591 + * 'math_state_restore()' saves the current math information in the
17592 * old math state array, and gets the new ones from the current task
17594 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
17595 @@ -1145,7 +1135,14 @@ asmlinkage void math_state_restore(void)
17597 /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
17599 - restore_fpu_checking(&me->thread.xstate->fxsave);
17601 + * Paranoid restore. send a SIGSEGV if we fail to restore the state.
17603 + if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
17605 + force_sig(SIGSEGV, me);
17608 task_thread_info(me)->status |= TS_USEDFPU;
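
The paranoid-restore hunk above elides a line or two; filled in from context, the idiom reads as below. (The reconstruction is an assumption; native 2.6.27 also calls stts() before the signal, which the Xen build may not need given the clts() comment above.)

        if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
                /* unrestorable FPU image: kill the task rather than
                 * let it run with corrupt state */
                force_sig(SIGSEGV, me);
                return;
        }
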
17611 @@ -1190,13 +1187,12 @@ void __init trap_init(void)
17612 ret = HYPERVISOR_set_trap_table(trap_table);
17614 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
17617 * initialize the per thread extended state:
17619 - init_thread_xstate();
17620 + init_thread_xstate();
17622 - * Should be a barrier for any external CPU state.
17623 + * Should be a barrier for any external CPU state:
17627 @@ -1212,27 +1208,25 @@ void __cpuinit smp_trap_init(trap_info_t
17632 static int __init oops_setup(char *s)
17637 if (!strcmp(s, "panic"))
17642 early_param("oops", oops_setup);
17644 static int __init kstack_setup(char *s)
17648 - kstack_depth_to_print = simple_strtoul(s,NULL,0);
17649 + kstack_depth_to_print = simple_strtoul(s, NULL, 0);
17652 early_param("kstack", kstack_setup);
17655 static int __init code_bytes_setup(char *s)
17657 code_bytes = simple_strtoul(s, NULL, 0);
17658 --- sle11-2009-10-16.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:38:05.000000000 +0100
17659 +++ sle11-2009-10-16/arch/x86/kernel/vsyscall_64-xen.c 2009-06-04 10:21:39.000000000 +0200
17661 #include <asm/topology.h>
17662 #include <asm/vgtod.h>
17664 -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
17665 +#define __vsyscall(nr) \
17666 + __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
17667 #define __syscall_clobber "r11","cx","memory"
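
The only change to __vsyscall(nr) above is the added notrace, keeping ftrace's mcount hook out of code that executes on a user-mapped page; the __section__ half is what lets the linker script pin each function at its fixed vsyscall slot. The section-placement mechanism on its own, as runnable user-space C (names illustrative; inspect the result with objdump -t):

#include <stdio.h>

/* same shape as __vsyscall(nr): pin a function into a named section */
#define __demo(nr) __attribute__((unused, __section__(".demo_" #nr)))

static int __demo(0) answer(void)
{
        return 42;      /* this function is emitted into .demo_0 */
}

int main(void)
{
        printf("%d\n", answer());
        return 0;
}
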
17670 @@ -264,10 +265,7 @@ static void __cpuinit vsyscall_set_cpu(i
17672 d |= (node & 0xf) << 12;
17673 d |= (node >> 4) << 48;
17674 - if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
17675 - + GDT_ENTRY_PER_CPU),
17678 + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
17681 static void __cpuinit cpu_vsyscall_init(void *arg)
17682 @@ -281,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_bl
17684 long cpu = (long)arg;
17685 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
17686 - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
17687 + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
17688 return NOTIFY_DONE;
17691 @@ -311,7 +309,7 @@ static int __init vsyscall_init(void)
17692 #ifdef CONFIG_SYSCTL
17693 register_sysctl_table(kernel_root_table2);
17695 - on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
17696 + on_each_cpu(cpu_vsyscall_init, NULL, 1);
17697 hotcpu_notifier(cpu_vsyscall_notifier, 0);
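
Both call-site changes in this file follow from the 2.6.27 SMP API cleanup: smp_call_function_single() and on_each_cpu() dropped their unused nonatomic/retry argument, leaving (cpu, func, info, wait) and (func, info, wait). The new form, as a kernel-context sketch:

static void do_poke(void *info)
{
        /* runs on the target CPU */
}

/* run do_poke on 'cpu' and wait for it to complete */
static int poke_cpu(int cpu)
{
        return smp_call_function_single(cpu, do_poke, NULL, 1 /* wait */);
}
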
17700 --- sle11-2009-10-16.orig/arch/x86/mach-xen/setup.c 2009-03-16 16:33:40.000000000 +0100
17701 +++ sle11-2009-10-16/arch/x86/mach-xen/setup.c 2009-06-04 10:21:39.000000000 +0200
17703 #include <xen/interface/callback.h>
17704 #include <xen/interface/memory.h>
17706 +#ifdef CONFIG_X86_32
17708 #ifdef CONFIG_HOTPLUG_CPU
17709 #define DEFAULT_SEND_IPI (1)
17711 @@ -44,51 +46,6 @@ static int __init print_ipi_mode(void)
17713 late_initcall(print_ipi_mode);
17716 - * machine_specific_memory_setup - Hook for machine specific memory setup.
17719 - * This is included late in kernel/setup.c so that it can make
17720 - * use of all of the static functions.
17723 -char * __init machine_specific_memory_setup(void)
17726 - struct xen_memory_map memmap;
17728 - * This is rather large for a stack variable but this early in
17729 - * the boot process we know we have plenty slack space.
17731 - struct e820entry map[E820MAX];
17733 - memmap.nr_entries = E820MAX;
17734 - set_xen_guest_handle(memmap.buffer, map);
17736 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
17737 - if ( rc == -ENOSYS ) {
17738 - memmap.nr_entries = 1;
17739 - map[0].addr = 0ULL;
17740 - map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
17741 - /* 8MB slack (to balance backend allocations). */
17742 - map[0].size += 8ULL << 20;
17743 - map[0].type = E820_RAM;
17748 - sanitize_e820_map(map, (char *)&memmap.nr_entries);
17750 - BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
17756 -extern void hypervisor_callback(void);
17757 -extern void failsafe_callback(void);
17758 -extern void nmi(void);
17760 unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
17761 EXPORT_SYMBOL(machine_to_phys_mapping);
17762 unsigned int machine_to_phys_order;
17763 @@ -121,33 +78,66 @@ void __init pre_setup_arch_hook(void)
17764 (unsigned long *)xen_start_info->mfn_list;
17767 +#endif /* CONFIG_X86_32 */
17769 +extern void hypervisor_callback(void);
17770 +extern void failsafe_callback(void);
17771 +extern void nmi(void);
17773 +#ifdef CONFIG_X86_64
17774 +#include <asm/proto.h>
17775 +#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
17777 +#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
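
CALLBACK_ADDR() above papers over an ABI difference: a 32-bit PV callback is a far pointer (selector plus eip) while a 64-bit one is a bare linear address. Expanded by hand for the event callback registered below (struct layout per the Xen interface headers):

#ifdef CONFIG_X86_64
/* 64-bit: .address is just the linear address */
static struct callback_register event = {
        .type    = CALLBACKTYPE_event,
        .address = (unsigned long)hypervisor_callback
};
#else
/* 32-bit: .address is a { selector, eip } pair */
static struct callback_register event = {
        .type    = CALLBACKTYPE_event,
        .address = { __KERNEL_CS, (unsigned long)hypervisor_callback }
};
#endif
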
17780 void __init machine_specific_arch_setup(void)
17783 static struct callback_register __initdata event = {
17784 .type = CALLBACKTYPE_event,
17785 - .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
17786 + .address = CALLBACK_ADDR(hypervisor_callback)
17788 static struct callback_register __initdata failsafe = {
17789 .type = CALLBACKTYPE_failsafe,
17790 - .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
17791 + .address = CALLBACK_ADDR(failsafe_callback)
17793 +#ifdef CONFIG_X86_64
17794 + static struct callback_register __initdata syscall = {
17795 + .type = CALLBACKTYPE_syscall,
17796 + .address = CALLBACK_ADDR(system_call)
17799 +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17800 static struct callback_register __initdata nmi_cb = {
17801 .type = CALLBACKTYPE_nmi,
17802 - .address = { __KERNEL_CS, (unsigned long)nmi },
17803 + .address = CALLBACK_ADDR(nmi)
17807 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
17809 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
17810 +#ifdef CONFIG_X86_64
17812 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
17814 #if CONFIG_XEN_COMPAT <= 0x030002
17815 +#ifdef CONFIG_X86_32
17816 if (ret == -ENOSYS)
17817 ret = HYPERVISOR_set_callbacks(
17818 event.address.cs, event.address.eip,
17819 failsafe.address.cs, failsafe.address.eip);
17821 + ret = HYPERVISOR_set_callbacks(
17823 + failsafe.address,
17824 + syscall.address);
17829 +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17830 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
17831 #if CONFIG_XEN_COMPAT <= 0x030002
17832 if (ret == -ENOSYS) {
17833 @@ -158,15 +148,43 @@ void __init machine_specific_arch_setup(
17834 HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
17839 +#ifdef CONFIG_X86_32
17840 /* Do an early initialization of the fixmap area */
17842 extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
17843 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
17844 pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
17845 pmd_t *pmd = pmd_offset(pud, addr);
17848 make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
17849 set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
17851 +#define __FIXADDR_TOP (-PAGE_SIZE)
17852 +#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
17853 + != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
17854 + FIX_BUG_ON(SHARED_INFO);
17855 + FIX_BUG_ON(ISAMAP_BEGIN);
17856 + FIX_BUG_ON(ISAMAP_END);
17857 +#undef __FIXADDR_TOP
17858 + BUG_ON(pte_index(hypervisor_virt_start));
17860 + /* Switch to the real shared_info page, and clear the dummy page. */
17862 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
17863 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
17864 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
17866 + /* Setup mapping of lower 1st MB */
17867 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
17868 + if (is_initial_xendomain())
17869 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
17871 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
17872 + virt_to_machine(empty_zero_page),
17877 --- sle11-2009-10-16.orig/arch/x86/mm/fault-xen.c 2009-03-16 16:38:05.000000000 +0100
17878 +++ sle11-2009-10-16/arch/x86/mm/fault-xen.c 2009-06-04 10:21:39.000000000 +0200
17880 #include <linux/string.h>
17881 #include <linux/types.h>
17882 #include <linux/ptrace.h>
17883 +#include <linux/mmiotrace.h>
17884 #include <linux/mman.h>
17885 #include <linux/mm.h>
17886 #include <linux/smp.h>
17887 @@ -49,17 +50,23 @@
17888 #define PF_RSVD (1<<3)
17889 #define PF_INSTR (1<<4)
17891 +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
17893 +#ifdef CONFIG_MMIOTRACE_HOOKS
17894 + if (unlikely(is_kmmio_active()))
17895 + if (kmmio_handler(regs, addr) == 1)
17901 static inline int notify_page_fault(struct pt_regs *regs)
17903 #ifdef CONFIG_KPROBES
17906 /* kprobe_running() needs smp_processor_id() */
17907 -#ifdef CONFIG_X86_32
17908 if (!user_mode_vm(regs)) {
17910 - if (!user_mode(regs)) {
17913 if (kprobe_running() && kprobe_fault_handler(regs, 14))
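
For reference, the complete kmmio_fault() helper added further up (its tail is elided in the hunk; reconstructed to match the upstream 2.6.27 helper) has the usual cheap-guard shape - one predicted-false test when mmiotrace is built in but idle:

static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
#ifdef CONFIG_MMIOTRACE_HOOKS
        if (unlikely(is_kmmio_active()))
                if (kmmio_handler(regs, addr) == 1)
                        return -1;      /* mmiotrace claimed the fault */
#endif
        return 0;                       /* normal fault handling continues */
}
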
17915 @@ -409,11 +416,7 @@ static void show_fault_oops(struct pt_re
17916 printk(KERN_CONT "NULL pointer dereference");
17918 printk(KERN_CONT "paging request");
17919 -#ifdef CONFIG_X86_32
17920 - printk(KERN_CONT " at %08lx\n", address);
17922 - printk(KERN_CONT " at %016lx\n", address);
17924 + printk(KERN_CONT " at %p\n", (void *) address);
17925 printk(KERN_ALERT "IP:");
17926 printk_address(regs->ip, 1);
17927 dump_pagetable(address);
17928 @@ -628,6 +631,8 @@ void __kprobes do_page_fault(struct pt_r
17930 if (notify_page_fault(regs))
17932 + if (unlikely(kmmio_fault(regs, address)))
17936 * We fault-in kernel-space virtual memory on-demand. The
17937 @@ -831,14 +836,10 @@ bad_area_nosemaphore:
17938 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17939 printk_ratelimit()) {
17941 -#ifdef CONFIG_X86_32
17942 - "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
17944 - "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
17946 + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
17947 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
17948 - tsk->comm, task_pid_nr(tsk), address, regs->ip,
17949 - regs->sp, error_code);
17950 + tsk->comm, task_pid_nr(tsk), address,
17951 + (void *) regs->ip, (void *) regs->sp, error_code);
17952 print_vma_addr(" in ", regs->ip);
17955 @@ -946,81 +947,45 @@ LIST_HEAD(pgd_list);
17956 void vmalloc_sync_all(void)
17958 #ifdef CONFIG_X86_32
17960 - * Note that races in the updates of insync and start aren't
17961 - * problematic: insync can only get set bits added, and updates to
17962 - * start are only improving performance (without affecting correctness
17964 - * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
17965 - * This change works just fine with 2-level paging too.
17967 -#define sync_index(a) ((a) >> PMD_SHIFT)
17968 - static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
17969 - static unsigned long start = TASK_SIZE;
17970 - unsigned long address;
17971 + unsigned long address = VMALLOC_START & PGDIR_MASK;
17973 if (SHARED_KERNEL_PMD)
17976 BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
17977 - for (address = start;
17978 - address < hypervisor_virt_start;
17979 - address += PMD_SIZE) {
17980 - if (!test_bit(sync_index(address), insync)) {
17981 - unsigned long flags;
17982 - struct page *page;
17984 - spin_lock_irqsave(&pgd_lock, flags);
17985 - /* XEN: failure path assumes non-empty pgd_list. */
17986 - if (unlikely(list_empty(&pgd_list))) {
17987 - spin_unlock_irqrestore(&pgd_lock, flags);
17990 - list_for_each_entry(page, &pgd_list, lru) {
17991 - if (!vmalloc_sync_one(page_address(page),
17995 - spin_unlock_irqrestore(&pgd_lock, flags);
17997 - set_bit(sync_index(address), insync);
17998 + for (; address < hypervisor_virt_start; address += PMD_SIZE) {
17999 + unsigned long flags;
18000 + struct page *page;
18002 + spin_lock_irqsave(&pgd_lock, flags);
18003 + list_for_each_entry(page, &pgd_list, lru) {
18004 + if (!vmalloc_sync_one(page_address(page),
18008 - if (address == start && test_bit(sync_index(address), insync))
18009 - start = address + PMD_SIZE;
18010 + spin_unlock_irqrestore(&pgd_lock, flags);
18012 #else /* CONFIG_X86_64 */
18014 - * Note that races in the updates of insync and start aren't
18015 - * problematic: insync can only get set bits added, and updates to
18016 - * start are only improving performance (without affecting correctness
18019 - static DECLARE_BITMAP(insync, PTRS_PER_PGD);
18020 - static unsigned long start = VMALLOC_START & PGDIR_MASK;
18021 + unsigned long start = VMALLOC_START & PGDIR_MASK;
18022 unsigned long address;
18024 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
18025 - if (!test_bit(pgd_index(address), insync)) {
18026 - const pgd_t *pgd_ref = pgd_offset_k(address);
18027 - unsigned long flags;
18028 - struct page *page;
18030 - if (pgd_none(*pgd_ref))
18032 - spin_lock_irqsave(&pgd_lock, flags);
18033 - list_for_each_entry(page, &pgd_list, lru) {
18035 - pgd = (pgd_t *)page_address(page) + pgd_index(address);
18036 - if (pgd_none(*pgd))
18037 - set_pgd(pgd, *pgd_ref);
18039 - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18041 - spin_unlock_irqrestore(&pgd_lock, flags);
18042 - set_bit(pgd_index(address), insync);
18043 + const pgd_t *pgd_ref = pgd_offset_k(address);
18044 + unsigned long flags;
18045 + struct page *page;
18047 + if (pgd_none(*pgd_ref))
18049 + spin_lock_irqsave(&pgd_lock, flags);
18050 + list_for_each_entry(page, &pgd_list, lru) {
18052 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
18053 + if (pgd_none(*pgd))
18054 + set_pgd(pgd, *pgd_ref);
18056 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18058 - if (address == start)
18059 - start = address + PGDIR_SIZE;
18060 + spin_unlock_irqrestore(&pgd_lock, flags);
18064 --- sle11-2009-10-16.orig/arch/x86/mm/hypervisor.c 2009-05-14 11:18:39.000000000 +0200
18065 +++ sle11-2009-10-16/arch/x86/mm/hypervisor.c 2009-06-04 10:21:39.000000000 +0200
18066 @@ -709,6 +709,72 @@ void xen_destroy_contiguous_region(unsig
18068 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
18070 +int __init early_create_contiguous_region(unsigned long pfn,
18071 + unsigned int order,
18072 + unsigned int address_bits)
18074 + unsigned long *in_frames = discontig_frames, out_frame = pfn;
18077 + struct xen_memory_exchange exchange = {
18079 + .nr_extents = 1UL << order,
18080 + .extent_order = 0,
18081 + .domid = DOMID_SELF
18085 + .extent_order = order,
18086 + .address_bits = address_bits,
18087 + .domid = DOMID_SELF
18091 + if (xen_feature(XENFEAT_auto_translated_physmap))
18094 + if (unlikely(order > MAX_CONTIG_ORDER))
18097 + for (i = 0; i < (1U << order); ++i) {
18098 + in_frames[i] = pfn_to_mfn(pfn + i);
18099 + set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY);
18102 + set_xen_guest_handle(exchange.in.extent_start, in_frames);
18103 + set_xen_guest_handle(exchange.out.extent_start, &out_frame);
18105 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
18106 + success = (exchange.nr_exchanged == (1UL << order));
18107 + BUG_ON(!success && (exchange.nr_exchanged || !rc));
18108 + BUG_ON(success && rc);
18109 +#if CONFIG_XEN_COMPAT <= 0x030002
18110 + if (unlikely(rc == -ENOSYS)) {
18111 + /* Compatibility when XENMEM_exchange is unavailable. */
18112 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
18113 + &exchange.in) != (1UL << order))
18115 + success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
18116 + &exchange.out) == 1);
18118 + for (i = 0; i < (1U << order); ++i)
18119 + in_frames[i] = pfn + i;
18120 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
18121 + &exchange.in) != (1UL << order))
18127 + for (i = 0; i < (1U << order); ++i, ++out_frame) {
18129 + out_frame = in_frames[i];
18130 + set_phys_to_machine(pfn + i, out_frame);
18133 + return success ? 0 : -ENOMEM;
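
early_create_contiguous_region() trades the 2^order scattered machine frames behind pfn..pfn+2^order-1 for one machine-contiguous extent satisfying address_bits, using XENMEM_exchange with a decrease/populate fallback for pre-3.0.3 hypervisors. A hypothetical boot-time caller (not in this patch; buf is assumed page-aligned and 16 pages long):

        if (early_create_contiguous_region(__pa(buf) >> PAGE_SHIFT,
                                           4 /* order: 16 pages */,
                                           32 /* DMA-able below 4GB */))
                panic("could not set up early DMA region");
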
18136 static void undo_limit_pages(struct page *pages, unsigned int order)
18138 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
18139 @@ -875,42 +941,9 @@ int write_ldt_entry(struct desc_struct *
18140 return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
18143 -#define MAX_BATCHED_FULL_PTES 32
18145 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
18146 - unsigned long addr, unsigned long end, pgprot_t newprot,
18147 - int dirty_accountable)
18148 +int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
18151 - int rc = 0, i = 0;
18152 - mmu_update_t u[MAX_BATCHED_FULL_PTES];
18156 - if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
18159 - pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
18161 - if (pte_present(*pte)) {
18162 - pte_t ptent = pte_modify(*pte, newprot);
18164 - if (dirty_accountable && pte_dirty(ptent))
18165 - ptent = pte_mkwrite(ptent);
18166 - u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
18167 - | ((unsigned long)pte & ~PAGE_MASK)
18168 - | MMU_PT_UPDATE_PRESERVE_AD;
18169 - u[i].val = __pte_val(ptent);
18170 - if (++i == MAX_BATCHED_FULL_PTES) {
18171 - if ((rc = HYPERVISOR_mmu_update(
18172 - &u[0], i, NULL, DOMID_SELF)) != 0)
18177 - } while (pte++, addr += PAGE_SIZE, addr != end);
18179 - rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
18180 - pte_unmap_unlock(pte - 1, ptl);
18181 - BUG_ON(rc && rc != -ENOSYS);
18183 + maddr_t mach_gp = virt_to_machine(gdt + entry);
18184 + return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
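
write_gdt_entry() joins write_ldt_entry() in funnelling descriptor updates through HYPERVISOR_update_descriptor on the machine address of the slot, since a PV guest cannot write a live GDT directly. The vsyscall_64-xen.c hunk earlier in this patch is already a user:

        /* install the per-CPU segment via the helper instead of an
         * open-coded hypercall (from the vsyscall hunk above): */
        write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU,
                        &d, DESCTYPE_S);
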
18186 --- sle11-2009-10-16.orig/arch/x86/mm/init_32-xen.c 2009-03-16 16:38:05.000000000 +0100
18187 +++ sle11-2009-10-16/arch/x86/mm/init_32-xen.c 2009-06-04 10:21:39.000000000 +0200
18190 unsigned int __VMALLOC_RESERVE = 128 << 20;
18192 +unsigned long max_low_pfn_mapped;
18193 unsigned long max_pfn_mapped;
18195 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18196 @@ -61,6 +62,27 @@ unsigned long highstart_pfn, highend_pfn
18198 static noinline int do_test_wp_bit(void);
18201 +static unsigned long __initdata table_start;
18202 +static unsigned long __initdata table_end;
18203 +static unsigned long __initdata table_top;
18205 +static int __initdata after_init_bootmem;
18207 +static __init void *alloc_low_page(unsigned long *phys)
18209 + unsigned long pfn = table_end++;
18212 + if (pfn >= table_top)
18213 + panic("alloc_low_page: ran out of memory");
18215 + adr = __va(pfn * PAGE_SIZE);
18216 + memset(adr, 0, PAGE_SIZE);
18217 + *phys = pfn * PAGE_SIZE;
18222 * Creates a middle page table and puts a pointer to it in the
18223 * given global directory entry. This only returns the gd entry
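
Until bootmem is running (after_init_bootmem == 0), early page tables come from alloc_low_page(), a plain bump allocator over the pfn window [table_start, table_top) that find_early_table_space() reserves. The same allocator, reduced to runnable user-space C with a static arena standing in for the reserved window (illustrative only):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define NPAGES    16UL

static unsigned char arena[NPAGES][PAGE_SIZE];
static unsigned long table_end;                  /* next free page */
static unsigned long table_top = NPAGES;         /* end of the window */

/* mirror of alloc_low_page(): hand out the next page, zeroed */
static void *alloc_low_page(unsigned long *phys)
{
        unsigned long pfn = table_end++;

        if (pfn >= table_top)
                return NULL;    /* the kernel panics here instead */
        memset(arena[pfn], 0, PAGE_SIZE);
        *phys = pfn * PAGE_SIZE;
        return arena[pfn];
}

int main(void)
{
        unsigned long phys;
        void *p = alloc_low_page(&phys);

        printf("page at %p, phys offset %lu\n", p, phys);
        return 0;
}
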
18224 @@ -72,9 +94,12 @@ static pmd_t * __init one_md_table_init(
18227 #ifdef CONFIG_X86_PAE
18228 + unsigned long phys;
18229 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
18230 - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18232 + if (after_init_bootmem)
18233 + pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18235 + pmd_table = (pmd_t *)alloc_low_page(&phys);
18236 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
18237 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
18238 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
18239 @@ -101,12 +126,16 @@ static pte_t * __init one_page_table_ini
18241 pte_t *page_table = NULL;
18243 + if (after_init_bootmem) {
18244 #ifdef CONFIG_DEBUG_PAGEALLOC
18245 - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18246 + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18248 - if (!page_table) {
18252 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18254 + unsigned long phys;
18255 + page_table = (pte_t *)alloc_low_page(&phys);
18258 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
18259 @@ -167,24 +196,24 @@ static inline int is_kernel_text(unsigne
18260 * of max_low_pfn pages, by creating page tables starting from address
18263 -static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
18264 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
18265 + unsigned long start_pfn,
18266 + unsigned long end_pfn,
18269 int pgd_idx, pmd_idx, pte_ofs;
18274 + unsigned pages_2m = 0, pages_4k = 0;
18276 - unsigned long max_ram_pfn = xen_start_info->nr_pages;
18277 - if (max_ram_pfn > max_low_pfn)
18278 - max_ram_pfn = max_low_pfn;
18279 + if (!cpu_has_pse)
18282 - pgd_idx = pgd_index(PAGE_OFFSET);
18284 + pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18285 pgd = pgd_base + pgd_idx;
18287 - pmd_idx = pmd_index(PAGE_OFFSET);
18288 - pte_ofs = pte_index(PAGE_OFFSET);
18290 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
18293 @@ -198,10 +227,16 @@ static void __init kernel_physical_mappi
18295 pmd = one_md_table_init(pgd);
18297 - if (pfn >= max_low_pfn)
18299 + if (pfn >= end_pfn)
18301 +#ifdef CONFIG_X86_PAE
18302 + pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18304 - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
18308 + for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
18309 pmd++, pmd_idx++) {
18310 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
18312 @@ -211,13 +246,8 @@ static void __init kernel_physical_mappi
18314 * Map with big pages if possible, otherwise
18315 * create normal page tables:
18317 - * Don't use a large page for the first 2/4MB of memory
18318 - * because there are often fixed size MTRRs in there
18319 - * and overlapping MTRRs into large pages can cause
18322 - if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
18324 unsigned int addr2;
18325 pgprot_t prot = PAGE_KERNEL_LARGE;
18327 @@ -228,49 +258,35 @@ static void __init kernel_physical_mappi
18328 is_kernel_text(addr2))
18329 prot = PAGE_KERNEL_LARGE_EXEC;
18332 set_pmd(pmd, pfn_pmd(pfn, prot));
18334 pfn += PTRS_PER_PTE;
18335 - max_pfn_mapped = pfn;
18338 pte = one_page_table_init(pmd);
18340 - for (pte += pte_ofs;
18341 - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
18342 + pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18344 + for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
18345 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
18346 pgprot_t prot = PAGE_KERNEL;
18348 /* XEN: Only map initial RAM allocation. */
18349 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
18350 + if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
18352 if (is_kernel_text(addr))
18353 prot = PAGE_KERNEL_EXEC;
18356 set_pte(pte, pfn_pte(pfn, prot));
18358 - max_pfn_mapped = pfn;
18363 + update_page_count(PG_LEVEL_2M, pages_2m);
18364 + update_page_count(PG_LEVEL_4K, pages_4k);
18367 -#ifndef CONFIG_XEN
18369 -static inline int page_kills_ppro(unsigned long pagenr)
18371 - if (pagenr >= 0x70000 && pagenr <= 0x7003F)
18378 -#define page_kills_ppro(p) 0
18383 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
18384 * is valid. The argument is a physical page number.
18385 @@ -331,30 +347,63 @@ static void __init permanent_kmaps_init(
18386 pkmap_page_table = pte;
18389 -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
18390 +static void __init add_one_highpage_init(struct page *page, int pfn)
18392 - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
18393 - ClearPageReserved(page);
18394 - init_page_count(page);
18395 - if (pfn < xen_start_info->nr_pages)
18396 - __free_page(page);
18397 - totalhigh_pages++;
18399 - SetPageReserved(page);
18400 + ClearPageReserved(page);
18401 + init_page_count(page);
18402 + if (pfn < xen_start_info->nr_pages)
18403 + __free_page(page);
18404 + totalhigh_pages++;
18407 +struct add_highpages_data {
18408 + unsigned long start_pfn;
18409 + unsigned long end_pfn;
18412 +static int __init add_highpages_work_fn(unsigned long start_pfn,
18413 + unsigned long end_pfn, void *datax)
18416 + struct page *page;
18417 + unsigned long final_start_pfn, final_end_pfn;
18418 + struct add_highpages_data *data;
18420 + data = (struct add_highpages_data *)datax;
18422 + final_start_pfn = max(start_pfn, data->start_pfn);
18423 + final_end_pfn = min(end_pfn, data->end_pfn);
18424 + if (final_start_pfn >= final_end_pfn)
18427 + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
18429 + if (!pfn_valid(node_pfn))
18431 + page = pfn_to_page(node_pfn);
18432 + add_one_highpage_init(page, node_pfn);
18439 +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
18440 + unsigned long end_pfn)
18442 + struct add_highpages_data data;
18444 + data.start_pfn = start_pfn;
18445 + data.end_pfn = end_pfn;
18447 + work_with_active_regions(nid, add_highpages_work_fn, &data);
18450 #ifndef CONFIG_NUMA
18451 -static void __init set_highmem_pages_init(int bad_ppro)
18452 +static void __init set_highmem_pages_init(void)
18455 + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
18457 - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
18459 - * Holes under sparsemem might not have no mem_map[]:
18461 - if (pfn_valid(pfn))
18462 - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
18464 totalram_pages += totalhigh_pages;
18466 #endif /* !CONFIG_NUMA */
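
add_highpages_with_active_regions() above packs its pfn window into struct add_highpages_data so that work_with_active_regions() can call the worker once per active region, and the worker clips each region against the window. The clipping on its own, as runnable C (the region list is made up):

#include <stdio.h>

struct range { unsigned long start, end; };

/* clip each region to [win_start, win_end), as add_highpages_work_fn()
 * does with final_start_pfn/final_end_pfn */
static void for_each_clipped(const struct range *r, int n,
                             unsigned long win_start, unsigned long win_end)
{
        for (int i = 0; i < n; i++) {
                unsigned long s = r[i].start > win_start ? r[i].start : win_start;
                unsigned long e = r[i].end < win_end ? r[i].end : win_end;

                if (s >= e)
                        continue;       /* region outside the window */
                printf("work on pfns [%lu, %lu)\n", s, e);
        }
}

int main(void)
{
        struct range regions[] = { { 0, 256 }, { 512, 1024 } };

        for_each_clipped(regions, 2, 128, 768);
        return 0;
}
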
18467 @@ -362,24 +411,11 @@ static void __init set_highmem_pages_ini
18469 # define kmap_init() do { } while (0)
18470 # define permanent_kmaps_init(pgd_base) do { } while (0)
18471 -# define set_highmem_pages_init(bad_ppro) do { } while (0)
18472 +# define set_highmem_pages_init() do { } while (0)
18473 #endif /* CONFIG_HIGHMEM */
18475 -pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
18476 -EXPORT_SYMBOL(__PAGE_KERNEL);
18478 -pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
18480 pgd_t *swapper_pg_dir;
18482 -static void __init xen_pagetable_setup_start(pgd_t *base)
18486 -static void __init xen_pagetable_setup_done(pgd_t *base)
18491 * Build a proper pagetable for the kernel mappings. Up until this
18492 * point, we've been running on some set of pagetables constructed by
18493 @@ -399,27 +435,10 @@ static void __init xen_pagetable_setup_d
18494 * be partially populated, and so it avoids stomping on any existing
18497 -static void __init pagetable_init(void)
18498 +static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
18500 - pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18501 unsigned long vaddr, end;
18503 - xen_pagetable_setup_start(pgd_base);
18505 - /* Enable PSE if available */
18507 - set_in_cr4(X86_CR4_PSE);
18509 - /* Enable PGE if available */
18510 - if (cpu_has_pge) {
18511 - set_in_cr4(X86_CR4_PGE);
18512 - __PAGE_KERNEL |= _PAGE_GLOBAL;
18513 - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
18516 - kernel_physical_mapping_init(pgd_base);
18517 - remap_numa_kva();
18520 * Fixed mappings, only the page table structure has to be
18521 * created - mappings will be set by set_fixmap():
18522 @@ -429,10 +448,13 @@ static void __init pagetable_init(void)
18523 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
18524 page_table_range_init(vaddr, end, pgd_base);
18525 early_ioremap_reset();
18528 - permanent_kmaps_init(pgd_base);
18529 +static void __init pagetable_init(void)
18531 + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18533 - xen_pagetable_setup_done(pgd_base);
18534 + permanent_kmaps_init(pgd_base);
18537 #if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
18538 @@ -475,7 +497,7 @@ void zap_low_mappings(void)
18542 -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
18543 +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
18544 EXPORT_SYMBOL_GPL(__supported_pte_mask);
18546 #ifdef CONFIG_X86_PAE
18547 @@ -528,42 +550,364 @@ static void __init set_nx(void)
18551 +/* user-defined highmem size */
18552 +static unsigned int highmem_pages = -1;
18555 - * paging_init() sets up the page tables - note that the first 8MB are
18556 - * already mapped by head.S.
18558 - * This routines also unmaps the page at virtual kernel address 0, so
18559 - * that we can trap those pesky NULL-reference errors in the kernel.
18560 + * highmem=size forces highmem to be exactly 'size' bytes.
18561 + * This works even on boxes that have no highmem otherwise.
18562 + * This also works to reduce highmem size on bigger boxes.
18564 -void __init paging_init(void)
18565 +static int __init parse_highmem(char *arg)
18570 + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
18573 +early_param("highmem", parse_highmem);
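
parse_highmem() is wired up with early_param(), so a highmem=512M command-line option takes effect before the direct mapping is sized; memparse() accepts the usual K/M/G suffixes. A user-space stand-in for that parsing (an illustrative reimplementation, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

/* minimal stand-in for memparse(): number plus optional K/M/G suffix */
static unsigned long long memparse_demo(const char *s)
{
        char *end;
        unsigned long long v = strtoull(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10;
        }
        return v;
}

int main(void)
{
        /* highmem=512M -> bytes -> 4K pages, as parse_highmem() does */
        printf("%llu pages\n", memparse_demo("512M") >> 12);
        return 0;
}
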
18576 + * Determine low and high memory ranges:
18578 +void __init find_low_pfn_range(void)
18580 + /* it could update max_pfn */
18582 + /* max_low_pfn is 0, we already have early_res support */
18584 + max_low_pfn = max_pfn;
18585 + if (max_low_pfn > MAXMEM_PFN) {
18586 + if (highmem_pages == -1)
18587 + highmem_pages = max_pfn - MAXMEM_PFN;
18588 + if (highmem_pages + MAXMEM_PFN < max_pfn)
18589 + max_pfn = MAXMEM_PFN + highmem_pages;
18590 + if (highmem_pages + MAXMEM_PFN > max_pfn) {
18591 + printk(KERN_WARNING "only %luMB highmem pages "
18592 + "available, ignoring highmem size of %uMB.\n",
18593 + pages_to_mb(max_pfn - MAXMEM_PFN),
18594 + pages_to_mb(highmem_pages));
18595 + highmem_pages = 0;
18597 + max_low_pfn = MAXMEM_PFN;
18598 +#ifndef CONFIG_HIGHMEM
18599 + /* Maximum memory usable is what is directly addressable */
18600 + printk(KERN_WARNING "Warning only %ldMB will be used.\n",
18602 + if (max_pfn > MAX_NONPAE_PFN)
18603 + printk(KERN_WARNING
18604 + "Use a HIGHMEM64G enabled kernel.\n");
18606 + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
18607 + max_pfn = MAXMEM_PFN;
18608 +#else /* !CONFIG_HIGHMEM */
18609 +#ifndef CONFIG_HIGHMEM64G
18610 + if (max_pfn > MAX_NONPAE_PFN) {
18611 + max_pfn = MAX_NONPAE_PFN;
18612 + printk(KERN_WARNING "Warning only 4GB will be used."
18613 + "Use a HIGHMEM64G enabled kernel.\n");
18615 +#endif /* !CONFIG_HIGHMEM64G */
18616 +#endif /* !CONFIG_HIGHMEM */
18618 + if (highmem_pages == -1)
18619 + highmem_pages = 0;
18620 +#ifdef CONFIG_HIGHMEM
18621 + if (highmem_pages >= max_pfn) {
18622 + printk(KERN_ERR "highmem size specified (%uMB) is "
18623 + "bigger than pages available (%luMB)!.\n",
18624 + pages_to_mb(highmem_pages),
18625 + pages_to_mb(max_pfn));
18626 + highmem_pages = 0;
18628 + if (highmem_pages) {
18629 + if (max_low_pfn - highmem_pages <
18630 + 64*1024*1024/PAGE_SIZE){
18631 + printk(KERN_ERR "highmem size %uMB results in "
18632 + "smaller than 64MB lowmem, ignoring it.\n"
18633 + , pages_to_mb(highmem_pages));
18634 + highmem_pages = 0;
18636 + max_low_pfn -= highmem_pages;
18639 + if (highmem_pages)
18640 + printk(KERN_ERR "ignoring highmem size on non-highmem"
18646 +#ifndef CONFIG_NEED_MULTIPLE_NODES
18647 +void __init initmem_init(unsigned long start_pfn,
18648 + unsigned long end_pfn)
18650 +#ifdef CONFIG_HIGHMEM
18651 + highstart_pfn = highend_pfn = max_pfn;
18652 + if (max_pfn > max_low_pfn)
18653 + highstart_pfn = max_low_pfn;
18654 + memory_present(0, 0, highend_pfn);
18655 + e820_register_active_regions(0, 0, highend_pfn);
18656 + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
18657 + pages_to_mb(highend_pfn - highstart_pfn));
18658 + num_physpages = highend_pfn;
18659 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
18661 + memory_present(0, 0, max_low_pfn);
18662 + e820_register_active_regions(0, 0, max_low_pfn);
18663 + num_physpages = max_low_pfn;
18664 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
18666 +#ifdef CONFIG_FLATMEM
18667 + max_mapnr = num_physpages;
18669 + printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
18670 + pages_to_mb(max_low_pfn));
18672 + setup_bootmem_allocator();
18674 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
18676 +static void __init zone_sizes_init(void)
18678 + unsigned long max_zone_pfns[MAX_NR_ZONES];
18679 + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
18680 + max_zone_pfns[ZONE_DMA] =
18681 + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
18682 + max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
18683 +#ifdef CONFIG_HIGHMEM
18684 + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
18687 + free_area_init_nodes(max_zone_pfns);
18690 +void __init setup_bootmem_allocator(void)
18693 + unsigned long bootmap_size, bootmap;
18694 + unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
18696 + /*
18697 + * Initialize the boot-time allocator (with low memory only):
18698 + */
18699 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
18700 + bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
18701 + min(max_pfn_mapped, xen_start_info->nr_pages)<<PAGE_SHIFT,
18702 + bootmap_size, PAGE_SIZE);
18703 + if (bootmap == -1L)
18704 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
18705 + reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
18707 + /* don't touch min_low_pfn */
18708 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
18709 + min_low_pfn, end_pfn);
18710 + printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
18711 + max_pfn_mapped<<PAGE_SHIFT);
18712 + printk(KERN_INFO " low ram: %08lx - %08lx\n",
18713 + min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
18714 + printk(KERN_INFO " bootmap %08lx - %08lx\n",
18715 + bootmap, bootmap + bootmap_size);
18716 + for_each_online_node(i)
18717 + free_bootmem_with_active_regions(i, end_pfn);
18718 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
18720 + after_init_bootmem = 1;
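
For reference, bootmem_bootmap_pages() just sizes a bitmap with one bit per page and rounds it up to whole pages. A stand-alone sketch of that calculation (PAGE_SHIFT = 12 assumed):

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

static unsigned long bootmem_bootmap_pages(unsigned long pages)
{
        unsigned long bytes = (pages + 7) / 8;  /* one bit per page */

        return (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
        unsigned long end_pfn = (896UL << 20) >> PAGE_SHIFT;

        printf("bitmap for %lu pages: %lu page(s)\n",
               end_pfn, bootmem_bootmap_pages(end_pfn));
        return 0;
}

So tracking 896MB of lowmem costs seven bitmap pages, which is what the find_e820_area() call above has to find room for.
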
18723 +static unsigned long __init extend_init_mapping(unsigned long tables_space)
18725 + unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
18726 + + xen_start_info->nr_pt_frames;
18727 + unsigned long start = start_pfn, va = (unsigned long)&_text;
18733 + /* Ensure init mappings cover kernel text/data and initial tables. */
18734 + while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
18735 + pgd = pgd_offset_k(va);
18736 + pud = pud_offset(pgd, va);
18737 + pmd = pmd_offset(pud, va);
18738 + if (pmd_none(*pmd)) {
18739 + unsigned long pa = start_pfn++ << PAGE_SHIFT;
18741 + memset(__va(pa), 0, PAGE_SIZE);
18742 + make_lowmem_page_readonly(__va(pa),
18743 + XENFEAT_writable_page_tables);
18744 + xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
18746 + pte = pte_offset_kernel(pmd, va);
18747 + if (pte_none(*pte)) {
18748 + pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
18750 + if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
18756 + /* Finally, blow away any spurious initial mappings. */
18758 + pgd = pgd_offset_k(va);
18759 + pud = pud_offset(pgd, va);
18760 + pmd = pmd_offset(pud, va);
18761 + if (pmd_none(*pmd))
18763 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
18768 + if (start_pfn > start)
18769 + reserve_early(start << PAGE_SHIFT,
18770 + start_pfn << PAGE_SHIFT, "INITMAP");
18772 + return start_pfn;
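
The walk in extend_init_mapping() is just indexing page tables with bit-fields of the virtual address. A toy illustration of how those indices fall out of the VA bits (512-entry tables and 4k/2M paging are assumptions for illustration):

#include <stdio.h>

#define PAGE_SHIFT      12
#define PMD_SHIFT       21
#define PTRS_PER_PTE    512
#define PTRS_PER_PMD    512

int main(void)
{
        unsigned long va = 0xc0400000UL;        /* sample kernel VA */

        printf("va %#lx: pmd index %lu, pte index %lu\n", va,
               (va >> PMD_SHIFT) & (PTRS_PER_PMD - 1),
               (va >> PAGE_SHIFT) & (PTRS_PER_PTE - 1));
        return 0;
}
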
18775 +static void __init find_early_table_space(unsigned long end)
18777 + unsigned long puds, pmds, ptes, tables;
18779 + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
18780 + tables = PAGE_ALIGN(puds * sizeof(pud_t));
18782 + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
18783 + tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
18785 + if (cpu_has_pse) {
18786 + unsigned long extra;
18788 + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
18789 + extra += PMD_SIZE;
18790 + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
18792 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
18794 + tables += PAGE_ALIGN(ptes * sizeof(pte_t));
18797 + tables += PAGE_SIZE
18798 + * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK)
18799 + - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK))
18802 + table_start = extend_init_mapping(tables);
18804 + table_end = table_start;
18805 + table_top = table_start + (tables>>PAGE_SHIFT);
18807 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
18808 + end, table_start << PAGE_SHIFT,
18809 + (table_start << PAGE_SHIFT) + tables);
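
The sizing logic above can be checked in isolation; this stand-alone sketch reproduces the worst-case arithmetic (8-byte PAE entries and the shift values are assumptions for illustration):

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PMD_SHIFT       21
#define PUD_SHIFT       30
#define PMD_SIZE        (1UL << PMD_SHIFT)
#define PUD_SIZE        (1UL << PUD_SHIFT)
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long end = 896UL << 20;        /* map the first 896MB */
        unsigned long puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        unsigned long pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long tables = PAGE_ALIGN(puds * 8) +
                               PAGE_ALIGN(pmds * 8) +
                               PAGE_ALIGN(ptes * 8);

        printf("worst case: %lu KB of page tables for %lu MB\n",
               tables >> 10, end >> 20);
        return 0;
}

Note this is a worst case: once PSE is usable, most of the pte pages are never actually allocated.
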
18812 +unsigned long __init_refok init_memory_mapping(unsigned long start,
18813 + unsigned long end)
18815 + pgd_t *pgd_base = swapper_pg_dir;
18816 + unsigned long start_pfn, end_pfn;
18817 + unsigned long big_page_start;
18819 + /*
18820 + * Find space for the kernel direct mapping tables.
18821 + */
18822 + if (!after_init_bootmem)
18823 + find_early_table_space(end);
18825 #ifdef CONFIG_X86_PAE
18828 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
18831 + /* Enable PSE if available */
18832 + if (cpu_has_pse)
18833 + set_in_cr4(X86_CR4_PSE);
18835 + /* Enable PGE if available */
18836 + if (cpu_has_pge) {
18837 + set_in_cr4(X86_CR4_PGE);
18838 + __supported_pte_mask |= _PAGE_GLOBAL;
18842 + * Don't use a large page for the first 2/4MB of memory
18843 + * because there are often fixed size MTRRs in there
18844 + * and overlapping MTRRs into large pages can cause
18845 + * slowdowns.
18846 + */
18847 + big_page_start = PMD_SIZE;
18849 + if (start < big_page_start) {
18850 + start_pfn = start >> PAGE_SHIFT;
18851 + end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
18852 + } else {
18853 + /* head is not big-page aligned: map it with 4k pages */
18854 + start_pfn = start >> PAGE_SHIFT;
18855 + end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18856 + << (PMD_SHIFT - PAGE_SHIFT);
18858 + if (start_pfn < end_pfn)
18859 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
18861 + /* big page range */
18862 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18863 + << (PMD_SHIFT - PAGE_SHIFT);
18864 + if (start_pfn < (big_page_start >> PAGE_SHIFT))
18865 + start_pfn = big_page_start >> PAGE_SHIFT;
18866 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
18867 + if (start_pfn < end_pfn)
18868 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
18871 + /* tail is not big-page aligned: map it with 4k pages */
18872 + start_pfn = end_pfn;
18873 + if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
18874 + end_pfn = end >> PAGE_SHIFT;
18875 + if (start_pfn < end_pfn)
18876 + kernel_physical_mapping_init(pgd_base, start_pfn,
18880 + early_ioremap_page_table_range_init(pgd_base);
18882 + __flush_tlb_all();
18884 + if (!after_init_bootmem)
18885 + reserve_early(table_start << PAGE_SHIFT,
18886 + table_end << PAGE_SHIFT, "PGTABLE");
18888 + if (!after_init_bootmem)
18889 + early_memtest(start, end);
18891 + return end >> PAGE_SHIFT;
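
The head/body/tail structure above is worth seeing with concrete numbers; a small sketch of the split at PMD_SIZE boundaries (the addresses are illustrative):

#include <stdio.h>

#define PMD_SHIFT       21
#define PMD_SIZE        (1UL << PMD_SHIFT)

int main(void)
{
        unsigned long start = 0x00100000UL;     /* 1MB, not 2M-aligned */
        unsigned long end   = 0x7ff00000UL;     /* ~2GB, not 2M-aligned */
        unsigned long head_end = (start + PMD_SIZE - 1) & ~(PMD_SIZE - 1);
        unsigned long tail_start = end & ~(PMD_SIZE - 1);

        printf("4k head: %#lx - %#lx\n", start, head_end);
        printf("2M body: %#lx - %#lx\n", head_end, tail_start);
        printf("4k tail: %#lx - %#lx\n", tail_start, end);
        return 0;
}

Only the aligned middle is eligible for big pages; the unaligned head and tail fall back to 4k mappings.
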
18895 +/*
18896 + * paging_init() sets up the page tables - note that the first 8MB are
18897 + * already mapped by head.S.
18898 + *
18899 + * This routine also unmaps the page at virtual kernel address 0, so
18900 + * that we can trap those pesky NULL-reference errors in the kernel.
18901 + */
18902 +void __init paging_init(void)
18910 - /* Switch to the real shared_info page, and clear the
18912 - set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
18913 - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
18914 - memset(empty_zero_page, 0, sizeof(empty_zero_page));
18916 - /* Setup mapping of lower 1st MB */
18917 - for (i = 0; i < NR_FIX_ISAMAPS; i++)
18918 - if (is_initial_xendomain())
18919 - set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
18921 - __set_fixmap(FIX_ISAMAP_BEGIN - i,
18922 - virt_to_machine(empty_zero_page),
18924 + /*
18925 + * NOTE: at this point the bootmem allocator is fully available.
18926 + */
18928 + zone_sizes_init();
18932 @@ -598,7 +942,7 @@ static struct kcore_list kcore_mem, kcor
18933 void __init mem_init(void)
18935 int codesize, reservedpages, datasize, initsize;
18936 - int tmp, bad_ppro;
18941 @@ -606,19 +950,6 @@ void __init mem_init(void)
18942 #ifdef CONFIG_FLATMEM
18945 - bad_ppro = ppro_with_ram_bug();
18947 -#ifdef CONFIG_HIGHMEM
18948 - /* check that fixmap and pkmap do not overlap */
18949 - if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
18951 - "fixmap and kmap areas overlap - this will crash\n");
18952 - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
18953 - PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
18958 /* this will put all low memory onto the freelists */
18959 totalram_pages += free_all_bootmem();
18960 /* XEN: init and count low-mem pages outside initial allocation. */
18961 @@ -636,7 +967,7 @@ void __init mem_init(void)
18962 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
18965 - set_highmem_pages_init(bad_ppro);
18966 + set_highmem_pages_init();
18968 codesize = (unsigned long) &_etext - (unsigned long) &_text;
18969 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
18970 @@ -657,7 +988,6 @@ void __init mem_init(void)
18971 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
18974 -#if 1 /* double-sanity-check paranoia */
18975 printk(KERN_INFO "virtual kernel memory layout:\n"
18976 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
18977 #ifdef CONFIG_HIGHMEM
18978 @@ -698,7 +1028,6 @@ void __init mem_init(void)
18980 BUG_ON(VMALLOC_START > VMALLOC_END);
18981 BUG_ON((unsigned long)high_memory > VMALLOC_START);
18982 -#endif /* double-sanity-check paranoia */
18984 if (boot_cpu_data.wp_works_ok < 0)
18986 @@ -755,6 +1084,8 @@ void mark_rodata_ro(void)
18987 unsigned long start = PFN_ALIGN(_text);
18988 unsigned long size = PFN_ALIGN(_etext) - start;
18990 +#ifndef CONFIG_DYNAMIC_FTRACE
18991 + /* Dynamic tracing modifies the kernel text section */
18992 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
18993 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
18995 @@ -767,6 +1098,8 @@ void mark_rodata_ro(void)
18996 printk(KERN_INFO "Testing CPA: write protecting again\n");
18997 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
18999 +#endif /* CONFIG_DYNAMIC_FTRACE */
19002 size = (unsigned long)__end_rodata - start;
19003 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
19004 @@ -829,3 +1162,9 @@ void free_initrd_mem(unsigned long start
19005 free_init_pages("initrd memory", start, end);
19009 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
19012 + return reserve_bootmem(phys, len, flags);
19014 --- sle11-2009-10-16.orig/arch/x86/mm/init_64-xen.c 2009-03-16 16:38:05.000000000 +0100
19015 +++ sle11-2009-10-16/arch/x86/mm/init_64-xen.c 2009-06-04 10:21:39.000000000 +0200
19017 #include <linux/swap.h>
19018 #include <linux/smp.h>
19019 #include <linux/init.h>
19020 +#include <linux/initrd.h>
19021 #include <linux/pagemap.h>
19022 #include <linux/bootmem.h>
19023 #include <linux/proc_fs.h>
19026 #include <xen/features.h>
19028 +/*
19029 + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
19030 + * The direct mapping extends to max_pfn_mapped, so that we can directly access
19031 + * apertures, ACPI and other tables without having to play with fixmaps.
19032 + */
19033 +unsigned long max_low_pfn_mapped;
19034 +unsigned long max_pfn_mapped;
19036 #if CONFIG_XEN_COMPAT <= 0x030002
19037 unsigned int __kernel_page_user;
19038 EXPORT_SYMBOL(__kernel_page_user);
19039 @@ -60,13 +69,12 @@ EXPORT_SYMBOL(__kernel_page_user);
19042 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
19043 -extern unsigned long start_pfn;
19045 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
19046 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
19049 -int direct_gbpages __meminitdata
19050 +int direct_gbpages
19051 #ifdef CONFIG_DIRECT_GBPAGES
19054 @@ -145,55 +153,23 @@ void __meminit early_make_page_readonly(
19055 * around without checking the pgd every time.
19058 -void show_mem(void)
19060 - long i, total = 0, reserved = 0;
19061 - long shared = 0, cached = 0;
19062 - struct page *page;
19063 - pg_data_t *pgdat;
19065 - printk(KERN_INFO "Mem-info:\n");
19066 - show_free_areas();
19067 - for_each_online_pgdat(pgdat) {
19068 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
19070 - * This loop can take a while with 256 GB and
19071 - * 4k pages so defer the NMI watchdog:
19073 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
19074 - touch_nmi_watchdog();
19076 - if (!pfn_valid(pgdat->node_start_pfn + i))
19079 - page = pfn_to_page(pgdat->node_start_pfn + i);
19081 - if (PageReserved(page))
19083 - else if (PageSwapCache(page))
19085 - else if (page_count(page))
19086 - shared += page_count(page) - 1;
19089 - printk(KERN_INFO "%lu pages of RAM\n", total);
19090 - printk(KERN_INFO "%lu reserved pages\n", reserved);
19091 - printk(KERN_INFO "%lu pages shared\n", shared);
19092 - printk(KERN_INFO "%lu pages swap cached\n", cached);
19095 static unsigned long __meminitdata table_start;
19096 -static unsigned long __meminitdata table_end;
19097 +static unsigned long __meminitdata table_cur;
19098 +static unsigned long __meminitdata table_top;
19100 -static __init void *spp_getpage(void)
19101 +/*
19102 + * NOTE: This function is marked __ref because it calls an __init function
19103 + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
19104 + */
19105 +static __ref void *spp_getpage(void)
19110 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
19111 - else if (start_pfn < table_end) {
19112 - ptr = __va(start_pfn << PAGE_SHIFT);
19114 + else if (table_cur < table_top) {
19115 + ptr = __va(table_cur << PAGE_SHIFT);
19117 memset(ptr, 0, PAGE_SIZE);
19119 ptr = alloc_bootmem_pages(PAGE_SIZE);
19120 @@ -208,30 +184,18 @@ static __init void *spp_getpage(void)
19124 -#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
19125 -#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
19127 -static __init void
19128 -set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
19130 +set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
19135 - pte_t *pte, new_pte;
19137 - pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
19140 - pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
19141 - if (pgd_none(*pgd)) {
19143 - "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19146 - pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
19147 + pud = pud_page + pud_index(vaddr);
19148 if (pud_none(*pud)) {
19149 pmd = (pmd_t *) spp_getpage();
19150 make_page_readonly(pmd, XENFEAT_writable_page_tables);
19151 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19152 + pud_populate(&init_mm, pud, pmd);
19153 if (pmd != pmd_offset(pud, 0)) {
19154 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19155 pmd, pmd_offset(pud, 0));
19156 @@ -242,19 +206,20 @@ set_pte_phys(unsigned long vaddr, unsign
19157 if (pmd_none(*pmd)) {
19158 pte = (pte_t *) spp_getpage();
19159 make_page_readonly(pte, XENFEAT_writable_page_tables);
19160 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19161 + pmd_populate_kernel(&init_mm, pmd, pte);
19162 if (pte != pte_offset_kernel(pmd, 0)) {
19163 printk(KERN_ERR "PAGETABLE BUG #02!\n");
19167 - if (pgprot_val(prot))
19168 - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
19170 - new_pte = __pte(0);
19172 pte = pte_offset_kernel(pmd, vaddr);
19173 if (!pte_none(*pte) && __pte_val(new_pte) &&
19174 +#ifdef CONFIG_ACPI
19175 + /* __acpi_map_table() fails to properly call clear_fixmap() */
19176 + (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19177 + vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19179 __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19181 set_pte(pte, new_pte);
19182 @@ -266,15 +231,13 @@ set_pte_phys(unsigned long vaddr, unsign
19183 __flush_tlb_one(vaddr);
19186 -static __init void
19187 -set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
19189 +set_pte_vaddr(unsigned long vaddr, pte_t pteval)
19194 - pte_t *pte, new_pte;
19197 - pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
19198 + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
19200 pgd = pgd_offset_k(vaddr);
19201 if (pgd_none(*pgd)) {
19202 @@ -282,47 +245,51 @@ set_pte_phys_ma(unsigned long vaddr, uns
19203 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19206 - pud = pud_offset(pgd, vaddr);
19207 - if (pud_none(*pud)) {
19208 - pmd = (pmd_t *) spp_getpage();
19209 - make_page_readonly(pmd, XENFEAT_writable_page_tables);
19210 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19211 - if (pmd != pmd_offset(pud, 0)) {
19212 - printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19213 - pmd, pmd_offset(pud, 0));
19214 + pud_page = (pud_t *)pgd_page_vaddr(*pgd);
19215 + set_pte_vaddr_pud(pud_page, vaddr, pteval);
19218 +#ifndef CONFIG_XEN
19219 +/*
19220 + * Create large page table mappings for a range of physical addresses.
19221 + */
19222 +static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
19229 + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
19230 + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
19231 + pgd = pgd_offset_k((unsigned long)__va(phys));
19232 + if (pgd_none(*pgd)) {
19233 + pud = (pud_t *) spp_getpage();
19234 + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
19238 - pmd = pmd_offset(pud, vaddr);
19239 - if (pmd_none(*pmd)) {
19240 - pte = (pte_t *) spp_getpage();
19241 - make_page_readonly(pte, XENFEAT_writable_page_tables);
19242 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19243 - if (pte != pte_offset_kernel(pmd, 0)) {
19244 - printk(KERN_ERR "PAGETABLE BUG #02!\n");
19246 + pud = pud_offset(pgd, (unsigned long)__va(phys));
19247 + if (pud_none(*pud)) {
19248 + pmd = (pmd_t *) spp_getpage();
19249 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
19252 + pmd = pmd_offset(pud, phys);
19253 + BUG_ON(!pmd_none(*pmd));
19254 + set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
19256 - new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
19259 - pte = pte_offset_kernel(pmd, vaddr);
19260 - if (!pte_none(*pte) && __pte_val(new_pte) &&
19261 -#ifdef CONFIG_ACPI
19262 - /* __acpi_map_table() fails to properly call clear_fixmap() */
19263 - (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19264 - vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19266 - __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19268 - set_pte(pte, new_pte);
19269 +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
19271 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
19275 - * It's enough to flush this one mapping.
19276 - * (PGE mappings get flushed as well)
19278 - __flush_tlb_one(vaddr);
19279 +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
19281 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
19284 -#ifndef CONFIG_XEN
19286 * The head.S code sets up the kernel high mapping:
19288 @@ -352,63 +319,52 @@ void __init cleanup_highmap(void)
19292 -/* NOTE: this is meant to be run only at boot */
19293 -void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
19295 - unsigned long address = __fix_to_virt(idx);
19297 - if (idx >= __end_of_fixed_addresses) {
19298 - printk(KERN_ERR "Invalid __set_fixmap\n");
19302 - case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
19303 - set_pte_phys(address, phys, prot, 0);
19304 - set_pte_phys(address, phys, prot, 1);
19306 - case FIX_EARLYCON_MEM_BASE:
19307 - xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
19308 - pfn_pte_ma(phys >> PAGE_SHIFT, prot));
19311 - set_pte_phys_ma(address, phys, prot);
19316 -static __meminit void *alloc_static_page(unsigned long *phys)
19317 +static __ref void *alloc_low_page(unsigned long *phys)
19319 - unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
19320 + unsigned long pfn;
19323 if (after_bootmem) {
19324 - void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
19325 + adr = (void *)get_zeroed_page(GFP_ATOMIC);
19331 - *phys = start_pfn << PAGE_SHIFT;
19333 - memset((void *)va, 0, PAGE_SIZE);
19334 - return (void *)va;
19335 + BUG_ON(!table_cur);
19336 + pfn = table_cur++;
19337 + if (pfn >= table_top)
19338 + panic("alloc_low_page: ran out of memory");
19340 + adr = early_ioremap(pfn_to_mfn(pfn) * PAGE_SIZE, PAGE_SIZE);
19341 + memset(adr, 0, PAGE_SIZE);
19342 + *phys = pfn * PAGE_SIZE;
19346 -#define PTE_SIZE PAGE_SIZE
19347 +static __ref void unmap_low_page(void *adr)
19349 + if (after_bootmem)
19352 + early_iounmap(adr, PAGE_SIZE);
19355 static inline int __meminit make_readonly(unsigned long paddr)
19357 extern char __vsyscall_0;
19360 - /* Make new page tables read-only. */
19361 + /* Make new page tables read-only on the first pass. */
19362 if (!xen_feature(XENFEAT_writable_page_tables)
19363 + && !max_pfn_mapped
19364 && (paddr >= (table_start << PAGE_SHIFT))
19365 - && (paddr < (table_end << PAGE_SHIFT)))
19366 + && (paddr < (table_top << PAGE_SHIFT)))
19368 /* Make old page tables read-only. */
19369 if (!xen_feature(XENFEAT_writable_page_tables)
19370 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
19371 - && (paddr < (start_pfn << PAGE_SHIFT)))
19372 + && (paddr < (table_cur << PAGE_SHIFT)))
19376 @@ -425,118 +381,131 @@ static inline int __meminit make_readonl
19380 -#ifndef CONFIG_XEN
19381 -/* Must run before zap_low_mappings */
19382 -__meminit void *early_ioremap(unsigned long addr, unsigned long size)
19383 +static unsigned long __meminit
19384 +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
19386 - pmd_t *pmd, *last_pmd;
19387 - unsigned long vaddr;
19390 - pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19391 - vaddr = __START_KERNEL_map;
19392 - pmd = level2_kernel_pgt;
19393 - last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
19395 - for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
19396 - for (i = 0; i < pmds; i++) {
19397 - if (pmd_present(pmd[i]))
19398 - goto continue_outer_loop;
19400 - vaddr += addr & ~PMD_MASK;
19401 - addr &= PMD_MASK;
19402 + unsigned pages = 0;
19403 + unsigned long last_map_addr = end;
19406 + pte_t *pte = pte_page + pte_index(addr);
19408 + for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
19409 + unsigned long pteval = addr | __PAGE_KERNEL;
19411 - for (i = 0; i < pmds; i++, addr += PMD_SIZE)
19412 - set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
19413 - __flush_tlb_all();
19415 - return (void *)vaddr;
19416 -continue_outer_loop:
19418 + if (addr >= end ||
19419 + (!after_bootmem &&
19420 + (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages))
19423 + if (__pte_val(*pte))
19426 + if (make_readonly(addr))
19427 + pteval &= ~_PAGE_RW;
19429 + printk(" pte=%p addr=%lx pte=%016lx\n",
19430 + pte, addr, pteval);
19431 + if (!after_bootmem)
19432 + *pte = __pte(pteval & __supported_pte_mask);
19434 + set_pte(pte, __pte(pteval & __supported_pte_mask));
19435 + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
19438 - printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
19440 + update_page_count(PG_LEVEL_4K, pages);
19442 + return last_map_addr;
19446 - * To avoid virtual aliases later:
19448 -__meminit void early_iounmap(void *addr, unsigned long size)
19449 +static unsigned long __meminit
19450 +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
19452 - unsigned long vaddr;
19456 - vaddr = (unsigned long)addr;
19457 - pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19458 - pmd = level2_kernel_pgt + pmd_index(vaddr);
19460 - for (i = 0; i < pmds; i++)
19461 - pmd_clear(pmd + i);
19462 + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
19464 - __flush_tlb_all();
19465 + BUG_ON(!max_pfn_mapped);
19466 + return phys_pte_init(pte, address, end);
19470 static unsigned long __meminit
19471 -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
19472 +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
19473 + unsigned long page_size_mask)
19475 + unsigned long pages = 0;
19476 + unsigned long last_map_addr = end;
19477 + unsigned long start = address;
19479 int i = pmd_index(address);
19481 - for (; i < PTRS_PER_PMD; i++) {
19482 + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
19483 unsigned long pte_phys;
19484 - pmd_t *pmd = pmd_page + i;
19485 - pte_t *pte, *pte_save;
19487 + pmd_t *pmd = pmd_page + pmd_index(address);
19490 if (address >= end)
19493 if (__pmd_val(*pmd)) {
19494 - address += PMD_SIZE;
19495 + if (!pmd_large(*pmd)) {
19496 + spin_lock(&init_mm.page_table_lock);
19497 + last_map_addr = phys_pte_update(pmd, address,
19499 + spin_unlock(&init_mm.page_table_lock);
19501 + /* Count entries we're using from level2_ident_pgt */
19507 - pte = alloc_static_page(&pte_phys);
19509 - for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
19510 - unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
19512 - if (address >= (after_bootmem
19514 - : xen_start_info->nr_pages << PAGE_SHIFT))
19516 - else if (make_readonly(address))
19517 - pteval &= ~_PAGE_RW;
19518 - set_pte(pte, __pte(pteval & __supported_pte_mask));
19519 + if (page_size_mask & (1<<PG_LEVEL_2M)) {
19521 + spin_lock(&init_mm.page_table_lock);
19522 + set_pte((pte_t *)pmd,
19523 + pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19524 + spin_unlock(&init_mm.page_table_lock);
19525 + last_map_addr = (address & PMD_MASK) + PMD_SIZE;
19529 + pte = alloc_low_page(&pte_phys);
19530 + last_map_addr = phys_pte_init(pte, address, end);
19531 + unmap_low_page(pte);
19533 if (!after_bootmem) {
19534 - early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19535 - *pmd = __pmd(pte_phys | _KERNPG_TABLE);
19536 + if (max_pfn_mapped)
19537 + make_page_readonly(__va(pte_phys),
19538 + XENFEAT_writable_page_tables);
19539 + *pmd = __pmd(pte_phys | _PAGE_TABLE);
19541 - make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19542 - set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
19543 + make_page_readonly(pte, XENFEAT_writable_page_tables);
19544 + spin_lock(&init_mm.page_table_lock);
19545 + pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
19546 + spin_unlock(&init_mm.page_table_lock);
19550 + update_page_count(PG_LEVEL_2M, pages);
19551 + return last_map_addr;
19554 static unsigned long __meminit
19555 -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
19556 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
19557 + unsigned long page_size_mask)
19559 pmd_t *pmd = pmd_offset(pud, 0);
19560 unsigned long last_map_addr;
19562 - spin_lock(&init_mm.page_table_lock);
19563 - last_map_addr = phys_pmd_init(pmd, address, end);
19564 - spin_unlock(&init_mm.page_table_lock);
19565 + BUG_ON(!max_pfn_mapped);
19566 + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
19568 return last_map_addr;
19571 static unsigned long __meminit
19572 -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
19573 +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
19574 + unsigned long page_size_mask)
19576 + unsigned long pages = 0;
19577 unsigned long last_map_addr = end;
19578 int i = pud_index(addr);
19580 @@ -550,29 +519,55 @@ phys_pud_init(pud_t *pud_page, unsigned
19582 if (__pud_val(*pud)) {
19583 if (!pud_large(*pud))
19584 - last_map_addr = phys_pmd_update(pud, addr, end);
19585 + last_map_addr = phys_pmd_update(pud, addr, end,
19590 - if (direct_gbpages) {
19591 + if (page_size_mask & (1<<PG_LEVEL_1G)) {
19593 + spin_lock(&init_mm.page_table_lock);
19594 set_pte((pte_t *)pud,
19595 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19596 + spin_unlock(&init_mm.page_table_lock);
19597 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
19601 - pmd = alloc_static_page(&pmd_phys);
19603 - spin_lock(&init_mm.page_table_lock);
19604 - *pud = __pud(pmd_phys | _KERNPG_TABLE);
19605 - last_map_addr = phys_pmd_init(pmd, addr, end);
19606 - spin_unlock(&init_mm.page_table_lock);
19607 + pmd = alloc_low_page(&pmd_phys);
19608 + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
19609 + unmap_low_page(pmd);
19611 - early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
19612 + if (!after_bootmem) {
19613 + if (max_pfn_mapped)
19614 + make_page_readonly(__va(pmd_phys),
19615 + XENFEAT_writable_page_tables);
19616 + if (page_size_mask & (1 << PG_LEVEL_NUM))
19617 + xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
19619 + *pud = __pud(pmd_phys | _PAGE_TABLE);
19621 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
19622 + spin_lock(&init_mm.page_table_lock);
19623 + pud_populate(&init_mm, pud, __va(pmd_phys));
19624 + spin_unlock(&init_mm.page_table_lock);
19628 + update_page_count(PG_LEVEL_1G, pages);
19630 - return last_map_addr >> PAGE_SHIFT;
19631 + return last_map_addr;
19634 +static unsigned long __meminit
19635 +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
19636 + unsigned long page_size_mask)
19640 + pud = (pud_t *)pgd_page_vaddr(*pgd);
19642 + return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
19645 void __init xen_init_pt(void)
19646 @@ -651,86 +646,36 @@ void __init xen_init_pt(void)
19650 -static void __init extend_init_mapping(unsigned long tables_space)
19652 - unsigned long va = __START_KERNEL_map;
19653 - unsigned long start = start_pfn;
19654 - unsigned long phys, addr, *pte_page;
19656 - pte_t *pte, new_pte;
19657 - unsigned long *page = (unsigned long *)init_level4_pgt;
19659 - addr = page[pgd_index(va)];
19660 - addr_to_page(addr, page);
19661 - addr = page[pud_index(va)];
19662 - addr_to_page(addr, page);
19664 - /* Kill mapping of low 1MB. */
19665 - while (va < (unsigned long)&_text) {
19666 - if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
19671 - /* Ensure init mappings cover kernel text/data and initial tables. */
19672 - while (va < (__START_KERNEL_map
19673 - + (start_pfn << PAGE_SHIFT)
19674 - + tables_space)) {
19675 - pmd = (pmd_t *)&page[pmd_index(va)];
19676 - if (pmd_none(*pmd)) {
19677 - pte_page = alloc_static_page(&phys);
19678 - early_make_page_readonly(
19679 - pte_page, XENFEAT_writable_page_tables);
19680 - set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
19682 - addr = page[pmd_index(va)];
19683 - addr_to_page(addr, pte_page);
19685 - pte = (pte_t *)&pte_page[pte_index(va)];
19686 - if (pte_none(*pte)) {
19687 - new_pte = pfn_pte(
19688 - (va - __START_KERNEL_map) >> PAGE_SHIFT,
19689 - __pgprot(_KERNPG_TABLE));
19690 - xen_l1_entry_update(pte, new_pte);
19695 - /* Finally, blow away any spurious initial mappings. */
19697 - pmd = (pmd_t *)&page[pmd_index(va)];
19698 - if (pmd_none(*pmd))
19700 - if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
19705 - if (start_pfn > start)
19706 - reserve_early(start << PAGE_SHIFT,
19707 - start_pfn << PAGE_SHIFT, "INITMAP");
19710 static void __init find_early_table_space(unsigned long end)
19712 unsigned long puds, pmds, ptes, tables;
19714 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
19715 + tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
19716 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
19717 - ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
19718 + tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
19720 - tables = round_up(puds * 8, PAGE_SIZE) +
19721 - round_up(pmds * 8, PAGE_SIZE) +
19722 - round_up(ptes * 8, PAGE_SIZE);
19723 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
19724 + tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
19726 - extend_init_mapping(tables);
19727 + if (!table_top) {
19728 + table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
19729 + xen_start_info->nr_pt_frames;
19730 + table_cur = table_start;
19733 + * [table_start, table_top) gets passed to reserve_early(),
19734 + * so we must not use table_cur here, despite continuing
19735 + * to allocate from there. table_cur possibly being below
19736 + * table_start is, on the other hand, not a problem.
19738 + table_start = table_top;
19741 - table_start = start_pfn;
19742 - table_end = table_start + (tables>>PAGE_SHIFT);
19743 + table_top = table_cur + (tables >> PAGE_SHIFT);
19745 - early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
19746 - end, table_start << PAGE_SHIFT,
19747 - (table_start << PAGE_SHIFT) + tables);
19748 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
19749 + end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT);
19752 static void __init xen_finish_init_mapping(void)
19753 @@ -752,18 +697,18 @@ static void __init xen_finish_init_mappi
19754 xen_start_info->mod_start = (unsigned long)
19755 __va(__pa(xen_start_info->mod_start));
19757 - /* Destroy the Xen-created mappings beyond the kernel image as
19758 - * well as the temporary mappings created above. Prevents
19759 - * overlap with modules area (if init mapping is very big).
19761 + /* Destroy the Xen-created mappings beyond the kernel image. */
19762 start = PAGE_ALIGN((unsigned long)_end);
19763 - end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
19764 + end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
19765 for (; start < end; start += PAGE_SIZE)
19766 if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
19769 - /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
19770 - table_end = ~0UL;
19771 + /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
19772 + start = table_top;
19773 + WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n",
19774 + table_start, table_cur, start);
19775 + table_top = ~0UL;
19777 /* Switch to the real shared_info page, and clear the dummy page. */
19778 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
19779 @@ -780,8 +725,7 @@ static void __init xen_finish_init_mappi
19783 - /* Disable the 'start_pfn' allocator. */
19784 - table_end = start_pfn;
19785 + table_top = max(table_cur, start);
19788 static void __init init_gbpages(void)
19789 @@ -794,126 +738,91 @@ static void __init init_gbpages(void)
19793 -#ifdef CONFIG_MEMTEST_BOOTPARAM
19795 -static void __init memtest(unsigned long start_phys, unsigned long size,
19796 - unsigned pattern)
19797 +static unsigned long __init kernel_physical_mapping_init(unsigned long start,
19798 + unsigned long end,
19799 + unsigned long page_size_mask)
19802 - unsigned long *start;
19803 - unsigned long start_bad;
19804 - unsigned long last_bad;
19805 - unsigned long val;
19806 - unsigned long start_phys_aligned;
19807 - unsigned long count;
19808 - unsigned long incr;
19810 - switch (pattern) {
19818 - val = 0x5555555555555555UL;
19821 - val = 0xaaaaaaaaaaaaaaaaUL;
19827 - incr = sizeof(unsigned long);
19828 - start_phys_aligned = ALIGN(start_phys, incr);
19829 - count = (size - (start_phys_aligned - start_phys))/incr;
19830 - start = __va(start_phys_aligned);
19834 - for (i = 0; i < count; i++)
19836 - for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
19837 - if (*start != val) {
19838 - if (start_phys_aligned == last_bad + incr) {
19839 - last_bad += incr;
19842 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19843 - val, start_bad, last_bad + incr);
19844 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19846 - start_bad = last_bad = start_phys_aligned;
19851 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19852 - val, start_bad, last_bad + incr);
19853 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19857 + unsigned long next, last_map_addr = end;
19859 -static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
19860 + start = (unsigned long)__va(start);
19861 + end = (unsigned long)__va(end);
19863 -static int __init parse_memtest(char *arg)
19866 - memtest_pattern = simple_strtoul(arg, NULL, 0);
19869 + for (; start < end; start = next) {
19870 + pgd_t *pgd = pgd_offset_k(start);
19871 + unsigned long pud_phys;
19874 -early_param("memtest", parse_memtest);
19875 + next = (start + PGDIR_SIZE) & PGDIR_MASK;
19879 -static void __init early_memtest(unsigned long start, unsigned long end)
19881 - u64 t_start, t_size;
19882 - unsigned pattern;
19883 + if (__pgd_val(*pgd)) {
19884 + last_map_addr = phys_pud_update(pgd, __pa(start),
19885 + __pa(end), page_size_mask);
19889 - if (!memtest_pattern)
19891 + pud = alloc_low_page(&pud_phys);
19892 + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
19894 + unmap_low_page(pud);
19896 + if (!after_bootmem) {
19897 + if (max_pfn_mapped)
19898 + make_page_readonly(__va(pud_phys),
19899 + XENFEAT_writable_page_tables);
19900 + xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
19902 + make_page_readonly(pud, XENFEAT_writable_page_tables);
19903 + spin_lock(&init_mm.page_table_lock);
19904 + pgd_populate(&init_mm, pgd, __va(pud_phys));
19905 + spin_unlock(&init_mm.page_table_lock);
19909 - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
19910 - for (pattern = 0; pattern < memtest_pattern; pattern++) {
19913 - while (t_start < end) {
19914 - t_start = find_e820_area_size(t_start, &t_size, 1);
19915 + return last_map_addr;
19919 - if (t_start >= end)
19921 - if (t_start + t_size > end)
19922 - t_size = end - t_start;
19923 +struct map_range {
19924 + unsigned long start;
19925 + unsigned long end;
19926 + unsigned page_size_mask;
19929 - printk(KERN_CONT "\n %016llx - %016llx pattern %d",
19930 - (unsigned long long)t_start,
19931 - (unsigned long long)t_start + t_size, pattern);
19932 +#define NR_RANGE_MR 5
19934 - memtest(t_start, t_size, pattern);
19935 +static int save_mr(struct map_range *mr, int nr_range,
19936 + unsigned long start_pfn, unsigned long end_pfn,
19937 + unsigned long page_size_mask)
19940 - t_start += t_size;
19942 + if (start_pfn < end_pfn) {
19943 + if (nr_range >= NR_RANGE_MR)
19944 + panic("run out of range for init_memory_mapping\n");
19945 + mr[nr_range].start = start_pfn<<PAGE_SHIFT;
19946 + mr[nr_range].end = end_pfn<<PAGE_SHIFT;
19947 + mr[nr_range].page_size_mask = page_size_mask;
19950 - printk(KERN_CONT "\n");
19953 -static void __init early_memtest(unsigned long start, unsigned long end)
19961 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
19962 * This runs before bootmem is initialized and gets pages directly from
19963 * the physical memory. To access them they are temporarily mapped.
19965 -unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
19966 +unsigned long __init_refok init_memory_mapping(unsigned long start,
19967 + unsigned long end)
19969 - unsigned long next, last_map_addr = end;
19970 - unsigned long start_phys = start, end_phys = end;
19971 + unsigned long last_map_addr = 0;
19972 + unsigned long page_size_mask = 0;
19973 + unsigned long start_pfn, end_pfn;
19975 + struct map_range mr[NR_RANGE_MR];
19978 printk(KERN_INFO "init_memory_mapping\n");
19980 @@ -924,51 +833,150 @@ unsigned long __init_refok init_memory_m
19981 * memory mapped. Unfortunately this is done currently before the
19982 * nodes are discovered.
19984 - if (!after_bootmem) {
19985 + if (!after_bootmem)
19987 - find_early_table_space(end);
19989 + if (direct_gbpages)
19990 + page_size_mask |= 1 << PG_LEVEL_1G;
19992 + page_size_mask |= 1 << PG_LEVEL_2M;
19994 + memset(mr, 0, sizeof(mr));
19997 + /* head is not big-page aligned: map it with 4k pages */
19998 + start_pfn = start >> PAGE_SHIFT;
19999 + end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
20000 + << (PMD_SHIFT - PAGE_SHIFT);
20001 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
20003 + /* big page (2M) range */
20004 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
20005 + << (PMD_SHIFT - PAGE_SHIFT);
20006 + end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
20007 + << (PUD_SHIFT - PAGE_SHIFT);
20008 + if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
20009 + end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
20010 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20011 + page_size_mask & (1<<PG_LEVEL_2M));
20013 + /* big page (1G) range */
20014 + start_pfn = end_pfn;
20015 + end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
20016 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20018 + ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
20020 + /* tail is not 1G-aligned: map the remainder with 2M pages */
20021 + start_pfn = end_pfn;
20022 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
20023 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20024 + page_size_mask & (1<<PG_LEVEL_2M));
20026 + /* tail is not 2M-aligned: map the remainder with 4k pages */
20027 + start_pfn = end_pfn;
20028 + end_pfn = end>>PAGE_SHIFT;
20029 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
20031 + /* try to merge contiguous ranges with the same page size */
20032 + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
20033 + unsigned long old_start;
20034 + if (mr[i].end != mr[i+1].start ||
20035 + mr[i].page_size_mask != mr[i+1].page_size_mask)
20038 + old_start = mr[i].start;
20039 + memmove(&mr[i], &mr[i+1],
20040 + (nr_range - 1 - i) * sizeof(struct map_range));
20041 + mr[i--].start = old_start;
20045 - start = (unsigned long)__va(start);
20046 - end = (unsigned long)__va(end);
20047 + for (i = 0; i < nr_range; i++)
20048 + printk(KERN_DEBUG " %010lx - %010lx page %s\n",
20049 + mr[i].start, mr[i].end,
20050 + (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
20051 + (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
20053 - for (; start < end; start = next) {
20054 - pgd_t *pgd = pgd_offset_k(start);
20055 - unsigned long pud_phys;
20057 + if (!after_bootmem)
20058 + find_early_table_space(end);
20060 - if (after_bootmem)
20061 - pud = pud_offset(pgd, start & PGDIR_MASK);
20063 - pud = alloc_static_page(&pud_phys);
20064 - next = start + PGDIR_SIZE;
20067 - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
20068 - if (!after_bootmem) {
20069 - early_make_page_readonly(pud, XENFEAT_writable_page_tables);
20070 - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
20072 + unsigned long addr, va = __START_KERNEL_map;
20073 + unsigned long *page = (unsigned long *)init_level4_pgt;
20075 + /* Kill mapping of memory below _text. */
20076 + while (va < (unsigned long)&_text) {
20077 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
20082 + /* Blow away any spurious initial mappings. */
20083 + va = __START_KERNEL_map + (table_start << PAGE_SHIFT);
20084 + addr = page[pgd_index(va)];
20085 + addr_to_page(addr, page);
20086 + addr = page[pud_index(va)];
20087 + addr_to_page(addr, page);
20088 + while (pmd_index(va) | pte_index(va)) {
20089 + if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
20091 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
20097 - if (!after_bootmem) {
20098 - BUG_ON(start_pfn != table_end);
20099 + for (i = 0; i < nr_range; i++)
20100 + last_map_addr = kernel_physical_mapping_init(
20101 + mr[i].start, mr[i].end,
20102 + mr[i].page_size_mask);
20104 + BUG_ON(table_cur > table_top);
20106 xen_finish_init_mapping();
20108 + else if (table_cur < table_top)
20109 + /* Disable the 'table_cur' allocator. */
20110 + table_top = table_cur;
20114 - if (!after_bootmem)
20115 + if (!after_bootmem && table_top > table_start)
20116 reserve_early(table_start << PAGE_SHIFT,
20117 - table_end << PAGE_SHIFT, "PGTABLE");
20118 + table_top << PAGE_SHIFT, "PGTABLE");
20120 + printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
20121 + last_map_addr, end);
20123 if (!after_bootmem)
20124 - early_memtest(start_phys, end_phys);
20125 + early_memtest(start, end);
20127 - return last_map_addr;
20128 + return last_map_addr >> PAGE_SHIFT;
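
The mr[] construction above is dense; this stand-alone model reproduces the range-splitting for one example. The merge pass is omitted and all constants are illustrative assumptions:

#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT      12
#define PMD_SHIFT       21
#define PUD_SHIFT       30
#define PMD_SIZE        (1UL << PMD_SHIFT)
#define PUD_SIZE        (1UL << PUD_SHIFT)
#define PG_LEVEL_2M     1
#define PG_LEVEL_1G     2
#define NR_RANGE_MR     5

struct map_range {
        unsigned long start, end;
        unsigned page_size_mask;
};

static int save_mr(struct map_range *mr, int nr,
                   unsigned long s_pfn, unsigned long e_pfn, unsigned mask)
{
        if (s_pfn < e_pfn) {
                mr[nr].start = s_pfn << PAGE_SHIFT;
                mr[nr].end = e_pfn << PAGE_SHIFT;
                mr[nr].page_size_mask = mask;
                nr++;
        }
        return nr;
}

int main(void)
{
        struct map_range mr[NR_RANGE_MR];
        unsigned long start = 1UL << 20;                /* 1MB */
        unsigned long end = (3UL << 30) + (1UL << 12);  /* 3GB + 4k */
        unsigned long s, e;
        int i, nr = 0;

        memset(mr, 0, sizeof(mr));
        s = start >> PAGE_SHIFT;                        /* 4k head */
        e = ((start + PMD_SIZE - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
        nr = save_mr(mr, nr, s, e, 0);
        s = e;                                          /* 2M, up to 1G boundary */
        e = ((start + PUD_SIZE - 1) >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
        if (e > (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT))
                e = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
        nr = save_mr(mr, nr, s, e, 1 << PG_LEVEL_2M);
        s = e;                                          /* 1G body */
        e = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
        nr = save_mr(mr, nr, s, e, (1 << PG_LEVEL_2M) | (1 << PG_LEVEL_1G));
        s = e;                                          /* 2M tail */
        e = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
        nr = save_mr(mr, nr, s, e, 1 << PG_LEVEL_2M);
        s = e;                                          /* 4k tail */
        nr = save_mr(mr, nr, s, end >> PAGE_SHIFT, 0);

        for (i = 0; i < nr; i++)
                printf(" %010lx - %010lx mask %x\n",
                       mr[i].start, mr[i].end, mr[i].page_size_mask);
        return 0;
}
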
20131 #ifndef CONFIG_NUMA
20132 +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
20134 + unsigned long bootmap_size, bootmap;
20136 + e820_register_active_regions(0, start_pfn, end_pfn);
20138 + if (end_pfn > xen_start_info->nr_pages)
20139 + end_pfn = xen_start_info->nr_pages;
20141 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
20142 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
20143 + PAGE_SIZE);
20144 + if (bootmap == -1L)
20145 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
20146 + /* don't touch min_low_pfn */
20147 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
20148 + 0, end_pfn);
20149 + free_bootmem_with_active_regions(0, end_pfn);
20150 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
20151 + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
20154 void __init paging_init(void)
20156 unsigned long max_zone_pfns[MAX_NR_ZONES];
20157 @@ -976,9 +984,9 @@ void __init paging_init(void)
20158 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
20159 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
20160 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
20161 - max_zone_pfns[ZONE_NORMAL] = end_pfn;
20162 + max_zone_pfns[ZONE_NORMAL] = max_pfn;
20164 - memory_present(0, 0, end_pfn);
20165 + memory_present(0, 0, max_pfn);
20167 free_area_init_nodes(max_zone_pfns);
20169 @@ -1069,8 +1077,8 @@ void __init mem_init(void)
20170 init_page_count(pfn_to_page(pfn));
20173 - reservedpages = end_pfn - totalram_pages -
20174 - absent_pages_in_range(0, end_pfn);
20175 + reservedpages = max_pfn - totalram_pages -
20176 + absent_pages_in_range(0, max_pfn);
20179 codesize = (unsigned long) &_etext - (unsigned long) &_text;
20180 @@ -1089,7 +1097,7 @@ void __init mem_init(void)
20181 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
20182 "%ldk reserved, %ldk data, %ldk init)\n",
20183 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
20184 - end_pfn << (PAGE_SHIFT-10),
20185 + max_pfn << (PAGE_SHIFT-10),
20187 reservedpages << (PAGE_SHIFT-10),
20189 @@ -1152,6 +1160,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
20190 void mark_rodata_ro(void)
20192 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
20193 + unsigned long rodata_start =
20194 + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20196 +#ifdef CONFIG_DYNAMIC_FTRACE
20197 + /* Dynamic tracing modifies the kernel text section */
20198 + start = rodata_start;
20201 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
20202 (end - start) >> 10);
20203 @@ -1161,8 +1176,7 @@ void mark_rodata_ro(void)
20204 * The rodata section (but not the kernel text!) should also be
20207 - start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20208 - set_memory_nx(start, (end - start) >> PAGE_SHIFT);
20209 + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
20213 @@ -1184,24 +1198,26 @@ void free_initrd_mem(unsigned long start
20217 -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
20218 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
20225 unsigned long pfn = phys >> PAGE_SHIFT;
20227 - if (pfn >= end_pfn) {
20228 + if (pfn >= max_pfn) {
20230 * This can happen with kdump kernels when accessing
20233 if (pfn < max_pfn_mapped)
20237 - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
20238 + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
20244 /* Should check here against the e820 map to avoid double free */
20245 @@ -1209,9 +1225,13 @@ void __init reserve_bootmem_generic(unsi
20246 nid = phys_to_nid(phys);
20247 next_nid = phys_to_nid(phys + len - 1);
20248 if (nid == next_nid)
20249 - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
20250 + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
20252 - reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20253 + ret = reserve_bootmem(phys, len, flags);
20259 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20261 @@ -1222,6 +1242,8 @@ void __init reserve_bootmem_generic(unsi
20262 set_dma_reserve(dma_reserve);
20269 int kern_addr_valid(unsigned long addr)
20270 @@ -1326,7 +1348,7 @@ vmemmap_populate(struct page *start_page
20273 for (; addr < end; addr = next) {
20274 - next = pmd_addr_end(addr, end);
20277 pgd = vmemmap_pgd_populate(addr, node);
20279 @@ -1336,33 +1358,51 @@ vmemmap_populate(struct page *start_page
20283 - pmd = pmd_offset(pud, addr);
20284 - if (pmd_none(*pmd)) {
20287 + if (!cpu_has_pse) {
20288 + next = (addr + PAGE_SIZE) & PAGE_MASK;
20289 + pmd = vmemmap_pmd_populate(pud, addr, node);
20294 + p = vmemmap_pte_populate(pmd, addr, node);
20296 - p = vmemmap_alloc_block(PMD_SIZE, node);
20300 - entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20301 - PAGE_KERNEL_LARGE);
20302 - set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20304 - /* check to see if we have contiguous blocks */
20305 - if (p_end != p || node_start != node) {
20307 - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20308 - addr_start, addr_end-1, p_start, p_end-1, node_start);
20309 - addr_start = addr;
20310 - node_start = node;
20313 - addr_end = addr + PMD_SIZE;
20314 - p_end = p + PMD_SIZE;
20315 + addr_end = addr + PAGE_SIZE;
20316 + p_end = p + PAGE_SIZE;
20318 - vmemmap_verify((pte_t *)pmd, node, addr, next);
20319 + next = pmd_addr_end(addr, end);
20321 + pmd = pmd_offset(pud, addr);
20322 + if (pmd_none(*pmd)) {
20325 + p = vmemmap_alloc_block(PMD_SIZE, node);
20329 + entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20330 + PAGE_KERNEL_LARGE);
20331 + set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20333 + /* check to see if we have contiguous blocks */
20334 + if (p_end != p || node_start != node) {
20336 + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20337 + addr_start, addr_end-1, p_start, p_end-1, node_start);
20338 + addr_start = addr;
20339 + node_start = node;
20343 + addr_end = addr + PMD_SIZE;
20344 + p_end = p + PMD_SIZE;
20346 + vmemmap_verify((pte_t *)pmd, node, addr, next);
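
For scale: the reason vmemmap_populate() prefers PMD-sized blocks when PSE is available is how much of the mem_map a single 2M allocation covers. A back-of-the-envelope sketch, where the 64-byte struct page size is an assumption:

#include <stdio.h>

#define PMD_SIZE        (2UL << 20)
#define PAGE_SIZE       4096UL
#define STRUCT_PAGE     64UL    /* assumed sizeof(struct page) */

int main(void)
{
        unsigned long pages_per_pmd = PMD_SIZE / STRUCT_PAGE;

        printf("one 2M vmemmap block maps %lu pages (%lu MB of RAM)\n",
               pages_per_pmd, pages_per_pmd * PAGE_SIZE >> 20);
        return 0;
}

Without PSE the fallback path above does the same job one PTE at a time.
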
20352 --- sle11-2009-10-16.orig/arch/x86/mm/ioremap-xen.c 2009-03-16 16:38:05.000000000 +0100
20353 +++ sle11-2009-10-16/arch/x86/mm/ioremap-xen.c 2009-06-04 10:21:39.000000000 +0200
20355 #include <linux/pfn.h>
20356 #include <linux/slab.h>
20357 #include <linux/vmalloc.h>
20358 +#include <linux/mmiotrace.h>
20360 #include <asm/cacheflush.h>
20361 #include <asm/e820.h>
20362 @@ -274,7 +275,8 @@ int ioremap_check_change_attr(unsigned l
20363 for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
20364 unsigned long pfn = mfn_to_local_pfn(mfn);
20366 - if (pfn >= max_pfn_mapped)
20367 + if (pfn >= max_low_pfn_mapped &&
20368 + (pfn < (1UL<<(32 - PAGE_SHIFT)) || pfn >= max_pfn_mapped))
20370 rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
20371 PAGE_SIZE, prot_val);
20372 @@ -297,11 +299,14 @@ static void __iomem *__ioremap_caller(re
20374 unsigned long mfn, offset, vaddr;
20375 resource_size_t last_addr;
20376 + const resource_size_t unaligned_phys_addr = phys_addr;
20377 + const unsigned long unaligned_size = size;
20378 struct vm_struct *area;
20379 unsigned long new_prot_val;
20382 domid_t domid = DOMID_IO;
20383 + void __iomem *ret_addr;
20385 /* Don't allow wraparound or zero size */
20386 last_addr = phys_addr + size - 1;
20387 @@ -318,7 +323,7 @@ static void __iomem *__ioremap_caller(re
20389 * Don't remap the low PCI/ISA area, it's always mapped..
20391 - if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
20392 + if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
20393 return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
20396 @@ -342,7 +347,7 @@ static void __iomem *__ioremap_caller(re
20397 phys_addr &= PAGE_MASK;
20398 size = PAGE_ALIGN(last_addr+1) - phys_addr;
20400 - retval = reserve_memtype(phys_addr, phys_addr + size,
20401 + retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
20402 prot_val, &new_prot_val);
20404 pr_debug("Warning: reserve_memtype returned %d\n", retval);
20405 @@ -410,7 +415,10 @@ static void __iomem *__ioremap_caller(re
20409 - return (void __iomem *) (vaddr + offset);
20410 + ret_addr = (void __iomem *) (vaddr + offset);
20411 + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
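
The unaligned_phys_addr/unaligned_size pair exists because __ioremap_caller() immediately page-aligns its arguments, while mmiotrace wants the caller's original view. A minimal sketch of that alignment dance (the sample address is illustrative):

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_MASK       (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
        unsigned long phys_addr = 0xfee003f0UL, size = 0x20;
        unsigned long last_addr = phys_addr + size - 1;
        unsigned long offset = phys_addr & ~PAGE_MASK;

        phys_addr &= PAGE_MASK;
        size = PAGE_ALIGN(last_addr + 1) - phys_addr;
        printf("map %#lx, %lu bytes; returned cookie = vaddr + %#lx\n",
               phys_addr, size, offset);
        return 0;
}
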
20417 @@ -438,7 +446,7 @@ void __iomem *ioremap_nocache(resource_s
20420 * Ideally, this should be:
20421 - * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20422 + * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20424 * Till we fix all X drivers to use ioremap_wc(), we will use
20426 @@ -462,7 +470,7 @@ EXPORT_SYMBOL(ioremap_nocache);
20428 void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
20430 - if (pat_wc_enabled)
20431 + if (pat_enabled)
20432 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
20433 __builtin_return_address(0));
20435 @@ -502,6 +510,14 @@ static void __iomem *ioremap_default(res
20439 +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
20440 + unsigned long prot_val)
20442 + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
20443 + __builtin_return_address(0));
20445 +EXPORT_SYMBOL(ioremap_prot);
20448 * iounmap - Free a IO remapping
20449 * @addr: virtual address from ioremap_*
20450 @@ -526,6 +542,8 @@ void iounmap(volatile void __iomem *addr
20451 addr = (volatile void __iomem *)
20452 (PAGE_MASK & (unsigned long __force)addr);
20454 + mmiotrace_iounmap(addr);
20456 /* Use the vm area unlocked, assuming the caller
20457 ensures there isn't another iounmap for the same address
20458 in parallel. Reuse of the virtual address is prevented by
20459 @@ -533,7 +551,7 @@ void iounmap(volatile void __iomem *addr
20460 cpa takes care of the direct mappings. */
20461 read_lock(&vmlist_lock);
20462 for (p = vmlist; p; p = p->next) {
20463 - if (p->addr == addr)
20464 + if (p->addr == (void __force *)addr)
20467 read_unlock(&vmlist_lock);
20468 @@ -547,7 +565,7 @@ void iounmap(volatile void __iomem *addr
20469 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
20471 /* Finally remove it */
20472 - o = remove_vm_area((void *)addr);
20473 + o = remove_vm_area((void __force *)addr);
20474 BUG_ON(p != o || o == NULL);
20477 @@ -567,7 +585,7 @@ void *xlate_dev_mem_ptr(unsigned long ph
20478 if (page_is_ram(start >> PAGE_SHIFT))
20481 - addr = (void *)ioremap_default(start, PAGE_SIZE);
20482 + addr = (void __force *)ioremap_default(start, PAGE_SIZE);
20484 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
20486 @@ -595,8 +613,7 @@ static int __init early_ioremap_debug_se
20487 early_param("early_ioremap_debug", early_ioremap_debug_setup);
20489 static __initdata int after_paging_init;
20490 -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
20491 - __section(.bss.page_aligned);
20492 +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
20494 #ifdef CONFIG_X86_32
20495 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
20496 @@ -695,10 +712,11 @@ static void __init __early_set_fixmap(en
20499 pte = early_ioremap_pte(addr);
20501 if (pgprot_val(flags))
20502 set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
20504 - pte_clear(NULL, addr, pte);
20505 + pte_clear(&init_mm, addr, pte);
20506 __flush_tlb_one(addr);
20509 @@ -726,13 +744,11 @@ static int __init check_early_ioremap_le
20511 if (!early_ioremap_nested)
20514 - printk(KERN_WARNING
20515 + WARN(1, KERN_WARNING
20516 "Debug warning: early ioremap leak of %d areas detected.\n",
20517 - early_ioremap_nested);
20518 + early_ioremap_nested);
20519 printk(KERN_WARNING
20520 - "please boot with early_ioremap_debug and report the dmesg.\n");
20522 + "please boot with early_ioremap_debug and report the dmesg.\n");
20526 --- sle11-2009-10-16.orig/arch/x86/mm/pageattr-xen.c 2009-03-16 16:38:05.000000000 +0100
20527 +++ sle11-2009-10-16/arch/x86/mm/pageattr-xen.c 2009-06-04 10:21:39.000000000 +0200
20528 @@ -34,6 +34,47 @@ struct cpa_data {
20529 unsigned force_split : 1;
20532 +#ifdef CONFIG_PROC_FS
20533 +static unsigned long direct_pages_count[PG_LEVEL_NUM];
20535 +void update_page_count(int level, unsigned long pages)
20537 + unsigned long flags;
20539 + /* Protect against CPA */
20540 + spin_lock_irqsave(&pgd_lock, flags);
20541 + direct_pages_count[level] += pages;
20542 + spin_unlock_irqrestore(&pgd_lock, flags);
20545 +static void split_page_count(int level)
20547 + direct_pages_count[level]--;
20548 + direct_pages_count[level - 1] += PTRS_PER_PTE;
20551 +int arch_report_meminfo(char *page)
20553 + int n = sprintf(page, "DirectMap4k: %8lu kB\n",
20554 + direct_pages_count[PG_LEVEL_4K] << 2);
20555 +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
20556 + n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
20557 + direct_pages_count[PG_LEVEL_2M] << 11);
20559 + n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
20560 + direct_pages_count[PG_LEVEL_2M] << 12);
20562 +#ifdef CONFIG_X86_64
20563 + if (direct_gbpages)
20564 + n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
20565 + direct_pages_count[PG_LEVEL_1G] << 20);
20570 +static inline void split_page_count(int level) { }
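
The shift constants in arch_report_meminfo() above are plain page-size-to-kB conversions; a trivial sketch:

#include <stdio.h>

int main(void)
{
        unsigned long n = 3;    /* sample page counts */

        printf("DirectMap4k: %lu kB\n", n << 2);        /* 4k = 2^2 kB  */
        printf("DirectMap2M: %lu kB\n", n << 11);       /* 2M = 2^11 kB */
        printf("DirectMap1G: %lu kB\n", n << 20);       /* 1G = 2^20 kB */
        return 0;
}
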
20573 #ifdef CONFIG_X86_64
20575 static inline unsigned long highmap_start_pfn(void)
20576 @@ -106,7 +147,7 @@ static void cpa_flush_all(unsigned long
20578 BUG_ON(irqs_disabled());
20580 - on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
20581 + on_each_cpu(__cpa_flush_all, (void *) cache, 1);
20584 static void __cpa_flush_range(void *arg)
20585 @@ -127,7 +168,7 @@ static void cpa_flush_range(unsigned lon
20586 BUG_ON(irqs_disabled());
20587 WARN_ON(PAGE_ALIGN(start) != start);
20589 - on_each_cpu(__cpa_flush_range, NULL, 1, 1);
20590 + on_each_cpu(__cpa_flush_range, NULL, 1);
20594 @@ -229,6 +270,7 @@ pte_t *lookup_address(unsigned long addr
20596 return pte_offset_kernel(pmd, address);
20598 +EXPORT_SYMBOL_GPL(lookup_address);
20601 * Set the new pmd in all the pgds we know about:
20602 @@ -509,6 +551,16 @@ static int split_large_page(pte_t *kpte,
20606 + if (address >= (unsigned long)__va(0) &&
20607 + address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
20608 + split_page_count(level);
20610 +#ifdef CONFIG_X86_64
20611 + if (address >= (unsigned long)__va(1UL<<32) &&
20612 + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
20613 + split_page_count(level);
20617 * Get the target mfn from the original entry:
20619 @@ -566,10 +618,9 @@ repeat:
20620 if (!__pte_val(old_pte)) {
20623 - printk(KERN_WARNING "CPA: called for zero pte. "
20624 + WARN(1, KERN_WARNING "CPA: called for zero pte. "
20625 "vaddr = %lx cpa->vaddr = %lx\n", address,
20631 @@ -634,15 +685,24 @@ static int cpa_process_alias(struct cpa_
20632 struct cpa_data alias_cpa;
20635 - if (cpa->pfn > max_pfn_mapped)
20636 + if (cpa->pfn >= max_pfn_mapped)
20639 +#ifdef CONFIG_X86_64
20640 + if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
20644 * No need to redo, when the primary call touched the direct
20647 - if (!within(cpa->vaddr, PAGE_OFFSET,
20648 - PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
20649 + if (!(within(cpa->vaddr, PAGE_OFFSET,
20650 + PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
20651 +#ifdef CONFIG_X86_64
20652 + || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
20653 + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
20658 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
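
The hunk above reflects the 2.6.27 split of the kernel direct map into two windows: pfns below max_low_pfn_mapped, plus (on 64-bit) the window from 4GiB up to max_pfn_mapped, with a possible hole in between. A self-contained sketch of the two-window test; the constants below are made up for illustration:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT 12
#define PAGE_OFFSET 0xffff880000000000ULL
static unsigned long long max_low_pfn_mapped = 0xe0000;   /* ~3.5 GiB */
static unsigned long long max_pfn_mapped = 0x200000;      /* 8 GiB */

static bool within(unsigned long long a, unsigned long long lo,
                   unsigned long long hi)
{
    return a >= lo && a < hi;
}

static bool in_direct_map(unsigned long long vaddr)
{
    return within(vaddr, PAGE_OFFSET,
                  PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
           || within(vaddr, PAGE_OFFSET + (1ULL << 32),
                     PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT));
}

int main(void)
{
    printf("%d\n", in_direct_map(PAGE_OFFSET + (1ULL << 30)));   /* 1 */
    printf("%d\n", in_direct_map(PAGE_OFFSET + (1ULL << 32)));   /* 1 */
    printf("%d\n", in_direct_map(PAGE_OFFSET + (9ULL << 30)));   /* 0 */
    return 0;
}
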
20659 @@ -796,6 +856,51 @@ static inline int change_page_attr_clear
20660 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
20664 +static void _free_memtype(u64 pstart, u64 pend)
20666 + u64 pa = pstart &= __PHYSICAL_MASK;
20667 + u64 ma = phys_to_machine(pa);
20669 + while ((pa += PAGE_SIZE) < pend) {
20670 + if (phys_to_machine(pa) != ma + (pa - pstart)) {
20671 + free_memtype(ma, ma + (pa - pstart));
20673 + ma = phys_to_machine(pa);
20676 + free_memtype(ma, ma + (pend - pstart));
20678 +#define free_memtype _free_memtype
20680 +static int _reserve_memtype(u64 pstart, u64 pend, unsigned long req_type)
20682 + u64 pcur = pstart &= __PHYSICAL_MASK, pa = pcur;
20683 + u64 ma = phys_to_machine(pa);
20686 + while ((pa += PAGE_SIZE) < pend) {
20687 + if (phys_to_machine(pa) != ma + (pa - pcur)) {
20688 + rc = reserve_memtype(ma, ma + (pa - pcur),
20693 + ma = phys_to_machine(pa);
20697 + rc = reserve_memtype(ma, ma + (pend - pcur), req_type, NULL);
20699 + if (unlikely(!rc) && pstart < pcur)
20700 + _free_memtype(pstart, pcur);
20704 +#define reserve_memtype(s, e, r, n) \
20705 + _reserve_memtype(s, e, BUILD_BUG_ON_ZERO(n) ?: (r))
20708 int _set_memory_uc(unsigned long addr, int numpages)
20711 @@ -810,7 +915,7 @@ int set_memory_uc(unsigned long addr, in
20713 * for now UC MINUS. see comments in ioremap_nocache()
20715 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20716 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20717 _PAGE_CACHE_UC_MINUS, NULL))
20720 @@ -826,10 +931,10 @@ int _set_memory_wc(unsigned long addr, i
20722 int set_memory_wc(unsigned long addr, int numpages)
20724 - if (!pat_wc_enabled)
20725 + if (!pat_enabled)
20726 return set_memory_uc(addr, numpages);
20728 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20729 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20730 _PAGE_CACHE_WC, NULL))
20733 @@ -845,7 +950,7 @@ int _set_memory_wb(unsigned long addr, i
20735 int set_memory_wb(unsigned long addr, int numpages)
20737 - free_memtype(addr, addr + numpages * PAGE_SIZE);
20738 + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
20740 return _set_memory_wb(addr, numpages);
20742 --- sle11-2009-10-16.orig/arch/x86/mm/pat-xen.c 2009-03-16 16:38:05.000000000 +0100
20743 +++ sle11-2009-10-16/arch/x86/mm/pat-xen.c 2009-06-04 10:21:39.000000000 +0200
20745 #include <linux/gfp.h>
20746 #include <linux/fs.h>
20747 #include <linux/bootmem.h>
20748 +#include <linux/debugfs.h>
20749 +#include <linux/seq_file.h>
20751 #include <asm/msr.h>
20752 #include <asm/tlbflush.h>
20753 @@ -26,11 +28,11 @@
20754 #include <asm/io.h>
20756 #ifdef CONFIG_X86_PAT
20757 -int __read_mostly pat_wc_enabled = 1;
20758 +int __read_mostly pat_enabled = 1;
20760 void __cpuinit pat_disable(char *reason)
20762 - pat_wc_enabled = 0;
20764 printk(KERN_INFO "%s\n", reason);
20767 @@ -42,6 +44,19 @@ static int __init nopat(char *str)
20768 early_param("nopat", nopat);
20772 +static int debug_enable;
20773 +static int __init pat_debug_setup(char *str)
20775 + debug_enable = 1;
20778 +__setup("debugpat", pat_debug_setup);
20780 +#define dprintk(fmt, arg...) \
20781 + do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
20784 static u64 __read_mostly boot_pat_state;
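
The debugpat/dprintk addition above replaces unconditional pr_debug() calls with output gated on a boot parameter, so PAT tracing can be turned on without rebuilding the kernel. A userspace model of the pattern, with argv parsing standing in for __setup("debugpat", ...):

#include <stdio.h>
#include <string.h>

static int debug_enable;

#define dprintk(fmt, arg...) \
    do { if (debug_enable) printf(fmt, ##arg); } while (0)

int main(int argc, char **argv)
{
    if (argc > 1 && !strcmp(argv[1], "debugpat"))
        debug_enable = 1;

    dprintk("free_memtype request 0x%llx-0x%llx\n", 0x1000ULL, 0x2000ULL);
    return 0;
}
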
20787 @@ -53,24 +68,25 @@ enum {
20788 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
20791 -#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
20792 +#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
20794 void pat_init(void)
20798 - if (!pat_wc_enabled)
20799 + if (!pat_enabled)
20802 /* Paranoia check. */
20803 - if (!cpu_has_pat) {
20804 - printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
20805 + if (!cpu_has_pat && boot_pat_state) {
20807 - * Panic if this happens on the secondary CPU, and we
20808 + * If this happens we are on a secondary CPU, but
20809 * switched to PAT on the boot CPU. We have no way to
20812 - BUG_ON(boot_pat_state);
20814 + printk(KERN_ERR "PAT enabled, "
20815 + "but not supported by secondary CPU\n");
20820 @@ -87,8 +103,8 @@ void pat_init(void)
20821 * 011 UC _PAGE_CACHE_UC
20824 - pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
20825 - PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
20826 + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
20827 + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
20829 /* Boot CPU check */
20830 if (!boot_pat_state)
20831 @@ -113,13 +129,13 @@ void pat_init(void)
20832 static char *cattr_name(unsigned long flags)
20834 switch (flags & _PAGE_CACHE_MASK) {
20835 - case _PAGE_CACHE_UC: return "uncached";
20836 - case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20837 - case _PAGE_CACHE_WB: return "write-back";
20838 - case _PAGE_CACHE_WC: return "write-combining";
20839 - case _PAGE_CACHE_WP: return "write-protected";
20840 - case _PAGE_CACHE_WT: return "write-through";
20841 - default: return "broken";
20842 + case _PAGE_CACHE_UC: return "uncached";
20843 + case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20844 + case _PAGE_CACHE_WB: return "write-back";
20845 + case _PAGE_CACHE_WC: return "write-combining";
20846 + case _PAGE_CACHE_WP: return "write-protected";
20847 + case _PAGE_CACHE_WT: return "write-through";
20848 + default: return "broken";
20852 @@ -157,49 +173,55 @@ static DEFINE_SPINLOCK(memtype_lock); /
20853 * The intersection is based on "Effective Memory Type" tables in IA-32
20856 -static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
20857 - unsigned long *ret_prot)
20858 +static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
20860 - unsigned long pat_type;
20863 - pat_type = prot & _PAGE_CACHE_MASK;
20864 - prot &= (~_PAGE_CACHE_MASK);
20867 - * We return the PAT request directly for types where PAT takes
20868 - * precedence with respect to MTRR and for UC_MINUS.
20869 - * Consistency checks with other PAT requests is done later
20870 - * while going through memtype list.
20872 - if (pat_type == _PAGE_CACHE_WC) {
20873 - *ret_prot = prot | _PAGE_CACHE_WC;
20875 - } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
20876 - *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
20878 - } else if (pat_type == _PAGE_CACHE_UC) {
20879 - *ret_prot = prot | _PAGE_CACHE_UC;
20884 * Look for MTRR hint to get the effective type in case where PAT
20885 * request is for WB.
20887 - mtrr_type = mtrr_type_lookup(start, end);
20888 + if (req_type == _PAGE_CACHE_WB) {
20891 - if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
20892 - *ret_prot = prot | _PAGE_CACHE_UC;
20893 - } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
20894 - *ret_prot = prot | _PAGE_CACHE_WC;
20896 - *ret_prot = prot | _PAGE_CACHE_WB;
20897 + mtrr_type = mtrr_type_lookup(start, end);
20898 + if (mtrr_type == MTRR_TYPE_UNCACHABLE)
20899 + return _PAGE_CACHE_UC;
20900 + if (mtrr_type == MTRR_TYPE_WRCOMB)
20901 + return _PAGE_CACHE_WC;
20907 +static int chk_conflict(struct memtype *new, struct memtype *entry,
20908 + unsigned long *type)
20910 + if (new->type != entry->type) {
20912 + new->type = entry->type;
20913 + *type = entry->type;
20918 + /* check overlaps with more than one entry in the list */
20919 + list_for_each_entry_continue(entry, &memtype_list, nd) {
20920 + if (new->end <= entry->start)
20922 + else if (new->type != entry->type)
20928 + printk(KERN_INFO "%s:%d conflicting memory types "
20929 + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
20930 + new->end, cattr_name(new->type), cattr_name(entry->type));
20934 +static struct memtype *cached_entry;
20935 +static u64 cached_start;
20938 * req_type typically has one of the:
20940 @@ -210,37 +232,36 @@ static int pat_x_mtrr_type(u64 start, u6
20941 * req_type will have a special case value '-1', when the requester wants to inherit
20942 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
20944 - * If ret_type is NULL, function will return an error if it cannot reserve the
20945 - * region with req_type. If ret_type is non-null, function will return
20946 - * available type in ret_type in case of no error. In case of any error
20947 + * If new_type is NULL, function will return an error if it cannot reserve the
20948 + * region with req_type. If new_type is non-NULL, function will return
20949 + * available type in new_type in case of no error. In case of any error
20950 * it will return a negative return value.
20952 int reserve_memtype(u64 start, u64 end, unsigned long req_type,
20953 - unsigned long *ret_type)
20954 + unsigned long *new_type)
20956 - struct memtype *new_entry = NULL;
20957 - struct memtype *parse;
20958 + struct memtype *new, *entry;
20959 unsigned long actual_type;
20960 + struct list_head *where;
20963 - /* Only track when pat_wc_enabled */
20964 - if (!pat_wc_enabled) {
20965 + BUG_ON(start >= end); /* end is exclusive */
20967 + if (!pat_enabled) {
20968 /* This is identical to page table setting without PAT */
20970 - if (req_type == -1) {
20971 - *ret_type = _PAGE_CACHE_WB;
20973 - *ret_type = req_type;
20976 + if (req_type == -1)
20977 + *new_type = _PAGE_CACHE_WB;
20979 + *new_type = req_type & _PAGE_CACHE_MASK;
20984 /* Low ISA region is always mapped WB in page table. No need to track */
20985 - if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
20987 - *ret_type = _PAGE_CACHE_WB;
20989 + if (is_ISA_range(start, end - 1)) {
20991 + *new_type = _PAGE_CACHE_WB;
20995 @@ -253,206 +274,118 @@ int reserve_memtype(u64 start, u64 end,
20997 u8 mtrr_type = mtrr_type_lookup(start, end);
20999 - if (mtrr_type == MTRR_TYPE_WRBACK) {
21000 - req_type = _PAGE_CACHE_WB;
21001 + if (mtrr_type == MTRR_TYPE_WRBACK)
21002 actual_type = _PAGE_CACHE_WB;
21004 - req_type = _PAGE_CACHE_UC_MINUS;
21006 actual_type = _PAGE_CACHE_UC_MINUS;
21009 - req_type &= _PAGE_CACHE_MASK;
21010 - err = pat_x_mtrr_type(start, end, req_type, &actual_type);
21015 - *ret_type = actual_type;
21017 + actual_type = pat_x_mtrr_type(start, end,
21018 + req_type & _PAGE_CACHE_MASK);
21023 - new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21025 + new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21029 - new_entry->start = start;
21030 - new_entry->end = end;
21031 - new_entry->type = actual_type;
21032 + new->start = start;
21034 + new->type = actual_type;
21037 - *ret_type = actual_type;
21039 + *new_type = actual_type;
21041 spin_lock(&memtype_lock);
21043 - /* Search for existing mapping that overlaps the current range */
21044 - list_for_each_entry(parse, &memtype_list, nd) {
21045 - struct memtype *saved_ptr;
21046 + if (cached_entry && start >= cached_start)
21047 + entry = cached_entry;
21049 + entry = list_entry(&memtype_list, struct memtype, nd);
21051 - if (parse->start >= end) {
21052 - pr_debug("New Entry\n");
21053 - list_add(&new_entry->nd, parse->nd.prev);
21054 - new_entry = NULL;
21055 + /* Search for existing mapping that overlaps the current range */
21057 + list_for_each_entry_continue(entry, &memtype_list, nd) {
21058 + if (end <= entry->start) {
21059 + where = entry->nd.prev;
21060 + cached_entry = list_entry(where, struct memtype, nd);
21064 - if (start <= parse->start && end >= parse->start) {
21065 - if (actual_type != parse->type && ret_type) {
21066 - actual_type = parse->type;
21067 - *ret_type = actual_type;
21068 - new_entry->type = actual_type;
21071 - if (actual_type != parse->type) {
21073 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21074 - current->comm, current->pid,
21076 - cattr_name(actual_type),
21077 - cattr_name(parse->type));
21082 - saved_ptr = parse;
21084 - * Check to see whether the request overlaps more
21085 - * than one entry in the list
21087 - list_for_each_entry_continue(parse, &memtype_list, nd) {
21088 - if (end <= parse->start) {
21092 - if (actual_type != parse->type) {
21094 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21095 - current->comm, current->pid,
21097 - cattr_name(actual_type),
21098 - cattr_name(parse->type));
21106 + } else if (start <= entry->start) { /* end > entry->start */
21107 + err = chk_conflict(new, entry, new_type);
21109 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
21110 + entry->start, entry->end);
21111 + where = entry->nd.prev;
21112 + cached_entry = list_entry(where,
21113 + struct memtype, nd);
21116 - pr_debug("Overlap at 0x%Lx-0x%Lx\n",
21117 - saved_ptr->start, saved_ptr->end);
21118 - /* No conflict. Go ahead and add this new entry */
21119 - list_add(&new_entry->nd, saved_ptr->nd.prev);
21120 - new_entry = NULL;
21124 - if (start < parse->end) {
21125 - if (actual_type != parse->type && ret_type) {
21126 - actual_type = parse->type;
21127 - *ret_type = actual_type;
21128 - new_entry->type = actual_type;
21131 - if (actual_type != parse->type) {
21133 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21134 - current->comm, current->pid,
21136 - cattr_name(actual_type),
21137 - cattr_name(parse->type));
21142 - saved_ptr = parse;
21144 - * Check to see whether the request overlaps more
21145 - * than one entry in the list
21147 - list_for_each_entry_continue(parse, &memtype_list, nd) {
21148 - if (end <= parse->start) {
21152 - if (actual_type != parse->type) {
21154 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21155 - current->comm, current->pid,
21157 - cattr_name(actual_type),
21158 - cattr_name(parse->type));
21161 + } else if (start < entry->end) { /* start > entry->start */
21162 + err = chk_conflict(new, entry, new_type);
21164 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
21165 + entry->start, entry->end);
21166 + cached_entry = list_entry(entry->nd.prev,
21167 + struct memtype, nd);
21170 + * Move to right position in the linked
21171 + * list to add this new entry
21173 + list_for_each_entry_continue(entry,
21174 + &memtype_list, nd) {
21175 + if (start <= entry->start) {
21176 + where = entry->nd.prev;
21186 - pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
21187 - saved_ptr->start, saved_ptr->end);
21188 - /* No conflict. Go ahead and add this new entry */
21189 - list_add(&new_entry->nd, &saved_ptr->nd);
21190 - new_entry = NULL;
21197 - "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
21198 - start, end, cattr_name(new_entry->type),
21199 - cattr_name(req_type));
21200 - kfree(new_entry);
21201 + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
21202 + "track %s, req %s\n",
21203 + start, end, cattr_name(new->type), cattr_name(req_type));
21205 spin_unlock(&memtype_lock);
21210 - /* No conflict. Not yet added to the list. Add to the tail */
21211 - list_add_tail(&new_entry->nd, &memtype_list);
21212 - pr_debug("New Entry\n");
21214 + cached_start = start;
21218 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21219 - start, end, cattr_name(actual_type),
21220 - cattr_name(req_type), cattr_name(*ret_type));
21223 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
21224 - start, end, cattr_name(actual_type),
21225 - cattr_name(req_type));
21228 + list_add(&new->nd, where);
21230 + list_add_tail(&new->nd, &memtype_list);
21232 spin_unlock(&memtype_lock);
21234 + dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21235 + start, end, cattr_name(new->type), cattr_name(req_type),
21236 + new_type ? cattr_name(*new_type) : "-");
21241 int free_memtype(u64 start, u64 end)
21243 - struct memtype *ml;
21244 + struct memtype *entry;
21247 - /* Only track when pat_wc_enabled */
21248 - if (!pat_wc_enabled) {
21249 + if (!pat_enabled)
21253 /* Low ISA region is always mapped WB. No need to track */
21254 - if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
21255 + if (is_ISA_range(start, end - 1))
21259 spin_lock(&memtype_lock);
21260 - list_for_each_entry(ml, &memtype_list, nd) {
21261 - if (ml->start == start && ml->end == end) {
21262 - list_del(&ml->nd);
21264 + list_for_each_entry(entry, &memtype_list, nd) {
21265 + if (entry->start == start && entry->end == end) {
21266 + if (cached_entry == entry || cached_start == start)
21267 + cached_entry = NULL;
21269 + list_del(&entry->nd);
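
The rewritten reserve_memtype() above keeps memtype_list sorted by start address, remembers cached_entry so repeated reservations need not rescan from the head, and refuses any overlap whose cache attribute differs. A minimal userspace model of the sorted-insert-with-conflict-check (plain pointers instead of list_head, and without the cached_entry optimisation):

#include <stdio.h>
#include <stdlib.h>

struct range {
    unsigned long long start, end;   /* end is exclusive */
    int type;
    struct range *next;
};

static struct range *head;

/* Returns 0 and links the range unless a differently-typed overlap exists. */
static int reserve(unsigned long long s, unsigned long long e, int type)
{
    struct range **pp, *r;

    for (r = head; r; r = r->next)
        if (s < r->end && r->start < e && r->type != type) {
            fprintf(stderr, "conflict with %#llx-%#llx\n",
                    r->start, r->end);
            return -1;
        }

    for (pp = &head; *pp && (*pp)->start < s; pp = &(*pp)->next)
        ;                            /* keep the list sorted by start */
    r = malloc(sizeof(*r));
    if (!r)
        return -1;
    r->start = s; r->end = e; r->type = type;
    r->next = *pp;
    *pp = r;
    return 0;
}

int main(void)
{
    printf("%d\n", reserve(0x1000, 0x2000, 1));   /* 0 */
    printf("%d\n", reserve(0x3000, 0x4000, 1));   /* 0 */
    printf("%d\n", reserve(0x1800, 0x2800, 2));   /* -1: type clash */
    return 0;
}
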
21274 @@ -464,27 +397,19 @@ int free_memtype(u64 start, u64 end)
21275 current->comm, current->pid, start, end);
21278 - pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21279 + dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21285 - * /dev/mem mmap interface. The memtype used for mapping varies:
21286 - * - Use UC for mappings with O_SYNC flag
21287 - * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
21288 - * inherit the memtype from existing mapping.
21289 - * - Else use UC_MINUS memtype (for backward compatibility with existing
21292 pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
21293 unsigned long size, pgprot_t vma_prot)
21298 -#ifdef CONFIG_NONPROMISC_DEVMEM
21299 -/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
21300 +#ifdef CONFIG_STRICT_DEVMEM
21301 +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
21302 static inline int range_is_allowed(unsigned long mfn, unsigned long size)
21305 @@ -508,20 +433,20 @@ static inline int range_is_allowed(unsig
21309 -#endif /* CONFIG_NONPROMISC_DEVMEM */
21310 +#endif /* CONFIG_STRICT_DEVMEM */
21312 int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
21313 unsigned long size, pgprot_t *vma_prot)
21315 u64 addr = (u64)mfn << PAGE_SHIFT;
21316 - unsigned long flags = _PAGE_CACHE_UC_MINUS;
21317 + unsigned long flags = -1;
21320 if (!range_is_allowed(mfn, size))
21323 if (file->f_flags & O_SYNC) {
21324 - flags = _PAGE_CACHE_UC;
21325 + flags = _PAGE_CACHE_UC_MINUS;
21328 #ifndef CONFIG_X86_32
21329 @@ -534,25 +459,26 @@ int phys_mem_access_prot_allowed(struct
21330 * caching for the high addresses through the KEN pin, but
21331 * we maintain the tradition of paranoia in this code.
21333 - if (!pat_wc_enabled &&
21334 - ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
21335 - test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
21336 - test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
21337 - test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
21338 - (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21339 + if (!pat_enabled &&
21340 + !(boot_cpu_has(X86_FEATURE_MTRR) ||
21341 + boot_cpu_has(X86_FEATURE_K6_MTRR) ||
21342 + boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
21343 + boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
21344 + (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21345 flags = _PAGE_CACHE_UC;
21351 - * With O_SYNC, we can only take UC mapping. Fail if we cannot.
21352 + * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
21354 * Without O_SYNC, we want to get
21355 * - WB for WB-able memory and no other conflicting mappings
21356 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
21357 * - Inherit from conflicting mappings otherwise
21359 - if (flags != _PAGE_CACHE_UC_MINUS) {
21360 + if (flags != -1) {
21361 retval = reserve_memtype(addr, addr + size, flags, NULL);
21363 retval = reserve_memtype(addr, addr + size, -1, &flags);
21364 @@ -600,3 +526,88 @@ void unmap_devmem(unsigned long mfn, uns
21365 free_memtype(addr, addr + size);
21368 +#if defined(CONFIG_DEBUG_FS)
21370 +/* get Nth element of the linked list */
21371 +static struct memtype *memtype_get_idx(loff_t pos)
21373 + struct memtype *list_node, *print_entry;
21376 + print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21377 + if (!print_entry)
21380 + spin_lock(&memtype_lock);
21381 + list_for_each_entry(list_node, &memtype_list, nd) {
21383 + *print_entry = *list_node;
21384 + spin_unlock(&memtype_lock);
21385 + return print_entry;
21389 + spin_unlock(&memtype_lock);
21390 + kfree(print_entry);
21394 +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
21398 + seq_printf(seq, "PAT memtype list:\n");
21401 + return memtype_get_idx(*pos);
21404 +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
21407 + return memtype_get_idx(*pos);
21410 +static void memtype_seq_stop(struct seq_file *seq, void *v)
21414 +static int memtype_seq_show(struct seq_file *seq, void *v)
21416 + struct memtype *print_entry = (struct memtype *)v;
21418 + seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
21419 + print_entry->start, print_entry->end);
21420 + kfree(print_entry);
21424 +static struct seq_operations memtype_seq_ops = {
21425 + .start = memtype_seq_start,
21426 + .next = memtype_seq_next,
21427 + .stop = memtype_seq_stop,
21428 + .show = memtype_seq_show,
21431 +static int memtype_seq_open(struct inode *inode, struct file *file)
21433 + return seq_open(file, &memtype_seq_ops);
21436 +static const struct file_operations memtype_fops = {
21437 + .open = memtype_seq_open,
21438 + .read = seq_read,
21439 + .llseek = seq_lseek,
21440 + .release = seq_release,
21443 +static int __init pat_memtype_list_init(void)
21445 + debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
21446 + NULL, &memtype_fops);
21450 +late_initcall(pat_memtype_list_init);
21452 +#endif /* CONFIG_DEBUG_FS */
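
The debugfs dump above follows the seq_file start/next/stop/show contract; memtype_get_idx() copies the Nth entry out under memtype_lock so show() can print and kfree() it without holding the lock. A userspace emulation of how the core drives those callbacks (seq_driver() below is a made-up stand-in for the kernel's seq_read() loop):

#include <stdio.h>

static const char *items[] = {
    "write-back @ 0x0-0x1000",
    "uncached-minus @ 0x1000-0x2000",
};
#define NITEMS ((long)(sizeof(items) / sizeof(items[0])))

static void *start(long *pos) { return *pos < NITEMS ? (void *)items[*pos] : NULL; }
static void *next(long *pos)  { ++*pos; return start(pos); }
static void  stop(void)       { }
static int   show(void *v)    { printf("%s\n", (const char *)v); return 0; }

static void seq_driver(void)
{
    long pos = 0;
    void *v = start(&pos);

    while (v) {
        show(v);
        v = next(&pos);
    }
    stop();
}

int main(void) { seq_driver(); return 0; }
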
21453 --- sle11-2009-10-16.orig/arch/x86/mm/pgtable-xen.c 2009-03-16 16:38:05.000000000 +0100
21454 +++ sle11-2009-10-16/arch/x86/mm/pgtable-xen.c 2009-06-04 10:21:39.000000000 +0200
21456 #include <asm/pgalloc.h>
21457 #include <asm/pgtable.h>
21458 #include <asm/tlb.h>
21459 +#include <asm/fixmap.h>
21460 #include <asm/hypervisor.h>
21461 #include <asm/mmu_context.h>
21463 @@ -410,15 +411,9 @@ static inline void pgd_list_del(pgd_t *p
21464 static void pgd_ctor(void *p)
21467 - unsigned long flags;
21469 pgd_test_and_unpin(pgd);
21471 - /* Clear usermode parts of PGD */
21472 - memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
21474 - spin_lock_irqsave(&pgd_lock, flags);
21476 /* If the pgd points to a shared pagetable level (either the
21477 ptes in non-PAE, or shared PMD in PAE), then just copy the
21478 references from swapper_pg_dir. */
21479 @@ -440,13 +435,9 @@ static void pgd_ctor(void *p)
21480 __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
21483 -#ifndef CONFIG_X86_PAE
21484 /* list required to sync kernel mapping updates */
21485 if (!SHARED_KERNEL_PMD)
21489 - spin_unlock_irqrestore(&pgd_lock, flags);
21492 static void pgd_dtor(void *pgd)
21493 @@ -475,33 +466,6 @@ static void pgd_dtor(void *pgd)
21495 #ifdef CONFIG_X86_PAE
21497 - * Mop up any pmd pages which may still be attached to the pgd.
21498 - * Normally they will be freed by munmap/exit_mmap, but any pmd we
21499 - * preallocate which never got a corresponding vma will need to be
21500 - * freed manually.
21502 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21506 - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
21507 - pgd_t pgd = pgdp[i];
21509 - if (__pgd_val(pgd) != 0) {
21510 - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21512 - pgdp[i] = xen_make_pgd(0);
21514 - paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21515 - pmd_free(mm, pmd);
21519 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21520 - xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21524 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
21525 * updating the top-level pagetable entries to guarantee the
21526 * processor notices the update. Since this is expensive, and
21527 @@ -512,61 +476,7 @@ static void pgd_mop_up_pmds(struct mm_st
21528 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
21529 * and initialize the kernel pmds here.
21531 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21534 - pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
21535 - unsigned long addr, flags;
21539 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
21540 - * allocation). We therefore store virtual addresses of pmds as they
21541 - * do not change across save/restore, and poke the machine addresses
21542 - * into the pgdir under the pgd_lock.
21544 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
21545 - pmds[i] = pmd_alloc_one(mm, addr);
21550 - spin_lock_irqsave(&pgd_lock, flags);
21552 - /* Protect against save/restore: move below 4GB under pgd_lock. */
21553 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21554 - && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21555 - spin_unlock_irqrestore(&pgd_lock, flags);
21558 - pmd_free(mm, pmds[i]);
21562 - /* Copy kernel pmd contents and write-protect the new pmds. */
21563 - pud = pud_offset(pgd, 0);
21564 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
21565 - i++, pud++, addr += PUD_SIZE) {
21566 - if (i >= KERNEL_PGD_BOUNDARY) {
21568 - (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21569 - sizeof(pmd_t) * PTRS_PER_PMD);
21570 - make_lowmem_page_readonly(
21571 - pmds[i], XENFEAT_writable_page_tables);
21574 - /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21575 - pud_populate(mm, pud, pmds[i]);
21578 - /* List required to sync kernel mapping updates and
21579 - * to pin/unpin on save/restore. */
21580 - pgd_list_add(pgd);
21582 - spin_unlock_irqrestore(&pgd_lock, flags);
21586 +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
21588 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
21590 @@ -596,16 +506,101 @@ void pud_populate(struct mm_struct *mm,
21593 #else /* !CONFIG_X86_PAE */
21595 /* No need to prepopulate any pagetable entries in non-PAE modes. */
21596 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21597 +#define PREALLOCATED_PMDS 0
21599 +#endif /* CONFIG_X86_PAE */
21601 +static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
21606 +#ifdef CONFIG_X86_PAE
21608 + xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
21611 + for(i = 0; i < PREALLOCATED_PMDS; i++)
21613 + pmd_free(mm, pmds[i]);
21616 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
21617 +static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
21620 + bool failed = false;
21622 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21623 + pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
21630 + free_pmds(pmds, mm, false);
21638 + * Mop up any pmd pages which may still be attached to the pgd.
21639 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
21640 + * preallocate which never got a corresponding vma will need to be
21641 + * freed manually.
21643 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21647 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21648 + pgd_t pgd = pgdp[i];
21650 + if (__pgd_val(pgd) != 0) {
21651 + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21653 + pgdp[i] = xen_make_pgd(0);
21655 + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21656 + pmd_free(mm, pmd);
21660 +#ifdef CONFIG_X86_PAE
21661 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21662 + xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21666 +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
21669 + unsigned long addr;
21672 + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
21675 + pud = pud_offset(pgd, 0);
21676 + for (addr = i = 0; i < PREALLOCATED_PMDS;
21677 + i++, pud++, addr += PUD_SIZE) {
21678 + pmd_t *pmd = pmds[i];
21680 + if (i >= KERNEL_PGD_BOUNDARY) {
21682 + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21683 + sizeof(pmd_t) * PTRS_PER_PMD);
21684 + make_lowmem_page_readonly(
21685 + pmd, XENFEAT_writable_page_tables);
21688 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21689 + pud_populate(mm, pud, pmd);
21692 -#endif /* CONFIG_X86_PAE */
21694 #ifdef CONFIG_X86_64
21695 /* We allocate two contiguous pages for kernel and user. */
21696 @@ -616,19 +611,52 @@ static void pgd_mop_up_pmds(struct mm_st
21698 pgd_t *pgd_alloc(struct mm_struct *mm)
21700 - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21702 + pmd_t *pmds[PREALLOCATED_PMDS];
21703 + unsigned long flags;
21705 + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21710 - /* so that alloc_pd can use it */
21715 - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
21716 - free_pages((unsigned long)pgd, PGD_ORDER);
21718 + if (preallocate_pmds(pmds, mm) != 0)
21719 + goto out_free_pgd;
21721 + if (paravirt_pgd_alloc(mm) != 0)
21722 + goto out_free_pmds;
21725 + * Make sure that pre-populating the pmds is atomic with
21726 + * respect to anything walking the pgd_list, so that they
21727 + * never see a partially populated pgd.
21729 + spin_lock_irqsave(&pgd_lock, flags);
21731 +#ifdef CONFIG_X86_PAE
21732 + /* Protect against save/restore: move below 4GB under pgd_lock. */
21733 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21734 + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21735 + spin_unlock_irqrestore(&pgd_lock, flags);
21736 + goto out_free_pmds;
21741 + pgd_prepopulate_pmd(mm, pgd, pmds);
21743 + spin_unlock_irqrestore(&pgd_lock, flags);
21748 + free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
21750 + free_pages((unsigned long)pgd, PGD_ORDER);
21755 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
21756 @@ -644,6 +672,7 @@ void pgd_free(struct mm_struct *mm, pgd_
21759 pgd_mop_up_pmds(mm, pgd);
21760 + paravirt_pgd_free(mm, pgd);
21761 free_pages((unsigned long)pgd, PGD_ORDER);
21764 @@ -685,7 +714,7 @@ int ptep_test_and_clear_young(struct vm_
21766 if (pte_young(*ptep))
21767 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
21769 + (unsigned long *) &ptep->pte);
21772 pte_update(vma->vm_mm, addr, ptep);
21773 @@ -707,3 +736,42 @@ int ptep_clear_flush_young(struct vm_are
21780 +void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21782 + unsigned long address = __fix_to_virt(idx);
21785 + if (idx >= __end_of_fixed_addresses) {
21791 +#ifdef CONFIG_X86_64
21792 + extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
21794 + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
21795 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21796 + set_pte_vaddr_pud(level3_user_pgt, address, pte);
21798 + case FIX_EARLYCON_MEM_BASE:
21799 + xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
21800 + pfn_pte_ma(phys >> PAGE_SHIFT, flags));
21804 + case FIX_WP_TEST:
21806 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21810 + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21813 + set_pte_vaddr(address, pte);
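
The pgd_alloc() rewrite above moves all sleeping allocations (preallocate_pmds, paravirt_pgd_alloc) in front of a single pgd_lock critical section, so walkers of pgd_list never observe a half-populated pgd and failures unwind cleanly via gotos. A userspace sketch of that allocate-then-publish-under-lock shape (resource names are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NPREALLOC 4
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *published[NPREALLOC];

static int setup(void)
{
    int *objs[NPREALLOC];
    int i;

    for (i = 0; i < NPREALLOC; i++) {
        objs[i] = malloc(sizeof(int));   /* may sleep in the kernel */
        if (!objs[i])
            goto out_free;
    }

    pthread_mutex_lock(&lock);           /* publish atomically */
    for (i = 0; i < NPREALLOC; i++)
        published[i] = objs[i];
    pthread_mutex_unlock(&lock);
    return 0;

out_free:
    while (i--)
        free(objs[i]);
    return -1;
}

int main(void)
{
    printf("setup: %d\n", setup());
    return 0;
}
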
21816 --- sle11-2009-10-16.orig/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:38:05.000000000 +0100
21817 +++ sle11-2009-10-16/arch/x86/mm/pgtable_32-xen.c 2009-06-04 10:21:39.000000000 +0200
21818 @@ -25,51 +25,49 @@
21819 #include <xen/features.h>
21820 #include <asm/hypervisor.h>
21822 -void show_mem(void)
21824 + * Associate a virtual page frame with a given physical page frame
21825 + * and protection flags for that frame.
21827 +void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
21829 - int total = 0, reserved = 0;
21830 - int shared = 0, cached = 0;
21832 - struct page *page;
21833 - pg_data_t *pgdat;
21835 - unsigned long flags;
21837 - printk(KERN_INFO "Mem-info:\n");
21838 - show_free_areas();
21839 - for_each_online_pgdat(pgdat) {
21840 - pgdat_resize_lock(pgdat, &flags);
21841 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
21842 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
21843 - touch_nmi_watchdog();
21844 - page = pgdat_page_nr(pgdat, i);
21846 - if (PageHighMem(page))
21848 - if (PageReserved(page))
21850 - else if (PageSwapCache(page))
21852 - else if (page_count(page))
21853 - shared += page_count(page) - 1;
21855 - pgdat_resize_unlock(pgdat, &flags);
21857 - printk(KERN_INFO "%d pages of RAM\n", total);
21858 - printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
21859 - printk(KERN_INFO "%d reserved pages\n", reserved);
21860 - printk(KERN_INFO "%d pages shared\n", shared);
21861 - printk(KERN_INFO "%d pages swap cached\n", cached);
21863 - printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
21864 - printk(KERN_INFO "%lu pages writeback\n",
21865 - global_page_state(NR_WRITEBACK));
21866 - printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
21867 - printk(KERN_INFO "%lu pages slab\n",
21868 - global_page_state(NR_SLAB_RECLAIMABLE) +
21869 - global_page_state(NR_SLAB_UNRECLAIMABLE));
21870 - printk(KERN_INFO "%lu pages pagetables\n",
21871 - global_page_state(NR_PAGETABLE));
21872 +#ifndef CONFIG_XEN
21878 + pgd = swapper_pg_dir + pgd_index(vaddr);
21879 + if (pgd_none(*pgd)) {
21883 + pud = pud_offset(pgd, vaddr);
21884 + if (pud_none(*pud)) {
21888 + pmd = pmd_offset(pud, vaddr);
21889 + if (pmd_none(*pmd)) {
21893 + pte = pte_offset_kernel(pmd, vaddr);
21894 + if (pte_val(pteval))
21895 + set_pte_present(&init_mm, vaddr, pte, pteval);
21897 + pte_clear(&init_mm, vaddr, pte);
21900 + * It's enough to flush this one mapping.
21901 + * (PGE mappings get flushed as well)
21903 + __flush_tlb_one(vaddr);
21905 + if (HYPERVISOR_update_va_mapping(vaddr, pteval,
21906 + UVMF_INVLPG|UVMF_ALL))
21912 @@ -107,35 +105,10 @@ void set_pmd_pfn(unsigned long vaddr, un
21913 __flush_tlb_one(vaddr);
21916 -static int fixmaps;
21917 unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
21918 unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
21919 EXPORT_SYMBOL(__FIXADDR_TOP);
21921 -void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21923 - unsigned long address = __fix_to_virt(idx);
21926 - if (idx >= __end_of_fixed_addresses) {
21931 - case FIX_WP_TEST:
21933 - pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21936 - pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21939 - if (HYPERVISOR_update_va_mapping(address, pte,
21940 - UVMF_INVLPG|UVMF_ALL))
21946 * reserve_top_address - reserves a hole in the top of kernel address space
21947 * @reserve - size of hole to reserve
21948 @@ -145,13 +118,48 @@ void __set_fixmap (enum fixed_addresses
21950 void __init reserve_top_address(unsigned long reserve)
21952 - BUG_ON(fixmaps > 0);
21953 + BUG_ON(fixmaps_set > 0);
21954 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
21956 __FIXADDR_TOP = -reserve - PAGE_SIZE;
21957 __VMALLOC_RESERVE += reserve;
21961 + * vmalloc=size forces the vmalloc area to be exactly 'size'
21962 + * bytes. This can be used to increase (or decrease) the
21963 + * vmalloc area - the default is 128m.
21965 +static int __init parse_vmalloc(char *arg)
21970 + __VMALLOC_RESERVE = memparse(arg, &arg);
21973 +early_param("vmalloc", parse_vmalloc);
21975 +#ifndef CONFIG_XEN
21977 + * reservetop=size reserves a hole at the top of the kernel address space which
21978 + * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
21979 + * so relocating the fixmap can be done before paging initialization.
21981 +static int __init parse_reservetop(char *arg)
21983 + unsigned long address;
21988 + address = memparse(arg, &arg);
21989 + reserve_top_address(address);
21992 +early_param("reservetop", parse_reservetop);
21995 void make_lowmem_page_readonly(void *va, unsigned int feature)
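
Both new early_param handlers above lean on memparse(), which reads a number in any base and scales it by an optional K/M/G suffix. A hedged reimplementation of those semantics for reference (the kernel version handles a few more cases; error handling here is simplified):

#include <stdio.h>
#include <stdlib.h>

static unsigned long long my_memparse(const char *s, char **retptr)
{
    char *end;
    unsigned long long v = strtoull(s, &end, 0);

    switch (*end) {
    case 'G': case 'g':
        v <<= 10;                    /* fall through */
    case 'M': case 'm':
        v <<= 10;                    /* fall through */
    case 'K': case 'k':
        v <<= 10;
        end++;
    }
    if (retptr)
        *retptr = end;
    return v;
}

int main(void)
{
    printf("%llu\n", my_memparse("128m", NULL));      /* 134217728 */
    printf("%llu\n", my_memparse("0x20000", NULL));   /* 131072 */
    return 0;
}
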
21998 --- sle11-2009-10-16.orig/arch/x86/pci/amd_bus.c 2009-10-28 14:55:02.000000000 +0100
21999 +++ sle11-2009-10-16/arch/x86/pci/amd_bus.c 2009-06-04 10:21:39.000000000 +0200
22000 @@ -607,6 +607,14 @@ static int __init pci_io_ecs_init(void)
22001 for_each_online_cpu(cpu)
22002 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
22003 (void *)(long)cpu);
22007 + rdmsrl(MSR_AMD64_NB_CFG, reg);
22008 + if (!(reg & ENABLE_CF8_EXT_CFG))
22012 pci_probe |= PCI_HAS_IO_ECS;
22015 @@ -614,6 +622,10 @@ static int __init pci_io_ecs_init(void)
22017 static int __init amd_postcore_init(void)
22020 + if (!is_initial_xendomain())
22023 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
22026 --- sle11-2009-10-16.orig/arch/x86/pci/irq-xen.c 2009-03-16 16:38:05.000000000 +0100
22027 +++ sle11-2009-10-16/arch/x86/pci/irq-xen.c 2009-06-04 10:21:39.000000000 +0200
22029 #include <linux/slab.h>
22030 #include <linux/interrupt.h>
22031 #include <linux/dmi.h>
22032 -#include <asm/io.h>
22033 -#include <asm/smp.h>
22034 +#include <linux/io.h>
22035 +#include <linux/smp.h>
22036 #include <asm/io_apic.h>
22037 #include <linux/irq.h>
22038 #include <linux/acpi.h>
22039 @@ -45,7 +45,8 @@ struct irq_router {
22041 u16 vendor, device;
22042 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
22043 - int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
22044 + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
22048 struct irq_router_handler {
22049 @@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_d
22050 * and perform checksum verification.
22053 -static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
22054 +static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
22056 struct irq_routing_table *rt;
22058 @@ -74,10 +75,11 @@ static inline struct irq_routing_table *
22059 rt->size < sizeof(struct irq_routing_table))
22062 - for (i=0; i < rt->size; i++)
22063 + for (i = 0; i < rt->size; i++)
22066 - DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
22067 + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
22072 @@ -104,7 +106,9 @@ static struct irq_routing_table * __init
22074 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
22076 - for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
22077 + for (addr = (u8 *) isa_bus_to_virt(0xf0000);
22078 + addr < (u8 *) isa_bus_to_virt(0x100000);
22080 rt = pirq_check_routing_table(addr);
22083 @@ -126,20 +130,20 @@ static void __init pirq_peer_trick(void)
22084 struct irq_info *e;
22086 memset(busmap, 0, sizeof(busmap));
22087 - for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
22088 + for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
22093 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
22094 - for(j=0; j<4; j++)
22095 + for (j = 0; j < 4; j++)
22096 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
22100 busmap[e->bus] = 1;
22102 - for(i = 1; i < 256; i++) {
22103 + for (i = 1; i < 256; i++) {
22105 if (!busmap[i] || pci_find_bus(0, i))
22107 @@ -187,7 +191,8 @@ static unsigned int read_config_nybble(s
22108 return (nr & 1) ? (x >> 4) : (x & 0xf);
22111 -static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
22112 +static void write_config_nybble(struct pci_dev *router, unsigned offset,
22113 + unsigned nr, unsigned int val)
22116 unsigned reg = offset + (nr >> 1);
22117 @@ -289,7 +294,7 @@ static int pirq_ite_get(struct pci_dev *
22118 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
22120 WARN_ON_ONCE(pirq > 4);
22121 - return read_config_nybble(router,0x43, pirqmap[pirq-1]);
22122 + return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
22125 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22126 @@ -318,7 +323,7 @@ static int pirq_opti_set(struct pci_dev
22129 * Cyrix: nibble offset 0x5C
22130 - * 0x5C bits 7:4 is INTB bits 3:0 is INTA
22131 + * 0x5C bits 7:4 is INTB bits 3:0 is INTA
22132 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
22134 static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
22135 @@ -354,7 +359,7 @@ static int pirq_cyrix_set(struct pci_dev
22136 * Apparently there are systems implementing PCI routing table using
22137 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
22138 * We try our best to handle both link mappings.
22141 * Currently (2003-05-21) it appears most SiS chipsets follow the
22142 * definition of routing registers from the SiS-5595 southbridge.
22143 * According to the SiS 5595 datasheets the revision id's of the
22144 @@ -374,7 +379,7 @@ static int pirq_cyrix_set(struct pci_dev
22147 * bit 6 OHCI function disabled (0), enabled (1)
22150 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
22152 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
22153 @@ -437,7 +442,7 @@ static int pirq_vlsi_get(struct pci_dev
22155 WARN_ON_ONCE(pirq >= 9);
22157 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
22158 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
22161 return read_config_nybble(router, 0x74, pirq-1);
22162 @@ -447,7 +452,7 @@ static int pirq_vlsi_set(struct pci_dev
22164 WARN_ON_ONCE(pirq >= 9);
22166 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
22167 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
22170 write_config_nybble(router, 0x74, pirq-1, irq);
22171 @@ -471,7 +476,8 @@ static int pirq_serverworks_get(struct p
22172 return inb(0xc01) & 0xf;
22175 -static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22176 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
22177 + int pirq, int irq)
22181 @@ -491,22 +497,20 @@ static int pirq_amd756_get(struct pci_de
22186 irq = read_config_nybble(router, 0x56, pirq - 1);
22188 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
22189 - dev->vendor, dev->device, pirq, irq);
22190 + dev_info(&dev->dev,
22191 + "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
22192 + dev->vendor, dev->device, pirq, irq);
22196 static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22198 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
22199 - dev->vendor, dev->device, pirq, irq);
22200 + dev_info(&dev->dev,
22201 + "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
22202 + dev->vendor, dev->device, pirq, irq);
22205 write_config_nybble(router, 0x56, pirq - 1, irq);
22210 @@ -553,50 +557,51 @@ static __init int intel_router_probe(str
22211 if (pci_dev_present(pirq_440gx))
22216 - case PCI_DEVICE_ID_INTEL_82371FB_0:
22217 - case PCI_DEVICE_ID_INTEL_82371SB_0:
22218 - case PCI_DEVICE_ID_INTEL_82371AB_0:
22219 - case PCI_DEVICE_ID_INTEL_82371MX:
22220 - case PCI_DEVICE_ID_INTEL_82443MX_0:
22221 - case PCI_DEVICE_ID_INTEL_82801AA_0:
22222 - case PCI_DEVICE_ID_INTEL_82801AB_0:
22223 - case PCI_DEVICE_ID_INTEL_82801BA_0:
22224 - case PCI_DEVICE_ID_INTEL_82801BA_10:
22225 - case PCI_DEVICE_ID_INTEL_82801CA_0:
22226 - case PCI_DEVICE_ID_INTEL_82801CA_12:
22227 - case PCI_DEVICE_ID_INTEL_82801DB_0:
22228 - case PCI_DEVICE_ID_INTEL_82801E_0:
22229 - case PCI_DEVICE_ID_INTEL_82801EB_0:
22230 - case PCI_DEVICE_ID_INTEL_ESB_1:
22231 - case PCI_DEVICE_ID_INTEL_ICH6_0:
22232 - case PCI_DEVICE_ID_INTEL_ICH6_1:
22233 - case PCI_DEVICE_ID_INTEL_ICH7_0:
22234 - case PCI_DEVICE_ID_INTEL_ICH7_1:
22235 - case PCI_DEVICE_ID_INTEL_ICH7_30:
22236 - case PCI_DEVICE_ID_INTEL_ICH7_31:
22237 - case PCI_DEVICE_ID_INTEL_ESB2_0:
22238 - case PCI_DEVICE_ID_INTEL_ICH8_0:
22239 - case PCI_DEVICE_ID_INTEL_ICH8_1:
22240 - case PCI_DEVICE_ID_INTEL_ICH8_2:
22241 - case PCI_DEVICE_ID_INTEL_ICH8_3:
22242 - case PCI_DEVICE_ID_INTEL_ICH8_4:
22243 - case PCI_DEVICE_ID_INTEL_ICH9_0:
22244 - case PCI_DEVICE_ID_INTEL_ICH9_1:
22245 - case PCI_DEVICE_ID_INTEL_ICH9_2:
22246 - case PCI_DEVICE_ID_INTEL_ICH9_3:
22247 - case PCI_DEVICE_ID_INTEL_ICH9_4:
22248 - case PCI_DEVICE_ID_INTEL_ICH9_5:
22249 - case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22250 - case PCI_DEVICE_ID_INTEL_ICH10_0:
22251 - case PCI_DEVICE_ID_INTEL_ICH10_1:
22252 - case PCI_DEVICE_ID_INTEL_ICH10_2:
22253 - case PCI_DEVICE_ID_INTEL_ICH10_3:
22254 - r->name = "PIIX/ICH";
22255 - r->get = pirq_piix_get;
22256 - r->set = pirq_piix_set;
22258 + switch (device) {
22259 + case PCI_DEVICE_ID_INTEL_82371FB_0:
22260 + case PCI_DEVICE_ID_INTEL_82371SB_0:
22261 + case PCI_DEVICE_ID_INTEL_82371AB_0:
22262 + case PCI_DEVICE_ID_INTEL_82371MX:
22263 + case PCI_DEVICE_ID_INTEL_82443MX_0:
22264 + case PCI_DEVICE_ID_INTEL_82801AA_0:
22265 + case PCI_DEVICE_ID_INTEL_82801AB_0:
22266 + case PCI_DEVICE_ID_INTEL_82801BA_0:
22267 + case PCI_DEVICE_ID_INTEL_82801BA_10:
22268 + case PCI_DEVICE_ID_INTEL_82801CA_0:
22269 + case PCI_DEVICE_ID_INTEL_82801CA_12:
22270 + case PCI_DEVICE_ID_INTEL_82801DB_0:
22271 + case PCI_DEVICE_ID_INTEL_82801E_0:
22272 + case PCI_DEVICE_ID_INTEL_82801EB_0:
22273 + case PCI_DEVICE_ID_INTEL_ESB_1:
22274 + case PCI_DEVICE_ID_INTEL_ICH6_0:
22275 + case PCI_DEVICE_ID_INTEL_ICH6_1:
22276 + case PCI_DEVICE_ID_INTEL_ICH7_0:
22277 + case PCI_DEVICE_ID_INTEL_ICH7_1:
22278 + case PCI_DEVICE_ID_INTEL_ICH7_30:
22279 + case PCI_DEVICE_ID_INTEL_ICH7_31:
22280 + case PCI_DEVICE_ID_INTEL_ESB2_0:
22281 + case PCI_DEVICE_ID_INTEL_ICH8_0:
22282 + case PCI_DEVICE_ID_INTEL_ICH8_1:
22283 + case PCI_DEVICE_ID_INTEL_ICH8_2:
22284 + case PCI_DEVICE_ID_INTEL_ICH8_3:
22285 + case PCI_DEVICE_ID_INTEL_ICH8_4:
22286 + case PCI_DEVICE_ID_INTEL_ICH9_0:
22287 + case PCI_DEVICE_ID_INTEL_ICH9_1:
22288 + case PCI_DEVICE_ID_INTEL_ICH9_2:
22289 + case PCI_DEVICE_ID_INTEL_ICH9_3:
22290 + case PCI_DEVICE_ID_INTEL_ICH9_4:
22291 + case PCI_DEVICE_ID_INTEL_ICH9_5:
22292 + case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22293 + case PCI_DEVICE_ID_INTEL_ICH10_0:
22294 + case PCI_DEVICE_ID_INTEL_ICH10_1:
22295 + case PCI_DEVICE_ID_INTEL_ICH10_2:
22296 + case PCI_DEVICE_ID_INTEL_ICH10_3:
22297 + case PCI_DEVICE_ID_INTEL_PCH_0:
22298 + case PCI_DEVICE_ID_INTEL_PCH_1:
22299 + r->name = "PIIX/ICH";
22300 + r->get = pirq_piix_get;
22301 + r->set = pirq_piix_set;
22306 @@ -610,7 +615,7 @@ static __init int via_router_probe(struc
22307 * workarounds for some buggy BIOSes
22309 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
22310 - switch(router->device) {
22311 + switch (router->device) {
22312 case PCI_DEVICE_ID_VIA_82C686:
22314 * Asus k7m bios wrongly reports 82C686A
22315 @@ -635,7 +640,7 @@ static __init int via_router_probe(struc
22320 + switch (device) {
22321 case PCI_DEVICE_ID_VIA_82C586_0:
22323 r->get = pirq_via586_get;
22324 @@ -658,28 +663,27 @@ static __init int via_router_probe(struc
22326 static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22330 - case PCI_DEVICE_ID_VLSI_82C534:
22331 - r->name = "VLSI 82C534";
22332 - r->get = pirq_vlsi_get;
22333 - r->set = pirq_vlsi_set;
22335 + switch (device) {
22336 + case PCI_DEVICE_ID_VLSI_82C534:
22337 + r->name = "VLSI 82C534";
22338 + r->get = pirq_vlsi_get;
22339 + r->set = pirq_vlsi_set;
22346 -static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22347 +static __init int serverworks_router_probe(struct irq_router *r,
22348 + struct pci_dev *router, u16 device)
22352 - case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22353 - case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22354 - r->name = "ServerWorks";
22355 - r->get = pirq_serverworks_get;
22356 - r->set = pirq_serverworks_set;
22358 + switch (device) {
22359 + case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22360 + case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22361 + r->name = "ServerWorks";
22362 + r->get = pirq_serverworks_get;
22363 + r->set = pirq_serverworks_set;
22368 @@ -688,7 +692,7 @@ static __init int sis_router_probe(struc
22370 if (device != PCI_DEVICE_ID_SI_503)
22375 r->get = pirq_sis_get;
22376 r->set = pirq_sis_set;
22377 @@ -697,50 +701,45 @@ static __init int sis_router_probe(struc
22379 static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22383 - case PCI_DEVICE_ID_CYRIX_5520:
22384 - r->name = "NatSemi";
22385 - r->get = pirq_cyrix_get;
22386 - r->set = pirq_cyrix_set;
22388 + switch (device) {
22389 + case PCI_DEVICE_ID_CYRIX_5520:
22390 + r->name = "NatSemi";
22391 + r->get = pirq_cyrix_get;
22392 + r->set = pirq_cyrix_set;
22398 static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22402 - case PCI_DEVICE_ID_OPTI_82C700:
22403 - r->name = "OPTI";
22404 - r->get = pirq_opti_get;
22405 - r->set = pirq_opti_set;
22407 + switch (device) {
22408 + case PCI_DEVICE_ID_OPTI_82C700:
22409 + r->name = "OPTI";
22410 + r->get = pirq_opti_get;
22411 + r->set = pirq_opti_set;
22417 static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22421 - case PCI_DEVICE_ID_ITE_IT8330G_0:
22423 - r->get = pirq_ite_get;
22424 - r->set = pirq_ite_set;
22426 + switch (device) {
22427 + case PCI_DEVICE_ID_ITE_IT8330G_0:
22429 + r->get = pirq_ite_get;
22430 + r->set = pirq_ite_set;
22436 static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22440 + switch (device) {
22441 case PCI_DEVICE_ID_AL_M1533:
22442 case PCI_DEVICE_ID_AL_M1563:
22443 - printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
22445 r->get = pirq_ali_get;
22446 r->set = pirq_ali_set;
22447 @@ -751,25 +750,24 @@ static __init int ali_router_probe(struc
22449 static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22453 - case PCI_DEVICE_ID_AMD_VIPER_740B:
22454 - r->name = "AMD756";
22456 - case PCI_DEVICE_ID_AMD_VIPER_7413:
22457 - r->name = "AMD766";
22459 - case PCI_DEVICE_ID_AMD_VIPER_7443:
22460 - r->name = "AMD768";
22464 + switch (device) {
22465 + case PCI_DEVICE_ID_AMD_VIPER_740B:
22466 + r->name = "AMD756";
22468 + case PCI_DEVICE_ID_AMD_VIPER_7413:
22469 + r->name = "AMD766";
22471 + case PCI_DEVICE_ID_AMD_VIPER_7443:
22472 + r->name = "AMD768";
22477 r->get = pirq_amd756_get;
22478 r->set = pirq_amd756_set;
22483 static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22486 @@ -811,7 +809,7 @@ static struct pci_dev *pirq_router_dev;
22487 * FIXME: should we have an option to say "generic for
22492 static void __init pirq_find_router(struct irq_router *r)
22494 struct irq_routing_table *rt = pirq_table;
22495 @@ -830,7 +828,7 @@ static void __init pirq_find_router(stru
22496 r->name = "default";
22501 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
22502 rt->rtr_vendor, rt->rtr_device);
22504 @@ -841,19 +839,19 @@ static void __init pirq_find_router(stru
22508 - for( h = pirq_routers; h->vendor; h++) {
22509 + for (h = pirq_routers; h->vendor; h++) {
22510 /* First look for a router match */
22511 - if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
22512 + if (rt->rtr_vendor == h->vendor &&
22513 + h->probe(r, pirq_router_dev, rt->rtr_device))
22515 /* Fall back to a device match */
22516 - if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
22517 + if (pirq_router_dev->vendor == h->vendor &&
22518 + h->probe(r, pirq_router_dev, pirq_router_dev->device))
22521 - printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
22522 - pirq_router.name,
22523 - pirq_router_dev->vendor,
22524 - pirq_router_dev->device,
22525 - pci_name(pirq_router_dev));
22526 + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
22527 + pirq_router.name,
22528 + pirq_router_dev->vendor, pirq_router_dev->device);
22530 /* The device remains referenced for the kernel lifetime */
22532 @@ -861,11 +859,13 @@ static void __init pirq_find_router(stru
22533 static struct irq_info *pirq_get_info(struct pci_dev *dev)
22535 struct irq_routing_table *rt = pirq_table;
22536 - int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
22537 + int entries = (rt->size - sizeof(struct irq_routing_table)) /
22538 + sizeof(struct irq_info);
22539 struct irq_info *info;
22541 for (info = rt->slots; entries--; info++)
22542 - if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22543 + if (info->bus == dev->bus->number &&
22544 + PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22548 @@ -884,7 +884,7 @@ static int pcibios_lookup_irq(struct pci
22550 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
22552 - DBG(KERN_DEBUG " -> no interrupt pin\n");
22553 + dev_dbg(&dev->dev, "no interrupt pin\n");
22557 @@ -893,20 +893,21 @@ static int pcibios_lookup_irq(struct pci
22562 - DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
22564 info = pirq_get_info(dev);
22566 - DBG(" -> not found in routing table\n" KERN_DEBUG);
22567 + dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
22571 pirq = info->irq[pin].link;
22572 mask = info->irq[pin].bitmap;
22574 - DBG(" -> not routed\n" KERN_DEBUG);
22575 + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
22578 - DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
22579 + dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
22580 + 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
22581 mask &= pcibios_irq_mask;
22583 /* Work around broken HP Pavilion Notebooks which assign USB to
22584 @@ -919,7 +920,8 @@ static int pcibios_lookup_irq(struct pci
22587 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
22588 - if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
22589 + if (acer_tm360_irqrouting && dev->irq == 11 &&
22590 + dev->vendor == PCI_VENDOR_ID_O2) {
22593 dev->irq = r->get(pirq_router_dev, dev, pirq);
22594 @@ -932,51 +934,50 @@ static int pcibios_lookup_irq(struct pci
22597 if (newirq && !((1 << newirq) & mask)) {
22598 - if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
22599 - else printk("\n" KERN_WARNING
22600 - "PCI: IRQ %i for device %s doesn't match PIRQ mask "
22601 - "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
22603 + if (pci_probe & PCI_USE_PIRQ_MASK)
22606 + dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
22607 + "%#x; try pci=usepirqmask\n", newirq, mask);
22609 if (!newirq && assign) {
22610 for (i = 0; i < 16; i++) {
22611 if (!(mask & (1 << i)))
22613 - if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
22614 + if (pirq_penalty[i] < pirq_penalty[newirq] &&
22615 + can_request_irq(i, IRQF_SHARED))
22619 - DBG(" -> newirq=%d", newirq);
22620 + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
22622 /* Check if it is hardcoded */
22623 if ((pirq & 0xf0) == 0xf0) {
22625 - DBG(" -> hardcoded IRQ %d\n", irq);
22626 - msg = "Hardcoded";
22627 - } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22628 - ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
22629 - DBG(" -> got IRQ %d\n", irq);
22631 + msg = "hardcoded";
22632 + } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22633 + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
22635 eisa_set_level_irq(irq);
22636 - } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22637 - DBG(" -> assigning IRQ %d", newirq);
22638 + } else if (newirq && r->set &&
22639 + (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22640 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
22641 eisa_set_level_irq(newirq);
22642 - DBG(" ... OK\n");
22643 - msg = "Assigned";
22644 + msg = "assigned";
22650 - DBG(" ... failed\n");
22651 if (newirq && mask == (1 << newirq)) {
22657 + dev_dbg(&dev->dev, "can't route interrupt\n");
22661 - printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
22662 + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
22664 /* Update IRQ for all devices with the same pirq value */
22665 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
22666 @@ -988,20 +989,25 @@ static int pcibios_lookup_irq(struct pci
22669 if (info->irq[pin].link == pirq) {
22670 - /* We refuse to override the dev->irq information. Give a warning! */
22671 - if ( dev2->irq && dev2->irq != irq && \
22673 + * We refuse to override the dev->irq
22674 + * information. Give a warning!
22676 + if (dev2->irq && dev2->irq != irq && \
22677 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
22678 - ((1 << dev2->irq) & mask)) ) {
22679 + ((1 << dev2->irq) & mask))) {
22680 #ifndef CONFIG_PCI_MSI
22681 - printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
22682 - pci_name(dev2), dev2->irq, irq);
22683 + dev_info(&dev2->dev, "IRQ routing conflict: "
22684 + "have IRQ %d, want IRQ %d\n",
22692 pirq_penalty[irq]++;
22694 - printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
22695 + dev_info(&dev->dev, "sharing IRQ %d with %s\n",
22696 + irq, pci_name(dev2));
22700 @@ -1015,15 +1021,20 @@ static void __init pcibios_fixup_irqs(vo
22701 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
22702 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
22704 - * If the BIOS has set an out of range IRQ number, just ignore it.
22705 - * Also keep track of which IRQ's are already in use.
22706 + * If the BIOS has set an out of range IRQ number, just
22707 + * ignore it. Also keep track of which IRQ's are
22708 + * already in use.
22710 if (dev->irq >= 16) {
22711 - DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
22712 + dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
22715 - /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
22716 - if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
22718 + * If the IRQ is already assigned to a PCI device,
22719 + * ignore its ISA use penalty
22721 + if (pirq_penalty[dev->irq] >= 100 &&
22722 + pirq_penalty[dev->irq] < 100000)
22723 pirq_penalty[dev->irq] = 0;
22724 pirq_penalty[dev->irq]++;
22726 @@ -1035,13 +1046,17 @@ static void __init pcibios_fixup_irqs(vo
22728 * Recalculate IRQ numbers if we use the I/O APIC.
22730 - if (io_apic_assign_pci_irqs)
22732 + if (io_apic_assign_pci_irqs) {
22736 - pin--; /* interrupt pins are numbered starting from 1 */
22737 - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
22739 + * interrupt pins are numbered starting
22743 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
22744 + PCI_SLOT(dev->devfn), pin);
22746 * Busses behind bridges are typically not listed in the MP-table.
22747 * In this case we have to look up the IRQ based on the parent bus,
22748 @@ -1049,18 +1064,18 @@ static void __init pcibios_fixup_irqs(vo
22749 * busses itself so we should get into this branch reliably.
22751 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22752 - struct pci_dev * bridge = dev->bus->self;
22753 + struct pci_dev *bridge = dev->bus->self;
22755 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22756 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22757 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22758 PCI_SLOT(bridge->devfn), pin);
22760 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22761 - pci_name(bridge), 'A' + pin, irq);
22762 + dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
22763 + pci_name(bridge),
22767 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22768 - pci_name(dev), 'A' + pin, irq);
22769 + dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
22773 @@ -1082,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq
22775 if (!broken_hp_bios_irq9) {
22776 broken_hp_bios_irq9 = 1;
22777 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22778 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22783 @@ -1095,7 +1111,8 @@ static int __init fix_acer_tm360_irqrout
22785 if (!acer_tm360_irqrouting) {
22786 acer_tm360_irqrouting = 1;
22787 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22788 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22793 @@ -1107,7 +1124,8 @@ static struct dmi_system_id __initdata p
22795 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
22796 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
22797 - DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
22798 + DMI_MATCH(DMI_PRODUCT_VERSION,
22799 + "HP Pavilion Notebook Model GE"),
22800 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
22803 @@ -1122,7 +1140,7 @@ static struct dmi_system_id __initdata p
22807 -static int __init pcibios_irq_init(void)
22808 +int __init pcibios_irq_init(void)
22810 DBG(KERN_DEBUG "PCI: IRQ init\n");
22812 @@ -1142,11 +1160,14 @@ static int __init pcibios_irq_init(void)
22813 pirq_find_router(&pirq_router);
22814 if (pirq_table->exclusive_irqs) {
22816 - for (i=0; i<16; i++)
22817 + for (i = 0; i < 16; i++)
22818 if (!(pirq_table->exclusive_irqs & (1 << i)))
22819 pirq_penalty[i] += 100;
22821 - /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
22823 + * If we're using the I/O APIC, avoid using the PCI IRQ
22826 if (io_apic_assign_pci_irqs)
22829 @@ -1157,9 +1178,6 @@ static int __init pcibios_irq_init(void)
22833 -subsys_initcall(pcibios_irq_init);
22836 static void pirq_penalize_isa_irq(int irq, int active)
22839 @@ -1193,7 +1211,7 @@ static int pirq_enable_irq(struct pci_de
22840 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
22843 - pin--; /* interrupt pins are numbered starting from 1 */
22844 + pin--; /* interrupt pins are numbered starting from 1 */
22846 if (io_apic_assign_pci_irqs) {
22848 @@ -1207,35 +1225,41 @@ static int pirq_enable_irq(struct pci_de
22851 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22852 - struct pci_dev * bridge = dev->bus->self;
22853 + struct pci_dev *bridge = dev->bus->self;
22855 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22856 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22857 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22858 PCI_SLOT(bridge->devfn), pin);
22860 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22861 - pci_name(bridge), 'A' + pin, irq);
22862 + dev_warn(&dev->dev, "using bridge %s "
22863 + "INT %c to get IRQ %d\n",
22864 + pci_name(bridge), 'A' + pin,
22870 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22871 - pci_name(dev), 'A' + pin, irq);
22872 + dev_info(&dev->dev, "PCI->APIC IRQ transform: "
22873 + "INT %c -> IRQ %d\n", 'A' + pin, irq);
22877 - msg = " Probably buggy MP table.";
22878 + msg = "; probably buggy MP table";
22879 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
22882 - msg = " Please try using pci=biosirq.";
22883 + msg = "; please try using pci=biosirq";
22885 - /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
22886 - if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
22888 + * With IDE legacy devices the IRQ lookup failure is not
22891 + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
22892 + !(dev->class & 0x5))
22895 - printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
22896 - 'A' + pin, pci_name(dev), msg);
22897 + dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
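
The bulk of the irq.c churn above is a mechanical switch from printk() plus pci_name() to the dev_*() helpers, which prefix the subsystem and device name automatically. A minimal sketch of the pattern (illustrative only, not part of the patch):

    #include <linux/pci.h>
    #include <linux/device.h>

    /* Sketch: the same message before and after the conversion. */
    static void report_irq(struct pci_dev *dev, const char *msg, u8 pin, int irq)
    {
            /* old: printk(KERN_INFO "PCI: %s IRQ %d for device %s\n",
             *              msg, irq, pci_name(dev)); */
            dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
    }
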
22902 --- sle11-2009-10-16.orig/arch/x86/vdso/Makefile 2009-03-16 16:33:40.000000000 +0100
22903 +++ sle11-2009-10-16/arch/x86/vdso/Makefile 2009-06-04 10:21:39.000000000 +0200
22904 @@ -65,9 +65,7 @@ obj-$(VDSO32-y) += vdso32-syms.lds
22905 vdso32.so-$(VDSO32-y) += int80
22906 vdso32.so-$(CONFIG_COMPAT) += syscall
22907 vdso32.so-$(VDSO32-y) += sysenter
22908 -xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
22909 -xen-vdso32-$(CONFIG_X86_32) += syscall
22910 -vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
22911 +vdso32.so-$(CONFIG_X86_XEN) += syscall
22913 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
22915 --- sle11-2009-10-16.orig/arch/x86/vdso/vdso32.S 2009-03-16 16:33:40.000000000 +0100
22916 +++ sle11-2009-10-16/arch/x86/vdso/vdso32.S 2009-06-04 10:21:39.000000000 +0200
22917 @@ -9,7 +9,7 @@ vdso32_int80_end:
22919 .globl vdso32_syscall_start, vdso32_syscall_end
22920 vdso32_syscall_start:
22921 -#ifdef CONFIG_COMPAT
22922 +#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
22923 .incbin "arch/x86/vdso/vdso32-syscall.so"
22925 vdso32_syscall_end:
22926 @@ -19,16 +19,4 @@ vdso32_sysenter_start:
22927 .incbin "arch/x86/vdso/vdso32-sysenter.so"
22928 vdso32_sysenter_end:
22930 -#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
22931 - .globl vdso32_int80_start, vdso32_int80_end
22932 -vdso32_int80_start:
22933 - .incbin "arch/x86/vdso/vdso32-int80.so"
22935 -#elif defined(CONFIG_X86_XEN)
22936 - .globl vdso32_syscall_start, vdso32_syscall_end
22937 -vdso32_syscall_start:
22938 - .incbin "arch/x86/vdso/vdso32-syscall.so"
22939 -vdso32_syscall_end:
22943 --- sle11-2009-10-16.orig/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:38:05.000000000 +0100
22944 +++ sle11-2009-10-16/arch/x86/vdso/vdso32-setup-xen.c 2009-06-04 10:21:39.000000000 +0200
22945 @@ -195,50 +195,28 @@ static __init void relocate_vdso(Elf32_E
22950 - * These symbols are defined by vdso32.S to mark the bounds
22951 - * of the ELF DSO images included therein.
22953 -extern const char vdso32_default_start, vdso32_default_end;
22954 -extern const char vdso32_sysenter_start, vdso32_sysenter_end;
22955 static struct page *vdso32_pages[1];
22957 #ifdef CONFIG_X86_64
22959 -#if CONFIG_XEN_COMPAT < 0x030200
22960 -static int use_int80 = 1;
22962 -static int use_sysenter __read_mostly = -1;
22964 -#define vdso32_sysenter() (use_sysenter > 0)
22965 +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
22966 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
22968 -/* May not be __init: called during resume */
22969 -void syscall32_cpu_init(void)
22970 +void __cpuinit syscall32_cpu_init(void)
22972 - static const struct callback_register cstar = {
22973 + static const struct callback_register __cpuinitconst cstar = {
22974 .type = CALLBACKTYPE_syscall32,
22975 .address = (unsigned long)ia32_cstar_target
22977 - static const struct callback_register sysenter = {
22978 + static const struct callback_register __cpuinitconst sysenter = {
22979 .type = CALLBACKTYPE_sysenter,
22980 .address = (unsigned long)ia32_sysenter_target
22983 - if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
22984 - (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
22985 -#if CONFIG_XEN_COMPAT < 0x030200
22992 - if (use_sysenter < 0) {
22993 - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
22994 - use_sysenter = 1;
22995 - if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
22996 - use_sysenter = 1;
22998 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
22999 + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
23000 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)
23001 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
23004 #define compat_uses_vma 1
23005 @@ -250,6 +228,7 @@ static inline void map_compat_vdso(int m
23006 #else /* CONFIG_X86_32 */
23008 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
23009 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
23011 extern asmlinkage void ia32pv_cstar_target(void);
23012 static const struct callback_register __cpuinitconst cstar = {
23013 @@ -265,13 +244,13 @@ void __cpuinit enable_sep_cpu(void)
23014 .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
23017 - if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
23018 + if (vdso32_syscall()) {
23019 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
23024 - if (!boot_cpu_has(X86_FEATURE_SEP))
23025 + if (!vdso32_sysenter())
23028 if (xen_feature(XENFEAT_supervisor_mode_kernel))
23029 @@ -341,34 +320,26 @@ int __init sysenter_setup(void)
23031 #ifdef CONFIG_X86_32
23035 -#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
23037 - extern const char vdso32_int80_start, vdso32_int80_end;
23039 - vsyscall = &vdso32_int80_start;
23040 - vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
23042 -#elif defined(CONFIG_X86_32)
23043 - if (boot_cpu_has(X86_FEATURE_SYSCALL)
23044 - && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
23045 - || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
23046 - setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
23047 - barrier(); /* until clear_bit()'s constraints are correct ... */
23048 if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
23049 - extern const char vdso32_syscall_start, vdso32_syscall_end;
23051 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
23052 + && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
23053 + setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
23055 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
23056 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
23060 + if (vdso32_syscall()) {
23061 vsyscall = &vdso32_syscall_start;
23062 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
23065 - if (!vdso32_sysenter()) {
23066 - vsyscall = &vdso32_default_start;
23067 - vsyscall_len = &vdso32_default_end - &vdso32_default_start;
23069 + } else if (vdso32_sysenter()) {
23070 vsyscall = &vdso32_sysenter_start;
23071 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
23073 + vsyscall = &vdso32_int80_start;
23074 + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
23077 memcpy(syscall_page, vsyscall, vsyscall_len);
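
The rewritten syscall32_cpu_init()/sysenter_setup() pair above stops tracking use_int80/use_sysenter by hand and instead clears or forces the synthetic X86_FEATURE_SYSENTER32/X86_FEATURE_SYSCALL32 bits as the hypervisor callback registrations fail or succeed. The image choice then reduces to the following sketch (symbols from vdso32.S and the vdso32_*() macros above; not part of the patch):

    extern const char vdso32_int80_start, vdso32_syscall_start,
                      vdso32_sysenter_start;

    /* Sketch: pick the vDSO image matching the fastest usable entry path. */
    static const void *pick_vdso32_image(void)
    {
            if (vdso32_syscall())           /* syscall/cstar callback in place */
                    return &vdso32_syscall_start;
            if (vdso32_sysenter())          /* sysenter callback in place */
                    return &vdso32_sysenter_start;
            return &vdso32_int80_start;     /* int $0x80 always works */
    }
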
23078 --- sle11-2009-10-16.orig/arch/x86/xen/Kconfig 2009-02-16 16:17:21.000000000 +0100
23079 +++ sle11-2009-10-16/arch/x86/xen/Kconfig 2009-06-04 10:21:39.000000000 +0200
23080 @@ -17,7 +17,7 @@ config XEN_MAX_DOMAIN_MEMORY
23081 int "Maximum allowed size of a domain in gigabytes"
23082 default 8 if X86_32
23083 default 32 if X86_64
23085 + depends on PARAVIRT_XEN
23087 The pseudo-physical to machine address array is sized
23088 according to the maximum possible memory size of a Xen
23089 @@ -26,5 +26,5 @@ config XEN_MAX_DOMAIN_MEMORY
23091 config XEN_SAVE_RESTORE
23094 + depends on PARAVIRT_XEN && PM
23096 \ No newline at end of file
23097 --- sle11-2009-10-16.orig/drivers/acpi/processor_core.c 2009-08-26 11:54:44.000000000 +0200
23098 +++ sle11-2009-10-16/drivers/acpi/processor_core.c 2009-08-26 12:04:00.000000000 +0200
23099 @@ -730,9 +730,11 @@ static int __cpuinit acpi_processor_star
23103 - sysdev = get_cpu_sysdev(pr->id);
23104 - if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
23106 + if (pr->id != -1) {
23107 + sysdev = get_cpu_sysdev(pr->id);
23108 + if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
23112 status = acpi_install_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
23113 acpi_processor_notify, pr);
23114 @@ -904,7 +906,8 @@ static int acpi_processor_remove(struct
23115 status = acpi_remove_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
23116 acpi_processor_notify);
23118 - sysfs_remove_link(&device->dev.kobj, "sysdev");
23119 + if (pr->id != -1)
23120 + sysfs_remove_link(&device->dev.kobj, "sysdev");
23122 acpi_processor_remove_fs(device);
23124 --- sle11-2009-10-16.orig/drivers/char/tpm/tpm_vtpm.c 2009-08-26 11:52:33.000000000 +0200
23125 +++ sle11-2009-10-16/drivers/char/tpm/tpm_vtpm.c 2009-06-04 10:21:39.000000000 +0200
23126 @@ -347,7 +347,7 @@ static int _vtpm_send_queued(struct tpm_
23131 + unsigned long flags;
23132 unsigned char buffer[1];
23133 struct vtpm_state *vtpms;
23134 vtpms = (struct vtpm_state *)chip_get_private(chip);
23135 --- sle11-2009-10-16.orig/drivers/misc/Kconfig 2009-10-28 14:55:02.000000000 +0100
23136 +++ sle11-2009-10-16/drivers/misc/Kconfig 2009-08-26 12:04:11.000000000 +0200
23137 @@ -440,7 +440,7 @@ config ENCLOSURE_SERVICES
23139 tristate "Support communication between SGI SSIs"
23141 - depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP
23142 + depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP && !XEN
23143 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
23144 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
23145 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
23146 @@ -467,7 +467,7 @@ config HP_ILO
23149 tristate "SGI GRU driver"
23150 - depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP
23151 + depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP && !XEN
23153 select MMU_NOTIFIER
23155 --- sle11-2009-10-16.orig/drivers/pci/msi-xen.c 2009-03-16 16:38:05.000000000 +0100
23156 +++ sle11-2009-10-16/drivers/pci/msi-xen.c 2009-06-04 10:21:39.000000000 +0200
23157 @@ -90,12 +90,10 @@ arch_teardown_msi_irqs(struct pci_dev *d
23161 -static void msi_set_enable(struct pci_dev *dev, int enable)
23162 +static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
23167 - pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
23169 pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
23170 control &= ~PCI_MSI_FLAGS_ENABLE;
23171 @@ -105,6 +103,11 @@ static void msi_set_enable(struct pci_de
23175 +static void msi_set_enable(struct pci_dev *dev, int enable)
23177 + __msi_set_enable(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), enable);
23180 static void msix_set_enable(struct pci_dev *dev, int enable)
23183 @@ -568,9 +571,8 @@ int pci_enable_msi(struct pci_dev* dev)
23185 /* Check whether driver already requested for MSI-X irqs */
23186 if (dev->msix_enabled) {
23187 - printk(KERN_INFO "PCI: %s: Can't enable MSI. "
23188 - "Device already has MSI-X enabled\n",
23190 + dev_info(&dev->dev, "can't enable MSI "
23191 + "(MSI-X already enabled)\n");
23195 @@ -702,9 +704,8 @@ int pci_enable_msix(struct pci_dev* dev,
23197 /* Check whether driver already requested for MSI vector */
23198 if (dev->msi_enabled) {
23199 - printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
23200 - "Device already has an MSI irq assigned\n",
23202 + dev_info(&dev->dev, "can't enable MSI-X "
23203 + "(MSI IRQ already assigned)\n");
23207 --- sle11-2009-10-16.orig/drivers/pci/quirks.c 2009-10-28 14:55:02.000000000 +0100
23208 +++ sle11-2009-10-16/drivers/pci/quirks.c 2009-06-04 10:21:39.000000000 +0200
23209 @@ -44,9 +44,8 @@ static void __devinit quirk_release_reso
23210 /* PCI Host Bridge isn't a target device */
23214 - "PCI: Disable memory decoding and release memory resources [%s].\n",
23216 + dev_info(&dev->dev,
23217 + "disable memory decoding and release memory resources\n");
23218 pci_read_config_word(dev, PCI_COMMAND, &command);
23219 command &= ~PCI_COMMAND_MEMORY;
23220 pci_write_config_word(dev, PCI_COMMAND, command);
23221 --- sle11-2009-10-16.orig/drivers/pci/setup-res.c 2009-10-28 14:55:02.000000000 +0100
23222 +++ sle11-2009-10-16/drivers/pci/setup-res.c 2009-06-04 10:21:39.000000000 +0200
23223 @@ -129,7 +129,7 @@ int pci_claim_resource(struct pci_dev *d
23224 #ifdef CONFIG_PCI_REASSIGN
23225 void pci_disable_bridge_window(struct pci_dev *dev)
23227 - printk(KERN_DEBUG "PCI: Disable bridge window on %s\n", pci_name(dev));
23228 + dev_dbg(&dev->dev, "disable bridge window\n");
23230 /* MMIO Base/Limit */
23231 pci_write_config_dword(dev, PCI_MEMORY_BASE, 0x0000fff0);
23232 @@ -190,8 +190,8 @@ int pci_assign_resource(struct pci_dev *
23233 res->flags &= ~IORESOURCE_STARTALIGN;
23234 if (resno < PCI_BRIDGE_RESOURCES) {
23235 #ifdef CONFIG_PCI_REASSIGN
23236 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23237 - "%016llx - %016llx\n", resno, pci_name(dev),
23238 + dev_dbg(&dev->dev, "assign resource(%d) "
23239 + "%016llx - %016llx\n", resno,
23240 (unsigned long long)res->start,
23241 (unsigned long long)res->end);
23243 @@ -235,8 +235,8 @@ int pci_assign_resource_fixed(struct pci
23244 (unsigned long long)res->end);
23245 } else if (resno < PCI_BRIDGE_RESOURCES) {
23246 #ifdef CONFIG_PCI_REASSIGN
23247 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23248 - "%016llx - %016llx\n", resno, pci_name(dev),
23249 + dev_dbg(&dev->dev, "assign resource(%d) "
23250 + "%016llx - %016llx\n", resno,
23251 (unsigned long long)res->start,
23252 (unsigned long long)res->end);
23254 --- sle11-2009-10-16.orig/drivers/xen/Makefile 2009-03-16 16:38:05.000000000 +0100
23255 +++ sle11-2009-10-16/drivers/xen/Makefile 2009-06-04 10:21:39.000000000 +0200
23257 -obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
23258 +obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o
23259 xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
23260 xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
23262 --- sle11-2009-10-16.orig/drivers/xen/balloon/balloon.c 2009-06-29 15:28:36.000000000 +0200
23263 +++ sle11-2009-10-16/drivers/xen/balloon/balloon.c 2009-06-29 15:30:29.000000000 +0200
23264 @@ -84,7 +84,7 @@ static unsigned long frame_list[PAGE_SIZ
23265 /* VM /proc information for memory */
23266 extern unsigned long totalram_pages;
23268 -#if !defined(MODULE) && defined(CONFIG_HIGHMEM)
23269 +#ifdef CONFIG_HIGHMEM
23270 extern unsigned long totalhigh_pages;
23271 #define inc_totalhigh_pages() (totalhigh_pages++)
23272 #define dec_totalhigh_pages() (totalhigh_pages--)
23273 --- sle11-2009-10-16.orig/drivers/xen/balloon/sysfs.c 2009-06-29 15:29:24.000000000 +0200
23274 +++ sle11-2009-10-16/drivers/xen/balloon/sysfs.c 2009-06-29 15:31:06.000000000 +0200
23277 #define BALLOON_SHOW(name, format, args...) \
23278 static ssize_t show_##name(struct sys_device *dev, \
23279 + struct sysdev_attribute *attr, \
23282 return sprintf(buf, format, ##args); \
23283 @@ -56,14 +57,15 @@ BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(b
23284 BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high));
23285 BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
23287 -static ssize_t show_target_kb(struct sys_device *dev, char *buf)
23288 +static ssize_t show_target_kb(struct sys_device *dev,
23289 + struct sysdev_attribute *attr, char *buf)
23291 return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
23294 static ssize_t store_target_kb(struct sys_device *dev,
23297 + struct sysdev_attribute *attr,
23298 + const char *buf, size_t count)
23300 char memstring[64], *endchar;
23301 unsigned long long target_bytes;
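
These hunks track the 2.6.27 sysdev API change: show/store methods of a sys_device attribute now receive the struct sysdev_attribute being accessed, so one callback can serve several attributes. A self-contained attribute under the new prototype (illustrative sketch, not from the patch):

    #include <linux/kernel.h>
    #include <linux/sysdev.h>

    static ssize_t show_example(struct sys_device *dev,
                                struct sysdev_attribute *attr, char *buf)
    {
            return sprintf(buf, "%s\n", attr->attr.name);
    }

    static SYSDEV_ATTR(example, S_IRUGO, show_example, NULL);
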
23302 --- sle11-2009-10-16.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:40:14.000000000 +0200
23303 +++ sle11-2009-10-16/drivers/xen/blktap/blktap.c 2009-06-04 10:21:39.000000000 +0200
23305 #include <linux/gfp.h>
23306 #include <linux/poll.h>
23307 #include <linux/delay.h>
23308 +#include <linux/nsproxy.h>
23309 #include <asm/tlbflush.h>
23311 #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
23312 @@ -498,7 +499,7 @@ found:
23314 if ((class = get_xen_class()) != NULL)
23315 device_create(class, NULL, MKDEV(blktap_major, minor),
23316 - "blktap%d", minor);
23317 + NULL, "blktap%d", minor);
23321 @@ -1683,7 +1684,8 @@ static int __init blkif_init(void)
23322 * We only create the device when a request of a new device is
23325 - device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
23326 + device_create(class, NULL, MKDEV(blktap_major, 0), NULL,
23329 /* this is bad, but not fatal */
23330 WPRINTK("blktap: sysfs xen_class not created\n");
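
Both blktap hunks exist for the same reason: in 2.6.27 device_create() gained a drvdata pointer, inserted just before the name format string, and callers with no private data pass NULL. Call-site sketch (hypothetical wrapper, not from the patch):

    #include <linux/device.h>

    static struct device *blktap_make_node(struct class *cls, dev_t devt,
                                           int minor)
    {
            /* old: device_create(cls, NULL, devt, "blktap%d", minor); */
            return device_create(cls, NULL, devt, NULL, "blktap%d", minor);
    }
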
23331 --- sle11-2009-10-16.orig/drivers/xen/char/mem.c 2009-03-16 16:38:05.000000000 +0100
23332 +++ sle11-2009-10-16/drivers/xen/char/mem.c 2009-06-04 10:21:39.000000000 +0200
23333 @@ -35,7 +35,7 @@ static inline int uncached_access(struct
23335 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
23337 -#ifdef CONFIG_NONPROMISC_DEVMEM
23338 +#ifdef CONFIG_STRICT_DEVMEM
23339 u64 from = ((u64)pfn) << PAGE_SHIFT;
23340 u64 to = from + size;
23342 @@ -172,7 +172,10 @@ static void mmap_mem_close(struct vm_are
23344 static struct vm_operations_struct mmap_mem_ops = {
23345 .open = mmap_mem_open,
23346 - .close = mmap_mem_close
23347 + .close = mmap_mem_close,
23348 +#ifdef CONFIG_HAVE_IOREMAP_PROT
23349 + .access = generic_access_phys
23353 static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
23354 --- sle11-2009-10-16.orig/drivers/xen/console/console.c 2009-03-16 16:38:05.000000000 +0100
23355 +++ sle11-2009-10-16/drivers/xen/console/console.c 2009-06-04 10:21:39.000000000 +0200
23356 @@ -432,9 +432,7 @@ static void __xencons_tx_flush(void)
23358 if (work_done && (xencons_tty != NULL)) {
23359 wake_up_interruptible(&xencons_tty->write_wait);
23360 - if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
23361 - (xencons_tty->ldisc.write_wakeup != NULL))
23362 - (xencons_tty->ldisc.write_wakeup)(xencons_tty);
23363 + tty_wakeup(xencons_tty);
23367 @@ -635,8 +633,8 @@ static void xencons_close(struct tty_str
23369 tty_wait_until_sent(tty, 0);
23370 tty_driver_flush_buffer(tty);
23371 - if (tty->ldisc.flush_buffer != NULL)
23372 - tty->ldisc.flush_buffer(tty);
23373 + if (tty->ldisc.ops->flush_buffer != NULL)
23374 + tty->ldisc.ops->flush_buffer(tty);
23376 spin_lock_irqsave(&xencons_lock, flags);
23377 xencons_tty = NULL;
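
The console hunks adapt to the 2.6.27 tty cleanups: line-discipline methods moved behind tty->ldisc.ops, and tty_wakeup() now bundles the TTY_DO_WRITE_WAKEUP test with the ldisc write_wakeup() call, plus the reference counting the old open-coded sequence never did. The transmit-flush tail thus shrinks to this sketch:

    #include <linux/tty.h>

    static void xencons_tx_done(struct tty_struct *tty)
    {
            wake_up_interruptible(&tty->write_wait);
            tty_wakeup(tty);        /* replaces the open-coded ldisc poke */
    }
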
23378 --- sle11-2009-10-16.orig/drivers/xen/core/evtchn.c 2009-03-16 16:33:40.000000000 +0100
23379 +++ sle11-2009-10-16/drivers/xen/core/evtchn.c 2009-06-04 10:21:39.000000000 +0200
23380 @@ -746,8 +746,9 @@ static struct irq_chip dynirq_chip = {
23383 /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
23384 -static int pirq_eoi_does_unmask;
23385 +static bool pirq_eoi_does_unmask;
23386 static unsigned long *pirq_needs_eoi;
23387 +static DECLARE_BITMAP(probing_pirq, NR_PIRQS);
23389 static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
23391 @@ -794,25 +795,31 @@ static inline void pirq_query_unmask(int
23392 set_bit(irq - PIRQ_BASE, pirq_needs_eoi);
23396 - * On startup, if there is no action associated with the IRQ then we are
23397 - * probing. In this case we should not share with others as it will confuse us.
23399 -#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
23400 +static int set_type_pirq(unsigned int irq, unsigned int type)
23402 + if (type != IRQ_TYPE_PROBE)
23404 + set_bit(irq - PIRQ_BASE, probing_pirq);
23408 static unsigned int startup_pirq(unsigned int irq)
23410 struct evtchn_bind_pirq bind_pirq;
23411 int evtchn = evtchn_from_irq(irq);
23413 - if (VALID_EVTCHN(evtchn))
23414 + if (VALID_EVTCHN(evtchn)) {
23415 + clear_bit(irq - PIRQ_BASE, probing_pirq);
23419 bind_pirq.pirq = evtchn_get_xen_pirq(irq);
23420 /* NB. We are happy to share unless we are probing. */
23421 - bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
23422 + bind_pirq.flags = test_and_clear_bit(irq - PIRQ_BASE, probing_pirq)
23423 + || (irq_desc[irq].status & IRQ_AUTODETECT)
23424 + ? 0 : BIND_PIRQ__WILL_SHARE;
23425 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
23426 - if (!probing_irq(irq))
23427 + if (bind_pirq.flags)
23428 printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
23431 @@ -891,6 +898,7 @@ static struct irq_chip pirq_chip = {
23432 .mask_ack = ack_pirq,
23435 + .set_type = set_type_pirq,
23437 .set_affinity = set_affinity_irq,
23439 @@ -1003,6 +1011,7 @@ void xen_poll_irq(int irq)
23443 +#ifdef CONFIG_PM_SLEEP
23444 static void restore_cpu_virqs(unsigned int cpu)
23446 struct evtchn_bind_virq bind_virq;
23447 @@ -1095,6 +1104,7 @@ void irq_resume(void)
23453 #if defined(CONFIG_X86_IO_APIC)
23454 #define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
23455 @@ -1177,7 +1187,7 @@ void __init xen_init_IRQ(void)
23456 * BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8)));
23457 eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
23458 if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
23459 - pirq_eoi_does_unmask = 1;
23460 + pirq_eoi_does_unmask = true;
23462 /* No event channels are 'live' right now. */
23463 for (i = 0; i < NR_EVENT_CHANNELS; i++)
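
The probing_pirq bitmap replaces the old "no action means probing" heuristic: the IRQ autoprobe core announces itself through chip->set_type(irq, IRQ_TYPE_PROBE), and startup_pirq() consumes that mark, or the IRQ_AUTODETECT status bit, to bind the PIRQ unshared. Condensed into one predicate (sketch; PIRQ_BASE and probing_pirq as in the hunk above):

    #include <linux/bitops.h>
    #include <linux/irq.h>

    /* Sketch: true exactly when this startup is part of IRQ autoprobing,
     * in which case the event channel must not be shared. */
    static bool pirq_probing(unsigned int irq)
    {
            return test_and_clear_bit(irq - PIRQ_BASE, probing_pirq) ||
                   (irq_desc[irq].status & IRQ_AUTODETECT);
    }
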
23464 --- sle11-2009-10-16.orig/drivers/xen/core/gnttab.c 2008-12-01 11:25:57.000000000 +0100
23465 +++ sle11-2009-10-16/drivers/xen/core/gnttab.c 2009-06-04 10:21:39.000000000 +0200
23466 @@ -449,6 +449,7 @@ static int map_pte_fn(pte_t *pte, struct
23470 +#ifdef CONFIG_PM_SLEEP
23471 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
23472 unsigned long addr, void *data)
23474 @@ -456,6 +457,7 @@ static int unmap_pte_fn(pte_t *pte, stru
23475 set_pte_at(&init_mm, addr, pte, __pte(0));
23480 void *arch_gnttab_alloc_shared(unsigned long *frames)
23482 @@ -633,6 +635,75 @@ void __gnttab_dma_map_page(struct page *
23483 } while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
23486 +#ifdef __HAVE_ARCH_PTE_SPECIAL
23488 +static unsigned int GNTMAP_pte_special;
23490 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
23491 + unsigned int count)
23495 + if (unlikely(cmd != GNTTABOP_map_grant_ref))
23498 + for (i = 0; i < count; ++i, ++map) {
23499 + if (!(map->flags & GNTMAP_host_map)
23500 + || !(map->flags & GNTMAP_application_map))
23502 + if (GNTMAP_pte_special)
23503 + map->flags |= GNTMAP_pte_special;
23505 + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
23512 +EXPORT_SYMBOL(gnttab_pre_map_adjust);
23514 +#if CONFIG_XEN_COMPAT < 0x030400
23515 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
23520 + for (i = 0; i < count && rc == 0; ++i, ++map) {
23523 + if (!(map->flags & GNTMAP_host_map)
23524 + || !(map->flags & GNTMAP_application_map))
23528 + pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
23529 + | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
23531 + & __supported_pte_mask);
23533 +#error Architecture not yet supported.
23535 + if (!(map->flags & GNTMAP_readonly))
23536 + pte = pte_mkwrite(pte);
23538 + if (map->flags & GNTMAP_contains_pte) {
23541 + u.ptr = map->host_addr;
23542 + u.val = __pte_val(pte);
23543 + rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
23545 + rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
23550 +EXPORT_SYMBOL(gnttab_post_map_adjust);
23553 +#endif /* __HAVE_ARCH_PTE_SPECIAL */
23555 int gnttab_resume(void)
23557 if (max_nr_grant_frames() < nr_grant_frames)
23558 @@ -640,6 +711,7 @@ int gnttab_resume(void)
23559 return gnttab_map(0, nr_grant_frames - 1);
23562 +#ifdef CONFIG_PM_SLEEP
23563 int gnttab_suspend(void)
23566 @@ -649,6 +721,7 @@ int gnttab_suspend(void)
23572 #else /* !CONFIG_XEN */
23574 @@ -759,6 +832,18 @@ int __devinit gnttab_init(void)
23575 gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
23576 gnttab_free_head = NR_RESERVED_ENTRIES;
23578 +#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
23579 + if (!xen_feature(XENFEAT_auto_translated_physmap)
23580 + && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
23582 + GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
23583 + >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
23585 +#error Architecture not yet supported.
23593 --- sle11-2009-10-16.orig/drivers/xen/core/machine_kexec.c 2009-03-16 16:38:05.000000000 +0100
23594 +++ sle11-2009-10-16/drivers/xen/core/machine_kexec.c 2009-06-04 10:21:39.000000000 +0200
23595 @@ -91,7 +91,7 @@ void __init xen_machine_kexec_setup_reso
23596 xen_hypervisor_res.start = range.start;
23597 xen_hypervisor_res.end = range.start + range.size - 1;
23598 xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
23599 -#ifdef CONFIG_X86_64
23601 insert_resource(&iomem_resource, &xen_hypervisor_res);
23604 @@ -106,7 +106,7 @@ void __init xen_machine_kexec_setup_reso
23606 crashk_res.start = range.start;
23607 crashk_res.end = range.start + range.size - 1;
23608 -#ifdef CONFIG_X86_64
23610 insert_resource(&iomem_resource, &crashk_res);
23613 @@ -160,7 +160,7 @@ void __init xen_machine_kexec_setup_reso
23617 -#ifndef CONFIG_X86_64
23618 +#ifndef CONFIG_X86
23619 void __init xen_machine_kexec_register_resources(struct resource *res)
23621 request_resource(res, &xen_hypervisor_res);
23622 --- sle11-2009-10-16.orig/drivers/xen/core/machine_reboot.c 2009-10-28 14:55:02.000000000 +0100
23623 +++ sle11-2009-10-16/drivers/xen/core/machine_reboot.c 2009-06-04 10:21:39.000000000 +0200
23624 @@ -57,6 +57,7 @@ EXPORT_SYMBOL(machine_restart);
23625 EXPORT_SYMBOL(machine_halt);
23626 EXPORT_SYMBOL(machine_power_off);
23628 +#ifdef CONFIG_PM_SLEEP
23629 static void pre_suspend(void)
23631 HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
23632 @@ -111,6 +112,7 @@ static void post_suspend(int suspend_can
23633 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
23634 virt_to_mfn(pfn_to_mfn_frame_list_list);
23638 #else /* !(defined(__i386__) || defined(__x86_64__)) */
23640 @@ -129,6 +131,7 @@ static void post_suspend(int suspend_can
23644 +#ifdef CONFIG_PM_SLEEP
23647 void (*resume_notifier)(int);
23648 @@ -222,7 +225,8 @@ int __xen_suspend(int fast_suspend, void
23650 if (fast_suspend) {
23652 - err = stop_machine_run(take_machine_down, &suspend, 0);
23653 + err = stop_machine(take_machine_down, &suspend,
23654 + &cpumask_of_cpu(0));
23656 xenbus_suspend_cancel();
23658 @@ -245,3 +249,4 @@ int __xen_suspend(int fast_suspend, void
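
stop_machine_run() became stop_machine() in 2.6.27, and its last argument changed from a CPU number to a cpumask naming the CPUs allowed to run the function; pinning the suspend path to CPU 0 is now spelled as in the hunk above. Sketch:

    #include <linux/cpumask.h>
    #include <linux/stop_machine.h>

    static int suspend_on_cpu0(int (*fn)(void *), void *data)
    {
            /* old: stop_machine_run(fn, data, 0); */
            return stop_machine(fn, data, &cpumask_of_cpu(0));
    }
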
23663 --- sle11-2009-10-16.orig/drivers/xen/core/reboot.c 2009-02-16 16:17:21.000000000 +0100
23664 +++ sle11-2009-10-16/drivers/xen/core/reboot.c 2009-06-04 10:21:39.000000000 +0200
23665 @@ -29,17 +29,12 @@ MODULE_LICENSE("Dual BSD/GPL");
23666 /* Ignore multiple shutdown requests. */
23667 static int shutting_down = SHUTDOWN_INVALID;
23669 -/* Was last suspend request cancelled? */
23670 -static int suspend_cancelled;
23672 /* Can we leave APs online when we suspend? */
23673 static int fast_suspend;
23675 static void __shutdown_handler(struct work_struct *unused);
23676 static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
23678 -static int setup_suspend_evtchn(void);
23680 int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
23682 static int shutdown_process(void *__unused)
23683 @@ -69,6 +64,13 @@ static int shutdown_process(void *__unus
23687 +#ifdef CONFIG_PM_SLEEP
23689 +static int setup_suspend_evtchn(void);
23691 +/* Was last suspend request cancelled? */
23692 +static int suspend_cancelled;
23694 static void xen_resume_notifier(int _suspend_cancelled)
23696 int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
23697 @@ -118,6 +120,10 @@ static int xen_suspend(void *__unused)
23702 +# define xen_suspend NULL
23705 static void switch_shutdown_state(int new_state)
23707 int prev_state, old_state = SHUTDOWN_INVALID;
23708 @@ -194,8 +200,10 @@ static void shutdown_handler(struct xenb
23709 new_state = SHUTDOWN_POWEROFF;
23710 else if (strcmp(str, "reboot") == 0)
23712 +#ifdef CONFIG_PM_SLEEP
23713 else if (strcmp(str, "suspend") == 0)
23714 new_state = SHUTDOWN_SUSPEND;
23716 else if (strcmp(str, "halt") == 0)
23717 new_state = SHUTDOWN_HALT;
23719 @@ -247,6 +255,7 @@ static struct xenbus_watch sysrq_watch =
23720 .callback = sysrq_handler
23723 +#ifdef CONFIG_PM_SLEEP
23724 static irqreturn_t suspend_int(int irq, void* dev_id)
23726 switch_shutdown_state(SHUTDOWN_SUSPEND);
23727 @@ -274,6 +283,9 @@ static int setup_suspend_evtchn(void)
23732 +#define setup_suspend_evtchn() 0
23735 static int setup_shutdown_watcher(void)
23737 --- sle11-2009-10-16.orig/drivers/xen/core/smpboot.c 2009-03-16 16:38:05.000000000 +0100
23738 +++ sle11-2009-10-16/drivers/xen/core/smpboot.c 2009-06-04 10:21:39.000000000 +0200
23741 extern irqreturn_t smp_reschedule_interrupt(int, void *);
23742 extern irqreturn_t smp_call_function_interrupt(int, void *);
23743 +extern irqreturn_t smp_call_function_single_interrupt(int, void *);
23745 extern int local_setup_timer(unsigned int cpu);
23746 extern void local_teardown_timer(unsigned int cpu);
23747 @@ -50,8 +51,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
23749 static DEFINE_PER_CPU(int, resched_irq);
23750 static DEFINE_PER_CPU(int, callfunc_irq);
23751 +static DEFINE_PER_CPU(int, call1func_irq);
23752 static char resched_name[NR_CPUS][15];
23753 static char callfunc_name[NR_CPUS][15];
23754 +static char call1func_name[NR_CPUS][15];
23756 #ifdef CONFIG_X86_LOCAL_APIC
23757 #define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
23758 @@ -73,15 +76,13 @@ void __init prefill_possible_map(void)
23760 for (i = 0; i < NR_CPUS; i++) {
23761 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
23764 cpu_set(i, cpu_possible_map);
23765 + nr_cpu_ids = i + 1;
23770 -void __init smp_alloc_memory(void)
23775 set_cpu_sibling_map(unsigned int cpu)
23777 @@ -110,7 +111,8 @@ static int __cpuinit xen_smp_intr_init(u
23781 - per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
23782 + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) =
23783 + per_cpu(call1func_irq, cpu) = -1;
23785 sprintf(resched_name[cpu], "resched%u", cpu);
23786 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
23787 @@ -134,6 +136,17 @@ static int __cpuinit xen_smp_intr_init(u
23789 per_cpu(callfunc_irq, cpu) = rc;
23791 + sprintf(call1func_name[cpu], "call1func%u", cpu);
23792 + rc = bind_ipi_to_irqhandler(CALL_FUNC_SINGLE_VECTOR,
23794 + smp_call_function_single_interrupt,
23795 + IRQF_DISABLED|IRQF_NOBALANCING,
23796 + call1func_name[cpu],
23800 + per_cpu(call1func_irq, cpu) = rc;
23802 rc = xen_spinlock_init(cpu);
23805 @@ -148,6 +161,8 @@ static int __cpuinit xen_smp_intr_init(u
23806 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23807 if (per_cpu(callfunc_irq, cpu) >= 0)
23808 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23809 + if (per_cpu(call1func_irq, cpu) >= 0)
23810 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23811 xen_spinlock_cleanup(cpu);
23814 @@ -160,6 +175,7 @@ static void __cpuexit xen_smp_intr_exit(
23816 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23817 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23818 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23819 xen_spinlock_cleanup(cpu);
23822 @@ -167,11 +183,7 @@ static void __cpuexit xen_smp_intr_exit(
23823 void __cpuinit cpu_bringup(void)
23827 identify_secondary_cpu(&current_cpu_data);
23829 - identify_cpu(&current_cpu_data);
23831 touch_softlockup_watchdog();
23833 local_irq_enable();
23834 @@ -251,9 +263,6 @@ void __init smp_prepare_cpus(unsigned in
23835 struct task_struct *idle;
23837 struct vcpu_get_physid cpu_id;
23839 - struct desc_ptr *gdt_descr;
23844 @@ -266,7 +275,7 @@ void __init smp_prepare_cpus(unsigned in
23846 current_thread_info()->cpu = 0;
23848 - for (cpu = 0; cpu < NR_CPUS; cpu++) {
23849 + for_each_possible_cpu (cpu) {
23850 cpus_clear(per_cpu(cpu_sibling_map, cpu));
23851 cpus_clear(per_cpu(cpu_core_map, cpu));
23853 @@ -293,21 +302,10 @@ void __init smp_prepare_cpus(unsigned in
23855 panic("failed fork for CPU %d", cpu);
23858 - gdt_descr = &cpu_gdt_descr[cpu];
23859 - gdt_descr->address = get_zeroed_page(GFP_KERNEL);
23860 - if (unlikely(!gdt_descr->address)) {
23861 - printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
23865 - gdt_descr->size = GDT_SIZE;
23866 - memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
23867 - gdt_addr = (void *)gdt_descr->address;
23871 - gdt_addr = get_cpu_gdt_table(cpu);
23873 + gdt_addr = get_cpu_gdt_table(cpu);
23874 make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
23877 @@ -353,8 +351,8 @@ void __init smp_prepare_boot_cpu(void)
23880 init_gdt(smp_processor_id());
23881 - switch_to_new_gdt();
23883 + switch_to_new_gdt();
23884 prefill_possible_map();
23887 --- sle11-2009-10-16.orig/drivers/xen/core/spinlock.c 2009-03-16 16:33:40.000000000 +0100
23888 +++ sle11-2009-10-16/drivers/xen/core/spinlock.c 2009-06-04 10:36:24.000000000 +0200
23890 * portions of this file.
23893 +#if CONFIG_XEN_COMPAT >= 0x030200
23895 #include <linux/init.h>
23896 #include <linux/irq.h>
23897 #include <linux/kernel.h>
23898 @@ -73,9 +75,9 @@ int xen_spin_wait(raw_spinlock_t *lock,
23899 /* announce we're spinning */
23900 spinning.ticket = token;
23901 spinning.lock = lock;
23902 - spinning.prev = __get_cpu_var(spinning);
23903 + spinning.prev = x86_read_percpu(spinning);
23905 - __get_cpu_var(spinning) = &spinning;
23906 + x86_write_percpu(spinning, &spinning);
23908 /* clear pending */
23909 xen_clear_irq_pending(irq);
23910 @@ -102,7 +104,7 @@ int xen_spin_wait(raw_spinlock_t *lock,
23911 kstat_this_cpu.irqs[irq] += !rc;
23913 /* announce we're done */
23914 - __get_cpu_var(spinning) = spinning.prev;
23915 + x86_write_percpu(spinning, spinning.prev);
23916 rm_lock = &__get_cpu_var(spinning_rm_lock);
23917 raw_local_irq_save(flags);
23918 __raw_write_lock(rm_lock);
23919 @@ -159,3 +161,5 @@ void xen_spin_kick(raw_spinlock_t *lock,
23922 EXPORT_SYMBOL(xen_spin_kick);
23924 +#endif /* CONFIG_XEN_COMPAT >= 0x030200 */
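
The spinning-stack updates swap __get_cpu_var() for x86_read_percpu()/x86_write_percpu(), which compile to a single segment-relative instruction and so cannot be torn by an interrupt arriving between address computation and store. The push side of the idiom, as a sketch (struct layout mirrors the fields used in spinlock.c):

    #include <linux/percpu.h>
    #include <linux/spinlock.h>

    struct spinning {
            unsigned int ticket;
            raw_spinlock_t *lock;
            struct spinning *prev;
    };
    static DEFINE_PER_CPU(struct spinning *, spinning);

    static void announce_spinning(struct spinning *s)
    {
            s->prev = x86_read_percpu(spinning);    /* was __get_cpu_var() */
            x86_write_percpu(spinning, s);          /* one insn: irq-atomic */
    }
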
23925 --- sle11-2009-10-16.orig/drivers/xen/fbfront/xenfb.c 2009-03-16 16:38:05.000000000 +0100
23926 +++ sle11-2009-10-16/drivers/xen/fbfront/xenfb.c 2009-06-04 10:21:39.000000000 +0200
23931 +#include <linux/console.h>
23932 #include <linux/kernel.h>
23933 #include <linux/errno.h>
23934 #include <linux/fb.h>
23935 @@ -545,6 +546,28 @@ static unsigned long vmalloc_to_mfn(void
23936 return pfn_to_mfn(vmalloc_to_pfn(address));
23939 +static __devinit void
23940 +xenfb_make_preferred_console(void)
23942 + struct console *c;
23944 + if (console_set_on_cmdline)
23947 + acquire_console_sem();
23948 + for (c = console_drivers; c; c = c->next) {
23949 + if (!strcmp(c->name, "tty") && c->index == 0)
23952 + release_console_sem();
23954 + unregister_console(c);
23955 + c->flags |= CON_CONSDEV;
23956 + c->flags &= ~CON_PRINTBUFFER; /* don't print again */
23957 + register_console(c);
23961 static int __devinit xenfb_probe(struct xenbus_device *dev,
23962 const struct xenbus_device_id *id)
23964 @@ -665,6 +688,7 @@ static int __devinit xenfb_probe(struct
23968 + xenfb_make_preferred_console();
23972 @@ -882,4 +906,5 @@ static void __exit xenfb_cleanup(void)
23973 module_init(xenfb_init);
23974 module_exit(xenfb_cleanup);
23976 +MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
23977 MODULE_LICENSE("GPL");
23978 --- sle11-2009-10-16.orig/drivers/xen/fbfront/xenkbd.c 2009-03-04 11:25:55.000000000 +0100
23979 +++ sle11-2009-10-16/drivers/xen/fbfront/xenkbd.c 2009-06-04 10:21:39.000000000 +0200
23980 @@ -350,4 +350,5 @@ static void __exit xenkbd_cleanup(void)
23981 module_init(xenkbd_init);
23982 module_exit(xenkbd_cleanup);
23984 +MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
23985 MODULE_LICENSE("GPL");
23986 --- sle11-2009-10-16.orig/drivers/xen/gntdev/gntdev.c 2009-03-16 16:38:05.000000000 +0100
23987 +++ sle11-2009-10-16/drivers/xen/gntdev/gntdev.c 2009-06-04 10:21:39.000000000 +0200
23988 @@ -418,7 +418,7 @@ static int __init gntdev_init(void)
23991 device = device_create(class, NULL, MKDEV(gntdev_major, 0),
23993 + NULL, GNTDEV_NAME);
23994 if (IS_ERR(device)) {
23995 printk(KERN_ERR "Error creating gntdev device in xen_class\n");
23996 printk(KERN_ERR "gntdev created with major number = %d\n",
23997 --- sle11-2009-10-16.orig/drivers/xen/netfront/accel.c 2009-03-30 16:39:19.000000000 +0200
23998 +++ sle11-2009-10-16/drivers/xen/netfront/accel.c 2009-06-04 10:21:39.000000000 +0200
24003 +#include <linux/version.h>
24004 #include <linux/netdevice.h>
24005 #include <linux/skbuff.h>
24006 #include <linux/list.h>
24007 --- sle11-2009-10-16.orig/drivers/xen/netfront/netfront.c 2009-03-30 16:40:17.000000000 +0200
24008 +++ sle11-2009-10-16/drivers/xen/netfront/netfront.c 2009-06-04 10:21:39.000000000 +0200
24009 @@ -640,7 +640,7 @@ static int network_open(struct net_devic
24011 spin_unlock_bh(&np->rx_lock);
24013 - network_maybe_wake_tx(dev);
24014 + netif_start_queue(dev);
24018 --- sle11-2009-10-16.orig/drivers/xen/sfc_netback/accel.h 2009-03-30 16:00:09.000000000 +0200
24019 +++ sle11-2009-10-16/drivers/xen/sfc_netback/accel.h 2009-06-04 10:21:39.000000000 +0200
24021 #ifndef NETBACK_ACCEL_H
24022 #define NETBACK_ACCEL_H
24024 +#include <linux/version.h>
24025 #include <linux/slab.h>
24026 #include <linux/ip.h>
24027 #include <linux/tcp.h>
24028 --- sle11-2009-10-16.orig/drivers/xen/sfc_netfront/accel.h 2009-03-30 16:34:56.000000000 +0200
24029 +++ sle11-2009-10-16/drivers/xen/sfc_netfront/accel.h 2009-06-04 10:21:39.000000000 +0200
24031 #include <xen/evtchn.h>
24033 #include <linux/kernel.h>
24034 +#include <linux/version.h>
24035 #include <linux/list.h>
24037 enum netfront_accel_post_status {
24038 --- sle11-2009-10-16.orig/drivers/xen/xenbus/xenbus_client.c 2009-03-24 10:13:17.000000000 +0100
24039 +++ sle11-2009-10-16/drivers/xen/xenbus/xenbus_client.c 2009-06-04 10:21:39.000000000 +0200
24040 @@ -150,7 +150,7 @@ int xenbus_watch_pathfmt(struct xenbus_d
24043 va_start(ap, pathfmt);
24044 - path = kvasprintf(GFP_KERNEL, pathfmt, ap);
24045 + path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, ap);
24049 --- sle11-2009-10-16.orig/drivers/xen/xenbus/xenbus_comms.c 2009-02-16 16:17:21.000000000 +0100
24050 +++ sle11-2009-10-16/drivers/xen/xenbus/xenbus_comms.c 2009-06-04 10:21:39.000000000 +0200
24051 @@ -228,14 +228,11 @@ int xb_init_comms(void)
24052 intf->rsp_cons = intf->rsp_prod;
24055 +#if defined(CONFIG_XEN) || defined(MODULE)
24057 unbind_from_irqhandler(xenbus_irq, &xb_waitq);
24059 -#if defined(CONFIG_XEN) || defined(MODULE)
24060 err = bind_caller_port_to_irqhandler(
24062 - err = bind_evtchn_to_irqhandler(
24064 xen_store_evtchn, wake_waiting,
24065 0, "xenbus", &xb_waitq);
24067 @@ -244,6 +241,20 @@ int xb_init_comms(void)
24072 + if (xenbus_irq) {
24073 + /* Already have an irq; assume we're resuming */
24074 + rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
24076 + err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
24077 + 0, "xenbus", &xb_waitq);
24079 + printk(KERN_ERR "XENBUS request irq failed %i\n", err);
24082 + xenbus_irq = err;
24088 --- sle11-2009-10-16.orig/drivers/xen/xenbus/xenbus_probe.c 2009-03-16 16:38:05.000000000 +0100
24089 +++ sle11-2009-10-16/drivers/xen/xenbus/xenbus_probe.c 2009-06-04 10:21:39.000000000 +0200
24091 __FUNCTION__, __LINE__, ##args)
24093 #include <linux/kernel.h>
24094 +#include <linux/version.h>
24095 #include <linux/err.h>
24096 #include <linux/string.h>
24097 #include <linux/ctype.h>
24098 --- sle11-2009-10-16.orig/fs/aio.c 2009-03-24 10:13:25.000000000 +0100
24099 +++ sle11-2009-10-16/fs/aio.c 2009-06-04 10:21:39.000000000 +0200
24100 @@ -1335,7 +1335,7 @@ static int make_aio_fd(struct kioctx *io
24104 - fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
24105 + fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
24109 --- sle11-2009-10-16.orig/include/asm-generic/pgtable.h 2009-03-04 11:28:34.000000000 +0100
24110 +++ sle11-2009-10-16/include/asm-generic/pgtable.h 2009-06-04 10:21:39.000000000 +0200
24111 @@ -99,10 +99,6 @@ static inline void ptep_set_wrprotect(st
24115 -#ifndef arch_change_pte_range
24116 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
24119 #ifndef __HAVE_ARCH_PTE_SAME
24120 #define pte_same(A,B) (pte_val(A) == pte_val(B))
24122 --- sle11-2009-10-16.orig/include/asm-x86/dma-mapping.h 2009-03-16 16:38:05.000000000 +0100
24123 +++ sle11-2009-10-16/include/asm-x86/dma-mapping.h 2009-06-04 10:21:39.000000000 +0200
24124 @@ -74,7 +74,7 @@ static inline struct dma_mapping_ops *ge
24125 /* Make sure we keep the same behaviour */
24126 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
24128 -#ifdef CONFIG_X86_32
24129 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
24132 struct dma_mapping_ops *ops = get_dma_ops(dev);
24133 --- sle11-2009-10-16.orig/include/asm-x86/kexec.h 2008-12-01 11:11:08.000000000 +0100
24134 +++ sle11-2009-10-16/include/asm-x86/kexec.h 2009-06-04 10:21:39.000000000 +0200
24136 # define VA_PTE_0 5
24137 # define PA_PTE_1 6
24138 # define VA_PTE_1 7
24139 +# ifndef CONFIG_XEN
24140 # define PA_SWAP_PAGE 8
24141 # ifdef CONFIG_X86_PAE
24142 # define PA_PMD_0 9
24145 # define PAGES_NR 9
24147 +# else /* CONFIG_XEN */
24149 + * The hypervisor interface implicitly requires that all entries (except
24150 + * for possibly the final one) are arranged in matching PA_/VA_ pairs.
24152 +# define PA_PMD_0 8
24153 +# define VA_PMD_0 9
24154 +# define PA_PMD_1 10
24155 +# define VA_PMD_1 11
24156 +# define PA_SWAP_PAGE 12
24157 +# define PAGES_NR 13
24158 +# endif /* CONFIG_XEN */
24160 # define PA_CONTROL_PAGE 0
24161 # define VA_CONTROL_PAGE 1
24162 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:38:05.000000000 +0100
24163 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/desc.h 2009-06-04 10:21:39.000000000 +0200
24164 @@ -31,11 +31,17 @@ extern struct desc_ptr idt_descr;
24165 extern gate_desc idt_table[];
24169 + struct desc_struct gdt[GDT_ENTRIES];
24170 +} __attribute__((aligned(PAGE_SIZE)));
24171 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
24173 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
24175 + return per_cpu(gdt_page, cpu).gdt;
24178 #ifdef CONFIG_X86_64
24179 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
24180 -extern struct desc_ptr cpu_gdt_descr[];
24181 -/* the cpu gdt accessor */
24182 -#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
24184 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
24185 unsigned dpl, unsigned ist, unsigned seg)
24186 @@ -53,16 +59,6 @@ static inline void pack_gate(gate_desc *
24191 - struct desc_struct gdt[GDT_ENTRIES];
24192 -} __attribute__((aligned(PAGE_SIZE)));
24193 -DECLARE_PER_CPU(struct gdt_page, gdt_page);
24195 -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
24197 - return per_cpu(gdt_page, cpu).gdt;
24200 static inline void pack_gate(gate_desc *gate, unsigned char type,
24201 unsigned long base, unsigned dpl, unsigned flags,
24202 unsigned short seg)
24203 @@ -333,6 +329,28 @@ static inline void set_intr_gate(unsigne
24204 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
24207 +#define SYS_VECTOR_FREE 0
24208 +#define SYS_VECTOR_ALLOCED 1
24210 +extern int first_system_vector;
24211 +extern char system_vectors[];
24213 +static inline void alloc_system_vector(int vector)
24215 + if (system_vectors[vector] == SYS_VECTOR_FREE) {
24216 + system_vectors[vector] = SYS_VECTOR_ALLOCED;
24217 + if (first_system_vector > vector)
24218 + first_system_vector = vector;
24223 +static inline void alloc_intr_gate(unsigned int n, void *addr)
24225 + alloc_system_vector(n);
24226 + set_intr_gate(n, addr);
24230 * This routine sets up an interrupt gate at directory privilege level 3.
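
alloc_intr_gate() couples the new vector bookkeeping (system_vectors[], first_system_vector) with gate installation, so a subsystem claims and wires its vector in one step. Usage sketch (vector number and handler are hypothetical):

    #include <linux/init.h>

    #define EXAMPLE_VECTOR 0xf5                     /* hypothetical vector */
    extern void example_interrupt(void);            /* hypothetical handler */

    static void __init wire_example_vector(void)
    {
            /* marks EXAMPLE_VECTOR allocated and installs its gate */
            alloc_intr_gate(EXAMPLE_VECTOR, example_interrupt);
    }
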
24232 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/fixmap.h 2009-03-16 16:38:05.000000000 +0100
24233 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/fixmap.h 2009-06-04 10:21:39.000000000 +0200
24235 # include "fixmap_64.h"
24238 +extern int fixmaps_set;
24240 +void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
24242 +static inline void __set_fixmap(enum fixed_addresses idx,
24243 + maddr_t phys, pgprot_t flags)
24245 + xen_set_fixmap(idx, phys, flags);
24248 +#define set_fixmap(idx, phys) \
24249 + __set_fixmap(idx, phys, PAGE_KERNEL)
24252 + * Some hardware wants to get fixmapped without caching.
24254 +#define set_fixmap_nocache(idx, phys) \
24255 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24257 #define clear_fixmap(idx) \
24258 __set_fixmap(idx, 0, __pgprot(0))
24260 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24261 +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24263 +extern void __this_fixmap_does_not_exist(void);
24266 + * 'index to address' translation. If anyone tries to use the idx
24267 + * directly without translation, we catch the bug with a NULL-dereference
24268 + * kernel oops. Illegal ranges of incoming indices are caught too.
24270 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24273 + * this branch gets completely eliminated after inlining,
24274 + * except when someone tries to use fixaddr indices in an
24275 + * illegal way. (such as mixing up address types or using
24276 + * out-of-range indices).
24278 + * If it doesn't get removed, the linker will complain
24279 + * loudly with a reasonably clear error message..
24281 + if (idx >= __end_of_fixed_addresses)
24282 + __this_fixmap_does_not_exist();
24284 + return __fix_to_virt(idx);
24287 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
24289 + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24290 + return __virt_to_fix(vaddr);
24293 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:38:05.000000000 +0100
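
The fix_to_virt() construction above is worth spelling out: with a compile-time-constant index the range check folds away entirely, while an out-of-range constant leaves behind a call to the never-defined __this_fixmap_does_not_exist(), so the bug surfaces as a link failure rather than a runtime oops. For example (any valid index works; FIX_APIC_BASE assumes a local APIC configuration):

    static unsigned long example_fixmap(void)
    {
            /* folds to a constant: the range check vanishes at compile time */
            return fix_to_virt(FIX_APIC_BASE);

            /* would survive to link time and fail, since
             * __this_fixmap_does_not_exist() has no definition:
             *      return fix_to_virt(__end_of_fixed_addresses);
             */
    }
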
24294 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-06-04 10:21:39.000000000 +0200
24295 @@ -58,10 +58,17 @@ enum fixed_addresses {
24296 #ifdef CONFIG_X86_LOCAL_APIC
24297 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24299 -#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
24300 +#ifndef CONFIG_XEN
24301 +#ifdef CONFIG_X86_IO_APIC
24302 FIX_IO_APIC_BASE_0,
24303 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
24307 +#define NR_FIX_ISAMAPS 256
24309 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24311 #ifdef CONFIG_X86_VISWS_APIC
24312 FIX_CO_CPU, /* Cobalt timer */
24313 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
24314 @@ -78,51 +85,38 @@ enum fixed_addresses {
24315 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
24316 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
24318 -#ifdef CONFIG_ACPI
24320 - FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24322 #ifdef CONFIG_PCI_MMCONFIG
24325 #ifdef CONFIG_PARAVIRT
24326 FIX_PARAVIRT_BOOTMAP,
24329 -#define NR_FIX_ISAMAPS 256
24331 - FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24332 __end_of_permanent_fixed_addresses,
24334 * 256 temporary boot-time mappings, used by early_ioremap(),
24335 * before ioremap() is functional.
24337 - * We round it up to the next 512 pages boundary so that we
24338 + * We round it up to the next 256 pages boundary so that we
24339 * can have a single pgd entry and a single pte table:
24341 #define NR_FIX_BTMAPS 64
24342 #define FIX_BTMAPS_NESTING 4
24343 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24344 - (__end_of_permanent_fixed_addresses & 511),
24345 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
24346 + (__end_of_permanent_fixed_addresses & 255),
24347 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24349 +#ifdef CONFIG_ACPI
24351 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24353 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24356 __end_of_fixed_addresses
24359 -extern void __set_fixmap(enum fixed_addresses idx,
24360 - maddr_t phys, pgprot_t flags);
24361 extern void reserve_top_address(unsigned long reserve);
24363 -#define set_fixmap(idx, phys) \
24364 - __set_fixmap(idx, phys, PAGE_KERNEL)
24366 - * Some hardware wants to get fixmapped without caching.
24368 -#define set_fixmap_nocache(idx, phys) \
24369 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24371 #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
24373 @@ -131,38 +125,5 @@ extern void reserve_top_address(unsigned
24374 #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
24375 #define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
24377 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24378 -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24380 -extern void __this_fixmap_does_not_exist(void);
24383 - * 'index to address' translation. If anyone tries to use the idx
24384 - * directly without tranlation, we catch the bug with a NULL-deference
24385 - * kernel oops. Illegal ranges of incoming indices are caught too.
24387 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24390 - * this branch gets completely eliminated after inlining,
24391 - * except when someone tries to use fixaddr indices in an
24392 - * illegal way. (such as mixing up address types or using
24393 - * out-of-range indices).
24395 - * If it doesn't get removed, the linker will complain
24396 - * loudly with a reasonably clear error message..
24398 - if (idx >= __end_of_fixed_addresses)
24399 - __this_fixmap_does_not_exist();
24401 - return __fix_to_virt(idx);
24404 -static inline unsigned long virt_to_fix(const unsigned long vaddr)
24406 - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24407 - return __virt_to_fix(vaddr);
24410 #endif /* !__ASSEMBLY__ */
24412 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:38:05.000000000 +0100
24413 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-06-04 10:21:39.000000000 +0200
24415 #define _ASM_FIXMAP_64_H
24417 #include <linux/kernel.h>
24418 +#include <asm/acpi.h>
24419 #include <asm/apicdef.h>
24420 #include <asm/page.h>
24421 #include <asm/vsyscall.h>
24422 @@ -40,7 +41,6 @@ enum fixed_addresses {
24425 FIX_EARLYCON_MEM_BASE,
24427 #ifdef CONFIG_X86_LOCAL_APIC
24428 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24430 @@ -53,14 +53,21 @@ enum fixed_addresses {
24431 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
24432 + MAX_EFI_IO_PAGES - 1,
24434 +#ifdef CONFIG_PARAVIRT
24435 + FIX_PARAVIRT_BOOTMAP,
24441 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24444 #define NR_FIX_ISAMAPS 256
24446 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24447 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24448 + FIX_OHCI1394_BASE,
24450 __end_of_permanent_fixed_addresses,
24452 * 256 temporary boot-time mappings, used by early_ioremap(),
24453 @@ -71,27 +78,12 @@ enum fixed_addresses {
24455 #define NR_FIX_BTMAPS 64
24456 #define FIX_BTMAPS_NESTING 4
24458 - __end_of_permanent_fixed_addresses + 512 -
24459 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24460 (__end_of_permanent_fixed_addresses & 511),
24461 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24462 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24463 - FIX_OHCI1394_BASE,
24465 __end_of_fixed_addresses
24468 -extern void __set_fixmap(enum fixed_addresses idx,
24469 - unsigned long phys, pgprot_t flags);
24471 -#define set_fixmap(idx, phys) \
24472 - __set_fixmap(idx, phys, PAGE_KERNEL)
24474 - * Some hardware wants to get fixmapped without caching.
24476 -#define set_fixmap_nocache(idx, phys) \
24477 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24479 #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
24480 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
24481 #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
24482 @@ -100,30 +92,4 @@ extern void __set_fixmap(enum fixed_addr
24483 #define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
24484 #define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
24486 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24488 -extern void __this_fixmap_does_not_exist(void);
24491 - * 'index to address' translation. If anyone tries to use the idx
24492 - * directly without translation, we catch the bug with a NULL-dereference
24493 - * kernel oops. Illegal ranges of incoming indices are caught too.
24495 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24498 - * this branch gets completely eliminated after inlining,
24499 - * except when someone tries to use fixaddr indices in an
24500 - * illegal way. (such as mixing up address types or using
24501 - * out-of-range indices).
24503 - * If it doesn't get removed, the linker will complain
24504 - * loudly with a reasonably clear error message..
24506 - if (idx >= __end_of_fixed_addresses)
24507 - __this_fixmap_does_not_exist();
24509 - return __fix_to_virt(idx);
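
Both deleted fix_to_virt() copies (the 32- and 64-bit variants above) rely on the same link-time bounds check: the undefined helper is only referenced when the index comparison cannot be constant-folded away. Below is a minimal user-space sketch of the idiom; every name in it is illustrative rather than the kernel's, and it needs an optimizing build (e.g. gcc -O2) for the dead branch to be eliminated.

    #include <stdio.h>

    #define DEMO_TOP   0xfffff000UL
    #define DEMO_SHIFT 12

    enum demo_fixed_addresses { DEMO_SLOT_A, DEMO_SLOT_B, demo_end };

    extern void demo_index_does_not_exist(void); /* deliberately undefined */

    static inline __attribute__((always_inline))
    unsigned long demo_fix_to_virt(const unsigned int idx)
    {
            /* Folded away for a valid constant idx; an out-of-range one
             * keeps the call to the undefined helper, so the link fails
             * with a reasonably clear error message. */
            if (idx >= demo_end)
                    demo_index_does_not_exist();
            return DEMO_TOP - ((unsigned long)idx << DEMO_SHIFT);
    }

    int main(void)
    {
            printf("%#lx\n", demo_fix_to_virt(DEMO_SLOT_B)); /* links fine */
            /* demo_fix_to_virt(demo_end); -- would fail at link time */
            return 0;
    }
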
24513 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:38:05.000000000 +0100
24514 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/highmem.h 2009-06-04 10:21:39.000000000 +0200
24515 @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *p
24517 #define flush_cache_kmaps() do { } while (0)
24519 +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
24520 + unsigned long end_pfn);
24522 void clear_highpage(struct page *);
24523 static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
24525 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/hypercall.h 2009-02-16 16:18:36.000000000 +0100
24526 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/hypercall.h 2009-06-04 10:21:39.000000000 +0200
24527 @@ -323,9 +323,19 @@ static inline int __must_check
24528 HYPERVISOR_grant_table_op(
24529 unsigned int cmd, void *uop, unsigned int count)
24531 + bool fixup = false;
24534 if (arch_use_lazy_mmu_mode())
24535 xen_multicall_flush(false);
24536 - return _hypercall3(int, grant_table_op, cmd, uop, count);
24537 +#ifdef GNTTABOP_map_grant_ref
24538 + if (cmd == GNTTABOP_map_grant_ref)
24540 + fixup = gnttab_pre_map_adjust(cmd, uop, count);
24541 + rc = _hypercall3(int, grant_table_op, cmd, uop, count);
24542 + if (rc == 0 && fixup)
24543 + rc = gnttab_post_map_adjust(uop, count);
24547 static inline int __must_check
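
The rewritten HYPERVISOR_grant_table_op() wrapper follows a pre-scan / batch / conditional post-pass shape: scan the request array first, issue the batched call, and patch results up only when the scan asked for it and the batch succeeded. A stand-alone sketch of that control flow follows; struct op, pre_adjust(), do_batch() and post_adjust() are invented stand-ins for the map requests, the hypercall and the gnttab_*_adjust() helpers, not the real interfaces.

    #include <stdbool.h>
    #include <stdio.h>

    struct op { int needs_fix; int result; };

    /* Stand-in for gnttab_pre_map_adjust(): decide up front whether a
     * post-pass will be needed once the batch has run. */
    static bool pre_adjust(const struct op *ops, unsigned int count)
    {
            for (unsigned int i = 0; i < count; i++)
                    if (ops[i].needs_fix)
                            return true;
            return false;
    }

    /* Stand-ins for the hypercall and gnttab_post_map_adjust(). */
    static int do_batch(struct op *ops, unsigned int count)
    {
            for (unsigned int i = 0; i < count; i++)
                    ops[i].result = 1;
            return 0;
    }

    static int post_adjust(struct op *ops, unsigned int count)
    {
            for (unsigned int i = 0; i < count; i++)
                    if (ops[i].needs_fix)
                            ops[i].result = 2;
            return 0;
    }

    static int submit(struct op *ops, unsigned int count)
    {
            bool fixup = pre_adjust(ops, count);
            int rc = do_batch(ops, count);

            /* As in the hunk: post-pass only on success, only on demand. */
            if (rc == 0 && fixup)
                    rc = post_adjust(ops, count);
            return rc;
    }

    int main(void)
    {
            struct op ops[2] = { { 1, 0 }, { 0, 0 } };
            printf("rc=%d results=%d,%d\n", submit(ops, 2),
                   ops[0].result, ops[1].result);
            return 0;
    }
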
24548 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2009-03-16 16:33:40.000000000 +0100
24549 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/hypervisor.h 2009-06-04 10:21:39.000000000 +0200
24552 #include <linux/types.h>
24553 #include <linux/kernel.h>
24554 -#include <linux/version.h>
24555 #include <linux/errno.h>
24556 #include <xen/interface/xen.h>
24557 #include <xen/interface/platform.h>
24558 @@ -112,6 +111,8 @@ int xen_create_contiguous_region(
24559 unsigned long vstart, unsigned int order, unsigned int address_bits);
24560 void xen_destroy_contiguous_region(
24561 unsigned long vstart, unsigned int order);
24562 +int early_create_contiguous_region(unsigned long pfn, unsigned int order,
24563 + unsigned int address_bits);
24567 @@ -181,6 +182,29 @@ static inline void xen_multicall_flush(b
24569 #endif /* CONFIG_XEN && !MODULE */
24573 +struct gnttab_map_grant_ref;
24574 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
24575 + unsigned int count);
24576 +#if CONFIG_XEN_COMPAT < 0x030400
24577 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
24579 +static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
24580 + unsigned int count)
24587 +#else /* !CONFIG_XEN */
24589 +#define gnttab_pre_map_adjust(...) false
24590 +#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
24592 +#endif /* CONFIG_XEN */
24594 #if defined(CONFIG_X86_64)
24595 #define MULTI_UVMFLAGS_INDEX 2
24596 #define MULTI_UVMDOMID_INDEX 3
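
For !CONFIG_XEN builds the two adjust helpers a few lines above collapse into macro stubs: a variadic macro that simply discards its arguments, and a GCC statement expression that both traps and yields an error code if anyone calls it anyway. Here is a small user-space illustration of the same two idioms, with assert() standing in for BUG() and invented names (statement expressions are a GCC/Clang extension):

    #include <assert.h>
    #include <errno.h>
    #include <stdio.h>

    #define demo_pre_adjust(...)  0                   /* never any fixup */
    #define demo_post_adjust(...) ({ assert(0); -ENOSYS; })

    int main(void)
    {
            printf("fixup=%d\n", demo_pre_adjust((void *)0, 3));
            /* demo_post_adjust((void *)0, 3); -- would trip the assert */
            return 0;
    }
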
24597 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/io.h 2009-03-16 16:38:05.000000000 +0100
24598 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/io.h 2009-09-24 11:02:00.000000000 +0200
24601 #define ARCH_HAS_IOREMAP_WC
24603 +#include <linux/compiler.h>
24606 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24607 + * mappings, before the real ioremap() is functional.
24608 + * A boot-time mapping is currently limited to at most 16 pages.
24610 +#ifndef __ASSEMBLY__
24611 +extern void early_ioremap_init(void);
24612 +extern void early_ioremap_clear(void);
24613 +extern void early_ioremap_reset(void);
24614 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24615 +extern void early_iounmap(void *addr, unsigned long size);
24616 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24619 +#define build_mmio_read(name, size, type, reg, barrier) \
24620 +static inline type name(const volatile void __iomem *addr) \
24621 +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
24622 +:"m" (*(volatile type __force *)addr) barrier); return ret; }
24624 +#define build_mmio_write(name, size, type, reg, barrier) \
24625 +static inline void name(type val, volatile void __iomem *addr) \
24626 +{ asm volatile("mov" size " %0,%1": :reg (val), \
24627 +"m" (*(volatile type __force *)addr) barrier); }
24629 +build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
24630 +build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
24631 +build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
24633 +build_mmio_read(__readb, "b", unsigned char, "=q", )
24634 +build_mmio_read(__readw, "w", unsigned short, "=r", )
24635 +build_mmio_read(__readl, "l", unsigned int, "=r", )
24637 +build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
24638 +build_mmio_write(writew, "w", unsigned short, "r", :"memory")
24639 +build_mmio_write(writel, "l", unsigned int, "r", :"memory")
24641 +build_mmio_write(__writeb, "b", unsigned char, "q", )
24642 +build_mmio_write(__writew, "w", unsigned short, "r", )
24643 +build_mmio_write(__writel, "l", unsigned int, "r", )
24645 +#define readb_relaxed(a) __readb(a)
24646 +#define readw_relaxed(a) __readw(a)
24647 +#define readl_relaxed(a) __readl(a)
24648 +#define __raw_readb __readb
24649 +#define __raw_readw __readw
24650 +#define __raw_readl __readl
24652 +#define __raw_writeb __writeb
24653 +#define __raw_writew __writew
24654 +#define __raw_writel __writel
24656 +#define mmiowb() barrier()
24658 +#ifdef CONFIG_X86_64
24659 +build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
24660 +build_mmio_read(__readq, "q", unsigned long, "=r", )
24661 +build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
24662 +build_mmio_write(__writeq, "q", unsigned long, "r", )
24664 +#define readq_relaxed(a) __readq(a)
24665 +#define __raw_readq __readq
24666 +#define __raw_writeq writeq
24668 +/* Let people know we have them */
24669 +#define readq readq
24670 +#define writeq writeq
24673 +#define native_io_delay xen_io_delay
24675 #ifdef CONFIG_X86_32
24676 -# include "io_32.h"
24677 +# include "../../io_32.h"
24679 -# include "io_64.h"
24680 +# include "../../io_64.h"
24683 +#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
24685 +/* We will be supplying our own /dev/mem implementation */
24686 +#define ARCH_HAS_DEV_MEM
24688 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
24689 +#undef page_to_phys
24690 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
24691 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
24693 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
24694 + (unsigned long) (bv)->bv_offset)
24696 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
24697 + (bvec_to_phys(vec1) + (vec1)->bv_len == bvec_to_phys(vec2) \
24698 + && bvec_to_pseudophys(vec1) + (vec1)->bv_len \
24699 + == bvec_to_pseudophys(vec2))
24701 +#undef virt_to_bus
24702 +#undef bus_to_virt
24703 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
24704 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
24706 +#include <asm/fixmap.h>
24708 +#undef __ISA_IO_base
24709 +#undef isa_virt_to_bus
24710 +#undef isa_page_to_bus
24711 +#undef isa_bus_to_virt
24712 +#define isa_virt_to_bus(_x) ({ \
24713 + unsigned long _va_ = (unsigned long)(_x); \
24714 + _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
24715 + ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
24716 + : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
24717 +#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
24721 extern void *xlate_dev_mem_ptr(unsigned long phys);
24722 extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
24724 -extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
24725 -extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
24727 extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
24728 unsigned long prot_val);
24729 extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
24732 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24733 + * mappings, before the real ioremap() is functional.
24734 + * A boot-time mapping is currently limited to at most 16 pages.
24736 +extern void early_ioremap_init(void);
24737 +extern void early_ioremap_clear(void);
24738 +extern void early_ioremap_reset(void);
24739 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24740 +extern void early_iounmap(void *addr, unsigned long size);
24741 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24744 #endif /* _ASM_X86_IO_H */
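
The build_mmio_read()/build_mmio_write() generators above are easiest to read once expanded. Hand-expanding build_mmio_read(readl, "l", unsigned int, "=r", :"memory") gives roughly the function below (x86-only; the sparse-only __force annotation is dropped since this is plain C). The __readl/readl_relaxed variants differ only in omitting the "memory" clobber, which permits the compiler to reorder them against other memory accesses.

    #include <stdio.h>

    static inline unsigned int readl_expanded(const volatile void *addr)
    {
            unsigned int ret;
            asm volatile("movl %1,%0"
                         : "=r" (ret)
                         : "m" (*(volatile unsigned int *)addr)
                         : "memory");
            return ret;
    }

    int main(void)
    {
            /* Ordinary memory stands in for an MMIO register here. */
            unsigned int reg = 42;
            printf("%u\n", readl_expanded(&reg));
            return 0;
    }
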
24745 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
24746 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/irq_vectors.h 2009-06-04 10:21:39.000000000 +0200
24748 +#ifndef _ASM_IRQ_VECTORS_H
24749 +#define _ASM_IRQ_VECTORS_H
24751 +#ifdef CONFIG_X86_32
24752 +# define SYSCALL_VECTOR 0x80
24754 +# define IA32_SYSCALL_VECTOR 0x80
24757 +#define RESCHEDULE_VECTOR 0
24758 +#define CALL_FUNCTION_VECTOR 1
24759 +#define CALL_FUNC_SINGLE_VECTOR 2
24760 +#define SPIN_UNLOCK_VECTOR 3
24764 + * The maximum number of vectors supported by i386 processors
24765 + * is limited to 256. For processors other than i386, NR_VECTORS
24766 + * should be changed accordingly.
24768 +#define NR_VECTORS 256
24770 +#define FIRST_VM86_IRQ 3
24771 +#define LAST_VM86_IRQ 15
24772 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
24775 + * The flat IRQ space is divided into two regions:
24776 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
24777 + * if we have physical device-access privilege. This region is at the
24778 + * start of the IRQ space so that existing device drivers do not need
24779 + * to be modified to translate physical IRQ numbers into our IRQ space.
24780 + * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
24781 + * are bound using the provided bind/unbind functions.
24784 +#define PIRQ_BASE 0
24785 +#if defined(NR_CPUS) && defined(MAX_IO_APICS)
24786 +# if NR_CPUS < MAX_IO_APICS
24787 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
24789 +# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
24793 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
24794 +#define NR_DYNIRQS 256
24796 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
24797 +#define NR_IRQ_VECTORS NR_IRQS
24799 +#endif /* _ASM_IRQ_VECTORS_H */
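
Plugging sample values into the layout macros makes the two-region split concrete. With NR_VECTORS=256 and assumed values NR_CPUS=8 and MAX_IO_APICS=64 (both are configuration-dependent; they are chosen here only for the worked example), physical IRQs occupy 0..511 and dynamic IRQs 512..767:

    #include <stdio.h>

    #define NR_VECTORS   256
    #define NR_CPUS      8    /* assumption for the example */
    #define MAX_IO_APICS 64   /* assumption for the example */

    #if NR_CPUS < MAX_IO_APICS
    # define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
    #else
    # define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
    #endif

    #define PIRQ_BASE   0
    #define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
    #define NR_DYNIRQS  256
    #define NR_IRQS     (NR_PIRQS + NR_DYNIRQS)

    int main(void)
    {
            printf("pirq %d..%d  dynirq %d..%d  NR_IRQS=%d\n",
                   PIRQ_BASE, DYNIRQ_BASE - 1,
                   DYNIRQ_BASE, NR_IRQS - 1, NR_IRQS);
            /* pirq 0..511  dynirq 512..767  NR_IRQS=768 */
            return 0;
    }
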
24800 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:38:05.000000000 +0100
24801 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/irqflags.h 2009-06-04 10:21:39.000000000 +0200
24802 @@ -118,7 +118,7 @@ static inline void halt(void)
24804 #ifndef CONFIG_X86_64
24805 #define INTERRUPT_RETURN iret
24806 -#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
24807 +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
24808 sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
24810 jnz 14f /* process more events if necessary... */ ; \
24811 @@ -175,18 +175,6 @@ static inline void trace_hardirqs_fixup_
24814 #ifdef CONFIG_X86_64
24816 - * Currently paravirt can't handle swapgs nicely when we
24817 - * don't have a stack we can rely on (such as a user space
24818 - * stack). So we either find a way around these or just fault
24819 - * and emulate if a guest tries to call swapgs directly.
24821 - * Either way, this is a good way to document that we don't
24822 - * have a reliable stack. x86_64 only.
24824 -#define SWAPGS_UNSAFE_STACK swapgs
24825 -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
24826 -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
24827 #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
24828 #define ARCH_LOCKDEP_SYS_EXIT_IRQ \
24830 @@ -198,24 +186,6 @@ static inline void trace_hardirqs_fixup_
24834 -#define ARCH_TRACE_IRQS_ON \
24838 - call trace_hardirqs_on; \
24843 -#define ARCH_TRACE_IRQS_OFF \
24847 - call trace_hardirqs_off; \
24852 #define ARCH_LOCKDEP_SYS_EXIT \
24855 @@ -229,8 +199,8 @@ static inline void trace_hardirqs_fixup_
24858 #ifdef CONFIG_TRACE_IRQFLAGS
24859 -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
24860 -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
24861 +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
24862 +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
24864 # define TRACE_IRQS_ON
24865 # define TRACE_IRQS_OFF
24866 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/mmu_context.h 2009-02-16 16:18:36.000000000 +0100
24867 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/mmu_context.h 2009-06-04 10:21:39.000000000 +0200
24869 +#ifndef __ASM_X86_MMU_CONTEXT_H
24870 +#define __ASM_X86_MMU_CONTEXT_H
24872 +#include <asm/desc.h>
24873 +#include <asm/atomic.h>
24874 +#include <asm/pgalloc.h>
24875 +#include <asm/tlbflush.h>
24877 +void arch_exit_mmap(struct mm_struct *mm);
24878 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24880 +void mm_pin(struct mm_struct *mm);
24881 +void mm_unpin(struct mm_struct *mm);
24882 +void mm_pin_all(void);
24884 +static inline void xen_activate_mm(struct mm_struct *prev,
24885 + struct mm_struct *next)
24887 + if (!PagePinned(virt_to_page(next->pgd)))
24892 + * Used for LDT copy/destruction.
24894 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24895 +void destroy_context(struct mm_struct *mm);
24897 #ifdef CONFIG_X86_32
24898 # include "mmu_context_32.h"
24900 # include "mmu_context_64.h"
24903 +#define activate_mm(prev, next) \
24905 + xen_activate_mm(prev, next); \
24906 + switch_mm((prev), (next), NULL); \
24910 +#endif /* __ASM_X86_MMU_CONTEXT_H */
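
The consolidated activate_mm() encodes an ordering invariant: under Xen a pgd must be pinned (validated and write-protected by the hypervisor) before it may be loaded. A toy model of just that invariant follows; the struct and all three helpers are stand-ins, not kernel interfaces.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct demo_mm { bool pinned; };

    static void demo_pin(struct demo_mm *mm)
    {
            mm->pinned = true;      /* stands in for mm_pin() */
    }

    static void demo_switch(struct demo_mm *next)
    {
            assert(next->pinned);   /* precondition for loading the pgd */
            printf("switched\n");
    }

    static void demo_activate(struct demo_mm *next)
    {
            if (!next->pinned)      /* mirrors the PagePinned() test */
                    demo_pin(next);
            demo_switch(next);
    }

    int main(void)
    {
            struct demo_mm next = { false };
            demo_activate(&next);
            return 0;
    }
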
24911 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:38:05.000000000 +0100
24912 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-06-04 10:21:39.000000000 +0200
24914 #ifndef __I386_SCHED_H
24915 #define __I386_SCHED_H
24917 -#include <asm/desc.h>
24918 -#include <asm/atomic.h>
24919 -#include <asm/pgalloc.h>
24920 -#include <asm/tlbflush.h>
24922 -void arch_exit_mmap(struct mm_struct *mm);
24923 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24925 -void mm_pin(struct mm_struct *mm);
24926 -void mm_unpin(struct mm_struct *mm);
24927 -void mm_pin_all(void);
24929 -static inline void xen_activate_mm(struct mm_struct *prev,
24930 - struct mm_struct *next)
24932 - if (!PagePinned(virt_to_page(next->pgd)))
24937 - * Used for LDT copy/destruction.
24939 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24940 -void destroy_context(struct mm_struct *mm);
24943 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24945 #if 0 /* XEN: no lazy tlb */
24946 @@ -107,10 +81,4 @@ static inline void switch_mm(struct mm_s
24947 #define deactivate_mm(tsk, mm) \
24948 asm("movl %0,%%gs": :"r" (0));
24950 -#define activate_mm(prev, next) \
24952 - xen_activate_mm(prev, next); \
24953 - switch_mm((prev), (next), NULL); \
24957 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:38:05.000000000 +0100
24958 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-06-04 10:21:39.000000000 +0200
24960 #ifndef __X86_64_MMU_CONTEXT_H
24961 #define __X86_64_MMU_CONTEXT_H
24963 -#include <asm/desc.h>
24964 -#include <asm/atomic.h>
24965 -#include <asm/pgalloc.h>
24966 -#include <asm/page.h>
24967 -#include <asm/pda.h>
24968 -#include <asm/pgtable.h>
24969 -#include <asm/tlbflush.h>
24971 -void arch_exit_mmap(struct mm_struct *mm);
24972 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24975 - * possibly do the LDT unload here?
24977 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24978 -void destroy_context(struct mm_struct *mm);
24980 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24982 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
24983 @@ -58,10 +41,6 @@ static inline void __prepare_arch_switch
24987 -extern void mm_pin(struct mm_struct *mm);
24988 -extern void mm_unpin(struct mm_struct *mm);
24989 -void mm_pin_all(void);
24991 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
24992 struct task_struct *tsk)
24994 @@ -124,11 +103,4 @@ do { \
24995 asm volatile("movl %0,%%fs"::"r"(0)); \
24998 -static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
25000 - if (!PagePinned(virt_to_page(next->pgd)))
25002 - switch_mm(prev, next, NULL);
25006 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:38:05.000000000 +0100
25007 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/page.h 2009-06-04 10:21:39.000000000 +0200
25009 * below. The preprocessor will warn if the two definitions aren't identical.
25011 #define _PAGE_BIT_PRESENT 0
25012 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25013 -#define _PAGE_BIT_IO 9
25014 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25015 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25016 +#define _PAGE_BIT_IO 11
25017 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25019 #define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
25020 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
25022 (ie, 32-bit PAE). */
25023 #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
25025 -/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
25026 -#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
25027 +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
25028 +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
25030 +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
25031 +#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
25033 #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
25034 #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
25036 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
25037 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
25039 -/* to align the pointer to the (next) page boundary */
25040 -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
25041 +#define HUGE_MAX_HSTATE 2
25043 #ifndef __ASSEMBLY__
25044 #include <linux/types.h>
25047 #ifndef __ASSEMBLY__
25049 +typedef struct { pgdval_t pgd; } pgd_t;
25050 +typedef struct { pgprotval_t pgprot; } pgprot_t;
25052 extern int page_is_ram(unsigned long pagenr);
25053 extern int devmem_is_allowed(unsigned long pagenr);
25054 +extern void map_devmem(unsigned long pfn, unsigned long size,
25055 + pgprot_t vma_prot);
25056 +extern void unmap_devmem(unsigned long pfn, unsigned long size,
25057 + pgprot_t vma_prot);
25059 +extern unsigned long max_low_pfn_mapped;
25060 extern unsigned long max_pfn_mapped;
25063 @@ -84,15 +94,11 @@ static inline void copy_user_page(void *
25064 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
25065 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
25067 -typedef struct { pgprotval_t pgprot; } pgprot_t;
25069 #define pgprot_val(x) ((x).pgprot)
25070 #define __pgprot(x) ((pgprot_t) { (x) } )
25072 #include <asm/maddr.h>
25074 -typedef struct { pgdval_t pgd; } pgd_t;
25076 #define __pgd_ma(x) ((pgd_t) { (x) } )
25077 static inline pgd_t xen_make_pgd(pgdval_t val)
25079 @@ -196,6 +202,11 @@ static inline pteval_t xen_pte_val(pte_t
25083 +static inline pteval_t xen_pte_flags(pte_t pte)
25085 + return __pte_val(pte) & PTE_FLAGS_MASK;
25088 #define pgd_val(x) xen_pgd_val(x)
25089 #define __pgd(x) xen_make_pgd(x)
25091 @@ -210,6 +221,7 @@ static inline pteval_t xen_pte_val(pte_t
25094 #define pte_val(x) xen_pte_val(x)
25095 +#define pte_flags(x) xen_pte_flags(x)
25096 #define __pte(x) xen_make_pte(x)
25098 #define __pa(x) __phys_addr((unsigned long)(x))
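
The PTE_MASK split into PTE_PFN_MASK and PTE_FLAGS_MASK separates the frame number from everything else, including high bits such as _PAGE_NX at bit 63. A self-contained illustration, assuming a 46-bit physical mask and 4 KiB pages (both widths are configuration-dependent):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t pteval_t;

    #define PAGE_SHIFT     12
    #define PTE_PFN_MASK   ((((pteval_t)1 << 46) - 1) & \
                            ~(((pteval_t)1 << PAGE_SHIFT) - 1))
    #define PTE_FLAGS_MASK (~PTE_PFN_MASK)

    int main(void)
    {
            /* pfn 0x1234 with PRESENT|RW|ACCESSED|DIRTY (0x63) set */
            pteval_t pte = ((pteval_t)0x1234 << PAGE_SHIFT) | 0x63;

            printf("pfn=%#llx flags=%#llx\n",
                   (unsigned long long)((pte & PTE_PFN_MASK) >> PAGE_SHIFT),
                   (unsigned long long)(pte & PTE_FLAGS_MASK));
            /* prints pfn=0x1234 flags=0x63 */
            return 0;
    }
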
25099 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:38:05.000000000 +0100
25100 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/page_64.h 2009-06-04 10:21:39.000000000 +0200
25102 #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
25103 #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
25106 + * Set __PAGE_OFFSET to the most negative possible address +
25107 + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
25108 + * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
25109 + * what Xen requires.
25111 #define __PAGE_OFFSET _AC(0xffff880000000000, UL)
25113 #define __PHYSICAL_START CONFIG_PHYSICAL_START
25115 void clear_page(void *page);
25116 void copy_page(void *to, void *from);
25118 -extern unsigned long end_pfn;
25119 +/* duplicate of the one in bootmem.h */
25120 +extern unsigned long max_pfn;
25122 static inline unsigned long __phys_addr(unsigned long x)
25124 @@ -91,6 +98,11 @@ typedef union { pteval_t pte; unsigned i
25125 extern unsigned long init_memory_mapping(unsigned long start,
25126 unsigned long end);
25128 +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
25130 +extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
25131 +extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
25133 #endif /* !__ASSEMBLY__ */
25135 #ifdef CONFIG_FLATMEM
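
The new __PAGE_OFFSET comment's arithmetic can be checked directly: the lowest kernel-half address on a 48-bit-VA x86-64 machine is 0xffff800000000000, one pgd slot spans 512 GiB, and 16 slots are left free for the hypervisor.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t lowest_kernel = 0xffff800000000000ULL;  /* -2^47 */
            uint64_t pgdir_size    = 1ULL << 39;             /* 512 GiB */

            printf("%#llx\n",
                   (unsigned long long)(lowest_kernel + 16 * pgdir_size));
            /* prints 0xffff880000000000, matching the #define */
            return 0;
    }
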
25136 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:38:05.000000000 +0100
25137 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pci.h 2009-06-04 10:21:39.000000000 +0200
25138 @@ -21,6 +21,8 @@ struct pci_sysdata {
25142 +extern int pci_routeirq;
25144 /* scan a bus after allocating a pci_sysdata for it */
25145 extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
25147 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pci_32.h 2009-02-16 16:18:36.000000000 +0100
25148 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pci_32.h 2009-06-04 10:21:39.000000000 +0200
25149 @@ -38,12 +38,14 @@ struct pci_dev;
25150 #define PCI_DMA_BUS_IS_PHYS (1)
25152 /* pci_unmap_{page,single} is a nop so... */
25153 -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
25154 -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
25155 -#define pci_unmap_addr(PTR, ADDR_NAME) (0)
25156 -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
25157 -#define pci_unmap_len(PTR, LEN_NAME) (0)
25158 -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
25159 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
25160 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
25161 +#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
25162 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
25163 + do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
25164 +#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
25165 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
25166 + do { break; } while (pci_unmap_len(PTR, LEN_NAME))
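
The replacement pci_unmap_*() macros stay no-ops at run time but regain compile-time checking: the zero-length member (a GNU C extension) gives each name a real type, sizeof() references it without generating any code, and the do { break; } while (...) wrapper turns the set-side reference into a statement that compiles but executes nothing. A user-space sketch of the same idiom with invented names:

    #include <stdio.h>

    struct mapping {
            int data;
            unsigned long dma_addr[0];  /* zero bytes, but a real member */
    };

    #define unmap_addr(ptr)        sizeof((ptr)->dma_addr)
    #define unmap_addr_set(ptr, v) do { break; } while (unmap_addr(ptr))

    int main(void)
    {
            struct mapping m = { 7 };

            unmap_addr_set(&m, 123);   /* type-checked, emits no code */
            printf("sizeof member = %zu\n", unmap_addr(&m));  /* 0 */
            return 0;
    }
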
25170 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2009-03-16 16:38:05.000000000 +0100
25171 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgalloc.h 2009-06-04 10:21:39.000000000 +0200
25174 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
25176 +static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
25177 +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
25179 static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
25180 static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
25181 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
25182 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:38:05.000000000 +0100
25183 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable.h 2009-06-04 10:21:39.000000000 +0200
25184 @@ -13,11 +13,12 @@
25185 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
25186 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
25187 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
25188 -#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
25189 +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
25190 +#define _PAGE_BIT_UNUSED2 10
25191 +#define _PAGE_BIT_IO 11 /* Mapped page is I/O or foreign and
25192 * has no associated page struct. */
25193 -#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
25194 -#define _PAGE_BIT_UNUSED3 11
25195 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
25196 +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
25197 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
25199 /* If _PAGE_BIT_PRESENT is clear, we use these: */
25200 @@ -28,34 +29,31 @@
25201 /* if the user mapped it with PROT_NONE; pte_present gives true */
25202 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
25205 - * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
25206 - * sign-extended value on 32-bit with all 1's in the upper word,
25207 - * which preserves the upper pte values on 64-bit ptes:
25209 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25210 -#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
25211 -#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
25212 -#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
25213 -#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
25214 -#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
25215 -#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
25216 -#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
25217 -#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
25218 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25219 -#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
25220 -#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
25221 -#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
25222 -#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
25223 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25224 +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
25225 +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
25226 +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
25227 +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
25228 +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
25229 +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
25230 +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
25231 +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
25232 +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
25233 +#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2)
25234 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25235 +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
25236 +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
25237 +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
25238 +#define __HAVE_ARCH_PTE_SPECIAL
25240 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
25241 -#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
25242 +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
25244 -#define _PAGE_NX 0
25245 +#define _PAGE_NX (_AT(pteval_t, 0))
25248 -#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
25249 -#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
25250 +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
25251 +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
25253 #ifndef __ASSEMBLY__
25254 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
25255 @@ -71,8 +69,8 @@ extern unsigned int __kernel_page_user;
25256 _PAGE_DIRTY | __kernel_page_user)
25258 /* Set of bits not changed in pte_modify */
25259 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25260 - _PAGE_ACCESSED | _PAGE_DIRTY)
25261 +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25262 + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
25265 * PAT settings are part of the hypervisor interface, which sets the
25266 @@ -102,19 +100,9 @@ extern unsigned int __kernel_page_user;
25267 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
25270 -#ifdef CONFIG_X86_32
25271 -#define _PAGE_KERNEL_EXEC \
25272 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
25273 -#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
25275 -#ifndef __ASSEMBLY__
25276 -extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
25277 -#endif /* __ASSEMBLY__ */
25279 #define __PAGE_KERNEL_EXEC \
25280 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
25281 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
25284 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
25285 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
25286 @@ -125,25 +113,22 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
25287 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
25288 #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
25289 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
25290 +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
25291 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
25294 - * We don't support GLOBAL page in xenolinux64
25296 -#define MAKE_GLOBAL(x) __pgprot((x))
25298 -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
25299 -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
25300 -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
25301 -#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
25302 -#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
25303 -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
25304 -#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
25305 -#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
25306 -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
25307 -#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
25308 -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
25309 -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25310 +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
25311 +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
25312 +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
25313 +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
25314 +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
25315 +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
25316 +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
25317 +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
25318 +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
25319 +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
25320 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
25321 +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
25322 +#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25325 #define __P000 PAGE_NONE
25326 @@ -182,27 +167,27 @@ extern struct list_head pgd_list;
25328 static inline int pte_dirty(pte_t pte)
25330 - return __pte_val(pte) & _PAGE_DIRTY;
25331 + return pte_flags(pte) & _PAGE_DIRTY;
25334 static inline int pte_young(pte_t pte)
25336 - return __pte_val(pte) & _PAGE_ACCESSED;
25337 + return pte_flags(pte) & _PAGE_ACCESSED;
25340 static inline int pte_write(pte_t pte)
25342 - return __pte_val(pte) & _PAGE_RW;
25343 + return pte_flags(pte) & _PAGE_RW;
25346 static inline int pte_file(pte_t pte)
25348 - return __pte_val(pte) & _PAGE_FILE;
25349 + return pte_flags(pte) & _PAGE_FILE;
25352 static inline int pte_huge(pte_t pte)
25354 - return __pte_val(pte) & _PAGE_PSE;
25355 + return pte_flags(pte) & _PAGE_PSE;
25358 static inline int pte_global(pte_t pte)
25359 @@ -212,12 +197,12 @@ static inline int pte_global(pte_t pte)
25361 static inline int pte_exec(pte_t pte)
25363 - return !(__pte_val(pte) & _PAGE_NX);
25364 + return !(pte_flags(pte) & _PAGE_NX);
25367 static inline int pte_special(pte_t pte)
25370 + return pte_flags(pte) & _PAGE_SPECIAL;
25373 static inline int pmd_large(pmd_t pte)
25374 @@ -228,22 +213,22 @@ static inline int pmd_large(pmd_t pte)
25376 static inline pte_t pte_mkclean(pte_t pte)
25378 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
25379 + return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
25382 static inline pte_t pte_mkold(pte_t pte)
25384 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
25385 + return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
25388 static inline pte_t pte_wrprotect(pte_t pte)
25390 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
25391 + return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
25394 static inline pte_t pte_mkexec(pte_t pte)
25396 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
25397 + return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
25400 static inline pte_t pte_mkdirty(pte_t pte)
25401 @@ -268,7 +253,7 @@ static inline pte_t pte_mkhuge(pte_t pte
25403 static inline pte_t pte_clrhuge(pte_t pte)
25405 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
25406 + return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
25409 static inline pte_t pte_mkglobal(pte_t pte)
25410 @@ -283,35 +268,46 @@ static inline pte_t pte_clrglobal(pte_t
25412 static inline pte_t pte_mkspecial(pte_t pte)
25415 + return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
25418 extern pteval_t __supported_pte_mask;
25420 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
25422 - return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
25423 - pgprot_val(pgprot)) & __supported_pte_mask);
25424 + pgprotval_t prot = pgprot_val(pgprot);
25426 + if (prot & _PAGE_PRESENT)
25427 + prot &= __supported_pte_mask;
25428 + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25431 static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
25433 - return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
25434 - pgprot_val(pgprot)) & __supported_pte_mask);
25435 + pgprotval_t prot = pgprot_val(pgprot);
25437 + if (prot & _PAGE_PRESENT)
25438 + prot &= __supported_pte_mask;
25439 + return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25442 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
25444 - return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
25445 - pgprot_val(pgprot)) & __supported_pte_mask);
25446 + pgprotval_t prot = pgprot_val(pgprot);
25448 + if (prot & _PAGE_PRESENT)
25449 + prot &= __supported_pte_mask;
25450 + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25453 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
25455 - pteval_t val = pte_val(pte);
25456 + pgprotval_t prot = pgprot_val(newprot);
25457 + pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
25459 - val &= _PAGE_CHG_MASK;
25460 - val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
25461 + if (prot & _PAGE_PRESENT)
25462 + prot &= __supported_pte_mask;
25463 + val |= prot & ~_PAGE_CHG_MASK;
25467 @@ -325,9 +321,11 @@ static inline pgprot_t pgprot_modify(pgp
25468 return __pgprot(preservebits | addbits);
25471 -#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
25472 +#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
25474 -#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
25475 +#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \
25476 + ? pgprot_val(p) & __supported_pte_mask \
25479 #ifndef __ASSEMBLY__
25480 #define __HAVE_PHYS_MEM_ACCESS_PROT
25481 @@ -338,6 +336,17 @@ int phys_mem_access_prot_allowed(struct
25482 unsigned long size, pgprot_t *vma_prot);
25485 +/* Install a pte for a particular vaddr in kernel space. */
25486 +void set_pte_vaddr(unsigned long vaddr, pte_t pte);
25488 +#ifndef CONFIG_XEN
25489 +extern void native_pagetable_setup_start(pgd_t *base);
25490 +extern void native_pagetable_setup_done(pgd_t *base);
25492 +static inline void xen_pagetable_setup_start(pgd_t *base) {}
25493 +static inline void xen_pagetable_setup_done(pgd_t *base) {}
25496 #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
25497 #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
25499 @@ -373,6 +382,26 @@ int phys_mem_access_prot_allowed(struct
25500 # include "pgtable_64.h"
25504 + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
25506 + * this macro returns the index of the entry in the pgd page which would
25507 + * control the given virtual address
25509 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25512 + * pgd_offset() returns a (pgd_t *)
25513 + * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
25515 +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25517 + * a shortcut which implies the use of the kernel's pgd, instead
25518 + * of a process's
25519 + */
25520 +#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25523 #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
25524 #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
25526 @@ -383,8 +412,15 @@ enum {
25533 +#ifdef CONFIG_PROC_FS
25534 +extern void update_page_count(int level, unsigned long pages);
25536 +static inline void update_page_count(int level, unsigned long pages) { }
25540 * Helper function that returns the kernel pagetable entry controlling
25541 * the virtual address 'address'. NULL means no pagetable entry present.
25542 @@ -441,6 +477,8 @@ static inline void xen_pte_clear(struct
25543 * race with other CPU's that might be updating the dirty
25544 * bit at the same time.
25546 +struct vm_area_struct;
25548 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
25549 extern int ptep_set_access_flags(struct vm_area_struct *vma,
25550 unsigned long address, pte_t *ptep,
25551 @@ -523,9 +561,6 @@ static inline void clone_pgd_range(pgd_t
25552 memcpy(dst, src, count * sizeof(pgd_t));
25555 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
25556 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
25558 #define arbitrary_virt_to_machine(va) \
25560 unsigned int __lvl; \
25561 @@ -548,6 +583,34 @@ struct page *kmap_atomic_to_page(void *)
25562 #define ptep_to_machine(ptep) virt_to_machine(ptep)
25565 +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
25566 +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
25569 +#if CONFIG_XEN_COMPAT < 0x030300
25570 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
25571 + return ptep_get_and_clear(mm, addr, ptep);
25576 +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
25577 + pte_t *ptep, pte_t pte)
25581 +#if CONFIG_XEN_COMPAT < 0x030300
25582 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
25583 + set_pte_at(mm, addr, ptep, pte);
25587 + u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
25588 + u.val = __pte_val(pte);
25589 + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
25593 #include <asm-generic/pgtable.h>
25595 #include <xen/features.h>
25596 @@ -576,10 +639,6 @@ int touch_pte_range(struct mm_struct *mm
25597 unsigned long address,
25598 unsigned long size);
25600 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
25601 - unsigned long addr, unsigned long end, pgprot_t newprot,
25602 - int dirty_accountable);
25604 #endif /* __ASSEMBLY__ */
25606 #endif /* _ASM_X86_PGTABLE_H */
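
The wholesale switch from _AC(1, L) to _AT(pteval_t, 1) in this file replaces the deleted sign-extension trick with an explicitly typed constant, which matters on 32-bit PAE where pteval_t is 64-bit while long is only 32-bit. A minimal demonstration; the _AT() definition mirrors the kernel's, the rest is illustrative.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t pteval_t;          /* as on PAE / 64-bit */

    #define _AT(T, X) ((T)(X))          /* as in linux/const.h */

    int main(void)
    {
            pteval_t nx = _AT(pteval_t, 1) << 63;   /* _PAGE_NX */

            printf("%#llx\n", (unsigned long long)nx);
            /* A 32-bit `1L << 63` would be undefined here; typing the
             * constant as pteval_t makes the width explicit. */
            return 0;
    }
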
25607 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:38:05.000000000 +0100
25608 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-06-04 10:21:39.000000000 +0200
25609 @@ -14,11 +14,11 @@
25610 #define pmd_ERROR(e) \
25611 printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
25612 __FILE__, __LINE__, &(e), __pmd_val(e), \
25613 - (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25614 + (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25615 #define pgd_ERROR(e) \
25616 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
25617 __FILE__, __LINE__, &(e), __pgd_val(e), \
25618 - (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25619 + (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25621 static inline int pud_none(pud_t pud)
25623 @@ -27,7 +27,7 @@ static inline int pud_none(pud_t pud)
25625 static inline int pud_bad(pud_t pud)
25627 - return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25628 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25631 static inline int pud_present(pud_t pud)
25632 @@ -102,9 +102,9 @@ static inline void pud_clear(pud_t *pudp
25636 -#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
25637 +#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
25639 -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
25640 +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
25643 /* Find an entry in the second-level page table.. */
25644 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:38:05.000000000 +0100
25645 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-06-04 10:21:39.000000000 +0200
25646 @@ -89,10 +89,10 @@ extern unsigned long pg0[];
25647 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
25648 can temporarily clear it. */
25649 #define pmd_present(x) (__pmd_val(x))
25650 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25651 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25653 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
25654 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25655 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25659 @@ -119,26 +119,6 @@ extern unsigned long pg0[];
25661 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
25664 - * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25666 - * this macro returns the index of the entry in the pgd page which would
25667 - * control the given virtual address
25669 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25670 -#define pgd_index_k(addr) pgd_index((addr))
25673 - * pgd_offset() returns a (pgd_t *)
25674 - * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25676 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25679 - * a shortcut which implies the use of the kernel's pgd, instead
25682 -#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25684 static inline int pud_large(pud_t pud) { return 0; }
25686 @@ -165,7 +145,7 @@ static inline int pud_large(pud_t pud) {
25687 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25689 #define pmd_page_vaddr(pmd) \
25690 - ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
25691 + ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
25693 #if defined(CONFIG_HIGHPTE)
25694 #define pte_offset_map(dir, address) \
25695 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:38:05.000000000 +0100
25696 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-06-04 10:21:39.000000000 +0200
25697 @@ -23,6 +23,8 @@ extern void xen_init_pt(void);
25698 extern pud_t level3_kernel_pgt[512];
25699 extern pud_t level3_ident_pgt[512];
25700 extern pmd_t level2_kernel_pgt[512];
25701 +extern pmd_t level2_fixmap_pgt[512];
25702 +extern pmd_t level2_ident_pgt[512];
25703 extern pgd_t init_level4_pgt[];
25705 #define swapper_pg_dir init_level4_pgt
25706 @@ -79,6 +81,9 @@ extern void paging_init(void);
25710 +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
25713 #define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
25715 static inline void xen_set_pte(pte_t *ptep, pte_t pte)
25716 @@ -145,29 +150,29 @@ static inline void xen_pgd_clear(pgd_t *
25717 #define PGDIR_MASK (~(PGDIR_SIZE - 1))
25720 -#define MAXMEM _AC(0x00003fffffffffff, UL)
25721 +#define MAXMEM _AC(0x000004ffffffffff, UL)
25722 #define VMALLOC_START _AC(0xffffc20000000000, UL)
25723 #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
25724 #define VMEMMAP_START _AC(0xffffe20000000000, UL)
25725 #define MODULES_VADDR _AC(0xffffffffa0000000, UL)
25726 -#define MODULES_END _AC(0xfffffffffff00000, UL)
25727 +#define MODULES_END _AC(0xffffffffff000000, UL)
25728 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
25730 #ifndef __ASSEMBLY__
25732 static inline int pgd_bad(pgd_t pgd)
25734 - return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25735 + return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25738 static inline int pud_bad(pud_t pud)
25740 - return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25741 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25744 static inline int pmd_bad(pmd_t pmd)
25746 - return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25747 + return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25750 #define pte_none(x) (!(x).pte)
25751 @@ -175,7 +180,7 @@ static inline int pmd_bad(pmd_t pmd)
25753 #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
25755 -#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
25756 +#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
25757 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
25758 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
25759 #define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
25760 @@ -200,11 +205,8 @@ static inline int pmd_bad(pmd_t pmd)
25763 #define pgd_page_vaddr(pgd) \
25764 - ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
25765 + ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
25766 #define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
25767 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25768 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25769 -#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
25770 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
25771 static inline int pgd_large(pgd_t pgd) { return 0; }
25772 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
25773 @@ -226,7 +228,7 @@ static inline int pud_large(pud_t pte)
25776 /* PMD - Level 2 access */
25777 -#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
25778 +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
25779 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25781 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
25782 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:38:05.000000000 +0100
25783 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/processor.h 2009-06-04 10:21:39.000000000 +0200
25784 @@ -134,7 +134,7 @@ extern __u32 cleared_cpu_caps[NCAPINTS
25786 DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25787 #define cpu_data(cpu) per_cpu(cpu_info, cpu)
25788 -#define current_cpu_data cpu_data(smp_processor_id())
25789 +#define current_cpu_data __get_cpu_var(cpu_info)
25791 #define cpu_data(cpu) boot_cpu_data
25792 #define current_cpu_data boot_cpu_data
25793 @@ -153,7 +153,7 @@ static inline int hlt_works(int cpu)
25795 extern void cpu_detect(struct cpuinfo_x86 *c);
25797 -extern void identify_cpu(struct cpuinfo_x86 *);
25798 +extern void early_cpu_init(void);
25799 extern void identify_boot_cpu(void);
25800 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25801 extern void print_cpu_info(struct cpuinfo_x86 *);
25802 @@ -267,15 +267,11 @@ struct tss_struct {
25803 struct thread_struct *io_bitmap_owner;
25806 - * Pad the TSS to be cacheline-aligned (size is 0x100):
25808 - unsigned long __cacheline_filler[35];
25810 * .. and then another 0x100 bytes for the emergency kernel stack:
25812 unsigned long stack[64];
25814 -} __attribute__((packed));
25815 +} ____cacheline_aligned;
25817 DECLARE_PER_CPU(struct tss_struct, init_tss);
25819 @@ -668,11 +664,36 @@ static inline void __sti_mwait(unsigned
25821 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25823 -extern int force_mwait;
25825 extern void select_idle_routine(const struct cpuinfo_x86 *c);
25827 extern unsigned long boot_option_idle_override;
25828 +extern unsigned long idle_halt;
25829 +extern unsigned long idle_nomwait;
25831 +#ifndef CONFIG_XEN
25833 + * on systems with caches, caches must be flushed as the absolute
25834 + * last instruction before going into a suspended halt. Otherwise,
25835 + * dirty data can linger in the cache and become stale on resume,
25836 + * leading to strange errors.
25838 + * We perform a variety of operations to guarantee that the compiler
25839 + * will not reorder instructions. wbinvd itself is serializing
25840 + * so the processor will not reorder.
25842 + * Systems without cache can just go into halt.
25844 +static inline void wbinvd_halt(void)
25847 + /* check for clflush to determine if wbinvd is legal */
25848 + if (cpu_has_clflush)
25849 + asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
25856 extern void enable_sep_cpu(void);
25857 extern int sysenter_setup(void);
25858 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:38:05.000000000 +0100
25859 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/segment.h 2009-06-04 10:21:39.000000000 +0200
25861 #ifndef _ASM_X86_SEGMENT_H_
25862 #define _ASM_X86_SEGMENT_H_
25864 +/* Constructor for a conventional segment GDT (or LDT) entry */
25865 +/* This is a macro so it can be used in initializers */
25866 +#define GDT_ENTRY(flags, base, limit) \
25867 + ((((base) & 0xff000000ULL) << (56-24)) | \
25868 + (((flags) & 0x0000f0ffULL) << 40) | \
25869 + (((limit) & 0x000f0000ULL) << (48-16)) | \
25870 + (((base) & 0x00ffffffULL) << 16) | \
25871 + (((limit) & 0x0000ffffULL)))
25873 /* Simple and small GDT entries for booting only */
25875 #define GDT_ENTRY_BOOT_CS 2
25876 @@ -61,18 +70,14 @@
25877 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
25879 #define GDT_ENTRY_DEFAULT_USER_CS 14
25880 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
25882 #define GDT_ENTRY_DEFAULT_USER_DS 15
25883 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
25885 #define GDT_ENTRY_KERNEL_BASE 12
25887 #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
25888 -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25890 #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
25891 -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25893 #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
25894 #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
25895 @@ -143,10 +148,11 @@
25897 #include <asm/cache.h>
25899 -#define __KERNEL_CS 0x10
25900 -#define __KERNEL_DS 0x18
25901 +#define GDT_ENTRY_KERNEL32_CS 1
25902 +#define GDT_ENTRY_KERNEL_CS 2
25903 +#define GDT_ENTRY_KERNEL_DS 3
25905 -#define __KERNEL32_CS 0x08
25906 +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
25909 * we cannot use the same code segment descriptor for user and kernel
25910 @@ -154,10 +160,10 @@
25911 * The segment offset needs to contain a RPL. Grr. -AK
25912 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
25915 -#define __USER32_CS 0x23 /* 4*8+3 */
25916 -#define __USER_DS 0x2b /* 5*8+3 */
25917 -#define __USER_CS 0x33 /* 6*8+3 */
25918 +#define GDT_ENTRY_DEFAULT_USER32_CS 4
25919 +#define GDT_ENTRY_DEFAULT_USER_DS 5
25920 +#define GDT_ENTRY_DEFAULT_USER_CS 6
25921 +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
25922 #define __USER32_DS __USER_DS
25924 #define GDT_ENTRY_TSS 8 /* needs two entries */
25925 @@ -179,6 +185,11 @@
25929 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25930 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25931 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
25932 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
25934 /* User mode is privilege level 3 */
25935 #define USER_RPL 0x3
25936 /* LDT segment has TI set, GDT has it cleared */
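
A worked expansion of the new GDT_ENTRY() constructor helps check the bit shuffling: a flat code segment with base 0, limit 0xfffff and illustrative flags 0xc09a (present, DPL 0, execute/read, 32-bit, 4 KiB granularity) yields the familiar descriptor 0x00cf9a000000ffff.

    #include <stdio.h>

    /* Copied from the hunk above. */
    #define GDT_ENTRY(flags, base, limit)                   \
            ((((base)  & 0xff000000ULL) << (56-24)) |       \
             (((flags) & 0x0000f0ffULL) << 40)      |       \
             (((limit) & 0x000f0000ULL) << (48-16)) |       \
             (((base)  & 0x00ffffffULL) << 16)      |       \
             (((limit) & 0x0000ffffULL)))

    int main(void)
    {
            printf("%#018llx\n",
                   (unsigned long long)GDT_ENTRY(0xc09a, 0, 0xfffff));
            /* prints 0x00cf9a000000ffff */
            return 0;
    }
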
--- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/smp.h	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/smp.h	2009-06-04 10:21:39.000000000 +0200
@@ -25,23 +25,16 @@ extern cpumask_t cpu_initialized;
 extern void (*mtrr_hook)(void);
 extern void zap_low_mappings(void);

+extern int __cpuinit get_local_pda(int cpu);
+
 extern int smp_num_siblings;
 extern unsigned int num_processors;
 extern cpumask_t cpu_initialized;

-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
-extern u16 x86_cpu_to_apicid_init[];
-extern u16 x86_bios_cpu_apicid_init[];
-extern void *x86_cpu_to_apicid_early_ptr;
-extern void *x86_bios_cpu_apicid_early_ptr;
-#else
-#define x86_cpu_to_apicid_early_ptr NULL
-#define x86_bios_cpu_apicid_early_ptr NULL
-#endif
-
 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
 DECLARE_PER_CPU(u16, cpu_llc_id);

 DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
 DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);

@@ -63,9 +56,9 @@ struct smp_ops {
 	void (*smp_send_stop)(void);
 	void (*smp_send_reschedule)(int cpu);
-	int (*smp_call_function_mask)(cpumask_t mask,
-				      void (*func)(void *info), void *info,
-				      int wait);
+	void (*send_call_func_ipi)(cpumask_t mask);
+	void (*send_call_func_single_ipi)(int cpu);
 };

 /* Globals due to paravirt */
@@ -106,11 +99,14 @@ static inline void smp_send_reschedule(i
 	smp_ops.smp_send_reschedule(cpu);
 }

-static inline int smp_call_function_mask(cpumask_t mask,
-					 void (*func) (void *info), void *info,
-					 int wait)
+static inline void arch_send_call_function_single_ipi(int cpu)
 {
-	return smp_ops.smp_call_function_mask(mask, func, info, wait);
+	smp_ops.send_call_func_single_ipi(cpu);
 }

+static inline void arch_send_call_function_ipi(cpumask_t mask)
+{
+	smp_ops.send_call_func_ipi(mask);
+}
+
 void native_smp_prepare_boot_cpu(void);
@@ -122,23 +118,19 @@ int native_cpu_up(unsigned int cpunum);

 void xen_smp_send_stop(void);
 void xen_smp_send_reschedule(int cpu);
-int xen_smp_call_function_mask(cpumask_t mask,
-			       void (*func) (void *info), void *info,
-			       int wait);
+void xen_send_call_func_ipi(cpumask_t mask);
+void xen_send_call_func_single_ipi(int cpu);

 #define smp_send_stop		xen_smp_send_stop
 #define smp_send_reschedule	xen_smp_send_reschedule
-#define smp_call_function_mask	xen_smp_call_function_mask
-
-extern void prefill_possible_map(void);
+#define arch_send_call_function_single_ipi	xen_send_call_func_single_ipi
+#define arch_send_call_function_ipi		xen_send_call_func_ipi

 #endif /* CONFIG_XEN */

 extern int __cpu_disable(void);
 extern void __cpu_die(unsigned int cpu);

-extern void prefill_possible_map(void);
-
 void smp_store_cpu_info(int id);
 #define cpu_physical_id(cpu)	per_cpu(x86_cpu_to_apicid, cpu)

@@ -149,6 +141,14 @@ static inline int num_booting_cpus(void)

 #endif /* CONFIG_SMP */

+#if defined(CONFIG_SMP) && (defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_XEN))
+extern void prefill_possible_map(void);
+#else
+static inline void prefill_possible_map(void)
+{
+}
+#endif
+
 extern unsigned disabled_cpus __cpuinitdata;

 #ifdef CONFIG_X86_32_SMP
@@ -216,12 +216,8 @@ static inline int hard_smp_processor_id(
 #endif /* CONFIG_X86_LOCAL_APIC */

 #ifdef CONFIG_HOTPLUG_CPU
-extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
 #endif

-extern void smp_alloc_memory(void);
-extern void lock_ipi_call_lock(void);
-extern void unlock_ipi_call_lock(void);

 #endif /* __ASSEMBLY__ */
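A note for readers tracking the interface change in the smp.h hunks above: 2.6.27 moved the cross-CPU call queueing into generic code (kernel/smp.c), so the per-architecture ops table shrinks from one wide smp_call_function_mask() hook to two narrow IPI senders. The following standalone sketch shows the shape of that dispatch; the types and bodies are illustrative stand-ins (cpumask_t is reduced to an unsigned long bitmask), not the kernel's real implementations.

#include <stdio.h>

/* Reduced model of the reworked ops table: the architecture only
 * raises IPIs; queueing func/info and waiting live in generic code. */
struct smp_ops {
	void (*send_call_func_ipi)(unsigned long mask);
	void (*send_call_func_single_ipi)(int cpu);
};

static void xen_ipi_mask(unsigned long mask) { printf("IPI to mask %#lx\n", mask); }
static void xen_ipi_single(int cpu)          { printf("IPI to cpu %d\n", cpu); }

static struct smp_ops smp_ops = {
	.send_call_func_ipi        = xen_ipi_mask,
	.send_call_func_single_ipi = xen_ipi_single,
};

/* The inline wrappers from the hunk, minus the kernel plumbing. */
static inline void arch_send_call_function_ipi(unsigned long mask)
{
	smp_ops.send_call_func_ipi(mask);
}

static inline void arch_send_call_function_single_ipi(int cpu)
{
	smp_ops.send_call_func_single_ipi(cpu);
}

int main(void)
{
	arch_send_call_function_single_ipi(1);
	arch_send_call_function_ipi(0x6);	/* cpus 1 and 2 */
	return 0;
}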
--- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/spinlock.h	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/spinlock.h	2009-06-04 11:09:05.000000000 +0200
 # define UNLOCK_LOCK_PREFIX

+/*
+ * Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
+ */
+#if CONFIG_XEN_COMPAT >= 0x030200
 int xen_spinlock_init(unsigned int cpu);
 void xen_spinlock_cleanup(unsigned int cpu);
 extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
@@ -65,14 +70,14 @@ extern void xen_spin_kick(raw_spinlock_t

 #if (NR_CPUS < 256)
 #define TICKET_SHIFT 8
-#define __raw_spin_lock_preamble \
+#define __ticket_spin_lock_preamble \
 	asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
 	    "cmpb %h0, %b0\n\t" \
 	    : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
-#define __raw_spin_lock_body \
+#define __ticket_spin_lock_body \
 	    "cmpb %h0, %b0\n\t" \
@@ -88,7 +93,7 @@ extern void xen_spin_kick(raw_spinlock_t
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
 {
@@ -107,7 +112,7 @@ static __always_inline int __raw_spin_tr
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
 {
 	unsigned int token;
 	unsigned char kick;
@@ -124,7 +129,7 @@ static __always_inline void __raw_spin_u

 #else
 #define TICKET_SHIFT 16
-#define __raw_spin_lock_preamble \
+#define __ticket_spin_lock_preamble \
 	unsigned int tmp; \
 	asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
@@ -136,7 +141,7 @@ static __always_inline void __raw_spin_u
 	    : "0" (0x00010000) \
 	    : "memory", "cc"); \

-#define __raw_spin_lock_body \
+#define __ticket_spin_lock_body \
 	unsigned int tmp; \
 	asm("shldl $16, %0, %2\n" \
@@ -155,7 +160,7 @@ static __always_inline void __raw_spin_u
 	    : "memory", "cc"); \

-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
@@ -177,7 +182,7 @@ static __always_inline int __raw_spin_tr
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
 {
 	unsigned int token, tmp;
@@ -195,49 +200,161 @@ static __always_inline void __raw_spin_u
 }

-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
 {
 	int tmp = ACCESS_ONCE(lock->slock);

 	return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
 }

-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
 {
 	int tmp = ACCESS_ONCE(lock->slock);

 	return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
 }

-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
 {
 	unsigned int token, count;
 	bool free;

-	__raw_spin_lock_preamble;
+	__ticket_spin_lock_preamble;
 	if (unlikely(!free))
 		token = xen_spin_adjust(lock, token);
 	do {
 		count = 1 << 10;
-		__raw_spin_lock_body;
+		__ticket_spin_lock_body;
 	} while (unlikely(!count) && !xen_spin_wait(lock, token));
 }

-static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
-						  unsigned long flags)
+static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
+						     unsigned long flags)
 {
 	unsigned int token, count;
 	bool free;

-	__raw_spin_lock_preamble;
+	__ticket_spin_lock_preamble;
 	if (unlikely(!free))
 		token = xen_spin_adjust(lock, token);
 	do {
 		count = 1 << 10;
-		__raw_spin_lock_body;
+		__ticket_spin_lock_body;
 	} while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
 }

+#define __raw_spin(n) __ticket_spin_##n
+
+#else /* CONFIG_XEN_COMPAT < 0x030200 */
+/*
+ * Define virtualization-friendly old-style lock byte lock, for use in
+ * pv_lock_ops if desired.
+ *
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure.  It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+struct __byte_spinlock {
+	s8 lock;
+	s8 spinners;
+};
+
+#if (NR_CPUS >= 256)
+#error NR_CPUS >= 256 support not implemented
+#endif
+
+static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
+static inline void xen_spinlock_cleanup(unsigned int cpu) {}
+
+static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
+{
+	struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+	return bl->lock != 0;
+}
+
+static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
+{
+	struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+	return bl->spinners != 0;
+}
+
+static inline void __byte_spin_lock(raw_spinlock_t *lock)
+{
+	struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+	s8 val = 1;
+
+	asm("1: xchgb %1, %0\n"
+	    "   test %1,%1\n"
+	    "   jz 3f\n"
+	    "   " LOCK_PREFIX "incb %2\n"
+	    "2: rep;nop\n"
+	    "   cmpb $1, %0\n"
+	    "   je 2b\n"
+	    "   " LOCK_PREFIX "decb %2\n"
+	    "   jmp 1b\n"
+	    "3:"
+	    : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
+}
+
+#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
+
+static inline int __byte_spin_trylock(raw_spinlock_t *lock)
+{
+	struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+	u8 old = 1;
+
+	asm("xchgb %1,%0"
+	    : "+m" (bl->lock), "+q" (old) : : "memory");
+
+	return old == 0;
+}
+
+static inline void __byte_spin_unlock(raw_spinlock_t *lock)
+{
+	struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+	smp_wmb();
+	bl->lock = 0;
+}
+
+#define __raw_spin(n) __byte_spin_##n
+
+#endif /* CONFIG_XEN_COMPAT */
+
+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+{
+	return __raw_spin(is_locked)(lock);
+}
+
+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+{
+	return __raw_spin(is_contended)(lock);
+}
+
+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+	__raw_spin(lock)(lock);
+}
+
+static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
+						  unsigned long flags)
+{
+	__raw_spin(lock_flags)(lock, flags);
+}
+
+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+	return __raw_spin(trylock)(lock);
+}
+
+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+	__raw_spin(unlock)(lock);
+}

 static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
 {
 	while (__raw_spin_is_locked(lock))
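The lock-byte fallback added above is worth seeing in isolation: xchg to take the lock, plus a one-byte spinners count so is_contended can be answered. Below is a runnable userspace approximation of the same technique using GCC atomic builtins instead of LOCK-prefixed asm; x86 is assumed only for the pause hint, and all names are invented for the example.

#include <pthread.h>
#include <stdio.h>

/* Same layout trick as struct __byte_spinlock: one byte holds the
 * lock, one byte counts waiters. Zero-initialized means unlocked. */
struct byte_spinlock {
	volatile signed char lock;
	volatile signed char spinners;
};

static void byte_spin_lock(struct byte_spinlock *bl)
{
	for (;;) {
		/* xchgb equivalent: swap in 1; old value 0 means we won. */
		if (__atomic_exchange_n(&bl->lock, 1, __ATOMIC_ACQUIRE) == 0)
			return;
		__atomic_fetch_add(&bl->spinners, 1, __ATOMIC_RELAXED);
		while (bl->lock)		/* spin locally, like rep;nop */
			__builtin_ia32_pause();
		__atomic_fetch_sub(&bl->spinners, 1, __ATOMIC_RELAXED);
	}
}

static void byte_spin_unlock(struct byte_spinlock *bl)
{
	/* Mirrors smp_wmb(); bl->lock = 0; in the hunk. */
	__atomic_store_n(&bl->lock, 0, __ATOMIC_RELEASE);
}

static struct byte_spinlock lk;
static long counter;

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++) {
		byte_spin_lock(&lk);
		counter++;
		byte_spin_unlock(&lk);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[4];
	for (int i = 0; i < 4; i++) pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++) pthread_join(t[i], NULL);
	printf("counter = %ld (expect 400000)\n", counter);
	return 0;
}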
--- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/system.h	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/system.h	2009-06-04 10:21:39.000000000 +0200
@@ -137,7 +137,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 #define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
 #define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))

-extern void load_gs_index(unsigned);
+extern void xen_load_gs_index(unsigned);

 /*
  * Load a segment. Fall back on loading the zero
@@ -154,14 +154,14 @@ extern void load_gs_index(unsigned);
 		     "jmp 2b\n" \
 		     ".previous\n" \
 		     _ASM_EXTABLE(1b,3b) \
-		     : :"r" (value), "r" (0))
+		     : :"r" (value), "r" (0) : "memory")

 /*
  * Save a segment register away
  */
 #define savesegment(seg, value) \
-	asm volatile("mov %%" #seg ",%0":"=rm" (value))
+	asm("mov %%" #seg ",%0":"=r" (value) : : "memory")

 static inline unsigned long get_limit(unsigned long segment)
 {
@@ -269,6 +269,7 @@ static inline void xen_wbinvd(void)
 #ifdef CONFIG_X86_64
 #define read_cr8()	(xen_read_cr8())
 #define write_cr8(x)	(xen_write_cr8(x))
+#define load_gs_index	xen_load_gs_index
 #endif

 /* Clear the 'TS' bit */
@@ -287,13 +288,12 @@ static inline void clflush(volatile void
 void disable_hlt(void);
 void enable_hlt(void);

-extern int es7000_plat;
 void cpu_idle_wait(void);

 extern unsigned long arch_align_stack(unsigned long sp);
 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);

-void default_idle(void);
+void xen_idle(void);

 /*
  * Force strict CPU ordering.
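On the savesegment()/loadsegment() hunk above: the new definitions force a register output ("=r" rather than "=rm") and trade volatile for an explicit "memory" clobber, which pins the asm relative to surrounding memory accesses. A minimal userspace probe of the same pattern; x86 GCC is assumed, and segment reads like these are harmless from user mode.

#include <stdio.h>

/* Same shape as the patched savesegment(): register-only output,
 * "memory" clobber so the compiler cannot reorder it across code
 * that might change the segment. */
#define savesegment(seg, value) \
	asm("mov %%" #seg ",%0" : "=r" (value) : : "memory")

int main(void)
{
	unsigned short cs, gs;
	savesegment(cs, cs);
	savesegment(gs, gs);
	printf("cs=%#hx gs=%#hx\n", cs, gs);
	return 0;
}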
--- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/xor_64.h	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/xor_64.h	2009-06-04 10:21:39.000000000 +0200
+#ifndef ASM_X86__XOR_64_H
+#define ASM_X86__XOR_64_H
+
 /*
  * x86-64 changes / gcc fixes from Andi Kleen.
  * Copyright 2002 Andi Kleen, SuSE Labs.
@@ -330,3 +333,5 @@ do { \
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
 #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+
+#endif /* ASM_X86__XOR_64_H */
--- sle11-2009-10-16.orig/include/asm-x86/mach-xen/irq_vectors.h	2009-03-16 16:33:40.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
-/*
- * This file should contain #defines for all of the interrupt vector
- * numbers used by this architecture.
- *
- * In addition, there are some standard defines:
- *
- * FIRST_EXTERNAL_VECTOR:
- *	The first free place for external interrupts
- *
- * SYSCALL_VECTOR:
- *	The IRQ vector a syscall makes the user to kernel transition
- *	under.
- *
- * TIMER_IRQ:
- *	The IRQ number the timer interrupt comes in at.
- *
- * NR_IRQS:
- *	The total number of interrupt vectors (including all the
- *	architecture specific interrupts) needed.
- *
- */
-
-#ifndef _ASM_IRQ_VECTORS_H
-#define _ASM_IRQ_VECTORS_H
-
-/*
- * IDT vectors usable for external interrupt sources start
- * at 0x20:
- */
-#define FIRST_EXTERNAL_VECTOR	0x20
-
-#define SYSCALL_VECTOR		0x80
-
-/*
- * Vectors 0x20-0x2f are used for ISA interrupts.
- */
-
-#if 0
-/*
- * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
- *
- * some of the following vectors are 'rare', they are merged
- * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
- * TLB, reschedule and local APIC vectors are performance-critical.
- *
- * Vectors 0xf0-0xfa are free (reserved for future Linux use).
- */
-#define SPURIOUS_APIC_VECTOR	0xff
-#define ERROR_APIC_VECTOR	0xfe
-#define INVALIDATE_TLB_VECTOR	0xfd
-#define RESCHEDULE_VECTOR	0xfc
-#define CALL_FUNCTION_VECTOR	0xfb
-
-#define THERMAL_APIC_VECTOR	0xf0
-/*
- * Local APIC timer IRQ vector is on a different priority level,
- * to work around the 'lost local interrupt if more than 2 IRQ
- * sources per level' errata.
- */
-#define LOCAL_TIMER_VECTOR	0xef
-#endif
-
-#define SPURIOUS_APIC_VECTOR	0xff
-#define ERROR_APIC_VECTOR	0xfe
-
-/*
- * First APIC vector available to drivers: (vectors 0x30-0xee)
- * we start at 0x31 to spread out vectors evenly between priority
- * levels. (0x80 is the syscall vector)
- */
-#define FIRST_DEVICE_VECTOR	0x31
-#define FIRST_SYSTEM_VECTOR	0xef
-
-/*
- * 16 8259A IRQ's, 208 potential APIC interrupt sources.
- * Right now the APIC is mostly only used for SMP.
- * 256 vectors is an architectural limit. (we can have
- * more than 256 devices theoretically, but they will
- * have to use shared interrupts)
- * Since vectors 0x00-0x1f are used/reserved for the CPU,
- * the usable vector space is 0x20-0xff (224 vectors)
- */
-
-#define RESCHEDULE_VECTOR	0
-#define CALL_FUNCTION_VECTOR	1
-#define SPIN_UNLOCK_VECTOR	2
-
-/*
- * The maximum number of vectors supported by i386 processors
- * is limited to 256. For processors other than i386, NR_VECTORS
- * should be changed accordingly.
- */
-#define NR_VECTORS 256
-
-#define FPU_IRQ			13
-
-#define FIRST_VM86_IRQ		3
-#define LAST_VM86_IRQ		15
-#define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
-
-/*
- * The flat IRQ space is divided into two regions:
- *  1. A one-to-one mapping of real physical IRQs. This space is only used
- *     if we have physical device-access privilege. This region is at the
- *     start of the IRQ space so that existing device drivers do not need
- *     to be modified to translate physical IRQ numbers into our IRQ space.
- *  3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
- *     are bound using the provided bind/unbind functions.
- */
-
-#define PIRQ_BASE		0
-#if !defined(MAX_IO_APICS)
-# define NR_PIRQS		(NR_VECTORS + 32 * NR_CPUS)
-#elif NR_CPUS < MAX_IO_APICS
-# define NR_PIRQS		(NR_VECTORS + 32 * NR_CPUS)
-#else
-# define NR_PIRQS		(NR_VECTORS + 32 * MAX_IO_APICS)
-#endif
-
-#define DYNIRQ_BASE		(PIRQ_BASE + NR_PIRQS)
-#define NR_DYNIRQS		256
-
-#define NR_IRQS			(NR_PIRQS + NR_DYNIRQS)
-#define NR_IRQ_VECTORS		NR_IRQS
-
-#endif /* _ASM_IRQ_VECTORS_H */
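The deleted header's flat-IRQ-space comment is easier to follow with numbers plugged in. A small standalone program using the constants from that header; NR_CPUS is chosen arbitrarily for the example, and the classification logic is illustrative rather than lifted from kernel code.

#include <stdio.h>

#define NR_VECTORS	256
#define NR_CPUS		8	/* assumption for the demo */
#define PIRQ_BASE	0
#define NR_PIRQS	(NR_VECTORS + 32 * NR_CPUS)
#define DYNIRQ_BASE	(PIRQ_BASE + NR_PIRQS)
#define NR_DYNIRQS	256
#define NR_IRQS		(NR_PIRQS + NR_DYNIRQS)

int main(void)
{
	int irq = 520;	/* arbitrary example IRQ number */

	if (irq >= DYNIRQ_BASE)
		printf("irq %d lives in the dynamic (event-channel) range\n", irq);
	else
		printf("irq %d maps one-to-one to a physical irq\n", irq);
	printf("NR_PIRQS=%d DYNIRQ_BASE=%d NR_IRQS=%d\n",
	       NR_PIRQS, DYNIRQ_BASE, NR_IRQS);
	return 0;
}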
--- sle11-2009-10-16.orig/include/asm-x86/mach-xen/setup_arch_post.h	2009-10-28 14:55:02.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
-/**
- * machine_specific_* - Hooks for machine specific setup.
- *
- * Description:
- *	This is included late in kernel/setup.c so that it can make
- *	use of all of the static functions.
- **/
-
-#include <xen/interface/callback.h>
-
-extern void hypervisor_callback(void);
-extern void failsafe_callback(void);
-extern void nmi(void);
-
-static void __init machine_specific_arch_setup(void)
-{
-	int ret;
-	static struct callback_register __initdata event = {
-		.type = CALLBACKTYPE_event,
-		.address = (unsigned long) hypervisor_callback,
-	};
-	static struct callback_register __initdata failsafe = {
-		.type = CALLBACKTYPE_failsafe,
-		.address = (unsigned long)failsafe_callback,
-	};
-	static struct callback_register __initdata syscall = {
-		.type = CALLBACKTYPE_syscall,
-		.address = (unsigned long)system_call,
-	};
-#ifdef CONFIG_X86_LOCAL_APIC
-	static struct callback_register __initdata nmi_cb = {
-		.type = CALLBACKTYPE_nmi,
-		.address = (unsigned long)nmi,
-	};
-#endif
-
-	ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
-	if (ret == 0)
-		ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
-	if (ret == 0)
-		ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
-#if CONFIG_XEN_COMPAT <= 0x030002
-	if (ret == -ENOSYS)
-		ret = HYPERVISOR_set_callbacks(
-			event.address,
-			failsafe.address,
-			syscall.address);
-#endif
-	BUG_ON(ret);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
-#if CONFIG_XEN_COMPAT <= 0x030002
-	if (ret == -ENOSYS) {
-		static struct xennmi_callback __initdata cb = {
-			.handler_address = (unsigned long)nmi
-		};
-
-		HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
-	}
-#endif
-#endif
-}
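The hook being removed is a compact example of the register-then-fall-back idiom used throughout this tree: try the newer CALLBACKOP_register interface first, and only when the hypervisor answers -ENOSYS retry through the pre-3.0.3 interface. A toy standalone rendering of that negotiation; every function here is an invented stub, not a Xen API.

#include <errno.h>
#include <stdio.h>

/* Stand-in for the newer hypercall: pretend we run on an old
 * hypervisor that does not know it. */
static int callback_op_register(const char *what)
{
	(void)what;
	return -ENOSYS;
}

/* Stand-in for the legacy interface. */
static int set_callbacks_legacy(const char *what)
{
	printf("registered %s via legacy interface\n", what);
	return 0;
}

int main(void)
{
	int ret = callback_op_register("event/failsafe/syscall callbacks");
	if (ret == -ENOSYS)	/* the same test the deleted hook used */
		ret = set_callbacks_legacy("event/failsafe/syscall callbacks");
	return ret ? 1 : 0;
}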
--- sle11-2009-10-16.orig/include/asm-x86/mach-xen/setup_arch_pre.h	2009-10-28 14:55:02.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
-/* Hook to call BIOS initialisation function */
-
-#define ARCH_SETUP machine_specific_arch_setup();
-
-static void __init machine_specific_arch_setup(void);
--- sle11-2009-10-16.orig/include/asm-x86/traps.h	2009-10-28 14:55:02.000000000 +0100
+++ sle11-2009-10-16/include/asm-x86/traps.h	2009-06-04 10:21:39.000000000 +0200
@@ -23,6 +23,9 @@ asmlinkage void spurious_interrupt_bug(v
 #ifdef CONFIG_X86_MCE
 asmlinkage void machine_check(void);
 #endif /* CONFIG_X86_MCE */
+#ifdef CONFIG_X86_XEN
+asmlinkage void fixup_4gb_segment(void);
+#endif

 void do_divide_error(struct pt_regs *, long);
 void do_overflow(struct pt_regs *, long);
@@ -48,6 +51,9 @@ void math_error(void __user *);
 void do_coprocessor_error(struct pt_regs *, long);
 void do_simd_coprocessor_error(struct pt_regs *, long);
 void do_spurious_interrupt_bug(struct pt_regs *, long);
+#ifdef CONFIG_X86_XEN
+void do_fixup_4gb_segment(struct pt_regs *, long);
+#endif

 unsigned long patch_espfix_desc(unsigned long, unsigned long);
 asmlinkage void math_emulate(long);
--- sle11-2009-10-16.orig/include/asm-x86/xen/interface_64.h	2009-10-28 14:55:02.000000000 +0100
+++ sle11-2009-10-16/include/asm-x86/xen/interface_64.h	2009-06-04 10:21:39.000000000 +0200
@@ -136,7 +136,7 @@ struct cpu_user_regs {
     uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base.     */
     uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
 };
-DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(cpu_user_regs);
--- sle11-2009-10-16.orig/include/linux/page-flags.h	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/include/linux/page-flags.h	2009-06-04 10:21:39.000000000 +0200
@@ -110,9 +110,11 @@ enum pageflags {
 	/* Filesystems */
 	PG_checked = PG_owner_priv_1,

+#ifdef CONFIG_PARAVIRT_XEN
 	/* XEN */
 	PG_pinned = PG_owner_priv_1,
 	PG_savepinned = PG_dirty,
+#endif

 	/* SLOB */
 	PG_slob_page = PG_active,
@@ -187,8 +189,12 @@ PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU,
 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
 __PAGEFLAG(Slab, slab)
 PAGEFLAG(Checked, checked)		/* Used by some filesystems */
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)	/* Xen */
+#endif
+#ifdef CONFIG_PARAVIRT_XEN
 PAGEFLAG(SavePinned, savepinned);			/* Xen */
+#endif
 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
 PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
 	__SETPAGEFLAG(Private, private)
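For context on the PG_pinned/PG_savepinned lines: Xen does not consume new page-flag bits, it aliases existing ones that are unused for the page types it pins (pagetable pages are never PG_owner_priv_1 in the filesystem sense). A tiny standalone illustration of that aliasing; the enum is abbreviated, not the kernel's full pageflags enum.

#include <stdio.h>

enum pageflags {
	PG_locked,
	PG_dirty,
	PG_active,
	PG_owner_priv_1,
	PG_pinned = PG_owner_priv_1,	/* alias, exactly as in the hunk */
	PG_savepinned = PG_dirty,	/* alias */
};

int main(void)
{
	unsigned long flags = 0;

	flags |= 1UL << PG_pinned;	/* what SetPagePinned() would do */
	printf("pinned=%d owner_priv_1=%d (same bit: %d == %d)\n",
	       !!(flags & (1UL << PG_pinned)),
	       !!(flags & (1UL << PG_owner_priv_1)),
	       PG_pinned, PG_owner_priv_1);
	return 0;
}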
--- sle11-2009-10-16.orig/include/xen/interface/memory.h	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/include/xen/interface/memory.h	2009-06-04 10:21:39.000000000 +0200
@@ -82,6 +82,7 @@ struct xen_memory_reservation {
     unsigned int   address_bits;
     domid_t        domid;
 };
+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_memory_reservation);
 typedef struct xen_memory_reservation xen_memory_reservation_t;
 DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);

@@ -167,11 +168,7 @@ struct xen_machphys_mfn_list {
      * any large discontiguities in the machine address space, 2MB gaps in
      * the machphys table will be represented by an MFN base of zero.
      */
-#ifndef CONFIG_PARAVIRT_XEN
     XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
-#else
-    ulong extent_start;
-#endif

     /*
      * Number of extents written to the above array. This will be smaller
@@ -179,6 +176,7 @@ struct xen_machphys_mfn_list {
      */
     unsigned int nr_extents;
 };
+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
 typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);

@@ -218,6 +216,7 @@ struct xen_add_to_physmap {
     /* GPFN where the source mapping page should appear. */
     xen_pfn_t gpfn;
 };
+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);

@@ -250,21 +249,13 @@ struct xen_translate_gpfn_list {
     xen_ulong_t nr_gpfns;

     /* List of GPFNs to translate. */
-#ifndef CONFIG_PARAVIRT_XEN
     XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
-#else
-    ulong gpfn_list;
-#endif

     /*
      * Output list to contain MFN translations. May be the same as the input
      * list (in which case each input GPFN is overwritten with the output MFN).
      */
-#ifndef CONFIG_PARAVIRT_XEN
     XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
-#else
-    ulong mfn_list;
-#endif
 };
 DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
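DEFINE_XEN_GUEST_HANDLE_STRUCT generates the typed handle used to pass guest pointers through hypercall argument structures, which is why the ad-hoc "ulong" fallbacks above can be deleted. A rough userspace model of the indirection; this is a simplification under stated assumptions, since the real macros also deal with 32-on-64 pointer marshalling.

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-in: wrap a raw pointer in a named struct so the
 * field carries its pointee type. */
#define DEFINE_GUEST_HANDLE_STRUCT(name) \
	typedef struct { struct name *p; } guest_handle_##name

struct xen_add_to_physmap {
	unsigned int domid;
	uint64_t gpfn;
};
DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);

#define set_guest_handle(h, ptr) ((h).p = (ptr))

int main(void)
{
	struct xen_add_to_physmap xatp = { .domid = 0, .gpfn = 0x1000 };
	guest_handle_xen_add_to_physmap h;

	set_guest_handle(h, &xatp);
	printf("gpfn via typed handle: %#llx\n",
	       (unsigned long long)h.p->gpfn);
	return 0;
}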
--- sle11-2009-10-16.orig/kernel/hrtimer.c	2009-10-28 14:55:02.000000000 +0100
+++ sle11-2009-10-16/kernel/hrtimer.c	2009-06-04 10:21:39.000000000 +0200
@@ -1084,7 +1084,7 @@ ktime_t hrtimer_get_remaining(const stru
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);

-#ifdef CONFIG_NO_HZ
+#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
 /**
  * hrtimer_get_next_event - get the time until next expiry event
  *
--- sle11-2009-10-16.orig/kernel/kexec.c	2009-02-17 12:38:20.000000000 +0100
+++ sle11-2009-10-16/kernel/kexec.c	2009-06-04 10:21:39.000000000 +0200
@@ -54,7 +54,7 @@ int dump_after_notifier;
 unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
 u32
 #if defined(CONFIG_XEN) && defined(CONFIG_X86)
-__attribute__((__section__(".bss.page_aligned"), __aligned__(PAGE_SIZE)))
+__page_aligned_bss
 #endif
 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
 size_t vmcoreinfo_size;
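__page_aligned_bss is the 2.6.27 shorthand for exactly the attribute string deleted in this hunk. A compilable userspace illustration of what it does; PAGE_SIZE is hard-coded here as an assumption, since the kernel takes it from <asm/page.h>.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096	/* assumption for the demo; arch-defined in the kernel */

/* What the macro stands for, per the removed line above. */
#define __page_aligned_bss \
	__attribute__((__section__(".bss.page_aligned"), __aligned__(PAGE_SIZE)))

static uint32_t note[1024 / 4] __page_aligned_bss;

int main(void)
{
	printf("note at %p, page aligned: %s\n", (void *)note,
	       ((uintptr_t)note % PAGE_SIZE) == 0 ? "yes" : "no");
	return 0;
}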
--- sle11-2009-10-16.orig/kernel/timer.c	2009-10-28 14:55:02.000000000 +0100
+++ sle11-2009-10-16/kernel/timer.c	2009-06-04 10:21:39.000000000 +0200
@@ -884,7 +884,7 @@ static inline void __run_timers(struct t
 	spin_unlock_irq(&base->lock);
 }

-#ifdef CONFIG_NO_HZ
+#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
 /*
  * Find out when the next timer event is due to happen. This
  * is used on S/390 to stop all activity when a cpus is idle.
--- sle11-2009-10-16.orig/lib/swiotlb-xen.c	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-10-16/lib/swiotlb-xen.c	2009-06-04 10:21:39.000000000 +0200
@@ -750,7 +750,7 @@ swiotlb_sync_sg_for_device(struct device
 }

 int
-swiotlb_dma_mapping_error(dma_addr_t dma_addr)
+swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
 	return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
 }
--- sle11-2009-10-16.orig/mm/mprotect.c	2009-03-04 11:28:34.000000000 +0100
+++ sle11-2009-10-16/mm/mprotect.c	2009-06-04 10:21:39.000000000 +0200
@@ -92,8 +92,6 @@ static inline void change_pmd_range(stru
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
-			continue;
 		change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
 	} while (pmd++, addr = next, addr != end);
 }