1 From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
2 Subject: [PATCH] Linux: Update to 2.6.27
5 This patch contains the differences between Linux 2.6.26 and 2.6.27.
7 Acked-by: Jeff Mahoney <jeffm@suse.com>
8 Automatically created from "patches.kernel.org/patch-2.6.27" by xen-port-patches.py
10 --- sle11-2009-06-04.orig/arch/x86/Kconfig 2009-03-16 16:38:05.000000000 +0100
11 +++ sle11-2009-06-04/arch/x86/Kconfig 2009-06-04 10:21:39.000000000 +0200
12 @@ -594,7 +594,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
14 bool "AMD IOMMU support"
16 - depends on X86_64 && PCI && ACPI
17 + depends on X86_64 && PCI && ACPI && !X86_64_XEN
19 With this option you can enable support for AMD IOMMU hardware in
20 your system. An IOMMU is a hardware component which provides
21 @@ -629,8 +629,10 @@ config MAXSMP
24 int "Maximum number of CPUs (2-4096)"
28 + default "32" if MAXSMP && XEN
29 default "4096" if MAXSMP
30 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
31 default "16" if X86_64_XEN
32 @@ -1227,7 +1229,7 @@ config MTRR
35 prompt "MTRR cleanup support"
37 + depends on MTRR && !XEN
39 Convert MTRR layout from continuous to discrete, so X drivers can
40 add writeback entries.
41 --- sle11-2009-06-04.orig/arch/x86/Kconfig.debug 2009-03-16 16:33:40.000000000 +0100
42 +++ sle11-2009-06-04/arch/x86/Kconfig.debug 2009-06-04 10:21:39.000000000 +0200
43 @@ -25,6 +25,7 @@ config STRICT_DEVMEM
44 config X86_VERBOSE_BOOTUP
45 bool "Enable verbose x86 bootup info messages"
49 Enables the informational output from the decompression stage
50 (e.g. bzImage) of the boot. If you disable this you will still
51 @@ -179,7 +180,7 @@ config MMIOTRACE_HOOKS
54 bool "Memory mapped IO tracing"
55 - depends on DEBUG_KERNEL && PCI
56 + depends on DEBUG_KERNEL && PCI && !XEN
58 select MMIOTRACE_HOOKS
60 --- sle11-2009-06-04.orig/arch/x86/Makefile 2009-02-16 16:18:36.000000000 +0100
61 +++ sle11-2009-06-04/arch/x86/Makefile 2009-06-04 10:21:39.000000000 +0200
62 @@ -116,8 +116,8 @@ mflags-$(CONFIG_X86_VOYAGER) := -Iinclud
63 mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
66 -mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen
67 -mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/
68 +mflags-$(CONFIG_XEN) := -Iinclude/asm-x86/mach-xen
69 +mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
71 # generic subarchitecture
72 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
73 @@ -128,8 +128,10 @@ mcore-$(CONFIG_X86_GENERICARCH) := arch/
74 mflags-y += -Iinclude/asm-x86/mach-default
76 # 64 bit has no subarch support - clear the subarch variables
77 +ifneq ($(CONFIG_XEN),y)
78 fcore-$(CONFIG_X86_64) :=
79 mcore-$(CONFIG_X86_64) :=
82 KBUILD_CFLAGS += $(mflags-y)
83 KBUILD_AFLAGS += $(mflags-y)
84 --- sle11-2009-06-04.orig/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:38:05.000000000 +0100
85 +++ sle11-2009-06-04/arch/x86/ia32/ia32entry-xen.S 2009-06-04 10:21:39.000000000 +0200
87 #include <asm/irqflags.h>
88 #include <linux/linkage.h>
90 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
91 +#include <linux/elf-em.h>
92 +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
93 +#define __AUDIT_ARCH_LE 0x40000000
95 +#ifndef CONFIG_AUDITSYSCALL
96 +#define sysexit_audit int_ret_from_sys_call
97 +#define sysretl_audit int_ret_from_sys_call
100 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
102 .macro IA32_ARG_FIXUP noebp=0
108 + * Reload arg registers from stack in case ptrace changed them.
109 + * We don't reload %eax because syscall_trace_enter() returned
110 + * the value it wants us to use in the table lookup.
112 .macro LOAD_ARGS32 offset
113 movl \offset(%rsp),%r11d
114 movl \offset+8(%rsp),%r10d
116 movl \offset+48(%rsp),%edx
117 movl \offset+56(%rsp),%esi
118 movl \offset+64(%rsp),%edi
119 - movl \offset+72(%rsp),%eax
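
The rule stated in the comment above, restated as a hedged userspace analogy
(names are illustrative, not the kernel's): the argument slots are re-read from
the saved registers that a tracer may have rewritten, but the dispatch uses the
number the trace hook returned, not the saved-%eax slot:

    #include <stdio.h>

    struct regs { long ax, bx, cx, dx; };

    /* Stand-in for syscall_trace_enter(): the tracer may rewrite the
     * saved registers; the hook returns the number to dispatch on. */
    static long trace_enter(struct regs *r)
    {
            r->bx = 99;         /* tracer changed the 1st argument */
            return r->ax;       /* number to use for the table lookup */
    }

    int main(void)
    {
            struct regs r = { .ax = 4, .bx = 1 };
            long nr = trace_enter(&r);
            /* reload args from r (like LOAD_ARGS32), dispatch on nr */
            printf("dispatch nr=%ld with arg1=%ld\n", nr, r.bx);
            return 0;
    }
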
122 .macro CFI_STARTPROC32 simple
127 +#ifdef CONFIG_PARAVIRT
128 +ENTRY(native_usergs_sysret32)
131 +ENDPROC(native_usergs_sysret32)
133 +ENTRY(native_irq_enable_sysexit)
137 +ENDPROC(native_irq_enable_sysexit)
141 * 32bit SYSENTER instruction entry.
143 @@ -98,7 +125,7 @@ ENTRY(ia32_sysenter_target)
145 movl %ebp,%ebp /* zero extension */
147 - movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
148 + movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
149 movl $__USER32_DS,40(%rsp)
151 movl $__USER32_CS,16(%rsp)
152 @@ -113,19 +140,79 @@ ENTRY(ia32_sysenter_target)
155 GET_THREAD_INFO(%r10)
156 - orl $TS_COMPAT,threadinfo_status(%r10)
157 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
158 + orl $TS_COMPAT,TI_status(%r10)
159 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
160 jnz sysenter_tracesys
162 cmpl $(IA32_NR_syscalls-1),%eax
167 call *ia32_sys_call_table(,%rax,8)
168 movq %rax,RAX-ARGOFFSET(%rsp)
169 + GET_THREAD_INFO(%r10)
170 + DISABLE_INTERRUPTS(CLBR_NONE)
172 + testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
174 + jmp int_ret_from_sys_call
176 +#ifdef CONFIG_AUDITSYSCALL
177 + .macro auditsys_entry_common
178 + movl %esi,%r9d /* 6th arg: 4th syscall arg */
179 + movl %edx,%r8d /* 5th arg: 3rd syscall arg */
180 + /* (already in %ecx) 4th arg: 2nd syscall arg */
181 + movl %ebx,%edx /* 3rd arg: 1st syscall arg */
182 + movl %eax,%esi /* 2nd arg: syscall number */
183 + movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
184 + call audit_syscall_entry
185 + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
186 + cmpl $(IA32_NR_syscalls-1),%eax
188 + movl %ebx,%edi /* reload 1st syscall arg */
189 + movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
190 + movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
191 + movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
192 + movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
195 + .macro auditsys_exit exit,ebpsave=RBP
196 + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
197 + jnz int_ret_from_sys_call
199 + ENABLE_INTERRUPTS(CLBR_NONE)
200 + movl %eax,%esi /* second arg, syscall return value */
201 + cmpl $0,%eax /* is it < 0? */
202 + setl %al /* 1 if so, 0 if not */
203 + movzbl %al,%edi /* zero-extend that into %edi */
204 + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
205 + call audit_syscall_exit
206 + GET_THREAD_INFO(%r10)
207 + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
208 + movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
209 + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
210 + DISABLE_INTERRUPTS(CLBR_NONE)
212 + testl %edi,TI_flags(%r10)
214 jmp int_ret_from_sys_call
218 + auditsys_entry_common
219 + movl %ebp,%r9d /* reload 6th syscall arg */
220 + jmp sysenter_dispatch
223 + auditsys_exit sysexit_from_sys_call
228 +#ifdef CONFIG_AUDITSYSCALL
229 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
230 + jz sysenter_auditsys
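
At the C level, the entry macro above lines up AUDIT_ARCH_I386, the syscall
number, and the first four syscall arguments in the C argument registers for
audit_syscall_entry(); the exit macro derives the AUDITSC code from the sign of
the return value before calling audit_syscall_exit(). A small standalone sketch
of that sign computation (the AUDITSC_* values follow the "0->1, 1->2" comment
in auditsys_exit; treat them as assumptions):

    #include <stdio.h>

    enum { AUDITSC_SUCCESS = 1, AUDITSC_FAILURE = 2 };

    /* Branchless success/failure computation, mirroring the
     * cmpl/setl/movzbl/inc sequence in auditsys_exit above. */
    static int audit_result(long ret)
    {
            return (ret < 0) + 1;
    }

    int main(void)
    {
            printf("ret=0  -> %d (AUDITSC_SUCCESS)\n", audit_result(0));
            printf("ret=-1 -> %d (AUDITSC_FAILURE)\n", audit_result(-1));
            return 0;
    }
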
235 @@ -186,18 +273,38 @@ ENTRY(ia32_cstar_target)
238 GET_THREAD_INFO(%r10)
239 - orl $TS_COMPAT,threadinfo_status(%r10)
240 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
241 + orl $TS_COMPAT,TI_status(%r10)
242 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
245 cmpl $IA32_NR_syscalls-1,%eax
249 call *ia32_sys_call_table(,%rax,8)
250 movq %rax,RAX-ARGOFFSET(%rsp)
251 + GET_THREAD_INFO(%r10)
252 + DISABLE_INTERRUPTS(CLBR_NONE)
253 + testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
255 jmp int_ret_from_sys_call
258 +#ifdef CONFIG_AUDITSYSCALL
260 + movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
261 + auditsys_entry_common
262 + movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
266 + auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
270 +#ifdef CONFIG_AUDITSYSCALL
271 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
277 @@ -263,8 +370,8 @@ ENTRY(ia32_syscall)
278 this could be a problem. */
280 GET_THREAD_INFO(%r10)
281 - orl $TS_COMPAT,threadinfo_status(%r10)
282 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
283 + orl $TS_COMPAT,TI_status(%r10)
284 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
287 cmpl $(IA32_NR_syscalls-1),%eax
288 @@ -309,13 +416,11 @@ quiet_ni_syscall:
289 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
290 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
291 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
292 - PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
293 PTREGSCALL stub32_execve, sys32_execve, %rcx
294 PTREGSCALL stub32_fork, sys_fork, %rdi
295 PTREGSCALL stub32_clone, sys32_clone, %rdx
296 PTREGSCALL stub32_vfork, sys_vfork, %rdi
297 PTREGSCALL stub32_iopl, sys_iopl, %rsi
298 - PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
300 ENTRY(ia32_ptregs_common)
302 @@ -415,7 +520,7 @@ ia32_sys_call_table:
304 .quad sys_setreuid16 /* 70 */
306 - .quad stub32_sigsuspend
307 + .quad sys32_sigsuspend
308 .quad compat_sys_sigpending
309 .quad sys_sethostname
310 .quad compat_sys_setrlimit /* 75 */
311 @@ -522,7 +627,7 @@ ia32_sys_call_table:
312 .quad sys32_rt_sigpending
313 .quad compat_sys_rt_sigtimedwait
314 .quad sys32_rt_sigqueueinfo
315 - .quad stub32_rt_sigsuspend
316 + .quad sys_rt_sigsuspend
317 .quad sys32_pread /* 180 */
320 @@ -670,4 +775,10 @@ ia32_sys_call_table:
321 .quad sys32_fallocate
322 .quad compat_sys_timerfd_settime /* 325 */
323 .quad compat_sys_timerfd_gettime
324 + .quad compat_sys_signalfd4
326 + .quad sys_epoll_create1
327 + .quad sys_dup3 /* 330 */
329 + .quad sys_inotify_init1
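
These table entries wire up syscalls new in 2.6.27 (signalfd4, epoll_create1,
dup3, inotify_init1): existing calls gaining a flags argument, typically for
O_CLOEXEC. A hedged userspace sketch of dup3 via syscall(2), assuming a libc
without a wrapper:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
            /* dup3 = dup2 plus flags; O_CLOEXEC closes the fd on execve. */
            long fd = syscall(SYS_dup3, STDOUT_FILENO, 10, O_CLOEXEC);

            if (fd < 0)
                    perror("dup3");
            else
                    printf("stdout duplicated to fd %ld with O_CLOEXEC\n", fd);
            return 0;
    }
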
331 --- sle11-2009-06-04.orig/arch/x86/kernel/Makefile 2009-03-16 16:38:05.000000000 +0100
332 +++ sle11-2009-06-04/arch/x86/kernel/Makefile 2009-06-04 10:21:39.000000000 +0200
333 @@ -120,9 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
335 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
337 - obj-$(CONFIG_XEN) += nmi_64.o
338 + obj-$(CONFIG_XEN) += nmi.o
339 time_64-$(CONFIG_XEN) += time_32.o
342 -disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
343 - pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
344 +disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \
345 + i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \
346 + tlb_$(BITS).o tsc.o tsc_sync.o vsmp_64.o
347 --- sle11-2009-06-04.orig/arch/x86/kernel/acpi/boot.c 2009-03-16 16:38:05.000000000 +0100
348 +++ sle11-2009-06-04/arch/x86/kernel/acpi/boot.c 2009-06-04 10:21:39.000000000 +0200
349 @@ -951,7 +951,9 @@ void __init mp_register_ioapic(int id, u
350 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
351 mp_ioapics[idx].mp_apicaddr = address;
354 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
356 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
358 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
359 @@ -1108,7 +1110,7 @@ int mp_register_gsi(u32 gsi, int trigger
363 -#ifdef CONFIG_X86_32
364 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
365 #define MAX_GSI_NUM 4096
366 #define IRQ_COMPRESSION_START 64
368 @@ -1156,7 +1158,7 @@ int mp_register_gsi(u32 gsi, int trigger
369 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
370 pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
371 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
372 -#ifdef CONFIG_X86_32
373 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
374 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
377 @@ -1164,7 +1166,7 @@ int mp_register_gsi(u32 gsi, int trigger
380 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
381 -#ifdef CONFIG_X86_32
382 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
384 * For GSI >= 64, use IRQ compression
386 --- sle11-2009-06-04.orig/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:38:05.000000000 +0100
387 +++ sle11-2009-06-04/arch/x86/kernel/acpi/sleep-xen.c 2009-06-04 10:21:39.000000000 +0200
389 #include <linux/bootmem.h>
390 #include <linux/dmi.h>
391 #include <linux/cpumask.h>
392 +#include <asm/segment.h>
394 #include "realmode/wakeup.h"
396 @@ -20,7 +21,7 @@ unsigned long acpi_realmode_flags;
397 /* address in low memory of the wakeup routine. */
398 static unsigned long acpi_realmode;
401 +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
402 static char temp_stack[10240];
405 @@ -54,18 +55,27 @@ int acpi_save_state_mem(void)
406 header->video_mode = saved_video_mode;
408 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
411 + * Set up the wakeup GDT. We set these up as Big Real Mode,
412 + * that is, with limits set to 4 GB. At least the Lenovo
413 + * Thinkpad X61 is known to need this for the video BIOS
414 + * initialization quirk to work; this is likely to also
415 + * be the case for other laptops or integrated video devices.
418 /* GDT[0]: GDT self-pointer */
419 header->wakeup_gdt[0] =
420 (u64)(sizeof(header->wakeup_gdt) - 1) +
421 ((u64)(acpi_wakeup_address +
422 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
424 - /* GDT[1]: real-mode-like code segment */
425 - header->wakeup_gdt[1] = (0x009bULL << 40) +
426 - ((u64)acpi_wakeup_address << 16) + 0xffff;
427 - /* GDT[2]: real-mode-like data segment */
428 - header->wakeup_gdt[2] = (0x0093ULL << 40) +
429 - ((u64)acpi_wakeup_address << 16) + 0xffff;
430 + /* GDT[1]: big real mode-like code segment */
431 + header->wakeup_gdt[1] =
432 + GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
433 + /* GDT[2]: big real mode-like data segment */
434 + header->wakeup_gdt[2] =
435 + GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
438 store_gdt((struct desc_ptr *)&header->pmode_gdt);
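
The GDT_ENTRY() helper used above (from <asm/segment.h>) packs flags, base and
limit into the descriptor's scattered bit fields. A userspace reproduction; the
macro body mirrors the kernel's definition as best I can tell, and the base
address is a made-up example:

    #include <stdio.h>
    #include <stdint.h>

    /* Descriptor packing in the style of the kernel's GDT_ENTRY(). */
    #define GDT_ENTRY(flags, base, limit)                   \
            ((((base)  & 0xff000000ULL) << (56 - 24)) |     \
             (((flags) & 0x0000f0ffULL) << 40) |            \
             (((limit) & 0x000f0000ULL) << (48 - 16)) |     \
             (((base)  & 0x00ffffffULL) << 16) |            \
              ((limit) & 0x0000ffffULL))

    int main(void)
    {
            /* flags 0x809b: 4K granularity, present, code segment;
             * limit 0xfffff pages = 4 GB -- the "big real mode" segment.
             * 0x9f000 is a hypothetical wakeup address. */
            uint64_t code = GDT_ENTRY(0x809b, 0x9f000, 0xfffff);

            printf("GDT[1] = %#018llx\n", (unsigned long long)code);
            return 0;
    }
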
439 @@ -79,7 +89,7 @@ int acpi_save_state_mem(void)
440 #endif /* !CONFIG_64BIT */
442 header->pmode_cr0 = read_cr0();
443 - header->pmode_cr4 = read_cr4();
444 + header->pmode_cr4 = read_cr4_safe();
445 header->realmode_flags = acpi_realmode_flags;
446 header->real_magic = 0x12345678;
448 @@ -89,7 +99,9 @@ int acpi_save_state_mem(void)
449 saved_magic = 0x12345678;
450 #else /* CONFIG_64BIT */
451 header->trampoline_segment = setup_trampoline() >> 4;
452 - init_rsp = (unsigned long)temp_stack + 4096;
454 + stack_start.sp = temp_stack + 4096;
456 initial_code = (unsigned long)wakeup_long64;
457 saved_magic = 0x123456789abcdef0;
458 #endif /* CONFIG_64BIT */
459 @@ -145,6 +157,12 @@ static int __init acpi_sleep_setup(char
460 acpi_realmode_flags |= 2;
461 if (strncmp(str, "s3_beep", 7) == 0)
462 acpi_realmode_flags |= 4;
463 +#ifdef CONFIG_HIBERNATION
464 + if (strncmp(str, "s4_nohwsig", 10) == 0)
465 + acpi_no_s4_hw_signature();
467 + if (strncmp(str, "old_ordering", 12) == 0)
468 + acpi_old_suspend_ordering();
469 str = strchr(str, ',');
471 str += strspn(str, ", \t");
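
These strings are consumed from the acpi_sleep= boot parameter, so with this
change a command line can, for example, skip the S4 hardware signature check
and use the pre-2.6.27 suspend ordering (illustrative combination):

    acpi_sleep=s3_bios,s4_nohwsig,old_ordering
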
472 --- sle11-2009-06-04.orig/arch/x86/kernel/apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
473 +++ sle11-2009-06-04/arch/x86/kernel/apic_32-xen.c 2009-06-04 10:21:39.000000000 +0200
474 @@ -59,7 +59,10 @@ static cpumask_t timer_bcast_ipi;
476 * Debug level, exported for io_apic.c
479 +unsigned int apic_verbosity;
481 +/* Have we found an MP table */
482 +int smp_found_config;
485 static int modern_apic(void)
486 --- sle11-2009-06-04.orig/arch/x86/kernel/apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
487 +++ sle11-2009-06-04/arch/x86/kernel/apic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
488 @@ -39,7 +39,10 @@ int disable_apic;
490 * Debug level, exported for io_apic.c
493 +unsigned int apic_verbosity;
495 +/* Have we found an MP table */
496 +int smp_found_config;
499 * The guts of the apic timer interrupt
500 --- sle11-2009-06-04.orig/arch/x86/kernel/asm-offsets_64.c 2008-11-25 12:35:54.000000000 +0100
501 +++ sle11-2009-06-04/arch/x86/kernel/asm-offsets_64.c 2009-06-04 10:21:39.000000000 +0200
502 @@ -138,7 +138,7 @@ int main(void)
505 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
507 +#ifdef CONFIG_PARAVIRT_XEN
509 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
510 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
511 --- sle11-2009-06-04.orig/arch/x86/kernel/cpu/amd_64.c 2009-06-04 11:08:07.000000000 +0200
512 +++ sle11-2009-06-04/arch/x86/kernel/cpu/amd_64.c 2009-06-04 10:21:39.000000000 +0200
513 @@ -193,6 +193,7 @@ static void __cpuinit init_amd(struct cp
514 fam10h_check_enable_mmcfg();
518 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
519 unsigned long long tseg;
521 @@ -211,6 +212,7 @@ static void __cpuinit init_amd(struct cp
522 set_memory_4k((unsigned long)__va(tseg), 1);
528 static struct cpu_dev amd_cpu_dev __cpuinitdata = {
529 --- sle11-2009-06-04.orig/arch/x86/kernel/cpu/bugs_64.c 2009-06-04 11:08:07.000000000 +0200
530 +++ sle11-2009-06-04/arch/x86/kernel/cpu/bugs_64.c 2009-06-04 10:21:39.000000000 +0200
531 @@ -20,6 +20,7 @@ void __init check_bugs(void)
533 alternative_instructions();
537 * Make sure the first 2MB area is not mapped by huge pages
538 * There are typically fixed size MTRRs in there and overlapping
539 @@ -30,4 +31,5 @@ void __init check_bugs(void)
542 set_memory_4k((unsigned long)__va(0), 1);
545 --- sle11-2009-06-04.orig/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:38:05.000000000 +0100
546 +++ sle11-2009-06-04/arch/x86/kernel/cpu/common-xen.c 2009-06-04 10:21:39.000000000 +0200
548 #include <asm/mtrr.h>
551 +#include <asm/asm.h>
552 #ifdef CONFIG_X86_LOCAL_APIC
553 #include <asm/mpspec.h>
554 #include <asm/apic.h>
555 @@ -341,11 +342,24 @@ static void __init early_cpu_detect(void
557 get_cpu_vendor(c, 1);
561 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
562 cpu_devs[c->x86_vendor]->c_early_init)
563 cpu_devs[c->x86_vendor]->c_early_init(c);
568 + * The NOPL instruction is supposed to exist on all CPUs with
569 + * family >= 6; unfortunately, that's not true in practice because
570 + * of early VIA chips and (more importantly) broken virtualizers that
571 + * are not easy to detect. In the latter case it doesn't even *fail*
572 + * reliably, so probing for it doesn't even work. Disable it completely
573 + * unless we can find a reliable way to detect all the broken cases.
575 +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
577 + clear_cpu_cap(c, X86_FEATURE_NOPL);
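
On 32-bit the feature is simply cleared, as above; the 64-bit side of this
patch (see common_64-xen.c below) probes the instruction with an exception
fixup instead. A userspace analogue of that probe, catching SIGILL rather than
using the kernel's exception table:

    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>

    static sigjmp_buf env;

    static void on_sigill(int sig)
    {
            (void)sig;
            siglongjmp(env, 1);
    }

    int main(void)
    {
            signal(SIGILL, on_sigill);
            if (sigsetjmp(env, 1) == 0) {
                    __asm__ volatile(".byte 0x0f,0x1f,0xc0"); /* nopl %eax */
                    puts("NOPL executed fine");
            } else {
                    puts("NOPL faulted (no X86_FEATURE_NOPL)");
            }
            return 0;
    }
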
580 static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
581 @@ -402,8 +416,8 @@ static void __cpuinit generic_identify(s
584 init_scattered_cpuid_features(c);
590 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
591 @@ -434,7 +448,7 @@ __setup("serialnumber", x86_serial_nr_se
593 * This does the hard work of actually picking apart the CPU stuff...
595 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
596 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
600 @@ -448,6 +462,8 @@ void __cpuinit identify_cpu(struct cpuin
601 c->x86_max_cores = 1;
602 c->x86_clflush_size = 32;
603 memset(&c->x86_capability, 0, sizeof c->x86_capability);
604 + if (boot_cpu_has(X86_FEATURE_SYSCALL32))
605 + set_cpu_cap(c, X86_FEATURE_SYSCALL32);
607 if (!have_cpuid_p()) {
609 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
610 +++ sle11-2009-06-04/arch/x86/kernel/cpu/common_64-xen.c 2009-06-04 10:21:39.000000000 +0200
612 +#include <linux/init.h>
613 +#include <linux/kernel.h>
614 +#include <linux/sched.h>
615 +#include <linux/string.h>
616 +#include <linux/bootmem.h>
617 +#include <linux/bitops.h>
618 +#include <linux/module.h>
619 +#include <linux/kgdb.h>
620 +#include <linux/topology.h>
621 +#include <linux/delay.h>
622 +#include <linux/smp.h>
623 +#include <linux/percpu.h>
624 +#include <asm/i387.h>
625 +#include <asm/msr.h>
627 +#include <asm/linkage.h>
628 +#include <asm/mmu_context.h>
629 +#include <asm/mtrr.h>
630 +#include <asm/mce.h>
631 +#include <asm/pat.h>
632 +#include <asm/asm.h>
633 +#include <asm/numa.h>
634 +#ifdef CONFIG_X86_LOCAL_APIC
635 +#include <asm/mpspec.h>
636 +#include <asm/apic.h>
637 +#include <mach_apic.h>
638 +#elif defined(CONFIG_XEN)
639 +#include <mach_apic.h>
641 +#include <asm/pda.h>
642 +#include <asm/pgtable.h>
643 +#include <asm/processor.h>
644 +#include <asm/desc.h>
645 +#include <asm/atomic.h>
646 +#include <asm/proto.h>
647 +#include <asm/sections.h>
648 +#include <asm/setup.h>
649 +#include <asm/genapic.h>
653 +/* We need valid kernel segments for data and code in long mode too
654 + * IRET will check the segment types kkeil 2000/10/28
655 + * Also sysret mandates a special GDT layout
657 +/* The TLS descriptors are currently at a different place compared to i386.
658 + Hopefully nobody expects them at a fixed place (Wine?) */
659 +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
660 + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
661 + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
662 + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
663 + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
664 + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
665 + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
667 +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
669 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
671 +/* Current gdt points %fs at the "master" per-cpu area: after this,
672 + * it's on the real one. */
673 +void switch_to_new_gdt(void)
676 + struct desc_ptr gdt_descr;
678 + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
679 + gdt_descr.size = GDT_SIZE - 1;
680 + load_gdt(&gdt_descr);
682 + void *va, *gdt_addr = get_cpu_gdt_table(smp_processor_id());
683 + unsigned long frames[16];
684 + unsigned int f = 0;
686 + for (va = gdt_addr; va < gdt_addr + GDT_SIZE; va += PAGE_SIZE) {
687 + frames[f++] = virt_to_mfn(va);
688 + make_page_readonly(va, XENFEAT_writable_descriptor_tables);
690 + if (HYPERVISOR_set_gdt(frames, GDT_SIZE / sizeof(struct desc_struct)))
695 +struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
697 +static void __cpuinit default_init(struct cpuinfo_x86 *c)
699 + display_cacheinfo(c);
702 +static struct cpu_dev __cpuinitdata default_cpu = {
703 + .c_init = default_init,
704 + .c_vendor = "Unknown",
706 +static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
708 +int __cpuinit get_model_name(struct cpuinfo_x86 *c)
712 + if (c->extended_cpuid_level < 0x80000004)
715 + v = (unsigned int *) c->x86_model_id;
716 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
717 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
718 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
719 + c->x86_model_id[48] = 0;
724 +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
726 + unsigned int n, dummy, ebx, ecx, edx;
728 + n = c->extended_cpuid_level;
730 + if (n >= 0x80000005) {
731 + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
732 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
733 + "D cache %dK (%d bytes/line)\n",
734 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
735 + c->x86_cache_size = (ecx>>24) + (edx>>24);
736 + /* On K8 L1 TLB is inclusive, so don't count it */
737 + c->x86_tlbsize = 0;
740 + if (n >= 0x80000006) {
741 + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
742 + ecx = cpuid_ecx(0x80000006);
743 + c->x86_cache_size = ecx >> 16;
744 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
746 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
747 + c->x86_cache_size, ecx & 0xFF);
751 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
754 + u32 eax, ebx, ecx, edx;
755 + int index_msb, core_bits;
757 + cpuid(1, &eax, &ebx, &ecx, &edx);
760 + if (!cpu_has(c, X86_FEATURE_HT))
762 + if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
765 + smp_num_siblings = (ebx & 0xff0000) >> 16;
767 + if (smp_num_siblings == 1) {
768 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
769 + } else if (smp_num_siblings > 1) {
771 + if (smp_num_siblings > NR_CPUS) {
772 + printk(KERN_WARNING "CPU: Unsupported number of "
773 + "siblings %d", smp_num_siblings);
774 + smp_num_siblings = 1;
778 + index_msb = get_count_order(smp_num_siblings);
779 + c->phys_proc_id = phys_pkg_id(index_msb);
781 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
783 + index_msb = get_count_order(smp_num_siblings);
785 + core_bits = get_count_order(c->x86_max_cores);
787 + c->cpu_core_id = phys_pkg_id(index_msb) &
788 + ((1 << core_bits) - 1);
791 + if ((c->x86_max_cores * smp_num_siblings) > 1) {
792 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
794 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
801 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
803 + char *v = c->x86_vendor_id;
805 + static int printed;
807 + for (i = 0; i < X86_VENDOR_NUM; i++) {
809 + if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
810 + (cpu_devs[i]->c_ident[1] &&
811 + !strcmp(v, cpu_devs[i]->c_ident[1]))) {
813 + this_cpu = cpu_devs[i];
820 + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
821 + printk(KERN_ERR "CPU: Your system may be unstable.\n");
823 + c->x86_vendor = X86_VENDOR_UNKNOWN;
826 +static void __init early_cpu_support_print(void)
829 + struct cpu_dev *cpu_devx;
831 + printk("KERNEL supported cpus:\n");
832 + for (i = 0; i < X86_VENDOR_NUM; i++) {
833 + cpu_devx = cpu_devs[i];
836 + for (j = 0; j < 2; j++) {
837 + if (!cpu_devx->c_ident[j])
839 + printk(" %s %s\n", cpu_devx->c_vendor,
840 + cpu_devx->c_ident[j]);
846 + * The NOPL instruction is supposed to exist on all CPUs with
847 + * family >= 6; unfortunately, that's not true in practice because
848 + * of early VIA chips and (more importantly) broken virtualizers that
849 + * are not easy to detect. Hence, probe for it based on first
850 + * principles.
851 + *
852 + * Note: no 64-bit chip is known to lack these, but put the code here
853 + * for consistency with 32 bits, and to make it utterly trivial to
854 + * diagnose the problem should it ever surface.
856 +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
858 + const u32 nopl_signature = 0x888c53b1; /* Random number */
859 + u32 has_nopl = nopl_signature;
861 + clear_cpu_cap(c, X86_FEATURE_NOPL);
864 + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
866 + " .section .fixup,\"ax\"\n"
870 + _ASM_EXTABLE(1b,3b)
871 + : "+a" (has_nopl));
873 + if (has_nopl == nopl_signature)
874 + set_cpu_cap(c, X86_FEATURE_NOPL);
878 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
880 +void __init early_cpu_init(void)
882 + struct cpu_vendor_dev *cvdev;
884 + for (cvdev = __x86cpuvendor_start ;
885 + cvdev < __x86cpuvendor_end ;
887 + cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
888 + early_cpu_support_print();
889 + early_identify_cpu(&boot_cpu_data);
892 +/* Do some early cpuid on the boot CPU to get some parameters that are
893 + needed before check_bugs. Everything advanced is in identify_cpu
895 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
899 + c->loops_per_jiffy = loops_per_jiffy;
900 + c->x86_cache_size = -1;
901 + c->x86_vendor = X86_VENDOR_UNKNOWN;
902 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
903 + c->x86_vendor_id[0] = '\0'; /* Unset */
904 + c->x86_model_id[0] = '\0'; /* Unset */
905 + c->x86_clflush_size = 64;
906 + c->x86_cache_alignment = c->x86_clflush_size;
907 + c->x86_max_cores = 1;
908 + c->x86_coreid_bits = 0;
909 + c->extended_cpuid_level = 0;
910 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
912 + /* Get vendor name */
913 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
914 + (unsigned int *)&c->x86_vendor_id[0],
915 + (unsigned int *)&c->x86_vendor_id[8],
916 + (unsigned int *)&c->x86_vendor_id[4]);
920 + /* Initialize the standard set of capabilities */
921 + /* Note that the vendor-specific code below might override */
923 + /* Intel-defined flags: level 0x00000001 */
924 + if (c->cpuid_level >= 0x00000001) {
926 + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
927 + &c->x86_capability[0]);
928 + c->x86 = (tfms >> 8) & 0xf;
929 + c->x86_model = (tfms >> 4) & 0xf;
930 + c->x86_mask = tfms & 0xf;
932 + c->x86 += (tfms >> 20) & 0xff;
934 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
935 + if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
936 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
938 + /* Have CPUID level 0 only - unheard of */
942 + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
944 + c->phys_proc_id = c->initial_apicid;
946 + /* AMD-defined flags: level 0x80000001 */
947 + xlvl = cpuid_eax(0x80000000);
948 + c->extended_cpuid_level = xlvl;
949 + if ((xlvl & 0xffff0000) == 0x80000000) {
950 + if (xlvl >= 0x80000001) {
951 + c->x86_capability[1] = cpuid_edx(0x80000001);
952 + c->x86_capability[6] = cpuid_ecx(0x80000001);
954 + if (xlvl >= 0x80000004)
955 + get_model_name(c); /* Default name */
958 + /* Transmeta-defined flags: level 0x80860001 */
959 + xlvl = cpuid_eax(0x80860000);
960 + if ((xlvl & 0xffff0000) == 0x80860000) {
961 + /* Don't set x86_cpuid_level here for now to not confuse. */
962 + if (xlvl >= 0x80860001)
963 + c->x86_capability[2] = cpuid_edx(0x80860001);
966 + if (c->extended_cpuid_level >= 0x80000007)
967 + c->x86_power = cpuid_edx(0x80000007);
969 + if (c->extended_cpuid_level >= 0x80000008) {
970 + u32 eax = cpuid_eax(0x80000008);
972 + c->x86_virt_bits = (eax >> 8) & 0xff;
973 + c->x86_phys_bits = eax & 0xff;
978 + if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
979 + cpu_devs[c->x86_vendor]->c_early_init)
980 + cpu_devs[c->x86_vendor]->c_early_init(c);
982 + validate_pat_support(c);
986 + * This does the hard work of actually picking apart the CPU stuff...
988 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
992 + early_identify_cpu(c);
994 + init_scattered_cpuid_features(c);
996 + c->apicid = phys_pkg_id(0);
999 + * Vendor-specific initialization. In this section we
1000 + * canonicalize the feature flags, meaning if there are
1001 + * features a certain CPU supports which CPUID doesn't
1002 + * tell us, CPUID claiming incorrect flags, or other bugs,
1003 + * we handle them here.
1005 + * At the end of this section, c->x86_capability better
1006 + * indicate the features this CPU genuinely supports!
1008 + if (this_cpu->c_init)
1009 + this_cpu->c_init(c);
1014 + * On SMP, boot_cpu_data holds the common feature set between
1015 + * all CPUs; so make sure that we indicate which features are
1016 + * common between the CPUs. The first time this routine gets
1017 + * executed, c == &boot_cpu_data.
1019 + if (c != &boot_cpu_data) {
1020 + /* AND the already accumulated flags with these */
1021 + for (i = 0; i < NCAPINTS; i++)
1022 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1025 + /* Clear all flags overridden by options */
1026 + for (i = 0; i < NCAPINTS; i++)
1027 + c->x86_capability[i] &= ~cleared_cpu_caps[i];
1029 +#ifdef CONFIG_X86_MCE
1032 + select_idle_routine(c);
1035 + numa_add_cpu(smp_processor_id());
1040 +void __cpuinit identify_boot_cpu(void)
1042 + identify_cpu(&boot_cpu_data);
1045 +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1047 + BUG_ON(c == &boot_cpu_data);
1052 +static __init int setup_noclflush(char *arg)
1054 + setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1057 +__setup("noclflush", setup_noclflush);
1059 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1061 + if (c->x86_model_id[0])
1062 + printk(KERN_CONT "%s", c->x86_model_id);
1064 + if (c->x86_mask || c->cpuid_level >= 0)
1065 + printk(KERN_CONT " stepping %02x\n", c->x86_mask);
1067 + printk(KERN_CONT "\n");
1070 +static __init int setup_disablecpuid(char *arg)
1073 + if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1074 + setup_clear_cpu_cap(bit);
1079 +__setup("clearcpuid=", setup_disablecpuid);
1081 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1083 +struct x8664_pda **_cpu_pda __read_mostly;
1084 +EXPORT_SYMBOL(_cpu_pda);
1086 +#ifndef CONFIG_X86_NO_IDT
1087 +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
1090 +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
1092 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
1093 +EXPORT_SYMBOL_GPL(__supported_pte_mask);
1095 +static int do_not_nx __cpuinitdata;
1098 +Control non executable mappings for 64bit processes.
1103 +static int __init nonx_setup(char *str)
1107 + if (!strncmp(str, "on", 2)) {
1108 + __supported_pte_mask |= _PAGE_NX;
1110 + } else if (!strncmp(str, "off", 3)) {
1112 + __supported_pte_mask &= ~_PAGE_NX;
1116 +early_param("noexec", nonx_setup);
1118 +int force_personality32;
1121 +Control non executable heap for 32bit processes.
1122 +To control the stack too use noexec=off
1124 +on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
1125 +off PROT_READ implies PROT_EXEC
1127 +static int __init nonx32_setup(char *str)
1129 + if (!strcmp(str, "on"))
1130 + force_personality32 &= ~READ_IMPLIES_EXEC;
1131 + else if (!strcmp(str, "off"))
1132 + force_personality32 |= READ_IMPLIES_EXEC;
1135 +__setup("noexec32=", nonx32_setup);
1137 +static void __init_refok switch_pt(int cpu)
1142 + xen_pt_switch(__pa_symbol(init_level4_pgt));
1143 + xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
1147 +void pda_init(int cpu)
1149 + struct x8664_pda *pda = cpu_pda(cpu);
1151 + /* Set up data that may be needed in __get_free_pages early */
1152 + loadsegment(fs, 0);
1153 + loadsegment(gs, 0);
1155 + /* Memory clobbers used to order PDA accesses */
1157 + wrmsrl(MSR_GS_BASE, pda);
1160 + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
1161 + (unsigned long)pda))
1165 + pda->cpunumber = cpu;
1166 + pda->irqcount = -1;
1167 + pda->kernelstack = (unsigned long)stack_thread_info() -
1168 + PDA_STACKOFFSET + THREAD_SIZE;
1169 + pda->active_mm = &init_mm;
1170 + pda->mmu_state = 0;
1173 + /* others are initialized in smpboot.c */
1174 + pda->pcurrent = &init_task;
1175 + pda->irqstackptr = boot_cpu_stack;
1176 + pda->irqstackptr += IRQSTACKSIZE - 64;
1178 + if (!pda->irqstackptr) {
1179 + pda->irqstackptr = (char *)
1180 + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
1181 + if (!pda->irqstackptr)
1182 + panic("cannot allocate irqstack for cpu %d",
1184 + pda->irqstackptr += IRQSTACKSIZE - 64;
1187 + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
1188 + pda->nodenumber = cpu_to_node(cpu);
1194 +#ifndef CONFIG_X86_NO_TSS
1195 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
1196 + DEBUG_STKSZ] __page_aligned_bss;
1199 +extern asmlinkage void ignore_sysret(void);
1201 +void __cpuinit syscall_init(void)
1205 + * LSTAR and STAR live in a somewhat strange symbiosis.
1206 + * They both write to the same internal register. STAR allows
1207 + * setting CS/DS, but only a 32-bit target; LSTAR sets the 64-bit rip.
1209 + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
1210 + wrmsrl(MSR_LSTAR, system_call);
1211 + wrmsrl(MSR_CSTAR, ignore_sysret);
1213 + /* Flags to clear on syscall */
1214 + wrmsrl(MSR_SYSCALL_MASK,
1215 + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1217 +#ifdef CONFIG_IA32_EMULATION
1218 + syscall32_cpu_init();
1220 + static const struct callback_register __cpuinitconst cstar = {
1221 + .type = CALLBACKTYPE_syscall32,
1222 + .address = (unsigned long)ignore_sysret
1225 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
1226 + printk(KERN_WARNING "Unable to register CSTAR callback\n");
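
The STAR/LSTAR comment above, as arithmetic: SYSCALL takes its kernel CS from
STAR[47:32] and SYSRET its 32-bit user CS from STAR[63:48], while LSTAR holds
the full 64-bit entry rip. A sketch with the conventional x86-64 selector
values (an assumption; check <asm/segment.h> for the real __KERNEL_CS and
__USER32_CS):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t kernel_cs = 0x10;   /* assumed __KERNEL_CS */
            uint64_t user32_cs = 0x23;   /* assumed __USER32_CS */
            uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

            printf("MSR_STAR = %#018llx\n", (unsigned long long)star);
            return 0;
    }
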
1230 +void __cpuinit check_efer(void)
1232 + unsigned long efer;
1234 + rdmsrl(MSR_EFER, efer);
1235 + if (!(efer & EFER_NX) || do_not_nx)
1236 + __supported_pte_mask &= ~_PAGE_NX;
1239 +unsigned long kernel_eflags;
1241 +#ifndef CONFIG_X86_NO_TSS
1243 + * Copies of the original ist values from the tss are only accessed during
1244 + * debugging, no special alignment required.
1246 +DEFINE_PER_CPU(struct orig_ist, orig_ist);
1250 + * cpu_init() initializes state that is per-CPU. Some data is already
1251 + * initialized (naturally) in the bootstrap process, such as the GDT
1252 + * and IDT. We reload them nevertheless, this function acts as a
1253 + * 'CPU state barrier', nothing should get across.
1254 + * A lot of state is already set up in PDA init.
1256 +void __cpuinit cpu_init(void)
1258 + int cpu = stack_smp_processor_id();
1259 +#ifndef CONFIG_X86_NO_TSS
1260 + struct tss_struct *t = &per_cpu(init_tss, cpu);
1261 + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
1263 + char *estacks = NULL;
1266 + struct task_struct *me;
1268 + /* CPU 0 is initialised in head64.c */
1271 +#ifndef CONFIG_X86_NO_TSS
1273 + estacks = boot_exception_stacks;
1278 + if (cpu_test_and_set(cpu, cpu_initialized))
1279 + panic("CPU#%d already initialized!\n", cpu);
1281 + printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1283 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1286 + * Initialize the per-CPU GDT with the boot GDT,
1287 + * and set up the GDT descriptor:
1290 + switch_to_new_gdt();
1291 +#ifndef CONFIG_X86_NO_IDT
1292 + load_idt((const struct desc_ptr *)&idt_descr);
1295 + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
1298 + wrmsrl(MSR_FS_BASE, 0);
1299 + wrmsrl(MSR_KERNEL_GS_BASE, 0);
1304 +#ifndef CONFIG_X86_NO_TSS
1306 + * set up and load the per-CPU TSS
1308 + if (!orig_ist->ist[0]) {
1309 + static const unsigned int order[N_EXCEPTION_STACKS] = {
1310 + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1311 + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1313 + for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1315 + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1317 + panic("Cannot allocate exception "
1318 + "stack %ld %d\n", v, cpu);
1320 + estacks += PAGE_SIZE << order[v];
1321 + orig_ist->ist[v] = t->x86_tss.ist[v] =
1322 + (unsigned long)estacks;
1326 + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1328 + * <= is required because the CPU will access up to
1329 + * 8 bits beyond the end of the IO permission bitmap.
1331 + for (i = 0; i <= IO_BITMAP_LONGS; i++)
1332 + t->io_bitmap[i] = ~0UL;
1335 + atomic_inc(&init_mm.mm_count);
1336 + me->active_mm = &init_mm;
1339 + enter_lazy_tlb(&init_mm, me);
1341 + load_sp0(t, &current->thread);
1342 +#ifndef CONFIG_X86_NO_TSS
1343 + set_tss_desc(cpu, t);
1346 + load_LDT(&init_mm.context);
1350 + * If the kgdb is connected no debug regs should be altered. This
1351 + * is only applicable when KGDB and a KGDB I/O module are built
1352 + * into the kernel and you are using early debugging with
1353 + * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1355 + if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1356 + arch_kgdb_ops.correct_hw_break();
1360 + * Clear all 6 debug registers:
1363 + set_debugreg(0UL, 0);
1364 + set_debugreg(0UL, 1);
1365 + set_debugreg(0UL, 2);
1366 + set_debugreg(0UL, 3);
1367 + set_debugreg(0UL, 6);
1368 + set_debugreg(0UL, 7);
1370 + /* If the kgdb is connected no debug regs should be altered. */
1376 + asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
1377 + if (raw_irqs_disabled())
1378 + kernel_eflags &= ~X86_EFLAGS_IF;
1380 + if (is_uv_system())
1383 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
1384 +++ sle11-2009-06-04/arch/x86/kernel/e820-xen.c 2009-06-04 10:21:39.000000000 +0200
1387 + * Handle the memory map.
1388 + * The functions here do the job until bootmem takes over.
1390 + * Getting sanitize_e820_map() in sync with i386 version by applying change:
1391 + * - Provisions for empty E820 memory regions (reported by certain BIOSes).
1392 + * Alex Achenbach <xela@slit.de>, December 2002.
1393 + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
1396 +#include <linux/kernel.h>
1397 +#include <linux/types.h>
1398 +#include <linux/init.h>
1399 +#include <linux/bootmem.h>
1400 +#include <linux/ioport.h>
1401 +#include <linux/string.h>
1402 +#include <linux/kexec.h>
1403 +#include <linux/module.h>
1404 +#include <linux/mm.h>
1405 +#include <linux/pfn.h>
1406 +#include <linux/suspend.h>
1407 +#include <linux/firmware-map.h>
1409 +#include <asm/pgtable.h>
1410 +#include <asm/page.h>
1411 +#include <asm/e820.h>
1412 +#include <asm/proto.h>
1413 +#include <asm/setup.h>
1414 +#include <xen/interface/memory.h>
1417 + * The e820 map is the map that gets modified e.g. with command line parameters
1418 + * and that is also registered with modifications in the kernel resource tree
1419 + * with the iomem_resource as parent.
1421 + * The e820_saved is directly saved after the BIOS-provided memory map is
1422 + * copied. It doesn't get modified afterwards. It's registered for the
1423 + * /sys/firmware/memmap interface.
1425 + * That memory map is not modified and is used as base for kexec. The kexec'd
1426 + * kernel should get the same memory map as the firmware provides. Then the
1427 + * user can e.g. boot the original kernel with mem=1G while still booting the
1428 + * next kernel with full memory.
1430 +struct e820map e820;
1432 +struct e820map e820_saved;
1434 +static struct e820map machine_e820;
1435 +#define e820_saved machine_e820
1438 +/* For PCI or other memory-mapped resources */
1439 +unsigned long pci_mem_start = 0xaeedbabe;
1441 +EXPORT_SYMBOL(pci_mem_start);
1445 + * This function checks if any part of the range <start,end> is mapped
1449 +e820_any_mapped(u64 start, u64 end, unsigned type)
1454 + for (i = 0; i < e820.nr_map; i++) {
1455 + struct e820entry *ei = &e820.map[i];
1457 + if (!is_initial_xendomain())
1459 + for (i = 0; i < machine_e820.nr_map; ++i) {
1460 + const struct e820entry *ei = &machine_e820.map[i];
1463 + if (type && ei->type != type)
1465 + if (ei->addr >= end || ei->addr + ei->size <= start)
1471 +EXPORT_SYMBOL_GPL(e820_any_mapped);
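
e820_any_mapped() above and e820_all_mapped() below differ only in the
quantifier: "does any byte of [start,end) have this type" versus "does every
byte". A toy model of the two checks over a hypothetical two-entry map:

    #include <stdio.h>
    #include <stdint.h>

    struct ent { uint64_t addr, size; unsigned type; };

    /* Hypothetical map: low RAM, a hole, then high RAM (1 = usable). */
    static const struct ent map[] = {
            { 0x00000000, 0x0009f000, 1 },
            { 0x00100000, 0x3ff00000, 1 },
    };
    #define NR (sizeof(map) / sizeof(map[0]))

    static int any_mapped(uint64_t start, uint64_t end, unsigned type)
    {
            for (unsigned i = 0; i < NR; i++)
                    if (map[i].type == type &&
                        map[i].addr < end && map[i].addr + map[i].size > start)
                            return 1;
            return 0;
    }

    /* Assumes the map is sorted and non-overlapping, as the kernel does. */
    static int all_mapped(uint64_t start, uint64_t end, unsigned type)
    {
            for (unsigned i = 0; i < NR; i++) {
                    const struct ent *e = &map[i];

                    if (e->type != type ||
                        e->addr >= end || e->addr + e->size <= start)
                            continue;
                    if (e->addr <= start)
                            start = e->addr + e->size;
                    if (start >= end)
                            return 1;
            }
            return 0;
    }

    int main(void)
    {
            printf("any RAM in hole?  %d\n", any_mapped(0x9f000, 0x100000, 1));
            printf("all RAM 1M-2M?    %d\n", all_mapped(0x100000, 0x200000, 1));
            return 0;
    }
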
1474 + * This function checks if the entire range <start,end> is mapped with type.
1476 + * Note: this function only works correctly if the e820 table is sorted and
1477 + * not-overlapping, which is the case
1479 +int __init e820_all_mapped(u64 start, u64 end, unsigned type)
1484 + for (i = 0; i < e820.nr_map; i++) {
1485 + struct e820entry *ei = &e820.map[i];
1487 + if (!is_initial_xendomain())
1489 + for (i = 0; i < machine_e820.nr_map; ++i) {
1490 + const struct e820entry *ei = &machine_e820.map[i];
1493 + if (type && ei->type != type)
1495 + /* is the region (or a part of it) overlapping the current region? */
1496 + if (ei->addr >= end || ei->addr + ei->size <= start)
1499 + /* if the region is at the beginning of <start,end> we move
1500 + * start to the end of the region since it's ok until there
1502 + if (ei->addr <= start)
1503 + start = ei->addr + ei->size;
1505 + * if start is now at or beyond end, we're done, full
1506 + * coverage
1515 + * Add a memory region to the kernel e820 map.
1517 +void __init e820_add_region(u64 start, u64 size, int type)
1519 + int x = e820.nr_map;
1521 + if (x == ARRAY_SIZE(e820.map)) {
1522 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1526 + e820.map[x].addr = start;
1527 + e820.map[x].size = size;
1528 + e820.map[x].type = type;
1532 +void __init e820_print_map(char *who)
1536 + for (i = 0; i < e820.nr_map; i++) {
1537 + printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1538 + (unsigned long long) e820.map[i].addr,
1539 + (unsigned long long)
1540 + (e820.map[i].addr + e820.map[i].size));
1541 + switch (e820.map[i].type) {
1543 + case E820_RESERVED_KERN:
1544 + printk(KERN_CONT "(usable)\n");
1546 + case E820_RESERVED:
1547 + printk(KERN_CONT "(reserved)\n");
1550 + printk(KERN_CONT "(ACPI data)\n");
1553 + printk(KERN_CONT "(ACPI NVS)\n");
1556 + printk(KERN_CONT "type %u\n", e820.map[i].type);
1563 + * Sanitize the BIOS e820 map.
1565 + * Some e820 responses include overlapping entries. The following
1566 + * replaces the original e820 map with a new one, removing overlaps,
1567 + * and resolving conflicting memory types in favor of the highest
1568 + * numbered type.
1569 + *
1570 + * The input parameter biosmap points to an array of 'struct
1571 + * e820entry' which on entry has elements in the range [0, *pnr_map)
1572 + * valid, and which has space for up to max_nr_map entries.
1573 + * On return, the resulting sanitized e820 map entries will be
1574 + * overwritten in the same location, starting at biosmap.
1576 + * The integer pointed to by pnr_map must be valid on entry (the
1577 + * current number of valid entries located at biosmap) and will
1578 + * be updated on return, with the new number of valid entries
1579 + * (something no more than max_nr_map.)
1581 + * The return value from sanitize_e820_map() is zero if it
1582 + * successfully 'sanitized' the map entries passed in, and is -1
1583 + * if it did nothing, which can happen if either of (1) it was
1584 + * only passed one map entry, or (2) any of the input map entries
1585 + * were invalid (start + size < start, meaning that the size was
1586 + * so big the described memory range wrapped around through zero.)
1588 + * Visually we're performing the following
1589 + * (1,2,3,4 = memory types)...
1591 + * Sample memory map (w/overlaps):
1592 + * ____22__________________
1593 + * ______________________4_
1594 + * ____1111________________
1595 + * _44_____________________
1596 + * 11111111________________
1597 + * ____________________33__
1598 + * ___________44___________
1599 + * __________33333_________
1600 + * ______________22________
1601 + * ___________________2222_
1602 + * _________111111111______
1603 + * _____________________11_
1604 + * _________________4______
1606 + * Sanitized equivalent (no overlap):
1607 + * 1_______________________
1608 + * _44_____________________
1609 + * ___1____________________
1610 + * ____22__________________
1611 + * ______11________________
1612 + * _________1______________
1613 + * __________3_____________
1614 + * ___________44___________
1615 + * _____________33_________
1616 + * _______________2________
1617 + * ________________1_______
1618 + * _________________4______
1619 + * ___________________2____
1620 + * ____________________33__
1621 + * ______________________4_
1624 +int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
1627 + struct change_member {
1628 + struct e820entry *pbios; /* pointer to original bios entry */
1629 + unsigned long long addr; /* address for this change point */
1631 + static struct change_member change_point_list[2*E820_X_MAX] __initdata;
1632 + static struct change_member *change_point[2*E820_X_MAX] __initdata;
1633 + static struct e820entry *overlap_list[E820_X_MAX] __initdata;
1634 + static struct e820entry new_bios[E820_X_MAX] __initdata;
1635 + struct change_member *change_tmp;
1636 + unsigned long current_type, last_type;
1637 + unsigned long long last_addr;
1638 + int chgidx, still_changing;
1639 + int overlap_entries;
1640 + int new_bios_entry;
1641 + int old_nr, new_nr, chg_nr;
1644 + /* if there's only one memory region, don't bother */
1646 + if (*pnr_map == 1)
1652 + old_nr = *pnr_map;
1653 + BUG_ON(old_nr > max_nr_map);
1655 + /* bail out if we find any unreasonable addresses in bios map */
1656 + for (i = 0; i < old_nr; i++)
1657 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
1660 + /* create pointers for initial change-point information (for sorting) */
1661 + for (i = 0; i < 2 * old_nr; i++)
1662 + change_point[i] = &change_point_list[i];
1664 + /* record all known change-points (starting and ending addresses),
1665 + omitting those that are for empty memory regions */
1667 + for (i = 0; i < old_nr; i++) {
1668 + if (biosmap[i].size != 0) {
1669 + change_point[chgidx]->addr = biosmap[i].addr;
1670 + change_point[chgidx++]->pbios = &biosmap[i];
1671 + change_point[chgidx]->addr = biosmap[i].addr +
1673 + change_point[chgidx++]->pbios = &biosmap[i];
1678 + /* sort change-point list by memory addresses (low -> high) */
1679 + still_changing = 1;
1680 + while (still_changing) {
1681 + still_changing = 0;
1682 + for (i = 1; i < chg_nr; i++) {
1683 + unsigned long long curaddr, lastaddr;
1684 + unsigned long long curpbaddr, lastpbaddr;
1686 + curaddr = change_point[i]->addr;
1687 + lastaddr = change_point[i - 1]->addr;
1688 + curpbaddr = change_point[i]->pbios->addr;
1689 + lastpbaddr = change_point[i - 1]->pbios->addr;
1692 + * swap entries, when:
1694 + * curaddr > lastaddr or
1695 + * curaddr == lastaddr and curaddr == curpbaddr and
1696 + * lastaddr != lastpbaddr
1698 + if (curaddr < lastaddr ||
1699 + (curaddr == lastaddr && curaddr == curpbaddr &&
1700 + lastaddr != lastpbaddr)) {
1701 + change_tmp = change_point[i];
1702 + change_point[i] = change_point[i-1];
1703 + change_point[i-1] = change_tmp;
1704 + still_changing = 1;
1709 + /* create a new bios memory map, removing overlaps */
1710 + overlap_entries = 0; /* number of entries in the overlap table */
1711 + new_bios_entry = 0; /* index for creating new bios map entries */
1712 + last_type = 0; /* start with undefined memory type */
1713 + last_addr = 0; /* start with 0 as last starting address */
1715 + /* loop through change-points, determining effect on the new bios map */
1716 + for (chgidx = 0; chgidx < chg_nr; chgidx++) {
1717 + /* keep track of all overlapping bios entries */
1718 + if (change_point[chgidx]->addr ==
1719 + change_point[chgidx]->pbios->addr) {
1721 + * add map entry to overlap list (> 1 entry
1722 + * implies an overlap)
1724 + overlap_list[overlap_entries++] =
1725 + change_point[chgidx]->pbios;
1728 + * remove entry from list (order independent,
1729 + * so swap with last)
1731 + for (i = 0; i < overlap_entries; i++) {
1732 + if (overlap_list[i] ==
1733 + change_point[chgidx]->pbios)
1735 + overlap_list[overlap_entries-1];
1737 + overlap_entries--;
1740 + * if there are overlapping entries, decide which
1741 + * "type" to use (larger value takes precedence --
1742 + * 1=usable, 2,3,4,4+=unusable)
1745 + for (i = 0; i < overlap_entries; i++)
1746 + if (overlap_list[i]->type > current_type)
1747 + current_type = overlap_list[i]->type;
1749 + * continue building up new bios map based on this
1752 + if (current_type != last_type) {
1753 + if (last_type != 0) {
1754 + new_bios[new_bios_entry].size =
1755 + change_point[chgidx]->addr - last_addr;
1757 + * move forward only if the new size
1760 + if (new_bios[new_bios_entry].size != 0)
1762 + * no more space left for new
1765 + if (++new_bios_entry >= max_nr_map)
1768 + if (current_type != 0) {
1769 + new_bios[new_bios_entry].addr =
1770 + change_point[chgidx]->addr;
1771 + new_bios[new_bios_entry].type = current_type;
1772 + last_addr = change_point[chgidx]->addr;
1774 + last_type = current_type;
1777 + /* retain count for new bios entries */
1778 + new_nr = new_bios_entry;
1780 + /* copy new bios mapping into original location */
1781 + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
1782 + *pnr_map = new_nr;
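
A worked instance of the precedence rule implemented above: two overlapping
entries, usable (1) under reserved (2), sanitize into three non-overlapping
ranges with the higher type winning. Brute force over the change points (a toy,
not the kernel's algorithm):

    #include <stdio.h>
    #include <stdint.h>

    struct ent { uint64_t start, end; unsigned type; };

    int main(void)
    {
            /* usable 0-32K overlapped by reserved 16K-24K */
            const struct ent in[] = {
                    { 0x0000, 0x8000, 1 },
                    { 0x4000, 0x6000, 2 },
            };
            const uint64_t pts[] = { 0x0000, 0x4000, 0x6000, 0x8000 };

            for (int i = 0; i < 3; i++) {
                    unsigned type = 0;

                    for (int j = 0; j < 2; j++)
                            if (pts[i] >= in[j].start && pts[i] < in[j].end &&
                                in[j].type > type)
                                    type = in[j].type;
                    printf("[%#07llx-%#07llx) type %u\n",
                           (unsigned long long)pts[i],
                           (unsigned long long)pts[i + 1], type);
            }
            return 0;
    }
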
1787 +static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
1790 + u64 start = biosmap->addr;
1791 + u64 size = biosmap->size;
1792 + u64 end = start + size;
1793 + u32 type = biosmap->type;
1795 + /* Overflow in 64 bits? Ignore the memory map. */
1799 + e820_add_region(start, size, type);
1808 + * Copy the BIOS e820 map into a safe place.
1810 + * Sanity-check it while we're at it..
1812 + * If we're lucky and live on a modern system, the setup code
1813 + * will have given us a memory map that we can use to properly
1814 + * set up memory. If we aren't, we'll fake a memory map.
1816 +static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
1819 + /* Only one memory region (or negative)? Ignore it */
1823 + BUG_ON(nr_map < 1);
1826 + return __append_e820_map(biosmap, nr_map);
1829 +static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
1830 + u64 size, unsigned old_type,
1831 + unsigned new_type)
1833 + unsigned int i, x;
1834 + u64 real_updated_size = 0;
1836 + BUG_ON(old_type == new_type);
1838 + if (size > (ULLONG_MAX - start))
1839 + size = ULLONG_MAX - start;
1841 + for (i = 0; i < e820x->nr_map; i++) {
1842 + struct e820entry *ei = &e820x->map[i];
1843 + u64 final_start, final_end;
1844 + if (ei->type != old_type)
1846 + /* totally covered? */
1847 + if (ei->addr >= start &&
1848 + (ei->addr + ei->size) <= (start + size)) {
1849 + ei->type = new_type;
1850 + real_updated_size += ei->size;
1853 + /* partially covered */
1854 + final_start = max(start, ei->addr);
1855 + final_end = min(start + size, ei->addr + ei->size);
1856 + if (final_start >= final_end)
1859 + x = e820x->nr_map;
1860 + if (x == ARRAY_SIZE(e820x->map)) {
1861 + printk(KERN_ERR "Too many memory map entries!\n");
1864 + e820x->map[x].addr = final_start;
1865 + e820x->map[x].size = final_end - final_start;
1866 + e820x->map[x].type = new_type;
1869 + real_updated_size += final_end - final_start;
1871 + if (ei->addr < final_start)
1873 + ei->addr = final_end;
1874 + ei->size -= final_end - final_start;
1876 + return real_updated_size;
1879 +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
1880 + unsigned new_type)
1882 + return e820_update_range_map(&e820, start, size, old_type, new_type);
1885 +static u64 __init e820_update_range_saved(u64 start, u64 size,
1886 + unsigned old_type, unsigned new_type)
1889 + if (is_initial_xendomain())
1890 + return e820_update_range_map(&machine_e820,
1891 + phys_to_machine(start), size,
1892 + old_type, new_type);
1894 + return e820_update_range_map(&e820_saved, start, size, old_type,
1898 +/* make e820 not cover the range */
1899 +u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
1903 + u64 real_removed_size = 0;
1905 + if (size > (ULLONG_MAX - start))
1906 + size = ULLONG_MAX - start;
1908 + for (i = 0; i < e820.nr_map; i++) {
1909 + struct e820entry *ei = &e820.map[i];
1910 + u64 final_start, final_end;
1912 + if (checktype && ei->type != old_type)
1914 + /* totally covered? */
1915 + if (ei->addr >= start &&
1916 + (ei->addr + ei->size) <= (start + size)) {
1917 + real_removed_size += ei->size;
1918 + memset(ei, 0, sizeof(struct e820entry));
1921 + /* partially covered */
1922 + final_start = max(start, ei->addr);
1923 + final_end = min(start + size, ei->addr + ei->size);
1924 + if (final_start >= final_end)
1926 + real_removed_size += final_end - final_start;
1928 + ei->size -= final_end - final_start;
1929 + if (ei->addr < final_start)
1931 + ei->addr = final_end;
1933 + return real_removed_size;
1936 +void __init update_e820(void)
1940 + nr_map = e820.nr_map;
1941 + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
1943 + e820.nr_map = nr_map;
1944 + printk(KERN_INFO "modified physical RAM map:\n");
1945 + e820_print_map("modified");
1947 +static void __init update_e820_saved(void)
1951 + nr_map = e820_saved.nr_map;
1952 + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
1954 + e820_saved.nr_map = nr_map;
1958 +#define e820 machine_e820
1961 +#define MAX_GAP_END 0x100000000ull
1963 + * Search for a gap in the e820 memory space from start_addr to end_addr.
1965 +__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
1966 + unsigned long start_addr, unsigned long long end_addr)
1968 + unsigned long long last;
1969 + int i = e820.nr_map;
1972 + last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
1973 +#ifdef CONFIG_X86_64
1974 + if (start_addr >= MAX_GAP_END)
1975 + last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
1978 + while (--i >= 0) {
1979 + unsigned long long start = e820.map[i].addr;
1980 + unsigned long long end = start + e820.map[i].size;
1982 + if (end < start_addr)
1986 + * Since "last" is at most 4GB, we know we'll
1987 + * fit in 32 bits if this condition is true
1990 + unsigned long gap = last - end;
1992 + if (gap >= *gapsize) {
2005 + * Search for the biggest gap in the low 32 bits of the e820
2006 + * memory space. We pass this space to PCI for assigning MMIO
2007 + * resources to hotplug or unconfigured devices.
2008 + * Hopefully the BIOS left enough space.
2010 +__init void e820_setup_gap(void)
2012 + unsigned long gapstart, gapsize, round;
2015 + gapstart = 0x10000000;
2016 + gapsize = 0x400000;
2017 + found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
2019 +#ifdef CONFIG_X86_64
2021 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2023 + KERN_ERR "PCI: Unassigned devices with 32bit resource "
2024 + "registers may break!\n");
2025 + found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
2031 + * See how much we want to round up: start off with
2032 + * rounding to the next 1MB area.
2035 + while ((gapsize >> 4) > round)
2037 + /* Fun with two's complement */
2038 + pci_mem_start = (gapstart + round) & -round;
2041 + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2042 + pci_mem_start, gapstart, gapsize);
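
The "fun with two's complement" in e820_setup_gap() is the standard
round-up-to-a-power-of-two idiom: for a power-of-two round, -round is the
alignment mask. A two-line demonstration with made-up values:

    #include <stdio.h>

    int main(void)
    {
            unsigned long gapstart = 0xdeadb000UL, round = 0x100000UL; /* 1 MB */

            printf("%#lx\n", (gapstart + round) & -round); /* 0xdeb00000 */
            return 0;
    }
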
2049 + * Because of the size limitation of struct boot_params, only the
2050 + * first 128 E820 memory entries are passed to the kernel via
2051 + * boot_params.e820_map; the rest are passed via the SETUP_E820_EXT
2052 + * node of the linked list of struct setup_data, which is parsed here.
2054 +void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
2058 + struct e820entry *extmap;
2060 + entries = sdata->len / sizeof(struct e820entry);
2061 + map_len = sdata->len + sizeof(struct setup_data);
2062 + if (map_len > PAGE_SIZE)
2063 + sdata = early_ioremap(pa_data, map_len);
2064 + extmap = (struct e820entry *)(sdata->data);
2065 + __append_e820_map(extmap, entries);
2066 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
2067 + if (map_len > PAGE_SIZE)
2068 + early_iounmap(sdata, map_len);
2069 + printk(KERN_INFO "extended physical RAM map:\n");
2070 + e820_print_map("extended");
2073 +#if defined(CONFIG_X86_64) || \
2074 + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
2076 + * Find the ranges of physical addresses that do not correspond to
2077 + * e820 RAM areas and mark the corresponding pages as nosave for
2078 + * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
2080 + * This function requires the e820 map to be sorted and without any
2081 + * overlapping entries and assumes the first e820 area to be RAM.
2083 +void __init e820_mark_nosave_regions(unsigned long limit_pfn)
2086 + unsigned long pfn;
2088 + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
2089 + for (i = 1; i < e820.nr_map; i++) {
2090 + struct e820entry *ei = &e820.map[i];
2092 + if (pfn < PFN_UP(ei->addr))
2093 + register_nosave_region(pfn, PFN_UP(ei->addr));
2095 + pfn = PFN_DOWN(ei->addr + ei->size);
2096 + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
2097 + register_nosave_region(PFN_UP(ei->addr), pfn);
2099 + if (pfn >= limit_pfn)
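PFN_UP() rounds a byte address up to the next page frame and PFN_DOWN() rounds down, so a region [addr, addr+size) fully covers the frames [PFN_UP(addr), PFN_DOWN(addr+size)). A small demonstration, assuming 4K pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

    int main(void)
    {
        unsigned long addr = 0x1800, size = 0x3000;

        /* [0x1800, 0x4800): first whole page is pfn 2, coverage ends at pfn 4 */
        printf("pfns [%lu, %lu)\n", PFN_UP(addr), PFN_DOWN(addr + size));
        return 0;
    }
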
2107 + * Early reserved memory areas.
2109 +#define MAX_EARLY_RES 20
2116 +static struct early_res early_res[MAX_EARLY_RES] __initdata = {
2118 + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
2119 +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
2120 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
2122 +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
2124 + * But first pinch a few for the stack/trampoline stuff
2125 + * FIXME: Don't need the extra page at 4K, but need to fix
2126 + * trampoline before removing it. (see the GDT stuff)
2128 + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
2130 + * Has to be in very low memory so we can execute
2131 + * real-mode AP code.
2133 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
2139 +static int __init find_overlapped_early(u64 start, u64 end)
2142 + struct early_res *r;
2144 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2145 + r = &early_res[i];
2146 + if (end > r->start && start < r->end)
2154 + * Drop the i-th range from the early reservation map,
2155 + * by copying any higher ranges down one over it, and
2156 + * clearing what had been the last slot.
2158 +static void __init drop_range(int i)
2162 + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
2165 + memmove(&early_res[i], &early_res[i + 1],
2166 + (j - 1 - i) * sizeof(struct early_res));
2168 + early_res[j - 1].end = 0;
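drop_range() deletes slot i by sliding every higher slot down one with memmove() and zeroing the vacated last slot. The same idiom on a plain int array, where a terminating 0 plays the role of .end == 0:

    #include <stdio.h>
    #include <string.h>

    #define N 8

    static int vals[N] = { 1, 2, 3, 4, 5, 0, 0, 0 };

    static void drop(int i)
    {
        int j;

        for (j = i + 1; j < N && vals[j]; j++)
            ;
        /* slide everything above slot i down one, then clear the old last slot */
        memmove(&vals[i], &vals[i + 1], (j - 1 - i) * sizeof(vals[0]));
        vals[j - 1] = 0;
    }

    int main(void)
    {
        drop(1);	/* removes the '2' */
        for (int i = 0; i < N; i++)
            printf("%d ", vals[i]);
        printf("\n");	/* prints: 1 3 4 5 0 0 0 0 */
        return 0;
    }
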
2172 + * Split any existing ranges that:
2173 + * 1) are marked 'overlap_ok', and
2174 + * 2) overlap with the stated range [start, end)
2175 + * into whatever portion (if any) of the existing range is entirely
2176 + * below or entirely above the stated range. Drop the portion
2177 + * of the existing range that overlaps with the stated range,
2178 + * which will allow the caller of this routine to then add that
2179 + * stated range without conflicting with any existing range.
2181 +static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
2184 + struct early_res *r;
2185 + u64 lower_start, lower_end;
2186 + u64 upper_start, upper_end;
2189 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2190 + r = &early_res[i];
2192 + /* Continue past non-overlapping ranges */
2193 + if (end <= r->start || start >= r->end)
2197 + * Leave non-ok overlaps as is; let caller
2198 + * panic "Overlapping early reservations"
2199 + * when it hits this overlap.
2201 + if (!r->overlap_ok)
2205 + * We have an ok overlap. We will drop it from the early
2206 + * reservation map, and add back in any non-overlapping
2207 + * portions (lower or upper) as separate, overlap_ok,
2208 + * non-overlapping ranges.
2211 + /* 1. Note any non-overlapping (lower or upper) ranges. */
2212 + strncpy(name, r->name, sizeof(name) - 1);
2214 + lower_start = lower_end = 0;
2215 + upper_start = upper_end = 0;
2216 + if (r->start < start) {
2217 + lower_start = r->start;
2218 + lower_end = start;
2220 + if (r->end > end) {
2221 + upper_start = end;
2222 + upper_end = r->end;
2225 + /* 2. Drop the original ok overlapping range */
2228 + i--; /* resume for-loop on copied down entry */
2230 + /* 3. Add back in any non-overlapping ranges. */
2232 + reserve_early_overlap_ok(lower_start, lower_end, name);
2234 + reserve_early_overlap_ok(upper_start, upper_end, name);
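An overlap_ok range that collides with the incoming [start, end) leaves at most two survivors: the piece strictly below start and the piece strictly above end. The split arithmetic in isolation (values illustrative):

    #include <stdio.h>

    typedef unsigned long long u64;

    int main(void)
    {
        u64 rs = 0x1000, re = 0x9000;	/* existing range   */
        u64 start = 0x3000, end = 0x5000;	/* incoming request */
        u64 lo_s = 0, lo_e = 0, hi_s = 0, hi_e = 0;

        if (rs < start) { lo_s = rs;  lo_e = start; }	/* piece entirely below */
        if (re > end)   { hi_s = end; hi_e = re;   }	/* piece entirely above */

        printf("lower [0x%llx,0x%llx) upper [0x%llx,0x%llx)\n",
               lo_s, lo_e, hi_s, hi_e);
        return 0;
    }
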
2238 +static void __init __reserve_early(u64 start, u64 end, char *name,
2242 + struct early_res *r;
2244 + i = find_overlapped_early(start, end);
2245 + if (i >= MAX_EARLY_RES)
2246 + panic("Too many early reservations");
2247 + r = &early_res[i];
2249 + panic("Overlapping early reservations "
2250 + "%llx-%llx %s to %llx-%llx %s\n",
2251 + start, end - 1, name?name:"", r->start,
2252 + r->end - 1, r->name);
2255 + r->overlap_ok = overlap_ok;
2257 + strncpy(r->name, name, sizeof(r->name) - 1);
2261 + * A few early reservations come here.
2263 + * The 'overlap_ok' in the name of this routine does -not- mean it
2264 + * is ok for these reservations to overlap an earlier reservation.
2265 + * Rather it means that it is ok for subsequent reservations to
2266 + * overlap this one.
2268 + * Use this entry point to reserve early ranges when you are doing
2269 + * so out of "Paranoia", reserving perhaps more memory than you need,
2270 + * just in case, and don't mind a subsequent overlapping reservation
2271 + * that is known to be needed.
2273 + * The drop_overlaps_that_are_ok() call here isn't really needed.
2274 + * It would be needed if we had two colliding 'overlap_ok'
2275 + * reservations, so that the second such would not panic on the
2276 + * overlap with the first. We don't have any such as of this
2277 + * writing, but might as well tolerate such if it happens in the future.
2280 +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
2282 + drop_overlaps_that_are_ok(start, end);
2283 + __reserve_early(start, end, name, 1);
2287 + * Most early reservations come here.
2289 + * We first have drop_overlaps_that_are_ok() drop any pre-existing
2290 + * 'overlap_ok' ranges, so that we can then reserve this memory
2291 + * range without risk of panic'ing on an overlapping overlap_ok
2292 + * early reservation.
2294 +void __init reserve_early(u64 start, u64 end, char *name)
2296 + drop_overlaps_that_are_ok(start, end);
2297 + __reserve_early(start, end, name, 0);
2300 +void __init free_early(u64 start, u64 end)
2302 + struct early_res *r;
2305 + i = find_overlapped_early(start, end);
2306 + r = &early_res[i];
2307 + if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
2308 + panic("free_early on not reserved area: %llx-%llx!",
2314 +void __init early_res_to_bootmem(u64 start, u64 end)
2317 + u64 final_start, final_end;
2320 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
2323 + printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
2324 + count, start, end);
2325 + for (i = 0; i < count; i++) {
2326 + struct early_res *r = &early_res[i];
2327 + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
2328 + r->start, r->end, r->name);
2329 + final_start = max(start, r->start);
2330 + final_end = min(end, r->end);
2331 + if (final_start >= final_end) {
2332 + printk(KERN_CONT "\n");
2335 + printk(KERN_CONT " ==> [%010llx - %010llx]\n",
2336 + final_start, final_end);
2337 + reserve_bootmem_generic(final_start, final_end - final_start,
2342 +/* Check for already reserved areas */
2343 +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
2346 + u64 addr = *addrp;
2348 + struct early_res *r;
2350 + i = find_overlapped_early(addr, addr + size);
2351 + r = &early_res[i];
2352 + if (i < MAX_EARLY_RES && r->end) {
2353 + *addrp = addr = round_up(r->end, align);
2360 +/* Check for already reserved areas */
2361 +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
2364 + u64 addr = *addrp, last;
2365 + u64 size = *sizep;
2368 + last = addr + size;
2369 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2370 + struct early_res *r = &early_res[i];
2371 + if (last > r->start && addr < r->start) {
2372 + size = r->start - addr;
2376 + if (last > r->end && addr < r->end) {
2377 + addr = round_up(r->end, align);
2378 + size = last - addr;
2382 + if (last <= r->end && addr >= r->start) {
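Both helpers bump a colliding candidate to the reservation's end rounded up with round_up(); for a power-of-two alignment that is equivalent to (x + align - 1) & ~(align - 1). For example:

    #include <stdio.h>

    typedef unsigned long long u64;

    /* power-of-two round-up, as the kernel's round_up() computes for these callers */
    static u64 round_up_pow2(u64 x, u64 align)
    {
        return (x + align - 1) & ~(align - 1);
    }

    int main(void)
    {
        /* a reservation ends at 0x13001; with 4K alignment the
         * next candidate address becomes 0x14000 */
        printf("0x%llx\n", round_up_pow2(0x13001, 0x1000));
        return 0;
    }
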
2395 + * Find a free area with specified alignment in a specific range.
2397 +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
2401 + for (i = 0; i < e820.nr_map; i++) {
2402 + struct e820entry *ei = &e820.map[i];
2406 + if (ei->type != E820_RAM)
2408 + addr = round_up(ei->addr, align);
2409 + ei_last = ei->addr + ei->size;
2411 + addr = round_up(start, align);
2412 + if (addr >= ei_last)
2414 + while (bad_addr(&addr, size, align) && addr+size <= ei_last)
2416 + last = addr + size;
2417 + if (last > ei_last)
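find_e820_area() is a first-fit scan: for each RAM entry, start at the aligned maximum of the entry base and the requested start, step over early reservations via bad_addr(), and take the first position where size still fits. A simplified sketch that omits the reservation check (struct and function names are invented):

    #include <stdio.h>

    typedef unsigned long long u64;

    struct region { u64 addr, size; };

    /* first fit: an 'align'-aligned address of 'size' bytes inside one of
     * the regions at or after 'start', or -1 if nothing fits */
    static u64 first_fit(const struct region *r, int n, u64 start, u64 size, u64 align)
    {
        for (int i = 0; i < n; i++) {
            u64 last = r[i].addr + r[i].size;
            u64 addr = (r[i].addr + align - 1) & ~(align - 1);

            if (addr < start)
                addr = (start + align - 1) & ~(align - 1);
            if (addr + size <= last)
                return addr;
        }
        return (u64)-1;
    }

    int main(void)
    {
        struct region ram[] = { { 0x1000, 0x2000 }, { 0x100000, 0x100000 } };

        /* first region is too small past 0x4000; prints 0x100000 */
        printf("0x%llx\n", first_fit(ram, 2, 0x4000, 0x10000, 0x1000));
        return 0;
    }
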
2427 + * Find next free range after *start
2429 +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
2433 + for (i = 0; i < e820.nr_map; i++) {
2434 + struct e820entry *ei = &e820.map[i];
2438 + if (ei->type != E820_RAM)
2440 + addr = round_up(ei->addr, align);
2441 + ei_last = ei->addr + ei->size;
2443 + addr = round_up(start, align);
2444 + if (addr >= ei_last)
2446 + *sizep = ei_last - addr;
2447 + while (bad_addr_size(&addr, sizep, align) &&
2448 + addr + *sizep <= ei_last)
2450 + last = addr + *sizep;
2451 + if (last > ei_last)
2460 + * pre-allocate 4k and reserve it in the e820 map
2462 +u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
2468 + unsigned int order = get_order(sizet);
2470 + if (is_initial_xendomain()) {
2471 + sizet = PAGE_SIZE << order;
2472 + if (align < PAGE_SIZE)
2473 + align = PAGE_SIZE;
2476 + for (start = startt; ; start += size) {
2477 + start = find_e820_area_size(start, &size, align);
2480 + if (size >= sizet)
2484 +#ifdef CONFIG_X86_32
2485 + if (start >= MAXMEM)
2487 + if (start + size > MAXMEM)
2488 + size = MAXMEM - start;
2491 + if ((start >> PAGE_SHIFT) >= xen_start_info->nr_pages)
2493 + if (PFN_UP(start + size) > xen_start_info->nr_pages)
2494 + size = ((u64)xen_start_info->nr_pages << PAGE_SHIFT) - start;
2497 + addr = round_down(start + size - sizet, align);
2501 + if (is_initial_xendomain()) {
2503 + unsigned long max_initmap_pfn;
2505 + max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base))
2506 + + xen_start_info->nr_pt_frames
2507 + + 1 + (1 << (19 - PAGE_SHIFT)),
2508 + 1UL << (22 - PAGE_SHIFT));
2509 +#ifdef CONFIG_X86_32
2510 + if ((addr >> PAGE_SHIFT)
2511 + < max(max_initmap_pfn, max_pfn_mapped))
2512 + rc = xen_create_contiguous_region((unsigned long)
2516 + if ((addr >> PAGE_SHIFT) < max_pfn_mapped)
2517 + rc = xen_create_contiguous_region((unsigned long)
2520 + else if ((addr >> PAGE_SHIFT) < max_initmap_pfn)
2521 + rc = xen_create_contiguous_region(__START_KERNEL_map
2526 + rc = early_create_contiguous_region(addr >> PAGE_SHIFT,
2532 + e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
2533 + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
2534 + printk(KERN_INFO "update e820 for early_reserve_e820\n");
2536 + update_e820_saved();
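get_order(size) returns the smallest order with PAGE_SIZE << order >= size, so the Xen dom0 path above rounds the request up to a whole power-of-two number of pages. An equivalent userspace computation, assuming 4K pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* smallest 'order' such that (PAGE_SIZE << order) >= size, like get_order() */
    static int order_of(unsigned long size)
    {
        int order = 0;

        size = (size - 1) >> PAGE_SHIFT;
        while (size) {
            size >>= 1;
            order++;
        }
        return order;
    }

    int main(void)
    {
        /* prints: 0 1 4  (4K -> 1 page, 4K+1 -> 2 pages, 64K -> 16 pages) */
        printf("%d %d %d\n", order_of(4096), order_of(4097), order_of(65536));
        return 0;
    }
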
2541 +#ifdef CONFIG_X86_32
2542 +# ifdef CONFIG_X86_PAE
2543 +# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT))
2545 +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
2547 +#else /* CONFIG_X86_32 */
2548 +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
2552 + * Find the highest page frame number we have available
2554 +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
2557 + unsigned long last_pfn = 0;
2558 + unsigned long max_arch_pfn = MAX_ARCH_PFN;
2560 + for (i = 0; i < e820.nr_map; i++) {
2561 + struct e820entry *ei = &e820.map[i];
2562 + unsigned long start_pfn;
2563 + unsigned long end_pfn;
2565 + if (ei->type != type)
2568 + start_pfn = ei->addr >> PAGE_SHIFT;
2569 + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
2571 + if (start_pfn >= limit_pfn)
2573 + if (end_pfn > limit_pfn) {
2574 + last_pfn = limit_pfn;
2577 + if (end_pfn > last_pfn)
2578 + last_pfn = end_pfn;
2581 + if (last_pfn > max_arch_pfn)
2582 + last_pfn = max_arch_pfn;
2584 + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
2585 + last_pfn, max_arch_pfn);
2588 +unsigned long __init e820_end_of_ram_pfn(void)
2590 + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
2593 +unsigned long __init e820_end_of_low_ram_pfn(void)
2595 + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
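With 4K pages, 1UL << (32 - PAGE_SHIFT) is frame 0x100000, the first frame at the 4GB boundary, so e820_end_of_low_ram_pfn() only considers RAM below 4GB:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        unsigned long limit_pfn = 1UL << (32 - PAGE_SHIFT);

        /* prints: limit pfn 0x100000 = 4294967296 bytes */
        printf("limit pfn 0x%lx = %llu bytes\n", limit_pfn,
               (unsigned long long)limit_pfn << PAGE_SHIFT);
        return 0;
    }
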
2598 + * Finds an active region in the address range from start_pfn to last_pfn and
2599 + * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
2601 +int __init e820_find_active_region(const struct e820entry *ei,
2602 + unsigned long start_pfn,
2603 + unsigned long last_pfn,
2604 + unsigned long *ei_startpfn,
2605 + unsigned long *ei_endpfn)
2607 + u64 align = PAGE_SIZE;
2609 + *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
2610 + *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
2612 + /* Skip map entries smaller than a page */
2613 + if (*ei_startpfn >= *ei_endpfn)
2616 + /* Skip if map is outside the node */
2617 + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
2618 + *ei_startpfn >= last_pfn)
2621 + /* Check for overlaps */
2622 + if (*ei_startpfn < start_pfn)
2623 + *ei_startpfn = start_pfn;
2624 + if (*ei_endpfn > last_pfn)
2625 + *ei_endpfn = last_pfn;
2630 +/* Walk the e820 map and register active regions within a node */
2631 +void __init e820_register_active_regions(int nid, unsigned long start_pfn,
2632 + unsigned long last_pfn)
2634 + unsigned long ei_startpfn;
2635 + unsigned long ei_endpfn;
2638 + for (i = 0; i < e820.nr_map; i++)
2639 + if (e820_find_active_region(&e820.map[i],
2640 + start_pfn, last_pfn,
2641 + &ei_startpfn, &ei_endpfn))
2642 + add_active_range(nid, ei_startpfn, ei_endpfn);
2646 + * Find the hole size (in bytes) in the memory range.
2647 + * @start: starting address of the memory range to scan
2648 + * @end: ending address of the memory range to scan
2650 +u64 __init e820_hole_size(u64 start, u64 end)
2652 + unsigned long start_pfn = start >> PAGE_SHIFT;
2653 + unsigned long last_pfn = end >> PAGE_SHIFT;
2654 + unsigned long ei_startpfn, ei_endpfn, ram = 0;
2657 + for (i = 0; i < e820.nr_map; i++) {
2658 + if (e820_find_active_region(&e820.map[i],
2659 + start_pfn, last_pfn,
2660 + &ei_startpfn, &ei_endpfn))
2661 + ram += ei_endpfn - ei_startpfn;
2663 + return end - start - ((u64)ram << PAGE_SHIFT);
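So the hole size is simply the span minus the RAM found inside it; for instance, a 16MB span containing 12MB of registered RAM reports a 4MB hole:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    typedef unsigned long long u64;

    int main(void)
    {
        u64 start = 0x1000000, end = 0x2000000;	/* 16MB span         */
        unsigned long ram_pages = 0xC00;	/* 12MB of RAM pages */

        /* hole = span - RAM, exactly as e820_hole_size() computes it */
        printf("hole: 0x%llx bytes\n",
               end - start - ((u64)ram_pages << PAGE_SHIFT));
        return 0;
    }
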
2666 +static void early_panic(char *msg)
2668 + early_printk(msg);
2672 +static int userdef __initdata;
2674 +/* "mem=nopentium" disables the 4MB page tables. */
2675 +static int __init parse_memopt(char *p)
2677 + u64 mem_size, current_end;
2683 +#ifdef CONFIG_X86_32
2684 + if (!strcmp(p, "nopentium")) {
2685 + setup_clear_cpu_cap(X86_FEATURE_PSE);
2691 + mem_size = memparse(p, &p);
2692 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2694 + i = e820.nr_map - 1;
2695 + current_end = e820.map[i].addr + e820.map[i].size;
2696 + if (current_end < mem_size) {
2698 + * The e820 map ends before our requested size so
2699 + * extend the final entry to the requested address.
2701 + if (e820.map[i].type == E820_RAM)
2702 + e820.map[i].size = mem_size - e820.map[i].addr;
2704 + e820_add_region(current_end, mem_size - current_end, E820_RAM);
2709 +early_param("mem", parse_memopt);
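memparse() accepts a number with an optional K/M/G suffix, so mem=512M removes every RAM byte at or above 512MB from the map. A userspace approximation of the suffix handling (a subset of what memparse() supports):

    #include <stdio.h>
    #include <stdlib.h>

    typedef unsigned long long u64;

    /* memparse()-style: number with optional K/M/G suffix (illustrative subset) */
    static u64 parse_size(const char *s)
    {
        char *end;
        u64 v = strtoull(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10;	/* fall through */
        case 'M': case 'm': v <<= 10;	/* fall through */
        case 'K': case 'k': v <<= 10;
        }
        return v;
    }

    int main(void)
    {
        /* mem=512M -> everything from 0x20000000 up is removed */
        printf("0x%llx\n", parse_size("512M"));
        return 0;
    }
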
2712 +static int __init parse_memmap_opt(char *p)
2715 + u64 start_at, mem_size;
2720 + if (!strncmp(p, "exactmap", 8)) {
2721 +#ifdef CONFIG_CRASH_DUMP
2723 + * If we are doing a crash dump, we still need to know
2724 + * the real mem size before the original memory map is reset.
2727 + saved_max_pfn = e820_end_of_ram_pfn();
2735 + mem_size = memparse(p, &p);
2741 + start_at = memparse(p+1, &p);
2742 + e820_add_region(start_at, mem_size, E820_RAM);
2743 + } else if (*p == '#') {
2744 + start_at = memparse(p+1, &p);
2745 + e820_add_region(start_at, mem_size, E820_ACPI);
2746 + } else if (*p == '$') {
2747 + start_at = memparse(p+1, &p);
2748 + e820_add_region(start_at, mem_size, E820_RESERVED);
2750 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2752 + return *p == '\0' ? 0 : -EINVAL;
2754 +early_param("memmap", parse_memmap_opt);
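Taken together, the accepted forms are: memmap=exactmap to throw away the firmware-provided map, nn@ss to add RAM, nn#ss to add an ACPI-data range, nn$ss to add a reserved range, and a bare size to trim like mem=. Illustrative command lines (addresses and sizes made up; some boot loaders interpret '$' themselves and may need it escaped):

    memmap=exactmap memmap=640K@0 memmap=512M@1M
    memmap=64M#0x20000000
    memmap=16M$0x30000000
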
2756 +void __init finish_e820_parsing(void)
2759 + int nr = e820.nr_map;
2761 + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
2762 + early_panic("Invalid user supplied memory map");
2765 + printk(KERN_INFO "user-defined physical RAM map:\n");
2766 + e820_print_map("user");
2771 +static inline const char *e820_type_to_string(int e820_type)
2773 + switch (e820_type) {
2774 + case E820_RESERVED_KERN:
2775 + case E820_RAM: return "System RAM";
2776 + case E820_ACPI: return "ACPI Tables";
2777 + case E820_NVS: return "ACPI Non-volatile Storage";
2778 + default: return "reserved";
2783 +#define e820 machine_e820
2787 + * Mark e820 reserved areas as busy for the resource manager.
2789 +void __init e820_reserve_resources(void)
2792 + struct resource *res;
2795 + res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
2796 + for (i = 0; i < e820.nr_map; i++) {
2797 + end = e820.map[i].addr + e820.map[i].size - 1;
2798 +#ifndef CONFIG_RESOURCES_64BIT
2799 + if (end > 0x100000000ULL) {
2804 + res->name = e820_type_to_string(e820.map[i].type);
2805 + res->start = e820.map[i].addr;
2808 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
2809 + insert_resource(&iomem_resource, res);
2813 + for (i = 0; i < e820_saved.nr_map; i++) {
2814 + struct e820entry *entry = &e820_saved.map[i];
2815 + firmware_map_add_early(entry->addr,
2816 + entry->addr + entry->size - 1,
2817 + e820_type_to_string(entry->type));
2824 +char *__init default_machine_specific_memory_setup(void)
2826 + char *who = "BIOS-e820";
2829 + * Try to copy the BIOS-supplied E820-map.
2831 + * Otherwise fake a memory map; one section from 0k->640k,
2832 + * the next section from 1mb->appropriate_mem_k
2834 + new_nr = boot_params.e820_entries;
2835 + sanitize_e820_map(boot_params.e820_map,
2836 + ARRAY_SIZE(boot_params.e820_map),
2838 + boot_params.e820_entries = new_nr;
2839 + if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
2843 + /* compare results from other methods and take the greater */
2844 + if (boot_params.alt_mem_k
2845 + < boot_params.screen_info.ext_mem_k) {
2846 + mem_size = boot_params.screen_info.ext_mem_k;
2849 + mem_size = boot_params.alt_mem_k;
2850 + who = "BIOS-e801";
2854 + e820_add_region(0, LOWMEMSIZE(), E820_RAM);
2855 + e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
2858 + /* In case someone cares... */
2862 +char *__init __attribute__((weak)) machine_specific_memory_setup(void)
2864 + if (x86_quirks->arch_memory_setup) {
2865 + char *who = x86_quirks->arch_memory_setup();
2870 + return default_machine_specific_memory_setup();
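machine_specific_memory_setup() is declared weak so that any non-weak definition elsewhere in the link overrides this fallback. A minimal illustration of the mechanism (GCC attribute; the function name is invented):

    #include <stdio.h>

    /* weak default: a non-weak definition in another object file wins at link time */
    __attribute__((weak)) const char *memory_setup_name(void)
    {
        return "default";
    }

    int main(void)
    {
        printf("%s\n", memory_setup_name());
        return 0;
    }
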
2874 +char * __init memory_setup(void)
2877 + struct xen_memory_map memmap;
2879 + * This is rather large for a stack variable but this early in
2880 + * the boot process we know we have plenty of slack space.
2882 + struct e820entry map[E820MAX];
2884 + memmap.nr_entries = E820MAX;
2885 + set_xen_guest_handle(memmap.buffer, map);
2887 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
2888 + if (rc == -ENOSYS) {
2889 + memmap.nr_entries = 1;
2890 + map[0].addr = 0ULL;
2891 + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
2892 + /* 8MB slack (to balance backend allocations). */
2893 + map[0].size += 8ULL << 20;
2894 + map[0].type = E820_RAM;
2899 + nr_map = memmap.nr_entries;
2900 + sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
2902 + if (append_e820_map(map, nr_map) < 0)
2906 + if (is_initial_xendomain()) {
2907 + memmap.nr_entries = E820MAX;
2908 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
2910 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
2912 + machine_e820.nr_map = memmap.nr_entries;
2919 +void __init setup_memory_map(void)
2923 + who = memory_setup();
2925 + if (!is_initial_xendomain())
2927 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
2928 + printk(KERN_INFO "Xen-provided physical RAM map:\n");
2929 + e820_print_map(who);
2931 --- sle11-2009-06-04.orig/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:38:05.000000000 +0100
2932 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
2934 -#include <linux/kernel.h>
2935 -#include <linux/types.h>
2936 -#include <linux/init.h>
2937 -#include <linux/bootmem.h>
2938 -#include <linux/ioport.h>
2939 -#include <linux/string.h>
2940 -#include <linux/kexec.h>
2941 -#include <linux/module.h>
2942 -#include <linux/mm.h>
2943 -#include <linux/pfn.h>
2944 -#include <linux/uaccess.h>
2945 -#include <linux/suspend.h>
2947 -#include <asm/pgtable.h>
2948 -#include <asm/page.h>
2949 -#include <asm/e820.h>
2950 -#include <asm/setup.h>
2951 -#include <xen/interface/memory.h>
2953 -struct e820map e820;
2954 -struct change_member {
2955 - struct e820entry *pbios; /* pointer to original bios entry */
2956 - unsigned long long addr; /* address for this change point */
2958 -static struct change_member change_point_list[2*E820MAX] __initdata;
2959 -static struct change_member *change_point[2*E820MAX] __initdata;
2960 -static struct e820entry *overlap_list[E820MAX] __initdata;
2961 -static struct e820entry new_bios[E820MAX] __initdata;
2962 -/* For PCI or other memory-mapped resources */
2963 -unsigned long pci_mem_start = 0x10000000;
2965 -EXPORT_SYMBOL(pci_mem_start);
2967 -extern int user_defined_memmap;
2969 -static struct resource system_rom_resource = {
2970 - .name = "System ROM",
2973 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2976 -static struct resource extension_rom_resource = {
2977 - .name = "Extension ROM",
2980 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2983 -static struct resource adapter_rom_resources[] = { {
2984 - .name = "Adapter ROM",
2987 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2989 - .name = "Adapter ROM",
2992 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2994 - .name = "Adapter ROM",
2997 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2999 - .name = "Adapter ROM",
3002 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3004 - .name = "Adapter ROM",
3007 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3009 - .name = "Adapter ROM",
3012 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3015 -static struct resource video_rom_resource = {
3016 - .name = "Video ROM",
3019 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3022 -#define ROMSIGNATURE 0xaa55
3024 -static int __init romsignature(const unsigned char *rom)
3026 - const unsigned short * const ptr = (const unsigned short *)rom;
3027 - unsigned short sig;
3029 - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
3032 -static int __init romchecksum(const unsigned char *rom, unsigned long length)
3034 - unsigned char sum, c;
3036 - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
3038 - return !length && !sum;
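The option-ROM rule is that all bytes of the image sum to zero modulo 256, which is exactly what romchecksum() verifies while probing each byte safely. The arithmetic in plain C:

    #include <stdio.h>

    /* option-ROM rule: the bytes of the image must sum to 0 modulo 256 */
    static int rom_checksum_ok(const unsigned char *rom, unsigned long len)
    {
        unsigned char sum = 0;

        while (len--)
            sum += *rom++;
        return sum == 0;
    }

    int main(void)
    {
        unsigned char img[4] = { 0x55, 0xAA, 0x01, 0x00 };

        img[3] = -(0x55 + 0xAA + 0x01);	/* patch the checksum byte */
        printf("%d\n", rom_checksum_ok(img, sizeof(img)));	/* prints 1 */
        return 0;
    }
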
3041 -static void __init probe_roms(void)
3043 - const unsigned char *rom;
3044 - unsigned long start, length, upper;
3049 - /* Nothing to do if not running in dom0. */
3050 - if (!is_initial_xendomain())
3055 - upper = adapter_rom_resources[0].start;
3056 - for (start = video_rom_resource.start; start < upper; start += 2048) {
3057 - rom = isa_bus_to_virt(start);
3058 - if (!romsignature(rom))
3061 - video_rom_resource.start = start;
3063 - if (probe_kernel_address(rom + 2, c) != 0)
3066 - /* 0 < length <= 0x7f * 512, historically */
3069 - /* if checksum okay, trust length byte */
3070 - if (length && romchecksum(rom, length))
3071 - video_rom_resource.end = start + length - 1;
3073 - request_resource(&iomem_resource, &video_rom_resource);
3077 - start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
3078 - if (start < upper)
3082 - request_resource(&iomem_resource, &system_rom_resource);
3083 - upper = system_rom_resource.start;
3085 - /* check for extension rom (ignore length byte!) */
3086 - rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
3087 - if (romsignature(rom)) {
3088 - length = extension_rom_resource.end - extension_rom_resource.start + 1;
3089 - if (romchecksum(rom, length)) {
3090 - request_resource(&iomem_resource, &extension_rom_resource);
3091 - upper = extension_rom_resource.start;
3095 - /* check for adapter roms on 2k boundaries */
3096 - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
3097 - rom = isa_bus_to_virt(start);
3098 - if (!romsignature(rom))
3101 - if (probe_kernel_address(rom + 2, c) != 0)
3104 - /* 0 < length <= 0x7f * 512, historically */
3107 - /* but accept any length that fits if checksum okay */
3108 - if (!length || start + length > upper || !romchecksum(rom, length))
3111 - adapter_rom_resources[i].start = start;
3112 - adapter_rom_resources[i].end = start + length - 1;
3113 - request_resource(&iomem_resource, &adapter_rom_resources[i]);
3115 - start = adapter_rom_resources[i++].end & ~2047UL;
3120 -static struct e820map machine_e820;
3121 -#define e820 machine_e820
3125 - * Request address space for all standard RAM and ROM resources
3126 - * and also for regions reported as reserved by the e820.
3128 -void __init init_iomem_resources(struct resource *code_resource,
3129 - struct resource *data_resource,
3130 - struct resource *bss_resource)
3135 - for (i = 0; i < e820.nr_map; i++) {
3136 - struct resource *res;
3137 -#ifndef CONFIG_RESOURCES_64BIT
3138 - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
3141 - res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
3142 - switch (e820.map[i].type) {
3143 - case E820_RAM: res->name = "System RAM"; break;
3144 - case E820_ACPI: res->name = "ACPI Tables"; break;
3145 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
3146 - default: res->name = "reserved";
3148 - res->start = e820.map[i].addr;
3149 - res->end = res->start + e820.map[i].size - 1;
3150 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3151 - if (request_resource(&iomem_resource, res)) {
3155 - if (e820.map[i].type == E820_RAM) {
3157 - * We don't know which RAM region contains kernel data,
3158 - * so we try it repeatedly and let the resource manager
3162 - request_resource(res, code_resource);
3163 - request_resource(res, data_resource);
3164 - request_resource(res, bss_resource);
3166 -#ifdef CONFIG_KEXEC
3167 - if (crashk_res.start != crashk_res.end)
3168 - request_resource(res, &crashk_res);
3170 - xen_machine_kexec_register_resources(res);
3179 -#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
3181 - * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
3182 - * correspond to e820 RAM areas and mark the corresponding pages as nosave for
3185 - * This function requires the e820 map to be sorted and without any
3186 - * overlapping entries and assumes the first e820 area to be RAM.
3188 -void __init e820_mark_nosave_regions(void)
3191 - unsigned long pfn;
3193 - pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
3194 - for (i = 1; i < e820.nr_map; i++) {
3195 - struct e820entry *ei = &e820.map[i];
3197 - if (pfn < PFN_UP(ei->addr))
3198 - register_nosave_region(pfn, PFN_UP(ei->addr));
3200 - pfn = PFN_DOWN(ei->addr + ei->size);
3201 - if (ei->type != E820_RAM)
3202 - register_nosave_region(PFN_UP(ei->addr), pfn);
3204 - if (pfn >= max_low_pfn)
3210 -void __init add_memory_region(unsigned long long start,
3211 - unsigned long long size, int type)
3217 - if (x == E820MAX) {
3218 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
3222 - e820.map[x].addr = start;
3223 - e820.map[x].size = size;
3224 - e820.map[x].type = type;
3226 -} /* add_memory_region */
3229 - * Sanitize the BIOS e820 map.
3231 - * Some e820 responses include overlapping entries. The following
3232 - * replaces the original e820 map with a new one, removing overlaps.
3235 -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
3237 - struct change_member *change_tmp;
3238 - unsigned long current_type, last_type;
3239 - unsigned long long last_addr;
3240 - int chgidx, still_changing;
3241 - int overlap_entries;
3242 - int new_bios_entry;
3243 - int old_nr, new_nr, chg_nr;
3247 - Visually we're performing the following (1,2,3,4 = memory types)...
3249 - Sample memory map (w/overlaps):
3250 - ____22__________________
3251 - ______________________4_
3252 - ____1111________________
3253 - _44_____________________
3254 - 11111111________________
3255 - ____________________33__
3256 - ___________44___________
3257 - __________33333_________
3258 - ______________22________
3259 - ___________________2222_
3260 - _________111111111______
3261 - _____________________11_
3262 - _________________4______
3264 - Sanitized equivalent (no overlap):
3265 - 1_______________________
3266 - _44_____________________
3267 - ___1____________________
3268 - ____22__________________
3269 - ______11________________
3270 - _________1______________
3271 - __________3_____________
3272 - ___________44___________
3273 - _____________33_________
3274 - _______________2________
3275 - ________________1_______
3276 - _________________4______
3277 - ___________________2____
3278 - ____________________33__
3279 - ______________________4_
3281 - /* if there's only one memory region, don't bother */
3282 - if (*pnr_map < 2) {
3286 - old_nr = *pnr_map;
3288 - /* bail out if we find any unreasonable addresses in bios map */
3289 - for (i=0; i<old_nr; i++)
3290 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
3294 - /* create pointers for initial change-point information (for sorting) */
3295 - for (i=0; i < 2*old_nr; i++)
3296 - change_point[i] = &change_point_list[i];
3298 - /* record all known change-points (starting and ending addresses),
3299 - omitting those that are for empty memory regions */
3301 - for (i=0; i < old_nr; i++) {
3302 - if (biosmap[i].size != 0) {
3303 - change_point[chgidx]->addr = biosmap[i].addr;
3304 - change_point[chgidx++]->pbios = &biosmap[i];
3305 - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
3306 - change_point[chgidx++]->pbios = &biosmap[i];
3309 - chg_nr = chgidx; /* true number of change-points */
3311 - /* sort change-point list by memory addresses (low -> high) */
3312 - still_changing = 1;
3313 - while (still_changing) {
3314 - still_changing = 0;
3315 - for (i=1; i < chg_nr; i++) {
3316 - /* if <current_addr> > <last_addr>, swap */
3317 - /* or, if current=<start_addr> & last=<end_addr>, swap */
3318 - if ((change_point[i]->addr < change_point[i-1]->addr) ||
3319 - ((change_point[i]->addr == change_point[i-1]->addr) &&
3320 - (change_point[i]->addr == change_point[i]->pbios->addr) &&
3321 - (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
3324 - change_tmp = change_point[i];
3325 - change_point[i] = change_point[i-1];
3326 - change_point[i-1] = change_tmp;
3332 - /* create a new bios memory map, removing overlaps */
3333 - overlap_entries=0; /* number of entries in the overlap table */
3334 - new_bios_entry=0; /* index for creating new bios map entries */
3335 - last_type = 0; /* start with undefined memory type */
3336 - last_addr = 0; /* start with 0 as last starting address */
3337 - /* loop through change-points, determining affect on the new bios map */
3338 - for (chgidx=0; chgidx < chg_nr; chgidx++)
3340 - /* keep track of all overlapping bios entries */
3341 - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
3343 - /* add map entry to overlap list (> 1 entry implies an overlap) */
3344 - overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
3348 - /* remove entry from list (order independent, so swap with last) */
3349 - for (i=0; i<overlap_entries; i++)
3351 - if (overlap_list[i] == change_point[chgidx]->pbios)
3352 - overlap_list[i] = overlap_list[overlap_entries-1];
3354 - overlap_entries--;
3356 - /* if there are overlapping entries, decide which "type" to use */
3357 - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
3359 - for (i=0; i<overlap_entries; i++)
3360 - if (overlap_list[i]->type > current_type)
3361 - current_type = overlap_list[i]->type;
3362 - /* continue building up new bios map based on this information */
3363 - if (current_type != last_type) {
3364 - if (last_type != 0) {
3365 - new_bios[new_bios_entry].size =
3366 - change_point[chgidx]->addr - last_addr;
3367 - /* move forward only if the new size was non-zero */
3368 - if (new_bios[new_bios_entry].size != 0)
3369 - if (++new_bios_entry >= E820MAX)
3370 - break; /* no more space left for new bios entries */
3372 - if (current_type != 0) {
3373 - new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
3374 - new_bios[new_bios_entry].type = current_type;
3375 - last_addr=change_point[chgidx]->addr;
3377 - last_type = current_type;
3380 - new_nr = new_bios_entry; /* retain count for new bios entries */
3382 - /* copy new bios mapping into original location */
3383 - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
3384 - *pnr_map = new_nr;
3390 - * Copy the BIOS e820 map into a safe place.
3392 - * Sanity-check it while we're at it..
3394 - * If we're lucky and live on a modern system, the setup code
3395 - * will have given us a memory map that we can use to properly
3396 - * set up memory. If we aren't, we'll fake a memory map.
3398 - * We check to see that the memory map contains at least 2 elements
3399 - * before we'll use it, because the detection code in setup.S may
3400 - * not be perfect and most every PC known to man has two memory
3401 - * regions: one from 0 to 640k, and one from 1mb up. (The IBM
3402 - * thinkpad 560x, for example, does not cooperate with the memory
3403 - * detection code.)
3405 -int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
3408 - /* Only one memory region (or negative)? Ignore it */
3412 - BUG_ON(nr_map < 1);
3416 - u64 start = biosmap->addr;
3417 - u64 size = biosmap->size;
3418 - u64 end = start + size;
3419 - u32 type = biosmap->type;
3421 - /* Overflow in 64 bits? Ignore the memory map. */
3425 - add_memory_region(start, size, type);
3426 - } while (biosmap++, --nr_map);
3429 - if (is_initial_xendomain()) {
3430 - struct xen_memory_map memmap;
3432 - memmap.nr_entries = E820MAX;
3433 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3435 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3437 - machine_e820.nr_map = memmap.nr_entries;
3439 - machine_e820 = e820;
3446 - * Find the highest page frame number we have available
3448 -void __init propagate_e820_map(void)
3454 - for (i = 0; i < e820.nr_map; i++) {
3455 - unsigned long start, end;
3457 - if (e820.map[i].type != E820_RAM)
3459 - start = PFN_UP(e820.map[i].addr);
3460 - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3463 - if (end > max_pfn)
3465 - memory_present(0, start, end);
3470 - * Register fully available low RAM pages with the bootmem allocator.
3472 -void __init register_bootmem_low_pages(unsigned long max_low_pfn)
3476 - for (i = 0; i < e820.nr_map; i++) {
3477 - unsigned long curr_pfn, last_pfn, size;
3479 - * Reserve usable low memory
3481 - if (e820.map[i].type != E820_RAM)
3484 - * We are rounding up the start address of usable memory:
3486 - curr_pfn = PFN_UP(e820.map[i].addr);
3487 - if (curr_pfn >= max_low_pfn)
3490 - * ... and at the end of the usable range downwards:
3492 - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3496 - * Truncate to the number of actual pages currently
3499 - if (last_pfn > xen_start_info->nr_pages)
3500 - last_pfn = xen_start_info->nr_pages;
3503 - if (last_pfn > max_low_pfn)
3504 - last_pfn = max_low_pfn;
3507 - * .. finally, did all the rounding and playing
3508 - * around just make the area go away?
3510 - if (last_pfn <= curr_pfn)
3513 - size = last_pfn - curr_pfn;
3514 - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
3518 -void __init e820_register_memory(void)
3520 - unsigned long gapstart, gapsize, round;
3521 - unsigned long long last;
3525 - if (is_initial_xendomain()) {
3526 - struct xen_memory_map memmap;
3528 - memmap.nr_entries = E820MAX;
3529 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3531 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3533 - machine_e820.nr_map = memmap.nr_entries;
3536 - machine_e820 = e820;
3537 -#define e820 machine_e820
3541 - * Search for the biggest gap in the low 32 bits of the e820
3544 - last = 0x100000000ull;
3545 - gapstart = 0x10000000;
3546 - gapsize = 0x400000;
3548 - while (--i >= 0) {
3549 - unsigned long long start = e820.map[i].addr;
3550 - unsigned long long end = start + e820.map[i].size;
3553 - * Since "last" is at most 4GB, we know we'll
3554 - * fit in 32 bits if this condition is true
3557 - unsigned long gap = last - end;
3559 - if (gap > gapsize) {
3570 - * See how much we want to round up: start off with
3571 - * rounding to the next 1MB area.
3574 - while ((gapsize >> 4) > round)
3576 - /* Fun with two's complement */
3577 - pci_mem_start = (gapstart + round) & -round;
3579 - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
3580 - pci_mem_start, gapstart, gapsize);
3583 -void __init print_memory_map(char *who)
3587 - for (i = 0; i < e820.nr_map; i++) {
3588 - printk(" %s: %016Lx - %016Lx ", who,
3590 - e820.map[i].addr + e820.map[i].size);
3591 - switch (e820.map[i].type) {
3592 - case E820_RAM: printk("(usable)\n");
3594 - case E820_RESERVED:
3595 - printk("(reserved)\n");
3598 - printk("(ACPI data)\n");
3601 - printk("(ACPI NVS)\n");
3603 - default: printk("type %u\n", e820.map[i].type);
3609 -void __init limit_regions(unsigned long long size)
3611 - unsigned long long current_addr = 0;
3614 - print_memory_map("limit_regions start");
3615 - for (i = 0; i < e820.nr_map; i++) {
3616 - current_addr = e820.map[i].addr + e820.map[i].size;
3617 - if (current_addr < size)
3620 - if (e820.map[i].type != E820_RAM)
3623 - if (e820.map[i].addr >= size) {
3625 - * This region starts past the end of the
3626 - * requested size, skip it completely.
3630 - e820.nr_map = i + 1;
3631 - e820.map[i].size -= current_addr - size;
3633 - print_memory_map("limit_regions endfor");
3637 - if (current_addr < size) {
3639 - * The e820 map finished before our requested size so
3640 - * extend the final entry to the requested address.
3643 - if (e820.map[i].type == E820_RAM)
3644 - e820.map[i].size -= current_addr - size;
3646 - add_memory_region(current_addr, size - current_addr, E820_RAM);
3649 - print_memory_map("limit_regions endfunc");
3653 - * This function checks if any part of the range <start,end> is mapped
3657 -e820_any_mapped(u64 start, u64 end, unsigned type)
3662 - for (i = 0; i < e820.nr_map; i++) {
3663 - const struct e820entry *ei = &e820.map[i];
3665 - if (!is_initial_xendomain())
3667 - for (i = 0; i < machine_e820.nr_map; ++i) {
3668 - const struct e820entry *ei = &machine_e820.map[i];
3671 - if (type && ei->type != type)
3673 - if (ei->addr >= end || ei->addr + ei->size <= start)
3679 -EXPORT_SYMBOL_GPL(e820_any_mapped);
3682 - * This function checks if the entire range <start,end> is mapped with type.
3684 - * Note: this function only works correct if the e820 table is sorted and
3685 - * not-overlapping, which is the case
3688 -e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
3695 - for (i = 0; i < e820.nr_map; i++) {
3696 - struct e820entry *ei = &e820.map[i];
3698 - if (!is_initial_xendomain())
3700 - for (i = 0; i < machine_e820.nr_map; ++i) {
3701 - const struct e820entry *ei = &machine_e820.map[i];
3704 - if (type && ei->type != type)
3706 - /* is the region (part) in overlap with the current region ?*/
3707 - if (ei->addr >= end || ei->addr + ei->size <= start)
3709 - /* if the region is at the beginning of <start,end> we move
3710 - * start to the end of the region since it's ok until there
3712 - if (ei->addr <= start)
3713 - start = ei->addr + ei->size;
3714 - /* if start is now at or beyond end, we're done, full
3717 - return 1; /* we're done */
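e820_all_mapped() relies on the table being sorted: whenever an entry covers the current start, start jumps to that entry's end, and the range is fully mapped once start reaches end. The same loop over an ordinary array:

    #include <stdio.h>

    typedef unsigned long long u64;

    struct range { u64 start, end; };

    /* sorted, non-overlapping ranges: is [s, e) fully covered? */
    static int all_covered(const struct range *r, int n, u64 s, u64 e)
    {
        for (int i = 0; i < n; i++) {
            if (r[i].start >= e || r[i].end <= s)
                continue;	/* no overlap with this entry */
            if (r[i].start <= s)
                s = r[i].end;	/* covered up to here */
            if (s >= e)
                return 1;
        }
        return 0;
    }

    int main(void)
    {
        struct range ram[] = { { 0x0, 0x1000 }, { 0x1000, 0x5000 } };

        /* prints: 1 0 */
        printf("%d %d\n", all_covered(ram, 2, 0x800, 0x4000),
               all_covered(ram, 2, 0x800, 0x6000));
        return 0;
    }
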
3722 -static int __init parse_memmap(char *arg)
3727 - if (strcmp(arg, "exactmap") == 0) {
3728 -#ifdef CONFIG_CRASH_DUMP
3729 - /* If we are doing a crash dump, we
3730 - * still need to know the real mem
3731 - * size before original memory map is
3734 - propagate_e820_map();
3735 - saved_max_pfn = max_pfn;
3738 - user_defined_memmap = 1;
3740 - /* If the user specifies memory size, we
3741 - * limit the BIOS-provided memory map to
3742 - * that size. exactmap can be used to specify
3743 - * the exact map. mem=number can be used to
3744 - * trim the existing memory map.
3746 - unsigned long long start_at, mem_size;
3748 - mem_size = memparse(arg, &arg);
3749 - if (*arg == '@') {
3750 - start_at = memparse(arg+1, &arg);
3751 - add_memory_region(start_at, mem_size, E820_RAM);
3752 - } else if (*arg == '#') {
3753 - start_at = memparse(arg+1, &arg);
3754 - add_memory_region(start_at, mem_size, E820_ACPI);
3755 - } else if (*arg == '$') {
3756 - start_at = memparse(arg+1, &arg);
3757 - add_memory_region(start_at, mem_size, E820_RESERVED);
3759 - limit_regions(mem_size);
3760 - user_defined_memmap = 1;
3765 -early_param("memmap", parse_memmap);
3768 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
3769 - unsigned new_type)
3773 - BUG_ON(old_type == new_type);
3775 - for (i = 0; i < e820.nr_map; i++) {
3776 - struct e820entry *ei = &e820.map[i];
3777 - u64 final_start, final_end;
3778 - if (ei->type != old_type)
3780 - /* totally covered? */
3781 - if (ei->addr >= start && ei->size <= size) {
3782 - ei->type = new_type;
3785 - /* partially covered */
3786 - final_start = max(start, ei->addr);
3787 - final_end = min(start + size, ei->addr + ei->size);
3788 - if (final_start >= final_end)
3790 - add_memory_region(final_start, final_end - final_start,
3795 -void __init update_e820(void)
3799 - nr_map = e820.nr_map;
3800 - if (sanitize_e820_map(e820.map, &nr_map))
3802 - e820.nr_map = nr_map;
3803 - printk(KERN_INFO "modified physical RAM map:\n");
3804 - print_memory_map("modified");
3807 --- sle11-2009-06-04.orig/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:38:05.000000000 +0100
3808 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3811 - * Handle the memory map.
3812 - * The functions here do the job until bootmem takes over.
3814 - * Getting sanitize_e820_map() in sync with i386 version by applying change:
3815 - * - Provisions for empty E820 memory regions (reported by certain BIOSes).
3816 - * Alex Achenbach <xela@slit.de>, December 2002.
3817 - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
3820 -#include <linux/kernel.h>
3821 -#include <linux/types.h>
3822 -#include <linux/init.h>
3823 -#include <linux/bootmem.h>
3824 -#include <linux/ioport.h>
3825 -#include <linux/string.h>
3826 -#include <linux/kexec.h>
3827 -#include <linux/module.h>
3828 -#include <linux/mm.h>
3829 -#include <linux/suspend.h>
3830 -#include <linux/pfn.h>
3832 -#include <asm/pgtable.h>
3833 -#include <asm/page.h>
3834 -#include <asm/e820.h>
3835 -#include <asm/proto.h>
3836 -#include <asm/setup.h>
3837 -#include <asm/sections.h>
3838 -#include <asm/kdebug.h>
3839 -#include <xen/interface/memory.h>
3841 -struct e820map e820 __initdata;
3843 -struct e820map machine_e820;
3847 - * PFN of last memory page.
3849 -unsigned long end_pfn;
3852 - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
3853 - * The direct mapping extends to max_pfn_mapped, so that we can directly access
3854 - * apertures, ACPI and other tables without having to play with fixmaps.
3856 -unsigned long max_pfn_mapped;
3859 - * Last pfn which the user wants to use.
3861 -static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
3864 - * Early reserved memory areas.
3866 -#define MAX_EARLY_RES 20
3869 - unsigned long start, end;
3872 -static struct early_res early_res[MAX_EARLY_RES] __initdata = {
3874 - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
3875 -#ifdef CONFIG_X86_TRAMPOLINE
3876 - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
3882 -void __init reserve_early(unsigned long start, unsigned long end, char *name)
3885 - struct early_res *r;
3886 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3887 - r = &early_res[i];
3888 - if (end > r->start && start < r->end)
3889 - panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
3890 - start, end - 1, name?name:"", r->start, r->end - 1, r->name);
3892 - if (i >= MAX_EARLY_RES)
3893 - panic("Too many early reservations");
3894 - r = &early_res[i];
3898 - strncpy(r->name, name, sizeof(r->name) - 1);
3901 -void __init free_early(unsigned long start, unsigned long end)
3903 - struct early_res *r;
3906 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3907 - r = &early_res[i];
3908 - if (start == r->start && end == r->end)
3911 - if (i >= MAX_EARLY_RES || !early_res[i].end)
3912 - panic("free_early on not reserved area: %lx-%lx!", start, end);
3914 - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
3917 - memmove(&early_res[i], &early_res[i + 1],
3918 - (j - 1 - i) * sizeof(struct early_res));
3920 - early_res[j - 1].end = 0;
3923 -void __init early_res_to_bootmem(unsigned long start, unsigned long end)
3926 - unsigned long final_start, final_end;
3927 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3928 - struct early_res *r = &early_res[i];
3929 - final_start = max(start, r->start);
3930 - final_end = min(end, r->end);
3931 - if (final_start >= final_end)
3933 - printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
3934 - final_start, final_end - 1, r->name);
3935 - reserve_bootmem_generic(final_start, final_end - final_start);
3939 -/* Check for already reserved areas */
3940 -static inline int __init
3941 -bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
3944 - unsigned long addr = *addrp, last;
3947 - last = addr + size;
3948 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3949 - struct early_res *r = &early_res[i];
3950 - if (last >= r->start && addr < r->end) {
3951 - *addrp = addr = round_up(r->end, align);
3959 -/* Check for already reserved areas */
3960 -static inline int __init
3961 -bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
3964 - unsigned long addr = *addrp, last;
3965 - unsigned long size = *sizep;
3968 - last = addr + size;
3969 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3970 - struct early_res *r = &early_res[i];
3971 - if (last > r->start && addr < r->start) {
3972 - size = r->start - addr;
3976 - if (last > r->end && addr < r->end) {
3977 - addr = round_up(r->end, align);
3978 - size = last - addr;
3982 - if (last <= r->end && addr >= r->start) {
3994 - * This function checks if any part of the range <start,end> is mapped
3998 -e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
4003 - for (i = 0; i < e820.nr_map; i++) {
4004 - struct e820entry *ei = &e820.map[i];
4006 - if (!is_initial_xendomain())
4008 - for (i = 0; i < machine_e820.nr_map; i++) {
4009 - const struct e820entry *ei = &machine_e820.map[i];
4012 - if (type && ei->type != type)
4014 - if (ei->addr >= end || ei->addr + ei->size <= start)
4020 -EXPORT_SYMBOL_GPL(e820_any_mapped);
4023 - * This function checks if the entire range <start,end> is mapped with type.
4025 - * Note: this function only works correct if the e820 table is sorted and
4026 - * not-overlapping, which is the case
4028 -int __init e820_all_mapped(unsigned long start, unsigned long end,
4034 - for (i = 0; i < e820.nr_map; i++) {
4035 - struct e820entry *ei = &e820.map[i];
4037 - if (!is_initial_xendomain())
4039 - for (i = 0; i < machine_e820.nr_map; i++) {
4040 - const struct e820entry *ei = &machine_e820.map[i];
4043 - if (type && ei->type != type)
4045 - /* is the region (part) in overlap with the current region ?*/
4046 - if (ei->addr >= end || ei->addr + ei->size <= start)
4049 - /* if the region is at the beginning of <start,end> we move
4050 - * start to the end of the region since it's ok until there
4052 - if (ei->addr <= start)
4053 - start = ei->addr + ei->size;
4055 - * if start is now at or beyond end, we're done, full
4065 - * Find a free area with specified alignment in a specific range.
4067 -unsigned long __init find_e820_area(unsigned long start, unsigned long end,
4068 - unsigned long size, unsigned long align)
4072 - for (i = 0; i < e820.nr_map; i++) {
4073 - struct e820entry *ei = &e820.map[i];
4074 - unsigned long addr, last;
4075 - unsigned long ei_last;
4077 - if (ei->type != E820_RAM)
4079 - addr = round_up(ei->addr, align);
4080 - ei_last = ei->addr + ei->size;
4082 - addr = round_up(start, align);
4083 - if (addr >= ei_last)
4085 - while (bad_addr(&addr, size, align) && addr+size <= ei_last)
4087 - last = addr + size;
4088 - if (last > ei_last)
4098 - * Find next free range after *start
4100 -unsigned long __init find_e820_area_size(unsigned long start,
4101 - unsigned long *sizep,
4102 - unsigned long align)
4106 - for (i = 0; i < e820.nr_map; i++) {
4107 - struct e820entry *ei = &e820.map[i];
4108 - unsigned long addr, last;
4109 - unsigned long ei_last;
4111 - if (ei->type != E820_RAM)
4113 - addr = round_up(ei->addr, align);
4114 - ei_last = ei->addr + ei->size;
4116 - addr = round_up(start, align);
4117 - if (addr >= ei_last)
4119 - *sizep = ei_last - addr;
4120 - while (bad_addr_size(&addr, sizep, align) &&
4121 - addr + *sizep <= ei_last)
4123 - last = addr + *sizep;
4124 - if (last > ei_last)
4132 - * Find the highest page frame number we have available
4134 -unsigned long __init e820_end_of_ram(void)
4136 - unsigned long end_pfn;
4138 - end_pfn = find_max_pfn_with_active_regions();
4140 - if (end_pfn > max_pfn_mapped)
4141 - max_pfn_mapped = end_pfn;
4142 - if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
4143 - max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
4144 - if (end_pfn > end_user_pfn)
4145 - end_pfn = end_user_pfn;
4146 - if (end_pfn > max_pfn_mapped)
4147 - end_pfn = max_pfn_mapped;
4149 - printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
4154 - * Mark e820 reserved areas as busy for the resource manager.
4156 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
4159 - struct resource *res;
4161 - res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
4162 - for (i = 0; i < nr_map; i++) {
4163 - switch (e820[i].type) {
4164 - case E820_RAM: res->name = "System RAM"; break;
4165 - case E820_ACPI: res->name = "ACPI Tables"; break;
4166 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
4167 - default: res->name = "reserved";
4169 - res->start = e820[i].addr;
4170 - res->end = res->start + e820[i].size - 1;
4171 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4172 - insert_resource(&iomem_resource, res);
4179 - * Find the ranges of physical addresses that do not correspond to
4180 - * e820 RAM areas and mark the corresponding pages as nosave for software
4181 - * suspend and suspend to RAM.
4183 - * This function requires the e820 map to be sorted and without any
4184 - * overlapping entries and assumes the first e820 area to be RAM.
4186 -void __init e820_mark_nosave_regions(void)
4189 - unsigned long paddr;
4191 - paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
4192 - for (i = 1; i < e820.nr_map; i++) {
4193 - struct e820entry *ei = &e820.map[i];
4195 - if (paddr < ei->addr)
4196 - register_nosave_region(PFN_DOWN(paddr),
4197 - PFN_UP(ei->addr));
4199 - paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
4200 - if (ei->type != E820_RAM)
4201 - register_nosave_region(PFN_UP(ei->addr),
4204 - if (paddr >= (end_pfn << PAGE_SHIFT))
4211 - * Finds an active region in the address range from start_pfn to end_pfn and
4212 - * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
4214 -static int __init e820_find_active_region(const struct e820entry *ei,
4215 - unsigned long start_pfn,
4216 - unsigned long end_pfn,
4217 - unsigned long *ei_startpfn,
4218 - unsigned long *ei_endpfn)
4220 - *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
4221 - *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
4223 - /* Skip map entries smaller than a page */
4224 - if (*ei_startpfn >= *ei_endpfn)
4227 - /* Check if max_pfn_mapped should be updated */
4228 - if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
4229 - max_pfn_mapped = *ei_endpfn;
4231 - /* Skip if map is outside the node */
4232 - if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
4233 - *ei_startpfn >= end_pfn)
4236 - /* Check for overlaps */
4237 - if (*ei_startpfn < start_pfn)
4238 - *ei_startpfn = start_pfn;
4239 - if (*ei_endpfn > end_pfn)
4240 - *ei_endpfn = end_pfn;
4242 - /* Obey end_user_pfn to save on memmap */
4243 - if (*ei_startpfn >= end_user_pfn)
4245 - if (*ei_endpfn > end_user_pfn)
4246 - *ei_endpfn = end_user_pfn;
4251 -/* Walk the e820 map and register active regions within a node */
4253 -e820_register_active_regions(int nid, unsigned long start_pfn,
4254 - unsigned long end_pfn)
4256 - unsigned long ei_startpfn;
4257 - unsigned long ei_endpfn;
4260 - for (i = 0; i < e820.nr_map; i++)
4261 - if (e820_find_active_region(&e820.map[i],
4262 - start_pfn, end_pfn,
4263 - &ei_startpfn, &ei_endpfn))
4264 - add_active_range(nid, ei_startpfn, ei_endpfn);
4268 - * Add a memory region to the kernel e820 map.
4270 -void __init add_memory_region(unsigned long start, unsigned long size, int type)
4272 - int x = e820.nr_map;
4274 - if (x == E820MAX) {
4275 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
4279 - e820.map[x].addr = start;
4280 - e820.map[x].size = size;
4281 - e820.map[x].type = type;
4286 - * Find the hole size (in bytes) in the memory range.
4287 - * @start: starting address of the memory range to scan
4288 - * @end: ending address of the memory range to scan
4290 -unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
4292 - unsigned long start_pfn = start >> PAGE_SHIFT;
4293 - unsigned long end_pfn = end >> PAGE_SHIFT;
4294 - unsigned long ei_startpfn, ei_endpfn, ram = 0;
4297 - for (i = 0; i < e820.nr_map; i++) {
4298 - if (e820_find_active_region(&e820.map[i],
4299 - start_pfn, end_pfn,
4300 - &ei_startpfn, &ei_endpfn))
4301 - ram += ei_endpfn - ei_startpfn;
4303 - return end - start - (ram << PAGE_SHIFT);
4306 -static void __init e820_print_map(char *who)
4310 - for (i = 0; i < e820.nr_map; i++) {
4311 - printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
4312 - (unsigned long long) e820.map[i].addr,
4313 - (unsigned long long)
4314 - (e820.map[i].addr + e820.map[i].size));
4315 - switch (e820.map[i].type) {
4317 - printk(KERN_CONT "(usable)\n");
4319 - case E820_RESERVED:
4320 - printk(KERN_CONT "(reserved)\n");
4323 - printk(KERN_CONT "(ACPI data)\n");
4326 - printk(KERN_CONT "(ACPI NVS)\n");
4329 - printk(KERN_CONT "type %u\n", e820.map[i].type);
4336 - * Sanitize the BIOS e820 map.
4338 - * Some e820 responses include overlapping entries. The following
4339 - * replaces the original e820 map with a new one, removing overlaps.
4342 -static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
4344 - struct change_member {
4345 - struct e820entry *pbios; /* pointer to original bios entry */
4346 - unsigned long long addr; /* address for this change point */
4348 - static struct change_member change_point_list[2*E820MAX] __initdata;
4349 - static struct change_member *change_point[2*E820MAX] __initdata;
4350 - static struct e820entry *overlap_list[E820MAX] __initdata;
4351 - static struct e820entry new_bios[E820MAX] __initdata;
4352 - struct change_member *change_tmp;
4353 - unsigned long current_type, last_type;
4354 - unsigned long long last_addr;
4355 - int chgidx, still_changing;
4356 - int overlap_entries;
4357 - int new_bios_entry;
4358 - int old_nr, new_nr, chg_nr;
4362 - Visually we're performing the following
4363 - (1,2,3,4 = memory types)...
4365 - Sample memory map (w/overlaps):
4366 - ____22__________________
4367 - ______________________4_
4368 - ____1111________________
4369 - _44_____________________
4370 - 11111111________________
4371 - ____________________33__
4372 - ___________44___________
4373 - __________33333_________
4374 - ______________22________
4375 - ___________________2222_
4376 - _________111111111______
4377 - _____________________11_
4378 - _________________4______
4380 - Sanitized equivalent (no overlap):
4381 - 1_______________________
4382 - _44_____________________
4383 - ___1____________________
4384 - ____22__________________
4385 - ______11________________
4386 - _________1______________
4387 - __________3_____________
4388 - ___________44___________
4389 - _____________33_________
4390 - _______________2________
4391 - ________________1_______
4392 - _________________4______
4393 - ___________________2____
4394 - ____________________33__
4395 - ______________________4_
4398 - /* if there's only one memory region, don't bother */
4402 - old_nr = *pnr_map;
4404 - /* bail out if we find any unreasonable addresses in bios map */
4405 - for (i = 0; i < old_nr; i++)
4406 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
4409 - /* create pointers for initial change-point information (for sorting) */
4410 - for (i = 0; i < 2 * old_nr; i++)
4411 - change_point[i] = &change_point_list[i];
4413 - /* record all known change-points (starting and ending addresses),
4414 - omitting those that are for empty memory regions */
4416 - for (i = 0; i < old_nr; i++) {
4417 - if (biosmap[i].size != 0) {
4418 - change_point[chgidx]->addr = biosmap[i].addr;
4419 - change_point[chgidx++]->pbios = &biosmap[i];
4420 - change_point[chgidx]->addr = biosmap[i].addr +
4422 - change_point[chgidx++]->pbios = &biosmap[i];
4427 - /* sort change-point list by memory addresses (low -> high) */
4428 - still_changing = 1;
4429 - while (still_changing) {
4430 - still_changing = 0;
4431 - for (i = 1; i < chg_nr; i++) {
4432 - unsigned long long curaddr, lastaddr;
4433 - unsigned long long curpbaddr, lastpbaddr;
4435 - curaddr = change_point[i]->addr;
4436 - lastaddr = change_point[i - 1]->addr;
4437 - curpbaddr = change_point[i]->pbios->addr;
4438 - lastpbaddr = change_point[i - 1]->pbios->addr;
4441 - * swap entries, when:
4443 - * curaddr < lastaddr or
4444 - * curaddr == lastaddr and curaddr == curpbaddr and
4445 - * lastaddr != lastpbaddr
4447 - if (curaddr < lastaddr ||
4448 - (curaddr == lastaddr && curaddr == curpbaddr &&
4449 - lastaddr != lastpbaddr)) {
4450 - change_tmp = change_point[i];
4451 - change_point[i] = change_point[i-1];
4452 - change_point[i-1] = change_tmp;
4453 - still_changing = 1;
4458 - /* create a new bios memory map, removing overlaps */
4459 - overlap_entries = 0; /* number of entries in the overlap table */
4460 - new_bios_entry = 0; /* index for creating new bios map entries */
4461 - last_type = 0; /* start with undefined memory type */
4462 - last_addr = 0; /* start with 0 as last starting address */
4464 - /* loop through change-points, determining the effect on the new bios map */
4465 - for (chgidx = 0; chgidx < chg_nr; chgidx++) {
4466 - /* keep track of all overlapping bios entries */
4467 - if (change_point[chgidx]->addr ==
4468 - change_point[chgidx]->pbios->addr) {
4470 - * add map entry to overlap list (> 1 entry
4471 - * implies an overlap)
4473 - overlap_list[overlap_entries++] =
4474 - change_point[chgidx]->pbios;
4477 - * remove entry from list (order independent,
4478 - * so swap with last)
4480 - for (i = 0; i < overlap_entries; i++) {
4481 - if (overlap_list[i] ==
4482 - change_point[chgidx]->pbios)
4484 - overlap_list[overlap_entries-1];
4486 - overlap_entries--;
4489 - * if there are overlapping entries, decide which
4490 - * "type" to use (larger value takes precedence --
4491 - * 1=usable, 2,3,4,4+=unusable)
4494 - for (i = 0; i < overlap_entries; i++)
4495 - if (overlap_list[i]->type > current_type)
4496 - current_type = overlap_list[i]->type;
4498 - * continue building up new bios map based on this
4501 - if (current_type != last_type) {
4502 - if (last_type != 0) {
4503 - new_bios[new_bios_entry].size =
4504 - change_point[chgidx]->addr - last_addr;
4506 - * move forward only if the new size
4509 - if (new_bios[new_bios_entry].size != 0)
4511 - * no more space left for new
4514 - if (++new_bios_entry >= E820MAX)
4517 - if (current_type != 0) {
4518 - new_bios[new_bios_entry].addr =
4519 - change_point[chgidx]->addr;
4520 - new_bios[new_bios_entry].type = current_type;
4521 - last_addr = change_point[chgidx]->addr;
4523 - last_type = current_type;
4526 - /* retain count for new bios entries */
4527 - new_nr = new_bios_entry;
4529 - /* copy new bios mapping into original location */
4530 - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
4531 - *pnr_map = new_nr;
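
[ The change-point sweep being removed here is easier to see in isolation.
  Below is a minimal re-statement, not kernel code: types, names and the
  MAX limit are invented, and qsort() stands in for the in-place bubble
  sort. Starts sort before ends at equal addresses, matching the swap
  condition above. ]

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct ent { unsigned long long addr, size; unsigned type; };
	struct chg { unsigned long long addr; const struct ent *e; int is_start; };

	static int cmp(const void *a, const void *b)
	{
		const struct chg *x = a, *y = b;

		if (x->addr != y->addr)
			return x->addr < y->addr ? -1 : 1;
		return y->is_start - x->is_start;	/* starts before ends */
	}

	/* Rewrite map[] so entries are disjoint; returns the new count. */
	static int sanitize(struct ent *map, int n)
	{
		enum { MAX = 128 };
		struct chg cp[2 * MAX];
		const struct ent *ovl[MAX];
		struct ent out[MAX];
		int i, j, ncp = 0, novl = 0, nout = 0;
		unsigned cur, last = 0;
		unsigned long long last_addr = 0;

		for (i = 0; i < n; i++) {
			if (!map[i].size)
				continue;
			cp[ncp].addr = map[i].addr;
			cp[ncp].e = &map[i];
			cp[ncp++].is_start = 1;
			cp[ncp].addr = map[i].addr + map[i].size;
			cp[ncp].e = &map[i];
			cp[ncp++].is_start = 0;
		}
		qsort(cp, ncp, sizeof(*cp), cmp);

		for (i = 0; i < ncp; i++) {
			if (cp[i].is_start) {
				ovl[novl++] = cp[i].e;	/* >1 here: overlap */
			} else {
				for (j = 0; j < novl; j++)
					if (ovl[j] == cp[i].e)
						ovl[j] = ovl[--novl];
			}
			cur = 0;			/* highest type wins */
			for (j = 0; j < novl; j++)
				if (ovl[j]->type > cur)
					cur = ovl[j]->type;
			if (cur == last)
				continue;
			if (last) {
				out[nout - 1].size = cp[i].addr - last_addr;
				if (!out[nout - 1].size)
					nout--;		/* drop empty entries */
			}
			if (cur) {
				out[nout].addr = cp[i].addr;
				out[nout].size = 0;	/* closed at next change */
				out[nout++].type = cur;
				last_addr = cp[i].addr;
			}
			last = cur;
		}
		memcpy(map, out, nout * sizeof(*out));
		return nout;
	}

	int main(void)
	{
		struct ent map[] = {
			{ 0x00000, 0x20000, 1 },	/* RAM ...             */
			{ 0x10000, 0x05000, 3 },	/* ... overlapped, ACPI */
		};
		int i, n = sanitize(map, 2);

		for (i = 0; i < n; i++)
			printf("%#llx +%#llx type %u\n",
			       map[i].addr, map[i].size, map[i].type);
		return 0;
	}
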
4537 - * Copy the BIOS e820 map into a safe place.
4539 - * Sanity-check it while we're at it..
4541 - * If we're lucky and live on a modern system, the setup code
4542 - * will have given us a memory map that we can use to properly
4543 - * set up memory. If we aren't, we'll fake a memory map.
4545 -static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
4548 - /* Only one memory region (or negative)? Ignore it */
4552 - BUG_ON(nr_map < 1);
4556 - u64 start = biosmap->addr;
4557 - u64 size = biosmap->size;
4558 - u64 end = start + size;
4559 - u32 type = biosmap->type;
4561 - /* Overflow in 64 bits? Ignore the memory map. */
4565 - add_memory_region(start, size, type);
4566 - } while (biosmap++, --nr_map);
4569 - if (is_initial_xendomain()) {
4570 - struct xen_memory_map memmap;
4572 - memmap.nr_entries = E820MAX;
4573 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
4575 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
4577 - machine_e820.nr_map = memmap.nr_entries;
4579 - machine_e820 = e820;
4585 -static void early_panic(char *msg)
4587 - early_printk(msg);
4591 -/* We're not void only for x86 32-bit compat */
4592 -char * __init machine_specific_memory_setup(void)
4595 - char *who = "BIOS-e820";
4597 - * Try to copy the BIOS-supplied E820-map.
4599 - * Otherwise fake a memory map; one section from 0k->640k,
4600 - * the next section from 1mb->appropriate_mem_k
4602 - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
4603 - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
4604 - early_panic("Cannot find a valid memory map");
4605 -#else /* CONFIG_XEN */
4606 - char *who = "Xen";
4608 - struct xen_memory_map memmap;
4610 - * This is rather large for a stack variable but this early in
4611 - * the boot process we know we have plenty of slack space.
4613 - struct e820entry map[E820MAX];
4615 - memmap.nr_entries = E820MAX;
4616 - set_xen_guest_handle(memmap.buffer, map);
4618 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
4619 - if ( rc == -ENOSYS ) {
4620 - memmap.nr_entries = 1;
4621 - map[0].addr = 0ULL;
4622 - map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
4623 - /* 8MB slack (to balance backend allocations). */
4624 - map[0].size += 8 << 20;
4625 - map[0].type = E820_RAM;
4630 - sanitize_e820_map(map, (char *)&memmap.nr_entries);
4632 - if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
4633 - early_panic("Cannot find a valid memory map");
4635 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
4636 - e820_print_map(who);
4638 - /* In case someone cares... */
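
[ The elided overflow test in copy_e820_map() above is a wraparound
  check; from the visible "Overflow in 64 bits?" comment it is assumed
  to be equivalent to this (the name region_sane is invented): ]

	#include <stdint.h>

	/* False if start + size wrapped past 2^64, i.e. end < start. */
	int region_sane(uint64_t start, uint64_t size)
	{
		return start + size >= start;
	}
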
4642 -static int __init parse_memopt(char *p)
4645 - unsigned long current_end;
4646 - unsigned long end;
4650 - end_user_pfn = memparse(p, &p);
4651 - end_user_pfn >>= PAGE_SHIFT;
4653 - end = end_user_pfn<<PAGE_SHIFT;
4654 - i = e820.nr_map-1;
4655 - current_end = e820.map[i].addr + e820.map[i].size;
4657 - if (current_end < end) {
4659 - * The e820 map ends before our requested size so
4660 - * extend the final entry to the requested address.
4662 - if (e820.map[i].type == E820_RAM)
4663 - e820.map[i].size = end - e820.map[i].addr;
4665 - add_memory_region(current_end, end - current_end, E820_RAM);
4670 -early_param("mem", parse_memopt);
4672 -static int userdef __initdata;
4674 -static int __init parse_memmap_opt(char *p)
4677 - unsigned long long start_at, mem_size;
4679 - if (!strcmp(p, "exactmap")) {
4680 -#ifdef CONFIG_CRASH_DUMP
4682 - * If we are doing a crash dump, we still need to know
4683 - * the real mem size before original memory map is
4686 - e820_register_active_regions(0, 0, -1UL);
4687 - saved_max_pfn = e820_end_of_ram();
4688 - remove_all_active_ranges();
4690 - max_pfn_mapped = 0;
4697 - mem_size = memparse(p, &p);
4703 - start_at = memparse(p+1, &p);
4704 - add_memory_region(start_at, mem_size, E820_RAM);
4705 - } else if (*p == '#') {
4706 - start_at = memparse(p+1, &p);
4707 - add_memory_region(start_at, mem_size, E820_ACPI);
4708 - } else if (*p == '$') {
4709 - start_at = memparse(p+1, &p);
4710 - add_memory_region(start_at, mem_size, E820_RESERVED);
4712 - end_user_pfn = (mem_size >> PAGE_SHIFT);
4714 - return *p == '\0' ? 0 : -EINVAL;
4716 -early_param("memmap", parse_memmap_opt);
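
[ For orientation: these two early_param() handlers back the mem= and
  memmap= boot options. Matching the parse code above ('@' adds E820_RAM,
  '#' E820_ACPI, '$' E820_RESERVED), typical invocations look like:

	mem=512M			clamp usable RAM at 512 MiB
	memmap=exactmap memmap=640K@0 memmap=511M@1M
					discard the BIOS map, build one by hand
	memmap=64M#0x10000000		mark a range as ACPI data
	memmap=64M$0x20000000		reserve a range outright
]
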
4718 -void __init finish_e820_parsing(void)
4721 - char nr = e820.nr_map;
4723 - if (sanitize_e820_map(e820.map, &nr) < 0)
4724 - early_panic("Invalid user supplied memory map");
4727 - printk(KERN_INFO "user-defined physical RAM map:\n");
4728 - e820_print_map("user");
4733 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
4734 - unsigned new_type)
4738 - BUG_ON(old_type == new_type);
4740 - for (i = 0; i < e820.nr_map; i++) {
4741 - struct e820entry *ei = &e820.map[i];
4742 - u64 final_start, final_end;
4743 - if (ei->type != old_type)
4745 - /* totally covered? */
4746 - if (ei->addr >= start && ei->size <= size) {
4747 - ei->type = new_type;
4750 - /* partially covered */
4751 - final_start = max(start, ei->addr);
4752 - final_end = min(start + size, ei->addr + ei->size);
4753 - if (final_start >= final_end)
4755 - add_memory_region(final_start, final_end - final_start,
4760 -void __init update_e820(void)
4764 - nr_map = e820.nr_map;
4765 - if (sanitize_e820_map(e820.map, &nr_map))
4767 - e820.nr_map = nr_map;
4768 - printk(KERN_INFO "modified physical RAM map:\n");
4769 - e820_print_map("modified");
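
[ The partially-covered arm of update_memory_range() above is plain
  interval intersection, i.e. max of the starts against min of the ends;
  as a standalone helper (sketch, invented name): ]

	/* Intersect [s1, e1) with [s2, e2); returns 0 if they don't meet. */
	int intersect(unsigned long long s1, unsigned long long e1,
		      unsigned long long s2, unsigned long long e2,
		      unsigned long long *s, unsigned long long *e)
	{
		*s = s1 > s2 ? s1 : s2;	/* max of the starts */
		*e = e1 < e2 ? e1 : e2;	/* min of the ends   */
		return *s < *e;
	}
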
4773 -unsigned long pci_mem_start = 0xaeedbabe;
4774 -EXPORT_SYMBOL(pci_mem_start);
4777 - * Search for the biggest gap in the low 32 bits of the e820
4778 - * memory space. We pass this space to PCI to assign MMIO resources
4779 - * for hotplug or unconfigured devices in.
4780 - * Hopefully the BIOS left enough space.
4782 -__init void e820_setup_gap(struct e820entry *e820, int nr_map)
4784 - unsigned long gapstart, gapsize, round;
4785 - unsigned long last;
4789 - last = 0x100000000ull;
4790 - gapstart = 0x10000000;
4791 - gapsize = 0x400000;
4793 - while (--i >= 0) {
4794 - unsigned long long start = e820[i].addr;
4795 - unsigned long long end = start + e820[i].size;
4798 - * Since "last" is at most 4GB, we know we'll
4799 - * fit in 32 bits if this condition is true
4802 - unsigned long gap = last - end;
4804 - if (gap > gapsize) {
4815 - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
4816 - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
4818 - KERN_ERR "PCI: Unassigned devices with 32bit resource "
4819 - "registers may break!\n");
4823 - * See how much we want to round up: start off with
4824 - * rounding to the next 1MB area.
4827 - while ((gapsize >> 4) > round)
4829 - /* Fun with two's complement */
4830 - pci_mem_start = (gapstart + round) & -round;
4833 - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
4834 - pci_mem_start, gapstart, gapsize);
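
[ The gap search above, restated as a standalone sketch. It assumes a
  sanitized, address-sorted map; struct ent and find_gap are invented
  names, not the kernel interface. ]

	struct ent { unsigned long long addr, size; };

	/* Biggest uncovered gap below 4 GiB; start returned via *gapstart. */
	unsigned long find_gap(const struct ent *map, int n,
			       unsigned long *gapstart)
	{
		unsigned long long last = 0x100000000ULL;
		unsigned long gapsize = 0x400000;	/* 4 MiB minimum */
		int i;

		*gapstart = 0x10000000;			/* fallback: 256 MiB */
		for (i = n - 1; i >= 0; i--) {		/* walk top down */
			unsigned long long start = map[i].addr;
			unsigned long long end = start + map[i].size;

			if (end < last && last - end > gapsize) {
				gapsize = last - end;	/* fits in 32 bits */
				*gapstart = end;
			}
			if (start < last)
				last = start;
		}
		return gapsize;
	}

[ The removed code then rounds the start up with "(gapstart + round) &
  -round", a power-of-two boundary of at least 1 MiB that grows with the
  gap size: the "fun with two's complement" its comment mentions. ]
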
4837 -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
4841 - if (slot < 0 || slot >= e820.nr_map)
4843 - for (i = slot; i < e820.nr_map; i++) {
4844 - if (e820.map[i].type != E820_RAM)
4848 - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
4850 - *addr = e820.map[i].addr;
4851 - *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
4852 - max_pfn << PAGE_SHIFT) - *addr;
4855 --- sle11-2009-06-04.orig/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:38:05.000000000 +0100
4856 +++ sle11-2009-06-04/arch/x86/kernel/early_printk-xen.c 2009-06-04 10:21:39.000000000 +0200
4857 @@ -225,7 +225,7 @@ static struct console simnow_console = {
4858 static struct console *early_console = &early_vga_console;
4859 static int early_console_initialized;
4861 -void early_printk(const char *fmt, ...)
4862 +asmlinkage void early_printk(const char *fmt, ...)
4866 --- sle11-2009-06-04.orig/arch/x86/kernel/entry_32-xen.S 2009-03-16 16:38:05.000000000 +0100
4867 +++ sle11-2009-06-04/arch/x86/kernel/entry_32-xen.S 2009-06-04 10:21:39.000000000 +0200
4869 #include <asm/percpu.h>
4870 #include <asm/dwarf2.h>
4871 #include <asm/processor-flags.h>
4872 -#include "irq_vectors.h"
4873 +#include <asm/ftrace.h>
4874 +#include <asm/irq_vectors.h>
4875 #include <xen/interface/xen.h>
4877 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
4878 +#include <linux/elf-em.h>
4879 +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
4880 +#define __AUDIT_ARCH_LE 0x40000000
4882 +#ifndef CONFIG_AUDITSYSCALL
4883 +#define sysenter_audit syscall_trace_entry
4884 +#define sysexit_audit syscall_exit_work
4888 * We use macros for low-level operations which need to be overridden
4889 * for paravirtualization. The following will never clobber any registers:
4890 * INTERRUPT_RETURN (aka. "iret")
4891 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
4892 - * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
4893 + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
4895 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
4896 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
4897 @@ -277,11 +288,6 @@ END(resume_kernel)
4901 - .macro test_tif ti_reg # system call tracing in operation / emulation
4902 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
4903 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
4906 /* SYSENTER_RETURN points to after the "sysenter" instruction in
4907 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
4909 @@ -338,8 +344,9 @@ sysenter_past_esp:
4912 GET_THREAD_INFO(%ebp)
4914 - jnz syscall_trace_entry
4915 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4916 + jnz sysenter_audit
4918 cmpl $(nr_syscalls), %eax
4920 call *sys_call_table(,%eax,4)
4921 @@ -349,14 +356,54 @@ sysenter_past_esp:
4923 movl TI_flags(%ebp), %ecx
4924 testw $_TIF_ALLWORK_MASK, %cx
4925 - jne syscall_exit_work
4928 /* if something modifies registers it must also disable sysexit */
4929 movl PT_EIP(%esp), %edx
4930 movl PT_OLDESP(%esp), %ecx
4933 1: mov PT_FS(%esp), %fs
4934 - ENABLE_INTERRUPTS_SYSCALL_RET
4935 + ENABLE_INTERRUPTS_SYSEXIT
4937 +#ifdef CONFIG_AUDITSYSCALL
4939 + testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
4940 + jnz syscall_trace_entry
4942 + CFI_ADJUST_CFA_OFFSET -4
4943 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
4944 + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
4945 + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
4946 + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
4947 + movl %eax,%edx /* 2nd arg: syscall number */
4948 + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
4949 + call audit_syscall_entry
4951 + CFI_ADJUST_CFA_OFFSET 4
4952 + movl PT_EAX(%esp),%eax /* reload syscall number */
4953 + jmp sysenter_do_call
4956 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4957 + jne syscall_exit_work
4959 + ENABLE_INTERRUPTS(CLBR_ANY)
4960 + movl %eax,%edx /* second arg, syscall return value */
4961 + cmpl $0,%eax /* is it < 0? */
4962 + setl %al /* 1 if so, 0 if not */
4963 + movzbl %al,%eax /* zero-extend that */
4964 + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
4965 + call audit_syscall_exit
4966 + DISABLE_INTERRUPTS(CLBR_ANY)
4968 + movl TI_flags(%ebp), %ecx
4969 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4970 + jne syscall_exit_work
4971 + movl PT_EAX(%esp),%eax /* reload syscall return value */
4976 .pushsection .fixup,"ax"
4977 2: movl $0,PT_FS(%esp)
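
[ The setl/movzbl/inc sequence in sysexit_audit above encodes the syscall
  result for audit_syscall_exit(). In C terms, per the inline comment
  (AUDITSC_SUCCESS and AUDITSC_FAILURE being 1 and 2); audit_result is an
  invented name for illustration: ]

	#include <stdio.h>

	int audit_result(long ret)
	{
		int failed = ret < 0;	/* setl %al                        */
		return failed + 1;	/* inc: 0->1 success, 1->2 failure */
	}

	int main(void)
	{
		printf("%d %d\n", audit_result(0), audit_result(-22)); /* 1 2 */
		return 0;
	}
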
4978 @@ -400,7 +447,7 @@ ENTRY(system_call)
4979 CFI_ADJUST_CFA_OFFSET 4
4981 GET_THREAD_INFO(%ebp)
4983 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4984 jnz syscall_trace_entry
4985 cmpl $(nr_syscalls), %eax
4987 @@ -413,10 +460,6 @@ syscall_exit:
4988 # setting need_resched or sigpending
4989 # between sampling and the iret
4991 - testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
4993 - orl $_TIF_SINGLESTEP,TI_flags(%ebp)
4995 movl TI_flags(%ebp), %ecx
4996 testw $_TIF_ALLWORK_MASK, %cx # current->work
4997 jne syscall_exit_work
4998 @@ -588,12 +631,8 @@ END(work_pending)
4999 syscall_trace_entry:
5000 movl $-ENOSYS,PT_EAX(%esp)
5003 - call do_syscall_trace
5005 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5006 - # so must skip actual syscall
5007 - movl PT_ORIG_EAX(%esp), %eax
5008 + call syscall_trace_enter
5009 + /* What it returned is what we'll actually use. */
5010 cmpl $(nr_syscalls), %eax
5013 @@ -602,14 +641,13 @@ END(syscall_trace_entry)
5014 # perform syscall exit tracing
5017 - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
5018 + testb $_TIF_WORK_SYSCALL_EXIT, %cl
5021 - ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
5022 + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
5023 # schedule() instead
5026 - call do_syscall_trace
5027 + call syscall_trace_leave
5028 jmp resume_userspace
5029 END(syscall_exit_work)
5031 @@ -1113,10 +1151,10 @@ ENTRY(native_iret)
5035 -ENTRY(native_irq_enable_syscall_ret)
5036 +ENTRY(native_irq_enable_sysexit)
5039 -END(native_irq_enable_syscall_ret)
5040 +END(native_irq_enable_sysexit)
5044 @@ -1265,6 +1303,77 @@ ENTRY(kernel_thread_helper)
5046 ENDPROC(kernel_thread_helper)
5048 +#ifdef CONFIG_FTRACE
5049 +#ifdef CONFIG_DYNAMIC_FTRACE
5055 + movl 0xc(%esp), %eax
5056 + subl $MCOUNT_INSN_SIZE, %eax
5069 +ENTRY(ftrace_caller)
5073 + movl 0xc(%esp), %eax
5074 + movl 0x4(%ebp), %edx
5075 + subl $MCOUNT_INSN_SIZE, %eax
5090 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5093 + cmpl $ftrace_stub, ftrace_trace_function
5099 + /* taken from glibc */
5104 + movl 0xc(%esp), %eax
5105 + movl 0x4(%ebp), %edx
5106 + subl $MCOUNT_INSN_SIZE, %eax
5108 + call *ftrace_trace_function
5116 +#endif /* CONFIG_DYNAMIC_FTRACE */
5117 +#endif /* CONFIG_FTRACE */
5119 #include <asm/alternative-asm.h>
5121 # pv syscall call handler stub
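
[ Conceptually, the mcount/ftrace_caller stubs added above do the
  following. This is a C paraphrase for orientation only; the real stubs
  are assembly and preserve the caller's registers. MCOUNT_INSN_SIZE is 5
  on x86, the length of the "call mcount" that gcc -pg emits at each
  function entry; mcount_paraphrase is an invented name. ]

	#define MCOUNT_INSN_SIZE 5	/* size of the call instruction */

	void (*ftrace_trace_function)(unsigned long ip, unsigned long parent_ip);

	void mcount_paraphrase(unsigned long ret_addr,
			       unsigned long parent_ret_addr)
	{
		/* ret_addr points past the call; back up to the call site */
		if (ftrace_trace_function)
			ftrace_trace_function(ret_addr - MCOUNT_INSN_SIZE,
					      parent_ret_addr);
	}
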
5122 @@ -1290,7 +1399,7 @@ ENTRY(ia32pv_cstar_target)
5125 GET_THREAD_INFO(%ebp)
5127 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
5128 jnz cstar_trace_entry
5129 cmpl $nr_syscalls,%eax
5131 @@ -1324,29 +1433,21 @@ cstar_trace_entry:
5132 btl %eax,cstar_special
5133 jc .Lcstar_trace_special
5137 orl $_TIF_CSTAR,TI_flags(%ebp)
5138 - call do_syscall_trace
5139 + call syscall_trace_enter
5141 andl $~_TIF_CSTAR,TI_flags(%ebp)
5143 - jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
5144 - # so must skip actual syscall
5145 - movl PT_ORIG_EAX(%esp),%eax
5146 + /* What it returned is what we'll actually use. */
5147 cmpl $nr_syscalls,%eax
5150 .Lcstar_trace_special:
5151 movl PT_ECX(%esp),%ecx
5154 movl %ecx,PT_EBP(%esp) # put user EBP back in place
5155 - call do_syscall_trace
5157 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5158 - # so must skip actual syscall
5159 - movl PT_ORIG_EAX(%esp),%eax
5160 + call syscall_trace_enter
5161 + /* What it returned is what we'll actually use. */
5162 cmpl $nr_syscalls,%eax
5165 --- sle11-2009-06-04.orig/arch/x86/kernel/entry_64.S 2009-06-04 00:00:00.000000000 +0200
5166 +++ sle11-2009-06-04/arch/x86/kernel/entry_64.S 2009-06-04 10:21:39.000000000 +0200
5167 @@ -1409,7 +1409,7 @@ ENTRY(arch_unwind_init_running)
5168 ENDPROC(arch_unwind_init_running)
5172 +#ifdef CONFIG_PARAVIRT_XEN
5173 ENTRY(xen_hypervisor_callback)
5174 zeroentry xen_do_hypervisor_callback
5175 END(xen_hypervisor_callback)
5176 @@ -1507,7 +1507,7 @@ ENTRY(xen_failsafe_callback)
5178 END(xen_failsafe_callback)
5180 -#endif /* CONFIG_XEN */
5181 +#endif /* CONFIG_PARAVIRT_XEN */
5185 --- sle11-2009-06-04.orig/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:38:05.000000000 +0100
5186 +++ sle11-2009-06-04/arch/x86/kernel/entry_64-xen.S 2009-06-04 10:21:39.000000000 +0200
5187 @@ -53,19 +53,130 @@
5188 #include <asm/hw_irq.h>
5189 #include <asm/page.h>
5190 #include <asm/irqflags.h>
5191 +#include <asm/ftrace.h>
5192 #include <asm/errno.h>
5193 #include <xen/interface/xen.h>
5194 #include <xen/interface/features.h>
5196 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
5197 +#include <linux/elf-em.h>
5198 +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
5199 +#define __AUDIT_ARCH_64BIT 0x80000000
5200 +#define __AUDIT_ARCH_LE 0x40000000
5204 +#ifdef CONFIG_FTRACE
5205 +#ifdef CONFIG_DYNAMIC_FTRACE
5210 + movq %rcx, 8(%rsp)
5211 + movq %rdx, 16(%rsp)
5212 + movq %rsi, 24(%rsp)
5213 + movq %rdi, 32(%rsp)
5214 + movq %r8, 40(%rsp)
5215 + movq %r9, 48(%rsp)
5217 + movq 0x38(%rsp), %rdi
5218 + subq $MCOUNT_INSN_SIZE, %rdi
5224 + movq 48(%rsp), %r9
5225 + movq 40(%rsp), %r8
5226 + movq 32(%rsp), %rdi
5227 + movq 24(%rsp), %rsi
5228 + movq 16(%rsp), %rdx
5229 + movq 8(%rsp), %rcx
5236 +ENTRY(ftrace_caller)
5238 + /* taken from glibc */
5241 + movq %rcx, 8(%rsp)
5242 + movq %rdx, 16(%rsp)
5243 + movq %rsi, 24(%rsp)
5244 + movq %rdi, 32(%rsp)
5245 + movq %r8, 40(%rsp)
5246 + movq %r9, 48(%rsp)
5248 + movq 0x38(%rsp), %rdi
5249 + movq 8(%rbp), %rsi
5250 + subq $MCOUNT_INSN_SIZE, %rdi
5256 + movq 48(%rsp), %r9
5257 + movq 40(%rsp), %r8
5258 + movq 32(%rsp), %rdi
5259 + movq 24(%rsp), %rsi
5260 + movq 16(%rsp), %rdx
5261 + movq 8(%rsp), %rcx
5270 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5272 + cmpq $ftrace_stub, ftrace_trace_function
5279 + /* taken from glibc */
5282 + movq %rcx, 8(%rsp)
5283 + movq %rdx, 16(%rsp)
5284 + movq %rsi, 24(%rsp)
5285 + movq %rdi, 32(%rsp)
5286 + movq %r8, 40(%rsp)
5287 + movq %r9, 48(%rsp)
5289 + movq 0x38(%rsp), %rdi
5290 + movq 8(%rbp), %rsi
5291 + subq $MCOUNT_INSN_SIZE, %rdi
5293 + call *ftrace_trace_function
5295 + movq 48(%rsp), %r9
5296 + movq 40(%rsp), %r8
5297 + movq 32(%rsp), %rdi
5298 + movq 24(%rsp), %rsi
5299 + movq 16(%rsp), %rdx
5300 + movq 8(%rsp), %rcx
5306 +#endif /* CONFIG_DYNAMIC_FTRACE */
5307 +#endif /* CONFIG_FTRACE */
5309 #ifndef CONFIG_PREEMPT
5310 #define retint_kernel retint_restore_args
5313 #ifdef CONFIG_PARAVIRT
5314 -ENTRY(native_irq_enable_syscall_ret)
5315 - movq %gs:pda_oldrsp,%rsp
5316 +ENTRY(native_usergs_sysret64)
5319 #endif /* CONFIG_PARAVIRT */
5320 @@ -102,7 +213,7 @@ NMI_MASK = 0x80000000
5321 .macro FAKE_STACK_FRAME child_rip
5322 /* push in order ss, rsp, eflags, cs, rip */
5324 - pushq %rax /* ss */
5325 + pushq $__KERNEL_DS /* ss */
5326 CFI_ADJUST_CFA_OFFSET 8
5327 /*CFI_REL_OFFSET ss,0*/
5328 pushq %rax /* rsp */
5329 @@ -197,13 +308,13 @@ ENTRY(ret_from_fork)
5330 CFI_ADJUST_CFA_OFFSET -4
5332 GET_THREAD_INFO(%rcx)
5333 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
5334 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5338 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
5339 je int_ret_from_sys_call
5340 - testl $_TIF_IA32,threadinfo_flags(%rcx)
5341 + testl $_TIF_IA32,TI_flags(%rcx)
5342 jnz int_ret_from_sys_call
5343 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
5344 jmp ret_from_sys_call
5345 @@ -265,8 +376,9 @@ ENTRY(system_call)
5347 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
5348 GET_THREAD_INFO(%rcx)
5349 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
5350 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
5352 +system_call_fastpath:
5353 cmpq $__NR_syscall_max,%rax
5356 @@ -284,7 +396,7 @@ sysret_check:
5357 GET_THREAD_INFO(%rcx)
5358 DISABLE_INTERRUPTS(CLBR_NONE)
5360 - movl threadinfo_flags(%rcx),%edx
5361 + movl TI_flags(%rcx),%edx
5365 @@ -315,16 +427,16 @@ sysret_careful:
5368 ENABLE_INTERRUPTS(CLBR_NONE)
5369 - testl $_TIF_DO_NOTIFY_MASK,%edx
5372 - /* Really a signal */
5373 +#ifdef CONFIG_AUDITSYSCALL
5374 + bt $TIF_SYSCALL_AUDIT,%edx
5377 /* edx: work flags (arg3) */
5378 leaq do_notify_resume(%rip),%rax
5379 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
5380 xorl %esi,%esi # oldset -> arg2
5381 call ptregscall_common
5382 -1: movl $_TIF_NEED_RESCHED,%edi
5383 + movl $_TIF_WORK_MASK,%edi
5384 /* Use IRET because user could have changed frame. This
5385 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
5386 DISABLE_INTERRUPTS(CLBR_NONE)
5387 @@ -335,14 +447,56 @@ badsys:
5388 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
5389 jmp ret_from_sys_call
5391 +#ifdef CONFIG_AUDITSYSCALL
5393 + * Fast path for syscall audit without full syscall trace.
5394 + * We just call audit_syscall_entry() directly, and then
5395 + * jump back to the normal fast path.
5398 + movq %r10,%r9 /* 6th arg: 4th syscall arg */
5399 + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
5400 + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
5401 + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
5402 + movq %rax,%rsi /* 2nd arg: syscall number */
5403 + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
5404 + call audit_syscall_entry
5405 + LOAD_ARGS 0 /* reload call-clobbered registers */
5406 + jmp system_call_fastpath
5409 + * Return fast path for syscall audit. Call audit_syscall_exit()
5410 + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
5414 + movq %rax,%rsi /* second arg, syscall return value */
5415 + cmpq $0,%rax /* is it < 0? */
5416 + setl %al /* 1 if so, 0 if not */
5417 + movzbl %al,%edi /* zero-extend that into %edi */
5418 + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
5419 + call audit_syscall_exit
5420 + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
5422 +#endif /* CONFIG_AUDITSYSCALL */
5424 /* Do syscall tracing */
5426 +#ifdef CONFIG_AUDITSYSCALL
5427 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5431 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
5432 FIXUP_TOP_OF_STACK %rdi
5434 call syscall_trace_enter
5435 - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
5437 + * Reload arg registers from stack in case ptrace changed them.
5438 + * We don't reload %rax because syscall_trace_enter() returned
5439 + * the value it wants us to use in the table lookup.
5441 + LOAD_ARGS ARGOFFSET, 1
5443 cmpq $__NR_syscall_max,%rax
5444 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
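
[ Two conventions meet in the auditsys path above. The syscall ABI
  delivers the number in %rax and arguments in %rdi/%rsi/%rdx/%r10 (%r10
  standing in for %rcx, which the syscall instruction clobbers with the
  return RIP), while the C ABI wants %rdi/%rsi/%rdx/%rcx/%r8/%r9. The
  movq chain re-marshals one into the other so that, in effect (prototype
  stated as an assumption about the 2.6.27 audit interface):

	audit_syscall_entry(AUDIT_ARCH_X86_64, nr, arg1, arg2, arg3, arg4);

  is called without touching the saved pt_regs. LOAD_ARGS with the new
  second operand then reloads the clobbered argument registers from the
  stack while deliberately leaving %rax alone, since syscall_trace_enter()
  returns the syscall number actually used for the table lookup. ]
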
5445 @@ -356,6 +510,7 @@ tracesys:
5446 * Has correct top of stack, but partial stack frame.
5448 .globl int_ret_from_sys_call
5449 + .globl int_with_check
5450 int_ret_from_sys_call:
5451 DISABLE_INTERRUPTS(CLBR_NONE)
5453 @@ -370,10 +525,10 @@ int_ret_from_sys_call:
5455 LOCKDEP_SYS_EXIT_IRQ
5456 GET_THREAD_INFO(%rcx)
5457 - movl threadinfo_flags(%rcx),%edx
5458 + movl TI_flags(%rcx),%edx
5461 - andl $~TS_COMPAT,threadinfo_status(%rcx)
5462 + andl $~TS_COMPAT,TI_status(%rcx)
5463 jmp retint_restore_args
5465 /* Either reschedule or signal or syscall exit tracking needed. */
5466 @@ -399,7 +554,7 @@ int_very_careful:
5467 ENABLE_INTERRUPTS(CLBR_NONE)
5469 /* Check for syscall exit trace */
5470 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
5471 + testl $_TIF_WORK_SYSCALL_EXIT,%edx
5474 CFI_ADJUST_CFA_OFFSET 8
5475 @@ -407,7 +562,7 @@ int_very_careful:
5476 call syscall_trace_leave
5478 CFI_ADJUST_CFA_OFFSET -8
5479 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
5480 + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
5481 jmp int_restore_rest
5484 @@ -416,7 +571,7 @@ int_signal:
5485 movq %rsp,%rdi # &ptregs -> arg1
5486 xorl %esi,%esi # oldset -> arg2
5487 call do_notify_resume
5488 -1: movl $_TIF_NEED_RESCHED,%edi
5489 +1: movl $_TIF_WORK_MASK,%edi
5492 DISABLE_INTERRUPTS(CLBR_NONE)
5493 @@ -443,7 +598,6 @@ END(\label)
5494 PTREGSCALL stub_clone, sys_clone, %r8
5495 PTREGSCALL stub_fork, sys_fork, %rdi
5496 PTREGSCALL stub_vfork, sys_vfork, %rdi
5497 - PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
5498 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
5499 PTREGSCALL stub_iopl, sys_iopl, %rsi
5501 @@ -517,10 +671,12 @@ END(stub_rt_sigreturn)
5506 +retint_with_reschedule:
5507 CFI_DEFAULT_STACK adj=1
5508 + movl $_TIF_WORK_MASK,%edi
5510 LOCKDEP_SYS_EXIT_IRQ
5511 - movl threadinfo_flags(%rcx),%edx
5512 + movl TI_flags(%rcx),%edx
5516 @@ -565,17 +721,16 @@ retint_signal:
5518 DISABLE_INTERRUPTS(CLBR_NONE)
5520 - movl $_TIF_NEED_RESCHED,%edi
5521 GET_THREAD_INFO(%rcx)
5523 + jmp retint_with_reschedule
5525 #ifdef CONFIG_PREEMPT
5526 /* Returning to kernel space. Check if we need preemption */
5527 /* rcx: threadinfo. interrupts off. */
5528 ENTRY(retint_kernel)
5529 - cmpl $0,threadinfo_preempt_count(%rcx)
5530 + cmpl $0,TI_preempt_count(%rcx)
5531 jnz retint_restore_args
5532 - bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
5533 + bt $TIF_NEED_RESCHED,TI_flags(%rcx)
5534 jnc retint_restore_args
5535 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
5536 jnc retint_restore_args
5537 @@ -630,6 +785,9 @@ END(invalidate_interrupt\num)
5538 ENTRY(call_function_interrupt)
5539 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
5540 END(call_function_interrupt)
5541 +ENTRY(call_function_single_interrupt)
5542 + apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
5543 +END(call_function_single_interrupt)
5544 ENTRY(irq_move_cleanup_interrupt)
5545 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
5546 END(irq_move_cleanup_interrupt)
5547 @@ -639,6 +797,10 @@ ENTRY(apic_timer_interrupt)
5548 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
5549 END(apic_timer_interrupt)
5551 +ENTRY(uv_bau_message_intr1)
5552 + apicinterrupt 220,uv_bau_message_interrupt
5553 +END(uv_bau_message_intr1)
5555 ENTRY(error_interrupt)
5556 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
5557 END(error_interrupt)
5558 @@ -752,7 +914,7 @@ paranoid_restore\trace:
5560 paranoid_userspace\trace:
5561 GET_THREAD_INFO(%rcx)
5562 - movl threadinfo_flags(%rcx),%ebx
5563 + movl TI_flags(%rcx),%ebx
5564 andl $_TIF_WORK_MASK,%ebx
5565 jz paranoid_swapgs\trace
5566 movq %rsp,%rdi /* &pt_regs */
5567 @@ -849,7 +1011,7 @@ error_exit:
5568 testb $3,CS-ARGOFFSET(%rsp)
5570 LOCKDEP_SYS_EXIT_IRQ
5571 - movl threadinfo_flags(%rcx),%edx
5572 + movl TI_flags(%rcx),%edx
5573 movl $_TIF_WORK_MASK,%edi
5576 @@ -871,11 +1033,11 @@ error_kernelspace:
5577 iret run with kernel gs again, so don't set the user space flag.
5578 B stepping K8s sometimes report a truncated RIP for IRET
5579 exceptions returning to compat mode. Check for these here too. */
5580 - leaq irq_return(%rip),%rbp
5581 - cmpq %rbp,RIP(%rsp)
5582 + leaq irq_return(%rip),%rcx
5583 + cmpq %rcx,RIP(%rsp)
5585 - movl %ebp,%ebp /* zero extend */
5586 - cmpq %rbp,RIP(%rsp)
5587 + movl %ecx,%ecx /* zero extend */
5588 + cmpq %rcx,RIP(%rsp)
5590 cmpq $gs_change,RIP(%rsp)
5592 @@ -1121,6 +1283,7 @@ END(device_not_available)
5593 /* runs on exception stack */
5596 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5598 CFI_ADJUST_CFA_OFFSET 8 */
5600 @@ -1148,6 +1311,7 @@ END(do_nmi_callback)
5604 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5606 CFI_ADJUST_CFA_OFFSET 8 */
5608 @@ -1171,14 +1335,11 @@ ENTRY(coprocessor_segment_overrun)
5609 zeroentry do_coprocessor_segment_overrun
5610 END(coprocessor_segment_overrun)
5613 - zeroentry do_reserved
5617 /* runs on exception stack */
5620 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5621 paranoidentry do_double_fault
5624 @@ -1196,6 +1357,7 @@ END(segment_not_present)
5625 /* runs on exception stack */
5626 ENTRY(stack_segment)
5628 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5629 paranoidentry do_stack_segment */
5630 errorentry do_stack_segment
5631 /* jmp paranoid_exit1
5632 @@ -1222,6 +1384,7 @@ END(spurious_interrupt_bug)
5633 /* runs on exception stack */
5634 ENTRY(machine_check)
5636 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5638 CFI_ADJUST_CFA_OFFSET 8
5639 paranoidentry do_machine_check
5640 --- sle11-2009-06-04.orig/arch/x86/kernel/fixup.c 2009-03-16 16:33:40.000000000 +0100
5641 +++ sle11-2009-06-04/arch/x86/kernel/fixup.c 2009-06-04 10:21:39.000000000 +0200
5643 #include <linux/kernel.h>
5644 #include <linux/delay.h>
5645 #include <linux/version.h>
5646 +#include <asm/traps.h>
5648 #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
5650 --- sle11-2009-06-04.orig/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
5651 +++ sle11-2009-06-04/arch/x86/kernel/genapic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
5652 @@ -58,7 +58,7 @@ void __init setup_apic_routing(void)
5656 - if (num_possible_cpus() <= 8)
5657 + if (max_physical_apicid < 8)
5658 genapic = &apic_flat;
5660 genapic = &apic_physflat;
5661 @@ -121,4 +121,5 @@ int is_uv_system(void)
5663 return uv_system_type != UV_NONE;
5665 +EXPORT_SYMBOL_GPL(is_uv_system);
5667 --- sle11-2009-06-04.orig/arch/x86/kernel/genapic_xen_64.c 2009-03-16 16:38:05.000000000 +0100
5668 +++ sle11-2009-06-04/arch/x86/kernel/genapic_xen_64.c 2009-06-04 10:21:39.000000000 +0200
5669 @@ -43,7 +43,7 @@ void xen_send_IPI_shortcut(unsigned int
5670 __send_IPI_one(smp_processor_id(), vector);
5672 case APIC_DEST_ALLBUT:
5673 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5674 + for_each_possible_cpu(cpu) {
5675 if (cpu == smp_processor_id())
5677 if (cpu_isset(cpu, cpu_online_map)) {
5678 @@ -52,7 +52,7 @@ void xen_send_IPI_shortcut(unsigned int
5681 case APIC_DEST_ALLINC:
5682 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5683 + for_each_possible_cpu(cpu) {
5684 if (cpu_isset(cpu, cpu_online_map)) {
5685 __send_IPI_one(cpu, vector);
5687 @@ -81,8 +81,6 @@ static cpumask_t xen_vector_allocation_d
5689 static void xen_init_apic_ldr(void)
5691 - Dprintk("%s\n", __FUNCTION__);
5695 static void xen_send_IPI_allbutself(int vector)
5696 @@ -92,14 +90,12 @@ static void xen_send_IPI_allbutself(int
5697 * we get an APIC send error if we try to broadcast.
5698 * thus we have to avoid sending IPIs in this case.
5700 - Dprintk("%s\n", __FUNCTION__);
5701 if (num_online_cpus() > 1)
5702 xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
5705 static void xen_send_IPI_all(int vector)
5707 - Dprintk("%s\n", __FUNCTION__);
5708 xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
5711 @@ -109,11 +105,10 @@ static void xen_send_IPI_mask(cpumask_t
5713 unsigned long flags;
5715 - Dprintk("%s\n", __FUNCTION__);
5716 local_irq_save(flags);
5717 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
5719 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5720 + for_each_possible_cpu(cpu) {
5721 if (cpu_isset(cpu, cpumask)) {
5722 __send_IPI_one(cpu, vector);
5724 @@ -125,14 +120,12 @@ static void xen_send_IPI_mask(cpumask_t
5725 static int xen_apic_id_registered(void)
5728 - Dprintk("%s\n", __FUNCTION__);
5729 return physid_isset(smp_processor_id(), phys_cpu_present_map);
5733 static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
5735 - Dprintk("%s\n", __FUNCTION__);
5736 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
5739 @@ -140,7 +133,6 @@ static unsigned int phys_pkg_id(int inde
5743 - Dprintk("%s\n", __FUNCTION__);
5745 return ((ebx >> 24) & 0xFF) >> index_msb;
5747 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5748 +++ sle11-2009-06-04/arch/x86/kernel/head-xen.c 2009-06-04 10:21:39.000000000 +0200
5750 +#include <linux/kernel.h>
5751 +#include <linux/init.h>
5753 +#include <asm/setup.h>
5754 +#include <asm/bios_ebda.h>
5756 +#define BIOS_LOWMEM_KILOBYTES 0x413
5759 + * The BIOS places the EBDA/XBDA at the top of conventional
5760 + * memory, and usually decreases the reported amount of
5761 + * conventional memory (int 0x12) too. This also contains a
5762 + * workaround for Dell systems that neglect to reserve EBDA.
5763 + * The same workaround also avoids a problem with the AMD768MPX
5764 + * chipset: reserve a page before VGA to prevent PCI prefetch
5765 + * into it (errata #56). Usually the page is reserved anyways,
5766 + * unless you have no PS/2 mouse plugged in.
5768 +void __init reserve_ebda_region(void)
5771 + unsigned int lowmem, ebda_addr;
5773 + /* To determine the position of the EBDA and the */
5774 + /* end of conventional memory, we need to look at */
5775 + /* the BIOS data area. In a paravirtual environment */
5776 + /* that area is absent. We'll just have to assume */
5777 + /* that the paravirt case can handle memory setup */
5778 + /* correctly, without our help. */
5779 + if (paravirt_enabled())
5782 + /* end of low (conventional) memory */
5783 + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5786 + /* start of EBDA area */
5787 + ebda_addr = get_bios_ebda();
5789 + /* Fixup: bios puts an EBDA in the top 64K segment */
5790 + /* of conventional memory, but does not adjust lowmem. */
5791 + if ((lowmem - ebda_addr) <= 0x10000)
5792 + lowmem = ebda_addr;
5794 + /* Fixup: bios does not report an EBDA at all. */
5795 + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5796 + if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5799 + /* Paranoia: should never happen, but... */
5800 + if ((lowmem == 0) || (lowmem >= 0x100000))
5803 + /* reserve all memory between lowmem and the 1MB mark */
5804 + reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
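
[ The fixups above reduce to a little arithmetic. A worked, standalone
  example follows; the two elided assignments are assumed to set lowmem
  to 0x9f000, as the surrounding comments imply, and ebda_reserve_start
  is an invented name: ]

	#include <stdio.h>

	unsigned int ebda_reserve_start(unsigned int lowmem,
					unsigned int ebda_addr)
	{
		if (lowmem - ebda_addr <= 0x10000)
			lowmem = ebda_addr;	/* EBDA in the top 64K */
		if (ebda_addr == 0 && lowmem >= 0x9f000)
			lowmem = 0x9f000;	/* no EBDA: assume 4K anyway */
		if (lowmem == 0 || lowmem >= 0x100000)
			lowmem = 0x9f000;	/* paranoia fallback */
		return lowmem;			/* reserve [lowmem, 1M) */
	}

	int main(void)
	{
		/* BIOS reports 639 KiB low memory, EBDA at 0x9fc00 */
		printf("%#x\n", ebda_reserve_start(639 * 1024, 0x9fc00));
		return 0;
	}
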
5807 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5808 +++ sle11-2009-06-04/arch/x86/kernel/head32-xen.c 2009-06-04 10:21:39.000000000 +0200
5811 + * linux/arch/i386/kernel/head32.c -- prepare to run common code
5813 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5814 + * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
5817 +#include <linux/init.h>
5818 +#include <linux/start_kernel.h>
5820 +#include <asm/setup.h>
5821 +#include <asm/sections.h>
5822 +#include <asm/e820.h>
5823 +#include <asm/bios_ebda.h>
5825 +void __init i386_start_kernel(void)
5827 + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
5830 +#ifdef CONFIG_BLK_DEV_INITRD
5831 + /* Reserve INITRD */
5832 + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
5833 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
5834 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
5835 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
5836 + reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
5839 + reserve_early(init_pg_tables_start, init_pg_tables_end,
5842 + reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
5843 + __pa(xen_start_info->pt_base)
5844 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
5850 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
5851 + max_cmdline = COMMAND_LINE_SIZE;
5852 + memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
5853 + boot_command_line[max_cmdline-1] = '\0';
5857 + reserve_ebda_region();
5860 + * At this point everything still needed from the boot loader
5861 + * or BIOS or kernel text should be early reserved or marked not
5862 + * RAM in e820. All other memory is free game.
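
[ reserve_early() as used here keeps a small static table of named ranges
  that the early allocator refuses to hand out. Below is a minimal
  userspace model of that bookkeeping, with invented limits and names;
  the real table lives in the e820 code: ]

	#include <stdio.h>

	struct early_res { unsigned long long start, end; const char *name; };
	static struct early_res early_res[32];	/* no bounds checks: a toy */
	static int nr_early_res;

	void reserve_early(unsigned long long start, unsigned long long end,
			   const char *name)
	{
		struct early_res *r = &early_res[nr_early_res++];

		r->start = start;
		r->end = end;
		r->name = name;
	}

	int range_is_reserved(unsigned long long s, unsigned long long e)
	{
		int i;

		for (i = 0; i < nr_early_res; i++)
			if (s < early_res[i].end && e > early_res[i].start)
				return 1;	/* overlaps a reserved range */
		return 0;
	}

	int main(void)
	{
		reserve_early(0x100000, 0x400000, "TEXT DATA BSS");
		printf("%d\n", range_is_reserved(0x200000, 0x201000)); /* 1 */
		return 0;
	}
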
5867 --- sle11-2009-06-04.orig/arch/x86/kernel/head64-xen.c 2009-03-16 16:38:05.000000000 +0100
5868 +++ sle11-2009-06-04/arch/x86/kernel/head64-xen.c 2009-06-04 10:21:39.000000000 +0200
5870 #include <asm/e820.h>
5871 #include <asm/bios_ebda.h>
5873 -unsigned long start_pfn;
5875 +static struct x8664_pda _boot_cpu_pda __read_mostly;
5879 + * We install an empty cpu_pda pointer table to indicate to early users
5880 + * (numa_set_node) that the cpu_pda pointer table for cpus other than
5881 + * the boot cpu is not yet setup.
5883 +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
5885 +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
5888 +void __init x86_64_init_pda(void)
5890 + _cpu_pda = __cpu_pda;
5891 + cpu_pda(0) = &_boot_cpu_pda;
5896 static void __init zap_identity_mappings(void)
5897 @@ -77,83 +96,10 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
5898 unsigned int machine_to_phys_order;
5899 EXPORT_SYMBOL(machine_to_phys_order);
5901 -#define BIOS_LOWMEM_KILOBYTES 0x413
5904 - * The BIOS places the EBDA/XBDA at the top of conventional
5905 - * memory, and usually decreases the reported amount of
5906 - * conventional memory (int 0x12) too. This also contains a
5907 - * workaround for Dell systems that neglect to reserve EBDA.
5908 - * The same workaround also avoids a problem with the AMD768MPX
5909 - * chipset: reserve a page before VGA to prevent PCI prefetch
5910 - * into it (errata #56). Usually the page is reserved anyways,
5911 - * unless you have no PS/2 mouse plugged in.
5913 -static void __init reserve_ebda_region(void)
5916 - unsigned int lowmem, ebda_addr;
5918 - /* To determine the position of the EBDA and the */
5919 - /* end of conventional memory, we need to look at */
5920 - /* the BIOS data area. In a paravirtual environment */
5921 - /* that area is absent. We'll just have to assume */
5922 - /* that the paravirt case can handle memory setup */
5923 - /* correctly, without our help. */
5924 - if (paravirt_enabled())
5927 - /* end of low (conventional) memory */
5928 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5931 - /* start of EBDA area */
5932 - ebda_addr = get_bios_ebda();
5934 - /* Fixup: bios puts an EBDA in the top 64K segment */
5935 - /* of conventional memory, but does not adjust lowmem. */
5936 - if ((lowmem - ebda_addr) <= 0x10000)
5937 - lowmem = ebda_addr;
5939 - /* Fixup: bios does not report an EBDA at all. */
5940 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5941 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5944 - /* Paranoia: should never happen, but... */
5945 - if ((lowmem == 0) || (lowmem >= 0x100000))
5948 - /* reserve all memory between lowmem and the 1MB mark */
5949 - reserve_early(lowmem, 0x100000, "BIOS reserved");
5953 -static void __init reserve_setup_data(void)
5956 - struct setup_data *data;
5957 - unsigned long pa_data;
5960 - if (boot_params.hdr.version < 0x0209)
5962 - pa_data = boot_params.hdr.setup_data;
5964 - data = early_ioremap(pa_data, sizeof(*data));
5965 - sprintf(buf, "setup data %x", data->type);
5966 - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
5967 - pa_data = data->next;
5968 - early_iounmap(data, sizeof(*data));
5973 void __init x86_64_start_kernel(char * real_mode_data)
5975 struct xen_machphys_mapping mapping;
5976 unsigned long machine_to_phys_nr_ents;
5980 * Build-time sanity checks on the kernel image and module
5981 @@ -167,6 +113,7 @@ void __init x86_64_start_kernel(char * r
5982 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
5983 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
5984 (__START_KERNEL & PGDIR_MASK)));
5985 + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
5987 xen_setup_features();
5989 @@ -174,8 +121,6 @@ void __init x86_64_start_kernel(char * r
5990 if (!xen_feature(XENFEAT_auto_translated_physmap))
5991 phys_to_machine_mapping =
5992 (unsigned long *)xen_start_info->mfn_list;
5993 - start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
5994 - xen_start_info->nr_pt_frames;
5996 machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
5997 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
5998 @@ -208,19 +153,23 @@ void __init x86_64_start_kernel(char * r
6000 early_printk("Kernel alive\n");
6002 - for (i = 0; i < NR_CPUS; i++)
6003 - cpu_pda(i) = &boot_cpu_pda[i];
6004 + x86_64_init_pda();
6007 + early_printk("Kernel really alive\n");
6009 + x86_64_start_reservations(real_mode_data);
6012 +void __init x86_64_start_reservations(char *real_mode_data)
6014 copy_bootdata(__va(real_mode_data));
6016 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
6018 reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
6019 - start_pfn << PAGE_SHIFT, "Xen provided");
6021 - reserve_ebda_region();
6022 - reserve_setup_data();
6023 + __pa(xen_start_info->pt_base)
6024 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
6028 * At this point everything still needed from the boot loader
6029 --- sle11-2009-06-04.orig/arch/x86/kernel/head_64-xen.S 2009-02-16 16:17:21.000000000 +0100
6030 +++ sle11-2009-06-04/arch/x86/kernel/head_64-xen.S 2009-06-04 10:21:39.000000000 +0200
6031 @@ -95,53 +95,6 @@ NEXT_PAGE(hypercall_page)
6038 - .globl cpu_gdt_descr
6040 - .word gdt_end-cpu_gdt_table-1
6042 - .quad cpu_gdt_table
6050 -/* We need valid kernel segments for data and code in long mode too
6051 - * IRET will check the segment types kkeil 2000/10/28
6052 - * Also sysret mandates a special GDT layout
6055 - .section .data.page_aligned, "aw"
6058 -/* The TLS descriptors are currently at a different place compared to i386.
6059 - Hopefully nobody expects them at a fixed place (Wine?) */
6061 -ENTRY(cpu_gdt_table)
6062 - .quad 0x0000000000000000 /* NULL descriptor */
6063 - .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
6064 - .quad 0x00af9b000000ffff /* __KERNEL_CS */
6065 - .quad 0x00cf93000000ffff /* __KERNEL_DS */
6066 - .quad 0x00cffb000000ffff /* __USER32_CS */
6067 - .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
6068 - .quad 0x00affb000000ffff /* __USER_CS */
6069 - .quad 0x0 /* unused */
6070 - .quad 0,0 /* TSS */
6071 - .quad 0,0 /* LDT */
6072 - .quad 0,0,0 /* three TLS descriptors */
6073 - .quad 0x0000f40000000000 /* node/CPU stored in limit */
6075 - /* asm/segment.h:GDT_ENTRIES must match this */
6076 - /* This should be a multiple of the cache line size */
6077 - /* GDTs of other CPUs are now dynamically allocated */
6079 - /* zero the remaining page */
6080 - .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
6082 .section .bss.page_aligned, "aw", @nobits
6084 ENTRY(empty_zero_page)
6085 --- sle11-2009-06-04.orig/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:38:05.000000000 +0100
6086 +++ sle11-2009-06-04/arch/x86/kernel/io_apic_32-xen.c 2009-06-04 10:21:39.000000000 +0200
6088 #include <linux/init.h>
6089 #include <linux/delay.h>
6090 #include <linux/sched.h>
6091 +#include <linux/bootmem.h>
6092 #include <linux/mc146818rtc.h>
6093 #include <linux/compiler.h>
6094 #include <linux/acpi.h>
6095 @@ -75,7 +76,7 @@ static struct { int pin, apic; } ioapic_
6096 static DEFINE_SPINLOCK(ioapic_lock);
6097 static DEFINE_SPINLOCK(vector_lock);
6099 -int timer_over_8254 __initdata = 1;
6100 +int timer_through_8259 __initdata;
6103 * Is the SiS APIC rmw bug present ?
6104 @@ -89,15 +90,21 @@ int sis_apic_bug = -1;
6105 int nr_ioapic_registers[MAX_IO_APICS];
6107 /* I/O APIC entries */
6108 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6109 +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
6112 /* MP IRQ source entries */
6113 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6114 +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6116 /* # of MP IRQ source entries */
6119 +#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
6120 +int mp_bus_id_to_type[MAX_MP_BUSSES];
6123 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
6125 static int disable_timer_pin_1 __initdata;
6128 @@ -128,7 +135,7 @@ struct io_apic {
6129 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
6131 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
6132 - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
6133 + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
6137 @@ -142,7 +149,7 @@ static inline unsigned int io_apic_read(
6138 struct physdev_apic apic_op;
6141 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6142 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6144 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
6146 @@ -160,7 +167,7 @@ static inline void io_apic_write(unsigne
6148 struct physdev_apic apic_op;
6150 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6151 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6153 apic_op.value = value;
6154 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
6155 @@ -288,7 +295,7 @@ static void __init replace_pin_at_irq(un
6159 -static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
6160 +static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
6162 struct irq_pin_list *entry = irq_2_pin + irq;
6163 unsigned int pin, reg;
6164 @@ -308,30 +315,32 @@ static void __modify_IO_APIC_irq (unsign
6168 -static void __mask_IO_APIC_irq (unsigned int irq)
6169 +static void __mask_IO_APIC_irq(unsigned int irq)
6171 - __modify_IO_APIC_irq(irq, 0x00010000, 0);
6172 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
6176 -static void __unmask_IO_APIC_irq (unsigned int irq)
6177 +static void __unmask_IO_APIC_irq(unsigned int irq)
6179 - __modify_IO_APIC_irq(irq, 0, 0x00010000);
6180 + __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
6183 /* mask = 1, trigger = 0 */
6184 -static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
6185 +static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
6187 - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
6188 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
6189 + IO_APIC_REDIR_LEVEL_TRIGGER);
6192 /* mask = 0, trigger = 1 */
6193 -static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
6194 +static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
6196 - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
6197 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
6198 + IO_APIC_REDIR_MASKED);
6201 -static void mask_IO_APIC_irq (unsigned int irq)
6202 +static void mask_IO_APIC_irq(unsigned int irq)
6204 unsigned long flags;
6206 @@ -340,7 +349,7 @@ static void mask_IO_APIC_irq (unsigned i
6207 spin_unlock_irqrestore(&ioapic_lock, flags);
6210 -static void unmask_IO_APIC_irq (unsigned int irq)
6211 +static void unmask_IO_APIC_irq(unsigned int irq)
6213 unsigned long flags;
6215 @@ -352,7 +361,7 @@ static void unmask_IO_APIC_irq (unsigned
6216 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
6218 struct IO_APIC_route_entry entry;
6221 /* Check delivery_mode to be sure we're not clearing an SMI pin */
6222 entry = ioapic_read_entry(apic, pin);
6223 if (entry.delivery_mode == dest_SMI)
6224 @@ -364,7 +373,7 @@ static void clear_IO_APIC_pin(unsigned i
6225 ioapic_mask_entry(apic, pin);
6228 -static void clear_IO_APIC (void)
6229 +static void clear_IO_APIC(void)
6233 @@ -381,7 +390,7 @@ static void set_ioapic_affinity_irq(unsi
6234 struct irq_pin_list *entry = irq_2_pin + irq;
6235 unsigned int apicid_value;
6239 cpus_and(tmp, cpumask, cpu_online_map);
6240 if (cpus_empty(tmp))
6242 @@ -410,7 +419,7 @@ static void set_ioapic_affinity_irq(unsi
6243 # include <linux/kernel_stat.h> /* kstat */
6244 # include <linux/slab.h> /* kmalloc() */
6245 # include <linux/timer.h>
6248 #define IRQBALANCE_CHECK_ARCH -999
6249 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
6250 #define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
6251 @@ -422,14 +431,14 @@ static int physical_balance __read_mostl
6252 static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
6254 static struct irq_cpu_info {
6255 - unsigned long * last_irq;
6256 - unsigned long * irq_delta;
6257 + unsigned long *last_irq;
6258 + unsigned long *irq_delta;
6260 } irq_cpu_data[NR_CPUS];
6262 #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
6263 -#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
6264 -#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
6265 +#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
6266 +#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
6268 #define IDLE_ENOUGH(cpu,now) \
6269 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
6270 @@ -468,8 +477,8 @@ inside:
6274 - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
6275 - (search_idle && !IDLE_ENOUGH(cpu,now)));
6276 + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
6277 + (search_idle && !IDLE_ENOUGH(cpu, now)));
6281 @@ -479,15 +488,14 @@ static inline void balance_irq(int cpu,
6282 unsigned long now = jiffies;
6283 cpumask_t allowed_mask;
6284 unsigned int new_cpu;
6287 if (irqbalance_disabled)
6291 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
6292 new_cpu = move(cpu, allowed_mask, now, 1);
6293 - if (cpu != new_cpu) {
6294 + if (cpu != new_cpu)
6295 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
6299 static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
6300 @@ -499,14 +507,14 @@ static inline void rotate_irqs_among_cpu
6301 if (!irq_desc[j].action)
6303 /* Is it a significant load ? */
6304 - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
6305 + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
6306 useful_load_threshold)
6311 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6312 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6313 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6317 @@ -535,22 +543,22 @@ static void do_irq_balance(void)
6318 /* Is this an active IRQ or balancing disabled ? */
6319 if (!irq_desc[j].action || irq_balancing_disabled(j))
6321 - if ( package_index == i )
6322 - IRQ_DELTA(package_index,j) = 0;
6323 + if (package_index == i)
6324 + IRQ_DELTA(package_index, j) = 0;
6325 /* Determine the total count per processor per IRQ */
6326 value_now = (unsigned long) kstat_cpu(i).irqs[j];
6328 /* Determine the activity per processor per IRQ */
6329 - delta = value_now - LAST_CPU_IRQ(i,j);
6330 + delta = value_now - LAST_CPU_IRQ(i, j);
6332 /* Update last_cpu_irq[][] for the next time */
6333 - LAST_CPU_IRQ(i,j) = value_now;
6334 + LAST_CPU_IRQ(i, j) = value_now;
6336 /* Ignore IRQs whose rate is less than the clock */
6337 if (delta < useful_load_threshold)
6339 /* update the load for the processor or package total */
6340 - IRQ_DELTA(package_index,j) += delta;
6341 + IRQ_DELTA(package_index, j) += delta;
6343 /* Keep track of the higher numbered sibling as well */
6344 if (i != package_index)
6345 @@ -576,7 +584,8 @@ static void do_irq_balance(void)
6346 max_cpu_irq = ULONG_MAX;
6349 - /* Look for heaviest loaded processor.
6351 + * Look for heaviest loaded processor.
6352 * We may come back to get the next heaviest loaded processor.
6353 * Skip processors with trivial loads.
6355 @@ -585,7 +594,7 @@ tryanothercpu:
6356 for_each_online_cpu(i) {
6357 if (i != CPU_TO_PACKAGEINDEX(i))
6359 - if (max_cpu_irq <= CPU_IRQ(i))
6360 + if (max_cpu_irq <= CPU_IRQ(i))
6362 if (tmp_cpu_irq < CPU_IRQ(i)) {
6363 tmp_cpu_irq = CPU_IRQ(i);
6364 @@ -594,8 +603,9 @@ tryanothercpu:
6367 if (tmp_loaded == -1) {
6368 - /* In the case of small number of heavy interrupt sources,
6369 - * loading some of the cpus too much. We use Ingo's original
6371 + * In the case of a small number of heavy interrupt sources,
6372 + * some cpus get loaded too much. We use Ingo's original
6373 * approach to rotate them around.
6375 if (!first_attempt && imbalance >= useful_load_threshold) {
6376 @@ -604,13 +614,14 @@ tryanothercpu:
6378 goto not_worth_the_effort;
6382 first_attempt = 0; /* heaviest search */
6383 max_cpu_irq = tmp_cpu_irq; /* load */
6384 max_loaded = tmp_loaded; /* processor */
6385 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
6387 - /* if imbalance is less than approx 10% of max load, then
6390 + * if imbalance is less than approx 10% of max load, then
6391 * observe diminishing returns action. - quit
6393 if (imbalance < (max_cpu_irq >> 3))
6394 @@ -626,26 +637,25 @@ tryanotherirq:
6395 /* Is this an active IRQ? */
6396 if (!irq_desc[j].action)
6398 - if (imbalance <= IRQ_DELTA(max_loaded,j))
6399 + if (imbalance <= IRQ_DELTA(max_loaded, j))
6401 /* Try to find the IRQ that is closest to the imbalance
6402 * without going over.
6404 - if (move_this_load < IRQ_DELTA(max_loaded,j)) {
6405 - move_this_load = IRQ_DELTA(max_loaded,j);
6406 + if (move_this_load < IRQ_DELTA(max_loaded, j)) {
6407 + move_this_load = IRQ_DELTA(max_loaded, j);
6411 - if (selected_irq == -1) {
6412 + if (selected_irq == -1)
6416 imbalance = move_this_load;
6419 /* For physical_balance case, we accumulated both load
6420 * values in one of the siblings' cpu_irq[],
6421 * to use the same code for physical and logical processors
6422 - * as much as possible.
6423 + * as much as possible.
6425 * NOTE: the cpu_irq[] array holds the sum of the load for
6426 * sibling A and sibling B in the slot for the lowest numbered
6427 @@ -674,11 +684,11 @@ tryanotherirq:
6428 /* mark for change destination */
6429 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
6431 - /* Since we made a change, come back sooner to
6432 + /* Since we made a change, come back sooner to
6433 * check for more variation.
6435 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6436 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6437 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6441 @@ -689,7 +699,7 @@ not_worth_the_effort:
6444 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
6445 - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6446 + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6450 @@ -728,13 +738,13 @@ static int __init balanced_irq_init(void
6453 cpus_shift_right(tmp, cpu_online_map, 2);
6454 - c = &boot_cpu_data;
6455 + c = &boot_cpu_data;
6456 /* When not overwritten by the command line ask subarchitecture. */
6457 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
6458 irqbalance_disabled = NO_BALANCE_IRQ;
6459 if (irqbalance_disabled)
6463 /* disable irqbalance completely if there is only one processor online */
6464 if (num_online_cpus() < 2) {
6465 irqbalance_disabled = 1;
6466 @@ -748,16 +758,14 @@ static int __init balanced_irq_init(void
6467 physical_balance = 1;
6469 for_each_online_cpu(i) {
6470 - irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6471 - irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6472 + irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6473 + irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6474 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
6475 printk(KERN_ERR "balanced_irq_init: out of memory");
6478 - memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
6479 - memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
6483 printk(KERN_INFO "Starting balanced_irq\n");
6484 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
@@ -799,7 +807,7 @@ void send_IPI_self(int vector)
* Send the IPI. The write to APIC_ICR fires this off.
- apic_write_around(APIC_ICR, cfg);
+ apic_write(APIC_ICR, cfg);
#endif /* !CONFIG_SMP */
@@ -853,10 +861,10 @@ static int find_irq_entry(int apic, int
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == type &&
- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mpc_dstirq == pin)
+ if (mp_irqs[i].mp_irqtype == type &&
+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mp_dstirq == pin)
@@ -871,13 +879,13 @@ static int __init find_isa_irq_pin(int i
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
- return mp_irqs[i].mpc_dstirq;
+ return mp_irqs[i].mp_dstirq;
@@ -887,17 +895,17 @@ static int __init find_isa_irq_apic(int
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
if (i < mp_irq_entries) {
- for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
@@ -918,28 +926,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
"slot:%d, pin:%d.\n", bus, slot, pin);
- if (mp_bus_id_to_pci_bus[bus] == -1) {
+ if (test_bit(bus, mp_bus_not_pci)) {
printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mpc_irqtype &&
+ !mp_irqs[i].mp_irqtype &&
- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
* Use the first all-but-pin matching entry as a
@@ -954,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
- * This function currently is only a helper for the i386 smp boot process where
+ * This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
* so mask in all cases should simply be TARGET_CPUS
@@ -1008,7 +1016,7 @@ static int EISA_ELCR(unsigned int irq)
* EISA conforming in the MP table, that means its trigger type must
* be read in from the ELCR */
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
@@ -1025,118 +1033,115 @@ static int EISA_ELCR(unsigned int irq)
static int MPBIOS_polarity(int idx)
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
* Determine IRQ line polarity (high active or low active):
- switch (mp_irqs[idx].mpc_irqflag & 3)
+ switch (mp_irqs[idx].mp_irqflag & 3) {
+ case 0: /* conforms, ie. bus-type dependent polarity */
- case 0: /* conforms, ie. bus-type dependent polarity */
- polarity = test_bit(bus, mp_bus_not_pci)?
- default_ISA_polarity(idx):
- default_PCI_polarity(idx);
- case 1: /* high active */
- case 2: /* reserved */
- printk(KERN_WARNING "broken BIOS!!\n");
- case 3: /* low active */
- default: /* invalid */
- printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_polarity(idx):
+ default_PCI_polarity(idx);
+ case 1: /* high active */
+ case 2: /* reserved */
+ printk(KERN_WARNING "broken BIOS!!\n");
+ case 3: /* low active */
+ default: /* invalid */
+ printk(KERN_WARNING "broken BIOS!!\n");
static int MPBIOS_trigger(int idx)
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
* Determine IRQ trigger mode (edge or level sensitive):
- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
+ case 0: /* conforms, ie. bus-type dependent */
- case 0: /* conforms, ie. bus-type dependent */
- trigger = test_bit(bus, mp_bus_not_pci)?
- default_ISA_trigger(idx):
- default_PCI_trigger(idx);
+ trigger = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_trigger(idx):
+ default_PCI_trigger(idx);
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
- switch (mp_bus_id_to_type[bus])
- case MP_BUS_ISA: /* ISA pin */
- /* set before the switch */
- case MP_BUS_EISA: /* EISA pin */
- trigger = default_EISA_trigger(idx);
- case MP_BUS_PCI: /* PCI pin */
- /* set before the switch */
- case MP_BUS_MCA: /* MCA pin */
- trigger = default_MCA_trigger(idx);
- printk(KERN_WARNING "broken BIOS!!\n");
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_ISA: /* ISA pin */
+ /* set before the switch */
- case 1: /* edge */
+ case MP_BUS_EISA: /* EISA pin */
+ trigger = default_EISA_trigger(idx);
- case 2: /* reserved */
+ case MP_BUS_PCI: /* PCI pin */
- printk(KERN_WARNING "broken BIOS!!\n");
+ /* set before the switch */
- case 3: /* level */
+ case MP_BUS_MCA: /* MCA pin */
+ trigger = default_MCA_trigger(idx);
- default: /* invalid */
printk(KERN_WARNING "broken BIOS!!\n");
+ case 1: /* edge */
+ case 2: /* reserved */
+ printk(KERN_WARNING "broken BIOS!!\n");
+ case 3: /* level */
+ default: /* invalid */
+ printk(KERN_WARNING "broken BIOS!!\n");
@@ -1153,16 +1158,16 @@ static inline int irq_trigger(int idx)
static int pin_2_irq(int idx, int apic, int pin)
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
* Debugging check, we are in big trouble if this message pops up!
- if (mp_irqs[idx].mpc_dstirq != pin)
+ if (mp_irqs[idx].mp_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci))
- irq = mp_irqs[idx].mpc_srcbusirq;
+ irq = mp_irqs[idx].mp_srcbusirq;
* PCI IRQs are mapped in order
@@ -1204,8 +1209,8 @@ static inline int IO_APIC_irq_trigger(in
for (apic = 0; apic < nr_ioapics; apic++) {
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic,pin,mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
return irq_trigger(idx);
@@ -1291,25 +1296,25 @@ static void __init setup_IO_APIC_irqs(vo
* add it to the IO-APIC irq-routing table:
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest =
+ entry.dest.logical.logical_dest =
cpu_mask_to_apicid(TARGET_CPUS);
- idx = find_irq_entry(apic,pin,mp_INT);
+ idx = find_irq_entry(apic, pin, mp_INT);
apic_printk(APIC_VERBOSE, KERN_DEBUG
" IO-APIC (apicid-pin) %d-%d",
- mp_ioapics[apic].mpc_apicid,
+ mp_ioapics[apic].mp_apicid,
apic_printk(APIC_VERBOSE, ", %d-%d",
- mp_ioapics[apic].mpc_apicid, pin);
+ mp_ioapics[apic].mp_apicid, pin);
@@ -1343,7 +1348,7 @@ static void __init setup_IO_APIC_irqs(vo
vector = assign_irq_vector(irq);
entry.vector = vector;
ioapic_register_intr(irq, vector, IOAPIC_AUTO);
if (!apic && (irq < 16))
disable_8259A_irq(irq);
@@ -1355,27 +1360,23 @@ static void __init setup_IO_APIC_irqs(vo
apic_printk(APIC_VERBOSE, " not connected.\n");
- * Set up the 8259A-master output pin:
+ * Set up the timer pin, possibly with the 8259A-master behind.
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
struct IO_APIC_route_entry entry;
- memset(&entry,0,sizeof(entry));
- disable_8259A_irq(0);
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ memset(&entry, 0, sizeof(entry));
* We use logical delivery to get the timer IRQ
entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* unmask IRQ now */
+ entry.mask = 1; /* mask IRQ now */
entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
@@ -1384,17 +1385,14 @@ static void __init setup_ExtINT_IRQ0_pin
* The timer IRQ doesn't have to know that behind the
- * scene we have a 8259A-master in AEOI mode ...
+ * scene we may have a 8259A-master in AEOI mode ...
- irq_desc[0].chip = &ioapic_chip;
- set_irq_handler(0, handle_edge_irq);
+ ioapic_register_intr(0, vector, IOAPIC_EDGE);
* Add it to the IO-APIC irq-routing table:
ioapic_write_entry(apic, pin, entry);
- enable_8259A_irq(0);
void __init print_IO_APIC(void)
@@ -1409,10 +1407,10 @@ void __init print_IO_APIC(void)
if (apic_verbosity == APIC_QUIET)
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
* We are a bit conservative about what we expect. We have to
@@ -1431,7 +1429,7 @@ void __init print_IO_APIC(void)
reg_03.raw = io_apic_read(apic, 3);
spin_unlock_irqrestore(&ioapic_lock, flags);
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1512,7 +1510,7 @@ void __init print_IO_APIC(void)
-static void print_APIC_bitfield (int base)
+static void print_APIC_bitfield(int base)
@@ -1533,7 +1531,7 @@ static void print_APIC_bitfield (int bas
-void /*__init*/ print_local_APIC(void * dummy)
+void /*__init*/ print_local_APIC(void *dummy)
unsigned int v, ver, maxlvt;
@@ -1542,6 +1540,7 @@ void /*__init*/ print_local_APIC(void *
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
@@ -1616,9 +1615,9 @@ void /*__init*/ print_local_APIC(void *
-void print_all_local_APICs (void)
+void print_all_local_APICs(void)
- on_each_cpu(print_local_APIC, NULL, 1, 1);
+ on_each_cpu(print_local_APIC, NULL, 1);
void /*__init*/ print_PIC(void)
@@ -1639,11 +1638,11 @@ void /*__init*/ print_PIC(void)
v = inb(0xa0) << 8 | inb(0x20);
printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
v = inb(0xa0) << 8 | inb(0x20);
spin_unlock_irqrestore(&i8259A_lock, flags);
@@ -1652,6 +1651,8 @@ void /*__init*/ print_PIC(void)
v = inb(0x4d1) << 8 | inb(0x4d0);
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+void __init print_IO_APIC(void) {}
#endif /* !CONFIG_XEN */
static void __init enable_IO_APIC(void)
@@ -1681,7 +1682,7 @@ static void __init enable_IO_APIC(void)
nr_ioapic_registers[apic] = reg_01.bits.entries+1;
- for(apic = 0; apic < nr_ioapics; apic++) {
+ for (apic = 0; apic < nr_ioapics; apic++) {
/* See if any of the pins is in ExtINT mode */
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
@@ -1774,7 +1775,7 @@ void disable_IO_APIC(void)
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
-#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
static void __init setup_ioapic_ids_from_mpc(void)
union IO_APIC_reg_00 reg_00;
@@ -1784,6 +1785,11 @@ static void __init setup_ioapic_ids_from
unsigned char old_id;
unsigned long flags;
+#ifdef CONFIG_X86_NUMAQ
* Don't check I/O APIC IDs for xAPIC systems. They have
* no meaning without the serial APIC bus.
@@ -1806,15 +1812,15 @@ static void __init setup_ioapic_ids_from
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- old_id = mp_ioapics[apic].mpc_apicid;
- if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+ old_id = mp_ioapics[apic].mp_apicid;
+ if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mpc_apicid);
+ apic, mp_ioapics[apic].mp_apicid);
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+ mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
@@ -1823,9 +1829,9 @@ static void __init setup_ioapic_ids_from
* 'stuck on smp_invalidate_needed IPI wait' messages.
if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mpc_apicid)) {
+ mp_ioapics[apic].mp_apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mpc_apicid);
+ apic, mp_ioapics[apic].mp_apicid);
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
@@ -1834,13 +1840,13 @@ static void __init setup_ioapic_ids_from
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mpc_apicid = i;
+ mp_ioapics[apic].mp_apicid = i;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
- mp_ioapics[apic].mpc_apicid);
+ mp_ioapics[apic].mp_apicid);
physids_or(phys_id_present_map, phys_id_present_map, tmp);
@@ -1849,21 +1855,21 @@ static void __init setup_ioapic_ids_from
* We need to adjust the IRQ routing table
* if the ID changed.
- if (old_id != mp_ioapics[apic].mpc_apicid)
+ if (old_id != mp_ioapics[apic].mp_apicid)
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_dstapic == old_id)
- mp_irqs[i].mpc_dstapic
- = mp_ioapics[apic].mpc_apicid;
+ if (mp_irqs[i].mp_dstapic == old_id)
+ mp_irqs[i].mp_dstapic
+ = mp_ioapics[apic].mp_apicid;
* Read the right value from the MPC table and
* write it into the ID register.
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mpc_apicid);
+ mp_ioapics[apic].mp_apicid);
- reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+ reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0, reg_00.raw);
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1874,17 +1880,13 @@ static void __init setup_ioapic_ids_from
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+ if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
printk("could not set ID!\n");
apic_printk(APIC_VERBOSE, " ok.\n");
-static void __init setup_ioapic_ids_from_mpc(void) { }
int no_timer_check __initdata;
static int __init notimercheck(char *s)
@@ -2077,45 +2079,53 @@ static inline void init_IO_APIC_traps(vo
* The local APIC irq-chip implementation:
-static void ack_apic(unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
-static void mask_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
v = apic_read(APIC_LVT0);
- apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
-static void unmask_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
v = apic_read(APIC_LVT0);
- apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
static struct irq_chip lapic_chip __read_mostly = {
- .name = "local-APIC-edge",
+ .name = "local-APIC",
.mask = mask_lapic_irq,
.unmask = unmask_lapic_irq,
+ .ack = ack_lapic_irq,
+static void lapic_register_intr(int irq, int vector)
+ irq_desc[irq].status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+ set_intr_gate(vector, interrupt[irq]);
static void __init setup_nmi(void)
- * Dirty trick to enable the NMI watchdog ...
+ * Dirty trick to enable the NMI watchdog ...
* We put the 8259A master into AEOI mode and
* unmask on all local APICs LVT0 as NMI.
* The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
* is from Maciej W. Rozycki - so we do not have to EOI from
* the NMI handler or the timer interrupt.
apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
enable_NMI_through_LVT0();
@@ -2191,11 +2201,16 @@ static inline void __init unlock_ExtINT_
static inline void __init check_timer(void)
int apic1, pin1, apic2, pin2;
unsigned long flags;
local_irq_save(flags);
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
* get/set the timer IRQ vector:
@@ -2204,34 +2219,54 @@ static inline void __init check_timer(vo
set_intr_gate(vector, interrupt[0]);
- * Subtle, code in do_timer_interrupt() expects an AEOI
- * mode for the 8259A whenever interrupts are routed
- * through I/O APICs. Also IRQ0 has to be enabled in
- * the 8259A which implies the virtual wire has to be
- * disabled in the local APIC.
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC. Also
+ * timer interrupts need to be acknowledged manually in
+ * the 8259A for the i82489DX when using the NMI
+ * watchdog as that APIC treats NMIs as level-triggered.
+ * The AEOI mode will finish them in the 8259A
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
- if (timer_over_8254 > 0)
- enable_8259A_irq(0);
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
- vector, apic1, pin1, apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ vector, apic1, pin1, apic2, pin2);
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ } else if (pin2 == -1) {
* Ok, does IRQ0 through the IOAPIC work?
+ add_pin_to_irq(0, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, vector);
unmask_IO_APIC_irq(0);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
enable_8259A_irq(0);
@@ -2240,71 +2275,77 @@ static inline void __init check_timer(vo
clear_IO_APIC_pin(apic1, pin1);
- printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
- printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
- printk("\n..... (found pin %d) ...", pin2);
+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+ "(IRQ0) through the 8259A ...\n");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
* legacy devices should be connected to IO APIC #0
- setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, vector);
+ unmask_IO_APIC_irq(0);
+ enable_8259A_irq(0);
if (timer_irq_works()) {
- printk("works.\n");
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
- add_pin_to_irq(0, apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
+ enable_8259A_irq(0);
* Cleanup, just in case ...
+ disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
- printk(" failed.\n");
if (nmi_watchdog == NMI_IO_APIC) {
- printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+ apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+ "through the IO-APIC - disabling NMI Watchdog!\n");
+ nmi_watchdog = NMI_NONE;
- printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as Virtual Wire IRQ...\n");
- disable_8259A_irq(0);
- set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
- apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
+ lapic_register_intr(0, vector);
+ apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
enable_8259A_irq(0);
if (timer_irq_works()) {
- printk(" works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
- printk(" failed.\n");
+ disable_8259A_irq(0);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
- printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as ExtINT IRQ...\n");
- apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
unlock_ExtINT_logic();
if (timer_irq_works()) {
- printk(" works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
- printk(" failed :(.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
- "report. Then try booting with the 'noapic' option");
+ "report. Then try booting with the 'noapic' option.\n");
local_irq_restore(flags);
@@ -2314,11 +2355,21 @@ int timer_uses_ioapic_pin_0 = 0;
- * IRQ's that are handled by the PIC in the MPS IOAPIC case.
- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
- * Linux doesn't really care, as it's not actually used
- * for any interrupt handling anyway.
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices. However there may be an I/O APIC pin available for
+ * this interrupt regardless. The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A. In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table. With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default. We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now. No actual device should request
+ * it anyway. --macro
#define PIC_IRQS (1 << PIC_CASCADE_IR)
@@ -2328,25 +2379,22 @@ void __init setup_IO_APIC(void)
/* Reserve all the system vectors. */
- for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
+ for (i = first_system_vector; i < NR_VECTORS; i++)
set_bit(i, used_vectors);
- io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
- io_apic_irqs = ~PIC_IRQS;
+ io_apic_irqs = ~PIC_IRQS;
printk("ENABLING IO-APIC IRQs\n");
* Set up IO-APIC IRQ routing.
setup_ioapic_ids_from_mpc();
setup_IO_APIC_irqs();
@@ -2356,28 +2404,14 @@ void __init setup_IO_APIC(void)
-static int __init setup_disable_8254_timer(char *s)
- timer_over_8254 = -1;
-static int __init setup_enable_8254_timer(char *s)
- timer_over_8254 = 2;
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
* Called after all the initialization is done. If we didn't find any
* APIC bugs then we can allow the modify fast path
static int __init io_apic_bug_finalize(void)
- if(sis_apic_bug == -1)
+ if (sis_apic_bug == -1)
if (is_initial_xendomain()) {
struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
@@ -2396,17 +2430,17 @@ struct sysfs_ioapic_data {
struct sys_device dev;
struct IO_APIC_route_entry entry[0];
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
struct IO_APIC_route_entry *entry;
struct sysfs_ioapic_data *data;
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
entry[i] = ioapic_read_entry(dev->id, i);
@@ -2419,18 +2453,18 @@ static int ioapic_resume(struct sys_devi
unsigned long flags;
union IO_APIC_reg_00 reg_00;
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
ioapic_write_entry(dev->id, i, entry[i]);
@@ -2444,24 +2478,23 @@ static struct sysdev_class ioapic_sysdev
static int __init ioapic_init_sysfs(void)
- struct sys_device * dev;
+ struct sys_device *dev;
int i, size, error = 0;
error = sysdev_class_register(&ioapic_sysdev_class);
- for (i = 0; i < nr_ioapics; i++ ) {
- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+ for (i = 0; i < nr_ioapics; i++) {
+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
* sizeof(struct IO_APIC_route_entry);
- mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+ mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
if (!mp_ioapic_data[i]) {
printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
- memset(mp_ioapic_data[i], 0, size);
dev = &mp_ioapic_data[i]->dev;
dev->cls = &ioapic_sysdev_class;
error = sysdev_register(dev);
@@ -2538,7 +2571,7 @@ static int msi_compose_msg(struct pci_de
((INT_DEST_MODE == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
+MSI_ADDR_DEST_MODE_PHYSICAL:
MSI_ADDR_DEST_MODE_LOGICAL) |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU:
@@ -2549,7 +2582,7 @@ static int msi_compose_msg(struct pci_de
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
+MSI_DATA_DELIVERY_FIXED:
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(vector);
@@ -2720,12 +2753,12 @@ int arch_setup_ht_irq(unsigned int irq,
#endif /* CONFIG_HT_IRQ */
/* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
+ ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
-int __init io_apic_get_unique_id (int ioapic, int apic_id)
+int __init io_apic_get_unique_id(int ioapic, int apic_id)
union IO_APIC_reg_00 reg_00;
@@ -2735,10 +2768,10 @@ int __init io_apic_get_unique_id (int io
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where predecessors only
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
* supports up to 16 on one shared APIC bus.
* TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
* advantage of new APIC bus architecture.
@@ -2757,7 +2790,7 @@ int __init io_apic_get_unique_id (int io
- * Every APIC in a system must have a unique ID or we get lots of nice
+ * Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
if (check_apicid_used(apic_id_map, apic_id)) {
@@ -2774,7 +2807,7 @@ int __init io_apic_get_unique_id (int io
"trying %d\n", ioapic, apic_id, i);
tmp = apicid_to_cpu_present(apic_id);
physids_or(apic_id_map, apic_id_map, tmp);
@@ -2802,7 +2835,7 @@ int __init io_apic_get_unique_id (int io
-int __init io_apic_get_version (int ioapic)
+int __init io_apic_get_version(int ioapic)
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2815,7 +2848,7 @@ int __init io_apic_get_version (int ioap
-int __init io_apic_get_redir_entries (int ioapic)
+int __init io_apic_get_redir_entries(int ioapic)
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2828,7 +2861,7 @@ int __init io_apic_get_redir_entries (in
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
struct IO_APIC_route_entry entry;
@@ -2844,7 +2877,7 @@ int io_apic_set_pci_routing (int ioapic,
* corresponding device driver registers for this IRQ.
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
@@ -2863,7 +2896,7 @@ int io_apic_set_pci_routing (int ioapic,
apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
- mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+ mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
edge_level, active_high_low);
ioapic_register_intr(irq, entry.vector, edge_level);
@@ -2884,8 +2917,8 @@ int acpi_get_override_irq(int bus_irq, i
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == mp_INT &&
- mp_irqs[i].mpc_srcbusirq == bus_irq)
+ if (mp_irqs[i].mp_irqtype == mp_INT &&
+ mp_irqs[i].mp_srcbusirq == bus_irq)
if (i >= mp_irq_entries)
@@ -2918,3 +2951,35 @@ static int __init parse_noapic(char *arg
early_param("noapic", parse_noapic);
+void __init ioapic_init_mappings(void)
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ for (i = 0; i < nr_ioapics; i++) {
+ if (smp_found_config) {
+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
+ if (!ioapic_phys) {
+ "WARNING: bogus zero IO-APIC "
+ "address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ skip_ioapic_setup = 1;
+ goto fake_ioapic_page;
+ ioapic_phys = (unsigned long)
+ alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = __pa(ioapic_phys);
+ set_fixmap_nocache(idx, ioapic_phys);
+ printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx), ioapic_phys);
--- sle11-2009-06-04.orig/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-06-04/arch/x86/kernel/io_apic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
#include <asm/proto.h>
#include <asm/acpi.h>
#include <asm/dma.h>
+#include <asm/i8259.h>
#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
@@ -63,10 +64,16 @@ struct irq_cfg {
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
static int assign_irq_vector(int irq, cpumask_t mask);
+int first_system_vector = 0xfe;
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
#define __apicdebuginit __init
int sis_apic_bug; /* not actually supported, dummy for compile */
@@ -89,14 +96,14 @@ unsigned long io_apic_irqs;
#define clear_IO_APIC() ((void)0)
-int timer_over_8254 __initdata = 1;
+int timer_through_8259 __initdata;
/* Where if anywhere is the i8259 connect in external int mode */
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
static DEFINE_SPINLOCK(ioapic_lock);
-DEFINE_SPINLOCK(vector_lock);
+static DEFINE_SPINLOCK(vector_lock);
* # of IRQ routing registers
@@ -104,15 +111,17 @@ DEFINE_SPINLOCK(vector_lock);
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
/* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
* Rough estimation of how many shared IRQs there are, can
* be changed anytime.
@@ -141,7 +150,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
@@ -155,7 +164,7 @@ static inline unsigned int io_apic_read(
struct physdev_apic apic_op;
- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
@@ -173,7 +182,7 @@ static inline void io_apic_write(unsigne
struct physdev_apic apic_op;
- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
apic_op.value = value;
WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
@@ -209,7 +218,7 @@ static bool io_apic_level_ack_pending(un
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- if ((reg >> 14) & 1) {
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -308,7 +317,7 @@ static void __target_IO_APIC_irq(unsigne
io_apic_write(apic, 0x11 + pin*2, dest);
reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~0x000000ff;
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
io_apic_modify(apic, reg);
@@ -372,6 +381,26 @@ static void add_pin_to_irq(unsigned int
+ * Reroute an IRQ to a different pin.
+static void __init replace_pin_at_irq(unsigned int irq,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+ struct irq_pin_list *entry = irq_2_pin + irq;
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ entry = irq_2_pin + entry->next;
#define __DO_ACTION(R, ACTION, FINAL) \
@@ -399,10 +428,11 @@ static void add_pin_to_irq(unsigned int
static void name##_IO_APIC_irq (unsigned int irq) \
__DO_ACTION(R, ACTION, FINAL)
-DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
-DO_ACTION( __unmask, 0, &= 0xfffeffff, )
+DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
static void mask_IO_APIC_irq (unsigned int irq)
@@ -465,22 +495,6 @@ static int __init disable_timer_pin_setu
__setup("disable_timer_pin_1", disable_timer_pin_setup);
-static int __init setup_disable_8254_timer(char *s)
- timer_over_8254 = -1;
-static int __init setup_enable_8254_timer(char *s)
- timer_over_8254 = 2;
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-#endif /* !CONFIG_XEN */
* Find the IRQ entry number of a certain pin.
@@ -490,10 +504,10 @@ static int find_irq_entry(int apic, int
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == type &&
- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mpc_dstirq == pin)
+ if (mp_irqs[i].mp_irqtype == type &&
+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mp_dstirq == pin)
@@ -508,13 +522,13 @@ static int __init find_isa_irq_pin(int i
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
- return mp_irqs[i].mpc_dstirq;
+ return mp_irqs[i].mp_dstirq;
@@ -524,17 +538,17 @@ static int __init find_isa_irq_apic(int
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
if (i < mp_irq_entries) {
for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
@@ -555,28 +569,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
- if (mp_bus_id_to_pci_bus[bus] == -1) {
+ if (test_bit(bus, mp_bus_not_pci)) {
apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mpc_irqtype &&
+ !mp_irqs[i].mp_irqtype &&
- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
* Use the first all-but-pin matching entry as a
@@ -604,13 +618,13 @@ int IO_APIC_get_PCI_irq_vector(int bus,
static int MPBIOS_polarity(int idx)
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
* Determine IRQ line polarity (high active or low active):
- switch (mp_irqs[idx].mpc_irqflag & 3)
+ switch (mp_irqs[idx].mp_irqflag & 3)
case 0: /* conforms, ie. bus-type dependent polarity */
if (test_bit(bus, mp_bus_not_pci))
@@ -646,13 +660,13 @@ static int MPBIOS_polarity(int idx)
static int MPBIOS_trigger(int idx)
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
* Determine IRQ trigger mode (edge or level sensitive):
- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
case 0: /* conforms, ie. bus-type dependent */
if (test_bit(bus, mp_bus_not_pci))
@@ -699,16 +713,16 @@ static inline int irq_trigger(int idx)
static int pin_2_irq(int idx, int apic, int pin)
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
* Debugging check, we are in big trouble if this message pops up!
- if (mp_irqs[idx].mpc_dstirq != pin)
+ if (mp_irqs[idx].mp_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci)) {
- irq = mp_irqs[idx].mpc_srcbusirq;
+ irq = mp_irqs[idx].mp_srcbusirq;
* PCI IRQs are mapped in order
@@ -722,6 +736,19 @@ static int pin_2_irq(int idx, int apic,
+void lock_vector_lock(void)
+ /* Used to the online set of cpus does not change
+ * during assign_irq_vector.
+ spin_lock(&vector_lock);
+void unlock_vector_lock(void)
+ spin_unlock(&vector_lock);
static int __assign_irq_vector(int irq, cpumask_t mask)
struct physdev_irq irq_op;
@@ -773,7 +800,7 @@ static void __clear_irq_vector(int irq)
vector = cfg->vector;
cpus_and(mask, cfg->domain, cpu_online_map);
- for_each_cpu_mask(cpu, mask)
+ for_each_cpu_mask_nr(cpu, mask)
per_cpu(vector_irq, cpu)[vector] = -1;
@@ -842,7 +869,7 @@ static void setup_IO_APIC_irq(int apic,
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
"IRQ %d Mode:%i Active:%i)\n",
- apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+ apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
irq, trigger, polarity);
@@ -883,10 +910,10 @@ static void __init setup_IO_APIC_irqs(vo
idx = find_irq_entry(apic,pin,mp_INT);
- apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
- apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
if (!first_notcon) {
@@ -908,26 +935,21 @@ static void __init setup_IO_APIC_irqs(vo
- * Set up the 8259A-master output pin as broadcast to all
+ * Set up the timer pin, possibly with the 8259A-master behind.
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
- disable_8259A_irq(0);
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
* We use logical delivery to get the timer IRQ
entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* unmask IRQ now */
+ entry.mask = 1; /* mask IRQ now */
entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
@@ -936,7 +958,7 @@ static void __init setup_ExtINT_IRQ0_pin
* The timer IRQ doesn't have to know that behind the
- * scene we have a 8259A-master in AEOI mode ...
+ * scene we may have a 8259A-master in AEOI mode ...
set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
@@ -944,8 +966,6 @@ static void __init setup_ExtINT_IRQ0_pin
* Add it to the IO-APIC irq-routing table:
ioapic_write_entry(apic, pin, entry);
- enable_8259A_irq(0);
void __apicdebuginit print_IO_APIC(void)
@@ -962,7 +982,7 @@ void __apicdebuginit print_IO_APIC(void)
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
* We are a bit conservative about what we expect. We have to
@@ -980,7 +1000,7 @@ void __apicdebuginit print_IO_APIC(void)
spin_unlock_irqrestore(&ioapic_lock, flags);
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
@@ -1072,6 +1092,7 @@ void __apicdebuginit print_local_APIC(vo
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
@@ -1141,7 +1162,7 @@ void __apicdebuginit print_local_APIC(vo
void print_all_local_APICs (void)
- on_each_cpu(print_local_APIC, NULL, 1, 1);
+ on_each_cpu(print_local_APIC, NULL, 1);
void __apicdebuginit print_PIC(void)
@@ -1175,6 +1196,8 @@ void __apicdebuginit print_PIC(void)
v = inb(0x4d1) << 8 | inb(0x4d0);
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+void __apicdebuginit print_IO_APIC(void) {}
#endif /* !CONFIG_XEN */
void __init enable_IO_APIC(void)
@@ -1359,12 +1382,10 @@ static unsigned int startup_ioapic_irq(u
static int ioapic_retrigger_irq(unsigned int irq)
struct irq_cfg *cfg = &irq_cfg[irq];
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- mask = cpumask_of_cpu(first_cpu(cfg->domain));
- send_IPI_mask(mask, cfg->vector);
+ send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
@@ -1545,7 +1566,7 @@ static inline void init_IO_APIC_traps(vo
-static void enable_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
@@ -1553,7 +1574,7 @@ static void enable_lapic_irq (unsigned i
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
-static void disable_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
@@ -1566,19 +1587,20 @@ static void ack_lapic_irq (unsigned int
-static void end_lapic_irq (unsigned int i) { /* nothing */ }
-static struct hw_interrupt_type lapic_irq_type __read_mostly = {
- .name = "local-APIC",
- .typename = "local-APIC-edge",
- .startup = NULL, /* startup_irq() not used for IRQ0 */
- .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
- .enable = enable_lapic_irq,
- .disable = disable_lapic_irq,
- .ack = ack_lapic_irq,
- .end = end_lapic_irq,
+static struct irq_chip lapic_chip __read_mostly = {
+ .name = "local-APIC",
+ .mask = mask_lapic_irq,
+ .unmask = unmask_lapic_irq,
+ .ack = ack_lapic_irq,
+static void lapic_register_intr(int irq)
+ irq_desc[irq].status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
static void __init setup_nmi(void)
@@ -1664,6 +1686,7 @@ static inline void __init check_timer(vo
struct irq_cfg *cfg = irq_cfg + 0;
int apic1, pin1, apic2, pin2;
unsigned long flags;
local_irq_save(flags);
@@ -1674,34 +1697,48 @@ static inline void __init check_timer(vo
assign_irq_vector(0, TARGET_CPUS);
- * Subtle, code in do_timer_interrupt() expects an AEOI
- * mode for the 8259A whenever interrupts are routed
- * through I/O APICs. Also IRQ0 has to be enabled in
- * the 8259A which implies the virtual wire has to be
- * disabled in the local APIC.
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC.
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
- if (timer_over_8254 > 0)
- enable_8259A_irq(0);
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
- cfg->vector, apic1, pin1, apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ } else if (pin2 == -1) {
* Ok, does IRQ0 through the IOAPIC work?
+ add_pin_to_irq(0, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
unmask_IO_APIC_irq(0);
if (!no_timer_check && timer_irq_works()) {
- nmi_watchdog_default();
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
enable_8259A_irq(0);
@@ -1710,54 +1747,62 @@ static inline void __init check_timer(vo
clear_IO_APIC_pin(apic1, pin1);
- apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
- "connected to IO-APIC\n");
- apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
- "through the 8259A ... ");
- apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+ "(IRQ0) through the 8259A ...\n");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
* legacy devices should be connected to IO APIC #0
- setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+ unmask_IO_APIC_irq(0);
+ enable_8259A_irq(0);
if (timer_irq_works()) {
- apic_printk(APIC_VERBOSE," works.\n");
- nmi_watchdog_default();
+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
+ enable_8259A_irq(0);
* Cleanup, just in case ...
+ disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
- apic_printk(APIC_VERBOSE," failed.\n");
if (nmi_watchdog == NMI_IO_APIC) {
- printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+ apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+ "through the IO-APIC - disabling NMI Watchdog!\n");
+ nmi_watchdog = NMI_NONE;
- apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as Virtual Wire IRQ...\n");
- disable_8259A_irq(0);
- irq_desc[0].chip = &lapic_irq_type;
+ lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
enable_8259A_irq(0);
if (timer_irq_works()) {
- apic_printk(APIC_VERBOSE," works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ disable_8259A_irq(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
- apic_printk(APIC_VERBOSE," failed.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
- apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as ExtINT IRQ...\n");
@@ -1766,11 +1811,12 @@ static inline void __init check_timer(vo
unlock_ExtINT_logic();
if (timer_irq_works()) {
- apic_printk(APIC_VERBOSE," works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
- apic_printk(APIC_VERBOSE," failed :(.\n");
- panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
+ "report. Then try booting with the 'noapic' option.\n");
local_irq_restore(flags);
@@ -1788,10 +1834,21 @@ __setup("no_timer_check", notimercheck);
8410 - * IRQs that are handled by the PIC in the MPS IOAPIC case.
8411 - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
8412 - * Linux doesn't really care, as it's not actually used
8413 - * for any interrupt handling anyway.
8414 + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
8415 + * to devices. However there may be an I/O APIC pin available for
8416 + * this interrupt regardless. The pin may be left unconnected, but
8417 + * typically it will be reused as an ExtINT cascade interrupt for
8418 + * the master 8259A. In the MPS case such a pin will normally be
8419 + * reported as an ExtINT interrupt in the MP table. With ACPI
8420 + * there is no provision for ExtINT interrupts, and in the absence
8421 + * of an override it would be treated as an ordinary ISA I/O APIC
8422 + * interrupt, that is edge-triggered and unmasked by default. We
8423 + * used to do this, but it caused problems on some systems because
8424 + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
8425 + * the same ExtINT cascade interrupt to drive the local APIC of the
8426 + * bootstrap processor. Therefore we refrain from routing IRQ2 to
8427 + * the I/O APIC in all cases now. No actual device should request
8428 + * it anyway. --macro
8430 #define PIC_IRQS (1<<2)
8432 @@ -1799,10 +1856,7 @@ void __init setup_IO_APIC(void)
8437 - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
8439 - io_apic_irqs = ~PIC_IRQS;
8440 + io_apic_irqs = ~PIC_IRQS;
8442 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
8444 @@ -1851,8 +1905,8 @@ static int ioapic_resume(struct sys_devi
8446 spin_lock_irqsave(&ioapic_lock, flags);
8447 reg_00.raw = io_apic_read(dev->id, 0);
8448 - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
8449 - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
8450 + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
8451 + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
8452 io_apic_write(dev->id, 0, reg_00.raw);
8454 spin_unlock_irqrestore(&ioapic_lock, flags);
8455 @@ -2254,8 +2308,8 @@ int acpi_get_override_irq(int bus_irq, i
8458 for (i = 0; i < mp_irq_entries; i++)
8459 - if (mp_irqs[i].mpc_irqtype == mp_INT &&
8460 - mp_irqs[i].mpc_srcbusirq == bus_irq)
8461 + if (mp_irqs[i].mp_irqtype == mp_INT &&
8462 + mp_irqs[i].mp_srcbusirq == bus_irq)
8464 if (i >= mp_irq_entries)
8466 @@ -2349,7 +2403,7 @@ void __init ioapic_init_mappings(void)
8467 ioapic_res = ioapic_setup_resources();
8468 for (i = 0; i < nr_ioapics; i++) {
8469 if (smp_found_config) {
8470 - ioapic_phys = mp_ioapics[i].mpc_apicaddr;
8471 + ioapic_phys = mp_ioapics[i].mp_apicaddr;
8473 ioapic_phys = (unsigned long)
8474 alloc_bootmem_pages(PAGE_SIZE);
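A note on the mechanical part of this hunk series: the io_apic code stops peeking at the raw MP-table structs and reads the kernel's own copies instead, hence the mpc_apicid/mpc_apicaddr to mp_apicid/mp_apicaddr renames. The acpi_get_override_irq() scan touched above reduces to a linear search over mp_irqs; here is a self-contained userspace sketch of that pattern (types, values and the helper name are illustrative, not the kernel's):

#include <stdio.h>

#define MP_INT 0	/* stand-in for the kernel's mp_INT irqtype */

struct mp_config_intsrc {
	int mp_irqtype;
	int mp_srcbusirq;
};

static struct mp_config_intsrc mp_irqs[] = {
	{ MP_INT, 1 }, { MP_INT, 9 }, { 3 /* ExtINT */, 0 },
};
static const int mp_irq_entries = sizeof(mp_irqs) / sizeof(mp_irqs[0]);

/* mirrors the scan in acpi_get_override_irq(): the first mp_INT entry
 * whose source-bus IRQ matches wins */
static int find_override(int bus_irq)
{
	int i;

	for (i = 0; i < mp_irq_entries; i++)
		if (mp_irqs[i].mp_irqtype == MP_INT &&
		    mp_irqs[i].mp_srcbusirq == bus_irq)
			return i;
	return -1;
}

int main(void)
{
	printf("entry for bus IRQ 9: %d\n", find_override(9));	/* prints 1 */
	return 0;
}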
8475 --- sle11-2009-06-04.orig/arch/x86/kernel/ipi-xen.c 2009-03-16 16:38:05.000000000 +0100
8476 +++ sle11-2009-06-04/arch/x86/kernel/ipi-xen.c 2009-06-04 10:21:39.000000000 +0200
8478 #include <linux/kernel_stat.h>
8479 #include <linux/mc146818rtc.h>
8480 #include <linux/cache.h>
8481 -#include <linux/interrupt.h>
8482 #include <linux/cpu.h>
8483 #include <linux/module.h>
8485 @@ -85,7 +84,7 @@ void __send_IPI_shortcut(unsigned int sh
8487 * Send the IPI. The write to APIC_ICR fires this off.
8489 - apic_write_around(APIC_ICR, cfg);
8490 + apic_write(APIC_ICR, cfg);
8494 @@ -132,7 +131,7 @@ static inline void __send_IPI_dest_field
8495 * prepare target chip field
8497 cfg = __prepare_ICR2(mask);
8498 - apic_write_around(APIC_ICR2, cfg);
8499 + apic_write(APIC_ICR2, cfg);
8503 @@ -142,7 +141,7 @@ static inline void __send_IPI_dest_field
8505 * Send the IPI. The write to APIC_ICR fires this off.
8507 - apic_write_around(APIC_ICR, cfg);
8508 + apic_write(APIC_ICR, cfg);
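The apic_write_around() calls collapse to plain apic_write(); as far as I recall the *_around variant only existed to dodge write errata on very old local APICs, and 2.6.27 drops the distinction, so the IPI path now writes ICR2 and then ICR directly. For orientation, a sketch of how an ICR value is put together before that single write (the constants follow the architectural ICR layout, the helper name is invented for this sketch):

#include <stdio.h>
#include <stdint.h>

#define APIC_DEST_ALLBUT	0x000C0000u	/* shorthand: all excluding self */
#define APIC_DEST_LOGICAL	0x00000800u	/* logical destination mode */
#define APIC_DM_FIXED		0x00000000u	/* fixed delivery mode */

/* hypothetical stand-in for the kernel's __prepare_ICR() */
static uint32_t prepare_icr(uint32_t shortcut, uint8_t vector)
{
	return shortcut | APIC_DEST_LOGICAL | APIC_DM_FIXED | vector;
}

int main(void)
{
	/* the value that would be handed to apic_write(APIC_ICR, cfg) */
	printf("ICR = %#010x\n", (unsigned int)prepare_icr(APIC_DEST_ALLBUT, 0xfd));
	return 0;
}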
8512 --- sle11-2009-06-04.orig/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:38:05.000000000 +0100
8513 +++ sle11-2009-06-04/arch/x86/kernel/irq_32-xen.c 2009-06-04 10:21:39.000000000 +0200
8514 @@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
8518 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
8519 +/* Debugging check for stack overflow: is there less than 1KB free? */
8520 +static int check_stack_overflow(void)
8524 + __asm__ __volatile__("andl %%esp,%0" :
8525 + "=r" (sp) : "0" (THREAD_SIZE - 1));
8527 + return sp < (sizeof(struct thread_info) + STACK_WARN);
8530 +static void print_stack_overflow(void)
8532 + printk(KERN_WARNING "low stack detected by irq handler\n");
8537 +static inline int check_stack_overflow(void) { return 0; }
8538 +static inline void print_stack_overflow(void) { }
8541 #ifdef CONFIG_4KSTACKS
8543 * per-CPU IRQ handling contexts (thread information and stack)
8544 @@ -59,48 +82,26 @@ union irq_ctx {
8546 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
8547 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
8551 - * do_IRQ handles all normal device IRQ's (the special
8552 - * SMP cross-CPU interrupts have their own specific
8555 -unsigned int do_IRQ(struct pt_regs *regs)
8557 - struct pt_regs *old_regs;
8558 - /* high bit used in ret_from_ code */
8559 - int irq = ~regs->orig_ax;
8560 - struct irq_desc *desc = irq_desc + irq;
8561 -#ifdef CONFIG_4KSTACKS
8562 - union irq_ctx *curctx, *irqctx;
8566 - if (unlikely((unsigned)irq >= NR_IRQS)) {
8567 - printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8571 +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8572 +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8574 - old_regs = set_irq_regs(regs);
8576 -#ifdef CONFIG_DEBUG_STACKOVERFLOW
8577 - /* Debugging check for stack overflow: is there less than 1KB free? */
8581 - __asm__ __volatile__("andl %%esp,%0" :
8582 - "=r" (sp) : "0" (THREAD_SIZE - 1));
8583 - if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
8584 - printk("do_IRQ: stack overflow: %ld\n",
8585 - sp - sizeof(struct thread_info));
8590 +static void call_on_stack(void *func, void *stack)
8592 + asm volatile("xchgl %%ebx,%%esp \n"
8594 + "movl %%ebx,%%esp \n"
8598 + : "memory", "cc", "edx", "ecx", "eax");
8601 -#ifdef CONFIG_4KSTACKS
8603 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
8605 + union irq_ctx *curctx, *irqctx;
8606 + u32 *isp, arg1, arg2;
8608 curctx = (union irq_ctx *) current_thread_info();
8609 irqctx = hardirq_ctx[smp_processor_id()];
8610 @@ -111,52 +112,39 @@ unsigned int do_IRQ(struct pt_regs *regs
8611 * handler) we can't do that and just have to keep using the
8612 * current stack (which is the irq stack already after all)
8614 - if (curctx != irqctx) {
8615 - int arg1, arg2, bx;
8617 - /* build the stack frame on the IRQ stack */
8618 - isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8619 - irqctx->tinfo.task = curctx->tinfo.task;
8620 - irqctx->tinfo.previous_esp = current_stack_pointer;
8621 + if (unlikely(curctx == irqctx))
8625 - * Copy the softirq bits in preempt_count so that the
8626 - * softirq checks work in the hardirq context.
8628 - irqctx->tinfo.preempt_count =
8629 - (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8630 - (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8633 - " xchgl %%ebx,%%esp \n"
8635 - " movl %%ebx,%%esp \n"
8636 - : "=a" (arg1), "=d" (arg2), "=b" (bx)
8637 - : "0" (irq), "1" (desc), "2" (isp),
8638 - "D" (desc->handle_irq)
8639 - : "memory", "cc", "ecx"
8643 - desc->handle_irq(irq, desc);
8644 + /* build the stack frame on the IRQ stack */
8645 + isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
8646 + irqctx->tinfo.task = curctx->tinfo.task;
8647 + irqctx->tinfo.previous_esp = current_stack_pointer;
8650 - set_irq_regs(old_regs);
8652 + * Copy the softirq bits in preempt_count so that the
8653 + * softirq checks work in the hardirq context.
8655 + irqctx->tinfo.preempt_count =
8656 + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8657 + (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8659 + if (unlikely(overflow))
8660 + call_on_stack(print_stack_overflow, isp);
8662 + asm volatile("xchgl %%ebx,%%esp \n"
8664 + "movl %%ebx,%%esp \n"
8665 + : "=a" (arg1), "=d" (arg2), "=b" (isp)
8666 + : "0" (irq), "1" (desc), "2" (isp),
8667 + "D" (desc->handle_irq)
8668 + : "memory", "cc", "ecx");
8672 -#ifdef CONFIG_4KSTACKS
8674 -static char softirq_stack[NR_CPUS * THREAD_SIZE]
8675 - __attribute__((__section__(".bss.page_aligned")));
8677 -static char hardirq_stack[NR_CPUS * THREAD_SIZE]
8678 - __attribute__((__section__(".bss.page_aligned")));
8681 * allocate per-cpu stacks for hardirq and for softirq processing
8683 -void irq_ctx_init(int cpu)
8684 +void __cpuinit irq_ctx_init(int cpu)
8686 union irq_ctx *irqctx;
8688 @@ -164,25 +152,25 @@ void irq_ctx_init(int cpu)
8691 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
8692 - irqctx->tinfo.task = NULL;
8693 - irqctx->tinfo.exec_domain = NULL;
8694 - irqctx->tinfo.cpu = cpu;
8695 - irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8696 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8697 + irqctx->tinfo.task = NULL;
8698 + irqctx->tinfo.exec_domain = NULL;
8699 + irqctx->tinfo.cpu = cpu;
8700 + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8701 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8703 hardirq_ctx[cpu] = irqctx;
8705 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
8706 - irqctx->tinfo.task = NULL;
8707 - irqctx->tinfo.exec_domain = NULL;
8708 - irqctx->tinfo.cpu = cpu;
8709 - irqctx->tinfo.preempt_count = 0;
8710 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8711 + irqctx->tinfo.task = NULL;
8712 + irqctx->tinfo.exec_domain = NULL;
8713 + irqctx->tinfo.cpu = cpu;
8714 + irqctx->tinfo.preempt_count = 0;
8715 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8717 softirq_ctx[cpu] = irqctx;
8719 - printk("CPU %u irqstacks, hard=%p soft=%p\n",
8720 - cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8721 + printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
8722 + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8725 void irq_ctx_exit(int cpu)
8726 @@ -211,25 +199,56 @@ asmlinkage void do_softirq(void)
8727 /* build the stack frame on the softirq stack */
8728 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8731 - " xchgl %%ebx,%%esp \n"
8732 - " call __do_softirq \n"
8733 - " movl %%ebx,%%esp \n"
8736 - : "memory", "cc", "edx", "ecx", "eax"
8738 + call_on_stack(__do_softirq, isp);
8740 * Shouldn't happen, we returned above if in_interrupt():
8743 WARN_ON_ONCE(softirq_count());
8746 local_irq_restore(flags);
8751 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
8755 + * do_IRQ handles all normal device IRQ's (the special
8756 + * SMP cross-CPU interrupts have their own specific
8759 +unsigned int do_IRQ(struct pt_regs *regs)
8761 + struct pt_regs *old_regs;
8762 + /* high bit used in ret_from_ code */
8763 + int overflow, irq = ~regs->orig_ax;
8764 + struct irq_desc *desc = irq_desc + irq;
8766 + if (unlikely((unsigned)irq >= NR_IRQS)) {
8767 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8772 + old_regs = set_irq_regs(regs);
8775 + overflow = check_stack_overflow();
8777 + if (!execute_on_irq_stack(overflow, desc, irq)) {
8778 + if (unlikely(overflow))
8779 + print_stack_overflow();
8780 + desc->handle_irq(irq, desc);
8784 + set_irq_regs(old_regs);
8789 * Interrupt statistics:
8792 @@ -337,6 +356,42 @@ skip:
8797 + * /proc/stat helpers
8799 +u64 arch_irq_stat_cpu(unsigned int cpu)
8801 + u64 sum = nmi_count(cpu);
8803 +#ifdef CONFIG_X86_LOCAL_APIC
8804 + sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
8807 + sum += per_cpu(irq_stat, cpu).irq_resched_count;
8808 + sum += per_cpu(irq_stat, cpu).irq_call_count;
8810 + sum += per_cpu(irq_stat, cpu).irq_tlb_count;
8813 +#ifdef CONFIG_X86_MCE
8814 + sum += per_cpu(irq_stat, cpu).irq_thermal_count;
8816 +#ifdef CONFIG_X86_LOCAL_APIC
8817 + sum += per_cpu(irq_stat, cpu).irq_spurious_count;
8822 +u64 arch_irq_stat(void)
8824 + u64 sum = atomic_read(&irq_err_count);
8826 +#ifdef CONFIG_X86_IO_APIC
8827 + sum += atomic_read(&irq_mis_count);
8832 #ifdef CONFIG_HOTPLUG_CPU
8834 void fixup_irqs(cpumask_t map)
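The stack-overflow test that used to live inline in do_IRQ() is now the check_stack_overflow()/print_stack_overflow() pair above. It works because i386 kernel stacks are THREAD_SIZE-aligned, so masking %esp with THREAD_SIZE-1 gives the offset into the stack area. The arithmetic, compilable on its own (a userspace stack is not aligned this way, so the printed result is illustrative only; all constants here are stand-ins):

#include <stdio.h>

#define THREAD_SIZE		8192UL	/* power of two, as in the kernel */
#define STACK_WARN		1024UL	/* "less than 1KB free" threshold */
#define THREAD_INFO_SIZE	64UL	/* stand-in for sizeof(struct thread_info) */

static int check_stack_overflow(void)
{
	long dummy;
	/* offset of the stack pointer within the THREAD_SIZE-aligned area */
	unsigned long sp = (unsigned long)&dummy & (THREAD_SIZE - 1);

	return sp < (THREAD_INFO_SIZE + STACK_WARN);
}

int main(void)
{
	printf("overflow suspected: %d\n", check_stack_overflow());
	return 0;
}

The refactored call_on_stack()/execute_on_irq_stack() pair then performs the actual stack switch with the same xchgl %%ebx,%%esp trick the old inline asm in do_IRQ() used.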
8835 --- sle11-2009-06-04.orig/arch/x86/kernel/irq_64-xen.c 2009-03-16 16:33:40.000000000 +0100
8836 +++ sle11-2009-06-04/arch/x86/kernel/irq_64-xen.c 2009-06-04 10:21:39.000000000 +0200
8837 @@ -163,6 +163,34 @@ skip:
8841 + * /proc/stat helpers
8843 +u64 arch_irq_stat_cpu(unsigned int cpu)
8845 + u64 sum = cpu_pda(cpu)->__nmi_count;
8847 + sum += cpu_pda(cpu)->apic_timer_irqs;
8849 + sum += cpu_pda(cpu)->irq_resched_count;
8850 + sum += cpu_pda(cpu)->irq_call_count;
8852 + sum += cpu_pda(cpu)->irq_tlb_count;
8855 +#ifdef CONFIG_X86_MCE
8856 + sum += cpu_pda(cpu)->irq_thermal_count;
8857 + sum += cpu_pda(cpu)->irq_threshold_count;
8859 + sum += cpu_pda(cpu)->irq_spurious_count;
8863 +u64 arch_irq_stat(void)
8865 + return atomic_read(&irq_err_count);
8869 * do_IRQ handles all normal device IRQ's (the special
8870 * SMP cross-CPU interrupts have their own specific
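The new arch_irq_stat_cpu() simply folds the x86-64 per-CPU PDA counters into one figure for /proc/stat. Modeled with a plain struct, config #ifdefs dropped for brevity (the field names are copied from the hunk, everything else is illustrative):

#include <stdio.h>
#include <stdint.h>

struct cpu_pda {
	unsigned int __nmi_count;
	unsigned int apic_timer_irqs;
	unsigned int irq_resched_count;
	unsigned int irq_call_count;
	unsigned int irq_tlb_count;
	unsigned int irq_thermal_count;
	unsigned int irq_threshold_count;
	unsigned int irq_spurious_count;
};

static uint64_t arch_irq_stat_cpu(const struct cpu_pda *pda)
{
	uint64_t sum = pda->__nmi_count;

	sum += pda->apic_timer_irqs;
	sum += pda->irq_resched_count;
	sum += pda->irq_call_count;
	sum += pda->irq_tlb_count;
	sum += pda->irq_thermal_count;
	sum += pda->irq_threshold_count;
	sum += pda->irq_spurious_count;
	return sum;
}

int main(void)
{
	struct cpu_pda pda = { .apic_timer_irqs = 1000, .irq_call_count = 42 };

	printf("irq total: %llu\n",
	       (unsigned long long)arch_irq_stat_cpu(&pda));
	return 0;
}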
8872 --- sle11-2009-06-04.orig/arch/x86/kernel/ldt-xen.c 2009-03-16 16:33:40.000000000 +0100
8873 +++ sle11-2009-06-04/arch/x86/kernel/ldt-xen.c 2009-06-04 10:21:39.000000000 +0200
8875 #include <asm/mmu_context.h>
8878 -static void flush_ldt(void *null)
8879 +static void flush_ldt(void *current_mm)
8881 - if (current->active_mm)
8882 + if (current->active_mm == current_mm)
8883 load_LDT(&current->active_mm->context);
8886 @@ -62,8 +62,6 @@ static int alloc_ldt(mm_context_t *pc, i
8894 make_pages_readonly(newldt,
8895 @@ -71,9 +69,9 @@ static int alloc_ldt(mm_context_t *pc, i
8896 XENFEAT_writable_descriptor_tables);
8899 - mask = cpumask_of_cpu(smp_processor_id());
8900 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
8901 - smp_call_function(flush_ldt, NULL, 1, 1);
8902 + if (!cpus_equal(current->mm->cpu_vm_mask,
8903 + cpumask_of_cpu(smp_processor_id())))
8904 + smp_call_function(flush_ldt, current->mm, 1);
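Two changes meet in this alloc_ldt() hunk: smp_call_function() loses its nonatomic parameter in 2.6.27, and the target mm is now threaded through to flush_ldt() so a CPU reloads its LDT only when it is actually running that mm. The predicate in isolation (everything here is a toy model, not kernel code):

#include <stdio.h>

struct mm_struct { int id; };

static struct mm_struct *active_mm;	/* per-CPU in the kernel */

/* the IPI callback: reload only when this CPU runs the target mm */
static void flush_ldt(void *current_mm)
{
	if (active_mm == current_mm)
		printf("CPU reloads LDT for mm %d\n", active_mm->id);
	else
		printf("CPU skips: not running the target mm\n");
}

int main(void)
{
	struct mm_struct a = { 1 }, b = { 2 };

	active_mm = &a;
	flush_ldt(&a);	/* reloads */
	flush_ldt(&b);	/* skipped */
	return 0;
}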
8908 --- sle11-2009-06-04.orig/arch/x86/kernel/machine_kexec_32.c 2008-11-25 12:35:53.000000000 +0100
8909 +++ sle11-2009-06-04/arch/x86/kernel/machine_kexec_32.c 2009-06-04 10:21:39.000000000 +0200
8910 @@ -68,6 +68,8 @@ void machine_kexec_setup_load_arg(xen_ke
8911 xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
8912 xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
8914 + if (image->type == KEXEC_TYPE_DEFAULT)
8915 + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
8918 int __init machine_kexec_setup_resources(struct resource *hypervisor,
8919 --- sle11-2009-06-04.orig/arch/x86/kernel/microcode-xen.c 2009-03-16 16:38:05.000000000 +0100
8920 +++ sle11-2009-06-04/arch/x86/kernel/microcode-xen.c 2009-06-04 10:21:39.000000000 +0200
8922 * 2006 Shaohua Li <shaohua.li@intel.com>
8924 * This driver allows to upgrade microcode on Intel processors
8925 - * belonging to IA-32 family - PentiumPro, Pentium II,
8926 + * belonging to IA-32 family - PentiumPro, Pentium II,
8927 * Pentium III, Xeon, Pentium 4, etc.
8929 - * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
8930 - * Order Number 245472 or free download from:
8932 - * http://developer.intel.com/design/pentium4/manuals/245472.htm
8933 + * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
8934 + * Software Developer's Manual
8935 + * Order Number 253668 or free download from:
8937 + * http://developer.intel.com/design/pentium4/manuals/253668.htm
8939 * For more information, go to http://www.urbanmyth.org/microcode
8942 #include <linux/kernel.h>
8943 #include <linux/init.h>
8944 #include <linux/sched.h>
8945 +#include <linux/smp_lock.h>
8946 #include <linux/cpumask.h>
8947 #include <linux/module.h>
8948 #include <linux/slab.h>
8949 @@ -86,6 +88,7 @@ static int do_microcode_update (const vo
8951 static int microcode_open (struct inode *unused1, struct file *unused2)
8953 + cycle_kernel_lock();
8954 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
8957 @@ -162,7 +165,7 @@ static int request_microcode(void)
8958 c->x86, c->x86_model, c->x86_mask);
8959 error = request_firmware(&firmware, name, &microcode_pdev->dev);
8961 - pr_debug("microcode: ucode data file %s load failed\n", name);
8962 + pr_debug("microcode: data file %s load failed\n", name);
8966 @@ -183,6 +186,9 @@ static int __init microcode_init (void)
8971 + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8973 error = microcode_dev_init();
8976 @@ -195,8 +201,6 @@ static int __init microcode_init (void)
8978 request_microcode();
8981 - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
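On the cycle_kernel_lock() addition: this comes from the BKL pushdown, where the VFS stopped taking the big kernel lock around open(). If memory serves, the helper just takes and immediately releases the BKL so an open() serializes against anything still running under it; a userspace analogue, with a mutex standing in for the BKL (names and structure are mine, not the kernel's):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

/* take and drop the global lock: by the time we return, nobody who
 * held it when we started is still inside their critical section */
static void cycle_big_lock(void)
{
	pthread_mutex_lock(&big_lock);
	pthread_mutex_unlock(&big_lock);
}

static int microcode_open(int has_rawio_cap)
{
	cycle_big_lock();
	return has_rawio_cap ? 0 : -1 /* -EPERM */;
}

int main(void)
{
	printf("open (privileged): %d\n", microcode_open(1));
	printf("open (unprivileged): %d\n", microcode_open(0));
	return 0;
}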
8985 --- sle11-2009-06-04.orig/arch/x86/kernel/mpparse-xen.c 2009-03-16 16:38:05.000000000 +0100
8986 +++ sle11-2009-06-04/arch/x86/kernel/mpparse-xen.c 2009-06-04 10:21:39.000000000 +0200
8988 #include <asm/proto.h>
8989 #include <asm/acpi.h>
8990 #include <asm/bios_ebda.h>
8991 +#include <asm/e820.h>
8992 +#include <asm/trampoline.h>
8993 +#include <asm/setup.h>
8995 #include <mach_apic.h>
8996 #ifdef CONFIG_X86_32
8998 #include <mach_mpparse.h>
9001 -/* Have we found an MP table */
9002 -int smp_found_config;
9005 - * Various Linux-internal data structures created from the
9008 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9009 -int mp_bus_id_to_type[MAX_MP_BUSSES];
9012 -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
9013 -int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
9015 -static int mp_current_pci_id;
9020 - * Intel MP BIOS table parsing routines:
9022 +static void *_bus_to_virt(unsigned long ma)
9024 + return is_ISA_range(ma, ma) ? isa_bus_to_virt(ma) : bus_to_virt(ma);
9028 * Checksum an MP configuration block.
9029 @@ -68,19 +54,7 @@ static int __init mpf_checksum(unsigned
9033 -#ifdef CONFIG_X86_NUMAQ
9035 - * Have to match translation table entries to main table entries by counter
9036 - * hence the mpc_record variable .... can't see a less disgusting way of
9040 -static int mpc_record;
9041 -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
9045 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
9046 +static void __init MP_processor_info(struct mpc_config_processor *m)
9050 @@ -90,11 +64,12 @@ static void __cpuinit MP_processor_info(
9054 -#ifdef CONFIG_X86_NUMAQ
9055 - apicid = mpc_apic_id(m, translation_table[mpc_record]);
9057 - apicid = m->mpc_apicid;
9060 + if (x86_quirks->mpc_apic_id)
9061 + apicid = x86_quirks->mpc_apic_id(m);
9063 + apicid = m->mpc_apicid;
9065 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
9066 bootup_cpu = " (Bootup-CPU)";
9067 boot_cpu_physical_apicid = m->mpc_apicid;
9068 @@ -107,18 +82,17 @@ static void __cpuinit MP_processor_info(
9072 +#ifdef CONFIG_X86_IO_APIC
9073 static void __init MP_bus_info(struct mpc_config_bus *m)
9077 memcpy(str, m->mpc_bustype, 6);
9080 -#ifdef CONFIG_X86_NUMAQ
9081 - mpc_oem_bus_info(m, str, translation_table[mpc_record]);
9083 - Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
9085 + if (x86_quirks->mpc_oem_bus_info)
9086 + x86_quirks->mpc_oem_bus_info(m, str);
9088 + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
9090 #if MAX_MP_BUSSES < 256
9091 if (m->mpc_busid >= MAX_MP_BUSSES) {
9092 @@ -135,12 +109,10 @@ static void __init MP_bus_info(struct mp
9093 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
9095 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
9096 -#ifdef CONFIG_X86_NUMAQ
9097 - mpc_oem_pci_bus(m, translation_table[mpc_record]);
9099 + if (x86_quirks->mpc_oem_pci_bus)
9100 + x86_quirks->mpc_oem_pci_bus(m);
9102 clear_bit(m->mpc_busid, mp_bus_not_pci);
9103 - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
9104 - mp_current_pci_id++;
9105 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
9106 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
9107 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
9108 @@ -151,6 +123,7 @@ static void __init MP_bus_info(struct mp
9110 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
9114 #ifdef CONFIG_X86_IO_APIC
9116 @@ -180,117 +153,111 @@ static void __init MP_ioapic_info(struct
9117 if (bad_ioapic(m->mpc_apicaddr))
9120 - mp_ioapics[nr_ioapics] = *m;
9121 + mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
9122 + mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
9123 + mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
9124 + mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
9125 + mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
9129 -static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9130 +static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
9132 - mp_irqs[mp_irq_entries] = *m;
9133 - Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
9134 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9135 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9136 m->mpc_irqtype, m->mpc_irqflag & 3,
9137 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
9138 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
9139 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
9140 - panic("Max # of irq sources exceeded!!\n");
9145 -static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9146 +static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
9148 - Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
9149 - " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9150 - m->mpc_irqtype, m->mpc_irqflag & 3,
9151 - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9152 - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9153 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9154 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9155 + mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
9156 + (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
9157 + mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
9160 -#ifdef CONFIG_X86_NUMAQ
9161 -static void __init MP_translation_info(struct mpc_config_translation *m)
9162 +static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
9163 + struct mp_config_intsrc *mp_irq)
9166 - "Translation: record %d, type %d, quad %d, global %d, local %d\n",
9167 - mpc_record, m->trans_type, m->trans_quad, m->trans_global,
9169 + mp_irq->mp_dstapic = m->mpc_dstapic;
9170 + mp_irq->mp_type = m->mpc_type;
9171 + mp_irq->mp_irqtype = m->mpc_irqtype;
9172 + mp_irq->mp_irqflag = m->mpc_irqflag;
9173 + mp_irq->mp_srcbus = m->mpc_srcbus;
9174 + mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
9175 + mp_irq->mp_dstirq = m->mpc_dstirq;
9178 - if (mpc_record >= MAX_MPC_ENTRY)
9179 - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
9181 - translation_table[mpc_record] = m; /* stash this for later */
9182 - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
9183 - node_set_online(m->trans_quad);
9184 +static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
9185 + struct mpc_config_intsrc *m)
9187 + m->mpc_dstapic = mp_irq->mp_dstapic;
9188 + m->mpc_type = mp_irq->mp_type;
9189 + m->mpc_irqtype = mp_irq->mp_irqtype;
9190 + m->mpc_irqflag = mp_irq->mp_irqflag;
9191 + m->mpc_srcbus = mp_irq->mp_srcbus;
9192 + m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
9193 + m->mpc_dstirq = mp_irq->mp_dstirq;
9197 - * Read/parse the MPC oem tables
9199 +static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
9200 + struct mpc_config_intsrc *m)
9202 + if (mp_irq->mp_dstapic != m->mpc_dstapic)
9204 + if (mp_irq->mp_type != m->mpc_type)
9206 + if (mp_irq->mp_irqtype != m->mpc_irqtype)
9208 + if (mp_irq->mp_irqflag != m->mpc_irqflag)
9210 + if (mp_irq->mp_srcbus != m->mpc_srcbus)
9212 + if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
9214 + if (mp_irq->mp_dstirq != m->mpc_dstirq)
9217 -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
9218 - unsigned short oemsize)
9222 +static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9224 - int count = sizeof(*oemtable); /* the header size */
9225 - unsigned char *oemptr = ((unsigned char *)oemtable) + count;
9229 - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
9231 - if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
9232 - printk(KERN_WARNING
9233 - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
9234 - oemtable->oem_signature[0], oemtable->oem_signature[1],
9235 - oemtable->oem_signature[2], oemtable->oem_signature[3]);
9238 - if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
9239 - printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
9242 - while (count < oemtable->oem_length) {
9243 - switch (*oemptr) {
9244 - case MP_TRANSLATION:
9246 - struct mpc_config_translation *m =
9247 - (struct mpc_config_translation *)oemptr;
9248 - MP_translation_info(m);
9249 - oemptr += sizeof(*m);
9250 - count += sizeof(*m);
9256 - printk(KERN_WARNING
9257 - "Unrecognised OEM table entry type! - %d\n",
9262 + print_MP_intsrc_info(m);
9264 + for (i = 0; i < mp_irq_entries; i++) {
9265 + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
9269 + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
9270 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
9271 + panic("Max # of irq sources exceeded!!\n");
9274 -static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
9278 +static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9280 - if (strncmp(oem, "IBM NUMA", 8))
9281 - printk("Warning! May not be a NUMA-Q system!\n");
9282 - if (mpc->mpc_oemptr)
9283 - smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
9284 - mpc->mpc_oemsize);
9285 + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
9286 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9287 + m->mpc_irqtype, m->mpc_irqflag & 3,
9288 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9289 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9291 -#endif /* CONFIG_X86_NUMAQ */
9294 * Read/parse the MPC
9297 -static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9298 +static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
9303 - int count = sizeof(*mpc);
9304 - unsigned char *mpt = ((unsigned char *)mpc) + count;
9306 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
9307 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
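MP_intsrc_info() now keeps its own mp_config_intsrc copy of every entry (hence the assign_to_mp_irq()/assign_to_mpc_intsrc() converters) and silently drops duplicates via the field-by-field mp_irq_mpc_intsrc_cmp(). The dedup step condensed into standalone C; memcmp() suffices here only because the toy struct is uniform, while the kernel has to compare field by field across two different layouts:

#include <stdio.h>
#include <string.h>

struct intsrc {
	int type, irqtype, irqflag, srcbus, srcbusirq, dstapic, dstirq;
};

/* returns 0 when the entries match, mirroring mp_irq_mpc_intsrc_cmp() */
static int intsrc_cmp(const struct intsrc *a, const struct intsrc *b)
{
	return memcmp(a, b, sizeof(*a)) != 0;
}

int main(void)
{
	struct intsrc table[16] = { { .irqtype = 0 /* mp_INT */, .srcbusirq = 9 } };
	int entries = 1;
	struct intsrc m = { .irqtype = 0, .srcbusirq = 9 };
	int i;

	for (i = 0; i < entries; i++)
		if (!intsrc_cmp(&table[i], &m)) {
			printf("duplicate, dropped\n");
			return 0;
		}
	table[entries++] = m;
	printf("appended, entries now %d\n", entries);
	return 0;
}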
9308 @@ -313,19 +280,41 @@ static int __init smp_read_mpc(struct mp
9310 memcpy(oem, mpc->mpc_oem, 8);
9312 - printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
9313 + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
9315 memcpy(str, mpc->mpc_productid, 12);
9317 - printk("Product ID: %s ", str);
9319 -#ifdef CONFIG_X86_32
9320 - mps_oem_check(mpc, oem, str);
9322 - printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
9323 + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
9325 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
9330 +static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9335 + int count = sizeof(*mpc);
9336 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9338 + if (!smp_check_mpc(mpc, oem, str))
9341 +#ifdef CONFIG_X86_32
9343 + * need to make sure summit and es7000's mps_oem_check is safe to be
9344 + * called early via genericarch's mps_oem_check
9347 +#ifdef CONFIG_X86_NUMAQ
9348 + numaq_mps_oem_check(mpc, oem, str);
9351 + mps_oem_check(mpc, oem, str);
9353 /* save the local APIC address, it might be non-default */
9355 mp_lapic_addr = mpc->mpc_lapic;
9356 @@ -333,12 +322,17 @@ static int __init smp_read_mpc(struct mp
9360 + if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
9361 + struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
9362 + x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
9366 * Now process the configuration blocks.
9368 -#ifdef CONFIG_X86_NUMAQ
9371 + if (x86_quirks->mpc_record)
9372 + *x86_quirks->mpc_record = 0;
9374 while (count < mpc->mpc_length) {
9377 @@ -356,7 +350,9 @@ static int __init smp_read_mpc(struct mp
9379 struct mpc_config_bus *m =
9380 (struct mpc_config_bus *)mpt;
9381 +#ifdef CONFIG_X86_IO_APIC
9385 count += sizeof(*m);
9387 @@ -402,10 +398,14 @@ static int __init smp_read_mpc(struct mp
9388 count = mpc->mpc_length;
9391 -#ifdef CONFIG_X86_NUMAQ
9394 + if (x86_quirks->mpc_record)
9395 + (*x86_quirks->mpc_record)++;
9398 +#ifdef CONFIG_X86_GENERICARCH
9399 + generic_bigsmp_probe();
9402 setup_apic_routing();
9403 if (!num_processors)
9404 printk(KERN_ERR "MPTABLE: no processors registered!\n");
9405 @@ -431,7 +431,7 @@ static void __init construct_default_ioi
9406 intsrc.mpc_type = MP_INTSRC;
9407 intsrc.mpc_irqflag = 0; /* conforming */
9408 intsrc.mpc_srcbus = 0;
9409 - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
9410 + intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
9412 intsrc.mpc_irqtype = mp_INT;
9414 @@ -492,40 +492,11 @@ static void __init construct_default_ioi
9415 MP_intsrc_info(&intsrc);
9420 -static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9421 +static void __init construct_ioapic_table(int mpc_default_type)
9423 - struct mpc_config_processor processor;
9424 - struct mpc_config_bus bus;
9425 -#ifdef CONFIG_X86_IO_APIC
9426 struct mpc_config_ioapic ioapic;
9428 - struct mpc_config_lintsrc lintsrc;
9429 - int linttypes[2] = { mp_ExtINT, mp_NMI };
9433 - * local APIC has default address
9435 - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9438 - * 2 CPUs, numbered 0 & 1.
9440 - processor.mpc_type = MP_PROCESSOR;
9441 - /* Either an integrated APIC or a discrete 82489DX. */
9442 - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9443 - processor.mpc_cpuflag = CPU_ENABLED;
9444 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9445 - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9446 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9447 - processor.mpc_reserved[0] = 0;
9448 - processor.mpc_reserved[1] = 0;
9449 - for (i = 0; i < 2; i++) {
9450 - processor.mpc_apicid = i;
9451 - MP_processor_info(&processor);
9453 + struct mpc_config_bus bus;
9455 bus.mpc_type = MP_BUS;
9457 @@ -554,7 +525,6 @@ static inline void __init construct_defa
9461 -#ifdef CONFIG_X86_IO_APIC
9462 ioapic.mpc_type = MP_IOAPIC;
9463 ioapic.mpc_apicid = 2;
9464 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9465 @@ -566,7 +536,42 @@ static inline void __init construct_defa
9466 * We set up most of the low 16 IO-APIC pins according to MPS rules.
9468 construct_default_ioirq_mptable(mpc_default_type);
9471 +static inline void __init construct_ioapic_table(int mpc_default_type) { }
9474 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9476 + struct mpc_config_processor processor;
9477 + struct mpc_config_lintsrc lintsrc;
9478 + int linttypes[2] = { mp_ExtINT, mp_NMI };
9482 + * local APIC has default address
9484 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9487 + * 2 CPUs, numbered 0 & 1.
9489 + processor.mpc_type = MP_PROCESSOR;
9490 + /* Either an integrated APIC or a discrete 82489DX. */
9491 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9492 + processor.mpc_cpuflag = CPU_ENABLED;
9493 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9494 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9495 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9496 + processor.mpc_reserved[0] = 0;
9497 + processor.mpc_reserved[1] = 0;
9498 + for (i = 0; i < 2; i++) {
9499 + processor.mpc_apicid = i;
9500 + MP_processor_info(&processor);
9503 + construct_ioapic_table(mpc_default_type);
9505 lintsrc.mpc_type = MP_LINTSRC;
9506 lintsrc.mpc_irqflag = 0; /* conforming */
9507 lintsrc.mpc_srcbusid = 0;
9508 @@ -584,10 +589,14 @@ static struct intel_mp_floating *mpf_fou
9510 * Scan the memory blocks for an SMP configuration block.
9512 -static void __init __get_smp_config(unsigned early)
9513 +static void __init __get_smp_config(unsigned int early)
9515 struct intel_mp_floating *mpf = mpf_found;
9517 + if (x86_quirks->mach_get_smp_config) {
9518 + if (x86_quirks->mach_get_smp_config(early))
9521 if (acpi_lapic && early)
9524 @@ -604,7 +613,7 @@ static void __init __get_smp_config(unsi
9526 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
9527 mpf->mpf_specification);
9528 -#ifdef CONFIG_X86_32
9529 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
9530 if (mpf->mpf_feature2 & (1 << 7)) {
9531 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
9533 @@ -635,8 +644,10 @@ static void __init __get_smp_config(unsi
9534 * Read the physical hardware table. Anything here will
9535 * override the defaults.
9537 - if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
9538 + if (!smp_read_mpc(_bus_to_virt(mpf->mpf_physptr), early)) {
9539 +#ifdef CONFIG_X86_LOCAL_APIC
9540 smp_found_config = 0;
9543 "BIOS bug, MP table errors detected!...\n");
9544 printk(KERN_ERR "... disabling SMP support. "
9545 @@ -690,10 +701,11 @@ void __init get_smp_config(void)
9546 static int __init smp_scan_config(unsigned long base, unsigned long length,
9549 - unsigned int *bp = isa_bus_to_virt(base);
9550 + unsigned int *bp = _bus_to_virt(base);
9551 struct intel_mp_floating *mpf;
9553 - Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
9554 + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
9556 BUILD_BUG_ON(sizeof(*mpf) != 16);
9558 while (length > 0) {
9559 @@ -703,16 +715,22 @@ static int __init smp_scan_config(unsign
9560 !mpf_checksum((unsigned char *)bp, 16) &&
9561 ((mpf->mpf_specification == 1)
9562 || (mpf->mpf_specification == 4))) {
9564 +#ifdef CONFIG_X86_LOCAL_APIC
9565 smp_found_config = 1;
9568 -#ifdef CONFIG_X86_32
9571 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9572 mpf, virt_to_phys(mpf));
9573 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
9577 + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
9579 if (mpf->mpf_physptr) {
9580 + unsigned long size = PAGE_SIZE;
9581 +#ifdef CONFIG_X86_32
9583 * We cannot access to MPC table to compute
9584 * table size yet, as only few megabytes from
9585 @@ -722,27 +740,18 @@ static int __init smp_scan_config(unsign
9586 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
9587 * in reserve_bootmem.
9589 - unsigned long size = PAGE_SIZE;
9590 unsigned long end = max_low_pfn * PAGE_SIZE;
9591 if (mpf->mpf_physptr + size > end)
9592 size = end - mpf->mpf_physptr;
9593 - reserve_bootmem(mpf->mpf_physptr, size,
9595 + reserve_bootmem_generic(mpf->mpf_physptr, size,
9599 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9600 - mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
9602 -#elif !defined(CONFIG_XEN)
9606 - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
9607 - if (mpf->mpf_physptr)
9608 - reserve_bootmem_generic(mpf->mpf_physptr,
9610 + mpf, ((void *)bp - _bus_to_virt(base)) + base);
9617 @@ -750,12 +759,16 @@ static int __init smp_scan_config(unsign
9621 -static void __init __find_smp_config(unsigned reserve)
9622 +static void __init __find_smp_config(unsigned int reserve)
9625 unsigned int address;
9628 + if (x86_quirks->mach_find_smp_config) {
9629 + if (x86_quirks->mach_find_smp_config(reserve))
9633 * FIXME: Linux assumes you have 640K of base ram..
9634 * this continues the error...
9635 @@ -802,300 +815,297 @@ void __init find_smp_config(void)
9636 __find_smp_config(1);
9639 -/* --------------------------------------------------------------------------
9640 - ACPI-based MP Configuration
9641 - -------------------------------------------------------------------------- */
9644 - * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
9649 +#ifdef CONFIG_X86_IO_APIC
9650 +static u8 __initdata irq_used[MAX_IRQ_SOURCES];
9652 -#ifdef CONFIG_X86_IO_APIC
9653 +static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
9657 -#define MP_ISA_BUS 0
9658 + if (m->mpc_irqtype != mp_INT)
9661 -extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
9662 + if (m->mpc_irqflag != 0x0f)
9665 -static int mp_find_ioapic(int gsi)
9670 - /* Find the IOAPIC that manages this GSI. */
9671 - for (i = 0; i < nr_ioapics; i++) {
9672 - if ((gsi >= mp_ioapic_routing[i].gsi_base)
9673 - && (gsi <= mp_ioapic_routing[i].gsi_end))
9675 + for (i = 0; i < mp_irq_entries; i++) {
9676 + if (mp_irqs[i].mp_irqtype != mp_INT)
9679 + if (mp_irqs[i].mp_irqflag != 0x0f)
9682 + if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
9684 + if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
9686 + if (irq_used[i]) {
9687 + /* already claimed */
9694 - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
9699 -static u8 __init uniq_ioapic_id(u8 id)
9701 -#ifdef CONFIG_X86_32
9702 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
9703 - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
9704 - return io_apic_get_unique_id(nr_ioapics, id);
9709 - DECLARE_BITMAP(used, 256);
9710 - bitmap_zero(used, 256);
9711 - for (i = 0; i < nr_ioapics; i++) {
9712 - struct mpc_config_ioapic *ia = &mp_ioapics[i];
9713 - __set_bit(ia->mpc_apicid, used);
9715 - if (!test_bit(id, used))
9717 - return find_first_zero_bit(used, 256);
9718 +#define SPARE_SLOT_NUM 20
9720 +static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
9724 -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
9725 +static int __init replace_intsrc_all(struct mp_config_table *mpc,
9726 + unsigned long mpc_new_phys,
9727 + unsigned long mpc_new_length)
9731 - if (bad_ioapic(address))
9733 +#ifdef CONFIG_X86_IO_APIC
9735 + int nr_m_spare = 0;
9739 + int count = sizeof(*mpc);
9740 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9742 - mp_ioapics[idx].mpc_type = MP_IOAPIC;
9743 - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
9744 - mp_ioapics[idx].mpc_apicaddr = address;
9745 + printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
9746 + while (count < mpc->mpc_length) {
9748 + case MP_PROCESSOR:
9750 + struct mpc_config_processor *m =
9751 + (struct mpc_config_processor *)mpt;
9752 + mpt += sizeof(*m);
9753 + count += sizeof(*m);
9758 + struct mpc_config_bus *m =
9759 + (struct mpc_config_bus *)mpt;
9760 + mpt += sizeof(*m);
9761 + count += sizeof(*m);
9766 + mpt += sizeof(struct mpc_config_ioapic);
9767 + count += sizeof(struct mpc_config_ioapic);
9772 +#ifdef CONFIG_X86_IO_APIC
9773 + struct mpc_config_intsrc *m =
9774 + (struct mpc_config_intsrc *)mpt;
9777 - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
9778 + apic_printk(APIC_VERBOSE, "OLD ");
9779 + print_MP_intsrc_info(m);
9780 + i = get_MP_intsrc_index(m);
9782 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9783 + apic_printk(APIC_VERBOSE, "NEW ");
9784 + print_mp_irq_info(&mp_irqs[i]);
9786 + /* legacy, do nothing */
9787 + } else if (nr_m_spare < SPARE_SLOT_NUM) {
9789 + * not found (-1), or duplicated (-2)
9790 + * are invalid entries,
9791 + * we need to use the slot later
9793 + m_spare[nr_m_spare] = m;
9797 - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
9798 -#ifdef CONFIG_X86_32
9799 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
9801 - mp_ioapics[idx].mpc_apicver = 0;
9802 + mpt += sizeof(struct mpc_config_intsrc);
9803 + count += sizeof(struct mpc_config_intsrc);
9808 + struct mpc_config_lintsrc *m =
9809 + (struct mpc_config_lintsrc *)mpt;
9810 + mpt += sizeof(*m);
9811 + count += sizeof(*m);
9815 + /* wrong mptable */
9816 + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
9817 + printk(KERN_ERR "type %x\n", *mpt);
9818 + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
9819 + 1, mpc, mpc->mpc_length, 1);
9824 +#ifdef CONFIG_X86_IO_APIC
9825 + for (i = 0; i < mp_irq_entries; i++) {
9829 + if (mp_irqs[i].mp_irqtype != mp_INT)
9832 + if (mp_irqs[i].mp_irqflag != 0x0f)
9835 + if (nr_m_spare > 0) {
9836 + apic_printk(APIC_VERBOSE, "*NEW* found\n");
9838 + assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
9839 + m_spare[nr_m_spare] = NULL;
9841 + struct mpc_config_intsrc *m =
9842 + (struct mpc_config_intsrc *)mpt;
9843 + count += sizeof(struct mpc_config_intsrc);
9844 + if (!mpc_new_phys) {
9845 + printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
9847 + if (count <= mpc_new_length)
9848 + printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
9850 + printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
9854 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9855 + mpc->mpc_length = count;
9856 + mpt += sizeof(struct mpc_config_intsrc);
9858 + print_mp_irq_info(&mp_irqs[i]);
9862 - * Build basic GSI lookup table to facilitate gsi->io_apic lookups
9863 - * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
9865 - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
9866 - mp_ioapic_routing[idx].gsi_base = gsi_base;
9867 - mp_ioapic_routing[idx].gsi_end = gsi_base +
9868 - io_apic_get_redir_entries(idx);
9870 - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
9871 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
9872 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
9873 - mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
9875 + /* update checksum */
9876 + mpc->mpc_checksum = 0;
9877 + mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
9884 -void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
9886 - struct mpc_config_intsrc intsrc;
9891 - * Convert 'gsi' to 'ioapic.pin'.
9893 - ioapic = mp_find_ioapic(gsi);
9896 - pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9897 +static int __initdata enable_update_mptable;
9900 - * TBD: This check is for faulty timer entries, where the override
9901 - * erroneously sets the trigger to level, resulting in a HUGE
9902 - * increase of timer interrupts!
9904 - if ((bus_irq == 0) && (trigger == 3))
9906 +static int __init update_mptable_setup(char *str)
9908 + enable_update_mptable = 1;
9911 +early_param("update_mptable", update_mptable_setup);
9913 - intsrc.mpc_type = MP_INTSRC;
9914 - intsrc.mpc_irqtype = mp_INT;
9915 - intsrc.mpc_irqflag = (trigger << 2) | polarity;
9916 - intsrc.mpc_srcbus = MP_ISA_BUS;
9917 - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
9918 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
9919 - intsrc.mpc_dstirq = pin; /* INTIN# */
9920 +static unsigned long __initdata mpc_new_phys;
9921 +static unsigned long mpc_new_length __initdata = 4096;
9923 - MP_intsrc_info(&intsrc);
9924 +/* alloc_mptable or alloc_mptable=4k */
9925 +static int __initdata alloc_mptable;
9926 +static int __init parse_alloc_mptable_opt(char *p)
9928 + enable_update_mptable = 1;
9929 + alloc_mptable = 1;
9932 + mpc_new_length = memparse(p, &p);
9935 +early_param("alloc_mptable", parse_alloc_mptable_opt);
9937 -void __init mp_config_acpi_legacy_irqs(void)
9938 +void __init early_reserve_e820_mpc_new(void)
9940 - struct mpc_config_intsrc intsrc;
9944 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9946 - * Fabricate the legacy ISA bus (bus #31).
9948 - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
9949 + if (enable_update_mptable && alloc_mptable) {
9951 +#ifdef CONFIG_X86_TRAMPOLINE
9952 + startt = TRAMPOLINE_BASE;
9954 - set_bit(MP_ISA_BUS, mp_bus_not_pci);
9955 - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
9958 - * Older generations of ES7000 have no legacy identity mappings
9960 - if (es7000_plat == 1)
9964 - * Locate the IOAPIC that manages the ISA IRQs (0-15).
9966 - ioapic = mp_find_ioapic(0);
9970 - intsrc.mpc_type = MP_INTSRC;
9971 - intsrc.mpc_irqflag = 0; /* Conforming */
9972 - intsrc.mpc_srcbus = MP_ISA_BUS;
9973 -#ifdef CONFIG_X86_IO_APIC
9974 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
9977 - * Use the default configuration for the IRQs 0-15. Unless
9978 - * overridden by (MADT) interrupt source override entries.
9980 - for (i = 0; i < 16; i++) {
9983 - for (idx = 0; idx < mp_irq_entries; idx++) {
9984 - struct mpc_config_intsrc *irq = mp_irqs + idx;
9986 - /* Do we already have a mapping for this ISA IRQ? */
9987 - if (irq->mpc_srcbus == MP_ISA_BUS
9988 - && irq->mpc_srcbusirq == i)
9991 - /* Do we already have a mapping for this IOAPIC pin */
9992 - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
9993 - (irq->mpc_dstirq == i))
9997 - if (idx != mp_irq_entries) {
9998 - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
9999 - continue; /* IRQ already used */
10002 - intsrc.mpc_irqtype = mp_INT;
10003 - intsrc.mpc_srcbusirq = i; /* Identity mapped */
10004 - intsrc.mpc_dstirq = i;
10006 - MP_intsrc_info(&intsrc);
10007 + mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
10011 -int mp_register_gsi(u32 gsi, int triggering, int polarity)
10012 +static int __init update_mp_table(void)
10016 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10017 -#define MAX_GSI_NUM 4096
10018 -#define IRQ_COMPRESSION_START 64
10021 + struct intel_mp_floating *mpf;
10022 + struct mp_config_table *mpc;
10023 + struct mp_config_table *mpc_new;
10025 + if (!enable_update_mptable)
10032 - static int pci_irq = IRQ_COMPRESSION_START;
10034 - * Mapping between Global System Interrupts, which
10035 - * represent all possible interrupts, and IRQs
10036 - * assigned to actual devices.
10037 + * Now see if we need to go further.
10039 - static int gsi_to_irq[MAX_GSI_NUM];
10041 + if (mpf->mpf_feature1 != 0)
10044 - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
10047 + if (!mpf->mpf_physptr)
10050 - /* Don't set up the ACPI SCI because it's already set up */
10051 - if (acpi_gbl_FADT.sci_interrupt == gsi)
10053 + mpc = _bus_to_virt(mpf->mpf_physptr);
10055 - ioapic = mp_find_ioapic(gsi);
10056 - if (ioapic < 0) {
10057 - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
10060 + if (!smp_check_mpc(mpc, oem, str))
10063 - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
10064 + printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf));
10065 + printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
10067 -#ifndef CONFIG_X86_32
10068 - if (ioapic_renumber_irq)
10069 - gsi = ioapic_renumber_irq(ioapic, gsi);
10071 + if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
10072 + mpc_new_phys = 0;
10073 + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
10077 + if (!mpc_new_phys) {
10078 + unsigned char old, new;
10079 + /* check if we can change the position */
10080 + mpc->mpc_checksum = 0;
10081 + old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10082 + mpc->mpc_checksum = 0xff;
10083 + new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10084 + if (old == new) {
10085 + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
10088 + printk(KERN_INFO "use in-position replacing\n");
10090 + maddr_t mpc_new_bus;
10093 - * Avoid pin reprogramming. PRTs typically include entries
10094 - * with redundant pin->gsi mappings (but unique PCI devices);
10095 - * we only program the IOAPIC on the first.
10097 - if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
10098 - printk(KERN_ERR "Invalid reference to IOAPIC pin "
10099 - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
10103 - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
10104 - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
10105 - mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
10106 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10107 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
10111 + mpc_new_bus = phys_to_machine(mpc_new_phys);
10112 + mpf->mpf_physptr = mpc_new_bus;
10113 + mpc_new = phys_to_virt(mpc_new_phys);
10114 + memcpy(mpc_new, mpc, mpc->mpc_length);
10116 + /* check if we can modify that */
10117 + if (mpc_new_bus - mpf->mpf_physptr) {
10118 + struct intel_mp_floating *mpf_new;
10119 + /* steal 16 bytes from [0, 1k) */
10120 + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
10121 + mpf_new = isa_bus_to_virt(0x400 - 16);
10122 + memcpy(mpf_new, mpf, 16);
10124 + mpf->mpf_physptr = mpc_new_bus;
10126 + mpf->mpf_checksum = 0;
10127 + mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
10128 + printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
10131 - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
10132 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10134 - * For GSI >= 64, use IRQ compression
10135 + * only replace the one with mp_INT and
10136 + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
10137 + * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
10138 + * may need pci=routeirq for all coverage
10140 - if ((gsi >= IRQ_COMPRESSION_START)
10141 - && (triggering == ACPI_LEVEL_SENSITIVE)) {
10143 - * For PCI devices assign IRQs in order, avoiding gaps
10144 - * due to unused I/O APIC pins.
10147 - if (gsi < MAX_GSI_NUM) {
10149 - * Retain the VIA chipset work-around (gsi > 15), but
10150 - * avoid a problem where the 8254 timer (IRQ0) is setup
10151 - * via an override (so it's not on pin 0 of the ioapic),
10152 - * and at the same time, the pin 0 interrupt is a PCI
10153 - * type. The gsi > 15 test could cause these two pins
10154 - * to be shared as IRQ0, and they are not shareable.
10155 - * So test for this condition, and if necessary, avoid
10156 - * the pin collision.
10160 - * Don't assign IRQ used by ACPI SCI
10162 - if (gsi == acpi_gbl_FADT.sci_interrupt)
10164 - gsi_to_irq[irq] = gsi;
10166 - printk(KERN_ERR "GSI %u is too high\n", gsi);
10171 - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
10172 - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
10173 - polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
10175 + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
10180 -#endif /* CONFIG_X86_IO_APIC */
10181 -#endif /* CONFIG_ACPI */
10182 +late_initcall(update_mp_table);
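The read-only probe in update_mp_table() deserves a gloss: it writes two different values into the checksum byte and recomputes the sum each time; identical sums mean the stores never landed, i.e. the table sits in ROM, and in-place replacement is refused. The trick in isolation (helper names match the hunk, the surrounding scaffolding is illustrative):

#include <stdio.h>

static unsigned char mpf_checksum(const unsigned char *p, int len)
{
	int sum = 0;

	while (len--)
		sum += *p++;
	return sum & 0xFF;
}

static int table_is_writable(unsigned char *tbl, int len, int csum_off)
{
	unsigned char old, new;

	tbl[csum_off] = 0;
	old = mpf_checksum(tbl, len);
	tbl[csum_off] = 0xff;
	new = mpf_checksum(tbl, len);
	return old != new;	/* same sum twice: the writes never stuck */
}

int main(void)
{
	unsigned char table[16] = { 'P', 'C', 'M', 'P' };

	printf("writable: %d\n", table_is_writable(table, sizeof(table), 7));
	return 0;
}

Per the early_param() hooks above, booting with update_mptable turns the rewrite on, and alloc_mptable[=size] additionally reserves a fresh e820 region for the case where the original table is not writable.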
10183 --- sle11-2009-06-04.orig/arch/x86/kernel/nmi.c 2009-06-04 11:08:07.000000000 +0200
10184 +++ sle11-2009-06-04/arch/x86/kernel/nmi.c 2009-06-04 10:21:39.000000000 +0200
10186 #include <linux/kdebug.h>
10187 #include <linux/smp.h>
10189 +#ifndef CONFIG_XEN
10190 #include <asm/i8259.h>
10192 #include <asm/io_apic.h>
10193 #include <asm/smp.h>
10194 #include <asm/nmi.h>
10195 @@ -179,8 +181,10 @@ int __init check_nmi_watchdog(void)
10196 kfree(prev_nmi_count);
10199 +#ifndef CONFIG_XEN
10200 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
10201 disable_8259A_irq(0);
10203 #ifdef CONFIG_X86_32
10206 --- sle11-2009-06-04.orig/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:38:05.000000000 +0100
10207 +++ sle11-2009-06-04/arch/x86/kernel/pci-dma-xen.c 2009-06-04 10:21:39.000000000 +0200
10210 #include <asm/proto.h>
10211 #include <asm/dma.h>
10212 -#include <asm/gart.h>
10213 +#include <asm/iommu.h>
10214 #include <asm/calgary.h>
10215 +#include <asm/amd_iommu.h>
10217 -int forbid_dac __read_mostly;
10218 -EXPORT_SYMBOL(forbid_dac);
10219 +static int forbid_dac __read_mostly;
10221 -const struct dma_mapping_ops *dma_ops;
10222 +struct dma_mapping_ops *dma_ops;
10223 EXPORT_SYMBOL(dma_ops);
10225 static int iommu_sac_force __read_mostly;
10226 @@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_si
10227 void __init dma32_reserve_bootmem(void)
10229 unsigned long size, align;
10230 - if (end_pfn <= MAX_DMA32_PFN)
10231 + if (max_pfn <= MAX_DMA32_PFN)
10235 + * check aperture_64.c allocate_aperture() for reason about
10236 + * using 512M as goal
10239 size = round_up(dma32_bootmem_size, align);
10240 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
10241 - __pa(MAX_DMA_ADDRESS));
10243 if (dma32_bootmem_ptr)
10244 dma32_bootmem_size = size;
10246 @@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void)
10248 static void __init dma32_free_bootmem(void)
10252 - if (end_pfn <= MAX_DMA32_PFN)
10253 + if (max_pfn <= MAX_DMA32_PFN)
10256 if (!dma32_bootmem_ptr)
10259 - for_each_online_node(node)
10260 - free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
10261 - dma32_bootmem_size);
10262 + free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
10264 dma32_bootmem_ptr = NULL;
10265 dma32_bootmem_size = 0;
10266 @@ -107,7 +108,7 @@ static void __init dma32_free_bootmem(vo
10267 #define dma32_free_bootmem() ((void)0)
10270 -static const struct dma_mapping_ops swiotlb_dma_ops = {
10271 +static struct dma_mapping_ops swiotlb_dma_ops = {
10272 .mapping_error = swiotlb_dma_mapping_error,
10273 .map_single = swiotlb_map_single_phys,
10274 .unmap_single = swiotlb_unmap_single,
10275 @@ -130,25 +131,31 @@ void __init pci_iommu_alloc(void)
10276 * The order of these functions is important for
10277 * fall-back/fail-over reasons
10279 -#ifdef CONFIG_GART_IOMMU
10280 gart_iommu_hole_init();
10283 -#ifdef CONFIG_CALGARY_IOMMU
10287 detect_intel_iommu();
10289 -#ifdef CONFIG_SWIOTLB
10290 + amd_iommu_detect();
10294 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
10295 dma_ops = &swiotlb_dma_ops;
10300 +#ifndef CONFIG_XEN
10301 +unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
10303 + unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
10305 + return size >> PAGE_SHIFT;
10307 +EXPORT_SYMBOL(iommu_num_pages);
10311 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
10313 @@ -201,9 +208,7 @@ static __init int iommu_setup(char *p)
10317 -#ifdef CONFIG_GART_IOMMU
10318 gart_parse_options(p);
10321 #ifdef CONFIG_CALGARY_IOMMU
10322 if (!strncmp(p, "calgary", 7))
10323 @@ -245,136 +250,19 @@ int range_straddles_page_boundary(paddr_
10324 !check_pages_physically_contiguous(pfn, offset, size));
10327 -#ifdef CONFIG_X86_32
10328 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
10329 - dma_addr_t device_addr, size_t size, int flags)
10331 - void __iomem *mem_base = NULL;
10332 - int pages = size >> PAGE_SHIFT;
10333 - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
10335 - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
10339 - if (dev->dma_mem)
10342 - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
10344 - mem_base = ioremap(bus_addr, size);
10348 - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
10349 - if (!dev->dma_mem)
10351 - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
10352 - if (!dev->dma_mem->bitmap)
10355 - dev->dma_mem->virt_base = mem_base;
10356 - dev->dma_mem->device_base = device_addr;
10357 - dev->dma_mem->size = pages;
10358 - dev->dma_mem->flags = flags;
10360 - if (flags & DMA_MEMORY_MAP)
10361 - return DMA_MEMORY_MAP;
10363 - return DMA_MEMORY_IO;
10366 - kfree(dev->dma_mem);
10369 - iounmap(mem_base);
10372 -EXPORT_SYMBOL(dma_declare_coherent_memory);
10374 -void dma_release_declared_memory(struct device *dev)
10376 - struct dma_coherent_mem *mem = dev->dma_mem;
10380 - dev->dma_mem = NULL;
10381 - iounmap(mem->virt_base);
10382 - kfree(mem->bitmap);
10385 -EXPORT_SYMBOL(dma_release_declared_memory);
10387 -void *dma_mark_declared_memory_occupied(struct device *dev,
10388 - dma_addr_t device_addr, size_t size)
10390 - struct dma_coherent_mem *mem = dev->dma_mem;
10392 - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
10394 - pages >>= PAGE_SHIFT;
10397 - return ERR_PTR(-EINVAL);
10399 - pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
10400 - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
10402 - return ERR_PTR(err);
10403 - return mem->virt_base + (pos << PAGE_SHIFT);
10405 -EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
10407 -static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
10408 - dma_addr_t *dma_handle, void **ret)
10410 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10411 - int order = get_order(size);
10414 - int page = bitmap_find_free_region(mem->bitmap, mem->size,
10417 - *dma_handle = mem->device_base + (page << PAGE_SHIFT);
10418 - *ret = mem->virt_base + (page << PAGE_SHIFT);
10419 - memset(*ret, 0, size);
10421 - if (mem->flags & DMA_MEMORY_EXCLUSIVE)
10424 - return (mem != NULL);
10427 -static int dma_release_coherent(struct device *dev, int order, void *vaddr)
10429 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10431 - if (mem && vaddr >= mem->virt_base && vaddr <
10432 - (mem->virt_base + (mem->size << PAGE_SHIFT))) {
10433 - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
10435 - bitmap_release_region(mem->bitmap, page, order);
10441 -#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
10442 -#define dma_release_coherent(dev, order, vaddr) (0)
10443 -#endif /* CONFIG_X86_32 */
10445 int dma_supported(struct device *dev, u64 mask)
10447 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10450 if (mask > 0xffffffff && forbid_dac > 0) {
10451 - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
10453 + dev_info(dev, "PCI: Disallowing DAC for device\n");
10458 - if (dma_ops->dma_supported)
10459 - return dma_ops->dma_supported(dev, mask);
10460 + if (ops->dma_supported)
10461 + return ops->dma_supported(dev, mask);
10463 /* Copied from i386. Doesn't make much sense, because it will
10464 only work for pci_alloc_coherent.
10465 @@ -395,8 +283,7 @@ int dma_supported(struct device *dev, u6
10466 type. Normally this doesn't make any difference, but gives
10467 more gentle handling of IOMMU overflow. */
10468 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
10469 - printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
10470 - dev->bus_id, mask);
10471 + dev_info(dev, "Force SAC with mask %Lx\n", mask);
10475 @@ -422,6 +309,9 @@ void *
10476 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
10479 +#ifndef CONFIG_XEN
10480 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10482 void *memory = NULL;
10484 unsigned long dma_mask = 0;
10485 @@ -431,7 +321,7 @@ dma_alloc_coherent(struct device *dev, s
10486 /* ignore region specifiers */
10487 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
10489 - if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
10490 + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
10494 @@ -491,8 +381,8 @@ dma_alloc_coherent(struct device *dev, s
10495 /* Let low level make its own zone decisions */
10496 gfp &= ~(GFP_DMA32|GFP_DMA);
10498 - if (dma_ops->alloc_coherent)
10499 - return dma_ops->alloc_coherent(dev, size,
10500 + if (ops->alloc_coherent)
10501 + return ops->alloc_coherent(dev, size,
10505 @@ -504,14 +394,14 @@ dma_alloc_coherent(struct device *dev, s
10509 - if (dma_ops->alloc_coherent) {
10510 + if (ops->alloc_coherent) {
10511 free_pages((unsigned long)memory, order);
10512 gfp &= ~(GFP_DMA|GFP_DMA32);
10513 - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
10514 + return ops->alloc_coherent(dev, size, dma_handle, gfp);
10517 - if (dma_ops->map_simple) {
10518 - *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
10519 + if (ops->map_simple) {
10520 + *dma_handle = ops->map_simple(dev, virt_to_bus(memory),
10522 PCI_DMA_BIDIRECTIONAL);
10523 if (*dma_handle != bad_dma_address)
10524 @@ -542,13 +432,17 @@ EXPORT_SYMBOL(dma_alloc_coherent);
10525 void dma_free_coherent(struct device *dev, size_t size,
10526 void *vaddr, dma_addr_t bus)
10528 +#ifndef CONFIG_XEN
10529 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10532 int order = get_order(size);
10533 WARN_ON(irqs_disabled()); /* for portability */
10534 - if (dma_release_coherent(dev, order, vaddr))
10535 + if (dma_release_from_coherent(dev, order, vaddr))
10538 - if (dma_ops->unmap_single)
10539 - dma_ops->unmap_single(dev, bus, size, 0);
10540 + if (ops->unmap_single)
10541 + ops->unmap_single(dev, bus, size, 0);
10543 xen_destroy_contiguous_region((unsigned long)vaddr, order);
10544 free_pages((unsigned long)vaddr, order);
10545 @@ -557,15 +451,13 @@ EXPORT_SYMBOL(dma_free_coherent);
10547 static int __init pci_iommu_init(void)
10549 -#ifdef CONFIG_CALGARY_IOMMU
10550 calgary_iommu_init();
10553 intel_iommu_init();
10555 -#ifdef CONFIG_GART_IOMMU
10556 + amd_iommu_init();
10563 --- sle11-2009-06-04.orig/arch/x86/kernel/pci-nommu-xen.c 2009-03-16 16:38:05.000000000 +0100
10564 +++ sle11-2009-06-04/arch/x86/kernel/pci-nommu-xen.c 2009-06-04 10:21:39.000000000 +0200
10565 @@ -79,18 +79,12 @@ gnttab_unmap_single(struct device *dev,
10566 gnttab_dma_unmap_page(dma_addr);
10569 -static int nommu_mapping_error(dma_addr_t dma_addr)
10571 - return (dma_addr == bad_dma_address);
10574 -static const struct dma_mapping_ops nommu_dma_ops = {
10575 +static struct dma_mapping_ops nommu_dma_ops = {
10576 .map_single = gnttab_map_single,
10577 .unmap_single = gnttab_unmap_single,
10578 .map_sg = gnttab_map_sg,
10579 .unmap_sg = gnttab_unmap_sg,
10580 .dma_supported = swiotlb_dma_supported,
10581 - .mapping_error = nommu_mapping_error
10584 void __init no_iommu_init(void)
10585 --- sle11-2009-06-04.orig/arch/x86/kernel/probe_roms_32.c 2009-06-04 11:08:07.000000000 +0200
10586 +++ sle11-2009-06-04/arch/x86/kernel/probe_roms_32.c 2009-06-04 10:21:39.000000000 +0200
10587 @@ -99,6 +99,11 @@ void __init probe_roms(void)
10592 + if (!is_initial_xendomain())
10597 upper = adapter_rom_resources[0].start;
10598 for (start = video_rom_resource.start; start < upper; start += 2048) {
10599 @@ -131,7 +136,7 @@ void __init probe_roms(void)
10600 upper = system_rom_resource.start;
10602 /* check for extension rom (ignore length byte!) */
10603 - rom = isa_bus_to_virt(extension_rom_resource.start);
10604 + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
10605 if (romsignature(rom)) {
10606 length = extension_rom_resource.end - extension_rom_resource.start + 1;
10607 if (romchecksum(rom, length)) {
10608 --- sle11-2009-06-04.orig/arch/x86/kernel/process-xen.c 2009-03-16 16:38:05.000000000 +0100
10609 +++ sle11-2009-06-04/arch/x86/kernel/process-xen.c 2009-06-04 10:21:39.000000000 +0200
10611 #include <linux/sched.h>
10612 #include <linux/module.h>
10613 #include <linux/pm.h>
10614 +#include <linux/clockchips.h>
10615 +#include <asm/system.h>
10617 +unsigned long idle_halt;
10618 +EXPORT_SYMBOL(idle_halt);
10619 +unsigned long idle_nomwait;
10620 +EXPORT_SYMBOL(idle_nomwait);
10622 struct kmem_cache *task_xstate_cachep;
10624 @@ -45,6 +52,70 @@ void arch_task_cache_init(void)
10629 + * Idle related variables and functions
10631 +unsigned long boot_option_idle_override = 0;
10632 +EXPORT_SYMBOL(boot_option_idle_override);
10635 + * Power management idle function, if any.
10637 +void (*pm_idle)(void);
10638 +EXPORT_SYMBOL(pm_idle);
10640 +#ifdef CONFIG_X86_32
10642 + * This halt magic was a workaround for ancient floppy DMA
10643 + * wreckage. It should be safe to remove.
10645 +static int hlt_counter;
10646 +void disable_hlt(void)
10650 +EXPORT_SYMBOL(disable_hlt);
10652 +void enable_hlt(void)
10656 +EXPORT_SYMBOL(enable_hlt);
10658 +static inline int hlt_use_halt(void)
10660 + return (!hlt_counter && boot_cpu_data.hlt_works_ok);
10663 +static inline int hlt_use_halt(void)
10670 + * We use this if we don't have any better
10673 +void xen_idle(void)
10675 + current_thread_info()->status &= ~TS_POLLING;
10677 + * TS_POLLING-cleared state must be visible before we
10678 + * test NEED_RESCHED:
10682 + if (!need_resched())
10683 + safe_halt(); /* enables interrupts racelessly */
10685 + local_irq_enable();
10686 + current_thread_info()->status |= TS_POLLING;
10688 +#ifdef CONFIG_APM_MODULE
10689 +EXPORT_SYMBOL(default_idle);
10692 static void do_nothing(void *unused)
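
(The TS_POLLING dance in xen_idle() above relies on a memory barrier that the
hunk elides. The full pattern, as a sketch — the barrier is an smp_mb() in
the mainline idle routines:

    current_thread_info()->status &= ~TS_POLLING;
    /*
     * The clear of TS_POLLING must be globally visible before we
     * sample NEED_RESCHED, or a remote CPU may skip the resched IPI.
     */
    smp_mb();
    if (!need_resched())
            safe_halt();    /* sti; hlt - enables interrupts racelessly */
    else
            local_irq_enable();
    current_thread_info()->status |= TS_POLLING;
)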
10695 @@ -61,7 +132,7 @@ void cpu_idle_wait(void)
10698 /* kick all the CPUs so that they exit out of pm_idle */
10699 - smp_call_function(do_nothing, NULL, 0, 1);
10700 + smp_call_function(do_nothing, NULL, 1);
10702 EXPORT_SYMBOL_GPL(cpu_idle_wait);
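
(The one-line change to cpu_idle_wait() tracks a 2.6.27 API change:
smp_call_function() lost its unused nonatomic/retry argument, leaving, as I
read the generic code,

    int smp_call_function(void (*func)(void *), void *info, int wait);

so old four-argument call sites simply drop the third argument, as done
above.)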
10704 @@ -125,60 +196,175 @@ static void poll_idle(void)
10706 * idle=mwait overrides this decision and forces the usage of mwait.
10708 +static int __cpuinitdata force_mwait;
10710 +#define MWAIT_INFO 0x05
10711 +#define MWAIT_ECX_EXTENDED_INFO 0x01
10712 +#define MWAIT_EDX_C1 0xf0
10714 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
10716 + u32 eax, ebx, ecx, edx;
10721 - if (c->x86_vendor == X86_VENDOR_AMD) {
10726 + if (c->cpuid_level < MWAIT_INFO)
10729 + cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
10730 + /* Check whether EDX has extended info about MWAIT */
10731 + if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
10735 + * edx enumerates MONITOR/MWAIT extensions. Check whether
10736 + * C1 supports MWAIT
10738 + return (edx & MWAIT_EDX_C1);
10742 + * Check for AMD CPUs, which have potentially C1E support
10744 +static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
10746 + if (c->x86_vendor != X86_VENDOR_AMD)
10749 + if (c->x86 < 0x0F)
10752 + /* Family 0x0f models < rev F do not have C1E */
10753 + if (c->x86 == 0x0f && c->x86_model < 0x40)
10759 +static cpumask_t c1e_mask = CPU_MASK_NONE;
10760 +static int c1e_detected;
10762 +void c1e_remove_cpu(int cpu)
10764 + cpu_clear(cpu, c1e_mask);
10768 + * C1E aware idle routine. We check for C1E active in the interrupt
10769 + * pending message MSR. If we detect C1E, then we handle it the same
10770 + * way as C3 power states (local apic timer and TSC stop)
10772 +static void c1e_idle(void)
10774 + if (need_resched())
10777 + if (!c1e_detected) {
10780 + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
10781 + if (lo & K8_INTP_C1E_ACTIVE_MASK) {
10782 + c1e_detected = 1;
10783 + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
10784 + mark_tsc_unstable("TSC halt in AMD C1E");
10785 + printk(KERN_INFO "System has AMD C1E enabled\n");
10786 + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
10791 + if (c1e_detected) {
10792 + int cpu = smp_processor_id();
10794 + if (!cpu_isset(cpu, c1e_mask)) {
10795 + cpu_set(cpu, c1e_mask);
10797 + * Force broadcast so ACPI cannot interfere. Needs
10798 + * to run with interrupts enabled as it uses
10799 + * smp_call_function().
10801 + local_irq_enable();
10802 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
10804 + printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
10806 + local_irq_disable();
10808 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
10813 + * The switch back from broadcast mode needs to be
10814 + * called with interrupts disabled.
10816 + local_irq_disable();
10817 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
10818 + local_irq_enable();
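
(c1e_idle() above detects C1E by reading the AMD interrupt-pending message
MSR — 0xc0010055, with the C1E-active bits 27 and 28, per mainline's
msr-index.h — and, once detected, forces the clockevents layer into broadcast
mode so the stopped LAPIC timer is compensated. The broadcast handshake
reduces to this pattern, shown as a sketch with a pseudo flag:

    if (first_time_on_this_cpu) {           /* pseudo: cpu not yet in c1e_mask */
            local_irq_enable();             /* IPI-based call needs IRQs on */
            clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu);
            local_irq_disable();
    }
    clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
    default_idle();                         /* halt with LAPIC timer stopped */
    local_irq_disable();
    clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); /* IRQs off */
    local_irq_enable();
)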
10824 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
10827 - static int selected;
10831 #ifdef CONFIG_X86_SMP
10832 if (pm_idle == poll_idle && smp_num_siblings > 1) {
10833 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
10834 " performance may degrade.\n");
10840 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
10842 - * Skip, if setup has overridden idle.
10843 * One CPU supports mwait => All CPUs supports mwait
10846 - printk(KERN_INFO "using mwait in idle threads.\n");
10847 - pm_idle = mwait_idle;
10851 + printk(KERN_INFO "using mwait in idle threads.\n");
10852 + pm_idle = mwait_idle;
10853 + } else if (check_c1e_idle(c)) {
10854 + printk(KERN_INFO "using C1E aware idle routine\n");
10855 + pm_idle = c1e_idle;
10857 + pm_idle = default_idle;
10861 static int __init idle_setup(char *str)
10866 if (!strcmp(str, "poll")) {
10867 printk("using polling idle threads.\n");
10868 pm_idle = poll_idle;
10871 - else if (!strcmp(str, "mwait"))
10872 + } else if (!strcmp(str, "mwait"))
10874 + else if (!strcmp(str, "halt")) {
10876 + * When the idle=halt boot option is given, halt is forced
10877 + * for CPU idle, and the CPU C2/C3 states will not be
10878 + * entered again.
10879 + * boot_option_idle_override is deliberately left untouched
10880 + * so that the CPU idle driver can still be loaded.
10882 + pm_idle = default_idle;
10885 + } else if (!strcmp(str, "nomwait")) {
10887 + * If the idle=nomwait boot option is given, mwait is
10888 + * disabled for the CPU C2/C3 states. As with idle=halt,
10889 + * boot_option_idle_override is deliberately left
10890 + * untouched.
10892 + idle_nomwait = 1;
10899 boot_option_idle_override = 1;
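
(For completeness, the resulting options look like this on a bootloader
command line — paths illustrative:

    kernel /boot/vmlinuz-2.6.27 root=/dev/sda1 idle=halt     # force hlt, keep cpuidle
    kernel /boot/vmlinuz-2.6.27 root=/dev/sda1 idle=nomwait  # never use mwait for C-states
)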
10900 --- sle11-2009-06-04.orig/arch/x86/kernel/process_32-xen.c 2009-03-16 16:38:05.000000000 +0100
10901 +++ sle11-2009-06-04/arch/x86/kernel/process_32-xen.c 2009-06-04 10:21:39.000000000 +0200
10902 @@ -59,15 +59,11 @@
10903 #include <asm/tlbflush.h>
10904 #include <asm/cpu.h>
10905 #include <asm/kdebug.h>
10906 +#include <asm/idle.h>
10908 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
10909 asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
10911 -static int hlt_counter;
10913 -unsigned long boot_option_idle_override = 0;
10914 -EXPORT_SYMBOL(boot_option_idle_override);
10916 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
10917 EXPORT_PER_CPU_SYMBOL(current_task);
10919 @@ -82,46 +78,27 @@ unsigned long thread_saved_pc(struct tas
10920 return ((unsigned long *)tsk->thread.sp)[3];
10924 - * Powermanagement idle function, if any..
10926 -void (*pm_idle)(void);
10927 -EXPORT_SYMBOL(pm_idle);
10928 +#ifdef CONFIG_HOTPLUG_CPU
10929 +#ifndef CONFIG_XEN
10930 +#include <asm/nmi.h>
10932 -void disable_hlt(void)
10933 +static void cpu_exit_clear(void)
10937 + int cpu = raw_smp_processor_id();
10939 -EXPORT_SYMBOL(disable_hlt);
10941 -void enable_hlt(void)
10945 + idle_task_exit();
10947 -EXPORT_SYMBOL(enable_hlt);
10949 + irq_ctx_exit(cpu);
10951 -static void xen_idle(void)
10953 - current_thread_info()->status &= ~TS_POLLING;
10955 - * TS_POLLING-cleared state must be visible before we
10956 - * test NEED_RESCHED:
10959 + cpu_clear(cpu, cpu_callout_map);
10960 + cpu_clear(cpu, cpu_callin_map);
10962 - if (!need_resched())
10963 - safe_halt(); /* enables interrupts racelessly */
10965 - local_irq_enable();
10966 - current_thread_info()->status |= TS_POLLING;
10967 + numa_remove_cpu(cpu);
10968 + c1e_remove_cpu(cpu);
10970 -#ifdef CONFIG_APM_MODULE
10971 -EXPORT_SYMBOL(default_idle);
10974 -#ifdef CONFIG_HOTPLUG_CPU
10975 static inline void play_dead(void)
10978 @@ -152,13 +129,11 @@ void cpu_idle(void)
10980 /* endless idle loop with no priority at all */
10982 - tick_nohz_stop_sched_tick();
10983 + tick_nohz_stop_sched_tick(1);
10984 while (!need_resched()) {
10985 - void (*idle)(void);
10989 - idle = xen_idle; /* no alternatives */
10991 if (rcu_pending(cpu))
10992 rcu_check_callbacks(cpu, 0);
10993 @@ -168,7 +143,10 @@ void cpu_idle(void)
10995 local_irq_disable();
10996 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
10998 + /* Don't trace irqs off for idle */
10999 + stop_critical_timings();
11001 + start_critical_timings();
11003 tick_nohz_restart_sched_tick();
11004 preempt_enable_no_resched();
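
(The tick_nohz_stop_sched_tick(1) change in both idle loops follows another
2.6.27 interface change: the function gained an inidle flag so the tick can
only be stopped from within the idle loop proper, i.e., as I read
kernel/time/tick-sched.c,

    void tick_nohz_stop_sched_tick(int inidle);
)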
11005 --- sle11-2009-06-04.orig/arch/x86/kernel/process_64-xen.c 2009-03-16 16:38:05.000000000 +0100
11006 +++ sle11-2009-06-04/arch/x86/kernel/process_64-xen.c 2009-06-04 10:21:39.000000000 +0200
11007 @@ -65,15 +65,6 @@ asmlinkage extern void ret_from_fork(voi
11009 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
11011 -unsigned long boot_option_idle_override = 0;
11012 -EXPORT_SYMBOL(boot_option_idle_override);
11015 - * Powermanagement idle function, if any..
11017 -void (*pm_idle)(void);
11018 -EXPORT_SYMBOL(pm_idle);
11020 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
11022 void idle_notifier_register(struct notifier_block *n)
11023 @@ -103,25 +94,13 @@ void exit_idle(void)
11027 -static void xen_idle(void)
11029 - current_thread_info()->status &= ~TS_POLLING;
11031 - * TS_POLLING-cleared state must be visible before we
11032 - * test NEED_RESCHED:
11035 - if (!need_resched())
11036 - safe_halt(); /* enables interrupts racelessly */
11038 - local_irq_enable();
11039 - current_thread_info()->status |= TS_POLLING;
11042 #ifdef CONFIG_HOTPLUG_CPU
11043 static inline void play_dead(void)
11046 +#ifndef CONFIG_XEN
11047 + c1e_remove_cpu(raw_smp_processor_id());
11049 local_irq_disable();
11050 cpu_clear(smp_processor_id(), cpu_initialized);
11051 preempt_enable_no_resched();
11052 @@ -146,12 +125,11 @@ void cpu_idle(void)
11053 current_thread_info()->status |= TS_POLLING;
11054 /* endless idle loop with no priority at all */
11056 - tick_nohz_stop_sched_tick();
11057 + tick_nohz_stop_sched_tick(1);
11058 while (!need_resched()) {
11059 - void (*idle)(void);
11062 - idle = xen_idle; /* no alternatives */
11064 if (cpu_is_offline(smp_processor_id()))
11067 @@ -161,7 +139,10 @@ void cpu_idle(void)
11069 local_irq_disable();
11072 + /* Don't trace irqs off for idle */
11073 + stop_critical_timings();
11075 + start_critical_timings();
11076 /* In many cases the interrupt that ended idle
11077 has already called exit_idle. But some idle
11078 loops can be woken up without interrupt. */
11079 @@ -271,7 +252,7 @@ void exit_thread(void)
11083 -void load_gs_index(unsigned gs)
11084 +void xen_load_gs_index(unsigned gs)
11086 WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
11088 @@ -373,10 +354,10 @@ int copy_thread(int nr, unsigned long cl
11089 p->thread.fs = me->thread.fs;
11090 p->thread.gs = me->thread.gs;
11092 - asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
11093 - asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
11094 - asm("mov %%es,%0" : "=m" (p->thread.es));
11095 - asm("mov %%ds,%0" : "=m" (p->thread.ds));
11096 + savesegment(gs, p->thread.gsindex);
11097 + savesegment(fs, p->thread.fsindex);
11098 + savesegment(es, p->thread.es);
11099 + savesegment(ds, p->thread.ds);
11101 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
11102 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
11103 @@ -417,7 +398,9 @@ out:
11105 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
11107 - asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
11108 + loadsegment(fs, 0);
11109 + loadsegment(es, 0);
11110 + loadsegment(ds, 0);
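
(The savesegment()/loadsegment() conversions here and below in
do_arch_prctl() replace open-coded mov instructions with the helpers from
<asm/system.h>. Roughly, savesegment() is the same mov with a memory clobber,
while loadsegment() additionally carries a .fixup/exception-table path so a
stale selector is zeroed instead of oopsing. A simplified sketch:

    #define savesegment(seg, value) \
            asm("mov %%" #seg ",%0" : "=r" (value) : : "memory")

    /* loadsegment(seg, value): movl into %seg, plus an extable entry
     * that reloads 0 if the selector faults. */
)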
11114 @@ -557,8 +540,8 @@ static inline void __switch_to_xtra(stru
11115 struct task_struct *
11116 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
11118 - struct thread_struct *prev = &prev_p->thread,
11119 - *next = &next_p->thread;
11120 + struct thread_struct *prev = &prev_p->thread;
11121 + struct thread_struct *next = &next_p->thread;
11122 int cpu = smp_processor_id();
11123 #ifndef CONFIG_X86_NO_TSS
11124 struct tss_struct *tss = &per_cpu(init_tss, cpu);
11125 @@ -661,12 +644,25 @@ __switch_to(struct task_struct *prev_p,
11127 if (unlikely(next->es))
11128 loadsegment(es, next->es);
11131 if (unlikely(next->ds))
11132 loadsegment(ds, next->ds);
11135 + * Leave lazy mode, flushing any hypercalls made here.
11136 + * This must be done before restoring TLS segments so
11137 + * the GDT and LDT are properly updated, and must be
11138 + * done before math_state_restore, so the TS bit is up
11141 + arch_leave_lazy_cpu_mode();
11144 * Switch FS and GS.
11146 + * A non-zero segment register always requires a reload; also
11147 + * reload when it has changed. When the previous process used a
11148 + * 64-bit base, always reload to avoid an information leak.
11150 if (unlikely(next->fsindex))
11151 loadsegment(fs, next->fsindex);
11152 @@ -687,7 +683,8 @@ __switch_to(struct task_struct *prev_p,
11153 write_pda(oldrsp, next->usersp);
11154 write_pda(pcurrent, next_p);
11155 write_pda(kernelstack,
11156 - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
11157 + (unsigned long)task_stack_page(next_p) +
11158 + THREAD_SIZE - PDA_STACKOFFSET);
11159 #ifdef CONFIG_CC_STACKPROTECTOR
11160 write_pda(stack_canary, next_p->stack_canary);
11162 @@ -848,7 +845,7 @@ long do_arch_prctl(struct task_struct *t
11163 set_32bit_tls(task, FS_TLS, addr);
11165 load_TLS(&task->thread, cpu);
11166 - asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
11167 + loadsegment(fs, FS_TLS_SEL);
11169 task->thread.fsindex = FS_TLS_SEL;
11170 task->thread.fs = 0;
11171 @@ -858,7 +855,7 @@ long do_arch_prctl(struct task_struct *t
11173 /* set the selector to 0 to not confuse
11175 - asm volatile("movl %0,%%fs" :: "r" (0));
11176 + loadsegment(fs, 0);
11177 ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
11180 @@ -882,7 +879,7 @@ long do_arch_prctl(struct task_struct *t
11181 if (task->thread.gsindex == GS_TLS_SEL)
11182 base = read_32bit_tls(task, GS_TLS);
11184 - asm("movl %%gs,%0" : "=r" (gsindex));
11185 + savesegment(gs, gsindex);
11187 rdmsrl(MSR_KERNEL_GS_BASE, base);
11189 --- sle11-2009-06-04.orig/arch/x86/kernel/quirks-xen.c 2009-03-16 16:33:40.000000000 +0100
11190 +++ sle11-2009-06-04/arch/x86/kernel/quirks-xen.c 2009-06-04 10:21:39.000000000 +0200
11191 @@ -63,6 +63,7 @@ static enum {
11192 ICH_FORCE_HPET_RESUME,
11193 VT8237_FORCE_HPET_RESUME,
11194 NVIDIA_FORCE_HPET_RESUME,
11195 + ATI_FORCE_HPET_RESUME,
11196 } force_hpet_resume_type;
11198 static void __iomem *rcba_base;
11199 @@ -156,6 +157,8 @@ static void ich_force_enable_hpet(struct
11201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
11202 ich_force_enable_hpet);
11203 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
11204 + ich_force_enable_hpet);
11205 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
11206 ich_force_enable_hpet);
11207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
11208 @@ -172,6 +175,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
11210 static struct pci_dev *cached_dev;
11212 +static void hpet_print_force_info(void)
11214 + printk(KERN_INFO "HPET not enabled in BIOS. "
11215 + "You might try hpet=force boot option\n");
11218 static void old_ich_force_hpet_resume(void)
11221 @@ -251,8 +260,12 @@ static void old_ich_force_enable_hpet_us
11223 if (hpet_force_user)
11224 old_ich_force_enable_hpet(dev);
11226 + hpet_print_force_info();
11229 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
11230 + old_ich_force_enable_hpet_user);
11231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
11232 old_ich_force_enable_hpet_user);
11233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
11234 @@ -288,9 +301,14 @@ static void vt8237_force_enable_hpet(str
11236 u32 uninitialized_var(val);
11238 - if (!hpet_force_user || hpet_address || force_hpet_address)
11239 + if (hpet_address || force_hpet_address)
11242 + if (!hpet_force_user) {
11243 + hpet_print_force_info();
11247 pci_read_config_dword(dev, 0x68, &val);
11249 * Bit 7 is HPET enable bit.
11250 @@ -328,6 +346,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_V
11251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
11252 vt8237_force_enable_hpet);
11254 +static void ati_force_hpet_resume(void)
11256 + pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
11257 + printk(KERN_DEBUG "Force enabled HPET at resume\n");
11260 +static void ati_force_enable_hpet(struct pci_dev *dev)
11262 + u32 uninitialized_var(val);
11264 + if (hpet_address || force_hpet_address)
11267 + if (!hpet_force_user) {
11268 + hpet_print_force_info();
11272 + pci_write_config_dword(dev, 0x14, 0xfed00000);
11273 + pci_read_config_dword(dev, 0x14, &val);
11274 + force_hpet_address = val;
11275 + force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
11276 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
11277 + force_hpet_address);
11278 + cached_dev = dev;
11281 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
11282 + ati_force_enable_hpet);
11285 * Undocumented chipset feature taken from LinuxBIOS.
11287 @@ -341,8 +389,13 @@ static void nvidia_force_enable_hpet(str
11289 u32 uninitialized_var(val);
11291 - if (!hpet_force_user || hpet_address || force_hpet_address)
11292 + if (hpet_address || force_hpet_address)
11295 + if (!hpet_force_user) {
11296 + hpet_print_force_info();
11300 pci_write_config_dword(dev, 0x44, 0xfed00001);
11301 pci_read_config_dword(dev, 0x44, &val);
11302 @@ -395,6 +448,9 @@ void force_hpet_resume(void)
11303 case NVIDIA_FORCE_HPET_RESUME:
11304 nvidia_force_hpet_resume();
11306 + case ATI_FORCE_HPET_RESUME:
11307 + ati_force_hpet_resume();
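
(All of these quirks only take effect when the user explicitly boots with the
HPET force option; otherwise hpet_print_force_info() merely prints the hint
added above. Illustrative command line:

    kernel /boot/vmlinuz-2.6.27 root=/dev/sda1 hpet=force
)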
11312 --- sle11-2009-06-04.orig/arch/x86/kernel/setup-xen.c 2009-03-16 16:38:05.000000000 +0100
11313 +++ sle11-2009-06-04/arch/x86/kernel/setup-xen.c 2009-06-04 10:21:39.000000000 +0200
11314 @@ -1,141 +1,1131 @@
11315 -#include <linux/kernel.h>
11317 + * Copyright (C) 1995 Linus Torvalds
11319 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11321 + * Memory region support
11322 + * David Parsons <orc@pell.chi.il.us>, July-August 1999
11324 + * Added E820 sanitization routine (removes overlapping memory regions);
11325 + * Brian Moyle <bmoyle@mvista.com>, February 2001
11327 + * Moved CPU detection code to cpu/${cpu}.c
11328 + * Patrick Mochel <mochel@osdl.org>, March 2002
11330 + * Provisions for empty E820 memory regions (reported by certain BIOSes).
11331 + * Alex Achenbach <xela@slit.de>, December 2002.
11336 + * This file handles the architecture-dependent parts of initialization
11339 +#include <linux/sched.h>
11340 +#include <linux/mm.h>
11341 +#include <linux/mmzone.h>
11342 +#include <linux/screen_info.h>
11343 +#include <linux/ioport.h>
11344 +#include <linux/acpi.h>
11345 +#include <linux/apm_bios.h>
11346 +#include <linux/initrd.h>
11347 +#include <linux/bootmem.h>
11348 +#include <linux/seq_file.h>
11349 +#include <linux/console.h>
11350 +#include <linux/mca.h>
11351 +#include <linux/root_dev.h>
11352 +#include <linux/highmem.h>
11353 #include <linux/module.h>
11354 +#include <linux/efi.h>
11355 #include <linux/init.h>
11356 -#include <linux/bootmem.h>
11357 +#include <linux/edd.h>
11358 +#include <linux/iscsi_ibft.h>
11359 +#include <linux/nodemask.h>
11360 +#include <linux/kexec.h>
11361 +#include <linux/dmi.h>
11362 +#include <linux/pfn.h>
11363 +#include <linux/pci.h>
11364 +#include <asm/pci-direct.h>
11365 +#include <linux/init_ohci1394_dma.h>
11366 +#include <linux/kvm_para.h>
11368 +#include <linux/errno.h>
11369 +#include <linux/kernel.h>
11370 +#include <linux/stddef.h>
11371 +#include <linux/unistd.h>
11372 +#include <linux/ptrace.h>
11373 +#include <linux/slab.h>
11374 +#include <linux/user.h>
11375 +#include <linux/delay.h>
11377 +#include <linux/kallsyms.h>
11378 +#include <linux/cpufreq.h>
11379 +#include <linux/dma-mapping.h>
11380 +#include <linux/ctype.h>
11381 +#include <linux/uaccess.h>
11383 #include <linux/percpu.h>
11384 -#include <asm/smp.h>
11385 -#include <asm/percpu.h>
11386 +#include <linux/crash_dump.h>
11388 +#include <video/edid.h>
11390 +#include <asm/mtrr.h>
11391 +#include <asm/apic.h>
11392 +#include <asm/e820.h>
11393 +#include <asm/mpspec.h>
11394 +#include <asm/setup.h>
11395 +#include <asm/arch_hooks.h>
11396 +#include <asm/efi.h>
11397 #include <asm/sections.h>
11398 +#include <asm/dmi.h>
11399 +#include <asm/io_apic.h>
11400 +#include <asm/ist.h>
11401 +#include <asm/vmi.h>
11402 +#include <setup_arch.h>
11403 +#include <asm/bios_ebda.h>
11404 +#include <asm/cacheflush.h>
11405 #include <asm/processor.h>
11406 -#include <asm/setup.h>
11407 +#include <asm/bugs.h>
11409 +#include <asm/system.h>
11410 +#include <asm/vsyscall.h>
11411 +#include <asm/smp.h>
11412 +#include <asm/desc.h>
11413 +#include <asm/dma.h>
11414 +#include <asm/iommu.h>
11415 +#include <asm/mmu_context.h>
11416 +#include <asm/proto.h>
11418 +#include <mach_apic.h>
11419 +#include <asm/paravirt.h>
11421 +#include <asm/percpu.h>
11422 #include <asm/topology.h>
11423 -#include <asm/mpspec.h>
11424 #include <asm/apicdef.h>
11425 +#ifdef CONFIG_X86_64
11426 +#include <asm/numa_64.h>
11430 +#include <asm/hypervisor.h>
11431 +#include <xen/interface/kexec.h>
11432 +#include <xen/interface/memory.h>
11433 +#include <xen/interface/nmi.h>
11434 +#include <xen/interface/physdev.h>
11435 +#include <xen/features.h>
11436 +#include <xen/firmware.h>
11437 +#include <xen/xencons.h>
11439 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
11440 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
11442 -#ifdef CONFIG_X86_LOCAL_APIC
11443 -unsigned int num_processors;
11444 -unsigned disabled_cpus __cpuinitdata;
11445 -/* Processor that is doing the boot up */
11446 -unsigned int boot_cpu_physical_apicid = -1U;
11447 -EXPORT_SYMBOL(boot_cpu_physical_apicid);
11448 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
11449 +static struct notifier_block xen_panic_block = {
11450 + xen_panic_event, NULL, 0 /* try to go last */
11453 -DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
11454 -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
11455 +unsigned long *phys_to_machine_mapping;
11456 +EXPORT_SYMBOL(phys_to_machine_mapping);
11458 -/* Bitmask of physically existing CPUs */
11459 -physid_mask_t phys_cpu_present_map;
11460 +unsigned long *pfn_to_mfn_frame_list_list,
11461 +#ifdef CONFIG_X86_64
11462 + *pfn_to_mfn_frame_list[512];
11464 + *pfn_to_mfn_frame_list[128];
11467 +/* Raw start-of-day parameters from the hypervisor. */
11468 +start_info_t *xen_start_info;
11469 +EXPORT_SYMBOL(xen_start_info);
11472 +#ifndef ARCH_SETUP
11473 +#define ARCH_SETUP
11476 +#ifndef CONFIG_XEN
11477 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
11478 +struct boot_params __initdata boot_params;
11480 +struct boot_params boot_params;
11484 -#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
11486 - * Copy data used in early init routines from the initial arrays to the
11487 - * per cpu data areas. These arrays then become expendable and the
11488 - * *_early_ptr's are zeroed indicating that the static arrays are gone.
11489 + * Machine setup..
11491 -static void __init setup_per_cpu_maps(void)
11492 +static struct resource data_resource = {
11493 + .name = "Kernel data",
11496 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11499 +static struct resource code_resource = {
11500 + .name = "Kernel code",
11503 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11506 +static struct resource bss_resource = {
11507 + .name = "Kernel bss",
11510 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11514 +#ifdef CONFIG_X86_32
11515 +#ifndef CONFIG_XEN
11516 +/* This value is set up by the early boot code to point to the value
11517 + immediately after the boot time page tables. It contains a *physical*
11518 + address, and must not be in the .bss segment! */
11519 +unsigned long init_pg_tables_start __initdata = ~0UL;
11520 +unsigned long init_pg_tables_end __initdata = ~0UL;
11523 +static struct resource video_ram_resource = {
11524 + .name = "Video RAM area",
11525 + .start = 0xa0000,
11527 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11530 +/* cpu data as detected by the assembly code in head.S */
11531 +struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11532 +/* common cpu data for all cpus */
11533 +struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11534 +EXPORT_SYMBOL(boot_cpu_data);
11535 +#ifndef CONFIG_XEN
11536 +static void set_mca_bus(int x)
11543 +unsigned int def_to_bigsmp;
11545 +/* for MCA, but anyone else can use it if they want */
11546 +unsigned int machine_id;
11547 +unsigned int machine_submodel_id;
11548 +unsigned int BIOS_revision;
11550 +struct apm_info apm_info;
11551 +EXPORT_SYMBOL(apm_info);
11554 +#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
11555 +struct ist_info ist_info;
11556 +EXPORT_SYMBOL(ist_info);
11557 +#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
11558 +struct ist_info ist_info;
11562 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
11563 +EXPORT_SYMBOL(boot_cpu_data);
11567 +#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
11568 +unsigned long mmu_cr4_features;
11570 +unsigned long mmu_cr4_features = X86_CR4_PAE;
11573 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
11574 +int bootloader_type;
11577 + * Early DMI memory
11579 +int dmi_alloc_index;
11580 +char dmi_alloc_data[DMI_MAX_DATA];
11585 +struct screen_info screen_info;
11586 +EXPORT_SYMBOL(screen_info);
11587 +struct edid_info edid_info;
11588 +EXPORT_SYMBOL_GPL(edid_info);
11590 +extern int root_mountflags;
11592 +unsigned long saved_video_mode;
11594 +#define RAMDISK_IMAGE_START_MASK 0x07FF
11595 +#define RAMDISK_PROMPT_FLAG 0x8000
11596 +#define RAMDISK_LOAD_FLAG 0x4000
11598 +static char __initdata command_line[COMMAND_LINE_SIZE];
11600 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
11602 +#ifdef CONFIG_EDD_MODULE
11603 +EXPORT_SYMBOL(edd);
11605 +#ifndef CONFIG_XEN
11607 + * copy_edd() - Copy the BIOS EDD information
11608 + * from boot_params into a safe place.
11611 +static inline void copy_edd(void)
11613 + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
11614 + sizeof(edd.mbr_signature));
11615 + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
11616 + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
11617 + edd.edd_info_nr = boot_params.eddbuf_entries;
11621 +static inline void copy_edd(void)
11626 +#ifdef CONFIG_BLK_DEV_INITRD
11628 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11630 +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
11631 +static void __init relocate_initrd(void)
11634 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11635 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11636 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11637 + u64 ramdisk_here;
11638 + unsigned long slop, clen, mapaddr;
11641 + /* We need to move the initrd down into lowmem */
11642 + ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
11645 + if (ramdisk_here == -1ULL)
11646 + panic("Cannot find place for new RAMDISK of size %lld\n",
11649 + /* Note: this includes all the lowmem currently occupied by
11650 + the initrd; we rely on that fact to keep the data intact. */
11651 + reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
11653 + initrd_start = ramdisk_here + PAGE_OFFSET;
11654 + initrd_end = initrd_start + ramdisk_size;
11655 + printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
11656 + ramdisk_here, ramdisk_here + ramdisk_size);
11658 + q = (char *)initrd_start;
11660 + /* Copy any lowmem portion of the initrd */
11661 + if (ramdisk_image < end_of_lowmem) {
11662 + clen = end_of_lowmem - ramdisk_image;
11663 + p = (char *)__va(ramdisk_image);
11664 + memcpy(q, p, clen);
11666 + ramdisk_image += clen;
11667 + ramdisk_size -= clen;
11670 + /* Copy the highmem portion of the initrd */
11671 + while (ramdisk_size) {
11672 + slop = ramdisk_image & ~PAGE_MASK;
11673 + clen = ramdisk_size;
11674 + if (clen > MAX_MAP_CHUNK-slop)
11675 + clen = MAX_MAP_CHUNK-slop;
11676 + mapaddr = ramdisk_image & PAGE_MASK;
11677 + p = early_ioremap(mapaddr, clen+slop);
11678 + memcpy(q, p+slop, clen);
11679 + early_iounmap(p, clen+slop);
11681 + ramdisk_image += clen;
11682 + ramdisk_size -= clen;
11684 + /* high pages are not converted by early_res_to_bootmem */
11685 + ramdisk_image = boot_params.hdr.ramdisk_image;
11686 + ramdisk_size = boot_params.hdr.ramdisk_size;
11687 + printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
11688 + " %08llx - %08llx\n",
11689 + ramdisk_image, ramdisk_image + ramdisk_size - 1,
11690 + ramdisk_here, ramdisk_here + ramdisk_size - 1);
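
(The highmem copy loop in relocate_initrd() walks the image through a
fixed-size early_ioremap() window, trimming the first chunk by the image's
offset into its page — the slop. A standalone sketch of just that arithmetic;
the 64-page window and the addresses are assumptions for the example, not
taken from the patch:

    #include <stdio.h>

    #define PAGE_SHIFT    12
    #define PAGE_MASK     (~((1UL << PAGE_SHIFT) - 1))
    #define MAX_MAP_CHUNK (64UL << PAGE_SHIFT)  /* assumed NR_FIX_BTMAPS = 64 */

    int main(void)
    {
            unsigned long image = 0x37fff800UL;  /* hypothetical start  */
            unsigned long size  = 3UL << 20;     /* hypothetical length */

            while (size) {
                    unsigned long slop = image & ~PAGE_MASK; /* in-page offset */
                    unsigned long clen = size;

                    if (clen > MAX_MAP_CHUNK - slop)
                            clen = MAX_MAP_CHUNK - slop;
                    printf("map %#lx, copy %#lx bytes (slop %#lx)\n",
                           image & PAGE_MASK, clen, slop);
                    image += clen;
                    size  -= clen;
            }
            return 0;
    }
)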
11694 +static void __init reserve_initrd(void)
11698 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11699 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11700 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
11701 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11703 + if (!boot_params.hdr.type_of_loader ||
11704 + !ramdisk_image || !ramdisk_size)
11705 + return; /* No initrd provided by bootloader */
11707 + unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
11708 + unsigned long ramdisk_size = xen_start_info->mod_len;
11709 + unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
11710 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11712 - for_each_possible_cpu(cpu) {
11713 - per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
11714 - per_cpu(x86_bios_cpu_apicid, cpu) =
11715 - x86_bios_cpu_apicid_init[cpu];
11716 -#ifdef CONFIG_NUMA
11717 - per_cpu(x86_cpu_to_node_map, cpu) =
11718 - x86_cpu_to_node_map_init[cpu];
11719 + if (!xen_start_info->mod_start || !ramdisk_size)
11720 + return; /* No initrd provided by bootloader */
11723 + initrd_start = 0;
11725 + if (ramdisk_size >= (end_of_lowmem>>1)) {
11726 + free_early(ramdisk_image, ramdisk_end);
11727 + printk(KERN_ERR "initrd too large to handle, "
11728 + "disabling initrd\n");
11732 - /* indicate the early static arrays will soon be gone */
11733 - x86_cpu_to_apicid_early_ptr = NULL;
11734 - x86_bios_cpu_apicid_early_ptr = NULL;
11735 -#ifdef CONFIG_NUMA
11736 - x86_cpu_to_node_map_early_ptr = NULL;
11737 + printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
11741 + if (ramdisk_end <= end_of_lowmem) {
11742 + /* All in lowmem, easy case */
11744 + * don't need to reserve again, already reserved early
11745 + * in i386_start_kernel
11747 + initrd_start = ramdisk_image + PAGE_OFFSET;
11748 + initrd_end = initrd_start + ramdisk_size;
11749 +#ifdef CONFIG_X86_64_XEN
11750 + initrd_below_start_ok = 1;
11755 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11756 + relocate_initrd();
11758 + printk(KERN_ERR "initrd extends beyond end of memory "
11759 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
11760 + ramdisk_end, end_of_lowmem);
11761 + initrd_start = 0;
11763 + free_early(ramdisk_image, ramdisk_end);
11766 +static void __init reserve_initrd(void)
11769 +#endif /* CONFIG_BLK_DEV_INITRD */
11771 +static void __init parse_setup_data(void)
11773 +#ifndef CONFIG_XEN
11774 + struct setup_data *data;
11777 -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
11778 -cpumask_t *cpumask_of_cpu_map __read_mostly;
11779 -EXPORT_SYMBOL(cpumask_of_cpu_map);
11780 + if (boot_params.hdr.version < 0x0209)
11782 + pa_data = boot_params.hdr.setup_data;
11783 + while (pa_data) {
11784 + data = early_ioremap(pa_data, PAGE_SIZE);
11785 + switch (data->type) {
11786 + case SETUP_E820_EXT:
11787 + parse_e820_ext(data, pa_data);
11792 + pa_data = data->next;
11793 + early_iounmap(data, PAGE_SIZE);
11798 -/* requires nr_cpu_ids to be initialized */
11799 -static void __init setup_cpumask_of_cpu(void)
11800 +static void __init e820_reserve_setup_data(void)
11803 +#ifndef CONFIG_XEN
11804 + struct setup_data *data;
11808 + if (boot_params.hdr.version < 0x0209)
11810 + pa_data = boot_params.hdr.setup_data;
11811 + while (pa_data) {
11812 + data = early_ioremap(pa_data, sizeof(*data));
11813 + e820_update_range(pa_data, sizeof(*data)+data->len,
11814 + E820_RAM, E820_RESERVED_KERN);
11816 + pa_data = data->next;
11817 + early_iounmap(data, sizeof(*data));
11822 - /* alloc_bootmem zeroes memory */
11823 - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
11824 - for (i = 0; i < nr_cpu_ids; i++)
11825 - cpu_set(i, cpumask_of_cpu_map[i]);
11826 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
11827 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
11828 + printk(KERN_INFO "extended physical RAM map:\n");
11829 + e820_print_map("reserve setup_data");
11833 -static inline void setup_cpumask_of_cpu(void) { }
11835 +static void __init reserve_early_setup_data(void)
11837 +#ifndef CONFIG_XEN
11838 + struct setup_data *data;
11842 + if (boot_params.hdr.version < 0x0209)
11844 + pa_data = boot_params.hdr.setup_data;
11845 + while (pa_data) {
11846 + data = early_ioremap(pa_data, sizeof(*data));
11847 + sprintf(buf, "setup data %x", data->type);
11848 + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
11849 + pa_data = data->next;
11850 + early_iounmap(data, sizeof(*data));
11855 -#ifdef CONFIG_X86_32
11857 - * Great future not-so-futuristic plan: make i386 and x86_64 do it
11859 + * --------- Crashkernel reservation ------------------------------
11862 +#ifdef CONFIG_KEXEC
11864 +#ifndef CONFIG_XEN
11866 + * Reserve @size bytes of crashkernel memory at any suitable offset.
11868 + * @size: Size of the crashkernel memory to reserve.
11869 + * Returns the base address on success, and -1ULL on failure.
11871 +unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
11873 + const unsigned long long alignment = 16<<20; /* 16M */
11874 + unsigned long long start = 0LL;
11879 + start = find_e820_area(start, ULONG_MAX, size, alignment);
11880 + if (start == -1ULL)
11883 + /* try to reserve it */
11884 + ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
11888 + start += alignment;
11892 +static inline unsigned long long get_total_mem(void)
11894 + unsigned long long total;
11896 + total = max_low_pfn - min_low_pfn;
11897 +#ifdef CONFIG_HIGHMEM
11898 + total += highend_pfn - highstart_pfn;
11901 + return total << PAGE_SHIFT;
11904 +static void __init reserve_crashkernel(void)
11906 + unsigned long long total_mem;
11907 + unsigned long long crash_size, crash_base;
11910 + total_mem = get_total_mem();
11912 + ret = parse_crashkernel(boot_command_line, total_mem,
11913 + &crash_size, &crash_base);
11914 + if (ret != 0 || crash_size <= 0)
11917 + /* 0 means: find the address automatically */
11918 + if (crash_base <= 0) {
11919 + crash_base = find_and_reserve_crashkernel(crash_size);
11920 + if (crash_base == -1ULL) {
11921 + pr_info("crashkernel reservation failed. "
11922 + "No suitable area found.\n");
11926 + ret = reserve_bootmem_generic(crash_base, crash_size,
11927 + BOOTMEM_EXCLUSIVE);
11929 + pr_info("crashkernel reservation failed - "
11930 + "memory is in use\n");
11935 + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
11936 + "for crashkernel (System RAM: %ldMB)\n",
11937 + (unsigned long)(crash_size >> 20),
11938 + (unsigned long)(crash_base >> 20),
11939 + (unsigned long)(total_mem >> 20));
11941 + crashk_res.start = crash_base;
11942 + crashk_res.end = crash_base + crash_size - 1;
11943 + insert_resource(&iomem_resource, &crashk_res);
11946 +#define reserve_crashkernel xen_machine_kexec_setup_resources
11949 +static void __init reserve_crashkernel(void)
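
(The non-Xen reserve_crashkernel() above is driven entirely by the
crashkernel= option; as I understand parse_crashkernel(), it accepts both the
simple and the ranged syntax — values illustrative:

    crashkernel=128M          # size only; base via find_and_reserve_crashkernel()
    crashkernel=64M@16M       # explicit base
    crashkernel=512M-2G:64M,2G-:128M  # size chosen by installed RAM
)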
11954 +static struct resource standard_io_resources[] = {
11955 + { .name = "dma1", .start = 0x00, .end = 0x1f,
11956 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11957 + { .name = "pic1", .start = 0x20, .end = 0x21,
11958 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11959 + { .name = "timer0", .start = 0x40, .end = 0x43,
11960 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11961 + { .name = "timer1", .start = 0x50, .end = 0x53,
11962 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11963 + { .name = "keyboard", .start = 0x60, .end = 0x60,
11964 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11965 + { .name = "keyboard", .start = 0x64, .end = 0x64,
11966 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11967 + { .name = "dma page reg", .start = 0x80, .end = 0x8f,
11968 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11969 + { .name = "pic2", .start = 0xa0, .end = 0xa1,
11970 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11971 + { .name = "dma2", .start = 0xc0, .end = 0xdf,
11972 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11973 + { .name = "fpu", .start = 0xf0, .end = 0xff,
11974 + .flags = IORESOURCE_BUSY | IORESOURCE_IO }
11977 +static void __init reserve_standard_io_resources(void)
11981 + /* Nothing to do if not running in dom0. */
11982 + if (!is_initial_xendomain())
11985 + /* request I/O space for devices used on all i[345]86 PCs */
11986 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
11987 + request_resource(&ioport_resource, &standard_io_resources[i]);
11991 +#ifdef CONFIG_PROC_VMCORE
11992 +/* elfcorehdr= specifies the location of the ELF core header
11993 + * stored by the crashed kernel. This option is passed by the
11994 + * kexec loader to the capture kernel.
11996 -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
11997 -EXPORT_SYMBOL(__per_cpu_offset);
11998 +static int __init setup_elfcorehdr(char *arg)
12003 + elfcorehdr_addr = memparse(arg, &end);
12004 + return end > arg ? 0 : -EINVAL;
12006 +early_param("elfcorehdr", setup_elfcorehdr);
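
(setup_elfcorehdr() uses memparse(), so the capture kernel can be handed
either a raw address or a suffixed value — examples illustrative:

    elfcorehdr=0x02000000
    elfcorehdr=32M
)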
12009 +static struct x86_quirks default_x86_quirks __initdata;
12011 +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
12014 + * Determine if we were loaded by an EFI loader. If so, then we have also been
12015 + * passed the efi memmap, systab, etc., so we should use these data structures
12016 + * for initialization. Note, the efi init code path is determined by the
12017 + * global efi_enabled. This allows the same kernel image to be used on existing
12018 + * systems (with a traditional BIOS) as well as on EFI systems.
12021 - * Great future plan:
12022 - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
12023 - * Always point %gs to its beginning
12024 + * setup_arch - architecture-specific boot-time initializations
12026 + * Note: On x86_64, fixmaps are ready for use even before this is called.
12028 -void __init setup_per_cpu_areas(void)
12030 +void __init setup_arch(char **cmdline_p)
12032 - int i, highest_cpu = 0;
12033 - unsigned long size;
12036 + unsigned long p2m_pages;
12037 + struct physdev_set_iopl set_iopl;
12039 -#ifdef CONFIG_HOTPLUG_CPU
12040 - prefill_possible_map();
12041 +#ifdef CONFIG_X86_32
12042 + /* Force a quick death if the kernel panics (not domain 0). */
12043 + extern int panic_timeout;
12044 + if (!panic_timeout && !is_initial_xendomain())
12045 + panic_timeout = 1;
12048 - /* Copy section for each CPU (we discard the original) */
12049 - size = PERCPU_ENOUGH_ROOM;
12050 - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
12053 - for_each_possible_cpu(i) {
12055 -#ifndef CONFIG_NEED_MULTIPLE_NODES
12056 - ptr = alloc_bootmem_pages(size);
12058 - int node = early_cpu_to_node(i);
12059 - if (!node_online(node) || !NODE_DATA(node)) {
12060 - ptr = alloc_bootmem_pages(size);
12062 - "cpu %d has no node or node-local memory\n", i);
12065 - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
12066 + /* Register a call for panic conditions. */
12067 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
12069 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12070 + VMASST_TYPE_writable_pagetables));
12071 +#ifdef CONFIG_X86_32
12072 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12073 + VMASST_TYPE_4gb_segments));
12075 +#endif /* CONFIG_XEN */
12077 +#ifdef CONFIG_X86_32
12078 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
12079 + visws_early_detect();
12080 + pre_setup_arch_hook();
12082 + printk(KERN_INFO "Command line: %s\n", boot_command_line);
12085 + early_cpu_init();
12086 + early_ioremap_init();
12088 +#ifndef CONFIG_XEN
12089 + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
12090 + screen_info = boot_params.screen_info;
12091 + edid_info = boot_params.edid_info;
12092 +#ifdef CONFIG_X86_32
12093 + apm_info.bios = boot_params.apm_bios_info;
12094 + ist_info = boot_params.ist_info;
12095 + if (boot_params.sys_desc_table.length != 0) {
12096 + set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
12097 + machine_id = boot_params.sys_desc_table.table[0];
12098 + machine_submodel_id = boot_params.sys_desc_table.table[1];
12099 + BIOS_revision = boot_params.sys_desc_table.table[2];
12102 + saved_video_mode = boot_params.hdr.vid_mode;
12103 + bootloader_type = boot_params.hdr.type_of_loader;
12105 +#ifdef CONFIG_BLK_DEV_RAM
12106 + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
12107 + rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
12108 + rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
12111 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
12112 +#ifdef CONFIG_X86_32
12118 - panic("Cannot allocate cpu data for CPU %d\n", i);
12121 + efi_reserve_early();
12124 +#else /* CONFIG_XEN */
12125 +#ifdef CONFIG_X86_32
12126 + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
12127 + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
12129 + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
12131 + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
12133 + if (is_initial_xendomain()) {
12134 + const struct dom0_vga_console_info *info =
12135 + (void *)((char *)xen_start_info +
12136 + xen_start_info->console.dom0.info_off);
12138 + dom0_init_screen_info(info,
12139 + xen_start_info->console.dom0.info_size);
12140 + xen_start_info->console.domU.mfn = 0;
12141 + xen_start_info->console.domU.evtchn = 0;
12143 + screen_info.orig_video_isVGA = 0;
12145 +#endif /* CONFIG_XEN */
12149 + setup_memory_map();
12150 + parse_setup_data();
12151 + /* update the e820_saved too */
12152 + e820_reserve_setup_data();
12156 +#ifndef CONFIG_XEN
12157 + if (!boot_params.hdr.root_flags)
12158 + root_mountflags &= ~MS_RDONLY;
12160 + init_mm.start_code = (unsigned long) _text;
12161 + init_mm.end_code = (unsigned long) _etext;
12162 + init_mm.end_data = (unsigned long) _edata;
12163 +#ifdef CONFIG_X86_32
12164 +#ifndef CONFIG_XEN
12165 + init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
12167 + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
12168 + xen_start_info->nr_pt_frames) << PAGE_SHIFT;
12171 + init_mm.brk = (unsigned long) &_end;
12174 + code_resource.start = virt_to_phys(_text);
12175 + code_resource.end = virt_to_phys(_etext)-1;
12176 + data_resource.start = virt_to_phys(_etext);
12177 + data_resource.end = virt_to_phys(_edata)-1;
12178 + bss_resource.start = virt_to_phys(&__bss_start);
12179 + bss_resource.end = virt_to_phys(&__bss_stop)-1;
12181 + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
12182 + *cmdline_p = command_line;
12184 + parse_early_param();
12186 #ifdef CONFIG_X86_64
12187 - cpu_pda(i)->data_offset = ptr - __per_cpu_start;
12191 +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
12193 + * Must be before kernel pagetables are setup
12194 + * or fixmap area is touched.
12199 + /* after early param, so could get panic from serial */
12200 + reserve_early_setup_data();
12202 + if (acpi_mps_check()) {
12203 +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
12204 + disable_apic = 1;
12206 + setup_clear_cpu_cap(X86_FEATURE_APIC);
12210 + if (pci_early_dump_regs)
12211 + early_dump_pci_devices();
12214 + finish_e820_parsing();
12216 +#ifdef CONFIG_X86_32
12220 +#ifndef CONFIG_XEN
12221 + /* after parse_early_param, so could debug it */
12222 + insert_resource(&iomem_resource, &code_resource);
12223 + insert_resource(&iomem_resource, &data_resource);
12224 + insert_resource(&iomem_resource, &bss_resource);
12229 +#ifdef CONFIG_X86_32
12230 + if (ppro_with_ram_bug()) {
12231 + e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
12233 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
12234 + printk(KERN_INFO "fixed physical RAM map:\n");
12235 + e820_print_map("bad_ppro");
12238 - __per_cpu_offset[i] = ptr - __per_cpu_start;
12239 + early_gart_iommu_check();
12241 - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
12242 +#endif /* CONFIG_XEN */
12246 + * partially used pages are not usable - thus
12247 + * we are rounding upwards:
12249 + max_pfn = e820_end_of_ram_pfn();
12251 + /* preallocate 4k for mptable mpc */
12252 + early_reserve_e820_mpc_new();
12253 + /* update e820 for memory not covered by WB MTRRs */
12255 +#ifndef CONFIG_XEN
12256 + if (mtrr_trim_uncached_memory(max_pfn))
12257 + max_pfn = e820_end_of_ram_pfn();
12260 +#ifdef CONFIG_X86_32
12261 + /* max_low_pfn get updated here */
12262 + find_low_pfn_range();
12264 + num_physpages = max_pfn;
12265 + max_mapnr = max_pfn;
12268 + /* How many end-of-memory variables you have, grandma! */
12269 + /* need this before calling reserve_initrd */
12270 + if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
12271 + max_low_pfn = e820_end_of_low_ram_pfn();
12273 + max_low_pfn = max_pfn;
12275 + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
12278 + /* max_pfn_mapped is updated here */
12279 + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
12280 + max_pfn_mapped = max_low_pfn_mapped;
12282 +#ifdef CONFIG_X86_64
12283 + if (max_pfn > max_low_pfn) {
12284 + max_pfn_mapped = init_memory_mapping(1UL<<32,
12285 + max_pfn<<PAGE_SHIFT);
12286 + /* can we preserve max_low_pfn? */
12287 + max_low_pfn = max_pfn;
12291 - nr_cpu_ids = highest_cpu + 1;
12292 - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
12294 + * NOTE: On x86-32, fixmaps are ready for use only from this point on.
12297 - /* Setup percpu data maps */
12298 - setup_per_cpu_maps();
12299 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
12300 + if (init_ohci1394_dma_early)
12301 + init_ohci1394_dma_on_all_controllers();
12304 - /* Setup cpumask_of_cpu map */
12305 - setup_cpumask_of_cpu();
12307 + reserve_initrd();
12309 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12313 + if (is_initial_xendomain())
12314 + dmi_scan_machine();
12318 +#ifdef CONFIG_ACPI
12319 + if (!is_initial_xendomain()) {
12320 + printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
12326 + * Parse the ACPI tables for possible boot-time SMP configuration.
12328 + acpi_boot_table_init();
12330 +#ifdef CONFIG_ACPI_NUMA
12332 + * Parse SRAT to discover nodes.
12334 + acpi_numa_init();
12337 + initmem_init(0, max_pfn);
12339 +#ifdef CONFIG_ACPI_SLEEP
12341 + * Reserve low memory region for sleep support.
12343 + acpi_reserve_bootmem();
12345 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
12347 + * Find and reserve possible boot-time SMP configuration:
12349 + find_smp_config();
12351 + reserve_crashkernel();
12353 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12355 + * dma32_reserve_bootmem() allocates bootmem which may conflict
12356 + * with the crashkernel command line, so do that after
12357 + * reserve_crashkernel()
12359 + dma32_reserve_bootmem();
12362 + reserve_ibft_region();
12364 +#ifdef CONFIG_KVM_CLOCK
12368 + xen_pagetable_setup_start(swapper_pg_dir);
12370 + xen_pagetable_setup_done(swapper_pg_dir);
12371 + paravirt_post_allocator_init();
12373 +#ifdef CONFIG_X86_64
12378 + p2m_pages = max_pfn;
12379 + if (xen_start_info->nr_pages > max_pfn) {
12381 + * the max_pfn was shrunk (probably by mem= or highmem=
12382 + * kernel parameter); shrink reservation with the HV
12384 + struct xen_memory_reservation reservation = {
12385 + .address_bits = 0,
12386 + .extent_order = 0,
12387 + .domid = DOMID_SELF
12389 + unsigned int difference;
12392 + difference = xen_start_info->nr_pages - max_pfn;
12394 + set_xen_guest_handle(reservation.extent_start,
12395 + ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
12396 + reservation.nr_extents = difference;
12397 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
12399 + BUG_ON(ret != difference);
12401 + else if (max_pfn > xen_start_info->nr_pages)
12402 + p2m_pages = xen_start_info->nr_pages;
12404 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
12405 + unsigned long i, j;
12406 + unsigned int k, fpp;
12408 + /* Make sure we have a large enough P->M table. */
12409 + phys_to_machine_mapping = alloc_bootmem_pages(
12410 + max_pfn * sizeof(unsigned long));
12411 + memset(phys_to_machine_mapping, ~0,
12412 + max_pfn * sizeof(unsigned long));
12413 + memcpy(phys_to_machine_mapping,
12414 + (unsigned long *)xen_start_info->mfn_list,
12415 + p2m_pages * sizeof(unsigned long));
12417 + __pa(xen_start_info->mfn_list),
12418 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
12419 + sizeof(unsigned long))));
12422 + * Initialise the list of the frames that specify the list of
12423 + * frames that make up the p2m table. Used by save/restore.
12425 + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
12427 + fpp = PAGE_SIZE/sizeof(unsigned long);
12428 + for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
12433 + BUG_ON(k>=ARRAY_SIZE(pfn_to_mfn_frame_list));
12434 + pfn_to_mfn_frame_list[k] =
12435 + alloc_bootmem_pages(PAGE_SIZE);
12436 + pfn_to_mfn_frame_list_list[k] =
12437 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
12439 + pfn_to_mfn_frame_list[k][j] =
12440 + virt_to_mfn(&phys_to_machine_mapping[i]);
12442 + HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
12443 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
12444 + virt_to_mfn(pfn_to_mfn_frame_list_list);
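
(To put numbers on the structure built above — simple arithmetic, for
illustration: with 4 KiB pages, fpp = PAGE_SIZE/sizeof(unsigned long) is 512
on x86-64 and 1024 on 32-bit. Each frame-list page names fpp p2m pages, and
each p2m page maps fpp pfns, so one frame-list page covers fpp*fpp pfns; with
512 (respectively 128) frame-list pages, the list-of-lists can describe
512*512*512 (respectively 128*1024*1024) pages, i.e. 512 GiB of guest
pseudo-physical memory in both configurations.)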
12447 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
12448 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
12449 + if (i != 4 && request_dma(i, "xen") != 0)
12451 +#endif /* CONFIG_XEN */
12453 +#ifdef CONFIG_X86_GENERICARCH
12454 + generic_apic_probe();
12457 +#ifndef CONFIG_XEN
12462 + * Read APIC and some other early information from ACPI tables.
12464 + acpi_boot_init();
12466 +#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
12468 + * get boot-time SMP configuration:
12470 + if (smp_found_config)
12471 + get_smp_config();
12474 + prefill_possible_map();
12475 +#ifdef CONFIG_X86_64
12476 + init_cpu_to_node();
12479 +#ifndef CONFIG_XEN
12480 + init_apic_mappings();
12481 + ioapic_init_mappings();
12483 + kvm_guest_init();
12485 + e820_reserve_resources();
12486 + e820_mark_nosave_regions(max_low_pfn);
12488 + if (is_initial_xendomain())
12489 + e820_reserve_resources();
12492 +#ifdef CONFIG_X86_32
12493 + request_resource(&iomem_resource, &video_ram_resource);
12495 + reserve_standard_io_resources();
12497 +#ifndef CONFIG_XEN
12498 + e820_setup_gap();
12501 +#if defined(CONFIG_VGA_CONSOLE)
12502 + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
12503 + conswitchp = &vga_con;
12504 +#elif defined(CONFIG_DUMMY_CONSOLE)
12505 + conswitchp = &dummy_con;
12508 +#else /* CONFIG_XEN */
12509 + if (is_initial_xendomain())
12510 + e820_setup_gap();
12512 + set_iopl.iopl = 1;
12513 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
12516 +#ifdef CONFIG_DUMMY_CONSOLE
12517 + conswitchp = &dummy_con;
12519 +#ifdef CONFIG_VGA_CONSOLE
12520 + if (is_initial_xendomain())
12521 + conswitchp = &vga_con;
12524 +#endif /* CONFIG_XEN */
12529 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
12531 + HYPERVISOR_shutdown(SHUTDOWN_crash);
12532 + /* we're never actually going to get here... */
12533 + return NOTIFY_DONE;
12535 +#endif /* !CONFIG_XEN */
12536 --- sle11-2009-06-04.orig/arch/x86/kernel/setup64-xen.c 2009-03-16 16:38:05.000000000 +0100
12537 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12540 - * X86-64 specific CPU setup.
12541 - * Copyright (C) 1995 Linus Torvalds
12542 - * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
12543 - * See setup.c for older changelog.
12545 - * Jun Nakajima <jun.nakajima@intel.com>
12546 - * Modified for Xen
12549 -#include <linux/init.h>
12550 -#include <linux/kernel.h>
12551 -#include <linux/sched.h>
12552 -#include <linux/string.h>
12553 -#include <linux/bootmem.h>
12554 -#include <linux/bitops.h>
12555 -#include <linux/module.h>
12556 -#include <linux/kgdb.h>
12557 -#include <asm/pda.h>
12558 -#include <asm/pgtable.h>
12559 -#include <asm/processor.h>
12560 -#include <asm/desc.h>
12561 -#include <asm/atomic.h>
12562 -#include <asm/mmu_context.h>
12563 -#include <asm/smp.h>
12564 -#include <asm/i387.h>
12565 -#include <asm/percpu.h>
12566 -#include <asm/proto.h>
12567 -#include <asm/sections.h>
12568 -#include <asm/setup.h>
12569 -#include <asm/genapic.h>
12571 -#include <asm/hypervisor.h>
12574 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
12575 -struct boot_params __initdata boot_params;
12577 -struct boot_params boot_params;
12580 -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
12582 -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
12583 -EXPORT_SYMBOL(_cpu_pda);
12584 -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
12586 -#ifndef CONFIG_X86_NO_IDT
12587 -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
12590 -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
12592 -unsigned long __supported_pte_mask __read_mostly = ~0UL;
12593 -EXPORT_SYMBOL(__supported_pte_mask);
12595 -static int do_not_nx __cpuinitdata = 0;
12598 -Control non executable mappings for 64bit processes.
12600 -on Enable(default)
12603 -static int __init nonx_setup(char *str)
12607 - if (!strncmp(str, "on", 2)) {
12608 - __supported_pte_mask |= _PAGE_NX;
12610 - } else if (!strncmp(str, "off", 3)) {
12612 - __supported_pte_mask &= ~_PAGE_NX;
12616 -early_param("noexec", nonx_setup);
12618 -int force_personality32 = 0;
12620 -/* noexec32=on|off
12621 -Control non executable heap for 32bit processes.
12622 -To control the stack too use noexec=off
12624 -on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
12625 -off PROT_READ implies PROT_EXEC
12627 -static int __init nonx32_setup(char *str)
12629 - if (!strcmp(str, "on"))
12630 - force_personality32 &= ~READ_IMPLIES_EXEC;
12631 - else if (!strcmp(str, "off"))
12632 - force_personality32 |= READ_IMPLIES_EXEC;
12635 -__setup("noexec32=", nonx32_setup);
12638 -static void __init_refok switch_pt(int cpu)
12642 - xen_pt_switch(__pa_symbol(init_level4_pgt));
12643 - xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
12645 -#define switch_pt() switch_pt(cpu)
12647 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12649 - unsigned long frames[16];
12650 - unsigned long va;
12653 - for (va = gdt_descr->address, f = 0;
12654 - va < gdt_descr->address + gdt_descr->size;
12655 - va += PAGE_SIZE, f++) {
12656 - frames[f] = virt_to_mfn(va);
12657 - make_page_readonly(
12658 - (void *)va, XENFEAT_writable_descriptor_tables);
12660 - if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
12661 - sizeof (struct desc_struct)))
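/*
 * A commented restatement of the Xen-side cpu_gdt_init() above: a PV
 * guest cannot just lgdt a writable GDT, so each backing frame is handed
 * to the hypervisor by machine frame number (the _sketch name is ours):
 */
static void __cpuinit cpu_gdt_init_sketch(const struct desc_ptr *gdt_descr)
{
	unsigned long frames[16];	/* MFNs backing the GDT */
	unsigned long va;
	int f;

	for (va = gdt_descr->address, f = 0;
	     va < gdt_descr->address + gdt_descr->size;
	     va += PAGE_SIZE, f++) {
		frames[f] = virt_to_mfn(va);
		/* a no-op when XENFEAT_writable_descriptor_tables is set */
		make_page_readonly((void *)va,
				   XENFEAT_writable_descriptor_tables);
	}
	if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
					sizeof(struct desc_struct)))
		BUG();
}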
12665 -static void switch_pt(void)
12667 - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
12670 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12672 - load_gdt(gdt_descr);
12673 - load_idt(idt_descr);
12677 -void pda_init(int cpu)
12679 - struct x8664_pda *pda = cpu_pda(cpu);
12681 - /* Set up data that may be needed in __get_free_pages early */
12682 - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
12683 -#ifndef CONFIG_XEN
12684 - /* Memory clobbers used to order PDA accesses */
12686 - wrmsrl(MSR_GS_BASE, pda);
12689 - if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
12690 - (unsigned long)pda))
12693 - pda->cpunumber = cpu;
12694 - pda->irqcount = -1;
12695 - pda->kernelstack =
12696 - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
12697 - pda->active_mm = &init_mm;
12698 - pda->mmu_state = 0;
12701 - /* others are initialized in smpboot.c */
12702 - pda->pcurrent = &init_task;
12703 - pda->irqstackptr = boot_cpu_stack;
12705 - pda->irqstackptr = (char *)
12706 - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
12707 - if (!pda->irqstackptr)
12708 - panic("cannot allocate irqstack for cpu %d", cpu);
12713 - pda->irqstackptr += IRQSTACKSIZE-64;
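/*
 * Why pda_init() above is split: the PDA is reached through %gs, whose
 * base lives in MSR_GS_BASE.  Native CPL0 code can wrmsrl() it directly;
 * a 64-bit Xen PV guest runs deprivileged, so it must ask the hypervisor
 * via HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, pda) instead.
 */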
12716 -#ifndef CONFIG_X86_NO_TSS
12717 -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
12718 -__attribute__((section(".bss.page_aligned")));
12721 -extern asmlinkage void ignore_sysret(void);
12723 -/* May not be marked __init: used by software suspend */
12724 -void syscall_init(void)
12726 -#ifndef CONFIG_XEN
12728 - * LSTAR and STAR live in a bit strange symbiosis.
12729 - * They both write to the same internal register. STAR allows to set CS/DS
12730 - * but only a 32bit target. LSTAR sets the 64bit rip.
12732 - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
12733 - wrmsrl(MSR_LSTAR, system_call);
12734 - wrmsrl(MSR_CSTAR, ignore_sysret);
12736 - /* Flags to clear on syscall */
12737 - wrmsrl(MSR_SYSCALL_MASK,
12738 - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
12740 -#ifdef CONFIG_IA32_EMULATION
12741 - syscall32_cpu_init ();
12744 - static const struct callback_register cstar = {
12745 - .type = CALLBACKTYPE_syscall32,
12746 - .address = (unsigned long)ignore_sysret
12748 - if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
12749 - printk(KERN_WARNING "Unable to register CSTAR callback\n");
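/*
 * The STAR/LSTAR "symbiosis" above, spelled out (per the AMD64 manual;
 * the selector names are the kernel's):
 *
 *   MSR_STAR  bits 63..48  SYSRET selector base: CS = base+16 (64-bit),
 *                          SS = base+8   -> __USER32_CS here
 *             bits 47..32  SYSCALL selector base: CS = base, SS = base+8
 *                          -> __KERNEL_CS here
 *             bits 31..0   legacy 32-bit SYSCALL EIP (unused in long mode)
 *   MSR_LSTAR the full 64-bit entry RIP, which STAR has no room for
 */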
12754 -void __cpuinit check_efer(void)
12756 - unsigned long efer;
12758 - rdmsrl(MSR_EFER, efer);
12759 - if (!(efer & EFER_NX) || do_not_nx) {
12760 - __supported_pte_mask &= ~_PAGE_NX;
12764 -unsigned long kernel_eflags;
12766 -#ifndef CONFIG_X86_NO_TSS
12768 - * Copies of the original ist values from the tss are only accessed during
12769 - * debugging, no special alignment required.
12771 -DEFINE_PER_CPU(struct orig_ist, orig_ist);
12775 - * cpu_init() initializes state that is per-CPU. Some data is already
12776 - * initialized (naturally) in the bootstrap process, such as the GDT
12777 - * and IDT. We reload them nevertheless, this function acts as a
12778 - * 'CPU state barrier', nothing should get across.
12779 - * A lot of state is already set up in PDA init.
12781 -void __cpuinit cpu_init (void)
12783 - int cpu = stack_smp_processor_id();
12784 -#ifndef CONFIG_X86_NO_TSS
12785 - struct tss_struct *t = &per_cpu(init_tss, cpu);
12786 - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
12788 - char *estacks = NULL;
12791 - struct task_struct *me;
12793 - /* CPU 0 is initialised in head64.c */
12797 -#ifndef CONFIG_X86_NO_TSS
12799 - estacks = boot_exception_stacks;
12804 - if (cpu_test_and_set(cpu, cpu_initialized))
12805 - panic("CPU#%d already initialized!\n", cpu);
12807 - printk("Initializing CPU#%d\n", cpu);
12809 - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
12812 - * Initialize the per-CPU GDT with the boot GDT,
12813 - * and set up the GDT descriptor:
12815 -#ifndef CONFIG_XEN
12817 - memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
12820 - cpu_gdt_descr[cpu].size = GDT_SIZE;
12821 - cpu_gdt_init(&cpu_gdt_descr[cpu]);
12823 - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
12826 - wrmsrl(MSR_FS_BASE, 0);
12827 - wrmsrl(MSR_KERNEL_GS_BASE, 0);
12832 -#ifndef CONFIG_X86_NO_TSS
12834 - * set up and load the per-CPU TSS
12836 - for (v = 0; v < N_EXCEPTION_STACKS; v++) {
12837 - static const unsigned int order[N_EXCEPTION_STACKS] = {
12838 - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
12839 - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
12842 - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
12844 - panic("Cannot allocate exception stack %ld %d\n",
12847 - estacks += PAGE_SIZE << order[v];
12848 - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
12851 - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
12853 - * <= is required because the CPU will access up to
12854 - * 8 bits beyond the end of the IO permission bitmap.
12856 - for (i = 0; i <= IO_BITMAP_LONGS; i++)
12857 - t->io_bitmap[i] = ~0UL;
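/*
 * The "<=" above is load-bearing: with IO_BITMAP_LONGS longs covering
 * 65536 ports, the loop writes IO_BITMAP_LONGS + 1 entries, because the
 * CPU may fetch up to 8 bits past the I/O bitmap limit in the TSS.  The
 * extra ~0UL padding long keeps that overread inside all-deny memory we
 * own.
 */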
12860 - atomic_inc(&init_mm.mm_count);
12861 - me->active_mm = &init_mm;
12864 - enter_lazy_tlb(&init_mm, me);
12866 -#ifndef CONFIG_X86_NO_TSS
12867 - set_tss_desc(cpu, t);
12869 -#ifndef CONFIG_XEN
12872 - load_LDT(&init_mm.context);
12874 -#ifdef CONFIG_KGDB
12876 - * If the kgdb is connected no debug regs should be altered. This
12877 - * is only applicable when KGDB and a KGDB I/O module are built
12878 - * into the kernel and you are using early debugging with
12879 - * kgdbwait. KGDB will control the kernel HW breakpoint registers.
12881 - if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
12882 - arch_kgdb_ops.correct_hw_break();
12886 - * Clear all 6 debug registers:
12889 - set_debugreg(0UL, 0);
12890 - set_debugreg(0UL, 1);
12891 - set_debugreg(0UL, 2);
12892 - set_debugreg(0UL, 3);
12893 - set_debugreg(0UL, 6);
12894 - set_debugreg(0UL, 7);
12895 -#ifdef CONFIG_KGDB
12896 - /* If the kgdb is connected no debug regs should be altered. */
12902 - asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
12903 - if (raw_irqs_disabled())
12904 - kernel_eflags &= ~X86_EFLAGS_IF;
12906 - if (is_uv_system())
12909 --- sle11-2009-06-04.orig/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:38:05.000000000 +0100
12910 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12913 - * Copyright (C) 1995 Linus Torvalds
12915 - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
12917 - * Memory region support
12918 - * David Parsons <orc@pell.chi.il.us>, July-August 1999
12920 - * Added E820 sanitization routine (removes overlapping memory regions);
12921 - * Brian Moyle <bmoyle@mvista.com>, February 2001
12923 - * Moved CPU detection code to cpu/${cpu}.c
12924 - * Patrick Mochel <mochel@osdl.org>, March 2002
12926 - * Provisions for empty E820 memory regions (reported by certain BIOSes).
12927 - * Alex Achenbach <xela@slit.de>, December 2002.
12932 - * This file handles the architecture-dependent parts of initialization
12935 -#include <linux/sched.h>
12936 -#include <linux/mm.h>
12937 -#include <linux/mmzone.h>
12938 -#include <linux/screen_info.h>
12939 -#include <linux/ioport.h>
12940 -#include <linux/acpi.h>
12941 -#include <linux/apm_bios.h>
12942 -#include <linux/initrd.h>
12943 -#include <linux/bootmem.h>
12944 -#include <linux/seq_file.h>
12945 -#include <linux/console.h>
12946 -#include <linux/mca.h>
12947 -#include <linux/root_dev.h>
12948 -#include <linux/highmem.h>
12949 -#include <linux/module.h>
12950 -#include <linux/efi.h>
12951 -#include <linux/init.h>
12952 -#include <linux/edd.h>
12953 -#include <linux/iscsi_ibft.h>
12954 -#include <linux/nodemask.h>
12955 -#include <linux/kernel.h>
12956 -#include <linux/percpu.h>
12957 -#include <linux/notifier.h>
12958 -#include <linux/kexec.h>
12959 -#include <linux/crash_dump.h>
12960 -#include <linux/dmi.h>
12961 -#include <linux/pfn.h>
12962 -#include <linux/pci.h>
12963 -#include <linux/init_ohci1394_dma.h>
12964 -#include <linux/kvm_para.h>
12966 -#include <video/edid.h>
12968 -#include <asm/mtrr.h>
12969 -#include <asm/apic.h>
12970 -#include <asm/e820.h>
12971 -#include <asm/mpspec.h>
12972 -#include <asm/mmzone.h>
12973 -#include <asm/setup.h>
12974 -#include <asm/arch_hooks.h>
12975 -#include <asm/sections.h>
12976 -#include <asm/io_apic.h>
12977 -#include <asm/ist.h>
12978 -#include <asm/io.h>
12979 -#include <asm/hypervisor.h>
12980 -#include <xen/interface/physdev.h>
12981 -#include <xen/interface/memory.h>
12982 -#include <xen/features.h>
12983 -#include <xen/firmware.h>
12984 -#include <xen/xencons.h>
12985 -#include <setup_arch.h>
12986 -#include <asm/bios_ebda.h>
12987 -#include <asm/cacheflush.h>
12988 -#include <asm/processor.h>
12991 -#include <xen/interface/kexec.h>
12994 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
12995 -static struct notifier_block xen_panic_block = {
12996 - xen_panic_event, NULL, 0 /* try to go last */
13000 - * Machine setup..
13002 -static struct resource data_resource = {
13003 - .name = "Kernel data",
13006 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13009 -static struct resource code_resource = {
13010 - .name = "Kernel code",
13013 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13016 -static struct resource bss_resource = {
13017 - .name = "Kernel bss",
13020 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13023 -static struct resource video_ram_resource = {
13024 - .name = "Video RAM area",
13025 - .start = 0xa0000,
13027 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13030 -static struct resource standard_io_resources[] = { {
13034 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13039 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13041 - .name = "timer0",
13044 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13046 - .name = "timer1",
13049 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13051 - .name = "keyboard",
13054 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13056 - .name = "keyboard",
13059 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13061 - .name = "dma page reg",
13064 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13069 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13074 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13079 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13082 -/* cpu data as detected by the assembly code in head.S */
13083 -struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13084 -/* common cpu data for all cpus */
13085 -struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13086 -EXPORT_SYMBOL(boot_cpu_data);
13088 -unsigned int def_to_bigsmp;
13090 -#ifndef CONFIG_X86_PAE
13091 -unsigned long mmu_cr4_features;
13093 -unsigned long mmu_cr4_features = X86_CR4_PAE;
13096 -/* for MCA, but anyone else can use it if they want */
13097 -unsigned int machine_id;
13098 -unsigned int machine_submodel_id;
13099 -unsigned int BIOS_revision;
13101 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
13102 -int bootloader_type;
13104 -/* user-defined highmem size */
13105 -static unsigned int highmem_pages = -1;
13110 -struct screen_info screen_info;
13111 -EXPORT_SYMBOL(screen_info);
13112 -struct apm_info apm_info;
13113 -EXPORT_SYMBOL(apm_info);
13114 -struct edid_info edid_info;
13115 -EXPORT_SYMBOL_GPL(edid_info);
13116 -#ifndef CONFIG_XEN
13117 -#define copy_edid() (edid_info = boot_params.edid_info)
13119 -struct ist_info ist_info;
13120 -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
13121 - defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
13122 -EXPORT_SYMBOL(ist_info);
13125 -extern void early_cpu_init(void);
13126 -extern int root_mountflags;
13128 -unsigned long saved_video_mode;
13130 -#define RAMDISK_IMAGE_START_MASK 0x07FF
13131 -#define RAMDISK_PROMPT_FLAG 0x8000
13132 -#define RAMDISK_LOAD_FLAG 0x4000
13134 -static char __initdata command_line[COMMAND_LINE_SIZE];
13136 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
13137 -struct boot_params __initdata boot_params;
13139 -struct boot_params boot_params;
13143 - * Point at the empty zero page to start with. We map the real shared_info
13144 - * page as soon as fixmap is up and running.
13146 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
13147 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
13149 -unsigned long *phys_to_machine_mapping;
13150 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
13151 -EXPORT_SYMBOL(phys_to_machine_mapping);
13153 -/* Raw start-of-day parameters from the hypervisor. */
13154 -start_info_t *xen_start_info;
13155 -EXPORT_SYMBOL(xen_start_info);
13157 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
13159 -#ifdef CONFIG_EDD_MODULE
13160 -EXPORT_SYMBOL(edd);
13162 -#ifndef CONFIG_XEN
13164 - * copy_edd() - Copy the BIOS EDD information
13165 - * from boot_params into a safe place.
13168 -static inline void copy_edd(void)
13170 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
13171 - sizeof(edd.mbr_signature));
13172 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
13173 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
13174 - edd.edd_info_nr = boot_params.eddbuf_entries;
13178 -static inline void copy_edd(void)
13183 -int __initdata user_defined_memmap;
13186 - * "mem=nopentium" disables the 4MB page tables.
13187 - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
13188 - * to <mem>, overriding the bios size.
13189 - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
13190 - * <start> to <start>+<mem>, overriding the bios size.
13192 - * HPA tells me bootloaders need to parse mem=, so no new
13193 - * option should be mem= [also see Documentation/i386/boot.txt]
13195 -static int __init parse_mem(char *arg)
13200 - if (strcmp(arg, "nopentium") == 0) {
13201 - setup_clear_cpu_cap(X86_FEATURE_PSE);
13203 - /* If the user specifies memory size, we
13204 - * limit the BIOS-provided memory map to
13205 - * that size. exactmap can be used to specify
13206 - * the exact map. mem=number can be used to
13207 - * trim the existing memory map.
13209 - unsigned long long mem_size;
13211 - mem_size = memparse(arg, &arg);
13212 - limit_regions(mem_size);
13213 - user_defined_memmap = 1;
13217 -early_param("mem", parse_mem);
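/*
 * parse_mem() leans on memparse(), which understands the usual K/M/G
 * suffixes.  A minimal illustration of its contract (the "example_mem"
 * parameter is made up):
 */
static int __init parse_example(char *arg)
{
	char *end;
	unsigned long long bytes = memparse(arg, &end);	/* "512M" -> 0x20000000 */

	return end > arg && bytes ? 0 : -EINVAL;
}
early_param("example_mem", parse_example);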
13219 -#ifdef CONFIG_PROC_VMCORE
13220 -/* elfcorehdr= specifies the location of elf core header
13221 - * stored by the crashed kernel.
13223 -static int __init parse_elfcorehdr(char *arg)
13228 - elfcorehdr_addr = memparse(arg, &arg);
13231 -early_param("elfcorehdr", parse_elfcorehdr);
13232 -#endif /* CONFIG_PROC_VMCORE */
13235 - * highmem=size forces highmem to be exactly 'size' bytes.
13236 - * This works even on boxes that have no highmem otherwise.
13237 - * This also works to reduce highmem size on bigger boxes.
13239 -static int __init parse_highmem(char *arg)
13244 - highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
13247 -early_param("highmem", parse_highmem);
13250 - * vmalloc=size forces the vmalloc area to be exactly 'size'
13251 - * bytes. This can be used to increase (or decrease) the
13252 - * vmalloc area - the default is 128m.
13254 -static int __init parse_vmalloc(char *arg)
13259 - __VMALLOC_RESERVE = memparse(arg, &arg);
13262 -early_param("vmalloc", parse_vmalloc);
13264 -#ifndef CONFIG_XEN
13266 - * reservetop=size reserves a hole at the top of the kernel address space which
13267 - * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
13268 - * so relocating the fixmap can be done before paging initialization.
13270 -static int __init parse_reservetop(char *arg)
13272 - unsigned long address;
13277 - address = memparse(arg, &arg);
13278 - reserve_top_address(address);
13281 -early_param("reservetop", parse_reservetop);
13285 - * Determine low and high memory ranges:
13287 -unsigned long __init find_max_low_pfn(void)
13289 - unsigned long max_low_pfn;
13291 - max_low_pfn = max_pfn;
13292 - if (max_low_pfn > MAXMEM_PFN) {
13293 - if (highmem_pages == -1)
13294 - highmem_pages = max_pfn - MAXMEM_PFN;
13295 - if (highmem_pages + MAXMEM_PFN < max_pfn)
13296 - max_pfn = MAXMEM_PFN + highmem_pages;
13297 - if (highmem_pages + MAXMEM_PFN > max_pfn) {
13298 - printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
13299 - highmem_pages = 0;
13301 - max_low_pfn = MAXMEM_PFN;
13302 -#ifndef CONFIG_HIGHMEM
13303 - /* Maximum memory usable is what is directly addressable */
13304 - printk(KERN_WARNING "Warning only %ldMB will be used.\n",
13306 - if (max_pfn > MAX_NONPAE_PFN)
13307 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13309 - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
13310 - max_pfn = MAXMEM_PFN;
13311 -#else /* !CONFIG_HIGHMEM */
13312 -#ifndef CONFIG_HIGHMEM64G
13313 - if (max_pfn > MAX_NONPAE_PFN) {
13314 - max_pfn = MAX_NONPAE_PFN;
13315 - printk(KERN_WARNING "Warning only 4GB will be used.\n");
13316 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13318 -#endif /* !CONFIG_HIGHMEM64G */
13319 -#endif /* !CONFIG_HIGHMEM */
13321 - if (highmem_pages == -1)
13322 - highmem_pages = 0;
13323 -#ifdef CONFIG_HIGHMEM
13324 - if (highmem_pages >= max_pfn) {
13325 - printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
13326 - highmem_pages = 0;
13328 - if (highmem_pages) {
13329 - if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
13330 - printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
13331 - highmem_pages = 0;
13333 - max_low_pfn -= highmem_pages;
13336 - if (highmem_pages)
13337 - printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
13340 - return max_low_pfn;
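/*
 * A worked instance of find_max_low_pfn(), assuming 4KB pages and the
 * usual 32-bit split where MAXMEM_PFN = 0x38000 (896MB of lowmem):
 *
 *   machine with max_pfn = 0x80000 (2GB), no highmem= given:
 *     highmem_pages = max_pfn - MAXMEM_PFN = 0x48000   (1152MB)
 *     max_low_pfn   = MAXMEM_PFN                       (896MB of lowmem)
 *
 *   same machine booted with highmem=512M:
 *     highmem_pages = 0x20000, and max_pfn is clipped to
 *     MAXMEM_PFN + highmem_pages = 0x58000             (1408MB total)
 */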
13343 -#ifndef CONFIG_XEN
13344 -#define BIOS_LOWMEM_KILOBYTES 0x413
13347 - * The BIOS places the EBDA/XBDA at the top of conventional
13348 - * memory, and usually decreases the reported amount of
13349 - * conventional memory (int 0x12) too. This also contains a
13350 - * workaround for Dell systems that neglect to reserve EBDA.
13351 - * The same workaround also avoids a problem with the AMD768MPX
13352 - * chipset: reserve a page before VGA to prevent PCI prefetch
13353 - * into it (errata #56). Usually the page is reserved anyways,
13354 - * unless you have no PS/2 mouse plugged in.
13356 -static void __init reserve_ebda_region(void)
13358 - unsigned int lowmem, ebda_addr;
13360 - /* To determine the position of the EBDA and the */
13361 - /* end of conventional memory, we need to look at */
13362 - /* the BIOS data area. In a paravirtual environment */
13363 - /* that area is absent. We'll just have to assume */
13364 - /* that the paravirt case can handle memory setup */
13365 - /* correctly, without our help. */
13366 - if (paravirt_enabled())
13369 - /* end of low (conventional) memory */
13370 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
13373 - /* start of EBDA area */
13374 - ebda_addr = get_bios_ebda();
13376 - /* Fixup: bios puts an EBDA in the top 64K segment */
13377 - /* of conventional memory, but does not adjust lowmem. */
13378 - if ((lowmem - ebda_addr) <= 0x10000)
13379 - lowmem = ebda_addr;
13381 - /* Fixup: bios does not report an EBDA at all. */
13382 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
13383 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
13384 - lowmem = 0x9f000;
13386 - /* Paranoia: should never happen, but... */
13387 - if ((lowmem == 0) || (lowmem >= 0x100000))
13388 - lowmem = 0x9f000;
13390 - /* reserve all memory between lowmem and the 1MB mark */
13391 - reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
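/*
 * What get_bios_ebda() above boils down to in the era's
 * <asm/bios_ebda.h> (a sketch; the helper name is illustrative):
 */
static inline unsigned int bios_ebda_sketch(void)
{
	/* the word at physical 0x40E holds the EBDA real-mode segment */
	unsigned int address = *(unsigned short *)phys_to_virt(0x40E);

	return address << 4;	/* real-mode segment -> physical address */
}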
13395 -#ifndef CONFIG_NEED_MULTIPLE_NODES
13396 -static void __init setup_bootmem_allocator(void);
13397 -static unsigned long __init setup_memory(void)
13400 - * partially used pages are not usable - thus
13401 - * we are rounding upwards:
13403 - min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
13404 - xen_start_info->nr_pt_frames;
13406 - max_low_pfn = find_max_low_pfn();
13408 -#ifdef CONFIG_HIGHMEM
13409 - highstart_pfn = highend_pfn = max_pfn;
13410 - if (max_pfn > max_low_pfn) {
13411 - highstart_pfn = max_low_pfn;
13413 - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
13414 - pages_to_mb(highend_pfn - highstart_pfn));
13415 - num_physpages = highend_pfn;
13416 - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
13418 - num_physpages = max_low_pfn;
13419 - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
13421 -#ifdef CONFIG_FLATMEM
13422 - max_mapnr = num_physpages;
13424 - printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
13425 - pages_to_mb(max_low_pfn));
13427 - setup_bootmem_allocator();
13429 - return max_low_pfn;
13432 -static void __init zone_sizes_init(void)
13434 - unsigned long max_zone_pfns[MAX_NR_ZONES];
13435 - memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
13436 - max_zone_pfns[ZONE_DMA] =
13437 - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
13438 - max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
13439 -#ifdef CONFIG_HIGHMEM
13440 - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
13441 - add_active_range(0, 0, highend_pfn);
13443 - add_active_range(0, 0, max_low_pfn);
13446 - free_area_init_nodes(max_zone_pfns);
13449 -extern unsigned long __init setup_memory(void);
13450 -extern void zone_sizes_init(void);
13451 -#endif /* !CONFIG_NEED_MULTIPLE_NODES */
13453 -static inline unsigned long long get_total_mem(void)
13455 - unsigned long long total;
13457 - total = max_low_pfn - min_low_pfn;
13458 -#ifdef CONFIG_HIGHMEM
13459 - total += highend_pfn - highstart_pfn;
13462 - return total << PAGE_SHIFT;
13465 -#ifdef CONFIG_KEXEC
13466 -#ifndef CONFIG_XEN
13467 -static void __init reserve_crashkernel(void)
13469 - unsigned long long total_mem;
13470 - unsigned long long crash_size, crash_base;
13473 - total_mem = get_total_mem();
13475 - ret = parse_crashkernel(boot_command_line, total_mem,
13476 - &crash_size, &crash_base);
13477 - if (ret == 0 && crash_size > 0) {
13478 - if (crash_base > 0) {
13479 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
13480 - "for crashkernel (System RAM: %ldMB)\n",
13481 - (unsigned long)(crash_size >> 20),
13482 - (unsigned long)(crash_base >> 20),
13483 - (unsigned long)(total_mem >> 20));
13485 - if (reserve_bootmem(crash_base, crash_size,
13486 - BOOTMEM_EXCLUSIVE) < 0) {
13487 - printk(KERN_INFO "crashkernel reservation "
13488 - "failed - memory is in use\n");
13492 - crashk_res.start = crash_base;
13493 - crashk_res.end = crash_base + crash_size - 1;
13495 - printk(KERN_INFO "crashkernel reservation failed - "
13496 - "you have to specify a base address\n");
13500 -#define reserve_crashkernel xen_machine_kexec_setup_resources
13503 -static inline void __init reserve_crashkernel(void)
13507 -#ifdef CONFIG_BLK_DEV_INITRD
13509 -static bool do_relocate_initrd = false;
13511 -static void __init reserve_initrd(void)
13513 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
13514 - unsigned long ramdisk_size = xen_start_info->mod_len;
13515 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
13516 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13517 - unsigned long ramdisk_here;
13519 - initrd_start = 0;
13521 - if (!xen_start_info->mod_start || !ramdisk_size)
13522 - return; /* No initrd provided by bootloader */
13524 - if (ramdisk_end < ramdisk_image) {
13525 - printk(KERN_ERR "initrd wraps around end of memory, "
13526 - "disabling initrd\n");
13529 - if (ramdisk_size >= end_of_lowmem/2) {
13530 - printk(KERN_ERR "initrd too large to handle, "
13531 - "disabling initrd\n");
13534 - if (ramdisk_end <= end_of_lowmem) {
13535 - /* All in lowmem, easy case */
13536 - reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
13537 - initrd_start = ramdisk_image + PAGE_OFFSET;
13538 - initrd_end = initrd_start+ramdisk_size;
13542 - /* We need to move the initrd down into lowmem */
13543 - ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
13545 - /* Note: this includes all the lowmem currently occupied by
13546 - the initrd; we rely on that fact to keep the data intact. */
13547 - reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
13548 - initrd_start = ramdisk_here + PAGE_OFFSET;
13549 - initrd_end = initrd_start + ramdisk_size;
13551 - do_relocate_initrd = true;
13554 -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
13556 -static void __init relocate_initrd(void)
13558 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
13559 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
13560 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13561 - unsigned long ramdisk_here;
13562 - unsigned long slop, clen, mapaddr;
13565 - if (!do_relocate_initrd)
13568 - ramdisk_here = initrd_start - PAGE_OFFSET;
13570 - q = (char *)initrd_start;
13572 - /* Copy any lowmem portion of the initrd */
13573 - if (ramdisk_image < end_of_lowmem) {
13574 - clen = end_of_lowmem - ramdisk_image;
13575 - p = (char *)__va(ramdisk_image);
13576 - memcpy(q, p, clen);
13578 - ramdisk_image += clen;
13579 - ramdisk_size -= clen;
13582 - /* Copy the highmem portion of the initrd */
13583 - while (ramdisk_size) {
13584 - slop = ramdisk_image & ~PAGE_MASK;
13585 - clen = ramdisk_size;
13586 - if (clen > MAX_MAP_CHUNK-slop)
13587 - clen = MAX_MAP_CHUNK-slop;
13588 - mapaddr = ramdisk_image & PAGE_MASK;
13589 - p = early_ioremap(mapaddr, clen+slop);
13590 - memcpy(q, p+slop, clen);
13591 - early_iounmap(p, clen+slop);
13593 - ramdisk_image += clen;
13594 - ramdisk_size -= clen;
13598 -#endif /* CONFIG_BLK_DEV_INITRD */
13600 -void __init setup_bootmem_allocator(void)
13602 - unsigned long bootmap_size;
13604 - * Initialize the boot-time allocator (with low memory only):
13606 - bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
13608 - register_bootmem_low_pages(max_low_pfn);
13611 - * Reserve the bootmem bitmap itself as well. We do this in two
13612 - * steps (first step was init_bootmem()) because this catches
13613 - * the (very unlikely) case of us accidentally initializing the
13614 - * bootmem allocator with an invalid RAM area.
13616 - reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
13617 - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
13618 - BOOTMEM_DEFAULT);
13620 -#ifndef CONFIG_XEN
13622 - * reserve physical page 0 - it's a special BIOS page on many boxes,
13623 - * enabling clean reboots, SMP operation, laptop functions.
13625 - reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
13627 - /* reserve EBDA region */
13628 - reserve_ebda_region();
13632 - * But first pinch a few for the stack/trampoline stuff
13633 - * FIXME: Don't need the extra page at 4K, but need to fix
13634 - * trampoline before removing it. (see the GDT stuff)
13636 - reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
13638 -#ifdef CONFIG_ACPI_SLEEP
13640 - * Reserve low memory region for sleep support.
13642 - acpi_reserve_bootmem();
13644 -#endif /* !CONFIG_XEN */
13646 -#ifdef CONFIG_BLK_DEV_INITRD
13647 - reserve_initrd();
13649 - numa_kva_reserve();
13650 - reserve_crashkernel();
13652 - reserve_ibft_region();
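/*
 * Ordering invariant of setup_bootmem_allocator() above: init_bootmem()
 * first (it creates the bitmap), then the kernel-plus-bitmap reservation,
 * then page 0/EBDA (native only), then initrd/crashkernel/ibft.  Nothing
 * may call alloc_bootmem*() before the bitmap itself is reserved, or the
 * allocator could hand out the pages holding its own accounting.
 */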
13656 - * The node 0 pgdat is initialized before all of these because
13657 - * it's needed for bootmem. node>0 pgdats have their virtual
13658 - * space allocated before the pagetables are in place to access
13659 - * them, so they can't be cleared then.
13661 - * This should all compile down to nothing when NUMA is off.
13663 -static void __init remapped_pgdat_init(void)
13667 - for_each_online_node(nid) {
13669 - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
13674 -static void set_mca_bus(int x)
13679 -static void set_mca_bus(int x) { }
13682 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
13683 -char * __init __attribute__((weak)) memory_setup(void)
13685 - return machine_specific_memory_setup();
13688 -#ifdef CONFIG_NUMA
13690 - * In the golden day, when everything among i386 and x86_64 will be
13691 - * integrated, this will not live here
13693 -void *x86_cpu_to_node_map_early_ptr;
13694 -int x86_cpu_to_node_map_init[NR_CPUS] = {
13695 - [0 ... NR_CPUS-1] = NUMA_NO_NODE
13697 -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
13701 - * Determine if we were loaded by an EFI loader. If so, then we have also been
13702 - * passed the efi memmap, systab, etc., so we should use these data structures
13703 - * for initialization. Note, the efi init code path is determined by the
13704 - * global efi_enabled. This allows the same kernel image to be used on existing
13705 - * systems (with a traditional BIOS) as well as on EFI systems.
13707 -void __init setup_arch(char **cmdline_p)
13709 - int i, j, k, fpp;
13710 - struct physdev_set_iopl set_iopl;
13711 - unsigned long max_low_pfn;
13712 - unsigned long p2m_pages;
13714 - /* Force a quick death if the kernel panics (not domain 0). */
13715 - extern int panic_timeout;
13716 - if (!panic_timeout && !is_initial_xendomain())
13717 - panic_timeout = 1;
13719 - /* Register a call for panic conditions. */
13720 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
13722 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13723 - VMASST_TYPE_4gb_segments));
13724 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13725 - VMASST_TYPE_writable_pagetables));
13727 - memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
13728 - pre_setup_arch_hook();
13729 - early_cpu_init();
13730 - early_ioremap_init();
13732 - prefill_possible_map();
13736 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
13741 - /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
13742 - properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
13744 - ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
13745 - screen_info = boot_params.screen_info;
13747 - apm_info.bios = boot_params.apm_bios_info;
13748 - ist_info = boot_params.ist_info;
13749 - saved_video_mode = boot_params.hdr.vid_mode;
13750 - if( boot_params.sys_desc_table.length != 0 ) {
13751 - set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
13752 - machine_id = boot_params.sys_desc_table.table[0];
13753 - machine_submodel_id = boot_params.sys_desc_table.table[1];
13754 - BIOS_revision = boot_params.sys_desc_table.table[2];
13756 - bootloader_type = boot_params.hdr.type_of_loader;
13758 - if (is_initial_xendomain()) {
13759 - const struct dom0_vga_console_info *info =
13760 - (void *)((char *)xen_start_info +
13761 - xen_start_info->console.dom0.info_off);
13763 - dom0_init_screen_info(info,
13764 - xen_start_info->console.dom0.info_size);
13765 - xen_start_info->console.domU.mfn = 0;
13766 - xen_start_info->console.domU.evtchn = 0;
13768 - screen_info.orig_video_isVGA = 0;
13770 -#ifdef CONFIG_BLK_DEV_RAM
13771 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
13772 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
13773 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
13778 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
13779 - print_memory_map(memory_setup());
13783 - if (!boot_params.hdr.root_flags)
13784 - root_mountflags &= ~MS_RDONLY;
13785 - init_mm.start_code = (unsigned long) _text;
13786 - init_mm.end_code = (unsigned long) _etext;
13787 - init_mm.end_data = (unsigned long) _edata;
13788 - init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
13789 - xen_start_info->nr_pt_frames) << PAGE_SHIFT;
13791 - code_resource.start = virt_to_phys(_text);
13792 - code_resource.end = virt_to_phys(_etext)-1;
13793 - data_resource.start = virt_to_phys(_etext);
13794 - data_resource.end = virt_to_phys(_edata)-1;
13795 - bss_resource.start = virt_to_phys(&__bss_start);
13796 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
13798 - if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
13799 - i = COMMAND_LINE_SIZE;
13800 - memcpy(boot_command_line, xen_start_info->cmd_line, i);
13801 - boot_command_line[i - 1] = '\0';
13802 - parse_early_param();
13804 - if (user_defined_memmap) {
13805 - printk(KERN_INFO "user-defined physical RAM map:\n");
13806 - print_memory_map("user");
13809 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
13810 - *cmdline_p = command_line;
13815 - /* update e820 for memory not covered by WB MTRRs */
13816 - propagate_e820_map();
13818 -#ifndef CONFIG_XEN
13819 - if (mtrr_trim_uncached_memory(max_pfn))
13820 - propagate_e820_map();
13823 - max_low_pfn = setup_memory();
13825 -#ifdef CONFIG_KVM_CLOCK
13831 - * Must be after max_low_pfn is determined, and before kernel
13832 - * pagetables are setup.
13836 - kvm_guest_init();
13839 - * NOTE: before this point _nobody_ is allowed to allocate
13840 - * any memory using the bootmem allocator. Although the
13841 - * allocator is now initialised only the first 8Mb of the kernel
13842 - * virtual address space has been mapped. All allocations before
13843 - * paging_init() has completed must use the alloc_bootmem_low_pages()
13844 - * variant (which allocates DMA'able memory) and care must be taken
13845 - * not to exceed the 8Mb limit.
13849 - smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
13854 - * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
13857 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
13858 - if (init_ohci1394_dma_early)
13859 - init_ohci1394_dma_on_all_controllers();
13862 - remapped_pgdat_init();
13864 - zone_sizes_init();
13866 -#ifdef CONFIG_X86_FIND_SMP_CONFIG
13868 - * Find and reserve possible boot-time SMP configuration:
13870 - find_smp_config();
13873 - p2m_pages = max_pfn;
13874 - if (xen_start_info->nr_pages > max_pfn) {
13876 - * the max_pfn was shrunk (probably by mem= or highmem=
13877 - * kernel parameter); shrink reservation with the HV
13879 - struct xen_memory_reservation reservation = {
13880 - .address_bits = 0,
13881 - .extent_order = 0,
13882 - .domid = DOMID_SELF
13884 - unsigned int difference;
13887 - difference = xen_start_info->nr_pages - max_pfn;
13889 - set_xen_guest_handle(reservation.extent_start,
13890 - ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
13891 - reservation.nr_extents = difference;
13892 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
13894 - BUG_ON (ret != difference);
13896 - else if (max_pfn > xen_start_info->nr_pages)
13897 - p2m_pages = xen_start_info->nr_pages;
13899 - /* Make sure we have a correctly sized P->M table. */
13900 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
13901 - phys_to_machine_mapping = alloc_bootmem_low_pages(
13902 - max_pfn * sizeof(unsigned long));
13903 - memset(phys_to_machine_mapping, ~0,
13904 - max_pfn * sizeof(unsigned long));
13905 - memcpy(phys_to_machine_mapping,
13906 - (unsigned long *)xen_start_info->mfn_list,
13907 - p2m_pages * sizeof(unsigned long));
13909 - __pa(xen_start_info->mfn_list),
13910 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
13911 - sizeof(unsigned long))));
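/*
 * Summary of the sizing above: if the hypervisor granted more pages than
 * mem=/highmem= permits, the surplus MFNs are handed back through
 * XENMEM_decrease_reservation; if it granted fewer than max_pfn, only
 * nr_pages entries of mfn_list are real and the memset leaves the rest
 * of the p2m table at ~0 (INVALID_P2M_ENTRY).
 */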
13914 - * Initialise the list of the frames that specify the list of
13915 - * frames that make up the p2m table. Used by save/restore
13917 - pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
13919 - fpp = PAGE_SIZE/sizeof(unsigned long);
13920 - for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
13921 - if ((j % fpp) == 0) {
13924 - pfn_to_mfn_frame_list[k] =
13925 - alloc_bootmem_low_pages(PAGE_SIZE);
13926 - pfn_to_mfn_frame_list_list[k] =
13927 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
13930 - pfn_to_mfn_frame_list[k][j] =
13931 - virt_to_mfn(&phys_to_machine_mapping[i]);
13933 - HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
13934 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
13935 - virt_to_mfn(pfn_to_mfn_frame_list_list);
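/*
 * Size check on the structure just published, for 32-bit (4-byte longs):
 *   fpp = PAGE_SIZE / sizeof(long) = 1024 p2m MFNs per frame-list page
 *   one frame-list page  -> 1024 p2m pages -> 1M pfns -> 4GB of RAM
 *   pfn_to_mfn_frame_list[16]  -> at most 64GB of guest memory
 * Save/restore walks list_list -> frame list -> p2m in that order.
 */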
13938 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
13939 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
13940 - if (i != 4 && request_dma(i, "xen") != 0)
13944 - * NOTE: at this point the bootmem allocator is fully available.
13947 -#ifdef CONFIG_BLK_DEV_INITRD
13948 - relocate_initrd();
13951 - paravirt_post_allocator_init();
13953 - if (is_initial_xendomain())
13954 - dmi_scan_machine();
13958 -#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
13960 - * setup to use the early static init tables during kernel startup
13961 - * X86_SMP will exclude sub-arches that don't deal well with it.
13963 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
13964 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
13965 -#ifdef CONFIG_NUMA
13966 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
13970 -#ifdef CONFIG_X86_GENERICARCH
13971 - generic_apic_probe();
13974 - set_iopl.iopl = 1;
13975 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
13977 -#ifdef CONFIG_ACPI
13978 - if (!is_initial_xendomain()) {
13979 - printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
13980 - acpi_disabled = 1;
13985 - * Parse the ACPI tables for possible boot-time SMP configuration.
13987 - acpi_boot_table_init();
13990 -#ifndef CONFIG_XEN
13994 -#ifdef CONFIG_ACPI
13995 - acpi_boot_init();
13997 -#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
13998 - if (def_to_bigsmp)
13999 - printk(KERN_WARNING "More than 8 CPUs detected and "
14000 - "CONFIG_X86_PC cannot handle it.\nUse "
14001 - "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
14004 -#ifdef CONFIG_X86_LOCAL_APIC
14005 - if (smp_found_config)
14006 - get_smp_config();
14009 - e820_register_memory();
14010 - e820_mark_nosave_regions();
14012 - if (is_initial_xendomain()) {
14014 -#if defined(CONFIG_VGA_CONSOLE)
14015 - if (!efi_enabled ||
14016 - (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14017 - conswitchp = &vga_con;
14018 -#elif defined(CONFIG_DUMMY_CONSOLE)
14019 - conswitchp = &dummy_con;
14023 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14024 - conswitchp = &dummy_con;
14030 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14032 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14033 - /* we're never actually going to get here... */
14034 - return NOTIFY_DONE;
14038 - * Request address space for all standard resources
14040 - * This is called just before pcibios_init(), which is also a
14041 - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
14043 -static int __init request_standard_resources(void)
14047 - /* Nothing to do if not running in dom0. */
14048 - if (!is_initial_xendomain())
14051 - printk(KERN_INFO "Setting up standard PCI resources\n");
14052 - init_iomem_resources(&code_resource, &data_resource, &bss_resource);
14054 - request_resource(&iomem_resource, &video_ram_resource);
14056 - /* request I/O space for devices used on all i[345]86 PCs */
14057 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14058 - request_resource(&ioport_resource, &standard_io_resources[i]);
14062 -subsys_initcall(request_standard_resources);
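request_standard_resources() depends on initcall ordering: it shares the
subsys_initcall level with pcibios_init() and runs first only by link
order, as the comment above notes. A reminder of the mechanism
(my_setup() is a made-up example):

static int __init my_setup(void)
{
	/* runs after core_initcall()s and before fs_initcall()s; ties
	 * within the subsys level are broken by link order */
	return 0;
}
subsys_initcall(my_setup);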
14063 --- sle11-2009-06-04.orig/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:38:05.000000000 +0100
14064 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
14067 - * Copyright (C) 1995 Linus Torvalds
14071 - * This file handles the architecture-dependent parts of initialization
14074 -#include <linux/errno.h>
14075 -#include <linux/sched.h>
14076 -#include <linux/kernel.h>
14077 -#include <linux/mm.h>
14078 -#include <linux/stddef.h>
14079 -#include <linux/unistd.h>
14080 -#include <linux/ptrace.h>
14081 -#include <linux/slab.h>
14082 -#include <linux/user.h>
14083 -#include <linux/screen_info.h>
14084 -#include <linux/ioport.h>
14085 -#include <linux/delay.h>
14086 -#include <linux/init.h>
14087 -#include <linux/initrd.h>
14088 -#include <linux/highmem.h>
14089 -#include <linux/bootmem.h>
14090 -#include <linux/module.h>
14091 -#include <asm/processor.h>
14092 -#include <linux/console.h>
14093 -#include <linux/seq_file.h>
14094 -#include <linux/crash_dump.h>
14095 -#include <linux/root_dev.h>
14096 -#include <linux/pci.h>
14097 -#include <asm/pci-direct.h>
14098 -#include <linux/efi.h>
14099 -#include <linux/acpi.h>
14100 -#include <linux/kallsyms.h>
14101 -#include <linux/edd.h>
14102 -#include <linux/iscsi_ibft.h>
14103 -#include <linux/mmzone.h>
14104 -#include <linux/kexec.h>
14105 -#include <linux/cpufreq.h>
14106 -#include <linux/dmi.h>
14107 -#include <linux/dma-mapping.h>
14108 -#include <linux/ctype.h>
14109 -#include <linux/sort.h>
14110 -#include <linux/uaccess.h>
14111 -#include <linux/init_ohci1394_dma.h>
14112 -#include <linux/kvm_para.h>
14114 -#include <asm/mtrr.h>
14115 -#include <asm/uaccess.h>
14116 -#include <asm/system.h>
14117 -#include <asm/vsyscall.h>
14118 -#include <asm/io.h>
14119 -#include <asm/smp.h>
14120 -#include <asm/msr.h>
14121 -#include <asm/desc.h>
14122 -#include <video/edid.h>
14123 -#include <asm/e820.h>
14124 -#include <asm/dma.h>
14125 -#include <asm/gart.h>
14126 -#include <asm/mpspec.h>
14127 -#include <asm/mmu_context.h>
14128 -#include <asm/proto.h>
14129 -#include <asm/setup.h>
14130 -#include <asm/numa.h>
14131 -#include <asm/sections.h>
14132 -#include <asm/dmi.h>
14133 -#include <asm/cacheflush.h>
14134 -#include <asm/mce.h>
14135 -#include <asm/ds.h>
14136 -#include <asm/topology.h>
14137 -#include <asm/pat.h>
14139 -#include <mach_apic.h>
14141 -#include <linux/percpu.h>
14142 -#include <xen/interface/physdev.h>
14143 -#include "setup_arch_pre.h"
14144 -#include <asm/hypervisor.h>
14145 -#include <xen/interface/nmi.h>
14146 -#include <xen/features.h>
14147 -#include <xen/firmware.h>
14148 -#include <xen/xencons.h>
14149 -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
14150 -#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
14151 -#include <asm/mach-xen/setup_arch_post.h>
14152 -#include <xen/interface/memory.h>
14155 -#include <xen/interface/kexec.h>
14158 -extern unsigned long start_pfn;
14159 -extern struct edid_info edid_info;
14161 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
14162 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
14164 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
14165 -static struct notifier_block xen_panic_block = {
14166 - xen_panic_event, NULL, 0 /* try to go last */
14169 -unsigned long *phys_to_machine_mapping;
14170 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
14172 -EXPORT_SYMBOL(phys_to_machine_mapping);
14174 -DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
14175 -DEFINE_PER_CPU(int, nr_multicall_ents);
14177 -/* Raw start-of-day parameters from the hypervisor. */
14178 -start_info_t *xen_start_info;
14179 -EXPORT_SYMBOL(xen_start_info);
14183 - * Machine setup..
14186 -struct cpuinfo_x86 boot_cpu_data __read_mostly;
14187 -EXPORT_SYMBOL(boot_cpu_data);
14189 -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
14191 -unsigned long mmu_cr4_features;
14193 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
14194 -int bootloader_type;
14196 -unsigned long saved_video_mode;
14198 -int force_mwait __cpuinitdata;
14201 - * Early DMI memory
14203 -int dmi_alloc_index;
14204 -char dmi_alloc_data[DMI_MAX_DATA];
14209 -struct screen_info screen_info;
14210 -EXPORT_SYMBOL(screen_info);
14211 -struct sys_desc_table_struct {
14212 - unsigned short length;
14213 - unsigned char table[0];
14216 -struct edid_info edid_info;
14217 -EXPORT_SYMBOL_GPL(edid_info);
14219 -extern int root_mountflags;
14221 -char __initdata command_line[COMMAND_LINE_SIZE];
14223 -static struct resource standard_io_resources[] = {
14224 - { .name = "dma1", .start = 0x00, .end = 0x1f,
14225 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14226 - { .name = "pic1", .start = 0x20, .end = 0x21,
14227 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14228 - { .name = "timer0", .start = 0x40, .end = 0x43,
14229 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14230 - { .name = "timer1", .start = 0x50, .end = 0x53,
14231 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14232 - { .name = "keyboard", .start = 0x60, .end = 0x60,
14233 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14234 - { .name = "keyboard", .start = 0x64, .end = 0x64,
14235 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14236 - { .name = "dma page reg", .start = 0x80, .end = 0x8f,
14237 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14238 - { .name = "pic2", .start = 0xa0, .end = 0xa1,
14239 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14240 - { .name = "dma2", .start = 0xc0, .end = 0xdf,
14241 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14242 - { .name = "fpu", .start = 0xf0, .end = 0xff,
14243 - .flags = IORESOURCE_BUSY | IORESOURCE_IO }
14246 -#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
14248 -static struct resource data_resource = {
14249 - .name = "Kernel data",
14252 - .flags = IORESOURCE_RAM,
14254 -static struct resource code_resource = {
14255 - .name = "Kernel code",
14258 - .flags = IORESOURCE_RAM,
14260 -static struct resource bss_resource = {
14261 - .name = "Kernel bss",
14264 - .flags = IORESOURCE_RAM,
14267 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
14269 -#ifdef CONFIG_PROC_VMCORE
14270 -/* elfcorehdr= specifies the location of elf core header
14271 - * stored by the crashed kernel. This option will be passed
14272 - * by kexec loader to the capture kernel.
14274 -static int __init setup_elfcorehdr(char *arg)
14279 - elfcorehdr_addr = memparse(arg, &end);
14280 - return end > arg ? 0 : -EINVAL;
14282 -early_param("elfcorehdr", setup_elfcorehdr);
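/*
 * Usage note (illustrative values): elfcorehdr= is normally appended by
 * kexec-tools when the capture kernel is loaded, not typed by hand:
 *
 *   first kernel cmdline:  crashkernel=64M@16M
 *   load capture kernel:   kexec -p /boot/vmlinuz --initrd=/boot/initrd \
 *                                --append="root=/dev/sda1 irqpoll"
 *   kexec then adds elfcorehdr=<addr> to the appended command line.
 */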
14285 -#ifndef CONFIG_NUMA
14286 -static void __init
14287 -contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
14289 - unsigned long bootmap_size, bootmap;
14291 - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
14292 - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
14294 - if (bootmap == -1L)
14295 - panic("Cannot find bootmem map of size %ld\n", bootmap_size);
14296 - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
14297 - e820_register_active_regions(0, start_pfn, end_pfn);
14299 - free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
14300 - early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
14302 - free_bootmem_with_active_regions(0, end_pfn);
14303 - early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
14305 - reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
14309 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
14311 -#ifdef CONFIG_EDD_MODULE
14312 -EXPORT_SYMBOL(edd);
14314 -#ifndef CONFIG_XEN
14316 - * copy_edd() - Copy the BIOS EDD information
14317 - * from boot_params into a safe place.
14320 -static inline void copy_edd(void)
14322 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
14323 - sizeof(edd.mbr_signature));
14324 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
14325 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
14326 - edd.edd_info_nr = boot_params.eddbuf_entries;
14330 -static inline void copy_edd(void)
14335 -#ifdef CONFIG_KEXEC
14336 -#ifndef CONFIG_XEN
14337 -static void __init reserve_crashkernel(void)
14339 - unsigned long long total_mem;
14340 - unsigned long long crash_size, crash_base;
14343 - total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
14345 - ret = parse_crashkernel(boot_command_line, total_mem,
14346 - &crash_size, &crash_base);
14347 - if (ret == 0 && crash_size) {
14348 - if (crash_base <= 0) {
14349 - printk(KERN_INFO "crashkernel reservation failed - "
14350 - "you have to specify a base address\n");
14354 - if (reserve_bootmem(crash_base, crash_size,
14355 - BOOTMEM_EXCLUSIVE) < 0) {
14356 - printk(KERN_INFO "crashkernel reservation failed - "
14357 - "memory is in use\n");
14361 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
14362 - "for crashkernel (System RAM: %ldMB)\n",
14363 - (unsigned long)(crash_size >> 20),
14364 - (unsigned long)(crash_base >> 20),
14365 - (unsigned long)(total_mem >> 20));
14366 - crashk_res.start = crash_base;
14367 - crashk_res.end = crash_base + crash_size - 1;
14368 - insert_resource(&iomem_resource, &crashk_res);
14372 -#define reserve_crashkernel xen_machine_kexec_setup_resources
14375 -static inline void __init reserve_crashkernel(void)
14379 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
14380 -void __attribute__((weak)) __init memory_setup(void)
14382 - machine_specific_memory_setup();
14385 -static void __init parse_setup_data(void)
14387 - struct setup_data *data;
14388 - unsigned long pa_data;
14390 - if (boot_params.hdr.version < 0x0209)
14392 - pa_data = boot_params.hdr.setup_data;
14393 - while (pa_data) {
14394 - data = early_ioremap(pa_data, PAGE_SIZE);
14395 - switch (data->type) {
14399 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
14400 - free_early(pa_data, pa_data+sizeof(*data)+data->len);
14402 - pa_data = data->next;
14403 - early_iounmap(data, PAGE_SIZE);
14407 -#ifdef CONFIG_PCI_MMCONFIG
14408 -extern void __cpuinit fam10h_check_enable_mmcfg(void);
14409 -extern void __init check_enable_amd_mmconf_dmi(void);
14411 -void __cpuinit fam10h_check_enable_mmcfg(void)
14414 -void __init check_enable_amd_mmconf_dmi(void)
14420 - * setup_arch - architecture-specific boot-time initializations
14422 - * Note: On x86_64, fixmaps are ready for use even before this is called.
14424 -void __init setup_arch(char **cmdline_p)
14429 - extern struct e820map machine_e820;
14431 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14433 - /* Register a call for panic conditions. */
14434 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
14436 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
14437 - VMASST_TYPE_writable_pagetables));
14439 - early_ioremap_init();
14441 - ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
14442 - screen_info = boot_params.screen_info;
14444 - if (is_initial_xendomain()) {
14445 - const struct dom0_vga_console_info *info =
14446 - (void *)((char *)xen_start_info +
14447 - xen_start_info->console.dom0.info_off);
14449 - dom0_init_screen_info(info,
14450 - xen_start_info->console.dom0.info_size);
14451 - xen_start_info->console.domU.mfn = 0;
14452 - xen_start_info->console.domU.evtchn = 0;
14454 - screen_info.orig_video_isVGA = 0;
14458 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14460 - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
14461 - screen_info = boot_params.screen_info;
14462 - edid_info = boot_params.edid_info;
14463 -#endif /* !CONFIG_XEN */
14464 - saved_video_mode = boot_params.hdr.vid_mode;
14465 - bootloader_type = boot_params.hdr.type_of_loader;
14467 -#ifdef CONFIG_BLK_DEV_RAM
14468 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
14469 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
14470 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
14473 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
14483 - if (!boot_params.hdr.root_flags)
14484 - root_mountflags &= ~MS_RDONLY;
14485 - init_mm.start_code = (unsigned long) &_text;
14486 - init_mm.end_code = (unsigned long) &_etext;
14487 - init_mm.end_data = (unsigned long) &_edata;
14488 - init_mm.brk = (unsigned long) &_end;
14490 - code_resource.start = virt_to_phys(&_text);
14491 - code_resource.end = virt_to_phys(&_etext)-1;
14492 - data_resource.start = virt_to_phys(&_etext);
14493 - data_resource.end = virt_to_phys(&_edata)-1;
14494 - bss_resource.start = virt_to_phys(&__bss_start);
14495 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
14497 - early_identify_cpu(&boot_cpu_data);
14499 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
14500 - *cmdline_p = command_line;
14502 - parse_setup_data();
14504 - parse_early_param();
14506 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
14507 - if (init_ohci1394_dma_early)
14508 - init_ohci1394_dma_on_all_controllers();
14511 - finish_e820_parsing();
14513 -#ifndef CONFIG_XEN
14514 - /* after parse_early_param, so could debug it */
14515 - insert_resource(&iomem_resource, &code_resource);
14516 - insert_resource(&iomem_resource, &data_resource);
14517 - insert_resource(&iomem_resource, &bss_resource);
14520 - early_gart_iommu_check();
14522 - e820_register_active_regions(0, 0, -1UL);
14524 - * partially used pages are not usable - thus
14525 - * we are rounding upwards:
14527 - end_pfn = e820_end_of_ram();
14528 - /* update e820 for memory not covered by WB MTRRs */
14530 -#ifndef CONFIG_XEN
14531 - if (mtrr_trim_uncached_memory(end_pfn)) {
14532 - e820_register_active_regions(0, 0, -1UL);
14533 - end_pfn = e820_end_of_ram();
14537 - num_physpages = end_pfn;
14538 - max_mapnr = end_pfn;
14542 - max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
14546 -#ifndef CONFIG_XEN
14550 - if (is_initial_xendomain())
14551 - dmi_scan_machine();
14555 -#ifdef CONFIG_KVM_CLOCK
14559 -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
14560 - /* setup to use the early static init tables during kernel startup */
14561 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
14562 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
14563 -#ifdef CONFIG_NUMA
14564 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
14568 - /* How many end-of-memory variables you have, grandma! */
14569 - max_low_pfn = end_pfn;
14570 - max_pfn = end_pfn;
14571 - high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
14573 - /* Remove active ranges so rediscovery with NUMA-awareness happens */
14574 - remove_all_active_ranges();
14576 -#ifdef CONFIG_ACPI_NUMA
14578 - * Parse SRAT to discover nodes.
14580 - acpi_numa_init();
14583 -#ifdef CONFIG_NUMA
14584 - numa_initmem_init(0, end_pfn);
14586 - contig_initmem_init(0, end_pfn);
14589 -#ifndef CONFIG_XEN
14590 - dma32_reserve_bootmem();
14592 -#ifdef CONFIG_ACPI_SLEEP
14594 - * Reserve low memory region for sleep support.
14596 - acpi_reserve_bootmem();
14600 - efi_reserve_bootmem();
14603 -#ifdef CONFIG_BLK_DEV_INITRD
14605 - if (xen_start_info->mod_start) {
14606 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
14607 - unsigned long ramdisk_size = xen_start_info->mod_len;
14609 - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
14610 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
14611 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
14613 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
14614 - unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
14616 - if (ramdisk_end <= end_of_mem) {
14618 - * don't need to reserve again, already reserved early
14619 - * in x86_64_start_kernel, and early_res_to_bootmem
14620 - * convert that to reserved in bootmem
14622 - initrd_start = ramdisk_image + PAGE_OFFSET;
14623 - initrd_end = initrd_start+ramdisk_size;
14625 - initrd_below_start_ok = 1;
14628 - free_bootmem(ramdisk_image, ramdisk_size);
14629 - printk(KERN_ERR "initrd extends beyond end of memory "
14630 - "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
14631 - ramdisk_end, end_of_mem);
14632 - initrd_start = 0;
14636 - reserve_crashkernel();
14638 - reserve_ibft_region();
14642 -#ifdef CONFIG_X86_LOCAL_APIC
14644 - * Find and reserve possible boot-time SMP configuration:
14646 - find_smp_config();
14650 - int i, j, k, fpp;
14651 - unsigned long p2m_pages;
14653 - p2m_pages = end_pfn;
14654 - if (xen_start_info->nr_pages > end_pfn) {
14656 - * the end_pfn was shrunk (probably by mem= or highmem=
14657 - * kernel parameter); shrink reservation with the HV
14659 - struct xen_memory_reservation reservation = {
14660 - .address_bits = 0,
14661 - .extent_order = 0,
14662 - .domid = DOMID_SELF
14664 - unsigned int difference;
14667 - difference = xen_start_info->nr_pages - end_pfn;
14669 - set_xen_guest_handle(reservation.extent_start,
14670 - ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
14671 - reservation.nr_extents = difference;
14672 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
14674 - BUG_ON (ret != difference);
14676 - else if (end_pfn > xen_start_info->nr_pages)
14677 - p2m_pages = xen_start_info->nr_pages;
14679 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
14680 - /* Make sure we have a large enough P->M table. */
14681 - phys_to_machine_mapping = alloc_bootmem_pages(
14682 - end_pfn * sizeof(unsigned long));
14683 - memset(phys_to_machine_mapping, ~0,
14684 - end_pfn * sizeof(unsigned long));
14685 - memcpy(phys_to_machine_mapping,
14686 - (unsigned long *)xen_start_info->mfn_list,
14687 - p2m_pages * sizeof(unsigned long));
14689 - __pa(xen_start_info->mfn_list),
14690 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
14691 - sizeof(unsigned long))));
14694 - * Initialise the list of the frames that specify the
14695 - * list of frames that make up the p2m table. Used by
14698 - pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
14700 - fpp = PAGE_SIZE/sizeof(unsigned long);
14701 - for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
14702 - if ((j % fpp) == 0) {
14705 - pfn_to_mfn_frame_list[k] =
14706 - alloc_bootmem_pages(PAGE_SIZE);
14707 - pfn_to_mfn_frame_list_list[k] =
14708 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
14711 - pfn_to_mfn_frame_list[k][j] =
14712 - virt_to_mfn(&phys_to_machine_mapping[i]);
14714 - HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
14715 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
14716 - virt_to_mfn(pfn_to_mfn_frame_list_list);
14719 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
14720 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
14721 - if (i != 4 && request_dma(i, "xen") != 0)
14725 -#ifdef CONFIG_ACPI
14726 - if (!is_initial_xendomain()) {
14727 - acpi_disabled = 1;
14733 -#ifndef CONFIG_XEN
14737 -#ifdef CONFIG_ACPI
14739 - * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
14740 - * Call this early for SRAT node setup.
14742 - acpi_boot_table_init();
14745 - * Read APIC and some other early information from ACPI tables.
14747 - acpi_boot_init();
14750 - init_cpu_to_node();
14752 -#ifdef CONFIG_X86_LOCAL_APIC
14754 - * get boot-time SMP configuration:
14756 - if (smp_found_config)
14757 - get_smp_config();
14758 -#ifndef CONFIG_XEN
14759 - init_apic_mappings();
14760 - ioapic_init_mappings();
14763 -#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
14764 - prefill_possible_map();
14767 - kvm_guest_init();
14770 - * We trust e820 completely. No explicit ROM probing in memory.
14773 - if (is_initial_xendomain())
14774 - e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
14776 - e820_reserve_resources(e820.map, e820.nr_map);
14777 - e820_mark_nosave_regions();
14780 - /* request I/O space for devices used on all i[345]86 PCs */
14781 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14782 - request_resource(&ioport_resource, &standard_io_resources[i]);
14785 - if (is_initial_xendomain())
14786 - e820_setup_gap(machine_e820.map, machine_e820.nr_map);
14788 - e820_setup_gap(e820.map, e820.nr_map);
14793 - struct physdev_set_iopl set_iopl;
14795 - set_iopl.iopl = 1;
14796 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
14798 - if (is_initial_xendomain()) {
14800 -#if defined(CONFIG_VGA_CONSOLE)
14801 - conswitchp = &vga_con;
14802 -#elif defined(CONFIG_DUMMY_CONSOLE)
14803 - conswitchp = &dummy_con;
14807 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14808 - conswitchp = &dummy_con;
14812 -#else /* CONFIG_XEN */
14815 -#if defined(CONFIG_VGA_CONSOLE)
14816 - if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14817 - conswitchp = &vga_con;
14818 -#elif defined(CONFIG_DUMMY_CONSOLE)
14819 - conswitchp = &dummy_con;
14823 -#endif /* !CONFIG_XEN */
14825 - /* do this before identify_cpu for boot cpu */
14826 - check_enable_amd_mmconf_dmi();
14831 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14833 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14834 - /* we're never actually going to get here... */
14835 - return NOTIFY_DONE;
14837 -#endif /* !CONFIG_XEN */
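xen_panic_event() above hangs off the kernel's panic notifier chain, so the domain crashes cleanly via HYPERVISOR_shutdown() instead of spinning. A stripped-down sketch of the notifier-chain pattern itself (single-threaded, no priorities; NOTIFY_DONE matches the kernel constant):

    #include <stdio.h>

    #define NOTIFY_DONE 0x0000

    struct notifier_block {
        int (*notifier_call)(struct notifier_block *, unsigned long, void *);
        struct notifier_block *next;
    };

    static struct notifier_block *panic_chain;

    static void register_notifier(struct notifier_block *nb)
    {
        nb->next = panic_chain;     /* kernel versions order by priority */
        panic_chain = nb;
    }

    static void call_chain(unsigned long event, void *ptr)
    {
        struct notifier_block *nb;

        for (nb = panic_chain; nb; nb = nb->next)
            nb->notifier_call(nb, event, ptr);
    }

    static int my_panic_event(struct notifier_block *nb, unsigned long e, void *p)
    {
        printf("panic notifier fired (event %lu)\n", e);
        return NOTIFY_DONE;         /* never reached in the Xen case */
    }

    static struct notifier_block my_nb = { .notifier_call = my_panic_event };

    int main(void)
    {
        register_notifier(&my_nb);
        call_chain(0, 0);
        return 0;
    }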
14840 -static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
14844 - if (c->extended_cpuid_level < 0x80000004)
14847 - v = (unsigned int *) c->x86_model_id;
14848 - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
14849 - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
14850 - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
14851 - c->x86_model_id[48] = 0;
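get_model_name() above fills x86_model_id from CPUID leaves 0x80000002-0x80000004, twelve registers forming a 48-byte brand string. The same read works from userspace with GCC's <cpuid.h>; this sketch assumes only that the CPU reports those leaves, as the function's own level check does:

    #include <cpuid.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        unsigned int v[12];
        char name[49];

        if (__get_cpuid_max(0x80000000, 0) < 0x80000004)
            return 1;                   /* brand string not supported */
        __get_cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
        __get_cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
        __get_cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
        memcpy(name, v, 48);
        name[48] = '\0';                /* same 48-byte NUL-terminated layout */
        printf("%s\n", name);
        return 0;
    }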
14856 -static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
14858 - unsigned int n, dummy, eax, ebx, ecx, edx;
14860 - n = c->extended_cpuid_level;
14862 - if (n >= 0x80000005) {
14863 - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
14864 - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
14865 - "D cache %dK (%d bytes/line)\n",
14866 - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
14867 - c->x86_cache_size = (ecx>>24) + (edx>>24);
14868 - /* On K8 L1 TLB is inclusive, so don't count it */
14869 - c->x86_tlbsize = 0;
14872 - if (n >= 0x80000006) {
14873 - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
14874 - ecx = cpuid_ecx(0x80000006);
14875 - c->x86_cache_size = ecx >> 16;
14876 - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
14878 - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
14879 - c->x86_cache_size, ecx & 0xFF);
14881 - if (n >= 0x80000008) {
14882 - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
14883 - c->x86_virt_bits = (eax >> 8) & 0xff;
14884 - c->x86_phys_bits = eax & 0xff;
14888 -#ifdef CONFIG_NUMA
14889 -static int __cpuinit nearby_node(int apicid)
14893 - for (i = apicid - 1; i >= 0; i--) {
14894 - node = apicid_to_node[i];
14895 - if (node != NUMA_NO_NODE && node_online(node))
14898 - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
14899 - node = apicid_to_node[i];
14900 - if (node != NUMA_NO_NODE && node_online(node))
14903 - return first_node(node_online_map); /* Shouldn't happen */
14908 - * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.

14909 - * Assumes number of cores is a power of two.
14911 -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
14915 -#ifdef CONFIG_NUMA
14916 - int cpu = smp_processor_id();
14918 - unsigned apicid = hard_smp_processor_id();
14920 - bits = c->x86_coreid_bits;
14922 - /* Low order bits define the core id (index of core in socket) */
14923 - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
14924 - /* Convert the initial APIC ID into the socket ID */
14925 - c->phys_proc_id = c->initial_apicid >> bits;
14927 -#ifdef CONFIG_NUMA
14928 - node = c->phys_proc_id;
14929 - if (apicid_to_node[apicid] != NUMA_NO_NODE)
14930 - node = apicid_to_node[apicid];
14931 - if (!node_online(node)) {
14932 - /* Two possibilities here:
14933 - - The CPU is missing memory and no node was created.
14934 - In that case try picking one from a nearby CPU
14935 - - The APIC IDs differ from the HyperTransport node IDs
14936 - which the K8 northbridge parsing fills in.
14937 - Assume they are all increased by a constant offset,
14938 - but in the same order as the HT nodeids.
14939 - If that doesn't result in a usable node fall back to the
14940 - path for the previous case. */
14942 - int ht_nodeid = c->initial_apicid;
14944 - if (ht_nodeid >= 0 &&
14945 - apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
14946 - node = apicid_to_node[ht_nodeid];
14947 - /* Pick a nearby node */
14948 - if (!node_online(node))
14949 - node = nearby_node(apicid);
14951 - numa_set_node(cpu, node);
14953 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
14958 -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
14961 - unsigned bits, ecx;
14963 - /* Multi core CPU? */
14964 - if (c->extended_cpuid_level < 0x80000008)
14967 - ecx = cpuid_ecx(0x80000008);
14969 - c->x86_max_cores = (ecx & 0xff) + 1;
14971 - /* CPU telling us the core id bits shift? */
14972 - bits = (ecx >> 12) & 0xF;
14974 - /* Otherwise recompute */
14976 - while ((1 << bits) < c->x86_max_cores)
14980 - c->x86_coreid_bits = bits;
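early_init_amd_mc() and amd_detect_cmp() together split the initial APIC ID: CPUID 0x80000008 ECX gives the core count (bits 7:0) and, on newer parts, the core-id shift (bits 15:12), which is otherwise recomputed as the smallest power of two covering the cores. A sketch with assumed register values showing both paths:

    /* Sketch of the AMD core/socket split above: the low x86_coreid_bits
     * of the initial APIC ID select the core, the remainder the socket.
     * apicid and ecx are made-up values; real code reads CPUID. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int apicid = 0x13;   /* assumed initial APIC ID */
        unsigned int ecx    = 0x3003; /* assumed CPUID 0x80000008 ECX */
        unsigned int cores  = (ecx & 0xff) + 1;
        unsigned int bits   = (ecx >> 12) & 0xf;

        if (!bits)                        /* CPU didn't report the shift */
            while ((1u << bits) < cores)  /* recompute as the code above does */
                bits++;

        printf("core %u, socket %u\n",
               apicid & ((1u << bits) - 1), apicid >> bits);
        return 0;
    }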
14985 -#define ENABLE_C1E_MASK 0x18000000
14986 -#define CPUID_PROCESSOR_SIGNATURE 1
14987 -#define CPUID_XFAM 0x0ff00000
14988 -#define CPUID_XFAM_K8 0x00000000
14989 -#define CPUID_XFAM_10H 0x00100000
14990 -#define CPUID_XFAM_11H 0x00200000
14991 -#define CPUID_XMOD 0x000f0000
14992 -#define CPUID_XMOD_REV_F 0x00040000
14994 -#ifndef CONFIG_XEN
14995 -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
14996 -static __cpuinit int amd_apic_timer_broken(void)
14998 - u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
15000 - switch (eax & CPUID_XFAM) {
15001 - case CPUID_XFAM_K8:
15002 - if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
15004 - case CPUID_XFAM_10H:
15005 - case CPUID_XFAM_11H:
15006 - rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
15007 - if (lo & ENABLE_C1E_MASK)
15011 - /* err on the side of caution */
15018 -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
15020 - early_init_amd_mc(c);
15022 - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
15023 - if (c->x86_power & (1<<8))
15024 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15027 -static void __cpuinit init_amd(struct cpuinfo_x86 *c)
15032 - unsigned long value;
15035 - * Disable TLB flush filter by setting HWCR.FFDIS on K8
15036 - * bit 6 of msr C001_0015
15038 - * Errata 63 for SH-B3 steppings
15039 - * Errata 122 for all steppings (F+ have it disabled by default)
15041 - if (c->x86 == 15) {
15042 - rdmsrl(MSR_K8_HWCR, value);
15044 - wrmsrl(MSR_K8_HWCR, value);
15048 - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
15049 - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
15050 - clear_cpu_cap(c, 0*32+31);
15052 - /* On C+ stepping K8 rep microcode works well for copy/memset */
15053 - level = cpuid_eax(1);
15054 - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
15055 - level >= 0x0f58))
15056 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15057 - if (c->x86 == 0x10 || c->x86 == 0x11)
15058 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15060 - /* Enable workaround for FXSAVE leak */
15062 - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
15064 - level = get_model_name(c);
15066 - switch (c->x86) {
15068 - /* Should distinguish Models here, but this is only
15069 - a fallback anyways. */
15070 - strcpy(c->x86_model_id, "Hammer");
15074 - display_cacheinfo(c);
15076 - /* Multi core CPU? */
15077 - if (c->extended_cpuid_level >= 0x80000008)
15078 - amd_detect_cmp(c);
15080 - if (c->extended_cpuid_level >= 0x80000006 &&
15081 - (cpuid_edx(0x80000006) & 0xf000))
15082 - num_cache_leaves = 4;
15084 - num_cache_leaves = 3;
15086 - if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
15087 - set_cpu_cap(c, X86_FEATURE_K8);
15089 - /* MFENCE stops RDTSC speculation */
15090 - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
15092 - if (c->x86 == 0x10)
15093 - fam10h_check_enable_mmcfg();
15095 -#ifndef CONFIG_XEN
15096 - if (amd_apic_timer_broken())
15097 - disable_apic_timer = 1;
15099 - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
15100 - unsigned long long tseg;
15103 - * Split up direct mapping around the TSEG SMM area.
15104 - * Don't do it for gbpages because there seems very little
15105 - * benefit in doing so.
15107 - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
15108 - (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
15109 - set_memory_4k((unsigned long)__va(tseg), 1);
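The K8 quirks in init_amd() above are all MSR pokes, e.g. setting HWCR.FFDIS (bit 6 of MSR C001_0015) to disable the TLB flush filter. The same register can be inspected from userspace through the msr driver; this hedged sketch needs root and a loaded msr module, and only reads:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        uint64_t hwcr;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0)
            return 1;                   /* no msr module or no permission */
        /* the file offset is the MSR index; 0xC0010015 is MSR_K8_HWCR */
        if (pread(fd, &hwcr, sizeof(hwcr), 0xC0010015) != sizeof(hwcr)) {
            close(fd);
            return 1;
        }
        printf("HWCR=%#llx FFDIS=%d\n",
               (unsigned long long)hwcr, (int)((hwcr >> 6) & 1));
        close(fd);
        return 0;
    }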
15114 -void __cpuinit detect_ht(struct cpuinfo_x86 *c)
15117 - u32 eax, ebx, ecx, edx;
15118 - int index_msb, core_bits;
15120 - cpuid(1, &eax, &ebx, &ecx, &edx);
15123 - if (!cpu_has(c, X86_FEATURE_HT))
15125 - if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
15128 - smp_num_siblings = (ebx & 0xff0000) >> 16;
15130 - if (smp_num_siblings == 1) {
15131 - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
15132 - } else if (smp_num_siblings > 1) {
15134 - if (smp_num_siblings > NR_CPUS) {
15135 - printk(KERN_WARNING "CPU: Unsupported number of "
15136 - "siblings %d", smp_num_siblings);
15137 - smp_num_siblings = 1;
15141 - index_msb = get_count_order(smp_num_siblings);
15142 - c->phys_proc_id = phys_pkg_id(index_msb);
15144 - smp_num_siblings = smp_num_siblings / c->x86_max_cores;
15146 - index_msb = get_count_order(smp_num_siblings);
15148 - core_bits = get_count_order(c->x86_max_cores);
15150 - c->cpu_core_id = phys_pkg_id(index_msb) &
15151 - ((1 << core_bits) - 1);
15154 - if ((c->x86_max_cores * smp_num_siblings) > 1) {
15155 - printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
15156 - c->phys_proc_id);
15157 - printk(KERN_INFO "CPU: Processor Core ID: %d\n",
15165 - * find out the number of processor cores on the die
15167 -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
15169 - unsigned int eax, t;
15171 - if (c->cpuid_level < 4)
15174 - cpuid_count(4, 0, &eax, &t, &t, &t);
15177 - return ((eax >> 26) + 1);
15182 -static void __cpuinit srat_detect_node(void)
15184 -#ifdef CONFIG_NUMA
15186 - int cpu = smp_processor_id();
15187 - int apicid = hard_smp_processor_id();
15189 - /* Don't do the funky fallback heuristics the AMD version employs
15191 - node = apicid_to_node[apicid];
15192 - if (node == NUMA_NO_NODE || !node_online(node))
15193 - node = first_node(node_online_map);
15194 - numa_set_node(cpu, node);
15196 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
15200 -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
15202 - if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
15203 - (c->x86 == 0x6 && c->x86_model >= 0x0e))
15204 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15207 -static void __cpuinit init_intel(struct cpuinfo_x86 *c)
15209 - /* Cache sizes */
15212 - init_intel_cacheinfo(c);
15213 - if (c->cpuid_level > 9) {
15214 - unsigned eax = cpuid_eax(10);
15215 - /* Check for version and the number of counters */
15216 - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
15217 - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
15220 - if (cpu_has_ds) {
15221 - unsigned int l1, l2;
15222 - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
15223 - if (!(l1 & (1<<11)))
15224 - set_cpu_cap(c, X86_FEATURE_BTS);
15225 - if (!(l1 & (1<<12)))
15226 - set_cpu_cap(c, X86_FEATURE_PEBS);
15231 - ds_init_intel(c);
15233 - n = c->extended_cpuid_level;
15234 - if (n >= 0x80000008) {
15235 - unsigned eax = cpuid_eax(0x80000008);
15236 - c->x86_virt_bits = (eax >> 8) & 0xff;
15237 - c->x86_phys_bits = eax & 0xff;
15238 - /* CPUID workaround for Intel 0F34 CPU */
15239 - if (c->x86_vendor == X86_VENDOR_INTEL &&
15240 - c->x86 == 0xF && c->x86_model == 0x3 &&
15241 - c->x86_mask == 0x4)
15242 - c->x86_phys_bits = 36;
15245 - if (c->x86 == 15)
15246 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15248 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15249 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15250 - c->x86_max_cores = intel_num_cpu_cores(c);
15252 - srat_detect_node();
15255 -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
15257 - if (c->x86 == 0x6 && c->x86_model >= 0xf)
15258 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15261 -static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
15263 - /* Cache sizes */
15266 - n = c->extended_cpuid_level;
15267 - if (n >= 0x80000008) {
15268 - unsigned eax = cpuid_eax(0x80000008);
15269 - c->x86_virt_bits = (eax >> 8) & 0xff;
15270 - c->x86_phys_bits = eax & 0xff;
15273 - if (c->x86 == 0x6 && c->x86_model >= 0xf) {
15274 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15275 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15276 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15278 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15281 -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
15283 - char *v = c->x86_vendor_id;
15285 - if (!strcmp(v, "AuthenticAMD"))
15286 - c->x86_vendor = X86_VENDOR_AMD;
15287 - else if (!strcmp(v, "GenuineIntel"))
15288 - c->x86_vendor = X86_VENDOR_INTEL;
15289 - else if (!strcmp(v, "CentaurHauls"))
15290 - c->x86_vendor = X86_VENDOR_CENTAUR;
15292 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15295 -/* Do some early cpuid on the boot CPU to get some parameters that are
15296 - needed before check_bugs. Everything advanced is in identify_cpu
15298 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
15302 - c->loops_per_jiffy = loops_per_jiffy;
15303 - c->x86_cache_size = -1;
15304 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15305 - c->x86_model = c->x86_mask = 0; /* So far unknown... */
15306 - c->x86_vendor_id[0] = '\0'; /* Unset */
15307 - c->x86_model_id[0] = '\0'; /* Unset */
15308 - c->x86_clflush_size = 64;
15309 - c->x86_cache_alignment = c->x86_clflush_size;
15310 - c->x86_max_cores = 1;
15311 - c->x86_coreid_bits = 0;
15312 - c->extended_cpuid_level = 0;
15313 - memset(&c->x86_capability, 0, sizeof c->x86_capability);
15315 - /* Get vendor name */
15316 - cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
15317 - (unsigned int *)&c->x86_vendor_id[0],
15318 - (unsigned int *)&c->x86_vendor_id[8],
15319 - (unsigned int *)&c->x86_vendor_id[4]);
15321 - get_cpu_vendor(c);
15323 - /* Initialize the standard set of capabilities */
15324 - /* Note that the vendor-specific code below might override */
15326 - /* Intel-defined flags: level 0x00000001 */
15327 - if (c->cpuid_level >= 0x00000001) {
15329 - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
15330 - &c->x86_capability[0]);
15331 - c->x86 = (tfms >> 8) & 0xf;
15332 - c->x86_model = (tfms >> 4) & 0xf;
15333 - c->x86_mask = tfms & 0xf;
15334 - if (c->x86 == 0xf)
15335 - c->x86 += (tfms >> 20) & 0xff;
15336 - if (c->x86 >= 0x6)
15337 - c->x86_model += ((tfms >> 16) & 0xF) << 4;
15338 - if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
15339 - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
15341 - /* Have CPUID level 0 only - unheard of */
15345 - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
15347 - c->phys_proc_id = c->initial_apicid;
15349 - /* AMD-defined flags: level 0x80000001 */
15350 - xlvl = cpuid_eax(0x80000000);
15351 - c->extended_cpuid_level = xlvl;
15352 - if ((xlvl & 0xffff0000) == 0x80000000) {
15353 - if (xlvl >= 0x80000001) {
15354 - c->x86_capability[1] = cpuid_edx(0x80000001);
15355 - c->x86_capability[6] = cpuid_ecx(0x80000001);
15357 - if (xlvl >= 0x80000004)
15358 - get_model_name(c); /* Default name */
15361 - /* Transmeta-defined flags: level 0x80860001 */
15362 - xlvl = cpuid_eax(0x80860000);
15363 - if ((xlvl & 0xffff0000) == 0x80860000) {
15364 - /* Don't set x86_cpuid_level here for now to not confuse. */
15365 - if (xlvl >= 0x80860001)
15366 - c->x86_capability[2] = cpuid_edx(0x80860001);
15369 - c->extended_cpuid_level = cpuid_eax(0x80000000);
15370 - if (c->extended_cpuid_level >= 0x80000007)
15371 - c->x86_power = cpuid_edx(0x80000007);
15373 - switch (c->x86_vendor) {
15374 - case X86_VENDOR_AMD:
15375 - early_init_amd(c);
15377 - case X86_VENDOR_INTEL:
15378 - early_init_intel(c);
15380 - case X86_VENDOR_CENTAUR:
15381 - early_init_centaur(c);
15385 - validate_pat_support(c);
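early_identify_cpu() above decodes the leaf-1 signature with the usual extension rules: family 0xf adds the extended family field, and family 6 and up prepends the extended model. A self-contained sketch of just that decode:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int tfms, b, c, d;
        unsigned int family, model, stepping;

        if (!__get_cpuid(1, &tfms, &b, &c, &d))
            return 1;
        family   = (tfms >> 8) & 0xf;
        model    = (tfms >> 4) & 0xf;
        stepping = tfms & 0xf;

        if (family == 0xf)                      /* extended family */
            family += (tfms >> 20) & 0xff;
        if (family >= 0x6)                      /* extended model */
            model += ((tfms >> 16) & 0xf) << 4;
        printf("family %#x model %#x stepping %#x\n", family, model, stepping);
        return 0;
    }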
15389 - * This does the hard work of actually picking apart the CPU stuff...
15391 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
15395 - early_identify_cpu(c);
15397 - init_scattered_cpuid_features(c);
15399 - c->apicid = phys_pkg_id(0);
15402 - * Vendor-specific initialization. In this section we
15403 - * canonicalize the feature flags, meaning if there are
15404 - * features a certain CPU supports which CPUID doesn't
15405 - * tell us, CPUID claiming incorrect flags, or other bugs,
15406 - * we handle them here.
15408 - * At the end of this section, c->x86_capability better
15409 - * indicate the features this CPU genuinely supports!
15411 - switch (c->x86_vendor) {
15412 - case X86_VENDOR_AMD:
15416 - case X86_VENDOR_INTEL:
15420 - case X86_VENDOR_CENTAUR:
15424 - case X86_VENDOR_UNKNOWN:
15426 - display_cacheinfo(c);
15433 - * On SMP, boot_cpu_data holds the common feature set between
15434 - * all CPUs; so make sure that we indicate which features are
15435 - * common between the CPUs. The first time this routine gets
15436 - * executed, c == &boot_cpu_data.
15438 - if (c != &boot_cpu_data) {
15439 - /* AND the already accumulated flags with these */
15440 - for (i = 0; i < NCAPINTS; i++)
15441 - boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
15444 - /* Clear all flags overridden by options */
15445 - for (i = 0; i < NCAPINTS; i++)
15446 - c->x86_capability[i] &= ~cleared_cpu_caps[i];
15448 -#ifdef CONFIG_X86_MCE
15451 - select_idle_routine(c);
15453 -#ifdef CONFIG_NUMA
15454 - numa_add_cpu(smp_processor_id());
15459 -void __cpuinit identify_boot_cpu(void)
15461 - identify_cpu(&boot_cpu_data);
15464 -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
15466 - BUG_ON(c == &boot_cpu_data);
15471 -static __init int setup_noclflush(char *arg)
15473 - setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
15476 -__setup("noclflush", setup_noclflush);
15478 -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
15480 - if (c->x86_model_id[0])
15481 - printk(KERN_CONT "%s", c->x86_model_id);
15483 - if (c->x86_mask || c->cpuid_level >= 0)
15484 - printk(KERN_CONT " stepping %02x\n", c->x86_mask);
15486 - printk(KERN_CONT "\n");
15489 -static __init int setup_disablecpuid(char *arg)
15492 - if (get_option(&arg, &bit) && bit < NCAPINTS*32)
15493 - setup_clear_cpu_cap(bit);
15498 -__setup("clearcpuid=", setup_disablecpuid);
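setup_disablecpuid() is a typical __setup() handler: parse one integer, range-check it against NCAPINTS*32, and flag bad input by returning 0. A userspace sketch of the same parse (NCAPINTS is assumed to be 8, as in this kernel's cpufeature.h):

    #include <stdio.h>
    #include <stdlib.h>

    static int setup_disablecpuid(const char *arg)
    {
        char *end;
        long bit = strtol(arg, &end, 0);

        if (end == arg || *end || bit < 0 || bit >= 8 * 32)
            return 0;                   /* parse error, as get_option() flags */
        printf("would clear cpu cap bit %ld\n", bit);
        return 1;
    }

    int main(void)
    {
        return !setup_disablecpuid("147");
    }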
15499 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15500 +++ sle11-2009-06-04/arch/x86/kernel/setup_percpu-xen.c 2009-06-04 10:21:39.000000000 +0200
15502 +#include <linux/kernel.h>
15503 +#include <linux/module.h>
15504 +#include <linux/init.h>
15505 +#include <linux/bootmem.h>
15506 +#include <linux/percpu.h>
15507 +#include <linux/kexec.h>
15508 +#include <linux/crash_dump.h>
15509 +#include <asm/smp.h>
15510 +#include <asm/percpu.h>
15511 +#include <asm/sections.h>
15512 +#include <asm/processor.h>
15513 +#include <asm/setup.h>
15514 +#include <asm/topology.h>
15515 +#include <asm/mpspec.h>
15516 +#include <asm/apicdef.h>
15517 +#include <asm/highmem.h>
15519 +#ifdef CONFIG_X86_LOCAL_APIC
15520 +unsigned int num_processors;
15521 +unsigned disabled_cpus __cpuinitdata;
15522 +/* Processor that is doing the boot up */
15523 +unsigned int boot_cpu_physical_apicid = -1U;
15524 +unsigned int max_physical_apicid;
15525 +EXPORT_SYMBOL(boot_cpu_physical_apicid);
15527 +/* Bitmask of physically existing CPUs */
15528 +physid_mask_t phys_cpu_present_map;
15531 +/* map cpu index to physical APIC ID */
15532 +#ifndef CONFIG_XEN
15533 +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
15534 +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
15535 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15536 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
15538 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
15539 +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15542 +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
15543 +#define X86_64_NUMA 1
15545 +/* map cpu index to node index */
15546 +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
15547 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
15549 +/* which logical CPUs are on which nodes */
15550 +cpumask_t *node_to_cpumask_map;
15551 +EXPORT_SYMBOL(node_to_cpumask_map);
15553 +/* setup node_to_cpumask_map */
15554 +static void __init setup_node_to_cpumask_map(void);
15557 +static inline void setup_node_to_cpumask_map(void) { }
15560 +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
15562 + * Copy data used in early init routines from the initial arrays to the
15563 + * per cpu data areas. These arrays then become expendable and the
15564 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
15566 +static void __init setup_per_cpu_maps(void)
15568 +#ifndef CONFIG_XEN
15571 + for_each_possible_cpu(cpu) {
15572 + per_cpu(x86_cpu_to_apicid, cpu) =
15573 + early_per_cpu_map(x86_cpu_to_apicid, cpu);
15574 + per_cpu(x86_bios_cpu_apicid, cpu) =
15575 + early_per_cpu_map(x86_bios_cpu_apicid, cpu);
15576 +#ifdef X86_64_NUMA
15577 + per_cpu(x86_cpu_to_node_map, cpu) =
15578 + early_per_cpu_map(x86_cpu_to_node_map, cpu);
15582 + /* indicate the early static arrays will soon be gone */
15583 + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
15584 + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
15585 +#ifdef X86_64_NUMA
15586 + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
15591 +#ifdef CONFIG_X86_32
15593 + * Great future not-so-futuristic plan: make i386 and x86_64 do it
15596 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
15597 +EXPORT_SYMBOL(__per_cpu_offset);
15598 +static inline void setup_cpu_pda_map(void) { }
15600 +#elif !defined(CONFIG_SMP)
15601 +static inline void setup_cpu_pda_map(void) { }
15603 +#else /* CONFIG_SMP && CONFIG_X86_64 */
15606 + * Allocate cpu_pda pointer table and array via alloc_bootmem.
15608 +static void __init setup_cpu_pda_map(void)
15611 + struct x8664_pda **new_cpu_pda;
15612 + unsigned long size;
15615 + size = roundup(sizeof(struct x8664_pda), cache_line_size());
15617 + /* allocate cpu_pda array and pointer table */
15619 + unsigned long tsize = nr_cpu_ids * sizeof(void *);
15620 + unsigned long asize = size * (nr_cpu_ids - 1);
15622 + tsize = roundup(tsize, cache_line_size());
15623 + new_cpu_pda = alloc_bootmem(tsize + asize);
15624 + pda = (char *)new_cpu_pda + tsize;
15627 + /* initialize pointer table to static pda's */
15628 + for_each_possible_cpu(cpu) {
15630 + /* leave boot cpu pda in place */
15631 + new_cpu_pda[0] = cpu_pda(0);
15634 + new_cpu_pda[cpu] = (struct x8664_pda *)pda;
15635 + new_cpu_pda[cpu]->in_bootmem = 1;
15639 + /* point to new pointer table */
15640 + _cpu_pda = new_cpu_pda;
15645 + * Great future plan:
15646 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
15647 + * Always point %gs to its beginning
15649 +void __init setup_per_cpu_areas(void)
15651 + ssize_t size = PERCPU_ENOUGH_ROOM;
15655 + /* Setup cpu_pda map */
15656 + setup_cpu_pda_map();
15658 + /* Copy section for each CPU (we discard the original) */
15659 + size = PERCPU_ENOUGH_ROOM;
15660 + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
15663 + for_each_possible_cpu(cpu) {
15664 +#ifndef CONFIG_NEED_MULTIPLE_NODES
15665 + ptr = alloc_bootmem_pages(size);
15667 + int node = early_cpu_to_node(cpu);
15668 + if (!node_online(node) || !NODE_DATA(node)) {
15669 + ptr = alloc_bootmem_pages(size);
15671 + "cpu %d has no node %d or node-local memory\n",
15675 + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
15677 + per_cpu_offset(cpu) = ptr - __per_cpu_start;
15678 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
15682 + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
15683 + NR_CPUS, nr_cpu_ids, nr_node_ids);
15685 + /* Setup percpu data maps */
15686 + setup_per_cpu_maps();
15688 + /* Setup node to cpumask map */
15689 + setup_node_to_cpumask_map();
15694 +#ifdef X86_64_NUMA
15697 + * Allocate node_to_cpumask_map based on number of available nodes
15698 + * Requires node_possible_map to be valid.
15700 + * Note: node_to_cpumask() is not valid until after this is done.
15702 +static void __init setup_node_to_cpumask_map(void)
15704 + unsigned int node, num = 0;
15707 + /* setup nr_node_ids if not done yet */
15708 + if (nr_node_ids == MAX_NUMNODES) {
15709 + for_each_node_mask(node, node_possible_map)
15711 + nr_node_ids = num + 1;
15714 + /* allocate the map */
15715 + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
15717 + pr_debug("Node to cpumask map at %p for %d nodes\n",
15718 + map, nr_node_ids);
15720 + /* node_to_cpumask() will now work */
15721 + node_to_cpumask_map = map;
15724 +void __cpuinit numa_set_node(int cpu, int node)
15726 + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
15728 + if (cpu_pda(cpu) && node != NUMA_NO_NODE)
15729 + cpu_pda(cpu)->nodenumber = node;
15731 + if (cpu_to_node_map)
15732 + cpu_to_node_map[cpu] = node;
15734 + else if (per_cpu_offset(cpu))
15735 + per_cpu(x86_cpu_to_node_map, cpu) = node;
15738 + pr_debug("Setting node for non-present cpu %d\n", cpu);
15741 +void __cpuinit numa_clear_node(int cpu)
15743 + numa_set_node(cpu, NUMA_NO_NODE);
15746 +#ifndef CONFIG_DEBUG_PER_CPU_MAPS
15748 +void __cpuinit numa_add_cpu(int cpu)
15750 + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
15753 +void __cpuinit numa_remove_cpu(int cpu)
15755 + cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
15758 +#else /* CONFIG_DEBUG_PER_CPU_MAPS */
15761 + * --------- debug versions of the numa functions ---------
15763 +static void __cpuinit numa_set_cpumask(int cpu, int enable)
15765 + int node = cpu_to_node(cpu);
15769 + if (node_to_cpumask_map == NULL) {
15770 + printk(KERN_ERR "node_to_cpumask_map NULL\n");
15775 + mask = &node_to_cpumask_map[node];
15777 + cpu_set(cpu, *mask);
15779 + cpu_clear(cpu, *mask);
15781 + cpulist_scnprintf(buf, sizeof(buf), *mask);
15782 + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
15783 + enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
15786 +void __cpuinit numa_add_cpu(int cpu)
15788 + numa_set_cpumask(cpu, 1);
15791 +void __cpuinit numa_remove_cpu(int cpu)
15793 + numa_set_cpumask(cpu, 0);
15796 +int cpu_to_node(int cpu)
15798 + if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
15799 + printk(KERN_WARNING
15800 + "cpu_to_node(%d): usage too early!\n", cpu);
15802 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15804 + return per_cpu(x86_cpu_to_node_map, cpu);
15806 +EXPORT_SYMBOL(cpu_to_node);
15809 + * Same function as cpu_to_node() but used if called before the
15810 + * per_cpu areas are setup.
15812 +int early_cpu_to_node(int cpu)
15814 + if (early_per_cpu_ptr(x86_cpu_to_node_map))
15815 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15817 + if (!per_cpu_offset(cpu)) {
15818 + printk(KERN_WARNING
15819 + "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
15821 + return NUMA_NO_NODE;
15823 + return per_cpu(x86_cpu_to_node_map, cpu);
15827 +/* empty cpumask */
15828 +static const cpumask_t cpu_mask_none;
15831 + * Returns a pointer to the bitmask of CPUs on Node 'node'.
15833 +const cpumask_t *_node_to_cpumask_ptr(int node)
15835 + if (node_to_cpumask_map == NULL) {
15836 + printk(KERN_WARNING
15837 + "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
15840 + return (const cpumask_t *)&cpu_online_map;
15842 + if (node >= nr_node_ids) {
15843 + printk(KERN_WARNING
15844 + "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
15845 + node, nr_node_ids);
15847 + return &cpu_mask_none;
15849 + return &node_to_cpumask_map[node];
15851 +EXPORT_SYMBOL(_node_to_cpumask_ptr);
15854 + * Returns a bitmask of CPUs on Node 'node'.
15856 + * Side note: this function creates the returned cpumask on the stack
15857 + * so with a high NR_CPUS count, excessive stack space is used. The
15858 + * node_to_cpumask_ptr function should be used whenever possible.
15860 +cpumask_t node_to_cpumask(int node)
15862 + if (node_to_cpumask_map == NULL) {
15863 + printk(KERN_WARNING
15864 + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
15866 + return cpu_online_map;
15868 + if (node >= nr_node_ids) {
15869 + printk(KERN_WARNING
15870 + "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
15871 + node, nr_node_ids);
15873 + return cpu_mask_none;
15875 + return node_to_cpumask_map[node];
15877 +EXPORT_SYMBOL(node_to_cpumask);
15880 + * --------- end of debug versions of the numa functions ---------
15883 +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
15885 +#endif /* X86_64_NUMA */
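The debug cpu_to_node()/early_cpu_to_node() pair above encodes this file's central two-phase rule: before setup_per_cpu_areas() runs, reads must go through the early static array; afterwards through the per-cpu copy, with the early pointer NULLed to mark the handover. A toy sketch of that handover, with plain arrays standing in for both kinds of storage:

    #include <stdio.h>

    static int early_map[4] = { 0, 0, 1, 1 };  /* assumed boot-time table */
    static int *early_ptr = early_map;         /* NULLed once per-cpu is live */
    static int percpu_map[4];

    static int my_cpu_to_node(int cpu)
    {
        if (early_ptr)                  /* early static array still valid */
            return early_ptr[cpu];
        return percpu_map[cpu];         /* per-cpu area after setup */
    }

    int main(void)
    {
        int i;

        printf("early: cpu2 -> node %d\n", my_cpu_to_node(2));
        for (i = 0; i < 4; i++)         /* setup_per_cpu_maps() analogue */
            percpu_map[i] = early_map[i];
        early_ptr = 0;                  /* "early static arrays will soon be gone" */
        printf("late:  cpu2 -> node %d\n", my_cpu_to_node(2));
        return 0;
    }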
15887 --- sle11-2009-06-04.orig/arch/x86/kernel/smp-xen.c 2009-03-16 16:38:05.000000000 +0100
15888 +++ sle11-2009-06-04/arch/x86/kernel/smp-xen.c 2009-06-04 10:21:39.000000000 +0200
15889 @@ -121,132 +121,14 @@ void xen_smp_send_reschedule(int cpu)
15890 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
15894 - * Structure and data for smp_call_function(). This is designed to minimise
15895 - * static memory requirements. It also looks cleaner.
15897 -static DEFINE_SPINLOCK(call_lock);
15899 -struct call_data_struct {
15900 - void (*func) (void *info);
15902 - atomic_t started;
15903 - atomic_t finished;
15907 -void lock_ipi_call_lock(void)
15908 +void xen_send_call_func_single_ipi(int cpu)
15910 - spin_lock_irq(&call_lock);
15911 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR);
15914 -void unlock_ipi_call_lock(void)
15915 +void xen_send_call_func_ipi(cpumask_t mask)
15917 - spin_unlock_irq(&call_lock);
15920 -static struct call_data_struct *call_data;
15922 -static void __smp_call_function(void (*func) (void *info), void *info,
15923 - int nonatomic, int wait)
15925 - struct call_data_struct data;
15926 - int cpus = num_online_cpus() - 1;
15931 - data.func = func;
15932 - data.info = info;
15933 - atomic_set(&data.started, 0);
15934 - data.wait = wait;
15936 - atomic_set(&data.finished, 0);
15938 - call_data = &data;
15941 - /* Send a message to all other CPUs and wait for them to respond */
15942 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
15944 - /* Wait for response */
15945 - while (atomic_read(&data.started) != cpus)
15949 - while (atomic_read(&data.finished) != cpus)
15955 - * smp_call_function_mask(): Run a function on a set of other CPUs.
15956 - * @mask: The set of cpus to run on. Must not include the current cpu.
15957 - * @func: The function to run. This must be fast and non-blocking.
15958 - * @info: An arbitrary pointer to pass to the function.
15959 - * @wait: If true, wait (atomically) until function has completed on other CPUs.
15961 - * Returns 0 on success, else a negative status code.
15963 - * If @wait is true, then returns once @func has returned; otherwise
15964 - * it returns just before the target cpu calls @func.
15966 - * You must not call this function with disabled interrupts or from a
15967 - * hardware interrupt handler or from a bottom half handler.
15970 -xen_smp_call_function_mask(cpumask_t mask,
15971 - void (*func)(void *), void *info,
15974 - struct call_data_struct data;
15975 - cpumask_t allbutself;
15978 - /* Can deadlock when called with interrupts disabled */
15979 - WARN_ON(irqs_disabled());
15981 - /* Holding any lock stops cpus from going down. */
15982 - spin_lock(&call_lock);
15984 - allbutself = cpu_online_map;
15985 - cpu_clear(smp_processor_id(), allbutself);
15987 - cpus_and(mask, mask, allbutself);
15988 - cpus = cpus_weight(mask);
15991 - spin_unlock(&call_lock);
15995 - data.func = func;
15996 - data.info = info;
15997 - atomic_set(&data.started, 0);
15998 - data.wait = wait;
16000 - atomic_set(&data.finished, 0);
16002 - call_data = &data;
16005 - /* Send a message to other CPUs */
16006 - if (cpus_equal(mask, allbutself) &&
16007 - cpus_equal(cpu_online_map, cpu_callout_map))
16008 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
16010 - send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16012 - /* Wait for response */
16013 - while (atomic_read(&data.started) != cpus)
16017 - while (atomic_read(&data.finished) != cpus)
16019 - spin_unlock(&call_lock);
16022 + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16025 static void stop_this_cpu(void *dummy)
16026 @@ -268,15 +150,10 @@ static void stop_this_cpu(void *dummy)
16028 void xen_smp_send_stop(void)
16031 unsigned long flags;
16033 - /* Don't deadlock on the call lock in panic */
16034 - nolock = !spin_trylock(&call_lock);
16035 + smp_call_function(stop_this_cpu, NULL, 0);
16036 local_irq_save(flags);
16037 - __smp_call_function(stop_this_cpu, NULL, 0, 0);
16039 - spin_unlock(&call_lock);
16040 disable_all_local_evtchn();
16041 local_irq_restore(flags);
16043 @@ -298,21 +175,8 @@ irqreturn_t smp_reschedule_interrupt(int
16045 irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
16047 - void (*func) (void *info) = call_data->func;
16048 - void *info = call_data->info;
16049 - int wait = call_data->wait;
16052 - * Notify initiating CPU that I've grabbed the data and am
16053 - * about to execute the function
16056 - atomic_inc(&call_data->started);
16058 - * At this point the info structure may be out of scope unless wait==1
16062 + generic_smp_call_function_interrupt();
16063 #ifdef CONFIG_X86_32
16064 __get_cpu_var(irq_stat).irq_call_count++;
16066 @@ -320,10 +184,19 @@ irqreturn_t smp_call_function_interrupt(
16072 - atomic_inc(&call_data->finished);
16074 + return IRQ_HANDLED;
16077 +irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id)
16080 + generic_smp_call_function_single_interrupt();
16081 +#ifdef CONFIG_X86_32
16082 + __get_cpu_var(irq_stat).irq_call_count++;
16084 + add_pda(irq_call_count, 1);
16088 return IRQ_HANDLED;
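The deleted call_data_struct machinery is the classic IPI broadcast protocol: publish func/info, kick the other CPUs, then spin until the started (and, if waiting, finished) counters match the CPU count. A userspace analogue with pthreads playing the CPUs and an atomic flag playing CALL_FUNCTION_VECTOR:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define NCPUS 4

    static struct {
        void (*func)(void *);
        void *info;
        atomic_int started, finished;
    } call_data;

    static atomic_int kick;             /* stands in for the IPI vector */

    static void say(void *info) { printf("cpu runs: %s\n", (char *)info); }

    static void *cpu_thread(void *arg)
    {
        void (*f)(void *);
        void *i;

        while (!atomic_load(&kick))     /* wait for the "IPI" */
            ;
        f = call_data.func;
        i = call_data.info;
        atomic_fetch_add(&call_data.started, 1);
        f(i);
        atomic_fetch_add(&call_data.finished, 1);
        return arg;
    }

    int main(void)
    {
        pthread_t t[NCPUS - 1];
        int i;

        for (i = 0; i < NCPUS - 1; i++)
            pthread_create(&t[i], 0, cpu_thread, 0);
        call_data.func = say;
        call_data.info = "hello";
        atomic_store(&kick, 1);         /* send_IPI_allbutself() analogue */
        while (atomic_load(&call_data.started) != NCPUS - 1)
            ;                           /* "wait for response" */
        while (atomic_load(&call_data.finished) != NCPUS - 1)
            ;                           /* wait == 1 path */
        for (i = 0; i < NCPUS - 1; i++)
            pthread_join(t[i], 0);
        return 0;
    }

The 2.6.27 generic kfree_rcu-style call_function code replaces exactly this pattern, which is why the Xen IPI handlers shrink to calls into generic_smp_call_function_interrupt().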
16090 --- sle11-2009-06-04.orig/arch/x86/kernel/time_32-xen.c 2009-03-24 10:13:09.000000000 +0100
16091 +++ sle11-2009-06-04/arch/x86/kernel/time_32-xen.c 2009-06-04 10:21:39.000000000 +0200
16092 @@ -468,7 +468,7 @@ irqreturn_t timer_interrupt(int irq, voi
16094 /* Keep nmi watchdog up to date */
16096 - per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
16097 + x86_add_percpu(irq_stat.irq0_irqs, 1);
16099 add_pda(irq0_irqs, 1);
16101 @@ -746,9 +746,7 @@ void __init time_init(void)
16103 update_wallclock();
16105 -#ifndef CONFIG_X86_64
16109 /* Cannot request_irq() until kmem is initialised. */
16110 late_time_init = setup_cpu0_timer_irq;
16111 @@ -805,7 +803,8 @@ static void stop_hz_timer(void)
16113 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
16114 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
16115 - (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
16116 + (j = get_next_timer_interrupt(jiffies),
16117 + time_before_eq(j, jiffies))) {
16118 cpu_clear(cpu, nohz_cpu_mask);
16121 --- sle11-2009-06-04.orig/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:38:05.000000000 +0100
16122 +++ sle11-2009-06-04/arch/x86/kernel/traps_32-xen.c 2009-06-04 10:21:39.000000000 +0200
16125 * Copyright (C) 1991, 1992 Linus Torvalds
16126 + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
16128 * Pentium III FXSR, SSE support
16129 * Gareth Hughes <gareth@valinux.com>, May 2000
16130 @@ -57,11 +58,10 @@
16131 #include <asm/nmi.h>
16132 #include <asm/smp.h>
16133 #include <asm/io.h>
16134 +#include <asm/traps.h>
16136 #include "mach_traps.h"
16138 -int panic_on_unrecovered_nmi;
16141 DECLARE_BITMAP(used_vectors, NR_VECTORS);
16142 EXPORT_SYMBOL_GPL(used_vectors);
16143 @@ -82,43 +82,22 @@ gate_desc idt_table[256]
16144 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
16147 -asmlinkage void divide_error(void);
16148 -asmlinkage void debug(void);
16149 -asmlinkage void nmi(void);
16150 -asmlinkage void int3(void);
16151 -asmlinkage void overflow(void);
16152 -asmlinkage void bounds(void);
16153 -asmlinkage void invalid_op(void);
16154 -asmlinkage void device_not_available(void);
16155 -asmlinkage void coprocessor_segment_overrun(void);
16156 -asmlinkage void invalid_TSS(void);
16157 -asmlinkage void segment_not_present(void);
16158 -asmlinkage void stack_segment(void);
16159 -asmlinkage void general_protection(void);
16160 -asmlinkage void page_fault(void);
16161 -asmlinkage void coprocessor_error(void);
16162 -asmlinkage void simd_coprocessor_error(void);
16163 -asmlinkage void alignment_check(void);
16164 -#ifndef CONFIG_XEN
16165 -asmlinkage void spurious_interrupt_bug(void);
16167 -asmlinkage void fixup_4gb_segment(void);
16169 -asmlinkage void machine_check(void);
16171 +int panic_on_unrecovered_nmi;
16172 int kstack_depth_to_print = 24;
16173 static unsigned int code_bytes = 64;
16174 +static int ignore_nmis;
16175 +static int die_counter;
16177 void printk_address(unsigned long address, int reliable)
16179 #ifdef CONFIG_KALLSYMS
16180 - char namebuf[KSYM_NAME_LEN];
16181 unsigned long offset = 0;
16182 unsigned long symsize;
16183 const char *symname;
16184 - char reliab[4] = "";
16185 - char *delim = ":";
16187 + char *delim = ":";
16188 + char namebuf[KSYM_NAME_LEN];
16189 + char reliab[4] = "";
16191 symname = kallsyms_lookup(address, &symsize, &offset,
16192 &modname, namebuf);
16193 @@ -138,22 +117,23 @@ void printk_address(unsigned long addres
16197 -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
16198 +static inline int valid_stack_ptr(struct thread_info *tinfo,
16199 + void *p, unsigned int size)
16201 - return p > (void *)tinfo &&
16202 - p <= (void *)tinfo + THREAD_SIZE - size;
16204 + return p > t && p <= t + THREAD_SIZE - size;
16207 /* The form of the top of the frame on the stack */
16208 struct stack_frame {
16209 - struct stack_frame *next_frame;
16210 - unsigned long return_address;
16211 + struct stack_frame *next_frame;
16212 + unsigned long return_address;
16215 static inline unsigned long
16216 print_context_stack(struct thread_info *tinfo,
16217 - unsigned long *stack, unsigned long bp,
16218 - const struct stacktrace_ops *ops, void *data)
16219 + unsigned long *stack, unsigned long bp,
16220 + const struct stacktrace_ops *ops, void *data)
16222 struct stack_frame *frame = (struct stack_frame *)bp;
16224 @@ -175,8 +155,6 @@ print_context_stack(struct thread_info *
16228 -#define MSG(msg) ops->warning(data, msg)
16230 void dump_trace(struct task_struct *task, struct pt_regs *regs,
16231 unsigned long *stack, unsigned long bp,
16232 const struct stacktrace_ops *ops, void *data)
16233 @@ -186,7 +164,6 @@ void dump_trace(struct task_struct *task
16236 unsigned long dummy;
16239 if (task != current)
16240 stack = (unsigned long *)task->thread.sp;
16241 @@ -204,7 +181,7 @@ void dump_trace(struct task_struct *task
16247 struct thread_info *context;
16249 context = (struct thread_info *)
16250 @@ -256,15 +233,15 @@ static void print_trace_address(void *da
16253 static const struct stacktrace_ops print_trace_ops = {
16254 - .warning = print_trace_warning,
16255 - .warning_symbol = print_trace_warning_symbol,
16256 - .stack = print_trace_stack,
16257 - .address = print_trace_address,
16258 + .warning = print_trace_warning,
16259 + .warning_symbol = print_trace_warning_symbol,
16260 + .stack = print_trace_stack,
16261 + .address = print_trace_address,
16265 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16266 - unsigned long *stack, unsigned long bp, char *log_lvl)
16267 + unsigned long *stack, unsigned long bp, char *log_lvl)
16269 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16270 printk("%s =======================\n", log_lvl);
16271 @@ -359,15 +336,14 @@ void show_registers(struct pt_regs *regs
16272 printk(KERN_EMERG "Code: ");
16274 ip = (u8 *)regs->ip - code_prologue;
16275 - if (ip < (u8 *)PAGE_OFFSET ||
16276 - probe_kernel_address(ip, c)) {
16277 + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
16278 /* try starting at EIP */
16279 ip = (u8 *)regs->ip;
16280 code_len = code_len - code_prologue + 1;
16282 for (i = 0; i < code_len; i++, ip++) {
16283 if (ip < (u8 *)PAGE_OFFSET ||
16284 - probe_kernel_address(ip, c)) {
16285 + probe_kernel_address(ip, c)) {
16286 printk(" Bad EIP value.");
16289 @@ -392,7 +368,53 @@ int is_valid_bugaddr(unsigned long ip)
16290 return ud2 == 0x0b0f;
16293 -static int die_counter;
16294 +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
16295 +static int die_owner = -1;
16296 +static unsigned int die_nest_count;
16298 +unsigned __kprobes long oops_begin(void)
16300 + unsigned long flags;
16304 + if (die_owner != raw_smp_processor_id()) {
16305 + console_verbose();
16306 + raw_local_irq_save(flags);
16307 + __raw_spin_lock(&die_lock);
16308 + die_owner = smp_processor_id();
16309 + die_nest_count = 0;
16310 + bust_spinlocks(1);
16312 + raw_local_irq_save(flags);
16314 + die_nest_count++;
16318 +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
16320 + bust_spinlocks(0);
16322 + add_taint(TAINT_DIE);
16323 + __raw_spin_unlock(&die_lock);
16324 + raw_local_irq_restore(flags);
16329 + if (kexec_should_crash(current))
16330 + crash_kexec(regs);
16332 + if (in_interrupt())
16333 + panic("Fatal exception in interrupt");
16335 + if (panic_on_oops)
16336 + panic("Fatal exception");
16342 int __kprobes __die(const char *str, struct pt_regs *regs, long err)
16344 @@ -410,26 +432,22 @@ int __kprobes __die(const char *str, str
16345 printk("DEBUG_PAGEALLOC");
16349 if (notify_die(DIE_OOPS, str, regs, err,
16350 - current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
16351 + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
16354 - show_registers(regs);
16355 - /* Executive summary in case the oops scrolled away */
16356 - sp = (unsigned long) (&regs->sp);
16357 - savesegment(ss, ss);
16358 - if (user_mode(regs)) {
16360 - ss = regs->ss & 0xffff;
16362 - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16363 - print_symbol("%s", regs->ip);
16364 - printk(" SS:ESP %04x:%08lx\n", ss, sp);
16370 + show_registers(regs);
16371 + /* Executive summary in case the oops scrolled away */
16372 + sp = (unsigned long) (&regs->sp);
16373 + savesegment(ss, ss);
16374 + if (user_mode(regs)) {
16376 + ss = regs->ss & 0xffff;
16378 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16379 + print_symbol("%s", regs->ip);
16380 + printk(" SS:ESP %04x:%08lx\n", ss, sp);
16385 @@ -438,31 +456,9 @@ int __kprobes __die(const char *str, str
16387 void die(const char *str, struct pt_regs *regs, long err)
16390 - raw_spinlock_t lock;
16392 - int lock_owner_depth;
16394 - .lock = __RAW_SPIN_LOCK_UNLOCKED,
16395 - .lock_owner = -1,
16396 - .lock_owner_depth = 0
16398 - unsigned long flags;
16401 + unsigned long flags = oops_begin();
16403 - if (die.lock_owner != raw_smp_processor_id()) {
16404 - console_verbose();
16405 - raw_local_irq_save(flags);
16406 - __raw_spin_lock(&die.lock);
16407 - die.lock_owner = smp_processor_id();
16408 - die.lock_owner_depth = 0;
16409 - bust_spinlocks(1);
16411 - raw_local_irq_save(flags);
16414 - if (++die.lock_owner_depth < 3) {
16415 + if (die_nest_count < 3) {
16416 report_bug(regs->ip, regs);
16418 if (__die(str, regs, err))
16419 @@ -471,26 +467,7 @@ void die(const char *str, struct pt_regs
16420 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
16423 - bust_spinlocks(0);
16424 - die.lock_owner = -1;
16425 - add_taint(TAINT_DIE);
16426 - __raw_spin_unlock(&die.lock);
16427 - raw_local_irq_restore(flags);
16432 - if (kexec_should_crash(current))
16433 - crash_kexec(regs);
16435 - if (in_interrupt())
16436 - panic("Fatal exception in interrupt");
16438 - if (panic_on_oops)
16439 - panic("Fatal exception");
16442 - do_exit(SIGSEGV);
16443 + oops_end(flags, regs, SIGSEGV);
16447 @@ -554,7 +531,7 @@ void do_##name(struct pt_regs *regs, lon
16449 trace_hardirqs_fixup(); \
16450 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16451 - == NOTIFY_STOP) \
16452 + == NOTIFY_STOP) \
16454 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
16456 @@ -570,7 +547,7 @@ void do_##name(struct pt_regs *regs, lon
16457 info.si_code = sicode; \
16458 info.si_addr = (void __user *)siaddr; \
16459 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16460 - == NOTIFY_STOP) \
16461 + == NOTIFY_STOP) \
16463 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
16465 @@ -579,7 +556,7 @@ void do_##name(struct pt_regs *regs, lon
16466 void do_##name(struct pt_regs *regs, long error_code) \
16468 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16469 - == NOTIFY_STOP) \
16470 + == NOTIFY_STOP) \
16472 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
16474 @@ -594,28 +571,29 @@ void do_##name(struct pt_regs *regs, lon
16475 info.si_addr = (void __user *)siaddr; \
16476 trace_hardirqs_fixup(); \
16477 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16478 - == NOTIFY_STOP) \
16479 + == NOTIFY_STOP) \
16481 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
16484 -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16485 +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16486 #ifndef CONFIG_KPROBES
16487 DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
16489 DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
16490 DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
16491 -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16492 -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16493 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16494 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16495 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
16496 -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16497 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16498 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16499 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16500 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
16501 DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
16503 -void __kprobes do_general_protection(struct pt_regs * regs,
16506 +do_general_protection(struct pt_regs *regs, long error_code)
16508 + struct task_struct *tsk;
16509 struct thread_struct *thread;
16511 thread = &current->thread;
16512 @@ -623,23 +601,24 @@ void __kprobes do_general_protection(str
16513 if (regs->flags & X86_VM_MASK)
16517 if (!user_mode(regs))
16520 - current->thread.error_code = error_code;
16521 - current->thread.trap_no = 13;
16522 + tsk->thread.error_code = error_code;
16523 + tsk->thread.trap_no = 13;
16525 - if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
16526 - printk_ratelimit()) {
16527 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
16528 + printk_ratelimit()) {
16530 - "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16531 - current->comm, task_pid_nr(current),
16532 - regs->ip, regs->sp, error_code);
16533 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16534 + tsk->comm, task_pid_nr(tsk),
16535 + regs->ip, regs->sp, error_code);
16536 print_vma_addr(" in ", regs->ip);
16540 - force_sig(SIGSEGV, current);
16541 + force_sig(SIGSEGV, tsk);
16545 @@ -648,14 +627,15 @@ gp_in_vm86:
16549 - if (!fixup_exception(regs)) {
16550 - current->thread.error_code = error_code;
16551 - current->thread.trap_no = 13;
16552 - if (notify_die(DIE_GPF, "general protection fault", regs,
16553 + if (fixup_exception(regs))
16556 + tsk->thread.error_code = error_code;
16557 + tsk->thread.trap_no = 13;
16558 + if (notify_die(DIE_GPF, "general protection fault", regs,
16559 error_code, 13, SIGSEGV) == NOTIFY_STOP)
16561 - die("general protection fault", regs, error_code);
16564 + die("general protection fault", regs, error_code);
16567 static notrace __kprobes void
16568 @@ -722,9 +702,9 @@ unknown_nmi_error(unsigned char reason,
16570 static DEFINE_SPINLOCK(nmi_print_lock);
16572 -void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
16573 +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
16575 - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16576 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16579 spin_lock(&nmi_print_lock);
16580 @@ -733,10 +713,12 @@ void notrace __kprobes die_nmi(struct pt
16581 * to get a message out:
16584 - printk(KERN_EMERG "%s", msg);
16585 + printk(KERN_EMERG "%s", str);
16586 printk(" on CPU%d, ip %08lx, registers:\n",
16587 smp_processor_id(), regs->ip);
16588 show_registers(regs);
16590 + panic("Non maskable interrupt");
16592 spin_unlock(&nmi_print_lock);
16594 @@ -756,14 +738,17 @@ void notrace __kprobes die_nmi(struct pt
16595 static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
16597 unsigned char reason = 0;
16600 - /* Only the BSP gets external NMIs from the system: */
16601 - if (!smp_processor_id())
16602 + cpu = smp_processor_id();
16604 + /* Only the BSP gets external NMIs from the system. */
16606 reason = get_nmi_reason();
16608 if (!(reason & 0xc0)) {
16609 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
16613 #ifdef CONFIG_X86_LOCAL_APIC
16615 @@ -772,7 +757,7 @@ static notrace __kprobes void default_do
16617 if (nmi_watchdog_tick(regs, reason))
16619 - if (!do_nmi_callback(regs, smp_processor_id()))
16620 + if (!do_nmi_callback(regs, cpu))
16621 unknown_nmi_error(reason, regs);
16623 unknown_nmi_error(reason, regs);
16624 @@ -782,6 +767,8 @@ static notrace __kprobes void default_do
16626 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
16629 + /* AK: following checks seem to be broken on modern chipsets. FIXME */
16631 mem_parity_error(reason, regs);
16633 @@ -793,8 +780,6 @@ static notrace __kprobes void default_do
16637 -static int ignore_nmis;
16639 notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
16642 @@ -879,7 +864,7 @@ void __kprobes do_debug(struct pt_regs *
16643 tsk->thread.debugctlmsr = 0;
16645 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
16646 - SIGTRAP) == NOTIFY_STOP)
16647 + SIGTRAP) == NOTIFY_STOP)
16649 /* It's safe to allow irq's after DR6 has been saved */
16650 if (regs->flags & X86_EFLAGS_IF)
16651 @@ -940,9 +925,8 @@ clear_TF_reenable:
16652 void math_error(void __user *ip)
16654 struct task_struct *task;
16655 - unsigned short cwd;
16656 - unsigned short swd;
16658 + unsigned short cwd, swd;
16661 * Save the info for the exception handler and clear the error.
16662 @@ -961,7 +945,7 @@ void math_error(void __user *ip)
16663 * C1 reg you need in case of a stack fault, 0x040 is the stack
16664 * fault bit. We should only be taking one exception at a time,
16665 * so if this combination doesn't produce any single exception,
16666 - * then we have a bad program that isn't syncronizing its FPU usage
16667 + * then we have a bad program that isn't synchronizing its FPU usage
16668 * and it will suffer the consequences since we won't be able to
16669 * fully reproduce the context of the exception
16671 @@ -970,7 +954,7 @@ void math_error(void __user *ip)
16672 switch (swd & ~cwd & 0x3f) {
16673 case 0x000: /* No unmasked exception */
16675 - default: /* Multiple exceptions */
16676 + default: /* Multiple exceptions */
16678 case 0x001: /* Invalid Op */
16680 @@ -1006,8 +990,8 @@ void do_coprocessor_error(struct pt_regs
16681 static void simd_math_error(void __user *ip)
16683 struct task_struct *task;
16684 - unsigned short mxcsr;
16686 + unsigned short mxcsr;
16689 * Save the info for the exception handler and clear the error.
16690 @@ -1084,7 +1068,7 @@ void do_spurious_interrupt_bug(struct pt
16692 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
16694 - struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
16695 + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
16696 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
16697 unsigned long new_kesp = kesp - base;
16698 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
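The new oops_begin()/oops_end() pair centralises what die() used to open-code: one global lock tagged with an owner CPU and a nest count, so a die() triggered while the same CPU already holds the lock just bumps the count instead of self-deadlocking. A pthread sketch of the ownership/nesting logic (taint, kexec and panic handling elided):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t die_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_t die_owner;
    static int die_owned;               /* is die_owner valid? */
    static unsigned int die_nest_count;

    static void my_oops_begin(void)
    {
        if (!die_owned || !pthread_equal(die_owner, pthread_self())) {
            pthread_mutex_lock(&die_lock);   /* first entry on this "CPU" */
            die_owner = pthread_self();
            die_owned = 1;
            die_nest_count = 0;
        }
        die_nest_count++;                    /* recursive entry: just count */
    }

    static void my_oops_end(void)
    {
        if (--die_nest_count)
            return;                          /* still nested; keep the lock */
        die_owned = 0;
        pthread_mutex_unlock(&die_lock);
    }

    int main(void)
    {
        my_oops_begin();
        my_oops_begin();                     /* nested die(), no deadlock */
        printf("nest count %u\n", die_nest_count);
        my_oops_end();
        my_oops_end();
        return 0;
    }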
--- sle11-2009-06-04.orig/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-06-04/arch/x86/kernel/traps_64-xen.c 2009-06-04 10:21:39.000000000 +0200
@@ -10,73 +10,56 @@
* 'Traps.c' handles hardware traps and faults after we have saved some
* state in 'entry.S'.
-#include <linux/sched.h>
+#include <linux/moduleparam.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/kdebug.h>
#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
#include <linux/string.h>
+#include <linux/unwind.h>
+#include <linux/delay.h>
#include <linux/errno.h>
-#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
#include <linux/timer.h>
-#include <linux/mm.h>
#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/nmi.h>
-#include <linux/kprobes.h>
-#include <linux/kexec.h>
-#include <linux/unwind.h>
-#include <linux/uaccess.h>
#include <linux/bug.h>
-#include <linux/kdebug.h>
-#include <linux/utsname.h>
-#include <mach_traps.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
+#include <asm/stacktrace.h>
+#include <asm/processor.h>
#include <asm/debugreg.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/unwind.h>
#include <asm/desc.h>
#include <asm/i387.h>
-#include <asm/processor.h>
-#include <asm/unwind.h>
+#include <asm/nmi.h>
#include <asm/smp.h>
+#include <asm/io.h>
#include <asm/pgalloc.h>
-#include <asm/pda.h>
#include <asm/proto.h>
-#include <asm/nmi.h>
-#include <asm/stacktrace.h>
+#include <asm/pda.h>
+#include <asm/traps.h>
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void double_fault(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void reserved(void);
-asmlinkage void alignment_check(void);
-asmlinkage void machine_check(void);
-asmlinkage void spurious_interrupt_bug(void);
+#include <mach_traps.h>
+int panic_on_unrecovered_nmi;
+int kstack_depth_to_print = 12;
static unsigned int code_bytes = 64;
+static int ignore_nmis;
+static int die_counter;
static inline void conditional_sti(struct pt_regs *regs)
@@ -100,34 +83,9 @@ static inline void preempt_conditional_c
dec_preempt_count();
-int kstack_depth_to_print = 12;
void printk_address(unsigned long address, int reliable)
-#ifdef CONFIG_KALLSYMS
- unsigned long offset = 0, symsize;
- const char *symname;
- char *delim = ":";
- char namebuf[KSYM_NAME_LEN];
- char reliab[4] = "";
- symname = kallsyms_lookup(address, &symsize, &offset,
- &modname, namebuf);
- printk(" [<%016lx>]\n", address);
- strcpy(reliab, "? ");
- modname = delim = "";
- printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
- address, reliab, delim, modname, delim, symname, offset, symsize);
- printk(" [<%016lx>]\n", address);
+ printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -206,8 +164,6 @@ static unsigned long *in_exception_stack
-#define MSG(txt) ops->warning(data, txt)
* x86-64 can have up to three kernel stacks:
@@ -234,11 +190,11 @@ struct stack_frame {
unsigned long return_address;
-static inline unsigned long print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end)
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end)
struct stack_frame *frame = (struct stack_frame *)bp;
@@ -260,7 +216,7 @@ static inline unsigned long print_contex
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
@@ -269,36 +225,34 @@ void dump_trace(struct task_struct *tsk,
struct thread_info *tinfo;
- tinfo = task_thread_info(tsk);
unsigned long dummy;
- if (tsk && tsk != current)
- stack = (unsigned long *)tsk->thread.sp;
+ if (task && task != current)
+ stack = (unsigned long *)task->thread.sp;
#ifdef CONFIG_FRAME_POINTER
- if (tsk == current) {
+ if (task == current) {
/* Grab bp right from our regs */
- asm("movq %%rbp, %0" : "=r" (bp):);
+ asm("movq %%rbp, %0" : "=r" (bp) :);
/* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) tsk->thread.sp;
+ bp = *(unsigned long *) task->thread.sp;
* Print function call entries in all stacks, starting at the
* current stack address. If the stacks consist of nested
+ tinfo = task_thread_info(task);
unsigned long *estack_end;
@@ -383,18 +337,24 @@ static const struct stacktrace_ops print
.address = print_trace_address,
-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
- unsigned long bp)
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl)
printk("\nCall Trace:\n");
- dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp)
+ show_trace_log_lvl(task, regs, stack, bp, "");
-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
- unsigned long bp)
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl)
unsigned long *stack;
@@ -406,14 +366,14 @@ _show_stack(struct task_struct *tsk, str
// back trace for this cpu.
- sp = (unsigned long *)tsk->thread.sp;
+ sp = (unsigned long *)task->thread.sp;
sp = (unsigned long *)&sp;
- for(i=0; i < kstack_depth_to_print; i++) {
+ for (i = 0; i < kstack_depth_to_print; i++) {
if (stack >= irqstack && stack <= irqstack_end) {
if (stack == irqstack_end) {
stack = (unsigned long *) (irqstack_end[-1]);
@@ -428,12 +388,12 @@ _show_stack(struct task_struct *tsk, str
printk(" %016lx", *stack++);
touch_nmi_watchdog();
- show_trace(tsk, regs, sp, bp);
+ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
-void show_stack(struct task_struct *tsk, unsigned long * sp)
+void show_stack(struct task_struct *task, unsigned long *sp)
- _show_stack(tsk, NULL, sp, 0);
+ show_stack_log_lvl(task, NULL, sp, 0, "");
@@ -441,8 +401,8 @@ void show_stack(struct task_struct *tsk,
void dump_stack(void)
- unsigned long dummy;
unsigned long bp = 0;
+ unsigned long stack;
#ifdef CONFIG_FRAME_POINTER
@@ -454,7 +414,7 @@ void dump_stack(void)
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- show_trace(NULL, NULL, &dummy, bp);
+ show_trace(NULL, NULL, &stack, bp);
EXPORT_SYMBOL(dump_stack);
@@ -465,12 +425,8 @@ void show_registers(struct pt_regs *regs
const int cpu = smp_processor_id();
struct task_struct *cur = cpu_pda(cpu)->pcurrent;
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- ip = (u8 *) regs->ip - code_prologue;
printk("CPU %d ", cpu);
printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -481,15 +437,22 @@ void show_registers(struct pt_regs *regs
* time of the fault..
if (!user_mode(regs)) {
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
- _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
+ show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
printk(KERN_EMERG "Code: ");
+ ip = (u8 *)regs->ip - code_prologue;
if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
/* try starting at RIP */
- ip = (u8 *) regs->ip;
+ ip = (u8 *)regs->ip;
code_len = code_len - code_prologue + 1;
for (i = 0; i < code_len; i++, ip++) {
@@ -505,7 +468,7 @@ void show_registers(struct pt_regs *regs
int is_valid_bugaddr(unsigned long ip)
@@ -545,7 +508,7 @@ unsigned __kprobes long oops_begin(void)
void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
@@ -563,10 +526,9 @@ void __kprobes oops_end(unsigned long fl
-int __kprobes __die(const char * str, struct pt_regs * regs, long err)
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
- static int die_counter;
- printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
+ printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
#ifdef CONFIG_PREEMPT
printk("PREEMPT ");
@@ -577,8 +539,10 @@ int __kprobes __die(const char * str, st
printk("DEBUG_PAGEALLOC");
- if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
show_registers(regs);
add_taint(TAINT_DIE);
/* Executive summary in case the oops scrolled away */
@@ -590,7 +554,7 @@ int __kprobes __die(const char * str, st
-void die(const char * str, struct pt_regs * regs, long err)
+void die(const char *str, struct pt_regs *regs, long err)
unsigned long flags = oops_begin();
@@ -608,8 +572,7 @@ die_nmi(char *str, struct pt_regs *regs,
unsigned long flags;
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
flags = oops_begin();
@@ -617,7 +580,9 @@ die_nmi(char *str, struct pt_regs *regs,
* We are in trouble anyway, lets at least try
* to get a message out.
- printk(str, smp_processor_id());
+ printk(KERN_EMERG "%s", str);
+ printk(" on CPU%d, ip %08lx, registers:\n",
+ smp_processor_id(), regs->ip);
show_registers(regs);
if (kexec_should_crash(current))
@@ -630,44 +595,44 @@ die_nmi(char *str, struct pt_regs *regs,
-static void __kprobes do_trap(int trapnr, int signr, char *str,
- struct pt_regs * regs, long error_code,
+static void __kprobes
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
struct task_struct *tsk = current;
- if (user_mode(regs)) {
- * We want error_code and trap_no set for userspace
- * faults and kernelspace faults which result in
- * die(), but not kernelspace faults which are fixed
- * up. die() gives the process no chance to handle
- * the signal and notice the kernel fault information,
- * so that won't result in polluting the information
- * about previously queued, but not yet delivered,
- * faults. See also do_general_protection below.
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
+ if (!user_mode(regs))
+ goto kernel_trap;
- if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
- printk_ratelimit()) {
- "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid, str,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
+ * We want error_code and trap_no set for userspace faults and
+ * kernelspace faults which result in die(), but not
+ * kernelspace faults which are fixed up. die() gives the
+ * process no chance to handle the signal and notice the
+ * kernel fault information, so that won't result in polluting
+ * the information about previously queued, but not yet
+ * delivered, faults. See also do_general_protection below.
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
- force_sig_info(signr, info, tsk);
- force_sig(signr, tsk);
+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+ printk_ratelimit()) {
+ "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid, str,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ force_sig_info(signr, info, tsk);
+ force_sig(signr, tsk);
if (!fixup_exception(regs)) {
tsk->thread.error_code = error_code;
tsk->thread.trap_no = trapnr;
@@ -677,41 +642,39 @@ static void __kprobes do_trap(int trapnr
#define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, NULL); \
+ do_trap(trapnr, signr, str, regs, error_code, NULL); \
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
- siginfo_t info; \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- trace_hardirqs_fixup(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+ siginfo_t info; \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ trace_hardirqs_fixup(); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, &info); \
+ do_trap(trapnr, signr, str, regs, error_code, &info); \
-DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR( 4, SIGSEGV, "overflow", overflow)
-DO_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
-DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_ERROR(4, SIGSEGV, "overflow", overflow)
+DO_ERROR(5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR(18, SIGSEGV, "reserved", reserved)
/* Runs on IST stack */
asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
@@ -741,31 +704,34 @@ asmlinkage void do_double_fault(struct p
die(str, regs, error_code);
-asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
+asmlinkage void __kprobes
+do_general_protection(struct pt_regs *regs, long error_code)
- struct task_struct *tsk = current;
+ struct task_struct *tsk;
conditional_sti(regs);
- if (user_mode(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
+ if (!user_mode(regs))
+ goto gp_in_kernel;
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
- force_sig(SIGSEGV, tsk);
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ force_sig(SIGSEGV, tsk);
if (fixup_exception(regs))
@@ -778,14 +744,14 @@ asmlinkage void __kprobes do_general_pro
static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs * regs)
+mem_parity_error(unsigned char reason, struct pt_regs *regs)
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
#if defined(CONFIG_EDAC)
- if(edac_handler_set()) {
+ if (edac_handler_set()) {
edac_atomic_assert_error();
@@ -801,7 +767,7 @@ mem_parity_error(unsigned char reason, s
static notrace __kprobes void
-io_check_error(unsigned char reason, struct pt_regs * regs)
+io_check_error(unsigned char reason, struct pt_regs *regs)
printk("NMI: IOCK error (debug interrupt?)\n");
show_registers(regs);
@@ -827,14 +793,14 @@ unknown_nmi_error(unsigned char reason,
/* Runs on IST stack. This code must keep interrupts off all the time.
Nested NMIs are prevented by the CPU. */
-asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
unsigned char reason = 0;
cpu = smp_processor_id();
- /* Only the BSP gets external NMIs from the system. */
+ /* Only the BSP gets external NMIs from the system. */
reason = get_nmi_reason();
@@ -847,33 +813,58 @@ asmlinkage notrace __kprobes void defau
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
- if (nmi_watchdog_tick(regs,reason))
+ if (nmi_watchdog_tick(regs, reason))
- if (!do_nmi_callback(regs,cpu))
+ if (!do_nmi_callback(regs, cpu))
unknown_nmi_error(reason, regs);
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
/* AK: following checks seem to be broken on modern chipsets. FIXME */
mem_parity_error(reason, regs);
io_check_error(reason, regs);
+asmlinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
+ add_pda(__nmi_count, 1);
+ if (!ignore_nmis)
+ default_do_nmi(regs);
+void stop_nmi(void)
+ acpi_nmi_disable();
+void restart_nmi(void)
+ acpi_nmi_enable();
/* runs on IST stack. */
asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
trace_hardirqs_fixup();
- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
preempt_conditional_sti(regs);
do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
@@ -904,8 +895,8 @@ asmlinkage __kprobes struct pt_regs *syn
asmlinkage void __kprobes do_debug(struct pt_regs * regs,
unsigned long error_code)
- unsigned long condition;
struct task_struct *tsk = current;
+ unsigned long condition;
trace_hardirqs_fixup();
@@ -926,21 +917,19 @@ asmlinkage void __kprobes do_debug(struc
/* Mask out spurious debug traps due to lazy DR7 setting */
if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
- if (!tsk->thread.debugreg7) {
+ if (!tsk->thread.debugreg7)
tsk->thread.debugreg6 = condition;
* Single-stepping through TF: make sure we ignore any events in
* kernel space (but re-enable TF when returning to user mode).
if (condition & DR_STEP) {
- if (!user_mode(regs))
- goto clear_TF_reenable;
+ if (!user_mode(regs))
+ goto clear_TF_reenable;
/* Ok, finally something we can handle */
@@ -953,7 +942,7 @@ asmlinkage void __kprobes do_debug(struc
force_sig_info(SIGTRAP, &info, tsk);
- set_debugreg(0UL, 7);
+ set_debugreg(0, 7);
preempt_conditional_cli(regs);
@@ -961,6 +950,7 @@ clear_TF_reenable:
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
preempt_conditional_cli(regs);
static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
@@ -983,7 +973,7 @@ static int kernel_math_error(struct pt_r
asmlinkage void do_coprocessor_error(struct pt_regs *regs)
void __user *ip = (void __user *)(regs->ip);
- struct task_struct * task;
+ struct task_struct *task;
unsigned short cwd, swd;
@@ -1016,30 +1006,30 @@ asmlinkage void do_coprocessor_error(str
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
switch (swd & ~cwd & 0x3f) {
- case 0x001: /* Invalid Op */
- * swd & 0x240 == 0x040: Stack Underflow
- * swd & 0x240 == 0x240: Stack Overflow
- * User must clear the SF bit (0x40) if set
- info.si_code = FPE_FLTINV;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
+ case 0x000: /* No unmasked exception */
+ default: /* Multiple exceptions */
+ case 0x001: /* Invalid Op */
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ info.si_code = FPE_FLTINV;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
force_sig_info(SIGFPE, &info, task);
@@ -1052,7 +1042,7 @@ asmlinkage void bad_intr(void)
asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
void __user *ip = (void __user *)(regs->ip);
- struct task_struct * task;
+ struct task_struct *task;
unsigned short mxcsr;
@@ -1080,25 +1070,25 @@ asmlinkage void do_simd_coprocessor_erro
mxcsr = get_fpu_mxcsr(task);
switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
- case 0x001: /* Invalid Op */
- info.si_code = FPE_FLTINV;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
+ case 0x001: /* Invalid Op */
+ info.si_code = FPE_FLTINV;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
force_sig_info(SIGFPE, &info, task);
@@ -1118,7 +1108,7 @@ asmlinkage void __attribute__((weak)) mc
- * 'math_state_restore()' saves the current math information in the
+ * 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
* Careful.. There are problems with IBM-designed IRQ13 behaviour.
@@ -1145,7 +1135,14 @@ asmlinkage void math_state_restore(void)
/* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
- restore_fpu_checking(&me->thread.xstate->fxsave);
+ * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+ if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
+ force_sig(SIGSEGV, me);
task_thread_info(me)->status |= TS_USEDFPU;
@@ -1190,13 +1187,12 @@ void __init trap_init(void)
ret = HYPERVISOR_set_trap_table(trap_table);
printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
* initialize the per thread extended state:
- init_thread_xstate();
+ init_thread_xstate();
- * Should be a barrier for any external CPU state.
+ * Should be a barrier for any external CPU state:
@@ -1212,27 +1208,25 @@ void __cpuinit smp_trap_init(trap_info_t
static int __init oops_setup(char *s)
if (!strcmp(s, "panic"))
early_param("oops", oops_setup);
static int __init kstack_setup(char *s)
- kstack_depth_to_print = simple_strtoul(s,NULL,0);
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
early_param("kstack", kstack_setup);
static int __init code_bytes_setup(char *s)
code_bytes = simple_strtoul(s, NULL, 0);
--- sle11-2009-06-04.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-06-04/arch/x86/kernel/vsyscall_64-xen.c 2009-06-04 10:21:39.000000000 +0200
#include <asm/topology.h>
#include <asm/vgtod.h>
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) \
+ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","cx","memory"
@@ -264,10 +265,7 @@ static void __cpuinit vsyscall_set_cpu(i
d |= (node & 0xf) << 12;
d |= (node >> 4) << 48;
- if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
- + GDT_ENTRY_PER_CPU),
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
static void __cpuinit cpu_vsyscall_init(void *arg)
@@ -281,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_bl
long cpu = (long)arg;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
- smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
+ smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
return NOTIFY_DONE;
@@ -311,7 +309,7 @@ static int __init vsyscall_init(void)
#ifdef CONFIG_SYSCTL
register_sysctl_table(kernel_root_table2);
- on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
+ on_each_cpu(cpu_vsyscall_init, NULL, 1);
hotcpu_notifier(cpu_vsyscall_notifier, 0);
--- sle11-2009-06-04.orig/arch/x86/mach-xen/setup.c 2009-03-16 16:33:40.000000000 +0100
+++ sle11-2009-06-04/arch/x86/mach-xen/setup.c 2009-06-04 10:21:39.000000000 +0200
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
+#ifdef CONFIG_X86_32
#ifdef CONFIG_HOTPLUG_CPU
#define DEFAULT_SEND_IPI (1)
@@ -44,51 +46,6 @@ static int __init print_ipi_mode(void)
late_initcall(print_ipi_mode);
- * machine_specific_memory_setup - Hook for machine specific memory setup.
- * This is included late in kernel/setup.c so that it can make
- * use of all of the static functions.
-char * __init machine_specific_memory_setup(void)
- struct xen_memory_map memmap;
- * This is rather large for a stack variable but this early in
- * the boot process we know we have plenty slack space.
- struct e820entry map[E820MAX];
- memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, map);
- rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
- if ( rc == -ENOSYS ) {
- memmap.nr_entries = 1;
- map[0].addr = 0ULL;
- map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
- /* 8MB slack (to balance backend allocations). */
- map[0].size += 8ULL << 20;
- map[0].type = E820_RAM;
- sanitize_e820_map(map, (char *)&memmap.nr_entries);
- BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
-extern void hypervisor_callback(void);
-extern void failsafe_callback(void);
-extern void nmi(void);
unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
EXPORT_SYMBOL(machine_to_phys_mapping);
unsigned int machine_to_phys_order;
@@ -121,33 +78,66 @@ void __init pre_setup_arch_hook(void)
(unsigned long *)xen_start_info->mfn_list;
+#endif /* CONFIG_X86_32 */
+extern void hypervisor_callback(void);
+extern void failsafe_callback(void);
+extern void nmi(void);
+#ifdef CONFIG_X86_64
+#include <asm/proto.h>
+#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
+#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
void __init machine_specific_arch_setup(void)
static struct callback_register __initdata event = {
.type = CALLBACKTYPE_event,
- .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
+ .address = CALLBACK_ADDR(hypervisor_callback)
static struct callback_register __initdata failsafe = {
.type = CALLBACKTYPE_failsafe,
- .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
+ .address = CALLBACK_ADDR(failsafe_callback)
+#ifdef CONFIG_X86_64
+ static struct callback_register __initdata syscall = {
+ .type = CALLBACKTYPE_syscall,
+ .address = CALLBACK_ADDR(system_call)
+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
static struct callback_register __initdata nmi_cb = {
.type = CALLBACKTYPE_nmi,
- .address = { __KERNEL_CS, (unsigned long)nmi },
+ .address = CALLBACK_ADDR(nmi)
ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
+#ifdef CONFIG_X86_64
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
#if CONFIG_XEN_COMPAT <= 0x030002
+#ifdef CONFIG_X86_32
if (ret == -ENOSYS)
ret = HYPERVISOR_set_callbacks(
event.address.cs, event.address.eip,
failsafe.address.cs, failsafe.address.eip);
+ ret = HYPERVISOR_set_callbacks(
+ failsafe.address,
+ syscall.address);
+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
#if CONFIG_XEN_COMPAT <= 0x030002
if (ret == -ENOSYS) {
@@ -158,15 +148,43 @@ void __init machine_specific_arch_setup(
HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
+#ifdef CONFIG_X86_32
/* Do an early initialization of the fixmap area */
extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
pmd_t *pmd = pmd_offset(pud, addr);
make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
+ != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
+ FIX_BUG_ON(SHARED_INFO);
+ FIX_BUG_ON(ISAMAP_BEGIN);
+ FIX_BUG_ON(ISAMAP_END);
+#undef __FIXADDR_TOP
+ BUG_ON(pte_index(hypervisor_virt_start));
+ /* Switch to the real shared_info page, and clear the
+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
+ /* Setup mapping of lower 1st MB */
+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
+ if (is_initial_xendomain())
+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
+ virt_to_machine(empty_zero_page),
--- sle11-2009-06-04.orig/arch/x86/mm/fault-xen.c 2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-06-04/arch/x86/mm/fault-xen.c 2009-06-04 10:21:39.000000000 +0200
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
+#include <linux/mmiotrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
@@ -49,17 +50,23 @@
#define PF_RSVD (1<<3)
#define PF_INSTR (1<<4)
+static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+#ifdef CONFIG_MMIOTRACE_HOOKS
+ if (unlikely(is_kmmio_active()))
+ if (kmmio_handler(regs, addr) == 1)
static inline int notify_page_fault(struct pt_regs *regs)
#ifdef CONFIG_KPROBES
/* kprobe_running() needs smp_processor_id() */
-#ifdef CONFIG_X86_32
if (!user_mode_vm(regs)) {
- if (!user_mode(regs)) {
if (kprobe_running() && kprobe_fault_handler(regs, 14))
@@ -409,11 +416,7 @@ static void show_fault_oops(struct pt_re
printk(KERN_CONT "NULL pointer dereference");
printk(KERN_CONT "paging request");
-#ifdef CONFIG_X86_32
- printk(KERN_CONT " at %08lx\n", address);
- printk(KERN_CONT " at %016lx\n", address);
+ printk(KERN_CONT " at %p\n", (void *) address);
printk(KERN_ALERT "IP:");
printk_address(regs->ip, 1);
dump_pagetable(address);
@@ -628,6 +631,8 @@ void __kprobes do_page_fault(struct pt_r
if (notify_page_fault(regs))
+ if (unlikely(kmmio_fault(regs, address)))
* We fault-in kernel-space virtual memory on-demand. The
@@ -831,14 +836,10 @@ bad_area_nosemaphore:
if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
printk_ratelimit()) {
-#ifdef CONFIG_X86_32
- "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
- "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
+ "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address, regs->ip,
- regs->sp, error_code);
+ tsk->comm, task_pid_nr(tsk), address,
+ (void *) regs->ip, (void *) regs->sp, error_code);
print_vma_addr(" in ", regs->ip);
@@ -946,81 +947,45 @@ LIST_HEAD(pgd_list);
void vmalloc_sync_all(void)
#ifdef CONFIG_X86_32
- * Note that races in the updates of insync and start aren't
- * problematic: insync can only get set bits added, and updates to
- * start are only improving performance (without affecting correctness
- * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
- * This change works just fine with 2-level paging too.
-#define sync_index(a) ((a) >> PMD_SHIFT)
- static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
- static unsigned long start = TASK_SIZE;
- unsigned long address;
+ unsigned long address = VMALLOC_START & PGDIR_MASK;
if (SHARED_KERNEL_PMD)
BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
- for (address = start;
- address < hypervisor_virt_start;
- address += PMD_SIZE) {
- if (!test_bit(sync_index(address), insync)) {
- unsigned long flags;
- struct page *page;
- spin_lock_irqsave(&pgd_lock, flags);
- /* XEN: failure path assumes non-empty pgd_list. */
- if (unlikely(list_empty(&pgd_list))) {
- spin_unlock_irqrestore(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- if (!vmalloc_sync_one(page_address(page),
- spin_unlock_irqrestore(&pgd_lock, flags);
- set_bit(sync_index(address), insync);
+ for (; address < hypervisor_virt_start; address += PMD_SIZE) {
+ unsigned long flags;
+ struct page *page;
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ if (!vmalloc_sync_one(page_address(page),
- if (address == start && test_bit(sync_index(address), insync))
- start = address + PMD_SIZE;
+ spin_unlock_irqrestore(&pgd_lock, flags);
#else /* CONFIG_X86_64 */
- * Note that races in the updates of insync and start aren't
- * problematic: insync can only get set bits added, and updates to
- * start are only improving performance (without affecting correctness
- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
- static unsigned long start = VMALLOC_START & PGDIR_MASK;
+ unsigned long start = VMALLOC_START & PGDIR_MASK;
unsigned long address;
for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
- if (!test_bit(pgd_index(address), insync)) {
- const pgd_t *pgd_ref = pgd_offset_k(address);
- unsigned long flags;
- struct page *page;
- if (pgd_none(*pgd_ref))
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- pgd = (pgd_t *)page_address(page) + pgd_index(address);
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- spin_unlock_irqrestore(&pgd_lock, flags);
- set_bit(pgd_index(address), insync);
+ const pgd_t *pgd_ref = pgd_offset_k(address);
+ unsigned long flags;
+ struct page *page;
+ if (pgd_none(*pgd_ref))
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- if (address == start)
- start = address + PGDIR_SIZE;
+ spin_unlock_irqrestore(&pgd_lock, flags);
--- sle11-2009-06-04.orig/arch/x86/mm/hypervisor.c 2009-05-14 11:18:39.000000000 +0200
+++ sle11-2009-06-04/arch/x86/mm/hypervisor.c 2009-06-04 10:21:39.000000000 +0200
@@ -709,6 +709,72 @@ void xen_destroy_contiguous_region(unsig
EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+int __init early_create_contiguous_region(unsigned long pfn,
+ unsigned int order,
+ unsigned int address_bits)
+ unsigned long *in_frames = discontig_frames, out_frame = pfn;
+ struct xen_memory_exchange exchange = {
+ .nr_extents = 1UL << order,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ .extent_order = order,
+ .address_bits = address_bits,
+ .domid = DOMID_SELF
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ for (i = 0; i < (1U << order); ++i) {
+ in_frames[i] = pfn_to_mfn(pfn + i);
+ set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY);
+ set_xen_guest_handle(exchange.in.extent_start, in_frames);
+ set_xen_guest_handle(exchange.out.extent_start, &out_frame);
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == (1UL << order));
+ BUG_ON(!success && (exchange.nr_exchanged || !rc));
+ BUG_ON(success && rc);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ /* Compatibility when XENMEM_exchange is unavailable. */
+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &exchange.in) != (1UL << order))
+ success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+ &exchange.out) == 1);
+ for (i = 0; i < (1U << order); ++i)
+ in_frames[i] = pfn + i;
+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+ &exchange.in) != (1UL << order))
+ for (i = 0; i < (1U << order); ++i, ++out_frame) {
+ out_frame = in_frames[i];
+ set_phys_to_machine(pfn + i, out_frame);
+ return success ? 0 : -ENOMEM;
static void undo_limit_pages(struct page *pages, unsigned int order)
BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
@@ -875,42 +941,9 @@ int write_ldt_entry(struct desc_struct *
return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
-#define MAX_BATCHED_FULL_PTES 32
-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
- int rc = 0, i = 0;
- mmu_update_t u[MAX_BATCHED_FULL_PTES];
- if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (pte_present(*pte)) {
- pte_t ptent = pte_modify(*pte, newprot);
- if (dirty_accountable && pte_dirty(ptent))
- ptent = pte_mkwrite(ptent);
- u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
- | ((unsigned long)pte & ~PAGE_MASK)
- | MMU_PT_UPDATE_PRESERVE_AD;
- u[i].val = __pte_val(ptent);
- if (++i == MAX_BATCHED_FULL_PTES) {
- if ((rc = HYPERVISOR_mmu_update(
- &u[0], i, NULL, DOMID_SELF)) != 0)
- } while (pte++, addr += PAGE_SIZE, addr != end);
- rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
- pte_unmap_unlock(pte - 1, ptl);
- BUG_ON(rc && rc != -ENOSYS);
+ maddr_t mach_gp = virt_to_machine(gdt + entry);
+ return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
18189 --- sle11-2009-06-04.orig/arch/x86/mm/init_32-xen.c 2009-03-16 16:38:05.000000000 +0100
18190 +++ sle11-2009-06-04/arch/x86/mm/init_32-xen.c 2009-06-04 10:21:39.000000000 +0200
18193 unsigned int __VMALLOC_RESERVE = 128 << 20;
18195 +unsigned long max_low_pfn_mapped;
18196 unsigned long max_pfn_mapped;
18198 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18199 @@ -61,6 +62,27 @@ unsigned long highstart_pfn, highend_pfn
18201 static noinline int do_test_wp_bit(void);
18204 +static unsigned long __initdata table_start;
18205 +static unsigned long __initdata table_end;
18206 +static unsigned long __initdata table_top;
18208 +static int __initdata after_init_bootmem;
18210 +static __init void *alloc_low_page(unsigned long *phys)
18212 + unsigned long pfn = table_end++;
18215 + if (pfn >= table_top)
18216 + panic("alloc_low_page: ran out of memory");
18218 + adr = __va(pfn * PAGE_SIZE);
18219 + memset(adr, 0, PAGE_SIZE);
18220 + *phys = pfn * PAGE_SIZE;
18225 * Creates a middle page table and puts a pointer to it in the
18226 * given global directory entry. This only returns the gd entry
18227 @@ -72,9 +94,12 @@ static pmd_t * __init one_md_table_init(
18230 #ifdef CONFIG_X86_PAE
18231 + unsigned long phys;
18232 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
18233 - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18235 + if (after_init_bootmem)
18236 + pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18238 + pmd_table = (pmd_t *)alloc_low_page(&phys);
18239 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
18240 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
18241 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
18242 @@ -101,12 +126,16 @@ static pte_t * __init one_page_table_ini
18244 pte_t *page_table = NULL;
18246 + if (after_init_bootmem) {
18247 #ifdef CONFIG_DEBUG_PAGEALLOC
18248 - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18249 + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18251 - if (!page_table) {
18255 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18257 + unsigned long phys;
18258 + page_table = (pte_t *)alloc_low_page(&phys);
18261 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
18262 @@ -167,24 +196,24 @@ static inline int is_kernel_text(unsigne
18263 * of max_low_pfn pages, by creating page tables starting from address
18266 -static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
18267 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
18268 + unsigned long start_pfn,
18269 + unsigned long end_pfn,
18272 int pgd_idx, pmd_idx, pte_ofs;
18277 + unsigned pages_2m = 0, pages_4k = 0;
18279 - unsigned long max_ram_pfn = xen_start_info->nr_pages;
18280 - if (max_ram_pfn > max_low_pfn)
18281 - max_ram_pfn = max_low_pfn;
18282 + if (!cpu_has_pse)
18285 - pgd_idx = pgd_index(PAGE_OFFSET);
18287 + pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18288 pgd = pgd_base + pgd_idx;
18290 - pmd_idx = pmd_index(PAGE_OFFSET);
18291 - pte_ofs = pte_index(PAGE_OFFSET);
18293 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
18296 @@ -198,10 +227,16 @@ static void __init kernel_physical_mappi
18298 pmd = one_md_table_init(pgd);
18300 - if (pfn >= max_low_pfn)
18302 + if (pfn >= end_pfn)
18304 +#ifdef CONFIG_X86_PAE
18305 + pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18307 - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
18311 + for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
18312 pmd++, pmd_idx++) {
18313 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
18315 @@ -211,13 +246,8 @@ static void __init kernel_physical_mappi
18317 * Map with big pages if possible, otherwise
18318 * create normal page tables:
18320 - * Don't use a large page for the first 2/4MB of memory
18321 - * because there are often fixed size MTRRs in there
18322 - * and overlapping MTRRs into large pages can cause
18325 - if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
18327 unsigned int addr2;
18328 pgprot_t prot = PAGE_KERNEL_LARGE;
18330 @@ -228,49 +258,35 @@ static void __init kernel_physical_mappi
18331 is_kernel_text(addr2))
18332 prot = PAGE_KERNEL_LARGE_EXEC;
18335 set_pmd(pmd, pfn_pmd(pfn, prot));
18337 pfn += PTRS_PER_PTE;
18338 - max_pfn_mapped = pfn;
18341 pte = one_page_table_init(pmd);
18343 - for (pte += pte_ofs;
18344 - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
18345 + pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18347 + for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
18348 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
18349 pgprot_t prot = PAGE_KERNEL;
18351 /* XEN: Only map initial RAM allocation. */
18352 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
18353 + if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
18355 if (is_kernel_text(addr))
18356 prot = PAGE_KERNEL_EXEC;
18359 set_pte(pte, pfn_pte(pfn, prot));
18361 - max_pfn_mapped = pfn;
18366 + update_page_count(PG_LEVEL_2M, pages_2m);
18367 + update_page_count(PG_LEVEL_4K, pages_4k);
18370 -#ifndef CONFIG_XEN
18372 -static inline int page_kills_ppro(unsigned long pagenr)
18374 - if (pagenr >= 0x70000 && pagenr <= 0x7003F)
18381 -#define page_kills_ppro(p) 0
18386 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
18387 * is valid. The argument is a physical page number.
18388 @@ -331,30 +347,63 @@ static void __init permanent_kmaps_init(
18389 pkmap_page_table = pte;
18392 -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
18393 +static void __init add_one_highpage_init(struct page *page, int pfn)
18395 - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
18396 - ClearPageReserved(page);
18397 - init_page_count(page);
18398 - if (pfn < xen_start_info->nr_pages)
18399 - __free_page(page);
18400 - totalhigh_pages++;
18402 - SetPageReserved(page);
18403 + ClearPageReserved(page);
18404 + init_page_count(page);
18405 + if (pfn < xen_start_info->nr_pages)
18406 + __free_page(page);
18407 + totalhigh_pages++;
18410 +struct add_highpages_data {
18411 + unsigned long start_pfn;
18412 + unsigned long end_pfn;
18415 +static int __init add_highpages_work_fn(unsigned long start_pfn,
18416 + unsigned long end_pfn, void *datax)
18419 + struct page *page;
18420 + unsigned long final_start_pfn, final_end_pfn;
18421 + struct add_highpages_data *data;
18423 + data = (struct add_highpages_data *)datax;
18425 + final_start_pfn = max(start_pfn, data->start_pfn);
18426 + final_end_pfn = min(end_pfn, data->end_pfn);
18427 + if (final_start_pfn >= final_end_pfn)
18430 + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
18432 + if (!pfn_valid(node_pfn))
18434 + page = pfn_to_page(node_pfn);
18435 + add_one_highpage_init(page, node_pfn);
18442 +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
18443 + unsigned long end_pfn)
18445 + struct add_highpages_data data;
18447 + data.start_pfn = start_pfn;
18448 + data.end_pfn = end_pfn;
18450 + work_with_active_regions(nid, add_highpages_work_fn, &data);
18453 #ifndef CONFIG_NUMA
18454 -static void __init set_highmem_pages_init(int bad_ppro)
18455 +static void __init set_highmem_pages_init(void)
18458 + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
18460 - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
18462 - * Holes under sparsemem might not have no mem_map[]:
18464 - if (pfn_valid(pfn))
18465 - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
18467 totalram_pages += totalhigh_pages;
18469 #endif /* !CONFIG_NUMA */
18470 @@ -362,24 +411,11 @@ static void __init set_highmem_pages_ini
18472 # define kmap_init() do { } while (0)
18473 # define permanent_kmaps_init(pgd_base) do { } while (0)
18474 -# define set_highmem_pages_init(bad_ppro) do { } while (0)
18475 +# define set_highmem_pages_init() do { } while (0)
18476 #endif /* CONFIG_HIGHMEM */
18478 -pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
18479 -EXPORT_SYMBOL(__PAGE_KERNEL);
18481 -pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
18483 pgd_t *swapper_pg_dir;
18485 -static void __init xen_pagetable_setup_start(pgd_t *base)
18489 -static void __init xen_pagetable_setup_done(pgd_t *base)
18494 * Build a proper pagetable for the kernel mappings. Up until this
18495 * point, we've been running on some set of pagetables constructed by
18496 @@ -399,27 +435,10 @@ static void __init xen_pagetable_setup_d
18497 * be partially populated, and so it avoids stomping on any existing
18500 -static void __init pagetable_init(void)
18501 +static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
18503 - pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18504 unsigned long vaddr, end;
18506 - xen_pagetable_setup_start(pgd_base);
18508 - /* Enable PSE if available */
18510 - set_in_cr4(X86_CR4_PSE);
18512 - /* Enable PGE if available */
18513 - if (cpu_has_pge) {
18514 - set_in_cr4(X86_CR4_PGE);
18515 - __PAGE_KERNEL |= _PAGE_GLOBAL;
18516 - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
18519 - kernel_physical_mapping_init(pgd_base);
18520 - remap_numa_kva();
18523 * Fixed mappings, only the page table structure has to be
18524 * created - mappings will be set by set_fixmap():
18525 @@ -429,10 +448,13 @@ static void __init pagetable_init(void)
18526 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
18527 page_table_range_init(vaddr, end, pgd_base);
18528 early_ioremap_reset();
18531 - permanent_kmaps_init(pgd_base);
18532 +static void __init pagetable_init(void)
18534 + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18536 - xen_pagetable_setup_done(pgd_base);
18537 + permanent_kmaps_init(pgd_base);
18540 #if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
18541 @@ -475,7 +497,7 @@ void zap_low_mappings(void)
18545 -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
18546 +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
18547 EXPORT_SYMBOL_GPL(__supported_pte_mask);
18549 #ifdef CONFIG_X86_PAE
18550 @@ -528,42 +550,364 @@ static void __init set_nx(void)
18554 +/* user-defined highmem size */
18555 +static unsigned int highmem_pages = -1;
18558 - * paging_init() sets up the page tables - note that the first 8MB are
18559 - * already mapped by head.S.
18561 - * This routines also unmaps the page at virtual kernel address 0, so
18562 - * that we can trap those pesky NULL-reference errors in the kernel.
18563 + * highmem=size forces highmem to be exactly 'size' bytes.
18564 + * This works even on boxes that have no highmem otherwise.
18565 + * This also works to reduce highmem size on bigger boxes.
18567 -void __init paging_init(void)
18568 +static int __init parse_highmem(char *arg)
18573 + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
18576 +early_param("highmem", parse_highmem);
18579 + * Determine low and high memory ranges:
18581 +void __init find_low_pfn_range(void)
18583 + /* it could update max_pfn */
18585 + /* max_low_pfn is 0, we already have early_res support */
18587 + max_low_pfn = max_pfn;
18588 + if (max_low_pfn > MAXMEM_PFN) {
18589 + if (highmem_pages == -1)
18590 + highmem_pages = max_pfn - MAXMEM_PFN;
18591 + if (highmem_pages + MAXMEM_PFN < max_pfn)
18592 + max_pfn = MAXMEM_PFN + highmem_pages;
18593 + if (highmem_pages + MAXMEM_PFN > max_pfn) {
18594 + printk(KERN_WARNING "only %luMB highmem pages "
18595 + "available, ignoring highmem size of %uMB.\n",
18596 + pages_to_mb(max_pfn - MAXMEM_PFN),
18597 + pages_to_mb(highmem_pages));
18598 + highmem_pages = 0;
18600 + max_low_pfn = MAXMEM_PFN;
18601 +#ifndef CONFIG_HIGHMEM
18602 + /* Maximum memory usable is what is directly addressable */
18603 + printk(KERN_WARNING "Warning only %ldMB will be used.\n",
18605 + if (max_pfn > MAX_NONPAE_PFN)
18606 + printk(KERN_WARNING
18607 + "Use a HIGHMEM64G enabled kernel.\n");
18609 + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
18610 + max_pfn = MAXMEM_PFN;
18611 +#else /* !CONFIG_HIGHMEM */
18612 +#ifndef CONFIG_HIGHMEM64G
18613 + if (max_pfn > MAX_NONPAE_PFN) {
18614 + max_pfn = MAX_NONPAE_PFN;
18615 + printk(KERN_WARNING "Warning only 4GB will be used."
18616 + "Use a HIGHMEM64G enabled kernel.\n");
18618 +#endif /* !CONFIG_HIGHMEM64G */
18619 +#endif /* !CONFIG_HIGHMEM */
18621 + if (highmem_pages == -1)
18622 + highmem_pages = 0;
18623 +#ifdef CONFIG_HIGHMEM
18624 + if (highmem_pages >= max_pfn) {
18625 + printk(KERN_ERR "highmem size specified (%uMB) is "
18626 + "bigger than pages available (%luMB)!.\n",
18627 + pages_to_mb(highmem_pages),
18628 + pages_to_mb(max_pfn));
18629 + highmem_pages = 0;
18631 + if (highmem_pages) {
18632 + if (max_low_pfn - highmem_pages <
18633 + 64*1024*1024/PAGE_SIZE){
18634 + printk(KERN_ERR "highmem size %uMB results in "
18635 + "smaller than 64MB lowmem, ignoring it.\n"
18636 + , pages_to_mb(highmem_pages));
18637 + highmem_pages = 0;
18639 + max_low_pfn -= highmem_pages;
18642 + if (highmem_pages)
18643 + printk(KERN_ERR "ignoring highmem size on non-highmem"
18649 +#ifndef CONFIG_NEED_MULTIPLE_NODES
18650 +void __init initmem_init(unsigned long start_pfn,
18651 + unsigned long end_pfn)
18653 +#ifdef CONFIG_HIGHMEM
18654 + highstart_pfn = highend_pfn = max_pfn;
18655 + if (max_pfn > max_low_pfn)
18656 + highstart_pfn = max_low_pfn;
18657 + memory_present(0, 0, highend_pfn);
18658 + e820_register_active_regions(0, 0, highend_pfn);
18659 + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
18660 + pages_to_mb(highend_pfn - highstart_pfn));
18661 + num_physpages = highend_pfn;
18662 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
18664 + memory_present(0, 0, max_low_pfn);
18665 + e820_register_active_regions(0, 0, max_low_pfn);
18666 + num_physpages = max_low_pfn;
18667 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
18669 +#ifdef CONFIG_FLATMEM
18670 + max_mapnr = num_physpages;
18672 + printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
18673 + pages_to_mb(max_low_pfn));
18675 + setup_bootmem_allocator();
18677 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
18679 +static void __init zone_sizes_init(void)
18681 + unsigned long max_zone_pfns[MAX_NR_ZONES];
18682 + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
18683 + max_zone_pfns[ZONE_DMA] =
18684 + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
18685 + max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
18686 +#ifdef CONFIG_HIGHMEM
18687 + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
18690 + free_area_init_nodes(max_zone_pfns);
18693 +void __init setup_bootmem_allocator(void)
18696 + unsigned long bootmap_size, bootmap;
18697 + unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
18700 + * Initialize the boot-time allocator (with low memory only):
18702 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
18703 + bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
18704 + min(max_pfn_mapped, xen_start_info->nr_pages)<<PAGE_SHIFT,
18705 + bootmap_size, PAGE_SIZE);
18706 + if (bootmap == -1L)
18707 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
18708 + reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
18710 + /* don't touch min_low_pfn */
18711 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
18712 + min_low_pfn, end_pfn);
18713 + printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
18714 + max_pfn_mapped<<PAGE_SHIFT);
18715 + printk(KERN_INFO " low ram: %08lx - %08lx\n",
18716 + min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
18717 + printk(KERN_INFO " bootmap %08lx - %08lx\n",
18718 + bootmap, bootmap + bootmap_size);
18719 + for_each_online_node(i)
18720 + free_bootmem_with_active_regions(i, end_pfn);
18721 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
18723 + after_init_bootmem = 1;
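/*
 * Sketch of the sizing rule behind bootmem_bootmap_pages() as used in
 * setup_bootmem_allocator() above (an approximation, not the in-tree
 * implementation): the boot-time allocator keeps one bit per pfn, and
 * the bitmap itself is rounded up to whole pages so it can be reserved
 * with page granularity.
 */
static unsigned long bootmap_pages_needed(unsigned long pages)
{
	unsigned long bytes = (pages + 7) / 8;		/* one bit per page */

	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;	/* whole map pages */
}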
18726 +static unsigned long __init extend_init_mapping(unsigned long tables_space)
18728 + unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
18729 + + xen_start_info->nr_pt_frames;
18730 + unsigned long start = start_pfn, va = (unsigned long)&_text;
18736 + /* Ensure init mappings cover kernel text/data and initial tables. */
18737 + while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
18738 + pgd = pgd_offset_k(va);
18739 + pud = pud_offset(pgd, va);
18740 + pmd = pmd_offset(pud, va);
18741 + if (pmd_none(*pmd)) {
18742 + unsigned long pa = start_pfn++ << PAGE_SHIFT;
18744 + memset(__va(pa), 0, PAGE_SIZE);
18745 + make_lowmem_page_readonly(__va(pa),
18746 + XENFEAT_writable_page_tables);
18747 + xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
18749 + pte = pte_offset_kernel(pmd, va);
18750 + if (pte_none(*pte)) {
18751 + pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
18753 + if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
18759 + /* Finally, blow away any spurious initial mappings. */
18761 + pgd = pgd_offset_k(va);
18762 + pud = pud_offset(pgd, va);
18763 + pmd = pmd_offset(pud, va);
18764 + if (pmd_none(*pmd))
18766 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
18771 + if (start_pfn > start)
18772 + reserve_early(start << PAGE_SHIFT,
18773 + start_pfn << PAGE_SHIFT, "INITMAP");
18775 + return start_pfn;
18778 +static void __init find_early_table_space(unsigned long end)
18780 + unsigned long puds, pmds, ptes, tables;
18782 + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
18783 + tables = PAGE_ALIGN(puds * sizeof(pud_t));
18785 + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
18786 + tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
18788 + if (cpu_has_pse) {
18789 + unsigned long extra;
18791 + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
18792 + extra += PMD_SIZE;
18793 + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
18795 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
18797 + tables += PAGE_ALIGN(ptes * sizeof(pte_t));
18800 + tables += PAGE_SIZE
18801 + * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK)
18802 + - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK))
18805 + table_start = extend_init_mapping(tables);
18807 + table_end = table_start;
18808 + table_top = table_start + (tables>>PAGE_SHIFT);
18810 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
18811 + end, table_start << PAGE_SHIFT,
18812 + (table_start << PAGE_SHIFT) + tables);
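/*
 * The cpu_has_pse branch above deserves a note: with 2 MiB/4 MiB pages
 * doing the bulk of the mapping, 4k pte pages are only needed for the
 * unaligned tail of the range, plus one extra PMD's worth in case the
 * head is unaligned too.  A condensed sketch (illustrative only):
 */
static unsigned long pte_slots_needed(unsigned long end, int have_pse)
{
	if (have_pse) {
		unsigned long extra = end - ((end >> PMD_SHIFT) << PMD_SHIFT);

		extra += PMD_SIZE;	/* allow for an unaligned head */
		return (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	}
	return (end + PAGE_SIZE - 1) >> PAGE_SHIFT;	/* a pte per page */
}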
18815 +unsigned long __init_refok init_memory_mapping(unsigned long start,
18816 + unsigned long end)
18818 + pgd_t *pgd_base = swapper_pg_dir;
18819 + unsigned long start_pfn, end_pfn;
18820 + unsigned long big_page_start;
18823 + * Find space for the kernel direct mapping tables.
18825 + if (!after_init_bootmem)
18826 + find_early_table_space(end);
18828 #ifdef CONFIG_X86_PAE
18831 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
18834 + /* Enable PSE if available */
18836 + set_in_cr4(X86_CR4_PSE);
18838 + /* Enable PGE if available */
18839 + if (cpu_has_pge) {
18840 + set_in_cr4(X86_CR4_PGE);
18841 + __supported_pte_mask |= _PAGE_GLOBAL;
18845 + * Don't use a large page for the first 2/4MB of memory
18846 + * because there are often fixed size MTRRs in there
18847 + * and overlapping MTRRs into large pages can cause
18850 + big_page_start = PMD_SIZE;
18852 + if (start < big_page_start) {
18853 + start_pfn = start >> PAGE_SHIFT;
18854 + end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
18856 + /* head is not big-page aligned? */
18857 + start_pfn = start >> PAGE_SHIFT;
18858 + end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18859 + << (PMD_SHIFT - PAGE_SHIFT);
18861 + if (start_pfn < end_pfn)
18862 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
18864 + /* big page range */
18865 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18866 + << (PMD_SHIFT - PAGE_SHIFT);
18867 + if (start_pfn < (big_page_start >> PAGE_SHIFT))
18868 + start_pfn = big_page_start >> PAGE_SHIFT;
18869 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
18870 + if (start_pfn < end_pfn)
18871 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
18874 + /* tail is not big-page aligned? */
18875 + start_pfn = end_pfn;
18876 + if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
18877 + end_pfn = end >> PAGE_SHIFT;
18878 + if (start_pfn < end_pfn)
18879 + kernel_physical_mapping_init(pgd_base, start_pfn,
18883 + early_ioremap_page_table_range_init(pgd_base);
18885 + __flush_tlb_all();
18887 + if (!after_init_bootmem)
18888 + reserve_early(table_start << PAGE_SHIFT,
18889 + table_end << PAGE_SHIFT, "PGTABLE");
18891 + if (!after_init_bootmem)
18892 + early_memtest(start, end);
18894 + return end >> PAGE_SHIFT;
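/*
 * Sketch of the three-way split performed above (helper and out-params
 * are hypothetical): a [start, end) range is mapped as an unaligned 4k
 * head, a PMD-aligned large-page body, and an unaligned 4k tail.
 */
static void split_on_pmd(unsigned long start, unsigned long end,
			 unsigned long *head_end, unsigned long *body_end)
{
	*head_end = (start + PMD_SIZE - 1) & PMD_MASK;	/* round up to 2M/4M */
	if (*head_end > end)
		*head_end = end;
	*body_end = end & PMD_MASK;			/* round down to 2M/4M */
	if (*body_end < *head_end)
		*body_end = *head_end;
	/*
	 * [start, head_end) and [body_end, end) are mapped with 4k pages,
	 * [head_end, body_end) with large pages where the CPU allows it.
	 */
}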
18899 + * paging_init() sets up the page tables - note that the first 8MB are
18900 + * already mapped by head.S.
19902 + * This routine also unmaps the page at virtual kernel address 0, so
18903 + * that we can trap those pesky NULL-reference errors in the kernel.
18905 +void __init paging_init(void)
18913 - /* Switch to the real shared_info page, and clear the
18915 - set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
18916 - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
18917 - memset(empty_zero_page, 0, sizeof(empty_zero_page));
18919 - /* Setup mapping of lower 1st MB */
18920 - for (i = 0; i < NR_FIX_ISAMAPS; i++)
18921 - if (is_initial_xendomain())
18922 - set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
18924 - __set_fixmap(FIX_ISAMAP_BEGIN - i,
18925 - virt_to_machine(empty_zero_page),
18928 + * NOTE: at this point the bootmem allocator is fully available.
18931 + zone_sizes_init();
18935 @@ -598,7 +942,7 @@ static struct kcore_list kcore_mem, kcor
18936 void __init mem_init(void)
18938 int codesize, reservedpages, datasize, initsize;
18939 - int tmp, bad_ppro;
18944 @@ -606,19 +950,6 @@ void __init mem_init(void)
18945 #ifdef CONFIG_FLATMEM
18948 - bad_ppro = ppro_with_ram_bug();
18950 -#ifdef CONFIG_HIGHMEM
18951 - /* check that fixmap and pkmap do not overlap */
18952 - if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
18954 - "fixmap and kmap areas overlap - this will crash\n");
18955 - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
18956 - PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
18961 /* this will put all low memory onto the freelists */
18962 totalram_pages += free_all_bootmem();
18963 /* XEN: init and count low-mem pages outside initial allocation. */
18964 @@ -636,7 +967,7 @@ void __init mem_init(void)
18965 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
18968 - set_highmem_pages_init(bad_ppro);
18969 + set_highmem_pages_init();
18971 codesize = (unsigned long) &_etext - (unsigned long) &_text;
18972 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
18973 @@ -657,7 +988,6 @@ void __init mem_init(void)
18974 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
18977 -#if 1 /* double-sanity-check paranoia */
18978 printk(KERN_INFO "virtual kernel memory layout:\n"
18979 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
18980 #ifdef CONFIG_HIGHMEM
18981 @@ -698,7 +1028,6 @@ void __init mem_init(void)
18983 BUG_ON(VMALLOC_START > VMALLOC_END);
18984 BUG_ON((unsigned long)high_memory > VMALLOC_START);
18985 -#endif /* double-sanity-check paranoia */
18987 if (boot_cpu_data.wp_works_ok < 0)
18989 @@ -755,6 +1084,8 @@ void mark_rodata_ro(void)
18990 unsigned long start = PFN_ALIGN(_text);
18991 unsigned long size = PFN_ALIGN(_etext) - start;
18993 +#ifndef CONFIG_DYNAMIC_FTRACE
18994 + /* Dynamic tracing modifies the kernel text section */
18995 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
18996 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
18998 @@ -767,6 +1098,8 @@ void mark_rodata_ro(void)
18999 printk(KERN_INFO "Testing CPA: write protecting again\n");
19000 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
19002 +#endif /* CONFIG_DYNAMIC_FTRACE */
19005 size = (unsigned long)__end_rodata - start;
19006 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
19007 @@ -829,3 +1162,9 @@ void free_initrd_mem(unsigned long start
19008 free_init_pages("initrd memory", start, end);
19012 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
19015 + return reserve_bootmem(phys, len, flags);
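/*
 * The int return type matters to callers that reserve with
 * BOOTMEM_EXCLUSIVE and must detect an already-claimed range; the old
 * void version could not report that.  A hypothetical caller (sketch,
 * not from the patch):
 */
static int __init claim_exclusive_range(unsigned long base, unsigned long size)
{
	int ret = reserve_bootmem_generic(base, size, BOOTMEM_EXCLUSIVE);

	if (ret < 0)
		printk(KERN_ERR "range %lx-%lx already reserved\n",
		       base, base + size - 1);
	return ret;
}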
19017 --- sle11-2009-06-04.orig/arch/x86/mm/init_64-xen.c 2009-03-16 16:38:05.000000000 +0100
19018 +++ sle11-2009-06-04/arch/x86/mm/init_64-xen.c 2009-06-04 10:21:39.000000000 +0200
19020 #include <linux/swap.h>
19021 #include <linux/smp.h>
19022 #include <linux/init.h>
19023 +#include <linux/initrd.h>
19024 #include <linux/pagemap.h>
19025 #include <linux/bootmem.h>
19026 #include <linux/proc_fs.h>
19029 #include <xen/features.h>
19032 + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
19033 + * The direct mapping extends to max_pfn_mapped, so that we can directly access
19034 + * apertures, ACPI and other tables without having to play with fixmaps.
19036 +unsigned long max_low_pfn_mapped;
19037 +unsigned long max_pfn_mapped;
19039 #if CONFIG_XEN_COMPAT <= 0x030002
19040 unsigned int __kernel_page_user;
19041 EXPORT_SYMBOL(__kernel_page_user);
19042 @@ -60,13 +69,12 @@ EXPORT_SYMBOL(__kernel_page_user);
19045 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
19046 -extern unsigned long start_pfn;
19048 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
19049 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
19052 -int direct_gbpages __meminitdata
19053 +int direct_gbpages
19054 #ifdef CONFIG_DIRECT_GBPAGES
19057 @@ -145,55 +153,23 @@ void __meminit early_make_page_readonly(
19058 * around without checking the pgd every time.
19061 -void show_mem(void)
19063 - long i, total = 0, reserved = 0;
19064 - long shared = 0, cached = 0;
19065 - struct page *page;
19066 - pg_data_t *pgdat;
19068 - printk(KERN_INFO "Mem-info:\n");
19069 - show_free_areas();
19070 - for_each_online_pgdat(pgdat) {
19071 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
19073 - * This loop can take a while with 256 GB and
19074 - * 4k pages so defer the NMI watchdog:
19076 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
19077 - touch_nmi_watchdog();
19079 - if (!pfn_valid(pgdat->node_start_pfn + i))
19082 - page = pfn_to_page(pgdat->node_start_pfn + i);
19084 - if (PageReserved(page))
19086 - else if (PageSwapCache(page))
19088 - else if (page_count(page))
19089 - shared += page_count(page) - 1;
19092 - printk(KERN_INFO "%lu pages of RAM\n", total);
19093 - printk(KERN_INFO "%lu reserved pages\n", reserved);
19094 - printk(KERN_INFO "%lu pages shared\n", shared);
19095 - printk(KERN_INFO "%lu pages swap cached\n", cached);
19098 static unsigned long __meminitdata table_start;
19099 -static unsigned long __meminitdata table_end;
19100 +static unsigned long __meminitdata table_cur;
19101 +static unsigned long __meminitdata table_top;
19103 -static __init void *spp_getpage(void)
19105 + * NOTE: This function is marked __ref because it calls the __init function
19106 + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
19108 +static __ref void *spp_getpage(void)
19113 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
19114 - else if (start_pfn < table_end) {
19115 - ptr = __va(start_pfn << PAGE_SHIFT);
19117 + else if (table_cur < table_top) {
19118 + ptr = __va(table_cur << PAGE_SHIFT);
19120 memset(ptr, 0, PAGE_SIZE);
19122 ptr = alloc_bootmem_pages(PAGE_SIZE);
19123 @@ -208,30 +184,18 @@ static __init void *spp_getpage(void)
19127 -#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
19128 -#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
19130 -static __init void
19131 -set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
19133 +set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
19138 - pte_t *pte, new_pte;
19140 - pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
19143 - pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
19144 - if (pgd_none(*pgd)) {
19146 - "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19149 - pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
19150 + pud = pud_page + pud_index(vaddr);
19151 if (pud_none(*pud)) {
19152 pmd = (pmd_t *) spp_getpage();
19153 make_page_readonly(pmd, XENFEAT_writable_page_tables);
19154 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19155 + pud_populate(&init_mm, pud, pmd);
19156 if (pmd != pmd_offset(pud, 0)) {
19157 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19158 pmd, pmd_offset(pud, 0));
19159 @@ -242,19 +206,20 @@ set_pte_phys(unsigned long vaddr, unsign
19160 if (pmd_none(*pmd)) {
19161 pte = (pte_t *) spp_getpage();
19162 make_page_readonly(pte, XENFEAT_writable_page_tables);
19163 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19164 + pmd_populate_kernel(&init_mm, pmd, pte);
19165 if (pte != pte_offset_kernel(pmd, 0)) {
19166 printk(KERN_ERR "PAGETABLE BUG #02!\n");
19170 - if (pgprot_val(prot))
19171 - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
19173 - new_pte = __pte(0);
19175 pte = pte_offset_kernel(pmd, vaddr);
19176 if (!pte_none(*pte) && __pte_val(new_pte) &&
19177 +#ifdef CONFIG_ACPI
19178 + /* __acpi_map_table() fails to properly call clear_fixmap() */
19179 + (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19180 + vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19181 +#endif
19182 __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19184 set_pte(pte, new_pte);
19185 @@ -266,15 +231,13 @@ set_pte_phys(unsigned long vaddr, unsign
19186 __flush_tlb_one(vaddr);
19189 -static __init void
19190 -set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
19192 +set_pte_vaddr(unsigned long vaddr, pte_t pteval)
19197 - pte_t *pte, new_pte;
19200 - pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
19201 + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
19203 pgd = pgd_offset_k(vaddr);
19204 if (pgd_none(*pgd)) {
19205 @@ -282,47 +245,51 @@ set_pte_phys_ma(unsigned long vaddr, uns
19206 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19209 - pud = pud_offset(pgd, vaddr);
19210 - if (pud_none(*pud)) {
19211 - pmd = (pmd_t *) spp_getpage();
19212 - make_page_readonly(pmd, XENFEAT_writable_page_tables);
19213 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19214 - if (pmd != pmd_offset(pud, 0)) {
19215 - printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19216 - pmd, pmd_offset(pud, 0));
19217 + pud_page = (pud_t*)pgd_page_vaddr(*pgd);
19218 + set_pte_vaddr_pud(pud_page, vaddr, pteval);
19221 +#ifndef CONFIG_XEN
19223 + * Create large page table mappings for a range of physical addresses.
19225 +static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
19232 + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
19233 + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
19234 + pgd = pgd_offset_k((unsigned long)__va(phys));
19235 + if (pgd_none(*pgd)) {
19236 + pud = (pud_t *) spp_getpage();
19237 + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
19241 - pmd = pmd_offset(pud, vaddr);
19242 - if (pmd_none(*pmd)) {
19243 - pte = (pte_t *) spp_getpage();
19244 - make_page_readonly(pte, XENFEAT_writable_page_tables);
19245 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19246 - if (pte != pte_offset_kernel(pmd, 0)) {
19247 - printk(KERN_ERR "PAGETABLE BUG #02!\n");
19249 + pud = pud_offset(pgd, (unsigned long)__va(phys));
19250 + if (pud_none(*pud)) {
19251 + pmd = (pmd_t *) spp_getpage();
19252 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
19255 + pmd = pmd_offset(pud, phys);
19256 + BUG_ON(!pmd_none(*pmd));
19257 + set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
19259 - new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
19262 - pte = pte_offset_kernel(pmd, vaddr);
19263 - if (!pte_none(*pte) && __pte_val(new_pte) &&
19264 -#ifdef CONFIG_ACPI
19265 - /* __acpi_map_table() fails to properly call clear_fixmap() */
19266 - (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19267 - vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19269 - __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19271 - set_pte(pte, new_pte);
19272 +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
19274 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
19278 - * It's enough to flush this one mapping.
19279 - * (PGE mappings get flushed as well)
19281 - __flush_tlb_one(vaddr);
19282 +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
19284 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
19287 -#ifndef CONFIG_XEN
19289 * The head.S code sets up the kernel high mapping:
19291 @@ -352,63 +319,52 @@ void __init cleanup_highmap(void)
19295 -/* NOTE: this is meant to be run only at boot */
19296 -void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
19298 - unsigned long address = __fix_to_virt(idx);
19300 - if (idx >= __end_of_fixed_addresses) {
19301 - printk(KERN_ERR "Invalid __set_fixmap\n");
19305 - case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
19306 - set_pte_phys(address, phys, prot, 0);
19307 - set_pte_phys(address, phys, prot, 1);
19309 - case FIX_EARLYCON_MEM_BASE:
19310 - xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
19311 - pfn_pte_ma(phys >> PAGE_SHIFT, prot));
19314 - set_pte_phys_ma(address, phys, prot);
19319 -static __meminit void *alloc_static_page(unsigned long *phys)
19320 +static __ref void *alloc_low_page(unsigned long *phys)
19322 - unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
19323 + unsigned long pfn;
19326 if (after_bootmem) {
19327 - void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
19328 + adr = (void *)get_zeroed_page(GFP_ATOMIC);
19334 - *phys = start_pfn << PAGE_SHIFT;
19336 - memset((void *)va, 0, PAGE_SIZE);
19337 - return (void *)va;
19338 + BUG_ON(!table_cur);
19339 + pfn = table_cur++;
19340 + if (pfn >= table_top)
19341 + panic("alloc_low_page: ran out of memory");
19343 + adr = early_ioremap(pfn_to_mfn(pfn) * PAGE_SIZE, PAGE_SIZE);
19344 + memset(adr, 0, PAGE_SIZE);
19345 + *phys = pfn * PAGE_SIZE;
19349 -#define PTE_SIZE PAGE_SIZE
19350 +static __ref void unmap_low_page(void *adr)
19352 + if (after_bootmem)
19355 + early_iounmap(adr, PAGE_SIZE);
19358 static inline int __meminit make_readonly(unsigned long paddr)
19360 extern char __vsyscall_0;
19363 - /* Make new page tables read-only. */
19364 + /* Make new page tables read-only on the first pass. */
19365 if (!xen_feature(XENFEAT_writable_page_tables)
19366 + && !max_pfn_mapped
19367 && (paddr >= (table_start << PAGE_SHIFT))
19368 - && (paddr < (table_end << PAGE_SHIFT)))
19369 + && (paddr < (table_top << PAGE_SHIFT)))
19371 /* Make old page tables read-only. */
19372 if (!xen_feature(XENFEAT_writable_page_tables)
19373 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
19374 - && (paddr < (start_pfn << PAGE_SHIFT)))
19375 + && (paddr < (table_cur << PAGE_SHIFT)))
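/*
 * Condensed sketch of the two tests above; pt_base_phys is shorthand
 * (hypothetical) for xen_start_info->pt_base - __START_KERNEL_map.
 * Without XENFEAT_writable_page_tables, any page that currently holds
 * page tables has to stay read-only in the direct mapping.
 */
static int covers_page_tables(unsigned long paddr)
{
	/* newly allocated tables: only checked on the first pass */
	if (!max_pfn_mapped &&
	    paddr >= (table_start << PAGE_SHIFT) &&
	    paddr <  (table_top << PAGE_SHIFT))
		return 1;
	/* tables the domain builder handed us, up to the cursor */
	return paddr >= pt_base_phys &&
	       paddr <  (table_cur << PAGE_SHIFT);
}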
19379 @@ -425,118 +381,131 @@ static inline int __meminit make_readonl
19383 -#ifndef CONFIG_XEN
19384 -/* Must run before zap_low_mappings */
19385 -__meminit void *early_ioremap(unsigned long addr, unsigned long size)
19386 +static unsigned long __meminit
19387 +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
19389 - pmd_t *pmd, *last_pmd;
19390 - unsigned long vaddr;
19393 - pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19394 - vaddr = __START_KERNEL_map;
19395 - pmd = level2_kernel_pgt;
19396 - last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
19398 - for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
19399 - for (i = 0; i < pmds; i++) {
19400 - if (pmd_present(pmd[i]))
19401 - goto continue_outer_loop;
19403 - vaddr += addr & ~PMD_MASK;
19404 - addr &= PMD_MASK;
19405 + unsigned pages = 0;
19406 + unsigned long last_map_addr = end;
19409 + pte_t *pte = pte_page + pte_index(addr);
19411 + for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
19412 + unsigned long pteval = addr | __PAGE_KERNEL;
19414 - for (i = 0; i < pmds; i++, addr += PMD_SIZE)
19415 - set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
19416 - __flush_tlb_all();
19418 - return (void *)vaddr;
19419 -continue_outer_loop:
19421 + if (addr >= end ||
19422 + (!after_bootmem &&
19423 + (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages))
19426 + if (__pte_val(*pte))
19429 + if (make_readonly(addr))
19430 + pteval &= ~_PAGE_RW;
19432 + printk(" pte=%p addr=%lx pte=%016lx\n",
19433 + pte, addr, pteval);
19434 + if (!after_bootmem)
19435 + *pte = __pte(pteval & __supported_pte_mask);
19436 + else
19437 + set_pte(pte, __pte(pteval & __supported_pte_mask));
19438 + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
19441 - printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
19443 + update_page_count(PG_LEVEL_4K, pages);
19445 + return last_map_addr;
19449 - * To avoid virtual aliases later:
19451 -__meminit void early_iounmap(void *addr, unsigned long size)
19452 +static unsigned long __meminit
19453 +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
19455 - unsigned long vaddr;
19459 - vaddr = (unsigned long)addr;
19460 - pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19461 - pmd = level2_kernel_pgt + pmd_index(vaddr);
19463 - for (i = 0; i < pmds; i++)
19464 - pmd_clear(pmd + i);
19465 + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
19467 - __flush_tlb_all();
19468 + BUG_ON(!max_pfn_mapped);
19469 + return phys_pte_init(pte, address, end);
19473 static unsigned long __meminit
19474 -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
19475 +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
19476 + unsigned long page_size_mask)
19478 + unsigned long pages = 0;
19479 + unsigned long last_map_addr = end;
19480 + unsigned long start = address;
19482 int i = pmd_index(address);
19484 - for (; i < PTRS_PER_PMD; i++) {
19485 + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
19486 unsigned long pte_phys;
19487 - pmd_t *pmd = pmd_page + i;
19488 - pte_t *pte, *pte_save;
19490 + pmd_t *pmd = pmd_page + pmd_index(address);
19493 if (address >= end)
19496 if (__pmd_val(*pmd)) {
19497 - address += PMD_SIZE;
19498 + if (!pmd_large(*pmd)) {
19499 + spin_lock(&init_mm.page_table_lock);
19500 + last_map_addr = phys_pte_update(pmd, address,
19502 + spin_unlock(&init_mm.page_table_lock);
19504 + /* Count entries we're using from level2_ident_pgt */
19510 - pte = alloc_static_page(&pte_phys);
19512 - for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
19513 - unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
19515 - if (address >= (after_bootmem
19517 - : xen_start_info->nr_pages << PAGE_SHIFT))
19519 - else if (make_readonly(address))
19520 - pteval &= ~_PAGE_RW;
19521 - set_pte(pte, __pte(pteval & __supported_pte_mask));
19522 + if (page_size_mask & (1<<PG_LEVEL_2M)) {
19524 + spin_lock(&init_mm.page_table_lock);
19525 + set_pte((pte_t *)pmd,
19526 + pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19527 + spin_unlock(&init_mm.page_table_lock);
19528 + last_map_addr = (address & PMD_MASK) + PMD_SIZE;
19532 + pte = alloc_low_page(&pte_phys);
19533 + last_map_addr = phys_pte_init(pte, address, end);
19534 + unmap_low_page(pte);
19536 if (!after_bootmem) {
19537 - early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19538 - *pmd = __pmd(pte_phys | _KERNPG_TABLE);
19539 + if (max_pfn_mapped)
19540 + make_page_readonly(__va(pte_phys),
19541 + XENFEAT_writable_page_tables);
19542 + *pmd = __pmd(pte_phys | _PAGE_TABLE);
19544 - make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19545 - set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
19546 + make_page_readonly(pte, XENFEAT_writable_page_tables);
19547 + spin_lock(&init_mm.page_table_lock);
19548 + pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
19549 + spin_unlock(&init_mm.page_table_lock);
19553 + update_page_count(PG_LEVEL_2M, pages);
19554 + return last_map_addr;
19557 static unsigned long __meminit
19558 -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
19559 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
19560 + unsigned long page_size_mask)
19562 pmd_t *pmd = pmd_offset(pud, 0);
19563 unsigned long last_map_addr;
19565 - spin_lock(&init_mm.page_table_lock);
19566 - last_map_addr = phys_pmd_init(pmd, address, end);
19567 - spin_unlock(&init_mm.page_table_lock);
19568 + BUG_ON(!max_pfn_mapped);
19569 + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
19571 return last_map_addr;
19574 static unsigned long __meminit
19575 -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
19576 +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
19577 + unsigned long page_size_mask)
19579 + unsigned long pages = 0;
19580 unsigned long last_map_addr = end;
19581 int i = pud_index(addr);
19583 @@ -550,29 +519,55 @@ phys_pud_init(pud_t *pud_page, unsigned
19585 if (__pud_val(*pud)) {
19586 if (!pud_large(*pud))
19587 - last_map_addr = phys_pmd_update(pud, addr, end);
19588 + last_map_addr = phys_pmd_update(pud, addr, end,
19593 - if (direct_gbpages) {
19594 + if (page_size_mask & (1<<PG_LEVEL_1G)) {
19596 + spin_lock(&init_mm.page_table_lock);
19597 set_pte((pte_t *)pud,
19598 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19599 + spin_unlock(&init_mm.page_table_lock);
19600 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
19604 - pmd = alloc_static_page(&pmd_phys);
19606 - spin_lock(&init_mm.page_table_lock);
19607 - *pud = __pud(pmd_phys | _KERNPG_TABLE);
19608 - last_map_addr = phys_pmd_init(pmd, addr, end);
19609 - spin_unlock(&init_mm.page_table_lock);
19610 + pmd = alloc_low_page(&pmd_phys);
19611 + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
19612 + unmap_low_page(pmd);
19614 - early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
19615 + if (!after_bootmem) {
19616 + if (max_pfn_mapped)
19617 + make_page_readonly(__va(pmd_phys),
19618 + XENFEAT_writable_page_tables);
19619 + if (page_size_mask & (1 << PG_LEVEL_NUM))
19620 + xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
19621 + else
19622 + *pud = __pud(pmd_phys | _PAGE_TABLE);
19624 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
19625 + spin_lock(&init_mm.page_table_lock);
19626 + pud_populate(&init_mm, pud, __va(pmd_phys));
19627 + spin_unlock(&init_mm.page_table_lock);
19631 + update_page_count(PG_LEVEL_1G, pages);
19633 - return last_map_addr >> PAGE_SHIFT;
19634 + return last_map_addr;
19637 +static unsigned long __meminit
19638 +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
19639 + unsigned long page_size_mask)
19643 + pud = (pud_t *)pgd_page_vaddr(*pgd);
19645 + return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
19648 void __init xen_init_pt(void)
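/*
 * Sketch of the page_size_mask convention used from here on: one bit
 * per mapping level.  The extra PG_LEVEL_NUM bit is a Xen-only flag
 * that phys_pud_update() sets so the lower levels know the entry is
 * already live and must be updated via hypercall rather than written
 * directly.  Helper name hypothetical:
 */
static unsigned long make_page_size_mask(int has_pse, int has_gbpages)
{
	unsigned long mask = 0;

	if (has_pse)
		mask |= 1UL << PG_LEVEL_2M;
	if (has_gbpages)
		mask |= 1UL << PG_LEVEL_1G;
	return mask;	/* PG_LEVEL_NUM is or-ed in only on update paths */
}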
19649 @@ -651,86 +646,36 @@ void __init xen_init_pt(void)
19653 -static void __init extend_init_mapping(unsigned long tables_space)
19655 - unsigned long va = __START_KERNEL_map;
19656 - unsigned long start = start_pfn;
19657 - unsigned long phys, addr, *pte_page;
19659 - pte_t *pte, new_pte;
19660 - unsigned long *page = (unsigned long *)init_level4_pgt;
19662 - addr = page[pgd_index(va)];
19663 - addr_to_page(addr, page);
19664 - addr = page[pud_index(va)];
19665 - addr_to_page(addr, page);
19667 - /* Kill mapping of low 1MB. */
19668 - while (va < (unsigned long)&_text) {
19669 - if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
19674 - /* Ensure init mappings cover kernel text/data and initial tables. */
19675 - while (va < (__START_KERNEL_map
19676 - + (start_pfn << PAGE_SHIFT)
19677 - + tables_space)) {
19678 - pmd = (pmd_t *)&page[pmd_index(va)];
19679 - if (pmd_none(*pmd)) {
19680 - pte_page = alloc_static_page(&phys);
19681 - early_make_page_readonly(
19682 - pte_page, XENFEAT_writable_page_tables);
19683 - set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
19685 - addr = page[pmd_index(va)];
19686 - addr_to_page(addr, pte_page);
19688 - pte = (pte_t *)&pte_page[pte_index(va)];
19689 - if (pte_none(*pte)) {
19690 - new_pte = pfn_pte(
19691 - (va - __START_KERNEL_map) >> PAGE_SHIFT,
19692 - __pgprot(_KERNPG_TABLE));
19693 - xen_l1_entry_update(pte, new_pte);
19698 - /* Finally, blow away any spurious initial mappings. */
19700 - pmd = (pmd_t *)&page[pmd_index(va)];
19701 - if (pmd_none(*pmd))
19703 - if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
19708 - if (start_pfn > start)
19709 - reserve_early(start << PAGE_SHIFT,
19710 - start_pfn << PAGE_SHIFT, "INITMAP");
19713 static void __init find_early_table_space(unsigned long end)
19715 unsigned long puds, pmds, ptes, tables;
19717 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
19718 + tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
19719 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
19720 - ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
19721 + tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
19723 - tables = round_up(puds * 8, PAGE_SIZE) +
19724 - round_up(pmds * 8, PAGE_SIZE) +
19725 - round_up(ptes * 8, PAGE_SIZE);
19726 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
19727 + tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
19729 - extend_init_mapping(tables);
19730 + if (!table_top) {
19731 + table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
19732 + xen_start_info->nr_pt_frames;
19733 + table_cur = table_start;
19736 + * [table_start, table_top) gets passed to reserve_early(),
19737 + * so we must not use table_cur here, despite continuing
19738 + * to allocate from there. table_cur possibly being below
19739 + * table_start is, on the other hand, not a problem.
19741 + table_start = table_top;
19744 - table_start = start_pfn;
19745 - table_end = table_start + (tables>>PAGE_SHIFT);
19746 + table_top = table_cur + (tables >> PAGE_SHIFT);
19748 - early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
19749 - end, table_start << PAGE_SHIFT,
19750 - (table_start << PAGE_SHIFT) + tables);
19751 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
19752 + end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT);
19755 static void __init xen_finish_init_mapping(void)
19756 @@ -752,18 +697,18 @@ static void __init xen_finish_init_mappi
19757 xen_start_info->mod_start = (unsigned long)
19758 __va(__pa(xen_start_info->mod_start));
19760 - /* Destroy the Xen-created mappings beyond the kernel image as
19761 - * well as the temporary mappings created above. Prevents
19762 - * overlap with modules area (if init mapping is very big).
19764 + /* Destroy the Xen-created mappings beyond the kernel image. */
19765 start = PAGE_ALIGN((unsigned long)_end);
19766 - end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
19767 + end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
19768 for (; start < end; start += PAGE_SIZE)
19769 if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
19772 - /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
19773 - table_end = ~0UL;
19774 + /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
19775 + start = table_top;
19776 + WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n",
19777 + table_start, table_cur, start);
19778 + table_top = ~0UL;
19780 /* Switch to the real shared_info page, and clear the dummy page. */
19781 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
19782 @@ -780,8 +725,7 @@ static void __init xen_finish_init_mappi
19786 - /* Disable the 'start_pfn' allocator. */
19787 - table_end = start_pfn;
19788 + table_top = max(table_cur, start);
19791 static void __init init_gbpages(void)
19792 @@ -794,126 +738,91 @@ static void __init init_gbpages(void)
19796 -#ifdef CONFIG_MEMTEST_BOOTPARAM
19798 -static void __init memtest(unsigned long start_phys, unsigned long size,
19799 - unsigned pattern)
19800 +static unsigned long __init kernel_physical_mapping_init(unsigned long start,
19801 + unsigned long end,
19802 + unsigned long page_size_mask)
19805 - unsigned long *start;
19806 - unsigned long start_bad;
19807 - unsigned long last_bad;
19808 - unsigned long val;
19809 - unsigned long start_phys_aligned;
19810 - unsigned long count;
19811 - unsigned long incr;
19813 - switch (pattern) {
19821 - val = 0x5555555555555555UL;
19824 - val = 0xaaaaaaaaaaaaaaaaUL;
19830 - incr = sizeof(unsigned long);
19831 - start_phys_aligned = ALIGN(start_phys, incr);
19832 - count = (size - (start_phys_aligned - start_phys))/incr;
19833 - start = __va(start_phys_aligned);
19837 - for (i = 0; i < count; i++)
19839 - for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
19840 - if (*start != val) {
19841 - if (start_phys_aligned == last_bad + incr) {
19842 - last_bad += incr;
19845 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19846 - val, start_bad, last_bad + incr);
19847 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19849 - start_bad = last_bad = start_phys_aligned;
19854 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19855 - val, start_bad, last_bad + incr);
19856 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19860 + unsigned long next, last_map_addr = end;
19862 -static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
19863 + start = (unsigned long)__va(start);
19864 + end = (unsigned long)__va(end);
19866 -static int __init parse_memtest(char *arg)
19869 - memtest_pattern = simple_strtoul(arg, NULL, 0);
19872 + for (; start < end; start = next) {
19873 + pgd_t *pgd = pgd_offset_k(start);
19874 + unsigned long pud_phys;
19877 -early_param("memtest", parse_memtest);
19878 + next = (start + PGDIR_SIZE) & PGDIR_MASK;
19882 -static void __init early_memtest(unsigned long start, unsigned long end)
19884 - u64 t_start, t_size;
19885 - unsigned pattern;
19886 + if (__pgd_val(*pgd)) {
19887 + last_map_addr = phys_pud_update(pgd, __pa(start),
19888 + __pa(end), page_size_mask);
19892 - if (!memtest_pattern)
19894 + pud = alloc_low_page(&pud_phys);
19895 + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
19897 + unmap_low_page(pud);
19899 + if (!after_bootmem) {
19900 + if (max_pfn_mapped)
19901 + make_page_readonly(__va(pud_phys),
19902 + XENFEAT_writable_page_tables);
19903 + xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
19905 + make_page_readonly(pud, XENFEAT_writable_page_tables);
19906 + spin_lock(&init_mm.page_table_lock);
19907 + pgd_populate(&init_mm, pgd, __va(pud_phys));
19908 + spin_unlock(&init_mm.page_table_lock);
19912 - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
19913 - for (pattern = 0; pattern < memtest_pattern; pattern++) {
19916 - while (t_start < end) {
19917 - t_start = find_e820_area_size(t_start, &t_size, 1);
19918 + return last_map_addr;
19922 - if (t_start >= end)
19924 - if (t_start + t_size > end)
19925 - t_size = end - t_start;
19926 +struct map_range {
19927 + unsigned long start;
19928 + unsigned long end;
19929 + unsigned page_size_mask;
19932 - printk(KERN_CONT "\n %016llx - %016llx pattern %d",
19933 - (unsigned long long)t_start,
19934 - (unsigned long long)t_start + t_size, pattern);
19935 +#define NR_RANGE_MR 5
19937 - memtest(t_start, t_size, pattern);
19938 +static int save_mr(struct map_range *mr, int nr_range,
19939 + unsigned long start_pfn, unsigned long end_pfn,
19940 + unsigned long page_size_mask)
19943 - t_start += t_size;
19945 + if (start_pfn < end_pfn) {
19946 + if (nr_range >= NR_RANGE_MR)
19947 + panic("run out of range for init_memory_mapping\n");
19948 + mr[nr_range].start = start_pfn<<PAGE_SHIFT;
19949 + mr[nr_range].end = end_pfn<<PAGE_SHIFT;
19950 + mr[nr_range].page_size_mask = page_size_mask;
19953 - printk(KERN_CONT "\n");
19956 -static void __init early_memtest(unsigned long start, unsigned long end)
19964 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
19965 * This runs before bootmem is initialized and gets pages directly from
19966 * the physical memory. To access them they are temporarily mapped.
19968 -unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
19969 +unsigned long __init_refok init_memory_mapping(unsigned long start,
19970 + unsigned long end)
19972 - unsigned long next, last_map_addr = end;
19973 - unsigned long start_phys = start, end_phys = end;
19974 + unsigned long last_map_addr = 0;
19975 + unsigned long page_size_mask = 0;
19976 + unsigned long start_pfn, end_pfn;
19978 + struct map_range mr[NR_RANGE_MR];
19981 printk(KERN_INFO "init_memory_mapping\n");
19983 @@ -924,51 +833,150 @@ unsigned long __init_refok init_memory_m
19984 * memory mapped. Unfortunately this is done currently before the
19985 * nodes are discovered.
19987 - if (!after_bootmem) {
19988 + if (!after_bootmem)
19990 - find_early_table_space(end);
19992 + if (direct_gbpages)
19993 + page_size_mask |= 1 << PG_LEVEL_1G;
19995 + page_size_mask |= 1 << PG_LEVEL_2M;
19997 + memset(mr, 0, sizeof(mr));
20000 + /* head, if not big-page aligned */
20001 + start_pfn = start >> PAGE_SHIFT;
20002 + end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
20003 + << (PMD_SHIFT - PAGE_SHIFT);
20004 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
20006 + /* big page (2M) range */
20007 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
20008 + << (PMD_SHIFT - PAGE_SHIFT);
20009 + end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
20010 + << (PUD_SHIFT - PAGE_SHIFT);
20011 + if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
20012 + end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
20013 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20014 + page_size_mask & (1<<PG_LEVEL_2M));
20016 + /* big page (1G) range */
20017 + start_pfn = end_pfn;
20018 + end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
20019 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20021 + ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
20023 + /* tail, if not big-page (1G) aligned */
20024 + start_pfn = end_pfn;
20025 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
20026 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20027 + page_size_mask & (1<<PG_LEVEL_2M));
20029 + /* tail, if not big-page (2M) aligned */
20030 + start_pfn = end_pfn;
20031 + end_pfn = end>>PAGE_SHIFT;
20032 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
20034 + /* try to merge contiguous ranges with the same page size */
20035 + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
20036 + unsigned long old_start;
20037 + if (mr[i].end != mr[i+1].start ||
20038 + mr[i].page_size_mask != mr[i+1].page_size_mask)
20041 + old_start = mr[i].start;
20042 + memmove(&mr[i], &mr[i+1],
20043 + (nr_range - 1 - i) * sizeof (struct map_range));
20044 + mr[i--].start = old_start;
20048 - start = (unsigned long)__va(start);
20049 - end = (unsigned long)__va(end);
20050 + for (i = 0; i < nr_range; i++)
20051 + printk(KERN_DEBUG " %010lx - %010lx page %s\n",
20052 + mr[i].start, mr[i].end,
20053 + (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
20054 + (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
20056 - for (; start < end; start = next) {
20057 - pgd_t *pgd = pgd_offset_k(start);
20058 - unsigned long pud_phys;
20060 + if (!after_bootmem)
20061 + find_early_table_space(end);
20063 - if (after_bootmem)
20064 - pud = pud_offset(pgd, start & PGDIR_MASK);
20066 - pud = alloc_static_page(&pud_phys);
20067 - next = start + PGDIR_SIZE;
20070 - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
20071 - if (!after_bootmem) {
20072 - early_make_page_readonly(pud, XENFEAT_writable_page_tables);
20073 - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
20075 + unsigned long addr, va = __START_KERNEL_map;
20076 + unsigned long *page = (unsigned long *)init_level4_pgt;
20078 + /* Kill mapping of memory below _text. */
20079 + while (va < (unsigned long)&_text) {
20080 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
20085 + /* Blow away any spurious initial mappings. */
20086 + va = __START_KERNEL_map + (table_start << PAGE_SHIFT);
20087 + addr = page[pgd_index(va)];
20088 + addr_to_page(addr, page);
20089 + addr = page[pud_index(va)];
20090 + addr_to_page(addr, page);
20091 + while (pmd_index(va) | pte_index(va)) {
20092 + if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
20094 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
20100 - if (!after_bootmem) {
20101 - BUG_ON(start_pfn != table_end);
20102 + for (i = 0; i < nr_range; i++)
20103 + last_map_addr = kernel_physical_mapping_init(
20104 + mr[i].start, mr[i].end,
20105 + mr[i].page_size_mask);
20107 + BUG_ON(table_cur > table_top);
20109 xen_finish_init_mapping();
20111 + else if (table_cur < table_top)
20112 + /* Disable the 'table_cur' allocator. */
20113 + table_top = table_cur;
20117 - if (!after_bootmem)
20118 + if (!after_bootmem && table_top > table_start)
20119 reserve_early(table_start << PAGE_SHIFT,
20120 - table_end << PAGE_SHIFT, "PGTABLE");
20121 + table_top << PAGE_SHIFT, "PGTABLE");
20123 + printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
20124 + last_map_addr, end);
20126 if (!after_bootmem)
20127 - early_memtest(start_phys, end_phys);
20128 + early_memtest(start, end);
20130 - return last_map_addr;
20131 + return last_map_addr >> PAGE_SHIFT;
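/*
 * Standalone sketch of the merge pass above (kernel types elided; the
 * function name is hypothetical): neighbouring map_range entries that
 * touch and share a page_size_mask collapse into one, and the loop
 * re-examines the merged slot before moving on.
 */
static int merge_map_ranges(struct map_range *mr, int nr)
{
	int i;

	for (i = 0; nr > 1 && i < nr - 1; i++) {
		unsigned long old_start;

		if (mr[i].end != mr[i + 1].start ||
		    mr[i].page_size_mask != mr[i + 1].page_size_mask)
			continue;
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i + 1],
			(nr - 1 - i) * sizeof(struct map_range));
		mr[i--].start = old_start;
		nr--;
	}
	return nr;
}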
20134 #ifndef CONFIG_NUMA
20135 +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
20137 + unsigned long bootmap_size, bootmap;
20139 + e820_register_active_regions(0, start_pfn, end_pfn);
20141 + if (end_pfn > xen_start_info->nr_pages)
20142 + end_pfn = xen_start_info->nr_pages;
20144 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
20145 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
20147 + if (bootmap == -1L)
20148 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
20149 + /* don't touch min_low_pfn */
20150 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
20152 + free_bootmem_with_active_regions(0, end_pfn);
20153 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
20154 + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
20157 void __init paging_init(void)
20159 unsigned long max_zone_pfns[MAX_NR_ZONES];
20160 @@ -976,9 +984,9 @@ void __init paging_init(void)
20161 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
20162 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
20163 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
20164 - max_zone_pfns[ZONE_NORMAL] = end_pfn;
20165 + max_zone_pfns[ZONE_NORMAL] = max_pfn;
20167 - memory_present(0, 0, end_pfn);
20168 + memory_present(0, 0, max_pfn);
20170 free_area_init_nodes(max_zone_pfns);
20172 @@ -1069,8 +1077,8 @@ void __init mem_init(void)
20173 init_page_count(pfn_to_page(pfn));
20176 - reservedpages = end_pfn - totalram_pages -
20177 - absent_pages_in_range(0, end_pfn);
20178 + reservedpages = max_pfn - totalram_pages -
20179 + absent_pages_in_range(0, max_pfn);
20182 codesize = (unsigned long) &_etext - (unsigned long) &_text;
20183 @@ -1089,7 +1097,7 @@ void __init mem_init(void)
20184 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
20185 "%ldk reserved, %ldk data, %ldk init)\n",
20186 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
20187 - end_pfn << (PAGE_SHIFT-10),
20188 + max_pfn << (PAGE_SHIFT-10),
20190 reservedpages << (PAGE_SHIFT-10),
20192 @@ -1152,6 +1160,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
20193 void mark_rodata_ro(void)
20195 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
20196 + unsigned long rodata_start =
20197 + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20199 +#ifdef CONFIG_DYNAMIC_FTRACE
20200 + /* Dynamic tracing modifies the kernel text section */
20201 + start = rodata_start;
20204 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
20205 (end - start) >> 10);
20206 @@ -1161,8 +1176,7 @@ void mark_rodata_ro(void)
20207 * The rodata section (but not the kernel text!) should also be
20210 - start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20211 - set_memory_nx(start, (end - start) >> PAGE_SHIFT);
20212 + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
20216 @@ -1184,24 +1198,26 @@ void free_initrd_mem(unsigned long start
20220 -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
20221 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
20228 unsigned long pfn = phys >> PAGE_SHIFT;
20230 - if (pfn >= end_pfn) {
20231 + if (pfn >= max_pfn) {
20233 * This can happen with kdump kernels when accessing
20236 if (pfn < max_pfn_mapped)
20240 - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
20241 + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
20247 /* Should check here against the e820 map to avoid double free */
20248 @@ -1209,9 +1225,13 @@ void __init reserve_bootmem_generic(unsi
20249 nid = phys_to_nid(phys);
20250 next_nid = phys_to_nid(phys + len - 1);
20251 if (nid == next_nid)
20252 - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
20253 + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
20255 - reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20256 + ret = reserve_bootmem(phys, len, flags);
20262 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20264 @@ -1222,6 +1242,8 @@ void __init reserve_bootmem_generic(unsi
20265 set_dma_reserve(dma_reserve);
20272 int kern_addr_valid(unsigned long addr)
20273 @@ -1326,7 +1348,7 @@ vmemmap_populate(struct page *start_page
20276 for (; addr < end; addr = next) {
20277 - next = pmd_addr_end(addr, end);
20280 pgd = vmemmap_pgd_populate(addr, node);
20282 @@ -1336,33 +1358,51 @@ vmemmap_populate(struct page *start_page
20286 - pmd = pmd_offset(pud, addr);
20287 - if (pmd_none(*pmd)) {
20290 + if (!cpu_has_pse) {
20291 + next = (addr + PAGE_SIZE) & PAGE_MASK;
20292 + pmd = vmemmap_pmd_populate(pud, addr, node);
20297 + p = vmemmap_pte_populate(pmd, addr, node);
20299 - p = vmemmap_alloc_block(PMD_SIZE, node);
20303 - entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20304 - PAGE_KERNEL_LARGE);
20305 - set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20307 - /* check to see if we have contiguous blocks */
20308 - if (p_end != p || node_start != node) {
20310 - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20311 - addr_start, addr_end-1, p_start, p_end-1, node_start);
20312 - addr_start = addr;
20313 - node_start = node;
20316 - addr_end = addr + PMD_SIZE;
20317 - p_end = p + PMD_SIZE;
20318 + addr_end = addr + PAGE_SIZE;
20319 + p_end = p + PAGE_SIZE;
20321 - vmemmap_verify((pte_t *)pmd, node, addr, next);
20322 + next = pmd_addr_end(addr, end);
20324 + pmd = pmd_offset(pud, addr);
20325 + if (pmd_none(*pmd)) {
20328 + p = vmemmap_alloc_block(PMD_SIZE, node);
20332 + entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20333 + PAGE_KERNEL_LARGE);
20334 + set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20336 + /* check to see if we have contiguous blocks */
20337 + if (p_end != p || node_start != node) {
20339 + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20340 + addr_start, addr_end-1, p_start, p_end-1, node_start);
20341 + addr_start = addr;
20342 + node_start = node;
20346 + addr_end = addr + PMD_SIZE;
20347 + p_end = p + PMD_SIZE;
20349 + vmemmap_verify((pte_t *)pmd, node, addr, next);
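/*
 * The stepping helper assumed above, sketched for reference (this is
 * the semantics of the generic pmd_addr_end()): advance to the next
 * 2 MiB boundary, but never past the end of the region.  The -1 trick
 * keeps the comparison safe if the boundary wraps to zero.
 */
static unsigned long pmd_addr_end_sketch(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;

	return boundary - 1 < end - 1 ? boundary : end;
}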
20355 --- sle11-2009-06-04.orig/arch/x86/mm/ioremap-xen.c 2009-03-16 16:38:05.000000000 +0100
20356 +++ sle11-2009-06-04/arch/x86/mm/ioremap-xen.c 2009-06-04 10:21:39.000000000 +0200
20358 #include <linux/pfn.h>
20359 #include <linux/slab.h>
20360 #include <linux/vmalloc.h>
20361 +#include <linux/mmiotrace.h>
20363 #include <asm/cacheflush.h>
20364 #include <asm/e820.h>
20365 @@ -274,7 +275,8 @@ int ioremap_check_change_attr(unsigned l
20366 for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
20367 unsigned long pfn = mfn_to_local_pfn(mfn);
20369 - if (pfn >= max_pfn_mapped)
20370 + if (pfn >= max_low_pfn_mapped &&
20371 + (pfn < (1UL<<(32 - PAGE_SHIFT)) || pfn >= max_pfn_mapped))
20373 rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
20374 PAGE_SIZE, prot_val);
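/*
 * Written as the positive predicate, the new test above says: on
 * x86-64 two windows of physical memory stay in the kernel direct
 * mapping, [0, max_low_pfn_mapped) and, beyond the 4 GiB mark,
 * [4G >> PAGE_SHIFT, max_pfn_mapped).  Sketch (not from the patch):
 */
static int pfn_in_direct_map(unsigned long pfn)
{
	return pfn < max_low_pfn_mapped ||
	       (pfn >= (1UL << (32 - PAGE_SHIFT)) && pfn < max_pfn_mapped);
}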
20375 @@ -297,11 +299,14 @@ static void __iomem *__ioremap_caller(re
20377 unsigned long mfn, offset, vaddr;
20378 resource_size_t last_addr;
20379 + const resource_size_t unaligned_phys_addr = phys_addr;
20380 + const unsigned long unaligned_size = size;
20381 struct vm_struct *area;
20382 unsigned long new_prot_val;
20385 domid_t domid = DOMID_IO;
20386 + void __iomem *ret_addr;
20388 /* Don't allow wraparound or zero size */
20389 last_addr = phys_addr + size - 1;
20390 @@ -318,7 +323,7 @@ static void __iomem *__ioremap_caller(re
20392 * Don't remap the low PCI/ISA area, it's always mapped..
20394 - if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
20395 + if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
20396 return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
20399 @@ -342,7 +347,7 @@ static void __iomem *__ioremap_caller(re
20400 phys_addr &= PAGE_MASK;
20401 size = PAGE_ALIGN(last_addr+1) - phys_addr;
20403 - retval = reserve_memtype(phys_addr, phys_addr + size,
20404 + retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
20405 prot_val, &new_prot_val);
20407 pr_debug("Warning: reserve_memtype returned %d\n", retval);
20408 @@ -410,7 +415,10 @@ static void __iomem *__ioremap_caller(re
20412 - return (void __iomem *) (vaddr + offset);
20413 + ret_addr = (void __iomem *) (vaddr + offset);
20414 + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
20420 @@ -438,7 +446,7 @@ void __iomem *ioremap_nocache(resource_s
20423 * Ideally, this should be:
20424 - * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20425 + * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20427 * Till we fix all X drivers to use ioremap_wc(), we will use
20429 @@ -462,7 +470,7 @@ EXPORT_SYMBOL(ioremap_nocache);
20431 void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
20433 - if (pat_wc_enabled)
20435 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
20436 __builtin_return_address(0));
20438 @@ -502,6 +510,14 @@ static void __iomem *ioremap_default(res
20442 +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
20443 + unsigned long prot_val)
20445 + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
20446 + __builtin_return_address(0));
20448 +EXPORT_SYMBOL(ioremap_prot);
20451 * iounmap - Free a IO remapping
20452 * @addr: virtual address from ioremap_*
20453 @@ -526,6 +542,8 @@ void iounmap(volatile void __iomem *addr
20454 addr = (volatile void __iomem *)
20455 (PAGE_MASK & (unsigned long __force)addr);
20457 + mmiotrace_iounmap(addr);
20459 /* Use the vm area unlocked, assuming the caller
20460 ensures there isn't another iounmap for the same address
20461 in parallel. Reuse of the virtual address is prevented by
20462 @@ -533,7 +551,7 @@ void iounmap(volatile void __iomem *addr
20463 cpa takes care of the direct mappings. */
20464 read_lock(&vmlist_lock);
20465 for (p = vmlist; p; p = p->next) {
20466 - if (p->addr == addr)
20467 + if (p->addr == (void __force *)addr)
20470 read_unlock(&vmlist_lock);
20471 @@ -547,7 +565,7 @@ void iounmap(volatile void __iomem *addr
20472 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
20474 /* Finally remove it */
20475 - o = remove_vm_area((void *)addr);
20476 + o = remove_vm_area((void __force *)addr);
20477 BUG_ON(p != o || o == NULL);
20480 @@ -567,7 +585,7 @@ void *xlate_dev_mem_ptr(unsigned long ph
20481 if (page_is_ram(start >> PAGE_SHIFT))
20484 - addr = (void *)ioremap_default(start, PAGE_SIZE);
20485 + addr = (void __force *)ioremap_default(start, PAGE_SIZE);
20487 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
20489 @@ -595,8 +613,7 @@ static int __init early_ioremap_debug_se
20490 early_param("early_ioremap_debug", early_ioremap_debug_setup);
20492 static __initdata int after_paging_init;
20493 -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
20494 - __section(.bss.page_aligned);
20495 +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
20497 #ifdef CONFIG_X86_32
20498 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
20499 @@ -695,10 +712,11 @@ static void __init __early_set_fixmap(en
20502 pte = early_ioremap_pte(addr);
20504 if (pgprot_val(flags))
20505 set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
20507 - pte_clear(NULL, addr, pte);
20508 + pte_clear(&init_mm, addr, pte);
20509 __flush_tlb_one(addr);
20512 @@ -726,13 +744,11 @@ static int __init check_early_ioremap_le
20514 if (!early_ioremap_nested)
20517 - printk(KERN_WARNING
20518 + WARN(1, KERN_WARNING
20519 "Debug warning: early ioremap leak of %d areas detected.\n",
20520 - early_ioremap_nested);
20521 + early_ioremap_nested);
20522 printk(KERN_WARNING
20523 - "please boot with early_ioremap_debug and report the dmesg.\n");
20525 + "please boot with early_ioremap_debug and report the dmesg.\n");
20529 --- sle11-2009-06-04.orig/arch/x86/mm/pageattr-xen.c 2009-03-16 16:38:05.000000000 +0100
20530 +++ sle11-2009-06-04/arch/x86/mm/pageattr-xen.c 2009-06-04 10:21:39.000000000 +0200
20531 @@ -34,6 +34,47 @@ struct cpa_data {
20532 unsigned force_split : 1;
20535 +#ifdef CONFIG_PROC_FS
20536 +static unsigned long direct_pages_count[PG_LEVEL_NUM];
20538 +void update_page_count(int level, unsigned long pages)
20540 + unsigned long flags;
20542 + /* Protect against CPA */
20543 + spin_lock_irqsave(&pgd_lock, flags);
20544 + direct_pages_count[level] += pages;
20545 + spin_unlock_irqrestore(&pgd_lock, flags);
20548 +static void split_page_count(int level)
20550 + direct_pages_count[level]--;
20551 + direct_pages_count[level - 1] += PTRS_PER_PTE;
20554 +int arch_report_meminfo(char *page)
20556 + int n = sprintf(page, "DirectMap4k: %8lu kB\n",
20557 + direct_pages_count[PG_LEVEL_4K] << 2);
20558 +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
20559 + n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
20560 + direct_pages_count[PG_LEVEL_2M] << 11);
20562 + n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
20563 + direct_pages_count[PG_LEVEL_2M] << 12);
20565 +#ifdef CONFIG_X86_64
20566 + if (direct_gbpages)
20567 + n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
20568 + direct_pages_count[PG_LEVEL_1G] << 20);
20573 +static inline void split_page_count(int level) { }
20576 #ifdef CONFIG_X86_64
20578 static inline unsigned long highmap_start_pfn(void)
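/*
 * The shift constants in arch_report_meminfo() above just convert page
 * counts to kB: a 4k page is 4 kB (<< 2), a 2M page 2048 kB (<< 11),
 * a 4M page 4096 kB (<< 12) and a 1G page 1048576 kB (<< 20).  In
 * general (sketch):
 */
static unsigned long pages_to_kb(unsigned long count, unsigned int page_shift)
{
	return count << (page_shift - 10);	/* 12 - 10 = 2 for 4k pages */
}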
20579 @@ -106,7 +147,7 @@ static void cpa_flush_all(unsigned long
20581 BUG_ON(irqs_disabled());
20583 - on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
20584 + on_each_cpu(__cpa_flush_all, (void *) cache, 1);
20587 static void __cpa_flush_range(void *arg)
20588 @@ -127,7 +168,7 @@ static void cpa_flush_range(unsigned lon
20589 BUG_ON(irqs_disabled());
20590 WARN_ON(PAGE_ALIGN(start) != start);
20592 - on_each_cpu(__cpa_flush_range, NULL, 1, 1);
20593 + on_each_cpu(__cpa_flush_range, NULL, 1);
20597 @@ -229,6 +270,7 @@ pte_t *lookup_address(unsigned long addr
20599 return pte_offset_kernel(pmd, address);
20601 +EXPORT_SYMBOL_GPL(lookup_address);
20604 * Set the new pmd in all the pgds we know about:
20605 @@ -509,6 +551,16 @@ static int split_large_page(pte_t *kpte,
20609 + if (address >= (unsigned long)__va(0) &&
20610 + address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
20611 + split_page_count(level);
20613 +#ifdef CONFIG_X86_64
20614 + if (address >= (unsigned long)__va(1UL<<32) &&
20615 + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
20616 + split_page_count(level);
20620 * Get the target mfn from the original entry:
20622 @@ -566,10 +618,9 @@ repeat:
20623 if (!__pte_val(old_pte)) {
20626 - printk(KERN_WARNING "CPA: called for zero pte. "
20627 + WARN(1, KERN_WARNING "CPA: called for zero pte. "
20628 "vaddr = %lx cpa->vaddr = %lx\n", address,
20634 @@ -634,15 +685,24 @@ static int cpa_process_alias(struct cpa_
20635 struct cpa_data alias_cpa;
20638 - if (cpa->pfn > max_pfn_mapped)
20639 + if (cpa->pfn >= max_pfn_mapped)
20642 +#ifdef CONFIG_X86_64
20643 + if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
20647 * No need to redo, when the primary call touched the direct
20650 - if (!within(cpa->vaddr, PAGE_OFFSET,
20651 - PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
20652 + if (!(within(cpa->vaddr, PAGE_OFFSET,
20653 + PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
20654 +#ifdef CONFIG_X86_64
20655 + || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
20656 + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
20661 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
20662 @@ -796,6 +856,51 @@ static inline int change_page_attr_clear
20663 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
20667 +static void _free_memtype(u64 pstart, u64 pend)
20669 + u64 pa = pstart &= __PHYSICAL_MASK;
20670 + u64 ma = phys_to_machine(pa);
20672 + while ((pa += PAGE_SIZE) < pend) {
20673 + if (phys_to_machine(pa) != ma + (pa - pstart)) {
20674 + free_memtype(ma, ma + (pa - pstart));
20676 + ma = phys_to_machine(pa);
20679 + free_memtype(ma, ma + (pend - pstart));
20681 +#define free_memtype _free_memtype
20683 +static int _reserve_memtype(u64 pstart, u64 pend, unsigned long req_type)
20685 + u64 pcur = pstart &= __PHYSICAL_MASK, pa = pcur;
20686 + u64 ma = phys_to_machine(pa);
20689 + while ((pa += PAGE_SIZE) < pend) {
20690 + if (phys_to_machine(pa) != ma + (pa - pcur)) {
20691 + rc = reserve_memtype(ma, ma + (pa - pcur),
20696 + ma = phys_to_machine(pa);
20700 + rc = reserve_memtype(ma, ma + (pend - pcur), req_type, NULL);
20702 + if (unlikely(!rc) && pstart < pcur)
20703 + _free_memtype(pstart, pcur);
20707 +#define reserve_memtype(s, e, r, n) \
20708 + _reserve_memtype(s, e, BUILD_BUG_ON_ZERO(n) ?: (r))
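These wrappers exist because under Xen a pseudo-physically contiguous range need not be machine-contiguous, while the PAT tracker works on machine addresses; the loop therefore breaks the range into machine-contiguous runs and issues one reserve/free per run. A user-space model of the run-splitting, with a toy phys_to_machine() containing one discontinuity:

#include <stdio.h>

#define PAGE_SIZE 4096ULL

/* Toy pseudo-physical -> machine translation with one discontinuity. */
static unsigned long long phys_to_machine(unsigned long long pa)
{
        return pa < 3 * PAGE_SIZE ? pa + 0x100000 : pa + 0x800000;
}

int main(void)
{
        unsigned long long pstart = 0, pend = 6 * PAGE_SIZE;
        unsigned long long pcur = pstart, pa = pcur;
        unsigned long long ma = phys_to_machine(pa);

        while ((pa += PAGE_SIZE) < pend) {
                if (phys_to_machine(pa) != ma + (pa - pcur)) {
                        printf("run: ma %#llx-%#llx\n", ma, ma + (pa - pcur));
                        pcur = pa;
                        ma = phys_to_machine(pa);
                }
        }
        printf("run: ma %#llx-%#llx\n", ma, ma + (pend - pcur));
        return 0;
}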
20711 int _set_memory_uc(unsigned long addr, int numpages)
20714 @@ -810,7 +915,7 @@ int set_memory_uc(unsigned long addr, in
20716 * for now UC MINUS. see comments in ioremap_nocache()
20718 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20719 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20720 _PAGE_CACHE_UC_MINUS, NULL))
20723 @@ -826,10 +931,10 @@ int _set_memory_wc(unsigned long addr, i
20725 int set_memory_wc(unsigned long addr, int numpages)
20727 - if (!pat_wc_enabled)
20728 + if (!pat_enabled)
20729 return set_memory_uc(addr, numpages);
20731 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20732 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20733 _PAGE_CACHE_WC, NULL))
20736 @@ -845,7 +950,7 @@ int _set_memory_wb(unsigned long addr, i
20738 int set_memory_wb(unsigned long addr, int numpages)
20740 - free_memtype(addr, addr + numpages * PAGE_SIZE);
20741 + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
20743 return _set_memory_wb(addr, numpages);
20745 --- sle11-2009-06-04.orig/arch/x86/mm/pat-xen.c 2009-03-16 16:38:05.000000000 +0100
20746 +++ sle11-2009-06-04/arch/x86/mm/pat-xen.c 2009-06-04 10:21:39.000000000 +0200
20748 #include <linux/gfp.h>
20749 #include <linux/fs.h>
20750 #include <linux/bootmem.h>
20751 +#include <linux/debugfs.h>
20752 +#include <linux/seq_file.h>
20754 #include <asm/msr.h>
20755 #include <asm/tlbflush.h>
20756 @@ -26,11 +28,11 @@
20757 #include <asm/io.h>
20759 #ifdef CONFIG_X86_PAT
20760 -int __read_mostly pat_wc_enabled = 1;
20761 +int __read_mostly pat_enabled = 1;
20763 void __cpuinit pat_disable(char *reason)
20765 - pat_wc_enabled = 0;
20766 + pat_enabled = 0;
20767 printk(KERN_INFO "%s\n", reason);
20770 @@ -42,6 +44,19 @@ static int __init nopat(char *str)
20771 early_param("nopat", nopat);
20775 +static int debug_enable;
20776 +static int __init pat_debug_setup(char *str)
20778 + debug_enable = 1;
20781 +__setup("debugpat", pat_debug_setup);
20783 +#define dprintk(fmt, arg...) \
20784 + do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
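dprintk() keeps the format strings compiled in but only emits them when the machine was booted with the new "debugpat" parameter. The same guarded-printf pattern, runnable in user space:

#include <stdio.h>

static int debug_enable;        /* set by the "debugpat" boot parameter */

#define dprintk(fmt, arg...) \
        do { if (debug_enable) printf(fmt, ##arg); } while (0)

int main(void)
{
        dprintk("suppressed\n");        /* debug off: nothing printed */
        debug_enable = 1;
        dprintk("pat: %d entries\n", 8);
        return 0;
}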
20787 static u64 __read_mostly boot_pat_state;
20790 @@ -53,24 +68,25 @@ enum {
20791 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
20794 -#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
20795 +#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
20797 void pat_init(void)
20801 - if (!pat_wc_enabled)
20802 + if (!pat_enabled)
20805 /* Paranoia check. */
20806 - if (!cpu_has_pat) {
20807 - printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
20808 + if (!cpu_has_pat && boot_pat_state) {
20810 - * Panic if this happens on the secondary CPU, and we
20811 + * If this happens we are on a secondary CPU, but
20812 * switched to PAT on the boot CPU. We have no way to
20813 * undo PAT.
20814 */
20815 - BUG_ON(boot_pat_state);
20817 + printk(KERN_ERR "PAT enabled, "
20818 + "but not supported by secondary CPU\n");
20823 @@ -87,8 +103,8 @@ void pat_init(void)
20824 * 011 UC _PAGE_CACHE_UC
20827 - pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
20828 - PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
20829 + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
20830 + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
20832 /* Boot CPU check */
20833 if (!boot_pat_state)
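With the byte-per-entry PAT() macro and the memory-type encodings from the enum above (only UC_MINUS=7 is visible here; mainline pat.c uses UC=0, WC=1, WB=6), the value programmed into IA32_PAT works out to 0x0007010600070106. A quick check:

#include <stdio.h>

/* Memory-type encodings as in mainline pat.c (only UC_MINUS=7 is shown above). */
enum { PAT_UC = 0, PAT_WC = 1, PAT_WB = 6, PAT_UC_MINUS = 7 };

#define PAT(x, y) ((unsigned long long)PAT_ ## y << ((x) * 8))

int main(void)
{
        unsigned long long pat =
                PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
                PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);

        printf("IA32_PAT = %#018llx\n", pat);   /* 0x0007010600070106 */
        return 0;
}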
20834 @@ -113,13 +129,13 @@ void pat_init(void)
20835 static char *cattr_name(unsigned long flags)
20837 switch (flags & _PAGE_CACHE_MASK) {
20838 - case _PAGE_CACHE_UC: return "uncached";
20839 - case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20840 - case _PAGE_CACHE_WB: return "write-back";
20841 - case _PAGE_CACHE_WC: return "write-combining";
20842 - case _PAGE_CACHE_WP: return "write-protected";
20843 - case _PAGE_CACHE_WT: return "write-through";
20844 - default: return "broken";
20845 + case _PAGE_CACHE_UC: return "uncached";
20846 + case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20847 + case _PAGE_CACHE_WB: return "write-back";
20848 + case _PAGE_CACHE_WC: return "write-combining";
20849 + case _PAGE_CACHE_WP: return "write-protected";
20850 + case _PAGE_CACHE_WT: return "write-through";
20851 + default: return "broken";
20855 @@ -157,49 +173,55 @@ static DEFINE_SPINLOCK(memtype_lock); /
20856 * The intersection is based on "Effective Memory Type" tables in IA-32
20859 -static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
20860 - unsigned long *ret_prot)
20861 +static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
20863 - unsigned long pat_type;
20866 - pat_type = prot & _PAGE_CACHE_MASK;
20867 - prot &= (~_PAGE_CACHE_MASK);
20870 - * We return the PAT request directly for types where PAT takes
20871 - * precedence with respect to MTRR and for UC_MINUS.
20872 - * Consistency checks with other PAT requests is done later
20873 - * while going through memtype list.
20875 - if (pat_type == _PAGE_CACHE_WC) {
20876 - *ret_prot = prot | _PAGE_CACHE_WC;
20878 - } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
20879 - *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
20881 - } else if (pat_type == _PAGE_CACHE_UC) {
20882 - *ret_prot = prot | _PAGE_CACHE_UC;
20887 * Look for MTRR hint to get the effective type in case where PAT
20888 * request is for WB.
20890 - mtrr_type = mtrr_type_lookup(start, end);
20891 + if (req_type == _PAGE_CACHE_WB) {
20894 - if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
20895 - *ret_prot = prot | _PAGE_CACHE_UC;
20896 - } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
20897 - *ret_prot = prot | _PAGE_CACHE_WC;
20899 - *ret_prot = prot | _PAGE_CACHE_WB;
20900 + mtrr_type = mtrr_type_lookup(start, end);
20901 + if (mtrr_type == MTRR_TYPE_UNCACHABLE)
20902 + return _PAGE_CACHE_UC;
20903 + if (mtrr_type == MTRR_TYPE_WRCOMB)
20904 + return _PAGE_CACHE_WC;
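The rewritten pat_x_mtrr_type() now returns the request unchanged for WC/UC_MINUS/UC and only consults the MTRRs to possibly downgrade a WB request. A compact user-space mirror with a stubbed mtrr_type_lookup():

#include <stdio.h>

enum { MTRR_TYPE_UNCACHABLE = 0, MTRR_TYPE_WRCOMB = 1, MTRR_TYPE_WRBACK = 6 };
enum { CACHE_WB, CACHE_WC, CACHE_UC_MINUS, CACHE_UC };  /* stand-ins */

static int mtrr_type_lookup(unsigned long long s, unsigned long long e)
{
        (void)s; (void)e;
        return MTRR_TYPE_WRCOMB;        /* pretend an MTRR maps the range WC */
}

static int pat_x_mtrr_type(unsigned long long s, unsigned long long e, int req)
{
        if (req == CACHE_WB) {
                int mtrr = mtrr_type_lookup(s, e);

                if (mtrr == MTRR_TYPE_UNCACHABLE)
                        return CACHE_UC;
                if (mtrr == MTRR_TYPE_WRCOMB)
                        return CACHE_WC;
        }
        return req;     /* WC/UC_MINUS/UC take precedence over MTRR */
}

int main(void)
{
        printf("WB request -> %d (CACHE_WC)\n",
               pat_x_mtrr_type(0, 4096, CACHE_WB));
        return 0;
}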
20910 +static int chk_conflict(struct memtype *new, struct memtype *entry,
20911 + unsigned long *type)
20913 + if (new->type != entry->type) {
20915 + new->type = entry->type;
20916 + *type = entry->type;
20921 + /* check overlaps with more than one entry in the list */
20922 + list_for_each_entry_continue(entry, &memtype_list, nd) {
20923 + if (new->end <= entry->start)
20925 + else if (new->type != entry->type)
20931 + printk(KERN_INFO "%s:%d conflicting memory types "
20932 + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
20933 + new->end, cattr_name(new->type), cattr_name(entry->type));
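chk_conflict() factors out the check that an overlapping reservation only touches entries of a single memory type. A simplified model over an array instead of the kernel's sorted list (the kernel version additionally rewrites new->type to the inherited type):

#include <stdio.h>

struct memtype { unsigned long long start, end; int type; };

/* Does [s,e) overlap entries of more than one type? */
static int chk_conflict(unsigned long long s, unsigned long long e,
                        const struct memtype *tbl, int n, int *type)
{
        int i, t = -1;

        for (i = 0; i < n; i++) {
                if (e <= tbl[i].start || tbl[i].end <= s)
                        continue;               /* no overlap */
                if (t == -1)
                        t = tbl[i].type;        /* inherit first overlapped type */
                else if (t != tbl[i].type)
                        return -1;              /* conflicting memory types */
        }
        *type = t;
        return 0;
}

int main(void)
{
        const struct memtype tbl[] = {
                { 0x1000, 0x3000, 1 }, { 0x3000, 0x5000, 2 },
        };
        int type;

        if (chk_conflict(0x2000, 0x4000, tbl, 2, &type))
                printf("conflict\n");   /* spans a type-1 and a type-2 entry */
        return 0;
}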
20937 +static struct memtype *cached_entry;
20938 +static u64 cached_start;
20941 * req_type typically has one of the:
20943 @@ -210,37 +232,36 @@ static int pat_x_mtrr_type(u64 start, u6
20944 * req_type will have a special case value '-1', when the requester wants to inherit
20945 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
20947 - * If ret_type is NULL, function will return an error if it cannot reserve the
20948 - * region with req_type. If ret_type is non-null, function will return
20949 - * available type in ret_type in case of no error. In case of any error
20950 + * If new_type is NULL, function will return an error if it cannot reserve the
20951 + * region with req_type. If new_type is non-NULL, function will return
20952 + * available type in new_type in case of no error. In case of any error
20953 * it will return a negative return value.
20955 int reserve_memtype(u64 start, u64 end, unsigned long req_type,
20956 - unsigned long *ret_type)
20957 + unsigned long *new_type)
20959 - struct memtype *new_entry = NULL;
20960 - struct memtype *parse;
20961 + struct memtype *new, *entry;
20962 unsigned long actual_type;
20963 + struct list_head *where;
20966 - /* Only track when pat_wc_enabled */
20967 - if (!pat_wc_enabled) {
20968 + BUG_ON(start >= end); /* end is exclusive */
20970 + if (!pat_enabled) {
20971 /* This is identical to page table setting without PAT */
20973 - if (req_type == -1) {
20974 - *ret_type = _PAGE_CACHE_WB;
20976 - *ret_type = req_type;
20979 + if (req_type == -1)
20980 + *new_type = _PAGE_CACHE_WB;
20981 + else
20982 + *new_type = req_type & _PAGE_CACHE_MASK;
20987 /* Low ISA region is always mapped WB in page table. No need to track */
20988 - if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
20990 - *ret_type = _PAGE_CACHE_WB;
20992 + if (is_ISA_range(start, end - 1)) {
20994 + *new_type = _PAGE_CACHE_WB;
20998 @@ -253,206 +274,118 @@ int reserve_memtype(u64 start, u64 end,
21000 u8 mtrr_type = mtrr_type_lookup(start, end);
21002 - if (mtrr_type == MTRR_TYPE_WRBACK) {
21003 - req_type = _PAGE_CACHE_WB;
21004 + if (mtrr_type == MTRR_TYPE_WRBACK)
21005 actual_type = _PAGE_CACHE_WB;
21007 - req_type = _PAGE_CACHE_UC_MINUS;
21009 actual_type = _PAGE_CACHE_UC_MINUS;
21012 - req_type &= _PAGE_CACHE_MASK;
21013 - err = pat_x_mtrr_type(start, end, req_type, &actual_type);
21018 - *ret_type = actual_type;
21020 + actual_type = pat_x_mtrr_type(start, end,
21021 + req_type & _PAGE_CACHE_MASK);
21026 - new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21028 + new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21032 - new_entry->start = start;
21033 - new_entry->end = end;
21034 - new_entry->type = actual_type;
21035 + new->start = start;
21036 + new->end = end;
21037 + new->type = actual_type;
21040 - *ret_type = actual_type;
21042 + *new_type = actual_type;
21044 spin_lock(&memtype_lock);
21046 - /* Search for existing mapping that overlaps the current range */
21047 - list_for_each_entry(parse, &memtype_list, nd) {
21048 - struct memtype *saved_ptr;
21049 + if (cached_entry && start >= cached_start)
21050 + entry = cached_entry;
21052 + entry = list_entry(&memtype_list, struct memtype, nd);
21054 - if (parse->start >= end) {
21055 - pr_debug("New Entry\n");
21056 - list_add(&new_entry->nd, parse->nd.prev);
21057 - new_entry = NULL;
21058 + /* Search for existing mapping that overlaps the current range */
21060 + list_for_each_entry_continue(entry, &memtype_list, nd) {
21061 + if (end <= entry->start) {
21062 + where = entry->nd.prev;
21063 + cached_entry = list_entry(where, struct memtype, nd);
21067 - if (start <= parse->start && end >= parse->start) {
21068 - if (actual_type != parse->type && ret_type) {
21069 - actual_type = parse->type;
21070 - *ret_type = actual_type;
21071 - new_entry->type = actual_type;
21074 - if (actual_type != parse->type) {
21076 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21077 - current->comm, current->pid,
21079 - cattr_name(actual_type),
21080 - cattr_name(parse->type));
21085 - saved_ptr = parse;
21087 - * Check to see whether the request overlaps more
21088 - * than one entry in the list
21090 - list_for_each_entry_continue(parse, &memtype_list, nd) {
21091 - if (end <= parse->start) {
21095 - if (actual_type != parse->type) {
21097 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21098 - current->comm, current->pid,
21100 - cattr_name(actual_type),
21101 - cattr_name(parse->type));
21109 + } else if (start <= entry->start) { /* end > entry->start */
21110 + err = chk_conflict(new, entry, new_type);
21112 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
21113 + entry->start, entry->end);
21114 + where = entry->nd.prev;
21115 + cached_entry = list_entry(where,
21116 + struct memtype, nd);
21119 - pr_debug("Overlap at 0x%Lx-0x%Lx\n",
21120 - saved_ptr->start, saved_ptr->end);
21121 - /* No conflict. Go ahead and add this new entry */
21122 - list_add(&new_entry->nd, saved_ptr->nd.prev);
21123 - new_entry = NULL;
21127 - if (start < parse->end) {
21128 - if (actual_type != parse->type && ret_type) {
21129 - actual_type = parse->type;
21130 - *ret_type = actual_type;
21131 - new_entry->type = actual_type;
21134 - if (actual_type != parse->type) {
21136 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21137 - current->comm, current->pid,
21139 - cattr_name(actual_type),
21140 - cattr_name(parse->type));
21145 - saved_ptr = parse;
21147 - * Check to see whether the request overlaps more
21148 - * than one entry in the list
21150 - list_for_each_entry_continue(parse, &memtype_list, nd) {
21151 - if (end <= parse->start) {
21155 - if (actual_type != parse->type) {
21157 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21158 - current->comm, current->pid,
21160 - cattr_name(actual_type),
21161 - cattr_name(parse->type));
21164 + } else if (start < entry->end) { /* start > entry->start */
21165 + err = chk_conflict(new, entry, new_type);
21167 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
21168 + entry->start, entry->end);
21169 + cached_entry = list_entry(entry->nd.prev,
21170 + struct memtype, nd);
21173 + * Move to right position in the linked
21174 + * list to add this new entry
21176 + list_for_each_entry_continue(entry,
21177 + &memtype_list, nd) {
21178 + if (start <= entry->start) {
21179 + where = entry->nd.prev;
21189 - pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
21190 - saved_ptr->start, saved_ptr->end);
21191 - /* No conflict. Go ahead and add this new entry */
21192 - list_add(&new_entry->nd, &saved_ptr->nd);
21193 - new_entry = NULL;
21200 - "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
21201 - start, end, cattr_name(new_entry->type),
21202 - cattr_name(req_type));
21203 - kfree(new_entry);
21204 + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
21205 + "track %s, req %s\n",
21206 + start, end, cattr_name(new->type), cattr_name(req_type));
21208 spin_unlock(&memtype_lock);
21213 - /* No conflict. Not yet added to the list. Add to the tail */
21214 - list_add_tail(&new_entry->nd, &memtype_list);
21215 - pr_debug("New Entry\n");
21217 + cached_start = start;
21221 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21222 - start, end, cattr_name(actual_type),
21223 - cattr_name(req_type), cattr_name(*ret_type));
21226 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
21227 - start, end, cattr_name(actual_type),
21228 - cattr_name(req_type));
21231 + list_add(&new->nd, where);
21233 + list_add_tail(&new->nd, &memtype_list);
21235 spin_unlock(&memtype_lock);
21237 + dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21238 + start, end, cattr_name(new->type), cattr_name(req_type),
21239 + new_type ? cattr_name(*new_type) : "-");
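The cached_entry/cached_start pair turns repeated reservations at ascending addresses from O(n) list scans into near-O(1) inserts: the search resumes at the last insertion point whenever the new start is not below it. The idea, modeled with an index into a sorted array:

#include <stdio.h>

static unsigned long long starts[16];
static int nr_entries;
static int cached_idx;                  /* last insertion point */
static unsigned long long cached_start;

/* Insert 'start' keeping the array sorted, resuming from the cached hint. */
static void insert_sorted(unsigned long long start)
{
        int i = (nr_entries && start >= cached_start) ? cached_idx : 0;

        while (i < nr_entries && starts[i] < start)
                i++;
        for (int j = nr_entries; j > i; j--)
                starts[j] = starts[j - 1];
        starts[i] = start;
        nr_entries++;
        cached_idx = i;
        cached_start = start;
}

int main(void)
{
        insert_sorted(0x1000);
        insert_sorted(0x3000);  /* scan resumes at the cached slot */
        insert_sorted(0x2000);  /* below the hint: falls back to a full scan */
        for (int i = 0; i < nr_entries; i++)
                printf("%#llx\n", starts[i]);
        return 0;
}

The hint only pays off when requests arrive in ascending start order, which matches how reserve_memtype() is typically called during ioremap-heavy boot paths.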
21244 int free_memtype(u64 start, u64 end)
21246 - struct memtype *ml;
21247 + struct memtype *entry;
21250 - /* Only track when pat_wc_enabled */
21251 - if (!pat_wc_enabled) {
21252 + if (!pat_enabled)
21256 /* Low ISA region is always mapped WB. No need to track */
21257 - if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
21258 + if (is_ISA_range(start, end - 1))
21262 spin_lock(&memtype_lock);
21263 - list_for_each_entry(ml, &memtype_list, nd) {
21264 - if (ml->start == start && ml->end == end) {
21265 - list_del(&ml->nd);
21267 + list_for_each_entry(entry, &memtype_list, nd) {
21268 + if (entry->start == start && entry->end == end) {
21269 + if (cached_entry == entry || cached_start == start)
21270 + cached_entry = NULL;
21272 + list_del(&entry->nd);
21277 @@ -464,27 +397,19 @@ int free_memtype(u64 start, u64 end)
21278 current->comm, current->pid, start, end);
21281 - pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21282 + dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21288 - * /dev/mem mmap interface. The memtype used for mapping varies:
21289 - * - Use UC for mappings with O_SYNC flag
21290 - * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
21291 - * inherit the memtype from existing mapping.
21292 - * - Else use UC_MINUS memtype (for backward compatibility with existing
21295 pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
21296 unsigned long size, pgprot_t vma_prot)
21301 -#ifdef CONFIG_NONPROMISC_DEVMEM
21302 -/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
21303 +#ifdef CONFIG_STRICT_DEVMEM
21304 +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
21305 static inline int range_is_allowed(unsigned long mfn, unsigned long size)
21308 @@ -508,20 +433,20 @@ static inline int range_is_allowed(unsig
21312 -#endif /* CONFIG_NONPROMISC_DEVMEM */
21313 +#endif /* CONFIG_STRICT_DEVMEM */
21315 int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
21316 unsigned long size, pgprot_t *vma_prot)
21318 u64 addr = (u64)mfn << PAGE_SHIFT;
21319 - unsigned long flags = _PAGE_CACHE_UC_MINUS;
21320 + unsigned long flags = -1;
21323 if (!range_is_allowed(mfn, size))
21326 if (file->f_flags & O_SYNC) {
21327 - flags = _PAGE_CACHE_UC;
21328 + flags = _PAGE_CACHE_UC_MINUS;
21331 #ifndef CONFIG_X86_32
21332 @@ -534,25 +459,26 @@ int phys_mem_access_prot_allowed(struct
21333 * caching for the high addresses through the KEN pin, but
21334 * we maintain the tradition of paranoia in this code.
21336 - if (!pat_wc_enabled &&
21337 - ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
21338 - test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
21339 - test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
21340 - test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
21341 - (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21342 + if (!pat_enabled &&
21343 + !(boot_cpu_has(X86_FEATURE_MTRR) ||
21344 + boot_cpu_has(X86_FEATURE_K6_MTRR) ||
21345 + boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
21346 + boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
21347 + (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21348 flags = _PAGE_CACHE_UC;
21354 - * With O_SYNC, we can only take UC mapping. Fail if we cannot.
21355 + * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
21357 * Without O_SYNC, we want to get
21358 * - WB for WB-able memory and no other conflicting mappings
21359 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
21360 * - Inherit from conflicting mappings otherwise
21362 - if (flags != _PAGE_CACHE_UC_MINUS) {
21363 + if (flags != -1) {
21364 retval = reserve_memtype(addr, addr + size, flags, NULL);
21366 retval = reserve_memtype(addr, addr + size, -1, &flags);
21367 @@ -600,3 +526,88 @@ void unmap_devmem(unsigned long mfn, uns
21368 free_memtype(addr, addr + size);
21371 +#if defined(CONFIG_DEBUG_FS)
21373 +/* get Nth element of the linked list */
21374 +static struct memtype *memtype_get_idx(loff_t pos)
21376 + struct memtype *list_node, *print_entry;
21379 + print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21380 + if (!print_entry)
21383 + spin_lock(&memtype_lock);
21384 + list_for_each_entry(list_node, &memtype_list, nd) {
21386 + *print_entry = *list_node;
21387 + spin_unlock(&memtype_lock);
21388 + return print_entry;
21392 + spin_unlock(&memtype_lock);
21393 + kfree(print_entry);
21397 +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
21401 + seq_printf(seq, "PAT memtype list:\n");
21404 + return memtype_get_idx(*pos);
21407 +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
21410 + return memtype_get_idx(*pos);
21413 +static void memtype_seq_stop(struct seq_file *seq, void *v)
21417 +static int memtype_seq_show(struct seq_file *seq, void *v)
21419 + struct memtype *print_entry = (struct memtype *)v;
21421 + seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
21422 + print_entry->start, print_entry->end);
21423 + kfree(print_entry);
21427 +static struct seq_operations memtype_seq_ops = {
21428 + .start = memtype_seq_start,
21429 + .next = memtype_seq_next,
21430 + .stop = memtype_seq_stop,
21431 + .show = memtype_seq_show,
21434 +static int memtype_seq_open(struct inode *inode, struct file *file)
21436 + return seq_open(file, &memtype_seq_ops);
21439 +static const struct file_operations memtype_fops = {
21440 + .open = memtype_seq_open,
21441 + .read = seq_read,
21442 + .llseek = seq_lseek,
21443 + .release = seq_release,
21446 +static int __init pat_memtype_list_init(void)
21448 + debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
21449 + NULL, &memtype_fops);
21453 +late_initcall(pat_memtype_list_init);
21455 +#endif /* CONFIG_DEBUG_FS */
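The debugfs reader follows the standard seq_file contract: start() and next() return a kmalloc()ed snapshot of the *pos'th list element taken under memtype_lock, and show() prints and frees it, so the lock is never held while data goes to user space. After boot the list appears as /sys/kernel/debug/x86/pat_memtype_list. A user-space model of the iterate-by-position scheme:

#include <stdio.h>
#include <stdlib.h>

struct memtype { unsigned long long start, end; int type; };

static struct memtype list[] = {        /* hypothetical entries */
        { 0xa0000, 0xc0000, 2 }, { 0xd0000000, 0xd0100000, 1 },
};
#define NR (sizeof(list) / sizeof(list[0]))

/* start()/next(): hand out a private copy of the pos'th element, or NULL. */
static struct memtype *memtype_get_idx(long pos)
{
        struct memtype *copy;

        if (pos < 0 || (size_t)pos >= NR)
                return NULL;
        copy = malloc(sizeof(*copy));   /* kernel: kmalloc under memtype_lock */
        if (copy)
                *copy = list[pos];
        return copy;
}

int main(void)
{
        long pos = 0;
        struct memtype *m;

        while ((m = memtype_get_idx(pos++)) != NULL) {
                printf("type %d @ 0x%llx-0x%llx\n", m->type, m->start, m->end);
                free(m);        /* kernel: show() kfree()s each snapshot */
        }
        return 0;
}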
21456 --- sle11-2009-06-04.orig/arch/x86/mm/pgtable-xen.c 2009-03-16 16:38:05.000000000 +0100
21457 +++ sle11-2009-06-04/arch/x86/mm/pgtable-xen.c 2009-06-04 10:21:39.000000000 +0200
21459 #include <asm/pgalloc.h>
21460 #include <asm/pgtable.h>
21461 #include <asm/tlb.h>
21462 +#include <asm/fixmap.h>
21463 #include <asm/hypervisor.h>
21464 #include <asm/mmu_context.h>
21466 @@ -410,15 +411,9 @@ static inline void pgd_list_del(pgd_t *p
21467 static void pgd_ctor(void *p)
21470 - unsigned long flags;
21472 pgd_test_and_unpin(pgd);
21474 - /* Clear usermode parts of PGD */
21475 - memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
21477 - spin_lock_irqsave(&pgd_lock, flags);
21479 /* If the pgd points to a shared pagetable level (either the
21480 ptes in non-PAE, or shared PMD in PAE), then just copy the
21481 references from swapper_pg_dir. */
21482 @@ -440,13 +435,9 @@ static void pgd_ctor(void *p)
21483 __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
21486 -#ifndef CONFIG_X86_PAE
21487 /* list required to sync kernel mapping updates */
21488 if (!SHARED_KERNEL_PMD)
21492 - spin_unlock_irqrestore(&pgd_lock, flags);
21495 static void pgd_dtor(void *pgd)
21496 @@ -475,33 +466,6 @@ static void pgd_dtor(void *pgd)
21498 #ifdef CONFIG_X86_PAE
21500 - * Mop up any pmd pages which may still be attached to the pgd.
21501 - * Normally they will be freed by munmap/exit_mmap, but any pmd we
21502 - * preallocate which never got a corresponding vma will need to be
21503 - * freed manually.
21505 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21509 - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
21510 - pgd_t pgd = pgdp[i];
21512 - if (__pgd_val(pgd) != 0) {
21513 - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21515 - pgdp[i] = xen_make_pgd(0);
21517 - paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21518 - pmd_free(mm, pmd);
21522 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21523 - xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21527 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
21528 * updating the top-level pagetable entries to guarantee the
21529 * processor notices the update. Since this is expensive, and
21530 @@ -512,61 +476,7 @@ static void pgd_mop_up_pmds(struct mm_st
21531 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
21532 * and initialize the kernel pmds here.
21534 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21537 - pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
21538 - unsigned long addr, flags;
21542 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
21543 - * allocation). We therefore store virtual addresses of pmds as they
21544 - * do not change across save/restore, and poke the machine addresses
21545 - * into the pgdir under the pgd_lock.
21547 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
21548 - pmds[i] = pmd_alloc_one(mm, addr);
21553 - spin_lock_irqsave(&pgd_lock, flags);
21555 - /* Protect against save/restore: move below 4GB under pgd_lock. */
21556 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21557 - && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21558 - spin_unlock_irqrestore(&pgd_lock, flags);
21561 - pmd_free(mm, pmds[i]);
21565 - /* Copy kernel pmd contents and write-protect the new pmds. */
21566 - pud = pud_offset(pgd, 0);
21567 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
21568 - i++, pud++, addr += PUD_SIZE) {
21569 - if (i >= KERNEL_PGD_BOUNDARY) {
21571 - (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21572 - sizeof(pmd_t) * PTRS_PER_PMD);
21573 - make_lowmem_page_readonly(
21574 - pmds[i], XENFEAT_writable_page_tables);
21577 - /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21578 - pud_populate(mm, pud, pmds[i]);
21581 - /* List required to sync kernel mapping updates and
21582 - * to pin/unpin on save/restore. */
21583 - pgd_list_add(pgd);
21585 - spin_unlock_irqrestore(&pgd_lock, flags);
21589 +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
21591 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
21593 @@ -596,16 +506,101 @@ void pud_populate(struct mm_struct *mm,
21596 #else /* !CONFIG_X86_PAE */
21598 /* No need to prepopulate any pagetable entries in non-PAE modes. */
21599 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21600 +#define PREALLOCATED_PMDS 0
21602 +#endif /* CONFIG_X86_PAE */
21604 +static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
21609 +#ifdef CONFIG_X86_PAE
21611 + xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
21614 + for(i = 0; i < PREALLOCATED_PMDS; i++)
21615 + if (pmds[i])
21616 + pmd_free(mm, pmds[i]);
21619 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
21620 +static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
21623 + bool failed = false;
21625 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21626 + pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
21633 + free_pmds(pmds, mm, false);
21641 + * Mop up any pmd pages which may still be attached to the pgd.
21642 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
21643 + * preallocate which never got a corresponding vma will need to be
21644 + * freed manually.
21646 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21650 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21651 + pgd_t pgd = pgdp[i];
21653 + if (__pgd_val(pgd) != 0) {
21654 + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21656 + pgdp[i] = xen_make_pgd(0);
21658 + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21659 + pmd_free(mm, pmd);
21663 +#ifdef CONFIG_X86_PAE
21664 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21665 + xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21669 +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
21672 + unsigned long addr;
21675 + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
21678 + pud = pud_offset(pgd, 0);
21679 + for (addr = i = 0; i < PREALLOCATED_PMDS;
21680 + i++, pud++, addr += PUD_SIZE) {
21681 + pmd_t *pmd = pmds[i];
21683 + if (i >= KERNEL_PGD_BOUNDARY) {
21685 + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21686 + sizeof(pmd_t) * PTRS_PER_PMD);
21687 + make_lowmem_page_readonly(
21688 + pmd, XENFEAT_writable_page_tables);
21691 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21692 + pud_populate(mm, pud, pmd);
21695 -#endif /* CONFIG_X86_PAE */
21697 #ifdef CONFIG_X86_64
21698 /* We allocate two contiguous pages for kernel and user. */
21699 @@ -616,19 +611,52 @@ static void pgd_mop_up_pmds(struct mm_st
21701 pgd_t *pgd_alloc(struct mm_struct *mm)
21703 - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21705 + pmd_t *pmds[PREALLOCATED_PMDS];
21706 + unsigned long flags;
21708 + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21713 - /* so that alloc_pd can use it */
21718 - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
21719 - free_pages((unsigned long)pgd, PGD_ORDER);
21721 + if (preallocate_pmds(pmds, mm) != 0)
21722 + goto out_free_pgd;
21724 + if (paravirt_pgd_alloc(mm) != 0)
21725 + goto out_free_pmds;
21728 + * Make sure that pre-populating the pmds is atomic with
21729 + * respect to anything walking the pgd_list, so that they
21730 + * never see a partially populated pgd.
21732 + spin_lock_irqsave(&pgd_lock, flags);
21734 +#ifdef CONFIG_X86_PAE
21735 + /* Protect against save/restore: move below 4GB under pgd_lock. */
21736 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21737 + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21738 + spin_unlock_irqrestore(&pgd_lock, flags);
21739 + goto out_free_pmds;
21744 + pgd_prepopulate_pmd(mm, pgd, pmds);
21746 + spin_unlock_irqrestore(&pgd_lock, flags);
21751 + free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
21753 + free_pages((unsigned long)pgd, PGD_ORDER);
21758 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
21759 @@ -644,6 +672,7 @@ void pgd_free(struct mm_struct *mm, pgd_
21762 pgd_mop_up_pmds(mm, pgd);
21763 + paravirt_pgd_free(mm, pgd);
21764 free_pages((unsigned long)pgd, PGD_ORDER);
21767 @@ -685,7 +714,7 @@ int ptep_test_and_clear_young(struct vm_
21769 if (pte_young(*ptep))
21770 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
21772 + (unsigned long *) &ptep->pte);
21775 pte_update(vma->vm_mm, addr, ptep);
21776 @@ -707,3 +736,42 @@ int ptep_clear_flush_young(struct vm_are
21783 +void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21785 + unsigned long address = __fix_to_virt(idx);
21788 + if (idx >= __end_of_fixed_addresses) {
21794 +#ifdef CONFIG_X86_64
21795 + extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
21797 + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
21798 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21799 + set_pte_vaddr_pud(level3_user_pgt, address, pte);
21801 + case FIX_EARLYCON_MEM_BASE:
21802 + xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
21803 + pfn_pte_ma(phys >> PAGE_SHIFT, flags));
21807 + case FIX_WP_TEST:
21809 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21813 + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21816 + set_pte_vaddr(address, pte);
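All branches of xen_set_fixmap() derive the virtual address the same way: fixmap slots grow downward from FIXADDR_TOP, one page per index, via mainline's __fix_to_virt() macro. A worked example with a hypothetical FIXADDR_TOP:

#include <stdio.h>

#define PAGE_SHIFT 12
#define FIXADDR_TOP 0xfffff000UL        /* hypothetical; arch-dependent */

/* Mainline's __fix_to_virt(): slot idx sits idx pages below the top. */
#define __fix_to_virt(idx) (FIXADDR_TOP - ((unsigned long)(idx) << PAGE_SHIFT))

int main(void)
{
        for (unsigned idx = 0; idx < 3; idx++)
                printf("fixmap %u -> %#lx\n", idx, __fix_to_virt(idx));
        return 0;
}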
21819 --- sle11-2009-06-04.orig/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:38:05.000000000 +0100
21820 +++ sle11-2009-06-04/arch/x86/mm/pgtable_32-xen.c 2009-06-04 10:21:39.000000000 +0200
21821 @@ -25,51 +25,49 @@
21822 #include <xen/features.h>
21823 #include <asm/hypervisor.h>
21825 -void show_mem(void)
21827 + * Associate a virtual page frame with a given physical page frame
21828 + * and protection flags for that frame.
21830 +void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
21832 - int total = 0, reserved = 0;
21833 - int shared = 0, cached = 0;
21835 - struct page *page;
21836 - pg_data_t *pgdat;
21838 - unsigned long flags;
21840 - printk(KERN_INFO "Mem-info:\n");
21841 - show_free_areas();
21842 - for_each_online_pgdat(pgdat) {
21843 - pgdat_resize_lock(pgdat, &flags);
21844 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
21845 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
21846 - touch_nmi_watchdog();
21847 - page = pgdat_page_nr(pgdat, i);
21849 - if (PageHighMem(page))
21851 - if (PageReserved(page))
21853 - else if (PageSwapCache(page))
21855 - else if (page_count(page))
21856 - shared += page_count(page) - 1;
21858 - pgdat_resize_unlock(pgdat, &flags);
21860 - printk(KERN_INFO "%d pages of RAM\n", total);
21861 - printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
21862 - printk(KERN_INFO "%d reserved pages\n", reserved);
21863 - printk(KERN_INFO "%d pages shared\n", shared);
21864 - printk(KERN_INFO "%d pages swap cached\n", cached);
21866 - printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
21867 - printk(KERN_INFO "%lu pages writeback\n",
21868 - global_page_state(NR_WRITEBACK));
21869 - printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
21870 - printk(KERN_INFO "%lu pages slab\n",
21871 - global_page_state(NR_SLAB_RECLAIMABLE) +
21872 - global_page_state(NR_SLAB_UNRECLAIMABLE));
21873 - printk(KERN_INFO "%lu pages pagetables\n",
21874 - global_page_state(NR_PAGETABLE));
21875 +#ifndef CONFIG_XEN
21881 + pgd = swapper_pg_dir + pgd_index(vaddr);
21882 + if (pgd_none(*pgd)) {
21886 + pud = pud_offset(pgd, vaddr);
21887 + if (pud_none(*pud)) {
21891 + pmd = pmd_offset(pud, vaddr);
21892 + if (pmd_none(*pmd)) {
21896 + pte = pte_offset_kernel(pmd, vaddr);
21897 + if (pte_val(pteval))
21898 + set_pte_present(&init_mm, vaddr, pte, pteval);
21900 + pte_clear(&init_mm, vaddr, pte);
21903 + * It's enough to flush this one mapping.
21904 + * (PGE mappings get flushed as well)
21906 + __flush_tlb_one(vaddr);
21908 + if (HYPERVISOR_update_va_mapping(vaddr, pteval,
21909 + UVMF_INVLPG|UVMF_ALL))
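The native (!CONFIG_XEN) branch above is the canonical top-down walk — index the pgd by the address, then pud, pmd, and finally the pte, bailing out if an intermediate level is unpopulated — while the Xen branch hands the whole update to HYPERVISOR_update_va_mapping(). A two-level toy showing the indexing arithmetic:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PTRS_PER_TABLE 1024     /* toy 2-level, 32-bit-style layout */

static unsigned long *top[PTRS_PER_TABLE];      /* the "pgd" */
static unsigned long leaf[PTRS_PER_TABLE];      /* one "pte" page */

int main(void)
{
        unsigned long vaddr = 0x00402123;
        unsigned top_idx = vaddr >> (PAGE_SHIFT + 10);          /* bits 31..22 */
        unsigned leaf_idx = (vaddr >> PAGE_SHIFT) & (PTRS_PER_TABLE - 1);

        top[top_idx] = leaf;    /* pretend the mid level is already populated */
        if (!top[top_idx])      /* kernel: BUG()s out on an empty level */
                return 1;
        top[top_idx][leaf_idx] = 0xabc000 | 0x63;       /* pfn | flags */

        printf("vaddr %#lx -> top %u, pte %u, entry %#lx\n",
               vaddr, top_idx, leaf_idx, top[top_idx][leaf_idx]);
        return 0;
}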
21915 @@ -107,35 +105,10 @@ void set_pmd_pfn(unsigned long vaddr, un
21916 __flush_tlb_one(vaddr);
21919 -static int fixmaps;
21920 unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
21921 unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
21922 EXPORT_SYMBOL(__FIXADDR_TOP);
21924 -void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21926 - unsigned long address = __fix_to_virt(idx);
21929 - if (idx >= __end_of_fixed_addresses) {
21934 - case FIX_WP_TEST:
21936 - pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21939 - pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21942 - if (HYPERVISOR_update_va_mapping(address, pte,
21943 - UVMF_INVLPG|UVMF_ALL))
21949 * reserve_top_address - reserves a hole in the top of kernel address space
21950 * @reserve - size of hole to reserve
21951 @@ -145,13 +118,48 @@ void __set_fixmap (enum fixed_addresses
21953 void __init reserve_top_address(unsigned long reserve)
21955 - BUG_ON(fixmaps > 0);
21956 + BUG_ON(fixmaps_set > 0);
21957 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
21959 __FIXADDR_TOP = -reserve - PAGE_SIZE;
21960 __VMALLOC_RESERVE += reserve;
21964 + * vmalloc=size forces the vmalloc area to be exactly 'size'
21965 + * bytes. This can be used to increase (or decrease) the
21966 + * vmalloc area - the default is 128m.
21968 +static int __init parse_vmalloc(char *arg)
21973 + __VMALLOC_RESERVE = memparse(arg, &arg);
21976 +early_param("vmalloc", parse_vmalloc);
21978 +#ifndef CONFIG_XEN
21980 + * reservetop=size reserves a hole at the top of the kernel address space which
21981 + * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
21982 + * so relocating the fixmap can be done before paging initialization.
21984 +static int __init parse_reservetop(char *arg)
21986 + unsigned long address;
21991 + address = memparse(arg, &arg);
21992 + reserve_top_address(address);
21995 +early_param("reservetop", parse_reservetop);
21998 void make_lowmem_page_readonly(void *va, unsigned int feature)
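Both new parameters take memparse()-style sizes, so e.g. vmalloc=192m or reservetop=0x1000000 on the kernel command line. A minimal stand-in for memparse() (the real one also accepts larger suffixes):

#include <stdio.h>
#include <stdlib.h>

/* Minimal memparse() stand-in: a number with an optional K/M/G suffix. */
static unsigned long long memparse_demo(const char *s)
{
        char *end;
        unsigned long long v = strtoull(s, &end, 0);    /* base 0: 0x... works */

        switch (*end) {
        case 'G': case 'g': v <<= 10; /* fall through */
        case 'M': case 'm': v <<= 10; /* fall through */
        case 'K': case 'k': v <<= 10;
        }
        return v;
}

int main(void)
{
        printf("vmalloc=192m        -> %llu bytes\n", memparse_demo("192m"));
        printf("reservetop=0x100000 -> %llu bytes\n", memparse_demo("0x100000"));
        return 0;
}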
22001 --- sle11-2009-06-04.orig/arch/x86/pci/amd_bus.c 2009-06-04 11:08:07.000000000 +0200
22002 +++ sle11-2009-06-04/arch/x86/pci/amd_bus.c 2009-06-04 10:21:39.000000000 +0200
22003 @@ -607,6 +607,14 @@ static int __init pci_io_ecs_init(void)
22004 for_each_online_cpu(cpu)
22005 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
22006 (void *)(long)cpu);
22010 + rdmsrl(MSR_AMD64_NB_CFG, reg);
22011 + if (!(reg & ENABLE_CF8_EXT_CFG))
22015 pci_probe |= PCI_HAS_IO_ECS;
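Under Xen the earlier MSR write enabling extended CF8 may be filtered by the hypervisor, so the added code re-reads MSR_AMD64_NB_CFG and only advertises PCI_HAS_IO_ECS if ENABLE_CF8_EXT_CFG actually stuck. With ECS on, config offsets above 0xff become reachable through ports 0xcf8/0xcfc by folding register bits 11:8 into CF8 bits 27:24 — sketched below on the assumption that this matches mainline's extended-conf1 encoding:

#include <stdio.h>

/* Classic CF8 address plus AMD ECS extension: reg bits 11:8 -> CF8 bits 27:24. */
static unsigned int cf8_ext_address(unsigned bus, unsigned devfn, unsigned reg)
{
        return 0x80000000u | ((reg & 0xF00) << 16) |
               (bus << 16) | (devfn << 8) | (reg & 0xFC);
}

int main(void)
{
        /* Hypothetical device 0:18.3, extended register 0x1A0. */
        printf("CF8 = %#010x\n", cf8_ext_address(0, (0x18 << 3) | 3, 0x1A0));
        return 0;
}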
22018 @@ -614,6 +622,10 @@ static int __init pci_io_ecs_init(void)
22020 static int __init amd_postcore_init(void)
22023 + if (!is_initial_xendomain())
22026 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
22029 --- sle11-2009-06-04.orig/arch/x86/pci/irq-xen.c 2009-03-16 16:38:05.000000000 +0100
22030 +++ sle11-2009-06-04/arch/x86/pci/irq-xen.c 2009-06-04 10:21:39.000000000 +0200
22032 #include <linux/slab.h>
22033 #include <linux/interrupt.h>
22034 #include <linux/dmi.h>
22035 -#include <asm/io.h>
22036 -#include <asm/smp.h>
22037 +#include <linux/io.h>
22038 +#include <linux/smp.h>
22039 #include <asm/io_apic.h>
22040 #include <linux/irq.h>
22041 #include <linux/acpi.h>
22042 @@ -45,7 +45,8 @@ struct irq_router {
22044 u16 vendor, device;
22045 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
22046 - int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
22047 + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
22051 struct irq_router_handler {
22052 @@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_d
22053 * and perform checksum verification.
22056 -static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
22057 +static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
22059 struct irq_routing_table *rt;
22061 @@ -74,10 +75,11 @@ static inline struct irq_routing_table *
22062 rt->size < sizeof(struct irq_routing_table))
22065 - for (i=0; i < rt->size; i++)
22066 + for (i = 0; i < rt->size; i++)
22069 - DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
22070 + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
22075 @@ -104,7 +106,9 @@ static struct irq_routing_table * __init
22077 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
22079 - for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
22080 + for (addr = (u8 *) isa_bus_to_virt(0xf0000);
22081 + addr < (u8 *) isa_bus_to_virt(0x100000);
22083 rt = pirq_check_routing_table(addr);
22086 @@ -126,20 +130,20 @@ static void __init pirq_peer_trick(void)
22087 struct irq_info *e;
22089 memset(busmap, 0, sizeof(busmap));
22090 - for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
22091 + for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
22096 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
22097 - for(j=0; j<4; j++)
22098 + for (j = 0; j < 4; j++)
22099 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
22103 busmap[e->bus] = 1;
22105 - for(i = 1; i < 256; i++) {
22106 + for (i = 1; i < 256; i++) {
22108 if (!busmap[i] || pci_find_bus(0, i))
22110 @@ -187,7 +191,8 @@ static unsigned int read_config_nybble(s
22111 return (nr & 1) ? (x >> 4) : (x & 0xf);
22114 -static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
22115 +static void write_config_nybble(struct pci_dev *router, unsigned offset,
22116 + unsigned nr, unsigned int val)
22119 unsigned reg = offset + (nr >> 1);
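read_config_nybble()/write_config_nybble() pack two PIRQ links per config byte: offset + (nr >> 1) selects the byte, and nr's parity selects the high or low nybble. The same packing in plain C against a fake config space:

#include <stdio.h>

static unsigned char cfg[8];    /* stand-in for PCI config bytes */

static unsigned read_nybble(unsigned offset, unsigned nr)
{
        unsigned char x = cfg[offset + (nr >> 1)];

        return (nr & 1) ? (x >> 4) : (x & 0xf);
}

static void write_nybble(unsigned offset, unsigned nr, unsigned val)
{
        unsigned reg = offset + (nr >> 1);
        unsigned char x = cfg[reg];

        x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | (val & 0xf));
        cfg[reg] = x;
}

int main(void)
{
        write_nybble(0, 0, 0xA);        /* low nybble of byte 0 */
        write_nybble(0, 1, 0x5);        /* high nybble of byte 0 */
        printf("byte0=%#x nyb0=%#x nyb1=%#x\n",
               cfg[0], read_nybble(0, 0), read_nybble(0, 1));
        return 0;
}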
22120 @@ -289,7 +294,7 @@ static int pirq_ite_get(struct pci_dev *
22121 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
22123 WARN_ON_ONCE(pirq > 4);
22124 - return read_config_nybble(router,0x43, pirqmap[pirq-1]);
22125 + return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
22128 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22129 @@ -318,7 +323,7 @@ static int pirq_opti_set(struct pci_dev
22132 * Cyrix: nibble offset 0x5C
22133 - * 0x5C bits 7:4 is INTB bits 3:0 is INTA
22134 + * 0x5C bits 7:4 is INTB bits 3:0 is INTA
22135 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
22137 static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
22138 @@ -354,7 +359,7 @@ static int pirq_cyrix_set(struct pci_dev
22139 * Apparently there are systems implementing PCI routing table using
22140 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
22141 * We try our best to handle both link mappings.
22144 * Currently (2003-05-21) it appears most SiS chipsets follow the
22145 * definition of routing registers from the SiS-5595 southbridge.
22146 * According to the SiS 5595 datasheets the revision id's of the
22147 @@ -374,7 +379,7 @@ static int pirq_cyrix_set(struct pci_dev
22150 * bit 6 OHCI function disabled (0), enabled (1)
22153 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
22155 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
22156 @@ -437,7 +442,7 @@ static int pirq_vlsi_get(struct pci_dev
22158 WARN_ON_ONCE(pirq >= 9);
22160 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
22161 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
22164 return read_config_nybble(router, 0x74, pirq-1);
22165 @@ -447,7 +452,7 @@ static int pirq_vlsi_set(struct pci_dev
22167 WARN_ON_ONCE(pirq >= 9);
22169 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
22170 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
22173 write_config_nybble(router, 0x74, pirq-1, irq);
22174 @@ -471,7 +476,8 @@ static int pirq_serverworks_get(struct p
22175 return inb(0xc01) & 0xf;
22178 -static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22179 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
22180 + int pirq, int irq)
22184 @@ -491,22 +497,20 @@ static int pirq_amd756_get(struct pci_de
22189 irq = read_config_nybble(router, 0x56, pirq - 1);
22191 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
22192 - dev->vendor, dev->device, pirq, irq);
22193 + dev_info(&dev->dev,
22194 + "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
22195 + dev->vendor, dev->device, pirq, irq);
22199 static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22201 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
22202 - dev->vendor, dev->device, pirq, irq);
22203 + dev_info(&dev->dev,
22204 + "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
22205 + dev->vendor, dev->device, pirq, irq);
22208 write_config_nybble(router, 0x56, pirq - 1, irq);
22213 @@ -553,50 +557,51 @@ static __init int intel_router_probe(str
22214 if (pci_dev_present(pirq_440gx))
22219 - case PCI_DEVICE_ID_INTEL_82371FB_0:
22220 - case PCI_DEVICE_ID_INTEL_82371SB_0:
22221 - case PCI_DEVICE_ID_INTEL_82371AB_0:
22222 - case PCI_DEVICE_ID_INTEL_82371MX:
22223 - case PCI_DEVICE_ID_INTEL_82443MX_0:
22224 - case PCI_DEVICE_ID_INTEL_82801AA_0:
22225 - case PCI_DEVICE_ID_INTEL_82801AB_0:
22226 - case PCI_DEVICE_ID_INTEL_82801BA_0:
22227 - case PCI_DEVICE_ID_INTEL_82801BA_10:
22228 - case PCI_DEVICE_ID_INTEL_82801CA_0:
22229 - case PCI_DEVICE_ID_INTEL_82801CA_12:
22230 - case PCI_DEVICE_ID_INTEL_82801DB_0:
22231 - case PCI_DEVICE_ID_INTEL_82801E_0:
22232 - case PCI_DEVICE_ID_INTEL_82801EB_0:
22233 - case PCI_DEVICE_ID_INTEL_ESB_1:
22234 - case PCI_DEVICE_ID_INTEL_ICH6_0:
22235 - case PCI_DEVICE_ID_INTEL_ICH6_1:
22236 - case PCI_DEVICE_ID_INTEL_ICH7_0:
22237 - case PCI_DEVICE_ID_INTEL_ICH7_1:
22238 - case PCI_DEVICE_ID_INTEL_ICH7_30:
22239 - case PCI_DEVICE_ID_INTEL_ICH7_31:
22240 - case PCI_DEVICE_ID_INTEL_ESB2_0:
22241 - case PCI_DEVICE_ID_INTEL_ICH8_0:
22242 - case PCI_DEVICE_ID_INTEL_ICH8_1:
22243 - case PCI_DEVICE_ID_INTEL_ICH8_2:
22244 - case PCI_DEVICE_ID_INTEL_ICH8_3:
22245 - case PCI_DEVICE_ID_INTEL_ICH8_4:
22246 - case PCI_DEVICE_ID_INTEL_ICH9_0:
22247 - case PCI_DEVICE_ID_INTEL_ICH9_1:
22248 - case PCI_DEVICE_ID_INTEL_ICH9_2:
22249 - case PCI_DEVICE_ID_INTEL_ICH9_3:
22250 - case PCI_DEVICE_ID_INTEL_ICH9_4:
22251 - case PCI_DEVICE_ID_INTEL_ICH9_5:
22252 - case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22253 - case PCI_DEVICE_ID_INTEL_ICH10_0:
22254 - case PCI_DEVICE_ID_INTEL_ICH10_1:
22255 - case PCI_DEVICE_ID_INTEL_ICH10_2:
22256 - case PCI_DEVICE_ID_INTEL_ICH10_3:
22257 - r->name = "PIIX/ICH";
22258 - r->get = pirq_piix_get;
22259 - r->set = pirq_piix_set;
22261 + switch (device) {
22262 + case PCI_DEVICE_ID_INTEL_82371FB_0:
22263 + case PCI_DEVICE_ID_INTEL_82371SB_0:
22264 + case PCI_DEVICE_ID_INTEL_82371AB_0:
22265 + case PCI_DEVICE_ID_INTEL_82371MX:
22266 + case PCI_DEVICE_ID_INTEL_82443MX_0:
22267 + case PCI_DEVICE_ID_INTEL_82801AA_0:
22268 + case PCI_DEVICE_ID_INTEL_82801AB_0:
22269 + case PCI_DEVICE_ID_INTEL_82801BA_0:
22270 + case PCI_DEVICE_ID_INTEL_82801BA_10:
22271 + case PCI_DEVICE_ID_INTEL_82801CA_0:
22272 + case PCI_DEVICE_ID_INTEL_82801CA_12:
22273 + case PCI_DEVICE_ID_INTEL_82801DB_0:
22274 + case PCI_DEVICE_ID_INTEL_82801E_0:
22275 + case PCI_DEVICE_ID_INTEL_82801EB_0:
22276 + case PCI_DEVICE_ID_INTEL_ESB_1:
22277 + case PCI_DEVICE_ID_INTEL_ICH6_0:
22278 + case PCI_DEVICE_ID_INTEL_ICH6_1:
22279 + case PCI_DEVICE_ID_INTEL_ICH7_0:
22280 + case PCI_DEVICE_ID_INTEL_ICH7_1:
22281 + case PCI_DEVICE_ID_INTEL_ICH7_30:
22282 + case PCI_DEVICE_ID_INTEL_ICH7_31:
22283 + case PCI_DEVICE_ID_INTEL_ESB2_0:
22284 + case PCI_DEVICE_ID_INTEL_ICH8_0:
22285 + case PCI_DEVICE_ID_INTEL_ICH8_1:
22286 + case PCI_DEVICE_ID_INTEL_ICH8_2:
22287 + case PCI_DEVICE_ID_INTEL_ICH8_3:
22288 + case PCI_DEVICE_ID_INTEL_ICH8_4:
22289 + case PCI_DEVICE_ID_INTEL_ICH9_0:
22290 + case PCI_DEVICE_ID_INTEL_ICH9_1:
22291 + case PCI_DEVICE_ID_INTEL_ICH9_2:
22292 + case PCI_DEVICE_ID_INTEL_ICH9_3:
22293 + case PCI_DEVICE_ID_INTEL_ICH9_4:
22294 + case PCI_DEVICE_ID_INTEL_ICH9_5:
22295 + case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22296 + case PCI_DEVICE_ID_INTEL_ICH10_0:
22297 + case PCI_DEVICE_ID_INTEL_ICH10_1:
22298 + case PCI_DEVICE_ID_INTEL_ICH10_2:
22299 + case PCI_DEVICE_ID_INTEL_ICH10_3:
22300 + case PCI_DEVICE_ID_INTEL_PCH_0:
22301 + case PCI_DEVICE_ID_INTEL_PCH_1:
22302 + r->name = "PIIX/ICH";
22303 + r->get = pirq_piix_get;
22304 + r->set = pirq_piix_set;
22309 @@ -610,7 +615,7 @@ static __init int via_router_probe(struc
22310 * workarounds for some buggy BIOSes
22312 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
22313 - switch(router->device) {
22314 + switch (router->device) {
22315 case PCI_DEVICE_ID_VIA_82C686:
22317 * Asus k7m bios wrongly reports 82C686A
22318 @@ -635,7 +640,7 @@ static __init int via_router_probe(struc
22323 + switch (device) {
22324 case PCI_DEVICE_ID_VIA_82C586_0:
22326 r->get = pirq_via586_get;
22327 @@ -658,28 +663,27 @@ static __init int via_router_probe(struc
22329 static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22333 - case PCI_DEVICE_ID_VLSI_82C534:
22334 - r->name = "VLSI 82C534";
22335 - r->get = pirq_vlsi_get;
22336 - r->set = pirq_vlsi_set;
22338 + switch (device) {
22339 + case PCI_DEVICE_ID_VLSI_82C534:
22340 + r->name = "VLSI 82C534";
22341 + r->get = pirq_vlsi_get;
22342 + r->set = pirq_vlsi_set;
22349 -static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22350 +static __init int serverworks_router_probe(struct irq_router *r,
22351 + struct pci_dev *router, u16 device)
22355 - case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22356 - case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22357 - r->name = "ServerWorks";
22358 - r->get = pirq_serverworks_get;
22359 - r->set = pirq_serverworks_set;
22361 + switch (device) {
22362 + case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22363 + case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22364 + r->name = "ServerWorks";
22365 + r->get = pirq_serverworks_get;
22366 + r->set = pirq_serverworks_set;
22371 @@ -688,7 +692,7 @@ static __init int sis_router_probe(struc
22373 if (device != PCI_DEVICE_ID_SI_503)
22378 r->get = pirq_sis_get;
22379 r->set = pirq_sis_set;
22380 @@ -697,50 +701,45 @@ static __init int sis_router_probe(struc
22382 static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22386 - case PCI_DEVICE_ID_CYRIX_5520:
22387 - r->name = "NatSemi";
22388 - r->get = pirq_cyrix_get;
22389 - r->set = pirq_cyrix_set;
22391 + switch (device) {
22392 + case PCI_DEVICE_ID_CYRIX_5520:
22393 + r->name = "NatSemi";
22394 + r->get = pirq_cyrix_get;
22395 + r->set = pirq_cyrix_set;
22401 static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22405 - case PCI_DEVICE_ID_OPTI_82C700:
22406 - r->name = "OPTI";
22407 - r->get = pirq_opti_get;
22408 - r->set = pirq_opti_set;
22410 + switch (device) {
22411 + case PCI_DEVICE_ID_OPTI_82C700:
22412 + r->name = "OPTI";
22413 + r->get = pirq_opti_get;
22414 + r->set = pirq_opti_set;
22420 static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22424 - case PCI_DEVICE_ID_ITE_IT8330G_0:
22426 - r->get = pirq_ite_get;
22427 - r->set = pirq_ite_set;
22429 + switch (device) {
22430 + case PCI_DEVICE_ID_ITE_IT8330G_0:
22432 + r->get = pirq_ite_get;
22433 + r->set = pirq_ite_set;
22439 static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22443 + switch (device) {
22444 case PCI_DEVICE_ID_AL_M1533:
22445 case PCI_DEVICE_ID_AL_M1563:
22446 - printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
22448 r->get = pirq_ali_get;
22449 r->set = pirq_ali_set;
22450 @@ -751,25 +750,24 @@ static __init int ali_router_probe(struc
22452 static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22456 - case PCI_DEVICE_ID_AMD_VIPER_740B:
22457 - r->name = "AMD756";
22459 - case PCI_DEVICE_ID_AMD_VIPER_7413:
22460 - r->name = "AMD766";
22462 - case PCI_DEVICE_ID_AMD_VIPER_7443:
22463 - r->name = "AMD768";
22467 + switch (device) {
22468 + case PCI_DEVICE_ID_AMD_VIPER_740B:
22469 + r->name = "AMD756";
22471 + case PCI_DEVICE_ID_AMD_VIPER_7413:
22472 + r->name = "AMD766";
22474 + case PCI_DEVICE_ID_AMD_VIPER_7443:
22475 + r->name = "AMD768";
22480 r->get = pirq_amd756_get;
22481 r->set = pirq_amd756_set;
22486 static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22489 @@ -811,7 +809,7 @@ static struct pci_dev *pirq_router_dev;
22490 * FIXME: should we have an option to say "generic for
22495 static void __init pirq_find_router(struct irq_router *r)
22497 struct irq_routing_table *rt = pirq_table;
22498 @@ -830,7 +828,7 @@ static void __init pirq_find_router(stru
22499 r->name = "default";
22504 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
22505 rt->rtr_vendor, rt->rtr_device);
22507 @@ -841,19 +839,19 @@ static void __init pirq_find_router(stru
22511 - for( h = pirq_routers; h->vendor; h++) {
22512 + for (h = pirq_routers; h->vendor; h++) {
22513 /* First look for a router match */
22514 - if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
22515 + if (rt->rtr_vendor == h->vendor &&
22516 + h->probe(r, pirq_router_dev, rt->rtr_device))
22518 /* Fall back to a device match */
22519 - if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
22520 + if (pirq_router_dev->vendor == h->vendor &&
22521 + h->probe(r, pirq_router_dev, pirq_router_dev->device))
22524 - printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
22525 - pirq_router.name,
22526 - pirq_router_dev->vendor,
22527 - pirq_router_dev->device,
22528 - pci_name(pirq_router_dev));
22529 + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
22530 + pirq_router.name,
22531 + pirq_router_dev->vendor, pirq_router_dev->device);
22533 /* The device remains referenced for the kernel lifetime */
22535 @@ -861,11 +859,13 @@ static void __init pirq_find_router(stru
22536 static struct irq_info *pirq_get_info(struct pci_dev *dev)
22538 struct irq_routing_table *rt = pirq_table;
22539 - int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
22540 + int entries = (rt->size - sizeof(struct irq_routing_table)) /
22541 + sizeof(struct irq_info);
22542 struct irq_info *info;
22544 for (info = rt->slots; entries--; info++)
22545 - if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22546 + if (info->bus == dev->bus->number &&
22547 + PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22551 @@ -884,7 +884,7 @@ static int pcibios_lookup_irq(struct pci
22553 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
22555 - DBG(KERN_DEBUG " -> no interrupt pin\n");
22556 + dev_dbg(&dev->dev, "no interrupt pin\n");
22560 @@ -893,20 +893,21 @@ static int pcibios_lookup_irq(struct pci
22565 - DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
22567 info = pirq_get_info(dev);
22569 - DBG(" -> not found in routing table\n" KERN_DEBUG);
22570 + dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
22574 pirq = info->irq[pin].link;
22575 mask = info->irq[pin].bitmap;
22577 - DBG(" -> not routed\n" KERN_DEBUG);
22578 + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
22581 - DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
22582 + dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
22583 + 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
22584 mask &= pcibios_irq_mask;
22586 /* Work around broken HP Pavilion Notebooks which assign USB to
22587 @@ -919,7 +920,8 @@ static int pcibios_lookup_irq(struct pci
22590 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
22591 - if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
22592 + if (acer_tm360_irqrouting && dev->irq == 11 &&
22593 + dev->vendor == PCI_VENDOR_ID_O2) {
22596 dev->irq = r->get(pirq_router_dev, dev, pirq);
22597 @@ -932,51 +934,50 @@ static int pcibios_lookup_irq(struct pci
22600 if (newirq && !((1 << newirq) & mask)) {
22601 - if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
22602 - else printk("\n" KERN_WARNING
22603 - "PCI: IRQ %i for device %s doesn't match PIRQ mask "
22604 - "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
22606 + if (pci_probe & PCI_USE_PIRQ_MASK)
22609 + dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
22610 + "%#x; try pci=usepirqmask\n", newirq, mask);
22612 if (!newirq && assign) {
22613 for (i = 0; i < 16; i++) {
22614 if (!(mask & (1 << i)))
22616 - if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
22617 + if (pirq_penalty[i] < pirq_penalty[newirq] &&
22618 + can_request_irq(i, IRQF_SHARED))
22622 - DBG(" -> newirq=%d", newirq);
22623 + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
22625 /* Check if it is hardcoded */
22626 if ((pirq & 0xf0) == 0xf0) {
22628 - DBG(" -> hardcoded IRQ %d\n", irq);
22629 - msg = "Hardcoded";
22630 - } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22631 - ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
22632 - DBG(" -> got IRQ %d\n", irq);
22634 + msg = "hardcoded";
22635 + } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22636 + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
22638 eisa_set_level_irq(irq);
22639 - } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22640 - DBG(" -> assigning IRQ %d", newirq);
22641 + } else if (newirq && r->set &&
22642 + (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22643 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
22644 eisa_set_level_irq(newirq);
22645 - DBG(" ... OK\n");
22646 - msg = "Assigned";
22647 + msg = "assigned";
22653 - DBG(" ... failed\n");
22654 if (newirq && mask == (1 << newirq)) {
22660 + dev_dbg(&dev->dev, "can't route interrupt\n");
22664 - printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
22665 + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
22667 /* Update IRQ for all devices with the same pirq value */
22668 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
22669 @@ -988,20 +989,25 @@ static int pcibios_lookup_irq(struct pci
22672 if (info->irq[pin].link == pirq) {
22673 - /* We refuse to override the dev->irq information. Give a warning! */
22674 - if ( dev2->irq && dev2->irq != irq && \
22676 + * We refuse to override the dev->irq
22677 + * information. Give a warning!
22679 + if (dev2->irq && dev2->irq != irq && \
22680 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
22681 - ((1 << dev2->irq) & mask)) ) {
22682 + ((1 << dev2->irq) & mask))) {
22683 #ifndef CONFIG_PCI_MSI
22684 - printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
22685 - pci_name(dev2), dev2->irq, irq);
22686 + dev_info(&dev2->dev, "IRQ routing conflict: "
22687 + "have IRQ %d, want IRQ %d\n",
22695 pirq_penalty[irq]++;
22697 - printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
22698 + dev_info(&dev->dev, "sharing IRQ %d with %s\n",
22699 + irq, pci_name(dev2));
22703 @@ -1015,15 +1021,20 @@ static void __init pcibios_fixup_irqs(vo
22704 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
22705 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
22707 - * If the BIOS has set an out of range IRQ number, just ignore it.
22708 - * Also keep track of which IRQ's are already in use.
22709 + * If the BIOS has set an out of range IRQ number, just
22710 + * ignore it. Also keep track of which IRQ's are
22711 + * already in use.
22713 if (dev->irq >= 16) {
22714 - DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
22715 + dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
22718 - /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
22719 - if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
22721 + * If the IRQ is already assigned to a PCI device,
22722 + * ignore its ISA use penalty
22724 + if (pirq_penalty[dev->irq] >= 100 &&
22725 + pirq_penalty[dev->irq] < 100000)
22726 pirq_penalty[dev->irq] = 0;
22727 pirq_penalty[dev->irq]++;
22729 @@ -1035,13 +1046,17 @@ static void __init pcibios_fixup_irqs(vo
22731 * Recalculate IRQ numbers if we use the I/O APIC.
22733 - if (io_apic_assign_pci_irqs)
22735 + if (io_apic_assign_pci_irqs) {
22739 - pin--; /* interrupt pins are numbered starting from 1 */
22740 - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
22742 + * interrupt pins are numbered starting
22746 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
22747 + PCI_SLOT(dev->devfn), pin);
22749 * Busses behind bridges are typically not listed in the MP-table.
22750 * In this case we have to look up the IRQ based on the parent bus,
22751 @@ -1049,18 +1064,18 @@ static void __init pcibios_fixup_irqs(vo
22752 * busses itself so we should get into this branch reliably.
22754 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22755 - struct pci_dev * bridge = dev->bus->self;
22756 + struct pci_dev *bridge = dev->bus->self;
22758 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22759 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22760 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22761 PCI_SLOT(bridge->devfn), pin);
22763 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22764 - pci_name(bridge), 'A' + pin, irq);
22765 + dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
22766 + pci_name(bridge),
22770 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22771 - pci_name(dev), 'A' + pin, irq);
22772 + dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
22776 @@ -1082,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq
22778 if (!broken_hp_bios_irq9) {
22779 broken_hp_bios_irq9 = 1;
22780 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22781 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22786 @@ -1095,7 +1111,8 @@ static int __init fix_acer_tm360_irqrout
22788 if (!acer_tm360_irqrouting) {
22789 acer_tm360_irqrouting = 1;
22790 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22791 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22796 @@ -1107,7 +1124,8 @@ static struct dmi_system_id __initdata p
22798 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
22799 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
22800 - DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
22801 + DMI_MATCH(DMI_PRODUCT_VERSION,
22802 + "HP Pavilion Notebook Model GE"),
22803 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
22806 @@ -1122,7 +1140,7 @@ static struct dmi_system_id __initdata p
22810 -static int __init pcibios_irq_init(void)
22811 +int __init pcibios_irq_init(void)
22813 DBG(KERN_DEBUG "PCI: IRQ init\n");
22815 @@ -1142,11 +1160,14 @@ static int __init pcibios_irq_init(void)
22816 pirq_find_router(&pirq_router);
22817 if (pirq_table->exclusive_irqs) {
22819 - for (i=0; i<16; i++)
22820 + for (i = 0; i < 16; i++)
22821 if (!(pirq_table->exclusive_irqs & (1 << i)))
22822 pirq_penalty[i] += 100;
22824 - /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
22826 + * If we're using the I/O APIC, avoid using the PCI IRQ
22829 if (io_apic_assign_pci_irqs)
22832 @@ -1157,9 +1178,6 @@ static int __init pcibios_irq_init(void)
22836 -subsys_initcall(pcibios_irq_init);
22839 static void pirq_penalize_isa_irq(int irq, int active)
22842 @@ -1193,7 +1211,7 @@ static int pirq_enable_irq(struct pci_de
22843 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
22846 - pin--; /* interrupt pins are numbered starting from 1 */
22847 + pin--; /* interrupt pins are numbered starting from 1 */
22849 if (io_apic_assign_pci_irqs) {
22851 @@ -1207,35 +1225,41 @@ static int pirq_enable_irq(struct pci_de
22854 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22855 - struct pci_dev * bridge = dev->bus->self;
22856 + struct pci_dev *bridge = dev->bus->self;
22858 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22859 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22860 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22861 PCI_SLOT(bridge->devfn), pin);
22863 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22864 - pci_name(bridge), 'A' + pin, irq);
22865 + dev_warn(&dev->dev, "using bridge %s "
22866 + "INT %c to get IRQ %d\n",
22867 + pci_name(bridge), 'A' + pin,
22873 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22874 - pci_name(dev), 'A' + pin, irq);
22875 + dev_info(&dev->dev, "PCI->APIC IRQ transform: "
22876 + "INT %c -> IRQ %d\n", 'A' + pin, irq);
22880 - msg = " Probably buggy MP table.";
22881 + msg = "; probably buggy MP table";
22882 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
22885 - msg = " Please try using pci=biosirq.";
22886 + msg = "; please try using pci=biosirq";
22888 - /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
22889 - if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
22891 + * With IDE legacy devices the IRQ lookup failure is not
22894 + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
22895 + !(dev->class & 0x5))
22898 - printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
22899 - 'A' + pin, pci_name(dev), msg);
22900 + dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
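Most of the pcibios_lookup_irq()/pirq_enable_irq() churn above is a conversion from bare printk() to the dev_warn()/dev_info()/dev_dbg() helpers, which derive the "<driver> <bus-id>: " prefix from the struct device instead of requiring pci_name() in every format string. A minimal sketch of the difference (the mismatch message is taken from the hunk; the wrapper function is illustrative):

#include <linux/pci.h>

static void report_pirq_mismatch(struct pci_dev *dev, int newirq,
				 unsigned int mask)
{
	/* old style: device identified by hand in the format string */
	printk(KERN_WARNING "PCI: IRQ %d for device %s doesn't match "
	       "PIRQ mask - try pci=usepirqmask\n", newirq, pci_name(dev));

	/* new style: the device prefix comes from dev->dev automatically */
	dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask %#x; "
		 "try pci=usepirqmask\n", newirq, mask);
}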
22905 --- sle11-2009-06-04.orig/arch/x86/vdso/Makefile 2009-03-16 16:33:40.000000000 +0100
22906 +++ sle11-2009-06-04/arch/x86/vdso/Makefile 2009-06-04 10:21:39.000000000 +0200
22907 @@ -65,9 +65,7 @@ obj-$(VDSO32-y) += vdso32-syms.lds
22908 vdso32.so-$(VDSO32-y) += int80
22909 vdso32.so-$(CONFIG_COMPAT) += syscall
22910 vdso32.so-$(VDSO32-y) += sysenter
22911 -xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
22912 -xen-vdso32-$(CONFIG_X86_32) += syscall
22913 -vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
22914 +vdso32.so-$(CONFIG_X86_XEN) += syscall
22916 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
22918 --- sle11-2009-06-04.orig/arch/x86/vdso/vdso32.S 2009-03-16 16:33:40.000000000 +0100
22919 +++ sle11-2009-06-04/arch/x86/vdso/vdso32.S 2009-06-04 10:21:39.000000000 +0200
22920 @@ -9,7 +9,7 @@ vdso32_int80_end:
22922 .globl vdso32_syscall_start, vdso32_syscall_end
22923 vdso32_syscall_start:
22924 -#ifdef CONFIG_COMPAT
22925 +#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
22926 .incbin "arch/x86/vdso/vdso32-syscall.so"
22928 vdso32_syscall_end:
22929 @@ -19,16 +19,4 @@ vdso32_sysenter_start:
22930 .incbin "arch/x86/vdso/vdso32-sysenter.so"
22931 vdso32_sysenter_end:
22933 -#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
22934 - .globl vdso32_int80_start, vdso32_int80_end
22935 -vdso32_int80_start:
22936 - .incbin "arch/x86/vdso/vdso32-int80.so"
22938 -#elif defined(CONFIG_X86_XEN)
22939 - .globl vdso32_syscall_start, vdso32_syscall_end
22940 -vdso32_syscall_start:
22941 - .incbin "arch/x86/vdso/vdso32-syscall.so"
22942 -vdso32_syscall_end:
22946 --- sle11-2009-06-04.orig/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:38:05.000000000 +0100
22947 +++ sle11-2009-06-04/arch/x86/vdso/vdso32-setup-xen.c 2009-06-04 10:21:39.000000000 +0200
22948 @@ -195,50 +195,28 @@ static __init void relocate_vdso(Elf32_E
22953 - * These symbols are defined by vdso32.S to mark the bounds
22954 - * of the ELF DSO images included therein.
22956 -extern const char vdso32_default_start, vdso32_default_end;
22957 -extern const char vdso32_sysenter_start, vdso32_sysenter_end;
22958 static struct page *vdso32_pages[1];
22960 #ifdef CONFIG_X86_64
22962 -#if CONFIG_XEN_COMPAT < 0x030200
22963 -static int use_int80 = 1;
22965 -static int use_sysenter __read_mostly = -1;
22967 -#define vdso32_sysenter() (use_sysenter > 0)
22968 +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
22969 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
22971 -/* May not be __init: called during resume */
22972 -void syscall32_cpu_init(void)
22973 +void __cpuinit syscall32_cpu_init(void)
22975 - static const struct callback_register cstar = {
22976 + static const struct callback_register __cpuinitconst cstar = {
22977 .type = CALLBACKTYPE_syscall32,
22978 .address = (unsigned long)ia32_cstar_target
22980 - static const struct callback_register sysenter = {
22981 + static const struct callback_register __cpuinitconst sysenter = {
22982 .type = CALLBACKTYPE_sysenter,
22983 .address = (unsigned long)ia32_sysenter_target
22986 - if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
22987 - (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
22988 -#if CONFIG_XEN_COMPAT < 0x030200
22995 - if (use_sysenter < 0) {
22996 - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
22997 - use_sysenter = 1;
22998 - if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
22999 - use_sysenter = 1;
23001 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
23002 + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
23003 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)
23004 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
23007 #define compat_uses_vma 1
23008 @@ -250,6 +228,7 @@ static inline void map_compat_vdso(int m
23009 #else /* CONFIG_X86_32 */
23011 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
23012 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
23014 extern asmlinkage void ia32pv_cstar_target(void);
23015 static const struct callback_register __cpuinitconst cstar = {
23016 @@ -265,13 +244,13 @@ void __cpuinit enable_sep_cpu(void)
23017 .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
23020 - if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
23021 + if (vdso32_syscall()) {
23022 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
23027 - if (!boot_cpu_has(X86_FEATURE_SEP))
23028 + if (!vdso32_sysenter())
23031 if (xen_feature(XENFEAT_supervisor_mode_kernel))
23032 @@ -341,34 +320,26 @@ int __init sysenter_setup(void)
23034 #ifdef CONFIG_X86_32
23038 -#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
23040 - extern const char vdso32_int80_start, vdso32_int80_end;
23042 - vsyscall = &vdso32_int80_start;
23043 - vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
23045 -#elif defined(CONFIG_X86_32)
23046 - if (boot_cpu_has(X86_FEATURE_SYSCALL)
23047 - && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
23048 - || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
23049 - setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
23050 - barrier(); /* until clear_bit()'s constraints are correct ... */
23051 if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
23052 - extern const char vdso32_syscall_start, vdso32_syscall_end;
23054 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
23055 + && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
23056 + setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
23058 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
23059 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
23063 + if (vdso32_syscall()) {
23064 vsyscall = &vdso32_syscall_start;
23065 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
23068 - if (!vdso32_sysenter()) {
23069 - vsyscall = &vdso32_default_start;
23070 - vsyscall_len = &vdso32_default_end - &vdso32_default_start;
23072 + } else if (vdso32_sysenter()) {
23073 vsyscall = &vdso32_sysenter_start;
23074 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
23076 + vsyscall = &vdso32_int80_start;
23077 + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
23080 memcpy(syscall_page, vsyscall, vsyscall_len);
23081 --- sle11-2009-06-04.orig/arch/x86/xen/Kconfig 2009-02-16 16:17:21.000000000 +0100
23082 +++ sle11-2009-06-04/arch/x86/xen/Kconfig 2009-06-04 10:21:39.000000000 +0200
23083 @@ -17,7 +17,7 @@ config XEN_MAX_DOMAIN_MEMORY
23084 int "Maximum allowed size of a domain in gigabytes"
23085 default 8 if X86_32
23086 default 32 if X86_64
23088 + depends on PARAVIRT_XEN
23090 The pseudo-physical to machine address array is sized
23091 according to the maximum possible memory size of a Xen
23092 @@ -26,5 +26,5 @@ config XEN_MAX_DOMAIN_MEMORY
23094 config XEN_SAVE_RESTORE
23097 + depends on PARAVIRT_XEN && PM
23099 \ No newline at end of file
23100 --- sle11-2009-06-04.orig/drivers/acpi/processor_core.c 2009-03-16 16:38:05.000000000 +0100
23101 +++ sle11-2009-06-04/drivers/acpi/processor_core.c 2009-06-04 10:21:39.000000000 +0200
23102 @@ -721,9 +721,11 @@ static int __cpuinit acpi_processor_star
23106 - sysdev = get_cpu_sysdev(pr->id);
23107 - if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
23109 + if (pr->id != -1) {
23110 + sysdev = get_cpu_sysdev(pr->id);
23111 + if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
23115 status = acpi_install_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
23116 acpi_processor_notify, pr);
23117 @@ -895,7 +897,8 @@ static int acpi_processor_remove(struct
23118 status = acpi_remove_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
23119 acpi_processor_notify);
23121 - sysfs_remove_link(&device->dev.kobj, "sysdev");
23122 + if (pr->id != -1)
23123 + sysfs_remove_link(&device->dev.kobj, "sysdev");
23125 acpi_processor_remove_fs(device);
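The processor_core.c hunks guard the sysfs "sysdev" link with pr->id != -1: under Xen, ACPI can enumerate processor objects for CPUs the kernel does not own, and those carry an id of -1 for which get_cpu_sysdev() has no answer. The essential pattern, sketched (error handling abbreviated; the label is hypothetical):

	if (pr->id != -1) {
		struct sys_device *sysdev = get_cpu_sysdev(pr->id);

		if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj,
				      "sysdev"))
			goto err;	/* hypothetical error path */
	}
	/* ... and teardown must stay symmetric: */
	if (pr->id != -1)
		sysfs_remove_link(&device->dev.kobj, "sysdev");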
23127 --- sle11-2009-06-04.orig/drivers/char/tpm/tpm_vtpm.c 2009-02-16 15:58:14.000000000 +0100
23128 +++ sle11-2009-06-04/drivers/char/tpm/tpm_vtpm.c 2009-06-04 10:21:39.000000000 +0200
23129 @@ -347,7 +347,7 @@ static int _vtpm_send_queued(struct tpm_
23134 + unsigned long flags;
23135 unsigned char buffer[1];
23136 struct vtpm_state *vtpms;
23137 vtpms = (struct vtpm_state *)chip_get_private(chip);
23138 --- sle11-2009-06-04.orig/drivers/misc/Kconfig 2009-06-04 11:08:07.000000000 +0200
23139 +++ sle11-2009-06-04/drivers/misc/Kconfig 2009-06-04 10:21:39.000000000 +0200
23140 @@ -438,7 +438,7 @@ config ENCLOSURE_SERVICES
23142 tristate "Support communication between SGI SSIs"
23144 - depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP
23145 + depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP && !XEN
23146 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
23147 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
23148 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
23149 @@ -465,7 +465,7 @@ config HP_ILO
23152 tristate "SGI GRU driver"
23153 - depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP
23154 + depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP && !XEN
23156 select MMU_NOTIFIER
23158 --- sle11-2009-06-04.orig/drivers/pci/msi-xen.c 2009-03-16 16:38:05.000000000 +0100
23159 +++ sle11-2009-06-04/drivers/pci/msi-xen.c 2009-06-04 10:21:39.000000000 +0200
23160 @@ -90,12 +90,10 @@ arch_teardown_msi_irqs(struct pci_dev *d
23164 -static void msi_set_enable(struct pci_dev *dev, int enable)
23165 +static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
23170 - pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
23172 pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
23173 control &= ~PCI_MSI_FLAGS_ENABLE;
23174 @@ -105,6 +103,11 @@ static void msi_set_enable(struct pci_de
23178 +static void msi_set_enable(struct pci_dev *dev, int enable)
23180 + __msi_set_enable(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), enable);
23183 static void msix_set_enable(struct pci_dev *dev, int enable)
23186 @@ -568,9 +571,8 @@ int pci_enable_msi(struct pci_dev* dev)
23188 /* Check whether driver already requested for MSI-X irqs */
23189 if (dev->msix_enabled) {
23190 - printk(KERN_INFO "PCI: %s: Can't enable MSI. "
23191 - "Device already has MSI-X enabled\n",
23193 + dev_info(&dev->dev, "can't enable MSI "
23194 + "(MSI-X already enabled)\n");
23198 @@ -702,9 +704,8 @@ int pci_enable_msix(struct pci_dev* dev,
23200 /* Check whether driver already requested for MSI vector */
23201 if (dev->msi_enabled) {
23202 - printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
23203 - "Device already has an MSI irq assigned\n",
23205 + dev_info(&dev->dev, "can't enable MSI-X "
23206 + "(MSI IRQ already assigned)\n");
23210 --- sle11-2009-06-04.orig/drivers/pci/quirks.c 2009-06-04 11:08:07.000000000 +0200
23211 +++ sle11-2009-06-04/drivers/pci/quirks.c 2009-06-04 10:21:39.000000000 +0200
23212 @@ -44,9 +44,8 @@ static void __devinit quirk_release_reso
23213 /* PCI Host Bridge isn't a target device */
23217 - "PCI: Disable memory decoding and release memory resources [%s].\n",
23219 + dev_info(&dev->dev,
23220 + "disable memory decoding and release memory resources\n");
23221 pci_read_config_word(dev, PCI_COMMAND, &command);
23222 command &= ~PCI_COMMAND_MEMORY;
23223 pci_write_config_word(dev, PCI_COMMAND, command);
23224 --- sle11-2009-06-04.orig/drivers/pci/setup-res.c 2009-06-04 11:08:07.000000000 +0200
23225 +++ sle11-2009-06-04/drivers/pci/setup-res.c 2009-06-04 10:21:39.000000000 +0200
23226 @@ -129,7 +129,7 @@ int pci_claim_resource(struct pci_dev *d
23227 #ifdef CONFIG_PCI_REASSIGN
23228 void pci_disable_bridge_window(struct pci_dev *dev)
23230 - printk(KERN_DEBUG "PCI: Disable bridge window on %s\n", pci_name(dev));
23231 + dev_dbg(&dev->dev, "disable bridge window\n");
23233 /* MMIO Base/Limit */
23234 pci_write_config_dword(dev, PCI_MEMORY_BASE, 0x0000fff0);
23235 @@ -190,8 +190,8 @@ int pci_assign_resource(struct pci_dev *
23236 res->flags &= ~IORESOURCE_STARTALIGN;
23237 if (resno < PCI_BRIDGE_RESOURCES) {
23238 #ifdef CONFIG_PCI_REASSIGN
23239 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23240 - "%016llx - %016llx\n", resno, pci_name(dev),
23241 + dev_dbg(&dev->dev, "assign resource(%d) "
23242 + "%016llx - %016llx\n", resno,
23243 (unsigned long long)res->start,
23244 (unsigned long long)res->end);
23246 @@ -235,8 +235,8 @@ int pci_assign_resource_fixed(struct pci
23247 (unsigned long long)res->end);
23248 } else if (resno < PCI_BRIDGE_RESOURCES) {
23249 #ifdef CONFIG_PCI_REASSIGN
23250 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23251 - "%016llx - %016llx\n", resno, pci_name(dev),
23252 + dev_dbg(&dev->dev, "assign resource(%d) "
23253 + "%016llx - %016llx\n", resno,
23254 (unsigned long long)res->start,
23255 (unsigned long long)res->end);
23257 --- sle11-2009-06-04.orig/drivers/xen/Makefile 2009-03-16 16:38:05.000000000 +0100
23258 +++ sle11-2009-06-04/drivers/xen/Makefile 2009-06-04 10:21:39.000000000 +0200
23260 -obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
23261 +obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o
23262 xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
23263 xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
23265 --- sle11-2009-06-04.orig/drivers/xen/balloon/sysfs.c 2009-03-16 16:33:40.000000000 +0100
23266 +++ sle11-2009-06-04/drivers/xen/balloon/sysfs.c 2009-06-04 10:21:39.000000000 +0200
23269 #define BALLOON_SHOW(name, format, args...) \
23270 static ssize_t show_##name(struct sys_device *dev, \
23271 + struct sysdev_attribute *attr, \
23274 return sprintf(buf, format, ##args); \
23275 @@ -59,14 +60,15 @@ BALLOON_SHOW(hard_limit_kb,
23276 (bs.hard_limit!=~0UL) ? PAGES2KB(bs.hard_limit) : 0);
23277 BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
23279 -static ssize_t show_target_kb(struct sys_device *dev, char *buf)
23280 +static ssize_t show_target_kb(struct sys_device *dev,
23281 + struct sysdev_attribute *attr, char *buf)
23283 return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
23286 static ssize_t store_target_kb(struct sys_device *dev,
23289 + struct sysdev_attribute *attr,
23290 + const char *buf, size_t count)
23292 char memstring[64], *endchar;
23293 unsigned long long target_bytes;
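These balloon sysfs changes track the 2.6.27 sysdev API: show/store callbacks on a struct sys_device now receive the struct sysdev_attribute being accessed, so one handler can serve several attributes. Sketch of the new callback shape and its declaration (store_target_kb is the driver's own handler from the hunk):

static ssize_t show_target_kb(struct sys_device *dev,
			      struct sysdev_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
}

static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
		   show_target_kb, store_target_kb);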
23294 --- sle11-2009-06-04.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:40:14.000000000 +0200
23295 +++ sle11-2009-06-04/drivers/xen/blktap/blktap.c 2009-06-04 10:21:39.000000000 +0200
23297 #include <linux/gfp.h>
23298 #include <linux/poll.h>
23299 #include <linux/delay.h>
23300 +#include <linux/nsproxy.h>
23301 #include <asm/tlbflush.h>
23303 #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
23304 @@ -498,7 +499,7 @@ found:
23306 if ((class = get_xen_class()) != NULL)
23307 device_create(class, NULL, MKDEV(blktap_major, minor),
23308 - "blktap%d", minor);
23309 + NULL, "blktap%d", minor);
23313 @@ -1683,7 +1684,8 @@ static int __init blkif_init(void)
23314 * We only create the device when a request of a new device is
23317 - device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
23318 + device_create(class, NULL, MKDEV(blktap_major, 0), NULL,
23321 /* this is bad, but not fatal */
23322 WPRINTK("blktap: sysfs xen_class not created\n");
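The extra NULL in these device_create() calls is the new drvdata argument: in 2.6.27 device_create_drvdata() was folded back into device_create(), which now stores a driver-private pointer before the name format. blktap keeps no per-device state, hence NULL. A sketch of a caller that does use it (the state type and helper are hypothetical):

struct tap_state;	/* hypothetical per-device state */

static struct device *create_tap_node(struct class *cls, int major,
				      int minor, struct tap_state *st)
{
	/* st is retrievable later via dev_get_drvdata() */
	return device_create(cls, NULL, MKDEV(major, minor),
			     st, "blktap%d", minor);
}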
23323 --- sle11-2009-06-04.orig/drivers/xen/char/mem.c 2009-03-16 16:38:05.000000000 +0100
23324 +++ sle11-2009-06-04/drivers/xen/char/mem.c 2009-06-04 10:21:39.000000000 +0200
23325 @@ -35,7 +35,7 @@ static inline int uncached_access(struct
23327 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
23329 -#ifdef CONFIG_NONPROMISC_DEVMEM
23330 +#ifdef CONFIG_STRICT_DEVMEM
23331 u64 from = ((u64)pfn) << PAGE_SHIFT;
23332 u64 to = from + size;
23334 @@ -172,7 +172,10 @@ static void mmap_mem_close(struct vm_are
23336 static struct vm_operations_struct mmap_mem_ops = {
23337 .open = mmap_mem_open,
23338 - .close = mmap_mem_close
23339 + .close = mmap_mem_close,
23340 +#ifdef CONFIG_HAVE_IOREMAP_PROT
23341 + .access = generic_access_phys
23345 static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
23346 --- sle11-2009-06-04.orig/drivers/xen/console/console.c 2009-03-16 16:38:05.000000000 +0100
23347 +++ sle11-2009-06-04/drivers/xen/console/console.c 2009-06-04 10:21:39.000000000 +0200
23348 @@ -432,9 +432,7 @@ static void __xencons_tx_flush(void)
23350 if (work_done && (xencons_tty != NULL)) {
23351 wake_up_interruptible(&xencons_tty->write_wait);
23352 - if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
23353 - (xencons_tty->ldisc.write_wakeup != NULL))
23354 - (xencons_tty->ldisc.write_wakeup)(xencons_tty);
23355 + tty_wakeup(xencons_tty);
23359 @@ -635,8 +633,8 @@ static void xencons_close(struct tty_str
23361 tty_wait_until_sent(tty, 0);
23362 tty_driver_flush_buffer(tty);
23363 - if (tty->ldisc.flush_buffer != NULL)
23364 - tty->ldisc.flush_buffer(tty);
23365 + if (tty->ldisc.ops->flush_buffer != NULL)
23366 + tty->ldisc.ops->flush_buffer(tty);
23368 spin_lock_irqsave(&xencons_lock, flags);
23369 xencons_tty = NULL;
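Two 2.6.27 tty changes show up in this file: the line-discipline methods moved behind tty->ldisc.ops, and the open-coded write-wakeup test is replaced by tty_wakeup(), which performs the TTY_DO_WRITE_WAKEUP check and the ldisc callback itself. Side by side (the old form is quoted from the removed lines):

	/* before: reach into the ldisc by hand */
	if ((tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
	    tty->ldisc.write_wakeup != NULL)
		tty->ldisc.write_wakeup(tty);

	/* after: one helper, and the method table lives in ldisc.ops */
	tty_wakeup(tty);
	if (tty->ldisc.ops->flush_buffer != NULL)
		tty->ldisc.ops->flush_buffer(tty);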
23370 --- sle11-2009-06-04.orig/drivers/xen/core/evtchn.c 2009-03-16 16:33:40.000000000 +0100
23371 +++ sle11-2009-06-04/drivers/xen/core/evtchn.c 2009-06-04 10:21:39.000000000 +0200
23372 @@ -746,8 +746,9 @@ static struct irq_chip dynirq_chip = {
23375 /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
23376 -static int pirq_eoi_does_unmask;
23377 +static bool pirq_eoi_does_unmask;
23378 static unsigned long *pirq_needs_eoi;
23379 +static DECLARE_BITMAP(probing_pirq, NR_PIRQS);
23381 static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
23383 @@ -794,25 +795,31 @@ static inline void pirq_query_unmask(int
23384 set_bit(irq - PIRQ_BASE, pirq_needs_eoi);
23388 - * On startup, if there is no action associated with the IRQ then we are
23389 - * probing. In this case we should not share with others as it will confuse us.
23391 -#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
23392 +static int set_type_pirq(unsigned int irq, unsigned int type)
23394 + if (type != IRQ_TYPE_PROBE)
23396 + set_bit(irq - PIRQ_BASE, probing_pirq);
23400 static unsigned int startup_pirq(unsigned int irq)
23402 struct evtchn_bind_pirq bind_pirq;
23403 int evtchn = evtchn_from_irq(irq);
23405 - if (VALID_EVTCHN(evtchn))
23406 + if (VALID_EVTCHN(evtchn)) {
23407 + clear_bit(irq - PIRQ_BASE, probing_pirq);
23411 bind_pirq.pirq = evtchn_get_xen_pirq(irq);
23412 /* NB. We are happy to share unless we are probing. */
23413 - bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
23414 + bind_pirq.flags = test_and_clear_bit(irq - PIRQ_BASE, probing_pirq)
23415 + || (irq_desc[irq].status & IRQ_AUTODETECT)
23416 + ? 0 : BIND_PIRQ__WILL_SHARE;
23417 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
23418 - if (!probing_irq(irq))
23419 + if (bind_pirq.flags)
23420 printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
23423 @@ -891,6 +898,7 @@ static struct irq_chip pirq_chip = {
23424 .mask_ack = ack_pirq,
23427 + .set_type = set_type_pirq,
23429 .set_affinity = set_affinity_irq,
23431 @@ -1003,6 +1011,7 @@ void xen_poll_irq(int irq)
23435 +#ifdef CONFIG_PM_SLEEP
23436 static void restore_cpu_virqs(unsigned int cpu)
23438 struct evtchn_bind_virq bind_virq;
23439 @@ -1095,6 +1104,7 @@ void irq_resume(void)
23445 #if defined(CONFIG_X86_IO_APIC)
23446 #define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
23447 @@ -1177,7 +1187,7 @@ void __init xen_init_IRQ(void)
23448 * BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8)));
23449 eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
23450 if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
23451 - pirq_eoi_does_unmask = 1;
23452 + pirq_eoi_does_unmask = true;
23454 /* No event channels are 'live' right now. */
23455 for (i = 0; i < NR_EVENT_CHANNELS; i++)
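The probing rework in evtchn.c replaces the old "no handler means probing" heuristic with explicit state: the new set_type() hook records IRQ_TYPE_PROBE requests in a bitmap, and startup_pirq() consumes the bit atomically to decide whether to offer BIND_PIRQ__WILL_SHARE. Reduced to a sketch (pirq_may_share() is an illustrative helper, not a function in the patch):

static DECLARE_BITMAP(probing_pirq, NR_PIRQS);

static int set_type_pirq(unsigned int irq, unsigned int type)
{
	if (type != IRQ_TYPE_PROBE)
		return -EINVAL;		/* assumption: other types rejected */
	set_bit(irq - PIRQ_BASE, probing_pirq);
	return 0;
}

static bool pirq_may_share(unsigned int irq)
{
	/* test_and_clear_bit() makes the probe flag strictly one-shot */
	return !test_and_clear_bit(irq - PIRQ_BASE, probing_pirq)
	       && !(irq_desc[irq].status & IRQ_AUTODETECT);
}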
23456 --- sle11-2009-06-04.orig/drivers/xen/core/gnttab.c 2008-12-01 11:25:57.000000000 +0100
23457 +++ sle11-2009-06-04/drivers/xen/core/gnttab.c 2009-06-04 10:21:39.000000000 +0200
23458 @@ -449,6 +449,7 @@ static int map_pte_fn(pte_t *pte, struct
23462 +#ifdef CONFIG_PM_SLEEP
23463 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
23464 unsigned long addr, void *data)
23466 @@ -456,6 +457,7 @@ static int unmap_pte_fn(pte_t *pte, stru
23467 set_pte_at(&init_mm, addr, pte, __pte(0));
23472 void *arch_gnttab_alloc_shared(unsigned long *frames)
23474 @@ -633,6 +635,75 @@ void __gnttab_dma_map_page(struct page *
23475 } while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
23478 +#ifdef __HAVE_ARCH_PTE_SPECIAL
23480 +static unsigned int GNTMAP_pte_special;
23482 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
23483 + unsigned int count)
23487 + if (unlikely(cmd != GNTTABOP_map_grant_ref))
23490 + for (i = 0; i < count; ++i, ++map) {
23491 + if (!(map->flags & GNTMAP_host_map)
23492 + || !(map->flags & GNTMAP_application_map))
23494 + if (GNTMAP_pte_special)
23495 + map->flags |= GNTMAP_pte_special;
23497 + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
23504 +EXPORT_SYMBOL(gnttab_pre_map_adjust);
23506 +#if CONFIG_XEN_COMPAT < 0x030400
23507 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
23512 + for (i = 0; i < count && rc == 0; ++i, ++map) {
23515 + if (!(map->flags & GNTMAP_host_map)
23516 + || !(map->flags & GNTMAP_application_map))
23520 + pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
23521 + | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
23523 + & __supported_pte_mask);
23525 +#error Architecture not yet supported.
23527 + if (!(map->flags & GNTMAP_readonly))
23528 + pte = pte_mkwrite(pte);
23530 + if (map->flags & GNTMAP_contains_pte) {
23533 + u.ptr = map->host_addr;
23534 + u.val = __pte_val(pte);
23535 + rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
23537 + rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
23542 +EXPORT_SYMBOL(gnttab_post_map_adjust);
23545 +#endif /* __HAVE_ARCH_PTE_SPECIAL */
23547 int gnttab_resume(void)
23549 if (max_nr_grant_frames() < nr_grant_frames)
23550 @@ -640,6 +711,7 @@ int gnttab_resume(void)
23551 return gnttab_map(0, nr_grant_frames - 1);
23554 +#ifdef CONFIG_PM_SLEEP
23555 int gnttab_suspend(void)
23558 @@ -649,6 +721,7 @@ int gnttab_suspend(void)
23564 #else /* !CONFIG_XEN */
23566 @@ -759,6 +832,18 @@ int __devinit gnttab_init(void)
23567 gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
23568 gnttab_free_head = NR_RESERVED_ENTRIES;
23570 +#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
23571 + if (!xen_feature(XENFEAT_auto_translated_physmap)
23572 + && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
23574 + GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
23575 + >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
23577 +#error Architecture not yet supported.
23585 --- sle11-2009-06-04.orig/drivers/xen/core/machine_kexec.c 2009-03-16 16:38:05.000000000 +0100
23586 +++ sle11-2009-06-04/drivers/xen/core/machine_kexec.c 2009-06-04 10:21:39.000000000 +0200
23587 @@ -91,7 +91,7 @@ void __init xen_machine_kexec_setup_reso
23588 xen_hypervisor_res.start = range.start;
23589 xen_hypervisor_res.end = range.start + range.size - 1;
23590 xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
23591 -#ifdef CONFIG_X86_64
23593 insert_resource(&iomem_resource, &xen_hypervisor_res);
23596 @@ -106,7 +106,7 @@ void __init xen_machine_kexec_setup_reso
23598 crashk_res.start = range.start;
23599 crashk_res.end = range.start + range.size - 1;
23600 -#ifdef CONFIG_X86_64
23602 insert_resource(&iomem_resource, &crashk_res);
23605 @@ -160,7 +160,7 @@ void __init xen_machine_kexec_setup_reso
23609 -#ifndef CONFIG_X86_64
23610 +#ifndef CONFIG_X86
23611 void __init xen_machine_kexec_register_resources(struct resource *res)
23613 request_resource(res, &xen_hypervisor_res);
23614 --- sle11-2009-06-04.orig/drivers/xen/core/machine_reboot.c 2009-06-04 11:08:07.000000000 +0200
23615 +++ sle11-2009-06-04/drivers/xen/core/machine_reboot.c 2009-06-04 10:21:39.000000000 +0200
23616 @@ -57,6 +57,7 @@ EXPORT_SYMBOL(machine_restart);
23617 EXPORT_SYMBOL(machine_halt);
23618 EXPORT_SYMBOL(machine_power_off);
23620 +#ifdef CONFIG_PM_SLEEP
23621 static void pre_suspend(void)
23623 HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
23624 @@ -111,6 +112,7 @@ static void post_suspend(int suspend_can
23625 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
23626 virt_to_mfn(pfn_to_mfn_frame_list_list);
23630 #else /* !(defined(__i386__) || defined(__x86_64__)) */
23632 @@ -129,6 +131,7 @@ static void post_suspend(int suspend_can
23636 +#ifdef CONFIG_PM_SLEEP
23639 void (*resume_notifier)(int);
23640 @@ -222,7 +225,8 @@ int __xen_suspend(int fast_suspend, void
23642 if (fast_suspend) {
23644 - err = stop_machine_run(take_machine_down, &suspend, 0);
23645 + err = stop_machine(take_machine_down, &suspend,
23646 + &cpumask_of_cpu(0));
23648 xenbus_suspend_cancel();
23650 @@ -245,3 +249,4 @@ int __xen_suspend(int fast_suspend, void
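stop_machine_run() became stop_machine() in 2.6.27, and the integer target CPU turned into a cpumask of CPUs allowed to run the callback, so pinning the suspend path to CPU 0 is now spelled with &cpumask_of_cpu(0):

	/* up to 2.6.26: last argument is a CPU number */
	err = stop_machine_run(take_machine_down, &suspend, 0);

	/* 2.6.27: last argument is a cpumask; NULL means any online CPU */
	err = stop_machine(take_machine_down, &suspend,
			   &cpumask_of_cpu(0));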
23655 --- sle11-2009-06-04.orig/drivers/xen/core/reboot.c 2009-02-16 16:17:21.000000000 +0100
23656 +++ sle11-2009-06-04/drivers/xen/core/reboot.c 2009-06-04 10:21:39.000000000 +0200
23657 @@ -29,17 +29,12 @@ MODULE_LICENSE("Dual BSD/GPL");
23658 /* Ignore multiple shutdown requests. */
23659 static int shutting_down = SHUTDOWN_INVALID;
23661 -/* Was last suspend request cancelled? */
23662 -static int suspend_cancelled;
23664 /* Can we leave APs online when we suspend? */
23665 static int fast_suspend;
23667 static void __shutdown_handler(struct work_struct *unused);
23668 static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
23670 -static int setup_suspend_evtchn(void);
23672 int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
23674 static int shutdown_process(void *__unused)
23675 @@ -69,6 +64,13 @@ static int shutdown_process(void *__unus
23679 +#ifdef CONFIG_PM_SLEEP
23681 +static int setup_suspend_evtchn(void);
23683 +/* Was last suspend request cancelled? */
23684 +static int suspend_cancelled;
23686 static void xen_resume_notifier(int _suspend_cancelled)
23688 int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
23689 @@ -118,6 +120,10 @@ static int xen_suspend(void *__unused)
23694 +# define xen_suspend NULL
23697 static void switch_shutdown_state(int new_state)
23699 int prev_state, old_state = SHUTDOWN_INVALID;
23700 @@ -194,8 +200,10 @@ static void shutdown_handler(struct xenb
23701 new_state = SHUTDOWN_POWEROFF;
23702 else if (strcmp(str, "reboot") == 0)
23704 +#ifdef CONFIG_PM_SLEEP
23705 else if (strcmp(str, "suspend") == 0)
23706 new_state = SHUTDOWN_SUSPEND;
23708 else if (strcmp(str, "halt") == 0)
23709 new_state = SHUTDOWN_HALT;
23711 @@ -247,6 +255,7 @@ static struct xenbus_watch sysrq_watch =
23712 .callback = sysrq_handler
23715 +#ifdef CONFIG_PM_SLEEP
23716 static irqreturn_t suspend_int(int irq, void* dev_id)
23718 switch_shutdown_state(SHUTDOWN_SUSPEND);
23719 @@ -274,6 +283,9 @@ static int setup_suspend_evtchn(void)
23724 +#define setup_suspend_evtchn() 0
23727 static int setup_shutdown_watcher(void)
23729 --- sle11-2009-06-04.orig/drivers/xen/core/smpboot.c 2009-03-16 16:38:05.000000000 +0100
23730 +++ sle11-2009-06-04/drivers/xen/core/smpboot.c 2009-06-04 10:21:39.000000000 +0200
23733 extern irqreturn_t smp_reschedule_interrupt(int, void *);
23734 extern irqreturn_t smp_call_function_interrupt(int, void *);
23735 +extern irqreturn_t smp_call_function_single_interrupt(int, void *);
23737 extern int local_setup_timer(unsigned int cpu);
23738 extern void local_teardown_timer(unsigned int cpu);
23739 @@ -50,8 +51,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
23741 static DEFINE_PER_CPU(int, resched_irq);
23742 static DEFINE_PER_CPU(int, callfunc_irq);
23743 +static DEFINE_PER_CPU(int, call1func_irq);
23744 static char resched_name[NR_CPUS][15];
23745 static char callfunc_name[NR_CPUS][15];
23746 +static char call1func_name[NR_CPUS][15];
23748 #ifdef CONFIG_X86_LOCAL_APIC
23749 #define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
23750 @@ -73,15 +76,13 @@ void __init prefill_possible_map(void)
23752 for (i = 0; i < NR_CPUS; i++) {
23753 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
23756 cpu_set(i, cpu_possible_map);
23757 + nr_cpu_ids = i + 1;
23762 -void __init smp_alloc_memory(void)
23767 set_cpu_sibling_map(unsigned int cpu)
23769 @@ -110,7 +111,8 @@ static int __cpuinit xen_smp_intr_init(u
23773 - per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
23774 + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) =
23775 + per_cpu(call1func_irq, cpu) = -1;
23777 sprintf(resched_name[cpu], "resched%u", cpu);
23778 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
23779 @@ -134,6 +136,17 @@ static int __cpuinit xen_smp_intr_init(u
23781 per_cpu(callfunc_irq, cpu) = rc;
23783 + sprintf(call1func_name[cpu], "call1func%u", cpu);
23784 + rc = bind_ipi_to_irqhandler(CALL_FUNC_SINGLE_VECTOR,
23786 + smp_call_function_single_interrupt,
23787 + IRQF_DISABLED|IRQF_NOBALANCING,
23788 + call1func_name[cpu],
23792 + per_cpu(call1func_irq, cpu) = rc;
23794 rc = xen_spinlock_init(cpu);
23797 @@ -148,6 +161,8 @@ static int __cpuinit xen_smp_intr_init(u
23798 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23799 if (per_cpu(callfunc_irq, cpu) >= 0)
23800 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23801 + if (per_cpu(call1func_irq, cpu) >= 0)
23802 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23803 xen_spinlock_cleanup(cpu);
23806 @@ -160,6 +175,7 @@ static void __cpuexit xen_smp_intr_exit(
23808 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23809 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23810 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23811 xen_spinlock_cleanup(cpu);
23814 @@ -167,11 +183,7 @@ static void __cpuexit xen_smp_intr_exit(
23815 void __cpuinit cpu_bringup(void)
23819 identify_secondary_cpu(&current_cpu_data);
23821 - identify_cpu(&current_cpu_data);
23823 touch_softlockup_watchdog();
23825 local_irq_enable();
23826 @@ -251,9 +263,6 @@ void __init smp_prepare_cpus(unsigned in
23827 struct task_struct *idle;
23829 struct vcpu_get_physid cpu_id;
23831 - struct desc_ptr *gdt_descr;
23836 @@ -266,7 +275,7 @@ void __init smp_prepare_cpus(unsigned in
23838 current_thread_info()->cpu = 0;
23840 - for (cpu = 0; cpu < NR_CPUS; cpu++) {
23841 + for_each_possible_cpu (cpu) {
23842 cpus_clear(per_cpu(cpu_sibling_map, cpu));
23843 cpus_clear(per_cpu(cpu_core_map, cpu));
23845 @@ -293,21 +302,10 @@ void __init smp_prepare_cpus(unsigned in
23847 panic("failed fork for CPU %d", cpu);
23850 - gdt_descr = &cpu_gdt_descr[cpu];
23851 - gdt_descr->address = get_zeroed_page(GFP_KERNEL);
23852 - if (unlikely(!gdt_descr->address)) {
23853 - printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
23857 - gdt_descr->size = GDT_SIZE;
23858 - memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
23859 - gdt_addr = (void *)gdt_descr->address;
23863 - gdt_addr = get_cpu_gdt_table(cpu);
23865 + gdt_addr = get_cpu_gdt_table(cpu);
23866 make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
23869 @@ -353,8 +351,8 @@ void __init smp_prepare_boot_cpu(void)
23872 init_gdt(smp_processor_id());
23873 - switch_to_new_gdt();
23875 + switch_to_new_gdt();
23876 prefill_possible_map();
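2.6.27's generic smp_call_function_single() needs its own IPI, so the Xen port binds a third per-CPU event channel (CALL_FUNC_SINGLE_VECTOR) beside the reschedule and call-function ones, and unbinds it on every failure and hot-unplug path. The binding step, reconstructed as a sketch (the cpu and dev_id arguments are filled in by analogy with the elided lines of the hunk):

static DEFINE_PER_CPU(int, call1func_irq);
static char call1func_name[NR_CPUS][15];

static int bind_call_single_ipi(unsigned int cpu)
{
	int rc;

	sprintf(call1func_name[cpu], "call1func%u", cpu);
	rc = bind_ipi_to_irqhandler(CALL_FUNC_SINGLE_VECTOR, cpu,
				    smp_call_function_single_interrupt,
				    IRQF_DISABLED | IRQF_NOBALANCING,
				    call1func_name[cpu], NULL);
	if (rc < 0)
		return rc;
	per_cpu(call1func_irq, cpu) = rc;
	return 0;
}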
23879 --- sle11-2009-06-04.orig/drivers/xen/core/spinlock.c 2009-03-16 16:33:40.000000000 +0100
23880 +++ sle11-2009-06-04/drivers/xen/core/spinlock.c 2009-06-04 10:36:24.000000000 +0200
23882 * portions of this file.
23885 +#if CONFIG_XEN_COMPAT >= 0x030200
23887 #include <linux/init.h>
23888 #include <linux/irq.h>
23889 #include <linux/kernel.h>
23890 @@ -73,9 +75,9 @@ int xen_spin_wait(raw_spinlock_t *lock,
23891 /* announce we're spinning */
23892 spinning.ticket = token;
23893 spinning.lock = lock;
23894 - spinning.prev = __get_cpu_var(spinning);
23895 + spinning.prev = x86_read_percpu(spinning);
23897 - __get_cpu_var(spinning) = &spinning;
23898 + x86_write_percpu(spinning, &spinning);
23900 /* clear pending */
23901 xen_clear_irq_pending(irq);
23902 @@ -102,7 +104,7 @@ int xen_spin_wait(raw_spinlock_t *lock,
23903 kstat_this_cpu.irqs[irq] += !rc;
23905 /* announce we're done */
23906 - __get_cpu_var(spinning) = spinning.prev;
23907 + x86_write_percpu(spinning, spinning.prev);
23908 rm_lock = &__get_cpu_var(spinning_rm_lock);
23909 raw_local_irq_save(flags);
23910 __raw_write_lock(rm_lock);
23911 @@ -159,3 +161,5 @@ void xen_spin_kick(raw_spinlock_t *lock,
23914 EXPORT_SYMBOL(xen_spin_kick);
23916 +#endif /* CONFIG_XEN_COMPAT >= 0x030200 */
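The spinlock changes swap __get_cpu_var() for this tree's x86_read_percpu()/x86_write_percpu() accessors, which compile to a single segment-relative move instead of first materializing the address of the current CPU's slot. The push/pop of the spinning record then looks like (sketch; the helper names are illustrative):

static DEFINE_PER_CPU(struct spinning *, spinning);

static void push_spinning(struct spinning *s)
{
	s->prev = x86_read_percpu(spinning);	/* one %fs/%gs-relative load */
	x86_write_percpu(spinning, s);
}

static void pop_spinning(struct spinning *s)
{
	x86_write_percpu(spinning, s->prev);
}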
23917 --- sle11-2009-06-04.orig/drivers/xen/fbfront/xenfb.c 2009-03-16 16:38:05.000000000 +0100
23918 +++ sle11-2009-06-04/drivers/xen/fbfront/xenfb.c 2009-06-04 10:21:39.000000000 +0200
23923 +#include <linux/console.h>
23924 #include <linux/kernel.h>
23925 #include <linux/errno.h>
23926 #include <linux/fb.h>
23927 @@ -545,6 +546,28 @@ static unsigned long vmalloc_to_mfn(void
23928 return pfn_to_mfn(vmalloc_to_pfn(address));
23931 +static __devinit void
23932 +xenfb_make_preferred_console(void)
23934 + struct console *c;
23936 + if (console_set_on_cmdline)
23939 + acquire_console_sem();
23940 + for (c = console_drivers; c; c = c->next) {
23941 + if (!strcmp(c->name, "tty") && c->index == 0)
23944 + release_console_sem();
23946 + unregister_console(c);
23947 + c->flags |= CON_CONSDEV;
23948 + c->flags &= ~CON_PRINTBUFFER; /* don't print again */
23949 + register_console(c);
23953 static int __devinit xenfb_probe(struct xenbus_device *dev,
23954 const struct xenbus_device_id *id)
23956 @@ -665,6 +688,7 @@ static int __devinit xenfb_probe(struct
23960 + xenfb_make_preferred_console();
23964 @@ -882,4 +906,5 @@ static void __exit xenfb_cleanup(void)
23965 module_init(xenfb_init);
23966 module_exit(xenfb_cleanup);
23968 +MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
23969 MODULE_LICENSE("GPL");
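xenfb_make_preferred_console() promotes the plain VT console to preferred console unless the user passed console= on the command line: the matching entry is unregistered, then re-registered with CON_CONSDEV set (so it backs /dev/console) and CON_PRINTBUFFER cleared (so the log buffer is not replayed a second time). The re-registration idiom in isolation (illustrative wrapper):

static void make_preferred(struct console *c)
{
	unregister_console(c);
	c->flags |= CON_CONSDEV;	/* becomes the /dev/console backend */
	c->flags &= ~CON_PRINTBUFFER;	/* don't print the log buffer again */
	register_console(c);
}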
23970 --- sle11-2009-06-04.orig/drivers/xen/fbfront/xenkbd.c 2009-03-04 11:25:55.000000000 +0100
23971 +++ sle11-2009-06-04/drivers/xen/fbfront/xenkbd.c 2009-06-04 10:21:39.000000000 +0200
23972 @@ -350,4 +350,5 @@ static void __exit xenkbd_cleanup(void)
23973 module_init(xenkbd_init);
23974 module_exit(xenkbd_cleanup);
23976 +MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
23977 MODULE_LICENSE("GPL");
23978 --- sle11-2009-06-04.orig/drivers/xen/gntdev/gntdev.c 2009-03-16 16:38:05.000000000 +0100
23979 +++ sle11-2009-06-04/drivers/xen/gntdev/gntdev.c 2009-06-04 10:21:39.000000000 +0200
23980 @@ -418,7 +418,7 @@ static int __init gntdev_init(void)
23983 device = device_create(class, NULL, MKDEV(gntdev_major, 0),
23985 + NULL, GNTDEV_NAME);
23986 if (IS_ERR(device)) {
23987 printk(KERN_ERR "Error creating gntdev device in xen_class\n");
23988 printk(KERN_ERR "gntdev created with major number = %d\n",
23989 --- sle11-2009-06-04.orig/drivers/xen/netfront/accel.c 2009-03-30 16:39:19.000000000 +0200
23990 +++ sle11-2009-06-04/drivers/xen/netfront/accel.c 2009-06-04 10:21:39.000000000 +0200
23995 +#include <linux/version.h>
23996 #include <linux/netdevice.h>
23997 #include <linux/skbuff.h>
23998 #include <linux/list.h>
23999 --- sle11-2009-06-04.orig/drivers/xen/netfront/netfront.c 2009-03-30 16:40:17.000000000 +0200
24000 +++ sle11-2009-06-04/drivers/xen/netfront/netfront.c 2009-06-04 10:21:39.000000000 +0200
24001 @@ -640,7 +640,7 @@ static int network_open(struct net_devic
24003 spin_unlock_bh(&np->rx_lock);
24005 - network_maybe_wake_tx(dev);
24006 + netif_start_queue(dev);
24010 --- sle11-2009-06-04.orig/drivers/xen/sfc_netback/accel.h 2009-03-30 16:00:09.000000000 +0200
24011 +++ sle11-2009-06-04/drivers/xen/sfc_netback/accel.h 2009-06-04 10:21:39.000000000 +0200
24013 #ifndef NETBACK_ACCEL_H
24014 #define NETBACK_ACCEL_H
24016 +#include <linux/version.h>
24017 #include <linux/slab.h>
24018 #include <linux/ip.h>
24019 #include <linux/tcp.h>
24020 --- sle11-2009-06-04.orig/drivers/xen/sfc_netfront/accel.h 2009-03-30 16:34:56.000000000 +0200
24021 +++ sle11-2009-06-04/drivers/xen/sfc_netfront/accel.h 2009-06-04 10:21:39.000000000 +0200
24023 #include <xen/evtchn.h>
24025 #include <linux/kernel.h>
24026 +#include <linux/version.h>
24027 #include <linux/list.h>
24029 enum netfront_accel_post_status {
24030 --- sle11-2009-06-04.orig/drivers/xen/xenbus/xenbus_client.c 2009-03-24 10:13:17.000000000 +0100
24031 +++ sle11-2009-06-04/drivers/xen/xenbus/xenbus_client.c 2009-06-04 10:21:39.000000000 +0200
24032 @@ -150,7 +150,7 @@ int xenbus_watch_pathfmt(struct xenbus_d
24035 va_start(ap, pathfmt);
24036 - path = kvasprintf(GFP_KERNEL, pathfmt, ap);
24037 + path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, ap);
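The GFP_KERNEL to GFP_NOIO | __GFP_HIGH switch matters on resume: xenbus re-registers watches while block devices that may themselves sit on xenbus are still frozen, so an allocation that tried to reclaim memory by issuing I/O could deadlock. GFP_NOIO forbids that, and __GFP_HIGH lets the request dip into emergency reserves. In shorthand:

	/* may sleep, may issue I/O to reclaim memory: */
	path = kvasprintf(GFP_KERNEL, pathfmt, ap);

	/* may sleep, must not issue I/O, may use reserves: */
	path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, ap);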
24041 --- sle11-2009-06-04.orig/drivers/xen/xenbus/xenbus_comms.c 2009-02-16 16:17:21.000000000 +0100
24042 +++ sle11-2009-06-04/drivers/xen/xenbus/xenbus_comms.c 2009-06-04 10:21:39.000000000 +0200
24043 @@ -228,14 +228,11 @@ int xb_init_comms(void)
24044 intf->rsp_cons = intf->rsp_prod;
24047 +#if defined(CONFIG_XEN) || defined(MODULE)
24049 unbind_from_irqhandler(xenbus_irq, &xb_waitq);
24051 -#if defined(CONFIG_XEN) || defined(MODULE)
24052 err = bind_caller_port_to_irqhandler(
24054 - err = bind_evtchn_to_irqhandler(
24056 xen_store_evtchn, wake_waiting,
24057 0, "xenbus", &xb_waitq);
24059 @@ -244,6 +241,20 @@ int xb_init_comms(void)
24064 + if (xenbus_irq) {
24065 + /* Already have an irq; assume we're resuming */
24066 + rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
24068 + err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
24069 + 0, "xenbus", &xb_waitq);
24071 + printk(KERN_ERR "XENBUS request irq failed %i\n", err);
24074 + xenbus_irq = err;
24080 --- sle11-2009-06-04.orig/drivers/xen/xenbus/xenbus_probe.c 2009-03-16 16:38:05.000000000 +0100
24081 +++ sle11-2009-06-04/drivers/xen/xenbus/xenbus_probe.c 2009-06-04 10:21:39.000000000 +0200
24083 __FUNCTION__, __LINE__, ##args)
24085 #include <linux/kernel.h>
24086 +#include <linux/version.h>
24087 #include <linux/err.h>
24088 #include <linux/string.h>
24089 #include <linux/ctype.h>
24090 --- sle11-2009-06-04.orig/fs/aio.c 2009-03-24 10:13:25.000000000 +0100
24091 +++ sle11-2009-06-04/fs/aio.c 2009-06-04 10:21:39.000000000 +0200
24092 @@ -1335,7 +1335,7 @@ static int make_aio_fd(struct kioctx *io
24096 - fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
24097 + fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
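anon_inode_getfd() grew a flags argument in 2.6.27 so callers can request O_CLOEXEC descriptors at creation time; untouched callers such as this one simply pass 0. For example:

	/* descriptor is close-on-exec from the moment it exists, with no
	 * race against a concurrent fork()/exec() */
	fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, O_CLOEXEC);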
24101 --- sle11-2009-06-04.orig/include/asm-generic/pgtable.h 2009-03-04 11:28:34.000000000 +0100
24102 +++ sle11-2009-06-04/include/asm-generic/pgtable.h 2009-06-04 10:21:39.000000000 +0200
24103 @@ -99,10 +99,6 @@ static inline void ptep_set_wrprotect(st
24107 -#ifndef arch_change_pte_range
24108 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
24111 #ifndef __HAVE_ARCH_PTE_SAME
24112 #define pte_same(A,B) (pte_val(A) == pte_val(B))
24114 --- sle11-2009-06-04.orig/include/asm-x86/dma-mapping.h 2009-03-16 16:38:05.000000000 +0100
24115 +++ sle11-2009-06-04/include/asm-x86/dma-mapping.h 2009-06-04 10:21:39.000000000 +0200
24116 @@ -74,7 +74,7 @@ static inline struct dma_mapping_ops *ge
24117 /* Make sure we keep the same behaviour */
24118 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
24120 -#ifdef CONFIG_X86_32
24121 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
24124 struct dma_mapping_ops *ops = get_dma_ops(dev);
24125 --- sle11-2009-06-04.orig/include/asm-x86/kexec.h 2008-12-01 11:11:08.000000000 +0100
24126 +++ sle11-2009-06-04/include/asm-x86/kexec.h 2009-06-04 10:21:39.000000000 +0200
24128 # define VA_PTE_0 5
24129 # define PA_PTE_1 6
24130 # define VA_PTE_1 7
24131 +# ifndef CONFIG_XEN
24132 # define PA_SWAP_PAGE 8
24133 # ifdef CONFIG_X86_PAE
24134 # define PA_PMD_0 9
24137 # define PAGES_NR 9
24139 +# else /* CONFIG_XEN */
24141 + * The hypervisor interface implicitly requires that all entries (except
24142 + * for possibly the final one) are arranged in matching PA_/VA_ pairs.
24144 +# define PA_PMD_0 8
24145 +# define VA_PMD_0 9
24146 +# define PA_PMD_1 10
24147 +# define VA_PMD_1 11
24148 +# define PA_SWAP_PAGE 12
24149 +# define PAGES_NR 13
24150 +# endif /* CONFIG_XEN */
24152 # define PA_CONTROL_PAGE 0
24153 # define VA_CONTROL_PAGE 1
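The comment above states the layout rule the Xen variant must keep: the hypervisor walks the kexec page indices as PA_x/VA_x pairs, so every new entry is added as a pair and only the last index may stand alone. Excerpt-style sketch of the resulting Xen layout (values follow the hunk):

#define PA_PMD_0	8	/* pair: physical address ... */
#define VA_PMD_0	9	/* ... and virtual address of the same page */
#define PA_PMD_1	10
#define VA_PMD_1	11
#define PA_SWAP_PAGE	12	/* only the final entry may be unpaired */
#define PAGES_NR	13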
24154 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:38:05.000000000 +0100
24155 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/desc.h 2009-06-04 10:21:39.000000000 +0200
24156 @@ -31,11 +31,17 @@ extern struct desc_ptr idt_descr;
24157 extern gate_desc idt_table[];
24161 + struct desc_struct gdt[GDT_ENTRIES];
24162 +} __attribute__((aligned(PAGE_SIZE)));
24163 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
24165 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
24167 + return per_cpu(gdt_page, cpu).gdt;
24170 #ifdef CONFIG_X86_64
24171 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
24172 -extern struct desc_ptr cpu_gdt_descr[];
24173 -/* the cpu gdt accessor */
24174 -#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
24176 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
24177 unsigned dpl, unsigned ist, unsigned seg)
24178 @@ -53,16 +59,6 @@ static inline void pack_gate(gate_desc *
24183 - struct desc_struct gdt[GDT_ENTRIES];
24184 -} __attribute__((aligned(PAGE_SIZE)));
24185 -DECLARE_PER_CPU(struct gdt_page, gdt_page);
24187 -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
24189 - return per_cpu(gdt_page, cpu).gdt;
24192 static inline void pack_gate(gate_desc *gate, unsigned char type,
24193 unsigned long base, unsigned dpl, unsigned flags,
24194 unsigned short seg)
24195 @@ -333,6 +329,28 @@ static inline void set_intr_gate(unsigne
24196 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
24199 +#define SYS_VECTOR_FREE 0
24200 +#define SYS_VECTOR_ALLOCED 1
24202 +extern int first_system_vector;
24203 +extern char system_vectors[];
24205 +static inline void alloc_system_vector(int vector)
24207 + if (system_vectors[vector] == SYS_VECTOR_FREE) {
24208 + system_vectors[vector] = SYS_VECTOR_ALLOCED;
24209 + if (first_system_vector > vector)
24210 + first_system_vector = vector;
24215 +static inline void alloc_intr_gate(unsigned int n, void *addr)
24217 + alloc_system_vector(n);
24218 + set_intr_gate(n, addr);
24222 * This routine sets up an interrupt gate at directory privilege level 3.
24224 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/fixmap.h 2009-03-16 16:38:05.000000000 +0100
24225 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/fixmap.h 2009-06-04 10:21:39.000000000 +0200
24227 # include "fixmap_64.h"
24230 +extern int fixmaps_set;
24232 +void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
24234 +static inline void __set_fixmap(enum fixed_addresses idx,
24235 + maddr_t phys, pgprot_t flags)
24237 + xen_set_fixmap(idx, phys, flags);
24240 +#define set_fixmap(idx, phys) \
24241 + __set_fixmap(idx, phys, PAGE_KERNEL)
24244 + * Some hardware wants to get fixmapped without caching.
24246 +#define set_fixmap_nocache(idx, phys) \
24247 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24249 #define clear_fixmap(idx) \
24250 __set_fixmap(idx, 0, __pgprot(0))
24252 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24253 +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24255 +extern void __this_fixmap_does_not_exist(void);
24258 + * 'index to address' translation. If anyone tries to use the idx
24259 + * directly without translation, we catch the bug with a NULL-deference
24260 + * kernel oops. Illegal ranges of incoming indices are caught too.
24262 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24265 + * this branch gets completely eliminated after inlining,
24266 + * except when someone tries to use fixaddr indices in an
24267 + * illegal way. (such as mixing up address types or using
24268 + * out-of-range indices).
24270 + * If it doesn't get removed, the linker will complain
24271 + * loudly with a reasonably clear error message..
24273 + if (idx >= __end_of_fixed_addresses)
24274 + __this_fixmap_does_not_exist();
24276 + return __fix_to_virt(idx);
24279 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
24281 + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24282 + return __virt_to_fix(vaddr);
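The fix_to_virt() comment moved into this header describes a link-time bounds check: the function is __always_inline, so for a constant index the comparison folds away entirely; an out-of-range constant leaves behind a call to __this_fixmap_does_not_exist(), which is declared but never defined, and the link fails. The same trick reduced to its essentials (all names here are illustrative, not from the patch):

extern void __this_table_does_not_exist(void);	/* deliberately undefined */

#define TABLE_TOP	0xfffff000UL
#define TABLE_ENTRIES	16

static __always_inline unsigned long entry_to_addr(unsigned int idx)
{
	if (idx >= TABLE_ENTRIES)
		__this_table_does_not_exist();	/* folds away for a valid
						 * constant idx; otherwise
						 * becomes a link error */
	return TABLE_TOP - (idx << PAGE_SHIFT);
}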
24285 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:38:05.000000000 +0100
24286 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-06-04 10:21:39.000000000 +0200
24287 @@ -58,10 +58,17 @@ enum fixed_addresses {
24288 #ifdef CONFIG_X86_LOCAL_APIC
24289 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24291 -#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
24292 +#ifndef CONFIG_XEN
24293 +#ifdef CONFIG_X86_IO_APIC
24294 FIX_IO_APIC_BASE_0,
24295 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
24299 +#define NR_FIX_ISAMAPS 256
24301 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24303 #ifdef CONFIG_X86_VISWS_APIC
24304 FIX_CO_CPU, /* Cobalt timer */
24305 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
24306 @@ -78,51 +85,38 @@ enum fixed_addresses {
24307 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
24308 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
24310 -#ifdef CONFIG_ACPI
24312 - FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24314 #ifdef CONFIG_PCI_MMCONFIG
24317 #ifdef CONFIG_PARAVIRT
24318 FIX_PARAVIRT_BOOTMAP,
24321 -#define NR_FIX_ISAMAPS 256
24323 - FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24324 __end_of_permanent_fixed_addresses,
24326 * 256 temporary boot-time mappings, used by early_ioremap(),
24327 * before ioremap() is functional.
24329 - * We round it up to the next 512 pages boundary so that we
24330 + * We round it up to the next 256 pages boundary so that we
24331 * can have a single pgd entry and a single pte table:
24333 #define NR_FIX_BTMAPS 64
24334 #define FIX_BTMAPS_NESTING 4
24335 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24336 - (__end_of_permanent_fixed_addresses & 511),
24337 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
24338 + (__end_of_permanent_fixed_addresses & 255),
24339 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24341 +#ifdef CONFIG_ACPI
24343 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24345 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24348 __end_of_fixed_addresses
24351 -extern void __set_fixmap(enum fixed_addresses idx,
24352 - maddr_t phys, pgprot_t flags);
24353 extern void reserve_top_address(unsigned long reserve);
24355 -#define set_fixmap(idx, phys) \
24356 - __set_fixmap(idx, phys, PAGE_KERNEL)
24358 - * Some hardware wants to get fixmapped without caching.
24360 -#define set_fixmap_nocache(idx, phys) \
24361 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24363 #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
24365 @@ -131,38 +125,5 @@ extern void reserve_top_address(unsigned
24366 #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
24367 #define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
24369 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24370 -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24372 -extern void __this_fixmap_does_not_exist(void);
24375 - * 'index to address' translation. If anyone tries to use the idx
24376 - * directly without tranlation, we catch the bug with a NULL-deference
24377 - * kernel oops. Illegal ranges of incoming indices are caught too.
24379 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24382 - * this branch gets completely eliminated after inlining,
24383 - * except when someone tries to use fixaddr indices in an
24384 - * illegal way. (such as mixing up address types or using
24385 - * out-of-range indices).
24387 - * If it doesn't get removed, the linker will complain
24388 - * loudly with a reasonably clear error message..
24390 - if (idx >= __end_of_fixed_addresses)
24391 - __this_fixmap_does_not_exist();
24393 - return __fix_to_virt(idx);
24396 -static inline unsigned long virt_to_fix(const unsigned long vaddr)
24398 - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24399 - return __virt_to_fix(vaddr);
24402 #endif /* !__ASSEMBLY__ */
24404 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:38:05.000000000 +0100
24405 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-06-04 10:21:39.000000000 +0200
24407 #define _ASM_FIXMAP_64_H
24409 #include <linux/kernel.h>
24410 +#include <asm/acpi.h>
24411 #include <asm/apicdef.h>
24412 #include <asm/page.h>
24413 #include <asm/vsyscall.h>
24414 @@ -40,7 +41,6 @@ enum fixed_addresses {
24417 FIX_EARLYCON_MEM_BASE,
24419 #ifdef CONFIG_X86_LOCAL_APIC
24420 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24422 @@ -53,14 +53,21 @@ enum fixed_addresses {
24423 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
24424 + MAX_EFI_IO_PAGES - 1,
24426 +#ifdef CONFIG_PARAVIRT
24427 + FIX_PARAVIRT_BOOTMAP,
24433 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24436 #define NR_FIX_ISAMAPS 256
24438 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24439 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24440 + FIX_OHCI1394_BASE,
24442 __end_of_permanent_fixed_addresses,
24444 * 256 temporary boot-time mappings, used by early_ioremap(),
24445 @@ -71,27 +78,12 @@ enum fixed_addresses {
24447 #define NR_FIX_BTMAPS 64
24448 #define FIX_BTMAPS_NESTING 4
24450 - __end_of_permanent_fixed_addresses + 512 -
24451 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24452 (__end_of_permanent_fixed_addresses & 511),
24453 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24454 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24455 - FIX_OHCI1394_BASE,
24457 __end_of_fixed_addresses
24460 -extern void __set_fixmap(enum fixed_addresses idx,
24461 - unsigned long phys, pgprot_t flags);
24463 -#define set_fixmap(idx, phys) \
24464 - __set_fixmap(idx, phys, PAGE_KERNEL)
24466 - * Some hardware wants to get fixmapped without caching.
24468 -#define set_fixmap_nocache(idx, phys) \
24469 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24471 #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
24472 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
24473 #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
24474 @@ -100,30 +92,4 @@ extern void __set_fixmap(enum fixed_addr
24475 #define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
24476 #define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
24478 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24480 -extern void __this_fixmap_does_not_exist(void);
24483 - * 'index to address' translation. If anyone tries to use the idx
24484 - * directly without translation, we catch the bug with a NULL-dereference
24485 - * kernel oops. Illegal ranges of incoming indices are caught too.
24487 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24490 - * this branch gets completely eliminated after inlining,
24491 - * except when someone tries to use fixaddr indices in an
24492 - * illegal way. (such as mixing up address types or using
24493 - * out-of-range indices).
24495 - * If it doesn't get removed, the linker will complain
24496 - * loudly with a reasonably clear error message.
24498 - if (idx >= __end_of_fixed_addresses)
24499 - __this_fixmap_does_not_exist();
24501 - return __fix_to_virt(idx);
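The FIX_BTMAP_END expression rewritten above rounds the boot-time mapping slots
up to the next multiple of 512 so the early_ioremap() ptes share one pte page.
A standalone sketch of just that arithmetic, with example inputs rather than
kernel constants:

/* FIX_BTMAP_END rounding: advance to the next multiple of 512 slots.  Note
 * that, like the kernel expression, an already-aligned input still advances
 * by a full 512.  Example values only. */
#include <stdio.h>

static unsigned long btmap_end(unsigned long end_of_permanent)
{
	return end_of_permanent + 512 - (end_of_permanent & 511);
}

int main(void)
{
	printf("%lu %lu\n", btmap_end(67), btmap_end(512));	/* 512 1024 */
	return 0;
}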
24505 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:38:05.000000000 +0100
24506 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/highmem.h 2009-06-04 10:21:39.000000000 +0200
24507 @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *p
24509 #define flush_cache_kmaps() do { } while (0)
24511 +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
24512 + unsigned long end_pfn);
24514 void clear_highpage(struct page *);
24515 static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
24517 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/hypercall.h 2009-02-16 16:18:36.000000000 +0100
24518 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/hypercall.h 2009-06-04 10:21:39.000000000 +0200
24519 @@ -323,9 +323,19 @@ static inline int __must_check
24520 HYPERVISOR_grant_table_op(
24521 unsigned int cmd, void *uop, unsigned int count)
24523 + bool fixup = false;
24526 if (arch_use_lazy_mmu_mode())
24527 xen_multicall_flush(false);
24528 - return _hypercall3(int, grant_table_op, cmd, uop, count);
24529 +#ifdef GNTTABOP_map_grant_ref
24530 + if (cmd == GNTTABOP_map_grant_ref)
24532 + fixup = gnttab_pre_map_adjust(cmd, uop, count);
24533 + rc = _hypercall3(int, grant_table_op, cmd, uop, count);
24534 + if (rc == 0 && fixup)
24535 + rc = gnttab_post_map_adjust(uop, count);
24539 static inline int __must_check
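The rework above brackets a grant-table map request with a pre-adjustment and,
on success, a post-adjustment. A reduced control-flow sketch; every function
below is a stand-in for illustration, not the real gnttab_*_adjust() or
hypercall code:

/* Control-flow sketch of the reworked HYPERVISOR_grant_table_op(): adjust
 * before a map request, issue the operation, post-adjust only when the map
 * succeeded.  All names are stand-ins. */
#include <stdbool.h>

#define DEMO_OP_MAP 1			/* stands in for GNTTABOP_map_grant_ref */

static bool demo_pre_adjust(void)	{ return true; }
static int  demo_post_adjust(void)	{ return 0; }
static int  demo_hypercall(void)	{ return 0; }

static int demo_grant_table_op(unsigned int cmd)
{
	bool fixup = false;
	int rc;

	if (cmd == DEMO_OP_MAP)
		fixup = demo_pre_adjust();
	rc = demo_hypercall();
	if (rc == 0 && fixup)
		rc = demo_post_adjust();
	return rc;
}

int main(void)
{
	return demo_grant_table_op(DEMO_OP_MAP);
}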
24540 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2009-03-16 16:33:40.000000000 +0100
24541 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/hypervisor.h 2009-06-04 10:21:39.000000000 +0200
24544 #include <linux/types.h>
24545 #include <linux/kernel.h>
24546 -#include <linux/version.h>
24547 #include <linux/errno.h>
24548 #include <xen/interface/xen.h>
24549 #include <xen/interface/platform.h>
24550 @@ -112,6 +111,8 @@ int xen_create_contiguous_region(
24551 unsigned long vstart, unsigned int order, unsigned int address_bits);
24552 void xen_destroy_contiguous_region(
24553 unsigned long vstart, unsigned int order);
24554 +int early_create_contiguous_region(unsigned long pfn, unsigned int order,
24555 + unsigned int address_bits);
24559 @@ -181,6 +182,29 @@ static inline void xen_multicall_flush(b
24561 #endif /* CONFIG_XEN && !MODULE */
24565 +struct gnttab_map_grant_ref;
24566 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
24567 + unsigned int count);
24568 +#if CONFIG_XEN_COMPAT < 0x030400
24569 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
24571 +static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
24572 + unsigned int count)
24579 +#else /* !CONFIG_XEN */
24581 +#define gnttab_pre_map_adjust(...) false
24582 +#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
24584 +#endif /* CONFIG_XEN */
24586 #if defined(CONFIG_X86_64)
24587 #define MULTI_UVMFLAGS_INDEX 2
24588 #define MULTI_UVMDOMID_INDEX 3
24589 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/io.h 2009-03-16 16:38:05.000000000 +0100
24590 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/io.h 2009-06-04 10:21:39.000000000 +0200
24593 #define ARCH_HAS_IOREMAP_WC
24595 +#include <linux/compiler.h>
24598 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24599 + * mappings, before the real ioremap() is functional.
24600 + * A boot-time mapping is currently limited to at most 16 pages.
24602 +#ifndef __ASSEMBLY__
24603 +extern void early_ioremap_init(void);
24604 +extern void early_ioremap_clear(void);
24605 +extern void early_ioremap_reset(void);
24606 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24607 +extern void early_iounmap(void *addr, unsigned long size);
24608 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24611 +#define build_mmio_read(name, size, type, reg, barrier) \
24612 +static inline type name(const volatile void __iomem *addr) \
24613 +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
24614 +:"m" (*(volatile type __force *)addr) barrier); return ret; }
24616 +#define build_mmio_write(name, size, type, reg, barrier) \
24617 +static inline void name(type val, volatile void __iomem *addr) \
24618 +{ asm volatile("mov" size " %0,%1": :reg (val), \
24619 +"m" (*(volatile type __force *)addr) barrier); }
24621 +build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
24622 +build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
24623 +build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
24625 +build_mmio_read(__readb, "b", unsigned char, "=q", )
24626 +build_mmio_read(__readw, "w", unsigned short, "=r", )
24627 +build_mmio_read(__readl, "l", unsigned int, "=r", )
24629 +build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
24630 +build_mmio_write(writew, "w", unsigned short, "r", :"memory")
24631 +build_mmio_write(writel, "l", unsigned int, "r", :"memory")
24633 +build_mmio_write(__writeb, "b", unsigned char, "q", )
24634 +build_mmio_write(__writew, "w", unsigned short, "r", )
24635 +build_mmio_write(__writel, "l", unsigned int, "r", )
24637 +#define readb_relaxed(a) __readb(a)
24638 +#define readw_relaxed(a) __readw(a)
24639 +#define readl_relaxed(a) __readl(a)
24640 +#define __raw_readb __readb
24641 +#define __raw_readw __readw
24642 +#define __raw_readl __readl
24644 +#define __raw_writeb __writeb
24645 +#define __raw_writew __writew
24646 +#define __raw_writel __writel
24648 +#define mmiowb() barrier()
24650 +#ifdef CONFIG_X86_64
24651 +build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
24652 +build_mmio_read(__readq, "q", unsigned long, "=r", )
24653 +build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
24654 +build_mmio_write(__writeq, "q", unsigned long, "r", )
24656 +#define readq_relaxed(a) __readq(a)
24657 +#define __raw_readq __readq
24658 +#define __raw_writeq writeq
24660 +/* Let people know we have them */
24661 +#define readq readq
24662 +#define writeq writeq
24665 +#define native_io_delay xen_io_delay
24667 #ifdef CONFIG_X86_32
24668 -# include "io_32.h"
24669 +# include "../../io_32.h"
24671 -# include "io_64.h"
24672 +# include "../../io_64.h"
24675 +#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
24677 +/* We will be supplying our own /dev/mem implementation */
24678 +#define ARCH_HAS_DEV_MEM
24680 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
24681 +#undef page_to_phys
24682 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
24683 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
24685 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
24686 + (unsigned long) (bv)->bv_offset)
24688 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
24689 + (bvec_to_phys(vec1) + (vec1)->bv_len == bvec_to_phys(vec2) \
24690 + && bvec_to_pseudophys(vec1) + (vec1)->bv_len \
24691 + == bvec_to_pseudophys(vec2))
24693 +#undef virt_to_bus
24694 +#undef bus_to_virt
24695 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
24696 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
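virt_to_bus()/page_to_phys() are redirected through phys_to_machine() because a
Xen guest's "physical" addresses are pseudo-physical and must be translated
through a p2m table before a bus master can use them. A toy translation with an
assumed four-entry table, purely for illustration:

/* Toy pseudo-physical -> machine translation; the four-entry table and its
 * values are assumptions, not the kernel's p2m implementation. */
#include <stdio.h>

#define DEMO_PAGE_SHIFT	12
static const unsigned long demo_p2m[4] = { 7, 3, 11, 2 };	/* pfn -> mfn */

static unsigned long demo_phys_to_machine(unsigned long paddr)
{
	unsigned long pfn = paddr >> DEMO_PAGE_SHIFT;
	unsigned long off = paddr & ((1UL << DEMO_PAGE_SHIFT) - 1);

	return (demo_p2m[pfn] << DEMO_PAGE_SHIFT) | off;
}

int main(void)
{
	printf("%#lx\n", demo_phys_to_machine(0x1234));	/* pfn 1 -> mfn 3: 0x3234 */
	return 0;
}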
24698 +#include <asm/fixmap.h>
24700 +#undef isa_virt_to_bus
24701 +#undef isa_page_to_bus
24702 +#undef isa_bus_to_virt
24703 +#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
24704 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->_x
24705 +#define isa_bus_to_virt(_x) ((void *)__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
24707 +#undef __ISA_IO_base
24708 +#define __ISA_IO_base ((char __iomem *)fix_to_virt(FIX_ISAMAP_BEGIN))
24712 extern void *xlate_dev_mem_ptr(unsigned long phys);
24713 extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
24715 -extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
24716 -extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
24718 extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
24719 unsigned long prot_val);
24720 extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
24723 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24724 + * mappings, before the real ioremap() is functional.
24725 + * A boot-time mapping is currently limited to at most 16 pages.
24727 +extern void early_ioremap_init(void);
24728 +extern void early_ioremap_clear(void);
24729 +extern void early_ioremap_reset(void);
24730 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24731 +extern void early_iounmap(void *addr, unsigned long size);
24732 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24735 #endif /* _ASM_X86_IO_H */
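For reference, this is roughly what one instantiation of the build_mmio_read()
macro added earlier in this header expands to, written out by hand; it is
x86-only, and the function name is illustrative:

/* Approximate hand expansion of
 *   build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
 * as plain userspace C on x86. */
static inline unsigned int demo_readl(const volatile void *addr)
{
	unsigned int ret;

	asm volatile("movl %1,%0"
		     : "=r" (ret)
		     : "m" (*(const volatile unsigned int *)addr)
		     : "memory");	/* the trailing barrier argument */
	return ret;
}

int main(void)
{
	unsigned int word = 0x12345678;

	return demo_readl(&word) != 0x12345678;
}

The "__"-prefixed variants pass an empty barrier argument instead, which is why
they can serve as the relaxed/raw accessors.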
24736 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
24737 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/irq_vectors.h 2009-06-04 10:21:39.000000000 +0200
24739 +#ifndef _ASM_IRQ_VECTORS_H
24740 +#define _ASM_IRQ_VECTORS_H
24742 +#ifdef CONFIG_X86_32
24743 +# define SYSCALL_VECTOR 0x80
24745 +# define IA32_SYSCALL_VECTOR 0x80
24748 +#define RESCHEDULE_VECTOR 0
24749 +#define CALL_FUNCTION_VECTOR 1
24750 +#define CALL_FUNC_SINGLE_VECTOR 2
24751 +#define SPIN_UNLOCK_VECTOR 3
24755 + * The maximum number of vectors supported by i386 processors
24756 + * is limited to 256. For processors other than i386, NR_VECTORS
24757 + * should be changed accordingly.
24759 +#define NR_VECTORS 256
24761 +#define FIRST_VM86_IRQ 3
24762 +#define LAST_VM86_IRQ 15
24763 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
24766 + * The flat IRQ space is divided into two regions:
24767 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
24768 + * if we have physical device-access privilege. This region is at the
24769 + * start of the IRQ space so that existing device drivers do not need
24770 + * to be modified to translate physical IRQ numbers into our IRQ space.
24771 + * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
24772 + * are bound using the provided bind/unbind functions.
24775 +#define PIRQ_BASE 0
24776 +#if defined(NR_CPUS) && defined(MAX_IO_APICS)
24777 +# if NR_CPUS < MAX_IO_APICS
24778 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
24780 +# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
24784 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
24785 +#define NR_DYNIRQS 256
24787 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
24788 +#define NR_IRQ_VECTORS NR_IRQS
24790 +#endif /* _ASM_IRQ_VECTORS_H */
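The split of the flat IRQ space can be made concrete with assumed values for
NR_CPUS and MAX_IO_APICS (illustrative, not from any shipped config):

/* How the flat IRQ space composes: physical IRQs first, then dynamic ones. */
#include <stdio.h>

#define NR_VECTORS	256
#define NR_CPUS		8	/* assumed */
#define MAX_IO_APICS	64	/* assumed */

#if NR_CPUS < MAX_IO_APICS
# define NR_PIRQS	(NR_VECTORS + 32 * NR_CPUS)
#else
# define NR_PIRQS	(NR_VECTORS + 32 * MAX_IO_APICS)
#endif

#define PIRQ_BASE	0
#define DYNIRQ_BASE	(PIRQ_BASE + NR_PIRQS)
#define NR_DYNIRQS	256

int main(void)
{
	printf("pirqs %d-%d, dynirqs %d-%d\n",
	       PIRQ_BASE, DYNIRQ_BASE - 1,
	       DYNIRQ_BASE, DYNIRQ_BASE + NR_DYNIRQS - 1);	/* 0-511, 512-767 */
	return 0;
}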
24791 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:38:05.000000000 +0100
24792 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/irqflags.h 2009-06-04 10:21:39.000000000 +0200
24793 @@ -118,7 +118,7 @@ static inline void halt(void)
24795 #ifndef CONFIG_X86_64
24796 #define INTERRUPT_RETURN iret
24797 -#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
24798 +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
24799 sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
24801 jnz 14f /* process more events if necessary... */ ; \
24802 @@ -175,18 +175,6 @@ static inline void trace_hardirqs_fixup_
24805 #ifdef CONFIG_X86_64
24807 - * Currently paravirt can't handle swapgs nicely when we
24808 - * don't have a stack we can rely on (such as a user space
24809 - * stack). So we either find a way around these or just fault
24810 - * and emulate if a guest tries to call swapgs directly.
24812 - * Either way, this is a good way to document that we don't
24813 - * have a reliable stack. x86_64 only.
24815 -#define SWAPGS_UNSAFE_STACK swapgs
24816 -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
24817 -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
24818 #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
24819 #define ARCH_LOCKDEP_SYS_EXIT_IRQ \
24821 @@ -198,24 +186,6 @@ static inline void trace_hardirqs_fixup_
24825 -#define ARCH_TRACE_IRQS_ON \
24829 - call trace_hardirqs_on; \
24834 -#define ARCH_TRACE_IRQS_OFF \
24838 - call trace_hardirqs_off; \
24843 #define ARCH_LOCKDEP_SYS_EXIT \
24846 @@ -229,8 +199,8 @@ static inline void trace_hardirqs_fixup_
24849 #ifdef CONFIG_TRACE_IRQFLAGS
24850 -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
24851 -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
24852 +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
24853 +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
24855 # define TRACE_IRQS_ON
24856 # define TRACE_IRQS_OFF
24857 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/mmu_context.h 2009-02-16 16:18:36.000000000 +0100
24858 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/mmu_context.h 2009-06-04 10:21:39.000000000 +0200
24860 +#ifndef __ASM_X86_MMU_CONTEXT_H
24861 +#define __ASM_X86_MMU_CONTEXT_H
24863 +#include <asm/desc.h>
24864 +#include <asm/atomic.h>
24865 +#include <asm/pgalloc.h>
24866 +#include <asm/tlbflush.h>
24868 +void arch_exit_mmap(struct mm_struct *mm);
24869 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24871 +void mm_pin(struct mm_struct *mm);
24872 +void mm_unpin(struct mm_struct *mm);
24873 +void mm_pin_all(void);
24875 +static inline void xen_activate_mm(struct mm_struct *prev,
24876 + struct mm_struct *next)
24878 + if (!PagePinned(virt_to_page(next->pgd)))
24883 + * Used for LDT copy/destruction.
24885 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24886 +void destroy_context(struct mm_struct *mm);
24888 #ifdef CONFIG_X86_32
24889 # include "mmu_context_32.h"
24891 # include "mmu_context_64.h"
24894 +#define activate_mm(prev, next) \
24896 + xen_activate_mm(prev, next); \
24897 + switch_mm((prev), (next), NULL); \
24901 +#endif /* __ASM_X86_MMU_CONTEXT_H */
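The ordering the new common activate_mm() enforces — pin the page directory
with the hypervisor before switching to it — sketched in userspace with
stand-in types; the real test is PagePinned(virt_to_page(next->pgd)):

/* Sketch of the pin-before-switch ordering; types and helpers are stand-ins,
 * not kernel code. */
#include <stdbool.h>
#include <assert.h>

struct demo_mm { bool pinned; };

static void demo_mm_pin(struct demo_mm *mm)	{ mm->pinned = true; }

static void demo_switch_mm(struct demo_mm *next)
{
	assert(next->pinned);	/* the hypervisor would reject an unpinned pgd */
}

static void demo_activate_mm(struct demo_mm *next)
{
	if (!next->pinned)
		demo_mm_pin(next);
	demo_switch_mm(next);
}

int main(void)
{
	struct demo_mm mm = { false };

	demo_activate_mm(&mm);
	return 0;
}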
24902 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:38:05.000000000 +0100
24903 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-06-04 10:21:39.000000000 +0200
24905 #ifndef __I386_SCHED_H
24906 #define __I386_SCHED_H
24908 -#include <asm/desc.h>
24909 -#include <asm/atomic.h>
24910 -#include <asm/pgalloc.h>
24911 -#include <asm/tlbflush.h>
24913 -void arch_exit_mmap(struct mm_struct *mm);
24914 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24916 -void mm_pin(struct mm_struct *mm);
24917 -void mm_unpin(struct mm_struct *mm);
24918 -void mm_pin_all(void);
24920 -static inline void xen_activate_mm(struct mm_struct *prev,
24921 - struct mm_struct *next)
24923 - if (!PagePinned(virt_to_page(next->pgd)))
24928 - * Used for LDT copy/destruction.
24930 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24931 -void destroy_context(struct mm_struct *mm);
24934 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24936 #if 0 /* XEN: no lazy tlb */
24937 @@ -107,10 +81,4 @@ static inline void switch_mm(struct mm_s
24938 #define deactivate_mm(tsk, mm) \
24939 asm("movl %0,%%gs": :"r" (0));
24941 -#define activate_mm(prev, next) \
24943 - xen_activate_mm(prev, next); \
24944 - switch_mm((prev), (next), NULL); \
24948 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:38:05.000000000 +0100
24949 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-06-04 10:21:39.000000000 +0200
24951 #ifndef __X86_64_MMU_CONTEXT_H
24952 #define __X86_64_MMU_CONTEXT_H
24954 -#include <asm/desc.h>
24955 -#include <asm/atomic.h>
24956 -#include <asm/pgalloc.h>
24957 -#include <asm/page.h>
24958 -#include <asm/pda.h>
24959 -#include <asm/pgtable.h>
24960 -#include <asm/tlbflush.h>
24962 -void arch_exit_mmap(struct mm_struct *mm);
24963 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24966 - * possibly do the LDT unload here?
24968 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24969 -void destroy_context(struct mm_struct *mm);
24971 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24973 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
24974 @@ -58,10 +41,6 @@ static inline void __prepare_arch_switch
24978 -extern void mm_pin(struct mm_struct *mm);
24979 -extern void mm_unpin(struct mm_struct *mm);
24980 -void mm_pin_all(void);
24982 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
24983 struct task_struct *tsk)
24985 @@ -124,11 +103,4 @@ do { \
24986 asm volatile("movl %0,%%fs"::"r"(0)); \
24989 -static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
24991 - if (!PagePinned(virt_to_page(next->pgd)))
24993 - switch_mm(prev, next, NULL);
24997 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:38:05.000000000 +0100
24998 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/page.h 2009-06-04 10:21:39.000000000 +0200
25000 * below. The preprocessor will warn if the two definitions aren't identical.
25002 #define _PAGE_BIT_PRESENT 0
25003 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25004 -#define _PAGE_BIT_IO 9
25005 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25006 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25007 +#define _PAGE_BIT_IO 11
25008 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25010 #define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
25011 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
25013 (ie, 32-bit PAE). */
25014 #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
25016 -/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
25017 -#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
25018 +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
25019 +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
25021 +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
25022 +#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
25024 #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
25025 #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
25027 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
25028 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
25030 -/* to align the pointer to the (next) page boundary */
25031 -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
25032 +#define HUGE_MAX_HSTATE 2
25034 #ifndef __ASSEMBLY__
25035 #include <linux/types.h>
25038 #ifndef __ASSEMBLY__
25040 +typedef struct { pgdval_t pgd; } pgd_t;
25041 +typedef struct { pgprotval_t pgprot; } pgprot_t;
25043 extern int page_is_ram(unsigned long pagenr);
25044 extern int devmem_is_allowed(unsigned long pagenr);
25045 +extern void map_devmem(unsigned long pfn, unsigned long size,
25046 + pgprot_t vma_prot);
25047 +extern void unmap_devmem(unsigned long pfn, unsigned long size,
25048 + pgprot_t vma_prot);
25050 +extern unsigned long max_low_pfn_mapped;
25051 extern unsigned long max_pfn_mapped;
25054 @@ -84,15 +94,11 @@ static inline void copy_user_page(void *
25055 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
25056 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
25058 -typedef struct { pgprotval_t pgprot; } pgprot_t;
25060 #define pgprot_val(x) ((x).pgprot)
25061 #define __pgprot(x) ((pgprot_t) { (x) } )
25063 #include <asm/maddr.h>
25065 -typedef struct { pgdval_t pgd; } pgd_t;
25067 #define __pgd_ma(x) ((pgd_t) { (x) } )
25068 static inline pgd_t xen_make_pgd(pgdval_t val)
25070 @@ -196,6 +202,11 @@ static inline pteval_t xen_pte_val(pte_t
25074 +static inline pteval_t xen_pte_flags(pte_t pte)
25076 + return __pte_val(pte) & PTE_FLAGS_MASK;
25079 #define pgd_val(x) xen_pgd_val(x)
25080 #define __pgd(x) xen_make_pgd(x)
25082 @@ -210,6 +221,7 @@ static inline pteval_t xen_pte_val(pte_t
25085 #define pte_val(x) xen_pte_val(x)
25086 +#define pte_flags(x) xen_pte_flags(x)
25087 #define __pte(x) xen_make_pte(x)
25089 #define __pa(x) __phys_addr((unsigned long)(x))
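The PTE_MASK -> PTE_PFN_MASK/PTE_FLAGS_MASK split above can be exercised
standalone; the 52-bit physical mask below is an assumption chosen to resemble
x86-64, not the kernel's exact definition:

/* Splitting a pte value into its pfn and its flags (including high bits
 * such as NX, which land in the flags half).  Demo constants only. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t pteval_t;

#define DEMO_PAGE_SHIFT	12
#define DEMO_PHYS_MASK	(((pteval_t)1 << 52) - 1)
#define PTE_PFN_MASK	(DEMO_PHYS_MASK & ~(((pteval_t)1 << DEMO_PAGE_SHIFT) - 1))
#define PTE_FLAGS_MASK	(~PTE_PFN_MASK)

int main(void)
{
	pteval_t pte = ((pteval_t)0x1234 << DEMO_PAGE_SHIFT) | 0x67;

	printf("pfn %llx, flags %llx\n",
	       (unsigned long long)((pte & PTE_PFN_MASK) >> DEMO_PAGE_SHIFT),
	       (unsigned long long)(pte & PTE_FLAGS_MASK));	/* 1234, 67 */
	return 0;
}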
25090 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:38:05.000000000 +0100
25091 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/page_64.h 2009-06-04 10:21:39.000000000 +0200
25093 #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
25094 #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
25097 + * Set __PAGE_OFFSET to the most negative possible address +
25098 + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
25099 + * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
25100 + * what Xen requires.
25102 #define __PAGE_OFFSET _AC(0xffff880000000000, UL)
25104 #define __PHYSICAL_START CONFIG_PHYSICAL_START
25106 void clear_page(void *page);
25107 void copy_page(void *to, void *from);
25109 -extern unsigned long end_pfn;
25110 +/* duplicates the one in bootmem.h */
25111 +extern unsigned long max_pfn;
25113 static inline unsigned long __phys_addr(unsigned long x)
25115 @@ -91,6 +98,11 @@ typedef union { pteval_t pte; unsigned i
25116 extern unsigned long init_memory_mapping(unsigned long start,
25117 unsigned long end);
25119 +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
25121 +extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
25122 +extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
25124 #endif /* !__ASSEMBLY__ */
25126 #ifdef CONFIG_FLATMEM
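A quick check of the "pgd slot 272" claim in the new __PAGE_OFFSET comment,
using the standard 4-level x86-64 shift and table size:

/* pgd_index() of the 64-bit direct-map base really is slot 272. */
#include <stdio.h>

#define PGDIR_SHIFT	39
#define PTRS_PER_PGD	512
#define pgd_index(a)	(((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

int main(void)
{
	printf("%lu\n", pgd_index(0xffff880000000000UL));	/* 272 */
	return 0;
}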
25127 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:38:05.000000000 +0100
25128 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pci.h 2009-06-04 10:21:39.000000000 +0200
25129 @@ -21,6 +21,8 @@ struct pci_sysdata {
25133 +extern int pci_routeirq;
25135 /* scan a bus after allocating a pci_sysdata for it */
25136 extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
25138 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pci_32.h 2009-02-16 16:18:36.000000000 +0100
25139 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pci_32.h 2009-06-04 10:21:39.000000000 +0200
25140 @@ -38,12 +38,14 @@ struct pci_dev;
25141 #define PCI_DMA_BUS_IS_PHYS (1)
25143 /* pci_unmap_{page,single} is a nop so... */
25144 -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
25145 -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
25146 -#define pci_unmap_addr(PTR, ADDR_NAME) (0)
25147 -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
25148 -#define pci_unmap_len(PTR, LEN_NAME) (0)
25149 -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
25150 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
25151 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
25152 +#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
25153 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
25154 + do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
25155 +#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
25156 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
25157 + do { break; } while (pci_unmap_len(PTR, LEN_NAME))
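The new DECLARE_PCI_UNMAP_* definitions rely on zero-length array members (a
GNU C extension): they occupy no storage yet keep the field names visible and
type-checked, so the accessor macros can stay compile-time no-ops. A small
demonstration with a made-up struct:

/* A zero-length array member costs nothing, but a typo in its name is still
 * a compile error -- which is all pci_unmap_addr() needs from it. */
#include <stdio.h>

struct demo_mapping {
	void *cpu_addr;
	unsigned long dma_addr[0];	/* zero bytes of storage */
};

int main(void)
{
	printf("%zu %zu\n", sizeof(struct demo_mapping),
	       sizeof(((struct demo_mapping *)0)->dma_addr));	/* 8 0 on LP64 */
	return 0;
}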
25161 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2009-03-16 16:38:05.000000000 +0100
25162 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgalloc.h 2009-06-04 10:21:39.000000000 +0200
25165 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
25167 +static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
25168 +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
25170 static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
25171 static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
25172 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
25173 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:38:05.000000000 +0100
25174 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable.h 2009-06-04 10:21:39.000000000 +0200
25175 @@ -13,11 +13,12 @@
25176 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
25177 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
25178 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
25179 -#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
25180 +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
25181 +#define _PAGE_BIT_UNUSED2 10
25182 +#define _PAGE_BIT_IO 11 /* Mapped page is I/O or foreign and
25183 * has no associated page struct. */
25184 -#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
25185 -#define _PAGE_BIT_UNUSED3 11
25186 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
25187 +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
25188 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
25190 /* If _PAGE_BIT_PRESENT is clear, we use these: */
25191 @@ -28,34 +29,31 @@
25192 /* if the user mapped it with PROT_NONE; pte_present gives true */
25193 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
25196 - * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
25197 - * sign-extended value on 32-bit with all 1's in the upper word,
25198 - * which preserves the upper pte values on 64-bit ptes:
25200 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25201 -#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
25202 -#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
25203 -#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
25204 -#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
25205 -#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
25206 -#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
25207 -#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
25208 -#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
25209 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25210 -#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
25211 -#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
25212 -#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
25213 -#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
25214 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25215 +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
25216 +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
25217 +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
25218 +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
25219 +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
25220 +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
25221 +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
25222 +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
25223 +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
25224 +#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2)
25225 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25226 +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
25227 +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
25228 +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
25229 +#define __HAVE_ARCH_PTE_SPECIAL
25231 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
25232 -#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
25233 +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
25235 -#define _PAGE_NX 0
25236 +#define _PAGE_NX (_AT(pteval_t, 0))
25239 -#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
25240 -#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
25241 +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
25242 +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
25244 #ifndef __ASSEMBLY__
25245 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
25246 @@ -71,8 +69,8 @@ extern unsigned int __kernel_page_user;
25247 _PAGE_DIRTY | __kernel_page_user)
25249 /* Set of bits not changed in pte_modify */
25250 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25251 - _PAGE_ACCESSED | _PAGE_DIRTY)
25252 +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25253 + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
25256 * PAT settings are part of the hypervisor interface, which sets the
25257 @@ -102,19 +100,9 @@ extern unsigned int __kernel_page_user;
25258 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
25261 -#ifdef CONFIG_X86_32
25262 -#define _PAGE_KERNEL_EXEC \
25263 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
25264 -#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
25266 -#ifndef __ASSEMBLY__
25267 -extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
25268 -#endif /* __ASSEMBLY__ */
25270 #define __PAGE_KERNEL_EXEC \
25271 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
25272 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
25275 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
25276 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
25277 @@ -125,25 +113,22 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
25278 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
25279 #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
25280 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
25281 +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
25282 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
25285 - * We don't support GLOBAL page in xenolinux64
25287 -#define MAKE_GLOBAL(x) __pgprot((x))
25289 -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
25290 -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
25291 -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
25292 -#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
25293 -#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
25294 -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
25295 -#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
25296 -#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
25297 -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
25298 -#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
25299 -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
25300 -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25301 +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
25302 +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
25303 +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
25304 +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
25305 +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
25306 +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
25307 +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
25308 +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
25309 +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
25310 +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
25311 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
25312 +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
25313 +#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25316 #define __P000 PAGE_NONE
25317 @@ -182,27 +167,27 @@ extern struct list_head pgd_list;
25319 static inline int pte_dirty(pte_t pte)
25321 - return __pte_val(pte) & _PAGE_DIRTY;
25322 + return pte_flags(pte) & _PAGE_DIRTY;
25325 static inline int pte_young(pte_t pte)
25327 - return __pte_val(pte) & _PAGE_ACCESSED;
25328 + return pte_flags(pte) & _PAGE_ACCESSED;
25331 static inline int pte_write(pte_t pte)
25333 - return __pte_val(pte) & _PAGE_RW;
25334 + return pte_flags(pte) & _PAGE_RW;
25337 static inline int pte_file(pte_t pte)
25339 - return __pte_val(pte) & _PAGE_FILE;
25340 + return pte_flags(pte) & _PAGE_FILE;
25343 static inline int pte_huge(pte_t pte)
25345 - return __pte_val(pte) & _PAGE_PSE;
25346 + return pte_flags(pte) & _PAGE_PSE;
25349 static inline int pte_global(pte_t pte)
25350 @@ -212,12 +197,12 @@ static inline int pte_global(pte_t pte)
25352 static inline int pte_exec(pte_t pte)
25354 - return !(__pte_val(pte) & _PAGE_NX);
25355 + return !(pte_flags(pte) & _PAGE_NX);
25358 static inline int pte_special(pte_t pte)
25361 + return pte_flags(pte) & _PAGE_SPECIAL;
25364 static inline int pmd_large(pmd_t pte)
25365 @@ -228,22 +213,22 @@ static inline int pmd_large(pmd_t pte)
25367 static inline pte_t pte_mkclean(pte_t pte)
25369 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
25370 + return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
25373 static inline pte_t pte_mkold(pte_t pte)
25375 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
25376 + return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
25379 static inline pte_t pte_wrprotect(pte_t pte)
25381 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
25382 + return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
25385 static inline pte_t pte_mkexec(pte_t pte)
25387 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
25388 + return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
25391 static inline pte_t pte_mkdirty(pte_t pte)
25392 @@ -268,7 +253,7 @@ static inline pte_t pte_mkhuge(pte_t pte
25394 static inline pte_t pte_clrhuge(pte_t pte)
25396 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
25397 + return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
25400 static inline pte_t pte_mkglobal(pte_t pte)
25401 @@ -283,35 +268,46 @@ static inline pte_t pte_clrglobal(pte_t
25403 static inline pte_t pte_mkspecial(pte_t pte)
25406 + return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
25409 extern pteval_t __supported_pte_mask;
25411 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
25413 - return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
25414 - pgprot_val(pgprot)) & __supported_pte_mask);
25415 + pgprotval_t prot = pgprot_val(pgprot);
25417 + if (prot & _PAGE_PRESENT)
25418 + prot &= __supported_pte_mask;
25419 + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25422 static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
25424 - return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
25425 - pgprot_val(pgprot)) & __supported_pte_mask);
25426 + pgprotval_t prot = pgprot_val(pgprot);
25428 + if (prot & _PAGE_PRESENT)
25429 + prot &= __supported_pte_mask;
25430 + return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25433 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
25435 - return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
25436 - pgprot_val(pgprot)) & __supported_pte_mask);
25437 + pgprotval_t prot = pgprot_val(pgprot);
25439 + if (prot & _PAGE_PRESENT)
25440 + prot &= __supported_pte_mask;
25441 + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25444 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
25446 - pteval_t val = pte_val(pte);
25447 + pgprotval_t prot = pgprot_val(newprot);
25448 + pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
25450 - val &= _PAGE_CHG_MASK;
25451 - val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
25452 + if (prot & _PAGE_PRESENT)
25453 + prot &= __supported_pte_mask;
25454 + val |= prot & ~_PAGE_CHG_MASK;
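The pte_modify() masking above, demonstrated standalone: keep the pfn plus the
"change" bits, take everything else from the new protection. Flag values are
illustrative, and the __supported_pte_mask filtering is omitted:

/* pte_modify() mask arithmetic on demo constants. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t pteval_t;

#define _PAGE_PRESENT	0x001ULL
#define _PAGE_RW	0x002ULL
#define _PAGE_ACCESSED	0x020ULL
#define _PAGE_DIRTY	0x040ULL
#define PTE_PFN_MASK	0x000ffffffffff000ULL
#define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)

int main(void)
{
	pteval_t pte  = 0x1234000ULL | _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY;
	pteval_t prot = _PAGE_PRESENT;	/* e.g. an mprotect() to read-only */
	pteval_t val  = (pte & _PAGE_CHG_MASK) | (prot & ~_PAGE_CHG_MASK);

	printf("%llx\n", (unsigned long long)val);	/* 1234041: RW gone, dirty kept */
	return 0;
}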
25458 @@ -325,9 +321,11 @@ static inline pgprot_t pgprot_modify(pgp
25459 return __pgprot(preservebits | addbits);
25462 -#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
25463 +#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
25465 -#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
25466 +#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \
25467 + ? pgprot_val(p) & __supported_pte_mask \
25470 #ifndef __ASSEMBLY__
25471 #define __HAVE_PHYS_MEM_ACCESS_PROT
25472 @@ -338,6 +336,17 @@ int phys_mem_access_prot_allowed(struct
25473 unsigned long size, pgprot_t *vma_prot);
25476 +/* Install a pte for a particular vaddr in kernel space. */
25477 +void set_pte_vaddr(unsigned long vaddr, pte_t pte);
25479 +#ifndef CONFIG_XEN
25480 +extern void native_pagetable_setup_start(pgd_t *base);
25481 +extern void native_pagetable_setup_done(pgd_t *base);
25483 +static inline void xen_pagetable_setup_start(pgd_t *base) {}
25484 +static inline void xen_pagetable_setup_done(pgd_t *base) {}
25487 #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
25488 #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
25490 @@ -373,6 +382,26 @@ int phys_mem_access_prot_allowed(struct
25491 # include "pgtable_64.h"
25495 + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
25497 + * this macro returns the index of the entry in the pgd page which would
25498 + * control the given virtual address
25500 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25503 + * pgd_offset() returns a (pgd_t *)
25504 + * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
25506 +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25508 + * a shortcut which implies the use of the kernel's pgd, instead of a process's
25511 +#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25514 #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
25515 #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
25517 @@ -383,8 +412,15 @@ enum {
25524 +#ifdef CONFIG_PROC_FS
25525 +extern void update_page_count(int level, unsigned long pages);
25527 +static inline void update_page_count(int level, unsigned long pages) { }
25531 * Helper function that returns the kernel pagetable entry controlling
25532 * the virtual address 'address'. NULL means no pagetable entry present.
25533 @@ -441,6 +477,8 @@ static inline void xen_pte_clear(struct
25534 * race with other CPU's that might be updating the dirty
25535 * bit at the same time.
25537 +struct vm_area_struct;
25539 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
25540 extern int ptep_set_access_flags(struct vm_area_struct *vma,
25541 unsigned long address, pte_t *ptep,
25542 @@ -523,9 +561,6 @@ static inline void clone_pgd_range(pgd_t
25543 memcpy(dst, src, count * sizeof(pgd_t));
25546 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
25547 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
25549 #define arbitrary_virt_to_machine(va) \
25551 unsigned int __lvl; \
25552 @@ -548,6 +583,34 @@ struct page *kmap_atomic_to_page(void *)
25553 #define ptep_to_machine(ptep) virt_to_machine(ptep)
25556 +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
25557 +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
25560 +#if CONFIG_XEN_COMPAT < 0x030300
25561 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
25562 + return ptep_get_and_clear(mm, addr, ptep);
25567 +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
25568 + pte_t *ptep, pte_t pte)
25572 +#if CONFIG_XEN_COMPAT < 0x030300
25573 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
25574 + set_pte_at(mm, addr, ptep, pte);
25578 + u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
25579 + u.val = __pte_val(pte);
25580 + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
25584 #include <asm-generic/pgtable.h>
25586 #include <xen/features.h>
25587 @@ -576,10 +639,6 @@ int touch_pte_range(struct mm_struct *mm
25588 unsigned long address,
25589 unsigned long size);
25591 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
25592 - unsigned long addr, unsigned long end, pgprot_t newprot,
25593 - int dirty_accountable);
25595 #endif /* __ASSEMBLY__ */
25597 #endif /* _ASM_X86_PGTABLE_H */
25598 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:38:05.000000000 +0100
25599 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-06-04 10:21:39.000000000 +0200
25600 @@ -14,11 +14,11 @@
25601 #define pmd_ERROR(e) \
25602 printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
25603 __FILE__, __LINE__, &(e), __pmd_val(e), \
25604 - (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25605 + (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25606 #define pgd_ERROR(e) \
25607 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
25608 __FILE__, __LINE__, &(e), __pgd_val(e), \
25609 - (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25610 + (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25612 static inline int pud_none(pud_t pud)
25614 @@ -27,7 +27,7 @@ static inline int pud_none(pud_t pud)
25616 static inline int pud_bad(pud_t pud)
25618 - return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25619 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25622 static inline int pud_present(pud_t pud)
25623 @@ -102,9 +102,9 @@ static inline void pud_clear(pud_t *pudp
25627 -#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
25628 +#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
25630 -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
25631 +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
25634 /* Find an entry in the second-level page table.. */
25635 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:38:05.000000000 +0100
25636 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-06-04 10:21:39.000000000 +0200
25637 @@ -89,10 +89,10 @@ extern unsigned long pg0[];
25638 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
25639 can temporarily clear it. */
25640 #define pmd_present(x) (__pmd_val(x))
25641 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25642 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25644 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
25645 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25646 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25650 @@ -119,26 +119,6 @@ extern unsigned long pg0[];
25652 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
25655 - * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
25657 - * this macro returns the index of the entry in the pgd page which would
25658 - * control the given virtual address
25660 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25661 -#define pgd_index_k(addr) pgd_index((addr))
25664 - * pgd_offset() returns a (pgd_t *)
25665 - * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
25667 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25670 - * a shortcut which implies the use of the kernel's pgd, instead of a process's
25673 -#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25675 static inline int pud_large(pud_t pud) { return 0; }
25677 @@ -165,7 +145,7 @@ static inline int pud_large(pud_t pud) {
25678 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25680 #define pmd_page_vaddr(pmd) \
25681 - ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
25682 + ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
25684 #if defined(CONFIG_HIGHPTE)
25685 #define pte_offset_map(dir, address) \
25686 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:38:05.000000000 +0100
25687 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-06-04 10:21:39.000000000 +0200
25688 @@ -23,6 +23,8 @@ extern void xen_init_pt(void);
25689 extern pud_t level3_kernel_pgt[512];
25690 extern pud_t level3_ident_pgt[512];
25691 extern pmd_t level2_kernel_pgt[512];
25692 +extern pmd_t level2_fixmap_pgt[512];
25693 +extern pmd_t level2_ident_pgt[512];
25694 extern pgd_t init_level4_pgt[];
25696 #define swapper_pg_dir init_level4_pgt
25697 @@ -79,6 +81,9 @@ extern void paging_init(void);
25701 +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
25704 #define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
25706 static inline void xen_set_pte(pte_t *ptep, pte_t pte)
25707 @@ -145,29 +150,29 @@ static inline void xen_pgd_clear(pgd_t *
25708 #define PGDIR_MASK (~(PGDIR_SIZE - 1))
25711 -#define MAXMEM _AC(0x00003fffffffffff, UL)
25712 +#define MAXMEM _AC(0x000004ffffffffff, UL)
25713 #define VMALLOC_START _AC(0xffffc20000000000, UL)
25714 #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
25715 #define VMEMMAP_START _AC(0xffffe20000000000, UL)
25716 #define MODULES_VADDR _AC(0xffffffffa0000000, UL)
25717 -#define MODULES_END _AC(0xfffffffffff00000, UL)
25718 +#define MODULES_END _AC(0xffffffffff000000, UL)
25719 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
25721 #ifndef __ASSEMBLY__
25723 static inline int pgd_bad(pgd_t pgd)
25725 - return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25726 + return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25729 static inline int pud_bad(pud_t pud)
25731 - return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25732 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25735 static inline int pmd_bad(pmd_t pmd)
25737 - return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25738 + return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25741 #define pte_none(x) (!(x).pte)
25742 @@ -175,7 +180,7 @@ static inline int pmd_bad(pmd_t pmd)
25744 #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
25746 -#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
25747 +#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
25748 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
25749 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
25750 #define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
25751 @@ -200,11 +205,8 @@ static inline int pmd_bad(pmd_t pmd)
25754 #define pgd_page_vaddr(pgd) \
25755 - ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
25756 + ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
25757 #define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
25758 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25759 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25760 -#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
25761 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
25762 static inline int pgd_large(pgd_t pgd) { return 0; }
25763 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
25764 @@ -226,7 +228,7 @@ static inline int pud_large(pud_t pte)
25767 /* PMD - Level 2 access */
25768 -#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
25769 +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
25770 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25772 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
25773 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:38:05.000000000 +0100
25774 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/processor.h 2009-06-04 10:21:39.000000000 +0200
25775 @@ -134,7 +134,7 @@ extern __u32 cleared_cpu_caps[NCAPINTS
25777 DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25778 #define cpu_data(cpu) per_cpu(cpu_info, cpu)
25779 -#define current_cpu_data cpu_data(smp_processor_id())
25780 +#define current_cpu_data __get_cpu_var(cpu_info)
25782 #define cpu_data(cpu) boot_cpu_data
25783 #define current_cpu_data boot_cpu_data
25784 @@ -153,7 +153,7 @@ static inline int hlt_works(int cpu)
25786 extern void cpu_detect(struct cpuinfo_x86 *c);
25788 -extern void identify_cpu(struct cpuinfo_x86 *);
25789 +extern void early_cpu_init(void);
25790 extern void identify_boot_cpu(void);
25791 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25792 extern void print_cpu_info(struct cpuinfo_x86 *);
25793 @@ -267,15 +267,11 @@ struct tss_struct {
25794 struct thread_struct *io_bitmap_owner;
25797 - * Pad the TSS to be cacheline-aligned (size is 0x100):
25799 - unsigned long __cacheline_filler[35];
25801 * .. and then another 0x100 bytes for the emergency kernel stack:
25803 unsigned long stack[64];
25805 -} __attribute__((packed));
25806 +} ____cacheline_aligned;
25808 DECLARE_PER_CPU(struct tss_struct, init_tss);
25810 @@ -668,11 +664,36 @@ static inline void __sti_mwait(unsigned
25812 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25814 -extern int force_mwait;
25816 extern void select_idle_routine(const struct cpuinfo_x86 *c);
25818 extern unsigned long boot_option_idle_override;
25819 +extern unsigned long idle_halt;
25820 +extern unsigned long idle_nomwait;
25822 +#ifndef CONFIG_XEN
25824 + * on systems with caches, caches must be flashed as the absolute
25825 + * last instruction before going into a suspended halt. Otherwise,
25826 + * dirty data can linger in the cache and become stale on resume,
25827 + * leading to strange errors.
25829 + * perform a variety of operations to guarantee that the compiler
25830 + * will not reorder instructions. wbinvd itself is serializing
25831 + * so the processor will not reorder.
25833 + * Systems without cache can just go into halt.
25835 +static inline void wbinvd_halt(void)
25838 + /* check for clflush to determine if wbinvd is legal */
25839 + if (cpu_has_clflush)
25840 + asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
25847 extern void enable_sep_cpu(void);
25848 extern int sysenter_setup(void);
25849 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:38:05.000000000 +0100
25850 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/segment.h 2009-06-04 10:21:39.000000000 +0200
25852 #ifndef _ASM_X86_SEGMENT_H_
25853 #define _ASM_X86_SEGMENT_H_
25855 +/* Constructor for a conventional segment GDT (or LDT) entry */
25856 +/* This is a macro so it can be used in initializers */
25857 +#define GDT_ENTRY(flags, base, limit) \
25858 + ((((base) & 0xff000000ULL) << (56-24)) | \
25859 + (((flags) & 0x0000f0ffULL) << 40) | \
25860 + (((limit) & 0x000f0000ULL) << (48-16)) | \
25861 + (((base) & 0x00ffffffULL) << 16) | \
25862 + (((limit) & 0x0000ffffULL)))
25864 /* Simple and small GDT entries for booting only */
25866 #define GDT_ENTRY_BOOT_CS 2
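Exercising the new GDT_ENTRY() constructor: packing the textbook flat 4 GiB
ring-0 code descriptor. The flags value 0xc09a (present, DPL 0, code, 4K
granularity, 32-bit) is the classic example, not something this patch defines:

/* GDT_ENTRY() reproduced verbatim and fed the canonical flat-code values. */
#include <stdio.h>

#define GDT_ENTRY(flags, base, limit)			\
	((((base)  & 0xff000000ULL) << (56-24)) |	\
	 (((flags) & 0x0000f0ffULL) << 40) |		\
	 (((limit) & 0x000f0000ULL) << (48-16)) |	\
	 (((base)  & 0x00ffffffULL) << 16) |		\
	 (((limit) & 0x0000ffffULL)))

int main(void)
{
	printf("%016llx\n", GDT_ENTRY(0xc09a, 0, 0xfffff));	/* 00cf9a000000ffff */
	return 0;
}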
25867 @@ -61,18 +70,14 @@
25868 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
25870 #define GDT_ENTRY_DEFAULT_USER_CS 14
25871 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
25873 #define GDT_ENTRY_DEFAULT_USER_DS 15
25874 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
25876 #define GDT_ENTRY_KERNEL_BASE 12
25878 #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
25879 -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25881 #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
25882 -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25884 #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
25885 #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
25886 @@ -143,10 +148,11 @@
25888 #include <asm/cache.h>
25890 -#define __KERNEL_CS 0x10
25891 -#define __KERNEL_DS 0x18
25892 +#define GDT_ENTRY_KERNEL32_CS 1
25893 +#define GDT_ENTRY_KERNEL_CS 2
25894 +#define GDT_ENTRY_KERNEL_DS 3
25896 -#define __KERNEL32_CS 0x08
25897 +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
25900 * we cannot use the same code segment descriptor for user and kernel
25901 @@ -154,10 +160,10 @@
25902 * The segment offset needs to contain a RPL. Grr. -AK
25903 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
25906 -#define __USER32_CS 0x23 /* 4*8+3 */
25907 -#define __USER_DS 0x2b /* 5*8+3 */
25908 -#define __USER_CS 0x33 /* 6*8+3 */
25909 +#define GDT_ENTRY_DEFAULT_USER32_CS 4
25910 +#define GDT_ENTRY_DEFAULT_USER_DS 5
25911 +#define GDT_ENTRY_DEFAULT_USER_CS 6
25912 +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
25913 #define __USER32_DS __USER_DS
25915 #define GDT_ENTRY_TSS 8 /* needs two entries */
25916 @@ -179,6 +185,11 @@
25920 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25921 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25922 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
25923 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
25925 /* User mode is privilege level 3 */
25926 #define USER_RPL 0x3
25927 /* LDT segment has TI set, GDT has it cleared */
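The selector values above all follow the same encoding — GDT index in bits 3
and up, table indicator in bit 2, RPL in bits 0-1 — which a few lines of C can
decompose:

/* Decomposing __USER_CS on 64-bit: GDT entry 6 with RPL 3. */
#include <stdio.h>

int main(void)
{
	unsigned int sel = 6 * 8 + 3;	/* __USER_CS */

	printf("index %u, TI %u, RPL %u\n",
	       sel >> 3, (sel >> 2) & 1, sel & 3);	/* 6, 0, 3 */
	return 0;
}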
25928 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/smp.h 2009-03-16 16:38:05.000000000 +0100
25929 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/smp.h 2009-06-04 10:21:39.000000000 +0200
25930 @@ -25,23 +25,16 @@ extern cpumask_t cpu_initialized;
25931 extern void (*mtrr_hook)(void);
25932 extern void zap_low_mappings(void);
25934 +extern int __cpuinit get_local_pda(int cpu);
25936 extern int smp_num_siblings;
25937 extern unsigned int num_processors;
25938 extern cpumask_t cpu_initialized;
25940 -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
25941 -extern u16 x86_cpu_to_apicid_init[];
25942 -extern u16 x86_bios_cpu_apicid_init[];
25943 -extern void *x86_cpu_to_apicid_early_ptr;
25944 -extern void *x86_bios_cpu_apicid_early_ptr;
25946 -#define x86_cpu_to_apicid_early_ptr NULL
25947 -#define x86_bios_cpu_apicid_early_ptr NULL
25950 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
25951 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
25952 DECLARE_PER_CPU(u16, cpu_llc_id);
25954 DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
25955 DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
25957 @@ -63,9 +56,9 @@ struct smp_ops {
25959 void (*smp_send_stop)(void);
25960 void (*smp_send_reschedule)(int cpu);
25961 - int (*smp_call_function_mask)(cpumask_t mask,
25962 - void (*func)(void *info), void *info,
25965 + void (*send_call_func_ipi)(cpumask_t mask);
25966 + void (*send_call_func_single_ipi)(int cpu);
25969 /* Globals due to paravirt */
25970 @@ -106,11 +99,14 @@ static inline void smp_send_reschedule(i
25971 smp_ops.smp_send_reschedule(cpu);
25974 -static inline int smp_call_function_mask(cpumask_t mask,
25975 - void (*func) (void *info), void *info,
25977 +static inline void arch_send_call_function_single_ipi(int cpu)
25979 - return smp_ops.smp_call_function_mask(mask, func, info, wait);
25980 + smp_ops.send_call_func_single_ipi(cpu);
25983 +static inline void arch_send_call_function_ipi(cpumask_t mask)
25985 + smp_ops.send_call_func_ipi(mask);
25988 void native_smp_prepare_boot_cpu(void);
25989 @@ -122,23 +118,19 @@ int native_cpu_up(unsigned int cpunum);
25991 void xen_smp_send_stop(void);
25992 void xen_smp_send_reschedule(int cpu);
25993 -int xen_smp_call_function_mask(cpumask_t mask,
25994 - void (*func) (void *info), void *info,
25996 +void xen_send_call_func_ipi(cpumask_t mask);
25997 +void xen_send_call_func_single_ipi(int cpu);
25999 #define smp_send_stop xen_smp_send_stop
26000 #define smp_send_reschedule xen_smp_send_reschedule
26001 -#define smp_call_function_mask xen_smp_call_function_mask
26003 -extern void prefill_possible_map(void);
26004 +#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
26005 +#define arch_send_call_function_ipi xen_send_call_func_ipi
26007 #endif /* CONFIG_XEN */
26009 extern int __cpu_disable(void);
26010 extern void __cpu_die(unsigned int cpu);
26012 -extern void prefill_possible_map(void);
26014 void smp_store_cpu_info(int id);
26015 #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
26017 @@ -149,6 +141,14 @@ static inline int num_booting_cpus(void)
26019 #endif /* CONFIG_SMP */
26021 +#if defined(CONFIG_SMP) && (defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_XEN))
26022 +extern void prefill_possible_map(void);
26024 +static inline void prefill_possible_map(void)
26029 extern unsigned disabled_cpus __cpuinitdata;
26031 #ifdef CONFIG_X86_32_SMP
26032 @@ -216,12 +216,8 @@ static inline int hard_smp_processor_id(
26033 #endif /* CONFIG_X86_LOCAL_APIC */
26035 #ifdef CONFIG_HOTPLUG_CPU
26036 -extern void cpu_exit_clear(void);
26037 extern void cpu_uninit(void);
26040 -extern void smp_alloc_memory(void);
26041 -extern void lock_ipi_call_lock(void);
26042 -extern void unlock_ipi_call_lock(void);
26043 #endif /* __ASSEMBLY__ */
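The reworked smp_ops boils down to an ops table of function pointers; reduced
here to the two new IPI hooks, with printf stand-ins for the real
xen_send_call_func_*() implementations:

/* Ops-table indirection as smp_ops uses it, in miniature. */
#include <stdio.h>

typedef unsigned long cpumask_t;	/* stand-in for the kernel type */

struct demo_smp_ops {
	void (*send_call_func_ipi)(cpumask_t mask);
	void (*send_call_func_single_ipi)(int cpu);
};

static void demo_send_ipi(cpumask_t mask)  { printf("ipi mask %#lx\n", mask); }
static void demo_send_single(int cpu)      { printf("ipi cpu %d\n", cpu); }

static struct demo_smp_ops smp_ops = {
	.send_call_func_ipi        = demo_send_ipi,
	.send_call_func_single_ipi = demo_send_single,
};

static inline void arch_send_call_function_ipi(cpumask_t mask)
{
	smp_ops.send_call_func_ipi(mask);
}

int main(void)
{
	arch_send_call_function_ipi(0x5);
	smp_ops.send_call_func_single_ipi(2);
	return 0;
}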
--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/spinlock.h	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/spinlock.h	2009-06-04 11:09:05.000000000 +0200
 # define UNLOCK_LOCK_PREFIX
 
+/*
+ * Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
+ */
+#if CONFIG_XEN_COMPAT >= 0x030200
 int xen_spinlock_init(unsigned int cpu);
 void xen_spinlock_cleanup(unsigned int cpu);
 extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
@@ -65,14 +70,14 @@ extern void xen_spin_kick(raw_spinlock_t
 
 #if (NR_CPUS < 256)
 #define TICKET_SHIFT 8
-#define __raw_spin_lock_preamble \
+#define __ticket_spin_lock_preamble \
 	asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
 	    "cmpb %h0, %b0\n\t" \
 	    : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
-#define __raw_spin_lock_body \
+#define __ticket_spin_lock_body \
 	    "cmpb %h0, %b0\n\t" \
@@ -88,7 +93,7 @@ extern void xen_spin_kick(raw_spinlock_t
 
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
 {
@@ -107,7 +112,7 @@ static __always_inline int __raw_spin_tr
 
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
 {
 	unsigned int token;
 	unsigned char kick;
@@ -124,7 +129,7 @@ static __always_inline void __raw_spin_u
 
 #define TICKET_SHIFT 16
-#define __raw_spin_lock_preamble \
+#define __ticket_spin_lock_preamble \
 	unsigned int tmp; \
 	asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
 	    : "0" (0x00010000) \
 	    : "memory", "cc"); \
-#define __raw_spin_lock_body \
+#define __ticket_spin_lock_body \
 	unsigned int tmp; \
 	asm("shldl $16, %0, %2\n" \
@@ -155,7 +160,7 @@ static __always_inline void __raw_spin_u
 	    : "memory", "cc"); \
 
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
 {
@@ -177,7 +182,7 @@ static __always_inline int __raw_spin_tr
 
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
 {
 	unsigned int token, tmp;
 
@@ -195,49 +200,161 @@ static __always_inline void __raw_spin_u
 
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
 {
 	int tmp = ACCESS_ONCE(lock->slock);
 
 	return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
 }
 
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
 {
 	int tmp = ACCESS_ONCE(lock->slock);
 
 	return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
 }
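(Worked example of the ticket encoding the two predicates above decode; a
standalone user-space sketch, not kernel code. The low byte is the ticket
currently being served ("head"), the high byte the next ticket to hand out
("tail"):)

	#include <stdio.h>
	#define TICKET_SHIFT 8	/* NR_CPUS < 256 case */

	static int is_locked(int s)
	{
		/* head != tail  =>  the lock is held */
		return !!(((s >> TICKET_SHIFT) ^ s) & ((1 << TICKET_SHIFT) - 1));
	}

	static int is_contended(int s)
	{
		/* more than one ticket outstanding  =>  someone is queued */
		return (((s >> TICKET_SHIFT) - s) & ((1 << TICKET_SHIFT) - 1)) > 1;
	}

	int main(void)
	{
		/* 0x0503: head 3, tail 5 -> held by ticket 3, ticket 4 waiting */
		printf("%d %d\n", is_locked(0x0503), is_contended(0x0503)); /* 1 1 */
		/* 0x0404: head == tail -> free */
		printf("%d %d\n", is_locked(0x0404), is_contended(0x0404)); /* 0 0 */
		return 0;
	}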
-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
 {
 	unsigned int token, count;
 	bool free;
 
-	__raw_spin_lock_preamble;
+	__ticket_spin_lock_preamble;
 	if (unlikely(!free))
 		token = xen_spin_adjust(lock, token);
 	do {
-		__raw_spin_lock_body;
+		__ticket_spin_lock_body;
 	} while (unlikely(!count) && !xen_spin_wait(lock, token));
 }
 
-static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
-						  unsigned long flags)
+static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
+						     unsigned long flags)
 {
 	unsigned int token, count;
 	bool free;
 
-	__raw_spin_lock_preamble;
+	__ticket_spin_lock_preamble;
 	if (unlikely(!free))
 		token = xen_spin_adjust(lock, token);
 	do {
-		__raw_spin_lock_body;
+		__ticket_spin_lock_body;
 	} while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
 }
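(Shape of the slow path above as a self-contained sketch: spin a bounded
number of iterations, then block in the hypervisor until the unlocker kicks
us. poll_on() is a hypothetical stand-in for xen_spin_wait(), i.e. for
sleeping in HYPERVISOR_poll() until xen_spin_kick():)

	struct ticket_lock { volatile unsigned short head, tail; };

	void poll_on(struct ticket_lock *lk, unsigned short token); /* hypothetical */

	static void lock_slowpath(struct ticket_lock *lk, unsigned short token)
	{
		for (;;) {
			int spins = 1000;	/* bounded spin, like `count` */

			while (spins--) {
				if (lk->head == token)
					return;			/* our turn */
				__asm__ __volatile__("pause");	/* rep;nop */
			}
			poll_on(lk, token);	/* sleep until kicked */
		}
	}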
+#define __raw_spin(n) __ticket_spin_##n
+
+#else /* CONFIG_XEN_COMPAT < 0x030200 */
+/*
+ * Define virtualization-friendly old-style lock byte lock, for use in
+ * pv_lock_ops if desired.
+ *
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure.  It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+struct __byte_spinlock {
+	s8 lock;
+	s8 spinners;
+};
+
+#if NR_CPUS >= 256
+#error NR_CPUS >= 256 support not implemented
+#endif
+
+static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
+static inline void xen_spinlock_cleanup(unsigned int cpu) {}
+
+static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
+{
+	struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+	return bl->lock != 0;
+}
+
+static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
+{
+	struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+	return bl->spinners != 0;
+}
+
+static inline void __byte_spin_lock(raw_spinlock_t *lock)
+{
+	struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+	s8 val = 1;
+
+	asm("1: xchgb %1, %0\n"
+	    "   test %1,%1\n"
+	    "   jz 3f\n"
+	    "   " LOCK_PREFIX "incb %2\n"
+	    "2: rep;nop\n"
+	    "   cmpb $1, %0\n"
+	    "   je 2b\n"
+	    "   " LOCK_PREFIX "decb %2\n"
+	    "   jmp 1b\n"
+	    "3:"
+	    : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
+}
+
+#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
+
+static inline int __byte_spin_trylock(raw_spinlock_t *lock)
+{
+	struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+	u8 old = 1;
+
+	asm("xchgb %1,%0"
+	    : "+m" (bl->lock), "+q" (old) : : "memory");
+
+	return old == 0;
+}
+
+static inline void __byte_spin_unlock(raw_spinlock_t *lock)
+{
+	struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+	smp_wmb();
+	bl->lock = 0;
+}
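(Portable analogue of the byte lock above, using GCC atomic builtins in
place of the inline asm; a sketch only, not the kernel implementation:)

	struct byte_lock {
		volatile unsigned char lock;	 /* 0 = free, 1 = held */
		volatile unsigned char spinners; /* waiters, for is_contended */
	};

	static void byte_lock(struct byte_lock *bl)
	{
		while (__sync_lock_test_and_set(&bl->lock, 1)) {	/* xchgb */
			__sync_fetch_and_add(&bl->spinners, 1);		/* lock incb */
			while (bl->lock)
				;					/* rep;nop loop */
			__sync_fetch_and_sub(&bl->spinners, 1);		/* lock decb */
		}
	}

	static void byte_unlock(struct byte_lock *bl)
	{
		__sync_lock_release(&bl->lock);	/* release-ordered store of 0 */
	}

Because the lock byte starts at zero and is taken with an unconditional
exchange, a zero-initialized structure is always a valid unlocked lock.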
+#define __raw_spin(n) __byte_spin_##n
+
+#endif /* CONFIG_XEN_COMPAT */
+
+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+{
+	return __raw_spin(is_locked)(lock);
+}
+
+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+{
+	return __raw_spin(is_contended)(lock);
+}
+
+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+	__raw_spin(lock)(lock);
+}
+
+static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
+						  unsigned long flags)
+{
+	__raw_spin(lock_flags)(lock, flags);
+}
+
+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+	return __raw_spin(trylock)(lock);
+}
+
+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+	__raw_spin(unlock)(lock);
+}
 
 static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
 {
 	while (__raw_spin_is_locked(lock))
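(The __raw_spin(n) indirection selects the backend purely by token pasting:
__raw_spin(trylock)(lock) expands to __ticket_spin_trylock(lock) or
__byte_spin_trylock(lock), depending on which branch of the
CONFIG_XEN_COMPAT conditional defined the macro. Minimal demonstration of
the idiom, illustrative names:)

	#define BACKEND(n) ticket_##n		/* or byte_##n */
	static int ticket_trylock(void) { return 1; }
	/* BACKEND(trylock)() compiles to a direct call to ticket_trylock() */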
--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/system.h	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/system.h	2009-06-04 10:21:39.000000000 +0200
@@ -137,7 +137,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 #define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
 #define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
 
-extern void load_gs_index(unsigned);
+extern void xen_load_gs_index(unsigned);
 
 /*
  * Load a segment. Fall back on loading the zero
@@ -154,14 +154,14 @@ extern void load_gs_index(unsigned);
 		     _ASM_EXTABLE(1b,3b) \
-		     : :"r" (value), "r" (0))
+		     : :"r" (value), "r" (0) : "memory")
 
 /*
  * Save a segment register away
 */
 #define savesegment(seg, value)				\
-	asm volatile("mov %%" #seg ",%0":"=rm" (value))
+	asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
 
 static inline unsigned long get_limit(unsigned long segment)
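(Both hunks above add a "memory" clobber so the compiler treats the segment
load/store as a barrier; without it, surrounding memory accesses could be
reordered across the segment switch. Sketch of the effect the clobber has:)

	extern int scratch;

	static void demo(void)
	{
		scratch = 1;
		asm volatile("" : : : "memory");	/* compiler barrier */
		scratch = 2;	/* neither store may be moved or merged
				   across the barrier */
	}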
@@ -269,6 +269,7 @@ static inline void xen_wbinvd(void)
 #ifdef CONFIG_X86_64
 #define read_cr8()	(xen_read_cr8())
 #define write_cr8(x)	(xen_write_cr8(x))
+#define load_gs_index	xen_load_gs_index
 #endif
 
 /* Clear the 'TS' bit */
@@ -287,13 +288,12 @@ static inline void clflush(volatile void
 void disable_hlt(void);
 void enable_hlt(void);
 
-extern int es7000_plat;
 void cpu_idle_wait(void);
 
 extern unsigned long arch_align_stack(unsigned long sp);
 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
 
-void default_idle(void);
+void xen_idle(void);
 
 /*
  * Force strict CPU ordering.
--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/xor_64.h	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/xor_64.h	2009-06-04 10:21:39.000000000 +0200
+#ifndef ASM_X86__XOR_64_H
+#define ASM_X86__XOR_64_H
+
 /*
  * x86-64 changes / gcc fixes from Andi Kleen.
  * Copyright 2002 Andi Kleen, SuSE Labs.
@@ -330,3 +333,5 @@ do {						\
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
 #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+
+#endif /* ASM_X86__XOR_64_H */
--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/irq_vectors.h	2009-03-16 16:33:40.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
-/*
- * This file should contain #defines for all of the interrupt vector
- * numbers used by this architecture.
- *
- * In addition, there are some standard defines:
- *
- *	FIRST_EXTERNAL_VECTOR:
- *		The first free place for external interrupts
- *
- *	SYSCALL_VECTOR:
- *		The IRQ vector a syscall makes the user to kernel transition
- *		under.
- *
- *	TIMER_IRQ:
- *		The IRQ number the timer interrupt comes in at.
- *
- *	NR_IRQS:
- *		The total number of interrupt vectors (including all the
- *		architecture specific interrupts) needed.
- *
- */
-#ifndef _ASM_IRQ_VECTORS_H
-#define _ASM_IRQ_VECTORS_H
-
-/*
- * IDT vectors usable for external interrupt sources start
- * at 0x20:
- */
-#define FIRST_EXTERNAL_VECTOR	0x20
-
-#define SYSCALL_VECTOR		0x80
-
-/*
- * Vectors 0x20-0x2f are used for ISA interrupts.
- */
-
-/*
- * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
- *
- *  some of the following vectors are 'rare', they are merged
- *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
- *  TLB, reschedule and local APIC vectors are performance-critical.
- *
- *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
- */
-#define SPURIOUS_APIC_VECTOR	0xff
-#define ERROR_APIC_VECTOR	0xfe
-#define INVALIDATE_TLB_VECTOR	0xfd
-#define RESCHEDULE_VECTOR	0xfc
-#define CALL_FUNCTION_VECTOR	0xfb
-
-#define THERMAL_APIC_VECTOR	0xf0
-/*
- * Local APIC timer IRQ vector is on a different priority level,
- * to work around the 'lost local interrupt if more than 2 IRQ
- * sources per level' errata.
- */
-#define LOCAL_TIMER_VECTOR	0xef
-
-#define SPURIOUS_APIC_VECTOR	0xff
-#define ERROR_APIC_VECTOR	0xfe
-
-/*
- * First APIC vector available to drivers: (vectors 0x30-0xee)
- * we start at 0x31 to spread out vectors evenly between priority
- * levels. (0x80 is the syscall vector)
- */
-#define FIRST_DEVICE_VECTOR	0x31
-#define FIRST_SYSTEM_VECTOR	0xef
-
-/*
- * 16 8259A IRQ's, 208 potential APIC interrupt sources.
- * Right now the APIC is mostly only used for SMP.
- * 256 vectors is an architectural limit. (we can have
- * more than 256 devices theoretically, but they will
- * have to use shared interrupts)
- * Since vectors 0x00-0x1f are used/reserved for the CPU,
- * the usable vector space is 0x20-0xff (224 vectors)
- */
-
-#define RESCHEDULE_VECTOR	0
-#define CALL_FUNCTION_VECTOR	1
-#define SPIN_UNLOCK_VECTOR	2
-
-/*
- * The maximum number of vectors supported by i386 processors
- * is limited to 256. For processors other than i386, NR_VECTORS
- * should be changed accordingly.
- */
-#define NR_VECTORS 256
-
-#define FPU_IRQ			13
-
-#define	FIRST_VM86_IRQ		3
-#define LAST_VM86_IRQ		15
-#define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
-
-/*
- * The flat IRQ space is divided into two regions:
- *  1. A one-to-one mapping of real physical IRQs. This space is only used
- *     if we have physical device-access privilege. This region is at the
- *     start of the IRQ space so that existing device drivers do not need
- *     to be modified to translate physical IRQ numbers into our IRQ space.
- *  3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
- *     are bound using the provided bind/unbind functions.
- */
-
-#define PIRQ_BASE		0
-#if !defined(MAX_IO_APICS)
-# define NR_PIRQS		(NR_VECTORS + 32 * NR_CPUS)
-#elif NR_CPUS < MAX_IO_APICS
-# define NR_PIRQS		(NR_VECTORS + 32 * NR_CPUS)
-#else
-# define NR_PIRQS		(NR_VECTORS + 32 * MAX_IO_APICS)
-#endif
-
-#define DYNIRQ_BASE		(PIRQ_BASE + NR_PIRQS)
-#define NR_DYNIRQS		256
-
-#define NR_IRQS			(NR_PIRQS + NR_DYNIRQS)
-#define NR_IRQ_VECTORS		NR_IRQS
-
-#endif /* _ASM_IRQ_VECTORS_H */
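(For reference, the IRQ-space layout the deleted header described, worked
through for NR_CPUS == 8 with MAX_IO_APICS undefined; the definitions now
come from elsewhere in the 2.6.27 series:)

	NR_PIRQS    = NR_VECTORS + 32 * NR_CPUS = 256 + 256 = 512
	DYNIRQ_BASE = PIRQ_BASE + NR_PIRQS      = 512
	NR_IRQS     = NR_PIRQS + NR_DYNIRQS     = 512 + 256 = 768

so IRQs 0..511 hold the one-to-one physical IRQ mappings and IRQs 512..767
the dynamically bound event-channel IRQs.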
--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/setup_arch_post.h	2009-06-04 11:08:07.000000000 +0200
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
-/**
- * machine_specific_* - Hooks for machine specific setup.
- *
- * Description:
- *	This is included late in kernel/setup.c so that it can make
- *	use of all of the static functions.
- **/
-
-#include <xen/interface/callback.h>
-
-extern void hypervisor_callback(void);
-extern void failsafe_callback(void);
-extern void nmi(void);
-
-static void __init machine_specific_arch_setup(void)
-{
-	int ret;
-	static struct callback_register __initdata event = {
-		.type = CALLBACKTYPE_event,
-		.address = (unsigned long) hypervisor_callback,
-	};
-	static struct callback_register __initdata failsafe = {
-		.type = CALLBACKTYPE_failsafe,
-		.address = (unsigned long)failsafe_callback,
-	};
-	static struct callback_register __initdata syscall = {
-		.type = CALLBACKTYPE_syscall,
-		.address = (unsigned long)system_call,
-	};
-#ifdef CONFIG_X86_LOCAL_APIC
-	static struct callback_register __initdata nmi_cb = {
-		.type = CALLBACKTYPE_nmi,
-		.address = (unsigned long)nmi,
-	};
-#endif
-
-	ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
-	if (ret == 0)
-		ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
-	if (ret == 0)
-		ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
-#if CONFIG_XEN_COMPAT <= 0x030002
-	if (ret == -ENOSYS)
-		ret = HYPERVISOR_set_callbacks(
-			event.address,
-			failsafe.address,
-			syscall.address);
-#endif
-	BUG_ON(ret);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
-#if CONFIG_XEN_COMPAT <= 0x030002
-	if (ret == -ENOSYS) {
-		static struct xennmi_callback __initdata cb = {
-			.handler_address = (unsigned long)nmi
-		};
-
-		HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
-	}
-#endif
-#endif
-}
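(The removed hook shows the usual register-then-fall-back idiom:
CALLBACKOP_register is tried first and, when built with
CONFIG_XEN_COMPAT <= 0x030002 — the constant encodes a hypervisor version,
here 3.0.2 — an -ENOSYS result falls back to the older interface. Sketch
of the pattern, taken from the code above:)

	ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
	if (ret == -ENOSYS)	/* hypervisor predates CALLBACKOP_register */
		ret = HYPERVISOR_set_callbacks(event.address,
					       failsafe.address,
					       syscall.address);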
--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/setup_arch_pre.h	2009-06-04 11:08:07.000000000 +0200
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
-/* Hook to call BIOS initialisation function */
-
-#define ARCH_SETUP machine_specific_arch_setup();
-
-static void __init machine_specific_arch_setup(void);
--- sle11-2009-06-04.orig/include/asm-x86/traps.h	2009-06-04 11:08:07.000000000 +0200
+++ sle11-2009-06-04/include/asm-x86/traps.h	2009-06-04 10:21:39.000000000 +0200
@@ -23,6 +23,9 @@ asmlinkage void spurious_interrupt_bug(v
 #ifdef CONFIG_X86_MCE
 asmlinkage void machine_check(void);
 #endif /* CONFIG_X86_MCE */
+#ifdef CONFIG_X86_XEN
+asmlinkage void fixup_4gb_segment(void);
+#endif
 
 void do_divide_error(struct pt_regs *, long);
 void do_overflow(struct pt_regs *, long);
@@ -48,6 +51,9 @@ void math_error(void __user *);
 void do_coprocessor_error(struct pt_regs *, long);
 void do_simd_coprocessor_error(struct pt_regs *, long);
 void do_spurious_interrupt_bug(struct pt_regs *, long);
+#ifdef CONFIG_X86_XEN
+void do_fixup_4gb_segment(struct pt_regs *, long);
+#endif
 
 unsigned long patch_espfix_desc(unsigned long, unsigned long);
 asmlinkage void math_emulate(long);
--- sle11-2009-06-04.orig/include/asm-x86/xen/interface_64.h	2009-06-04 11:08:07.000000000 +0200
+++ sle11-2009-06-04/include/asm-x86/xen/interface_64.h	2009-06-04 10:21:39.000000000 +0200
@@ -136,7 +136,7 @@ struct cpu_user_regs {
     uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base.     */
     uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
 };
-DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(cpu_user_regs);
--- sle11-2009-06-04.orig/include/linux/page-flags.h	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-06-04/include/linux/page-flags.h	2009-06-04 10:21:39.000000000 +0200
@@ -110,9 +110,11 @@ enum pageflags {
 	/* Filesystems */
 	PG_checked = PG_owner_priv_1,
 
+#ifdef CONFIG_PARAVIRT_XEN
 	/* XEN */
 	PG_pinned = PG_owner_priv_1,
 	PG_savepinned = PG_dirty,
+#endif
 
 	/* SLOB */
 	PG_slob_page = PG_active,
@@ -187,8 +189,12 @@ PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, 
 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
 __PAGEFLAG(Slab, slab)
 PAGEFLAG(Checked, checked)		/* Used by some filesystems */
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)	/* Xen */
+#endif
+#ifdef CONFIG_PARAVIRT_XEN
 PAGEFLAG(SavePinned, savepinned);			/* Xen */
+#endif
 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
 PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
 	__SETPAGEFLAG(Private, private)
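(The PAGEFLAG() macro family generates the usual bit accessors; roughly,
as a sketch, PAGEFLAG(Pinned, pinned) produces:)

	static inline int PagePinned(struct page *page)
	{
		return test_bit(PG_pinned, &page->flags);
	}
	static inline void SetPagePinned(struct page *page)
	{
		set_bit(PG_pinned, &page->flags);
	}
	static inline void ClearPagePinned(struct page *page)
	{
		clear_bit(PG_pinned, &page->flags);
	}

Since PG_pinned aliases PG_owner_priv_1, the bit is shared with PG_checked;
the alias is safe because a page is never both a pinned page-table page and
a filesystem page, and the new #ifdefs keep the aliases out of
configurations that cannot use them.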
--- sle11-2009-06-04.orig/include/xen/interface/memory.h	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-06-04/include/xen/interface/memory.h	2009-06-04 10:21:39.000000000 +0200
@@ -82,6 +82,7 @@ struct xen_memory_reservation {
     domid_t        domid;
 };
+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_memory_reservation);
 typedef struct xen_memory_reservation xen_memory_reservation_t;
 DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
 
@@ -167,11 +168,7 @@ struct xen_machphys_mfn_list {
      * any large discontiguities in the machine address space, 2MB gaps in
      * the machphys table will be represented by an MFN base of zero.
      */
-#ifndef CONFIG_PARAVIRT_XEN
     XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
-#else
-    ulong extent_start;
-#endif
 
     /*
      * Number of extents written to the above array. This will be smaller
@@ -179,6 +176,7 @@ struct xen_machphys_mfn_list {
     unsigned int nr_extents;
 };
+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
 typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
 
@@ -218,6 +216,7 @@ struct xen_add_to_physmap {
     /* GPFN where the source mapping page should appear. */
     xen_pfn_t gpfn;
 };
+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
 
@@ -250,21 +249,13 @@ struct xen_translate_gpfn_list {
     xen_ulong_t nr_gpfns;
 
     /* List of GPFNs to translate. */
-#ifndef CONFIG_PARAVIRT_XEN
     XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
-#else
-    ulong gpfn_list;
-#endif
 
     /*
      * Output list to contain MFN translations. May be the same as the input
      * list (in which case each input GPFN is overwritten with the output MFN).
      */
-#ifndef CONFIG_PARAVIRT_XEN
     XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
-#else
-    ulong mfn_list;
-#endif
 };
 DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
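(DEFINE_XEN_GUEST_HANDLE_STRUCT() gives each struct a typed guest handle so
hypercall buffers are passed by handle rather than raw pointer. Typical
usage, sketched after the balloon-driver pattern; nr_pages and frame_list
are assumed to exist in the caller:)

	struct xen_memory_reservation reservation = {
		.nr_extents   = nr_pages,
		.extent_order = 0,
		.domid        = DOMID_SELF,
	};
	int rc;

	/* store a typed guest handle to the caller's PFN array */
	set_xen_guest_handle(reservation.extent_start, frame_list);
	rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);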
--- sle11-2009-06-04.orig/kernel/hrtimer.c	2009-06-04 11:08:07.000000000 +0200
+++ sle11-2009-06-04/kernel/hrtimer.c	2009-06-04 10:21:39.000000000 +0200
@@ -1084,7 +1084,7 @@ ktime_t hrtimer_get_remaining(const stru
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
 
-#ifdef CONFIG_NO_HZ
+#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
 /**
  * hrtimer_get_next_event - get the time until next expiry event
 *
--- sle11-2009-06-04.orig/kernel/kexec.c	2009-02-17 12:38:20.000000000 +0100
+++ sle11-2009-06-04/kernel/kexec.c	2009-06-04 10:21:39.000000000 +0200
@@ -54,7 +54,7 @@ int dump_after_notifier;
 unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
 u32
 #if defined(CONFIG_XEN) && defined(CONFIG_X86)
-__attribute__((__section__(".bss.page_aligned"), __aligned__(PAGE_SIZE)))
+__page_aligned_bss
 #endif
 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
 size_t vmcoreinfo_size;
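(__page_aligned_bss comes from <linux/linkage.h> and, if memory serves,
expands to essentially the open-coded attribute it replaces — a
".bss.page_aligned" section placement plus PAGE_SIZE alignment — so the
hunk is a cosmetic cleanup. Sketch of equivalent, unconditional usage:)

	u32 __page_aligned_bss vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];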
--- sle11-2009-06-04.orig/kernel/timer.c	2009-06-04 11:08:07.000000000 +0200
+++ sle11-2009-06-04/kernel/timer.c	2009-06-04 10:21:39.000000000 +0200
@@ -884,7 +884,7 @@ static inline void __run_timers(struct t
 		spin_unlock_irq(&base->lock);
 }
 
-#ifdef CONFIG_NO_HZ
+#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
 /*
  * Find out when the next timer event is due to happen. This
  * is used on S/390 to stop all activity when a cpus is idle.
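(This and the matching hrtimer.c hunk make the next-expiry queries
available to CONFIG_NO_IDLE_HZ kernels, where the Xen idle loop stops the
periodic tick and must ask how long it may sleep. Sketch of the consumer,
assuming the standard helper:)

	unsigned long now  = jiffies;
	unsigned long next = get_next_timer_interrupt(now);

	/* program the one-shot / hypervisor timer for (next - now) jiffies,
	 * then block until the timer or another event channel fires */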
--- sle11-2009-06-04.orig/lib/swiotlb-xen.c	2009-03-16 16:38:05.000000000 +0100
+++ sle11-2009-06-04/lib/swiotlb-xen.c	2009-06-04 10:21:39.000000000 +0200
@@ -750,7 +750,7 @@ swiotlb_sync_sg_for_device(struct device
 }
 
 int
-swiotlb_dma_mapping_error(dma_addr_t dma_addr)
+swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
 	return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
 }
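(The extra struct device * argument tracks the 2.6.27 dma_mapping_error()
interface, which now passes the device so implementations can consult
per-device dma_ops. Caller sketch; dev, buf and size are assumed:)

	dma_addr_t dma = swiotlb_map_single(dev, buf, size, DMA_TO_DEVICE);

	if (swiotlb_dma_mapping_error(dev, dma))
		return -EIO;	/* the mapping hit the overflow buffer */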
--- sle11-2009-06-04.orig/mm/mprotect.c	2009-03-04 11:28:34.000000000 +0100
+++ sle11-2009-06-04/mm/mprotect.c	2009-06-04 10:21:39.000000000 +0200
@@ -92,8 +92,6 @@ static inline void change_pmd_range(stru
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
-			continue;
 		change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
 	} while (pmd++, addr = next, addr != end);